def test_word2vec():
    """Check word-centroid retrieval backed by a freshly trained Word2Vec model.

    Trains Word2Vec on the whitespace-split ``documents``, fits a
    ``Retrieval`` that uses ``WordCentroidDistance`` together with the
    default ``Matching``, and expects the top hit for ``'dog'`` to be ``0``.
    """
    tokenized = [text.split() for text in documents]
    # NOTE(review): gensim 4 renamed ``iter`` to ``epochs`` -- confirm the
    # pinned gensim version before changing this keyword.
    embedding = Word2Vec(tokenized, iter=1, min_count=1)
    matcher = Matching()
    engine = Retrieval(WordCentroidDistance(embedding.wv), matching=matcher)
    engine.fit(documents)
    ranking = engine.query('dog')
    top_hit = ranking[0]
    assert top_hit == 0
def test_doc2vec_inference():
    """Check retrieval that relies on Doc2Vec inference.

    Every entry of ``documents`` is preprocessed and tagged with its
    position, a Doc2Vec model is trained on those tagged documents, and the
    resulting ``Doc2VecInference`` model is plugged into ``Retrieval``.
    The top hit for ``"scientists"`` is expected to be ``1``.
    """
    corpus = []
    for position, text in enumerate(documents):
        # Each document is tagged with its own position in ``documents``.
        corpus.append(TaggedDocument(simple_preprocess(text), [position]))

    doc_model = Doc2Vec(corpus, epochs=1, min_count=1)
    inference = Doc2VecInference(doc_model, DEFAULT_ANALYZER)
    matcher = Matching()

    engine = Retrieval(inference, matching=matcher)
    # Keep working with whatever ``fit`` returns, as the chained call did.
    engine = engine.fit(documents)

    ranking = engine.query("scientists")
    assert ranking[0] == 1
def test_combined():
    """Check that two retrieval models can be combined arithmetically.

    A ``WordCentroidDistance`` model and a squared ``Tfidf`` model are added
    together and handed to ``Retrieval`` with the labels ``[7, 42]``.  The
    query ``'fox'`` must rank label ``7`` first and ``'scientists'`` must
    rank label ``42`` first.
    """
    sentences = [text.split() for text in documents]
    # NOTE(review): gensim 4 renamed ``iter`` to ``epochs`` -- confirm the
    # pinned gensim version before changing this keyword.
    embedding = Word2Vec(sentences, iter=1, min_count=1)
    centroid_model = WordCentroidDistance(embedding.wv)
    tfidf_model = Tfidf()

    centroid_model.fit(documents)
    # The two models may be fitted on different fields: tf-idf only sees
    # these two short strings, not the full documents.
    tfidf_model.fit(['fox', 'scientists'])
    matcher = Matching().fit(documents)

    mixture = centroid_model + tfidf_model**2
    engine = Retrieval(mixture, matching=matcher, labels=[7, 42])

    expectations = (('fox', 7), ('scientists', 42))
    for query, expected_label in expectations:
        ranking = engine.query(query)
        assert ranking[0] == expected_label
def test_expansion_inside_retrieval():
    """Integration test: query expansion used inside the full pipeline.

    Builds a ``Retrieval`` from a ``Tfidf`` model, the default ``Matching``
    and an ``EmbeddedQueryExpansion`` configured with ``m=2``, fits it on
    ``DOCUMENTS`` with explicit ids, and expects ``'surf_ex'`` to be the top
    hit for a surfing-related query.
    """
    sentences = [text.split() for text in DOCUMENTS]
    # NOTE(review): gensim 4 renamed ``iter`` to ``epochs`` -- confirm the
    # pinned gensim version before changing this keyword.
    embedding = Word2Vec(sentences, iter=1, min_count=1)

    expansion_count = 2
    retrieval_model = Tfidf()
    matcher = Matching()
    expander = EmbeddedQueryExpansion(embedding.wv, m=expansion_count)

    engine = Retrieval(retrieval_model,
                       matching=matcher,
                       query_expansion=expander)

    doc_ids = ['fox_ex', 'surf_ex']
    engine.fit(DOCUMENTS, doc_ids)

    ranking = engine.query('surfing surfers do surf green')
    best = ranking[0]
    assert best == 'surf_ex'
def test_matching():
    """Check that ``Matching`` fitted on ``documents`` predicts ``[0]`` for "fox"."""
    matcher = Matching()
    matcher.fit(documents)
    hits = matcher.predict("fox")
    expected = [0]
    assert hits == expected