# Imports and shared fixtures are reconstructed here so the tests below are
# self-contained; the exact module paths and corpus are assumptions, adjust
# them to the local vec4ir layout if it differs.
import os
import time

import pytest
from gensim.models import Doc2Vec, FastText, Word2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import CountVectorizer

from vec4ir.base import Matching, Tfidf
from vec4ir.core import Retrieval
from vec4ir.doc2vec import Doc2VecInference
from vec4ir.query_expansion import EmbeddedQueryExpansion
from vec4ir.word2vec import Word2VecRetrieval, WordCentroidDistance

# Assumed analyzer, following the CountVectorizer pattern used by vec4ir.
DEFAULT_ANALYZER = CountVectorizer().build_analyzer()

# Assumed corpus: index 0 mentions "fox", index 1 mentions "scientists",
# matching the assertions in the tests below.
documents = [
    "The quick brown fox jumps over the lazy dog",
    "Computer scientists are lazy",
]
DOCUMENTS = documents
TEST_FILE = "test_doc2vec_model.tmp"

# FastTextCentroidDistance, tokenize_by_morpheme_char, tokenize_by_eojeol_jaso,
# and to_jaso are project-local helpers referenced by test_fasttext below.


def test_doc2vec_inference():
    tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
                   for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, epochs=1, min_count=1)
    d2v = Doc2VecInference(model, DEFAULT_ANALYZER)
    match_op = Matching()
    retrieval = Retrieval(d2v, matching=match_op).fit(documents)
    result = retrieval.query("Computer scientists are lazy vec4ir-evaluate"
                             " for vec4ir evaluation of an information retrieval")
    print(result)
def test_retrieval():
    DOCUMENTS = ["The quick brown fox jumps over the lazy dog",
                 "Surfing surfers do surf on green waves"]
    # Test retrieval with caller-supplied ids instead of positional indices
    tfidf = Tfidf()
    retrieval = Retrieval(tfidf)
    ids = ['fox_example', 'lazy_example']
    retrieval.fit(DOCUMENTS, ids)
    result = retrieval.query('fox')
    print(result)
def test_doc2vec_inference_saveload():
    tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
                   for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, epochs=1, min_count=1, vector_size=10)
    model.save(TEST_FILE)
    del model
    model = Doc2Vec.load(TEST_FILE)
    os.remove(TEST_FILE)
    d2v = Doc2VecInference(model, DEFAULT_ANALYZER)
    match_op = Matching()
    retrieval = Retrieval(d2v, matching=match_op).fit(documents)
    result = retrieval.query("scientists")
    assert result[0] == 1
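# The round trip above leaves TEST_FILE behind if an assertion fires before
# os.remove runs. A minimal sketch of a safer variant using only the standard
# library (tempfile-based; the helper is an assumption, not part of the
# original tests, and small test models save to a single file):

def _saveload_roundtrip(model):
    """Save `model` to a temporary file, reload it, and clean up reliably."""
    import tempfile

    fd, path = tempfile.mkstemp(suffix=".model")
    os.close(fd)
    try:
        model.save(path)
        return model.__class__.load(path)
    finally:
        os.remove(path)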
def test_word2vec():
    model = Word2Vec([DEFAULT_ANALYZER(doc) for doc in DOCUMENTS],
                     iter=3, min_count=1)
    model.save('model_w2v')
    model = Word2Vec.load('model_w2v')
    # Normalize the loaded vectors in place
    model.init_sims(replace=True)
    match_op = Matching()
    wcd = WordCentroidDistance(model.wv)
    retrieval = Retrieval(wcd, matching=match_op)
    retrieval.fit(DOCUMENTS)
    start = time.time()  # record start time
    result, score = retrieval.query("general", return_scores=True)
    print("time :", time.time() - start)  # elapsed time in seconds
    print(result)
    print(score)
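# Several tests in this module repeat the same start/stop timing pattern.
# A small helper sketch (not part of vec4ir) that wraps any fitted
# retrieval object and reports elapsed wall-clock time:

def _timed_query(retrieval, query, **kwargs):
    """Run retrieval.query and print the elapsed wall-clock time."""
    start = time.time()
    result = retrieval.query(query, **kwargs)
    print("time :", time.time() - start)  # elapsed seconds
    return result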
def test_word2vec_similar_ir():
    model = Word2Vec([DEFAULT_ANALYZER(doc) for doc in DOCUMENTS],
                     iter=3, min_count=1)
    model.save('model_w2v')
    model = Word2Vec.load('model_w2v')
    model.init_sims(replace=True)
    # match_op = Matching()
    wcr = Word2VecRetrieval(model.wv, analyzer=DEFAULT_ANALYZER)
    retrieval = Retrieval(wcr)  # , matching=match_op)
    # , labels=['1번', '2번', '3번', '4번', '5번', '6번', '7번', '8번'])
    retrieval.fit(DOCUMENTS)
    start = time.time()  # record start time
    result, score = retrieval.query("안냥", return_scores=True)  # Korean test query
    print("time :", time.time() - start)  # elapsed time in seconds
    print(result)
    print(score)
def test_word2vec():
    model = Word2Vec([doc.split() for doc in documents], iter=1, min_count=1)
    match_op = Matching()
    # Passing the full model instead of its KeyedVectors must raise
    with pytest.raises(ValueError):
        wcd = WordCentroidDistance(model)
    wcd = WordCentroidDistance(model.wv)
    retrieval = Retrieval(wcd, matching=match_op)
    retrieval.fit(documents)
    result, score = retrieval.query('art news', return_scores=True)
    print(result)
    print(score)
    retrieval1 = Retrieval(wcd, matching=match_op,
                           labels=['1번', '2번', '3번', '4번', '5번', '6번'])
    retrieval1.fit(documents)
    result1 = retrieval1.query('art news')
    print(result1)
def test_fasttext():
    import config
    print("time :start")
    # Alternative tokenizations kept for reference:
    # model = FastText([doc.split() for doc in DOCUMENTS], size=100,
    #                  workers=16, sg=1, iter=3, word_ngrams=5)
    # model = FastText([tokenize_by_eojeol_jaso(doc) for doc in DOCUMENTS],
    #                  size=100, workers=16, sg=1, iter=3, word_ngrams=5)
    # model = FastText([to_jaso(doc) for doc in DOCUMENTS], size=50,
    #                  workers=12, sg=1, iter=3, word_ngrams=1)
    # model = FastText([tokenize_by_eojeol_jaso(doc) for doc in DOCUMENTS],
    #                  size=config.MODEL_SIZE, window=config.MODEL_WINDOW,
    #                  min_count=config.MODEL_MIN_COUNT, workers=config.MODEL_WORKERS)
    model = FastText([tokenize_by_morpheme_char(doc) for doc in DOCUMENTS],
                     size=config.MODEL_SIZE, window=config.MODEL_WINDOW,
                     min_count=config.MODEL_MIN_COUNT,
                     workers=config.MODEL_WORKERS)
    # model.train(DOCUMENTS, total_examples=len(DOCUMENTS), epochs=config.MODEL_EPOCHS)
    model.save('model_ft')
    print("save model_ft")
    # model.init_sims(replace=True)
    model = FastText.load('model_ft')
    match_op = Matching()
    wcd = FastTextCentroidDistance(model.wv)
    # Simple mode:
    # retrieval = Retrieval(wcd, matching=match_op)
    # Expansion mode:
    n_expansions = 2
    expansion_op = EmbeddedQueryExpansion(model.wv, m=n_expansions)
    retrieval = Retrieval(wcd,  # the retrieval model
                          matching=match_op,
                          query_expansion=expansion_op)
    retrieval.fit(DOCUMENTS)
    start = time.time()  # record start time
    q = '한국에서 가장 좋은 나라'  # "the best country in Korea"
    # qparse = to_jaso(q)
    print(q)
    result, score = retrieval.query(q, return_scores=True)
    # result = retrieval.query(q)
    print("time :", time.time() - start)  # elapsed time in seconds
    print(result, score)
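# tokenize_by_morpheme_char, tokenize_by_eojeol_jaso, and to_jaso are
# project-local helpers not shown in this file. A hypothetical stand-in
# that splits on whitespace and then into characters, which is enough to
# exercise FastText's subword handling on Korean text (a sketch, not the
# project's actual tokenizer):

def tokenize_by_char(doc):
    """Split a document into character tokens, word by word."""
    return [ch for word in doc.split() for ch in word]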
def test_expansion_inside_retrieval():
    # Integration test within the full retrieval pipeline
    model = Word2Vec([doc.split() for doc in DOCUMENTS], iter=1, min_count=1)
    n_expansions = 2
    tfidf = Tfidf()
    match_op = Matching()
    expansion_op = EmbeddedQueryExpansion(model.wv, m=n_expansions)
    retrieval = Retrieval(tfidf,  # the retrieval model
                          matching=match_op,
                          query_expansion=expansion_op)
    # ids = ['fox_ex', 'surf_ex']
    # retrieval.fit(DOCUMENTS, ids)
    retrieval.fit(DOCUMENTS)
    # result = retrieval.query('vec4ir evaluate uses IDF re-weighted')
    result = retrieval.query('art news')
    print(result)
def test_combined():
    model = Word2Vec([doc.split() for doc in documents], iter=1, min_count=1)
    wcd = WordCentroidDistance(model.wv)
    tfidf = Tfidf()
    wcd.fit(documents)
    # The two models can operate on different fields
    tfidf.fit(['fox', 'scientists'])
    match_op = Matching().fit(documents)
    combined = wcd + tfidf ** 2
    retrieval = Retrieval(combined, matching=match_op, labels=[7, 42])
    result, score = retrieval.query('fox', return_scores=True)
    print(result, score)
    assert result[0] == 7
    result = retrieval.query('scientists')
    assert result[0] == 42
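# The `wcd + tfidf ** 2` expression above relies on vec4ir's operator
# overloading to build a combined scorer. As a rough illustration of the
# idea (an assumption about the semantics, not vec4ir's implementation),
# a manual combination of two per-document score arrays might look like:

def combine_scores(scores_a, scores_b, exponent=2):
    """Sum one scorer's scores with another's raised element-wise to a power."""
    import numpy as np

    return np.asarray(scores_a) + np.asarray(scores_b) ** exponent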
def test_expansion_inside_retrieval():
    # Integration test within the full retrieval pipeline
    model = Word2Vec([doc.split() for doc in DOCUMENTS], iter=1, min_count=1)
    # model.save('model_w2v_e')
    # model.init_sims(replace=True)
    # model = Word2Vec.load('model_w2v_e')
    n_expansions = 2
    tfidf = Tfidf()
    match_op = Matching()
    expansion_op = EmbeddedQueryExpansion(model.wv, m=n_expansions)
    retrieval = Retrieval(tfidf,  # the retrieval model
                          matching=match_op,
                          query_expansion=expansion_op)
    # ids = ['fox_ex', 'surf_ex']
    # retrieval.fit(DOCUMENTS, ids)
    retrieval.fit(DOCUMENTS)
    start = time.time()  # record start time
    result = retrieval.query("An 81-year-old woman named Eileen")
    print(result)
    result, score = retrieval.query("한국에서 가장 좋은 나라",  # Korean test query
                                    return_scores=True)
    print("time :", time.time() - start)  # elapsed time in seconds
    print('result:%s' % result)
    print('score:%s' % score)
# Combined retrieval over a vocabulary list (fragment; `load_ft_model`,
# `vvoc_l`, `document`, `jamo_sentence`, and `q` are assumed to be defined
# by the surrounding code).
# wcd only:
# match_op = Matching()
# wcd = WordCentroidDistance(load_ft_model.wv)
# vvoc_retrieval = Retrieval(wcd, matching=match_op, labels=vvoc_l)
# vvoc_retrieval.fit(vvoc_l)

# Combination:
tfidf = Tfidf()
tfidf.fit(vvoc_l)
wcd = WordCentroidDistance(load_ft_model.wv)
wcd.fit(vvoc_l)
# The two models can operate on different fields
match_op = Matching().fit(vvoc_l)
combined = wcd + tfidf ** 2
vvoc_retrieval = Retrieval(combined, matching=match_op, labels=vvoc_l)

# print('========= vocabulary query ==========')
# vocas, score = vvoc_retrieval.query(q, return_scores=True)
# print('vocas, score')
# print(vocas, score)

# print('========= document query ==========')
# jamo_document = list(map(lambda x: jamo_sentence(x), document))
# docu_retrieval = Retrieval(wcd, matching=match_op, labels=document)
# docu_retrieval.fit(jamo_document)
# docus, score = docu_retrieval.query(q, return_scores=True)
# print('docus, score')
# print(docus, score)

# q = jamo_sentence('후대폰')  # deliberate misspelling of 휴대폰 ("cell phone")