def test_tfidf():
    """Smoke-test Tfidf on DOCUMENTS without passing explicit ids to fit.

    Prints the ranking and the scores returned for a mixed Korean/English
    query; nothing is asserted.
    """
    model = Tfidf()
    model.fit(DOCUMENTS)
    ranking, scores = model.query('안녕 scientists', return_scores=True)
    print(ranking, scores)
def test_tfidf():
    """Check the ranking Tfidf returns for a one-word query.

    The model is fitted on a two-document corpus defined inside the test,
    with no explicit ids passed to fit.
    """
    corpus = [
        "The quick brown fox jumps over the lazy dog",
        "Surfing surfers do surf on green waves",
    ]

    model = Tfidf()
    model.fit(corpus)

    ranking = model.query('dog')
    print(ranking)

    # NOTE(review): only the first document (index 0) contains 'dog', yet
    # the assertions expect index 1 to be ranked first -- confirm against
    # the ordering Tfidf.query actually returns.
    assert ranking[0] == 1
    assert ranking[1] == 0
def test_expansion_inside_retrieval():
    """Integration test: query expansion inside the full retrieval pipeline.

    A Word2Vec model is trained for one iteration on the whitespace-tokenized
    DOCUMENTS and its vectors feed an EmbeddedQueryExpansion, which is plugged
    into a Retrieval built on Tfidf and Matching. Two queries are run and the
    elapsed time, the result and the scores of the second query are printed.
    Nothing is asserted.
    """
    model = Word2Vec([doc.split() for doc in DOCUMENTS], iter=1, min_count=1)

    n_expansions = 2
    tfidf = Tfidf()
    match_op = Matching()
    expansion_op = EmbeddedQueryExpansion(model.wv, m=n_expansions)

    retrieval = Retrieval(
        tfidf,  # The retrieval model
        matching=match_op,
        query_expansion=expansion_op)
    retrieval.fit(DOCUMENTS)

    # perf_counter() is a monotonic, high-resolution clock, so the measured
    # interval cannot be distorted by system clock adjustments the way a
    # time.time() difference can.
    start = time.perf_counter()  # remember the start time

    result = retrieval.query("An 81-year-old woman named Eileen")
    print(result)

    result, score = retrieval.query("한국에서 가장 좋은 나라", return_scores=True)

    # elapsed time = current time - start time
    print("time :", time.perf_counter() - start)
    # Wrap the operands in 1-tuples so that a tuple-valued result is printed
    # as a whole instead of being unpacked as %-format arguments.
    print('result:%s' % (result,))
    print('score:%s' % (score,))
def test_retrieval():
    """Run a Retrieval over Tfidf fitted with caller-supplied document ids.

    Prints the result of querying for 'fox'; nothing is asserted.
    """
    corpus = [
        "The quick brown fox jumps over the lazy dog",
        "Surfing surfers do surf on green waves",
    ]

    ranker = Tfidf()
    engine = Retrieval(ranker)

    doc_ids = ['fox_example', 'lazy_example']
    engine.fit(corpus, doc_ids)

    hits = engine.query('fox')
    print(hits)
def test_combined():
    """Check label lookup through a combined WordCentroidDistance + Tfidf model.

    The word-centroid model and the matching operation are fitted on
    ``documents``, while Tfidf is fitted on the two-item list
    ``['fox', 'scientists']``. The retrieval is given ``labels=[7, 42]``, and
    the test expects 'fox' to come back as 7 and 'scientists' as 42.
    """
    model = Word2Vec([doc.split() for doc in documents], iter=1, min_count=1)
    wcd = WordCentroidDistance(model.wv)
    tfidf = Tfidf()

    wcd.fit(documents)
    # The two models can operate on different fields.
    tfidf.fit(['fox', 'scientists'])
    match_op = Matching().fit(documents)

    combined = wcd + tfidf ** 2
    retrieval = Retrieval(combined, matching=match_op, labels=[7, 42])

    # Assert on the 'fox' result right away. Previously it was overwritten
    # by a 'scientists' query before being checked, so the ``== 7`` assertion
    # tested the wrong query and contradicted the ``== 42`` assertion below.
    result, _ = retrieval.query('fox', return_scores=True)
    assert result[0] == 7

    result, score = retrieval.query('scientists', return_scores=True)
    print(result, score)

    result = retrieval.query('scientists')
    assert result[0] == 42
def test_expansion_inside_retrieval():
    """Integration test: query expansion inside the full retrieval pipeline.

    Trains Word2Vec for one iteration on the whitespace-tokenized DOCUMENTS,
    wires its vectors into an EmbeddedQueryExpansion with ``m=2``, fits a
    Retrieval (Tfidf + Matching + expansion) on DOCUMENTS without explicit
    ids, and prints the result of one query. Nothing is asserted.
    """
    tokenized = [doc.split() for doc in DOCUMENTS]
    embedding = Word2Vec(tokenized, iter=1, min_count=1)

    ranker = Tfidf()
    matcher = Matching()
    expander = EmbeddedQueryExpansion(embedding.wv, m=2)

    pipeline = Retrieval(
        ranker,
        matching=matcher,
        query_expansion=expander,
    )
    pipeline.fit(DOCUMENTS)

    hits = pipeline.query('art news')
    print(hits)
def test_tfidf():
    """Smoke-test Tfidf on ``documents`` without passing explicit ids to fit.

    Prints the ranking and scores returned for the query 'article'; nothing
    is asserted.
    """
    model = Tfidf()
    model.fit(documents)

    outcome = model.query('article', return_scores=True)
    ranking, scores = outcome
    print(ranking, scores)
# Build a retrieval model whose "documents" are the vocabulary entries of
# load_ft_model, so that a query can be matched against known words.

# Vocabulary mapping of the loaded model and the list of its keys.
vvoca_docs_d = load_ft_model.wv.vocab
vvoc_l = list(vvoca_docs_d.keys())

# Dump the whole vocabulary list between two banner lines for inspection.
print('===== start ==== copus vocas ==========')
print('vvoc_l:%s' % vvoc_l)
print('===== end ==== copus vocas ==========')

# Sample query, passed through jamo_sentence before retrieval.
# NOTE(review): jamo_sentence presumably decomposes Hangul syllables into
# jamo -- confirm against its definition.
q = jamo_sentence('후대폰 하니님 kt')

# Disabled alternative: retrieval using WordCentroidDistance alone.
# match_op = Matching()
# wcd = WordCentroidDistance(load_ft_model.wv)
# vvoc_retrieval = Retrieval(wcd, matching=match_op, labels=vvoc_l)
# vvoc_retrieval.fit(vvoc_l)

# Active variant: combination of WordCentroidDistance and Tfidf, each fitted
# on the vocabulary list.
tfidf = Tfidf()
tfidf.fit(vvoc_l)
wcd = WordCentroidDistance(load_ft_model.wv)
wcd.fit(vvoc_l)
# The two models can operate on different fields.
match_op = Matching().fit(vvoc_l)
combined = wcd + tfidf**2
# The vocabulary entries double as labels.
vvoc_retrieval = Retrieval(combined, matching=match_op, labels=vvoc_l)

# Disabled debugging output: query the vocabulary retrieval with q and print
# the hits (the Korean banner text means "search term").
# print('========= voca 검색어 ==========')
# vocas, score = vvoc_retrieval.query(q, return_scores=True)
# print('vocas, score')
# print(vocas, score)
#
# print('========= docu 검색어 ==========')