def test_expansion_inside_retrieval():
    """Integration test: embedded query expansion inside the full retrieval pipeline.

    Trains a throwaway Word2Vec model on DOCUMENTS, wires an
    EmbeddedQueryExpansion step into a Tfidf-based Retrieval, and runs
    one English and one Korean query, timing the second one.
    """
    # NOTE(review): another test_expansion_inside_retrieval is defined later
    # in this file; pytest only collects the last definition, so this one is
    # shadowed. Rename one of them to run both.
    model = Word2Vec([doc.split() for doc in DOCUMENTS], iter=1, min_count=1)
    n_expansions = 2
    tfidf = Tfidf()
    match_op = Matching()
    expansion_op = EmbeddedQueryExpansion(model.wv, m=n_expansions)
    retrieval = Retrieval(
        tfidf,  # the core retrieval model
        matching=match_op,
        query_expansion=expansion_op,
    )
    retrieval.fit(DOCUMENTS)

    start = time.time()  # record start time
    result = retrieval.query("An 81-year-old woman named Eileen")
    print(result)
    # Second query exercises the Korean path and returns scores as well.
    result, score = retrieval.query("한국에서 가장 좋은 나라", return_scores=True)
    print("time :", time.time() - start)  # elapsed wall-clock time
    print('result:%s' % result)
    print('score:%s' % score)
def test_retrieval():
    """Tf-idf retrieval with caller-supplied document ids.

    Fits on two toy documents and checks (by printing) that querying
    'fox' resolves through the explicit id list.
    """
    DOCUMENTS = [
        "The quick brown fox jumps over the lazy dog",
        "Surfing surfers do surf on green waves",
    ]
    ids = ['fox_example', 'lazy_example']

    retrieval = Retrieval(Tfidf())
    retrieval.fit(DOCUMENTS, ids)

    result = retrieval.query('fox')
    print(result)
def test_word2vec():
    """Word-centroid-distance retrieval over a freshly trained Word2Vec model.

    Trains, persists, reloads, and L2-normalizes the model, then times a
    single scored query through Retrieval + Matching.
    """
    # NOTE(review): another test_word2vec is defined later in this file;
    # pytest only collects the last definition with a given name.
    model = Word2Vec([DEFAULT_ANALYZER(doc) for doc in DOCUMENTS],
                     iter=3, min_count=1)
    model.save('model_w2v')
    # BUG FIX: the original called init_sims(replace=True) *before* reloading,
    # so the reload discarded the normalization. Reload first, then normalize.
    model = Word2Vec.load('model_w2v')
    model.init_sims(replace=True)

    match_op = Matching()
    wcd = WordCentroidDistance(model.wv)
    retrieval = Retrieval(wcd, matching=match_op)
    retrieval.fit(DOCUMENTS)

    start = time.time()  # record start time
    result, score = retrieval.query("general", return_scores=True)
    print("time :", time.time() - start)  # elapsed wall-clock time
    print(result)
    print(score)
def test_word2vec_similar_ir():
    """Word2VecRetrieval end-to-end: train, persist, reload, query with scores.

    Uses the project's DEFAULT_ANALYZER both for training input and inside
    Word2VecRetrieval, then times one scored Korean query.
    """
    model = Word2Vec([DEFAULT_ANALYZER(doc) for doc in DOCUMENTS],
                     iter=3, min_count=1)
    model.save('model_w2v')
    # BUG FIX: the original normalized with init_sims(replace=True) and then
    # reloaded from disk, which threw the normalization away. Normalize the
    # reloaded model instead.
    model = Word2Vec.load('model_w2v')
    model.init_sims(replace=True)

    wcr = Word2VecRetrieval(model.wv, analyzer=DEFAULT_ANALYZER)
    retrieval = Retrieval(wcr)
    retrieval.fit(DOCUMENTS)

    start = time.time()  # record start time
    result, score = retrieval.query("안냥", return_scores=True)
    print("time :", time.time() - start)  # elapsed wall-clock time
    print(result)
    print(score)
def test_word2vec():
    """WordCentroidDistance retrieval smoke test.

    Verifies that constructing WordCentroidDistance from the full model
    (rather than model.wv) raises ValueError, then runs a scored query and
    a second retrieval configured with explicit result labels.
    """
    model = Word2Vec([doc.split() for doc in documents], iter=1, min_count=1)
    matcher = Matching()

    # Passing the whole model (not its KeyedVectors) must be rejected.
    with pytest.raises(ValueError):
        wcd = WordCentroidDistance(model)

    wcd = WordCentroidDistance(model.wv)

    plain = Retrieval(wcd, matching=matcher)
    plain.fit(documents)
    hits, scores = plain.query('art news', return_scores=True)
    print(hits)
    print(scores)

    labelled = Retrieval(
        wcd,
        matching=matcher,
        labels=['1번', '2번', '3번', '4번', '5번', '6번'],
    )
    labelled.fit(documents)
    print(labelled.query('art news'))
def test_fasttext():
    """FastText-based retrieval with embedded query expansion.

    Trains a FastText model on morpheme-level tokens using hyperparameters
    from the project's config module, persists and reloads it, then runs a
    timed, scored Korean query through Matching + EmbeddedQueryExpansion +
    FastTextCentroidDistance.
    """
    import config

    print("time :start")  # banner marking the start of the run
    model = FastText(
        [tokenize_by_morpheme_char(doc) for doc in DOCUMENTS],
        size=config.MODEL_SIZE,
        window=config.MODEL_WINDOW,
        min_count=config.MODEL_MIN_COUNT,
        workers=config.MODEL_WORKERS,
    )
    model.save('model_ft')
    print("save model_ft")
    model = FastText.load('model_ft')

    match_op = Matching()
    wcd = FastTextCentroidDistance(model.wv)

    # Expansion mode: augment the query with m nearest embedding neighbours.
    n_expansions = 2
    expansion_op = EmbeddedQueryExpansion(model.wv, m=n_expansions)
    retrieval = Retrieval(
        wcd,  # the core retrieval model
        matching=match_op,
        query_expansion=expansion_op,
    )
    retrieval.fit(DOCUMENTS)

    start = time.time()  # record start time
    q = '한국에서 가장 좋은 나라'
    print(q)
    result, score = retrieval.query(q, return_scores=True)
    print("time :", time.time() - start)  # elapsed wall-clock time
    print(result, score)
def test_expansion_inside_retrieval():
    """Integration test: embedded query expansion within the full retrieval pipeline.

    Trains a throwaway Word2Vec model on DOCUMENTS, plugs an
    EmbeddedQueryExpansion step into a Tfidf-based Retrieval, and runs
    one query.
    """
    # NOTE(review): an earlier test_expansion_inside_retrieval exists in this
    # file; this later definition shadows it, so pytest runs only this one.
    model = Word2Vec([doc.split() for doc in DOCUMENTS], iter=1, min_count=1)
    n_expansions = 2
    tfidf = Tfidf()
    match_op = Matching()
    expansion_op = EmbeddedQueryExpansion(model.wv, m=n_expansions)
    retrieval = Retrieval(
        tfidf,  # the core retrieval model
        matching=match_op,
        query_expansion=expansion_op,
    )
    retrieval.fit(DOCUMENTS)

    result = retrieval.query('art news')
    print(result)