Example #1
def test_doc2vec_inference():
    tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
                   for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, epochs=1, min_count=1)
    d2v = Doc2VecInference(model, DEFAULT_ANALYZER)
    match_op = Matching()
    retrieval = Retrieval(d2v, matching=match_op).fit(documents)
    result = retrieval.query("Computer scientists are lazy vec4ir-evaluate for vec4ir evaluation of an information retrieval")
    print(result)
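These snippets reference module-level names (documents, DOCUMENTS, DEFAULT_ANALYZER, TEST_FILE, and the imported classes) that are not shown on this page. A minimal sketch of the kind of preamble they assume follows; the import locations and fixture contents are assumptions for illustration (in some vec4ir versions the classes live in submodules such as vec4ir.base and vec4ir.word2vec rather than the package root):

# Assumed preamble -- import paths and fixture contents are illustrative,
# not copied from the original test module.
import os
import time

import pytest
from gensim.models import Word2Vec, Doc2Vec, FastText
from gensim.models.doc2vec import TaggedDocument
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import CountVectorizer

from vec4ir import (Matching, Tfidf, Retrieval, WordCentroidDistance,
                    Word2VecRetrieval, Doc2VecInference,
                    EmbeddedQueryExpansion)

# Simple lowercasing tokenizer standing in for the suite's default analyzer
DEFAULT_ANALYZER = CountVectorizer().build_analyzer()

TEST_FILE = 'test_model.tmp'

# Tiny toy corpus; the real fixtures are not shown on this page
documents = DOCUMENTS = [
    "The quick brown fox jumps over the lazy dog",
    "Computer scientists are lazy",
    "Art news: the gallery opens a new exhibition",
]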
Example #2
def test_retrieval():
    DOCUMENTS = ["The quick brown fox jumps over the lazy dog",
                 "Surfing surfers do surf on green waves"]
    # Test retrieval with given ids
    tfidf = Tfidf()
    retrieval = Retrieval(tfidf)
    ids = ['fox_example', 'lazy_example']
    retrieval.fit(DOCUMENTS, ids)
    result = retrieval.query('fox')
    print(result)
Example #3
def test_doc2vec_inference_saveload():
    tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
                   for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, epochs=1, min_count=1, vector_size=10)
    model.save(TEST_FILE)
    del model
    model = Doc2Vec.load(TEST_FILE)
    os.remove(TEST_FILE)
    d2v = Doc2VecInference(model, DEFAULT_ANALYZER)
    match_op = Matching()
    retrieval = Retrieval(d2v, matching=match_op).fit(documents)
    result = retrieval.query("scientists")
    assert result[0] == 1
Example #4
def test_word2vec():
    model = Word2Vec([DEFAULT_ANALYZER(doc) for doc in DOCUMENTS], iter=3, min_count=1)
    model.save('model_w2v')
    model.init_sims(replace=True)

    model = Word2Vec.load('model_w2v')
    match_op = Matching()
    wcd = WordCentroidDistance(model.wv)
    retrieval = Retrieval(wcd, matching=match_op)
    retrieval.fit(DOCUMENTS)

    start = time.time()  # record the start time
    result, score = retrieval.query("general", return_scores=True)
    print("time :", time.time() - start)  # 현재시각 - 시작시간 = 실행 시간
    print(result)
    print(score)
Example #5
def test_word2vec_similar_ir():
    model = Word2Vec([DEFAULT_ANALYZER(doc) for doc in DOCUMENTS], iter=3, min_count=1)
    model.save('model_w2v')
    model.init_sims(replace=True)

    model = Word2Vec.load('model_w2v')
    # match_op = Matching()
    wcr = Word2VecRetrieval(model.wv, analyzer=DEFAULT_ANALYZER)
    retrieval = Retrieval(wcr)  #, matching=match_op)  #, labels=['1번', '2번', '3번', '4번', '5번', '6번', '7번', '8번'])
    retrieval.fit(DOCUMENTS)

    start = time.time()  # record the start time
    result, score = retrieval.query("안냥", return_scores=True)
    print("time :", time.time() - start)  # 현재시각 - 시작시간 = 실행 시간
    print(result)
    print(score)
Example #6
def test_word2vec():
    model = Word2Vec([doc.split() for doc in documents], iter=1, min_count=1)
    match_op = Matching()
    with pytest.raises(ValueError):
        wcd = WordCentroidDistance(model)

    wcd = WordCentroidDistance(model.wv)
    retrieval = Retrieval(wcd, matching=match_op)
    retrieval.fit(documents)
    result, score = retrieval.query('art news', return_scores=True)
    print(result)
    print(score)

    retrieval1 = Retrieval(wcd, matching=match_op, labels=['1번', '2번', '3번', '4번', '5번', '6번'])
    retrieval1.fit(documents)
    result1 = retrieval1.query('art news')
    print(result1)
Example #7
def test_fasttext():
    import config

    print("time :start")  # 현재시각 - 시작시간 = 실행 시간
    # model = FastText([doc.split() for doc in DOCUMENTS], size=100, workers=16, sg=1, iter=3, word_ngrams=5)
    # model = FastText([tokenize_by_eojeol_jaso(doc) for doc in DOCUMENTS], size=100, workers=16, sg=1, iter=3, word_ngrams=5)
    # model = FastText([to_jaso(doc) for doc in DOCUMENTS], size=50, workers=12, sg=1, iter=3, word_ngrams=1)

    # model = FastText([tokenize_by_eojeol_jaso(doc) for doc in DOCUMENTS], size=config.MODEL_SIZE, window=config.MODEL_WINDOW, min_count=config.MODEL_MIN_COUNT, workers=config.MODEL_WORKERS)
    # model = FastText([tokenize_by_eojeol_jaso(doc) for doc in DOCUMENTS], size=config.MODEL_SIZE, window=config.MODEL_WINDOW, min_count=config.MODEL_MIN_COUNT, workers=config.MODEL_WORKERS)

    model = FastText([tokenize_by_morpheme_char(doc) for doc in DOCUMENTS],
                     size=config.MODEL_SIZE,
                     window=config.MODEL_WINDOW,
                     min_count=config.MODEL_MIN_COUNT,
                     workers=config.MODEL_WORKERS)
    # model.train(DOCUMENTS, total_examples=len(DOCUMENTS), epochs=config.MODEL_EPOCHS)
    model.save('model_ft')
    print("save model_ft")
    # model.init_sims(replace=True)
    model = FastText.load('model_ft')
    match_op = Matching()
    wcd = FastTextCentroidDistance(model.wv)

    ### simple mode
    # retrieval = Retrieval(wcd, matching=match_op)

    ### expansion mode
    n_expansions = 2
    expansion_op = EmbeddedQueryExpansion(model.wv, m=n_expansions)
    retrieval = Retrieval(
        wcd,  # The retrieval model
        matching=match_op,
        query_expansion=expansion_op)

    retrieval.fit(DOCUMENTS)

    start = time.time()  # record the start time
    q = '한국에서 가장 좋은 나라'
    # qparse = to_jaso(q)
    print(q)
    result, score = retrieval.query(q, return_scores=True)
    # result = retrieval.query(q)
    print("time :", time.time() - start)  # 현재시각 - 시작시간 = 실행 시간
    print(result, score)
Example #8
def test_expansion_inside_retrieval():
    # Integration test within full retrieval pipeline
    model = Word2Vec([doc.split() for doc in DOCUMENTS], iter=1, min_count=1)
    n_expansions = 2

    tfidf = Tfidf()
    match_op = Matching()
    expansion_op = EmbeddedQueryExpansion(model.wv, m=n_expansions)

    retrieval = Retrieval(tfidf,  # The retrieval model
                          matching=match_op,
                          query_expansion=expansion_op)
    # ids = ['fox_ex', 'surf_ex']
    # retrieval.fit(DOCUMENTS, ids)
    retrieval.fit(DOCUMENTS)
    # result = retrieval.query('vec4ir evaluate uses IDF re-weighted')
    result = retrieval.query('art news')
    print(result)
Example #9
def test_combined():
    model = Word2Vec([doc.split() for doc in documents], iter=1, min_count=1)
    wcd = WordCentroidDistance(model.wv)
    tfidf = Tfidf()

    wcd.fit(documents)
    # they can operate on different fields
    tfidf.fit(['fox', 'scientists'])
    match_op = Matching().fit(documents)

    combined = wcd + tfidf ** 2

    retrieval = Retrieval(combined, matching=match_op, labels=[7,42])
    result, score = retrieval.query('fox', return_scores=True)
    print(result, score)
    assert result[0] == 7

    result, score = retrieval.query('scientists', return_scores=True)
    print(result, score)
    assert result[0] == 42
Example #10
def test_expansion_inside_retrieval():
    # Integration test within full retrieval pipeline
    model = Word2Vec([doc.split() for doc in DOCUMENTS], iter=1, min_count=1)
    # model.save('model_w2v_e')
    # model.init_sims(replace=True)
    # model = Word2Vec.load('model_w2v_e')
    n_expansions = 2
    tfidf = Tfidf()
    match_op = Matching()
    expansion_op = EmbeddedQueryExpansion(model.wv, m=n_expansions)

    retrieval = Retrieval(
        tfidf,  # The retrieval model
        matching=match_op,
        query_expansion=expansion_op)
    # ids = ['fox_ex', 'surf_ex']
    # retrieval.fit(DOCUMENTS, ids)
    retrieval.fit(DOCUMENTS)
    start = time.time()  # record the start time
    result = retrieval.query("An 81-year-old woman named Eileen")
    print(result)

    result, score = retrieval.query("한국에서 가장 좋은 나라", return_scores=True)

    print("time :", time.time() - start)  # 현재시각 - 시작시간 = 실행 시간
    print('result:%s' % result)
    print('score:%s' % score)
# wcd
# match_op = Matching()
# wcd = WordCentroidDistance(load_ft_model.wv)
# vvoc_retrieval = Retrieval(wcd, matching=match_op, labels=vvoc_l)
# vvoc_retrieval.fit(vvoc_l)

# combination
tfidf = Tfidf()
tfidf.fit(vvoc_l)

wcd = WordCentroidDistance(load_ft_model.wv)
wcd.fit(vvoc_l)
# they can operate on different fields
match_op = Matching().fit(vvoc_l)
combined = wcd + tfidf**2
vvoc_retrieval = Retrieval(combined, matching=match_op, labels=vvoc_l)

# print('========= voca 검색어 ==========')
# vocas, score = vvoc_retrieval.query(q, return_scores=True)
# print('vocas, score')
# print(vocas, score)
#
# print('========= docu 검색어 ==========')
# jamo_document = list(map(lambda x: jamo_sentence(x), document))
# docu_retrieval = Retrieval(wcd, matching=match_op, labels=document)
# docu_retrieval.fit(jamo_document)
# docus, score = docu_retrieval.query(q, return_scores=True)
# print('docus, score')
# print(docus, score)

# q = jamo_sentence('후대폰')
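The script fragment above depends on names defined elsewhere (load_ft_model, vvoc_l, document, jamo_sentence, q). A minimal sketch of that assumed surrounding setup follows; every definition below is an illustrative placeholder, not the original script's code:

# Assumed context for the fragment above -- all names and values are
# placeholders standing in for definitions made earlier in the script.
from gensim.models import FastText

load_ft_model = FastText.load('model_ft')      # FastText model trained/saved earlier
vvoc_l = ["placeholder vocabulary entry 1",    # candidate vocabulary/phrase list,
          "placeholder vocabulary entry 2"]    # used both as corpus and as labels
document = ["placeholder document one",        # raw documents to search
            "placeholder document two"]

def jamo_sentence(text):
    # Placeholder for a Hangul jamo-level normalizer; a real version would
    # decompose syllables into jamo (e.g. with the hgtk package) so that
    # FastText character n-grams can match across spelling variants.
    return text

q = jamo_sentence("placeholder query")         # query string, jamo-normalized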