Ejemplo n.º 1
0
class SearchEngine:
    """Rank stored articles by how well they match a given article.

    A candidate's score combines the number of named entities it shares with
    the query article and the FastText similarity between the two articles'
    ``appendix`` values.
    """

    def __init__(self, connection_provider, ft_embed_size):
        """Create the FastText model and train it on article appendices.

        Args:
            connection_provider: Object exposing ``get_all_articles()`` (whose
                result offers ``iterator()``) and ``get_article(article_id)``.
            ft_embed_size: Embedding dimensionality, passed to ``FastText``
                as ``size``.
        """
        # Function-scope imports keep this block independent of the file's
        # import section.
        import os
        from itertools import islice

        print('Search engine initialization')

        self.conn_provider = connection_provider

        # gensim spawns one training thread per worker, so the count must be
        # positive; a negative value starts no training threads at all.
        self.fasttext = FastText(size=ft_embed_size,
                                 window=3,
                                 min_count=1,
                                 iter=100,
                                 workers=os.cpu_count() or 1,
                                 min_n=1,
                                 max_n=5)

        # Each appendix becomes a one-token "sentence"; the corpus is capped
        # at the first 3001 articles.
        # NOTE(review): single-token sentences give the model no context
        # window to learn from - confirm this is intended.
        articles = self.conn_provider.get_all_articles().iterator()
        name_corpus = [[app.appendix] for app in islice(articles, 3001)]

        self.fasttext.build_vocab(name_corpus)
        self.fasttext.train(name_corpus,
                            total_examples=self.fasttext.corpus_count,
                            epochs=1000)

        print('Done')

    def find_match(self, new_id, top_n=5):
        """Return the best-scoring articles for the article ``new_id``.

        Args:
            new_id: Identifier handed to ``conn_provider.get_article``.
            top_n: Maximum number of results to return.

        Returns:
            A list of ``(global_id, score)`` tuples ordered by descending
            score, where ``score`` is
            ``(shared_entities + similarity) / (len(named_entities) + 1)``.
            The query article itself is never included.
        """
        new = self.conn_provider.get_article(new_id)
        new_entities = new.named_entities
        # Loop-invariant normaliser: one unit per entity plus one for the
        # similarity term.
        normaliser = len(new_entities) + 1
        # Similarity lives on the keyed vectors; the model-level method is
        # deprecated in gensim 3.x and removed in 4.0.
        word_vectors = self.fasttext.wv

        scores = []
        for candidate in self.conn_provider.get_all_articles().iterator():
            if candidate.global_id == new.global_id:
                continue
            cand_entities = set(candidate.named_entities)
            shared = sum(1 for ent in new_entities if ent in cand_entities)
            similarity = word_vectors.similarity(new.appendix,
                                                 candidate.appendix)
            scores.append((candidate.global_id,
                           (shared + similarity) / normaliser))

        # Stable sort: candidates with equal scores keep iteration order.
        scores.sort(key=lambda pair: pair[1], reverse=True)
        return scores[:top_n]
Ejemplo n.º 2
0
                    size=25,
                    window=5,
                    min_count=2,
                    workers=4,
                    sg=1)

# Disabled alternative: `if 0:` is never true, so the FastText variant below
# is not executed and `model_rw` keeps whatever was assigned earlier.
if 0:
    from gensim.models import FastText
    model_rw = FastText(sentences=desc_token,
                        size=25,
                        window=5,
                        min_count=2,
                        workers=4,
                        sg=1)

# Similarity between two vocabulary entries. Uses the keyed vectors (`wv`),
# like the other lookups in this script; the model-level `similarity` is
# deprecated in gensim 3.x and removed in 4.0.
model_rw.wv.similarity('12 Linajes Reserva 2012',
                       '13th Street Burger Blend Gamay Pinot Noir VQA')

# Persist the trained model to disk.
model_rw.save("word2vec_model_test.model")

# To reload it later:
# model_rw_load = Word2Vec.load("word2vec_model_test.model")

# Nearest neighbours of a token and the shape of the vector matrix.
model_rw.wv.most_similar('dri fruit')
model_rw.wv.vectors.shape

# Vocabulary size and contents.
len(model_rw.wv.vocab)
model_rw.wv.vocab

model_rw.vocabulary
model_rw.wv.similarity('dri', 'fruit')
Ejemplo n.º 3
0
)

# Save the skip-gram vectors in word2vec text format.
modelSGw2v.wv.save_word2vec_format("SGw2v.txt", binary=False)

# Drop the reference so the model's memory can be reclaimed.
modelSGw2v = None

# Create a CBOW FastText model.
# NOTE(review): min_n (4) is greater than max_n (2); per the gensim docs this
# disables character n-grams - confirm the values are not swapped.
modelCBOWFT = FastText(
    sentences=gensim.models.word2vec.LineSentence("path_to_data_corpus"),
    min_n=4,
    max_n=2)

# Cosine similarity between two words. Uses the keyed vectors (`wv`), like the
# other queries below; the model-level `similarity` is deprecated in
# gensim 3.x and removed in 4.0.
modelCBOWFT.wv.similarity('first_word', 'second_word')

# Top 5 most similar words to a given word, with their cosine similarities.
modelCBOWFT.wv.most_similar("word", topn=5)

# Word analogy query.
modelCBOWFT.wv.most_similar(
    positive=["first_positive_word", "second_positive_word"],
    negative=["negative_word"],
    topn=1)

# Intrinsic evaluation: syntactic, semantic and capital-country analogy scores.
print(
    f" syntactic score: - {modelCBOWFT.wv.evaluate_word_analogies('path_to_syntactic_inputs')[0]}"
)
print(