def test_vectors_similarity_TD(vocab, vectors):
    [(word1, vec1), (word2, vec2)] = vectors
    doc = Doc(vocab, words=[word1, word2])
    with pytest.warns(UserWarning):
        assert isinstance(doc.similarity(doc[0]), float)
        assert isinstance(doc[0].similarity(doc), float)
        assert doc.similarity(doc[0]) == doc[0].similarity(doc)

def test_doc_api_similarity_match():
    doc = Doc(Vocab(), words=["a"])
    assert doc.similarity(doc[0]) == 1.0
    assert doc.similarity(doc.vocab["a"]) == 1.0
    doc2 = Doc(doc.vocab, words=["a", "b", "c"])
    with pytest.warns(UserWarning):
        assert doc.similarity(doc2[:1]) == 1.0
        assert doc.similarity(doc2) == 0.0

def queryBySentOnePaper(bank, qsent, sent_tokens, distance, useCosine=False, badwords=None):
    if useCosine:
        # qdoc = Doc(bank.vocabulary, words=list(nlp(qsent)))
        qdoc = Doc(bank.vocabulary, words=my_token_analyzer(qsent, bank.model))
    else:
        qdoc = bank.model(qsent)
    hits = []
    if badwords is not None:
        badwords_ = badwords
    else:
        badwords_ = ['copyright', 'copy right', 'preprint', 'biorxiv', 'medrxiv',
                     'bioRxiv', 'medRxiv', 'acknowledgement', 'acknowledgements',
                     'https', 'http', 'palabra clave']
    for tokens in sent_tokens:
        tokens = [t for t in tokens if t not in badwords_]
        if len(tokens) > 2:
            refdoc = Doc(bank.vocabulary, words=tokens)
            try:
                wmddist = qdoc.similarity(refdoc)
                if useCosine:
                    # Cosine similarity: a higher score means more similar.
                    if wmddist >= distance:
                        hits.append([' '.join(tokens), wmddist])
                else:
                    # Word Mover's Distance: a lower score means more similar.
                    if wmddist <= distance:
                        hits.append([' '.join(tokens), wmddist])
            except Exception:
                # print(qsent, tokens)
                continue  # Yunchen Yang: replace number 1 with 'continue'
    return hits

def sentSimilarity(bank, sent1, sent2, useCosine=False):
    if useCosine:
        doc1 = Doc(bank.vocabulary, words=my_token_analyzer(sent1, bank.model))
        doc2 = Doc(bank.vocabulary, words=my_token_analyzer(sent2, bank.model))
    else:
        doc1 = bank.model(sent1)
        doc2 = bank.model(sent2)
    wmddist = doc1.similarity(doc2)
    return wmddist

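# A minimal usage sketch for the two functions above (an assumption, not part
# of the original code). `bank` is expected to expose a spaCy-style
# `vocabulary` and a `model` (e.g. a WMD-enabled pipeline), and `sent_tokens`
# is a list of token lists for one paper's sentences, both defined elsewhere
# in this project; the query strings here are purely illustrative.
def demo_similarity_query(bank, sent_tokens):
    score = sentSimilarity(bank, 'viral entry mechanism',
                           'mechanism of viral entry', useCosine=True)
    print('sentence-to-sentence similarity:', score)
    # With useCosine=True, `distance` acts as a minimum similarity threshold.
    hits = queryBySentOnePaper(bank, 'viral entry mechanism', sent_tokens,
                               distance=0.8, useCosine=True)
    for sentence, hit_score in hits:
        print(round(hit_score, 3), sentence)
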
def test_vectors_similarity_no_vectors():
    vocab = Vocab()
    doc1 = Doc(vocab, words=["a", "b"])
    doc2 = Doc(vocab, words=["c", "d", "e"])
    with pytest.warns(UserWarning):
        doc1.similarity(doc2)
    with pytest.warns(UserWarning):
        doc1.similarity(doc2[1])
    with pytest.warns(UserWarning):
        doc1.similarity(doc2[:2])
    with pytest.warns(UserWarning):
        doc2.similarity(doc1)
    with pytest.warns(UserWarning):
        doc2[1].similarity(doc1)
    with pytest.warns(UserWarning):
        doc2[:2].similarity(doc1)

def test_vectors_lexeme_doc_similarity(vocab, text):
    doc = Doc(vocab, words=text)
    lex = vocab[text[0]]
    assert lex.similarity(doc) == doc.similarity(lex)
    assert -1.0 < lex.similarity(doc) < 1.0

def test_vectors_lexeme_span_similarity(vocab, text):
    doc = Doc(vocab, words=text)
    lex = vocab[text[0]]
    assert lex.similarity(doc[1:3]) == doc[1:3].similarity(lex)
    assert -1.0 < doc.similarity(doc[1:3]) < 1.0

def test_vectors_token_doc_similarity(vocab, text):
    doc = Doc(vocab, words=text)
    assert doc[0].similarity(doc) == doc.similarity(doc[0])
    assert -1.0 < doc[0].similarity(doc) < 1.0

def test_vectors_doc_doc_similarity(vocab, text1, text2):
    doc1 = Doc(vocab, words=text1)
    doc2 = Doc(vocab, words=text2)
    assert doc1.similarity(doc2) == doc2.similarity(doc1)
    assert -1.0 < doc1.similarity(doc2) < 1.0

def test_vectors_span_doc_similarity(vocab, text):
    doc = Doc(vocab, words=text)
    with pytest.warns(UserWarning):
        assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2])
        assert -1.0 < doc[0:2].similarity(doc) < 1.0

def test_vectors_similarity_DD(vocab, vectors):
    [(word1, vec1), (word2, vec2)] = vectors
    doc1 = Doc(vocab, words=[word1, word2])
    doc2 = Doc(vocab, words=[word2, word1])
    assert isinstance(doc1.similarity(doc2), float)
    assert doc1.similarity(doc2) == doc2.similarity(doc1)

def test_vectors_similarity_DS(vocab, vectors):
    [(word1, vec1), (word2, vec2)] = vectors
    doc = Doc(vocab, words=[word1, word2])
    assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)

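# The tests above rely on pytest fixtures that are not shown in this excerpt.
# A minimal sketch of plausible definitions, inferred from how the tests use
# them (an assumption, not the original fixtures):
import numpy
import pytest
from spacy.tokens import Doc
from spacy.vocab import Vocab


@pytest.fixture
def vectors():
    return [("apple", [0.0, 1.0, 2.0]), ("orange", [3.0, -2.0, 4.0])]


@pytest.fixture
def vocab(vectors):
    # Build a vocab whose lexemes carry the word vectors above.
    vocab = Vocab()
    for word, vec in vectors:
        vocab.set_vector(word, numpy.asarray(vec, dtype="float32"))
    return vocab


@pytest.fixture
def text():
    return ["apple", "orange", "juice"]


@pytest.fixture
def text1():
    return ["apple", "and", "apple", "pie"]


@pytest.fixture
def text2():
    return ["orange", "juice"]
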
# compare 2 docs
doc1 = nlp("I like fast food")
doc2 = nlp("I like pizza")
print("Fast Food (doc) Similarity:", doc1.similarity(doc2))

# compare 2 tokens
doc = nlp("I like chicken thighs and legs")
token1 = doc[3]  # "thighs"
token2 = doc[5]  # "legs"
print("Thighs / Legs (token) Similarity:", token1.similarity(token2))

# compare doc w/ token
doc = nlp("I like pizza")
token = nlp("pasta")[0]
print("doc vs token:", doc.similarity(token))

# compare span w/ doc
span = nlp("I like burgers and fries")[2:5]
doc = nlp("McDonalds sells burgers")
print("span vs doc:", span.similarity(doc))

# --- word vectors ---
nlp = spacy.load('en_core_web_md')
doc = nlp("I have a banana")
# bananas_vector = doc[3].vector
# print("Banana Vector:", len(bananas_vector))
# print(bananas_vector)
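
# The UserWarning exercised in the tests above fires when similarity is
# computed without word vectors. A small sketch of guarding against that,
# using standard spaCy attributes (the sentences are illustrative):
doc = nlp("I like pizza")
token = nlp("pasta")[0]
if doc.vector_norm and token.has_vector:
    print("doc vs token:", doc.similarity(token))
else:
    print("no vectors available; similarity would fall back to weaker signals")
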
def main(chosen_model_no=0, num_items_displayed=10, use_spacy=False, use_soft_cosine_similarity=False,
         num_topics=None, no_below=5, no_above=0.5, normalize_vectors=False):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    if num_topics is None:
        num_topics = 100

    possible_model_names = [
        'tf_idf',                 # 0
        'lsi_bow', 'lsi_tf_idf',  # 1, 2
        'rp_bow', 'rp_tf_idf',    # 3, 4
        'lda_bow', 'lda_tf_idf',  # 5, 6
        'hdp_bow', 'hdp_tf_idf',  # 7, 8
        'word2vec',               # 9
    ]
    chosen_model_name = possible_model_names[chosen_model_no]
    print(chosen_model_name)

    game_names, _ = load_game_names(include_genres=False, include_categories=False)

    steam_tokens = load_tokens()

    nlp = spacy.load('en_core_web_lg')

    documents = list(steam_tokens.values())

    dct = Dictionary(documents)
    print(len(dct))
    dct.filter_extremes(no_below=no_below, no_above=no_above)
    print(len(dct))

    corpus = [dct.doc2bow(doc) for doc in documents]

    # Pre-processing
    # Caveat: the leading underscore is important. Do not use this pre-processing if the chosen model is Tf-Idf!
    pre_process_corpus_with_tf_idf = chosen_model_name.endswith('_tf_idf')
    tfidf_model = TfidfModel(corpus, id2word=dct, normalize=normalize_vectors)

    if pre_process_corpus_with_tf_idf:
        print('Corpus as Tf-Idf')
        pre_processed_corpus = tfidf_model[corpus]
    else:
        print('Corpus as Bag-of-Words')
        pre_processed_corpus = corpus

    # Model
    model = None
    wv = None
    index2word_set = None

    if chosen_model_name == 'tf_idf':
        print('Term Frequency * Inverse Document Frequency (Tf-Idf)')
        model = tfidf_model

    elif chosen_model_name.startswith('lsi'):
        print('Latent Semantic Indexing (LSI/LSA)')
        model = LsiModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)

    elif chosen_model_name.startswith('rp'):
        print('Random Projections (RP)')
        model = RpModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)

    elif chosen_model_name.startswith('lda'):
        print('Latent Dirichlet Allocation (LDA)')
        model = LdaModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)

    elif chosen_model_name.startswith('hdp'):
        print('Hierarchical Dirichlet Process (HDP)')
        model = HdpModel(pre_processed_corpus, id2word=dct)

    elif chosen_model_name == 'word2vec':
        use_a_lot_of_ram = False

        if use_a_lot_of_ram:
            model = None

            print('Loading Word2Vec based on Google News')
            # Warning: this takes a lot of time and uses a ton of RAM!
            wv = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)
        else:
            if use_spacy:
                print('Using Word2Vec with spaCy')
            else:
                print('Training Word2Vec')

                model = Word2Vec(documents)
                wv = model.wv

        if not use_spacy:
            wv.init_sims(replace=normalize_vectors)
            index2word_set = set(wv.index2word)

    else:
        print('No model specified.')
        model = None

    if chosen_model_name != 'word2vec':
        if not use_soft_cosine_similarity:
            index = MatrixSimilarity(model[pre_processed_corpus], num_best=10, num_features=len(dct))
        else:
            w2v_model = Word2Vec(documents)
            similarity_index = WordEmbeddingSimilarityIndex(w2v_model.wv)
            similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dct, tfidf_model, nonzero_limit=100)
            index = SoftCosineSimilarity(model[pre_processed_corpus], similarity_matrix)
    else:
        index = None

    query_app_ids = load_benchmarked_app_ids(append_hard_coded_app_ids=True)

    app_ids = list(int(app_id) for app_id in steam_tokens.keys())

    matches_as_app_ids = []

    for query_count, query_app_id in enumerate(query_app_ids):
        print('[{}/{}] Query appID: {} ({})'.format(query_count + 1, len(query_app_ids),
                                                    query_app_id, get_app_name(query_app_id, game_names)))

        query = steam_tokens[str(query_app_id)]

        if use_spacy:
            spacy_query = Doc(nlp.vocab, words=query)
        else:
            spacy_query = None

        if chosen_model_name != 'word2vec':
            vec_bow = dct.doc2bow(query)
            if pre_process_corpus_with_tf_idf:
                pre_processed_vec = tfidf_model[vec_bow]
            else:
                pre_processed_vec = vec_bow
            vec_lsi = model[pre_processed_vec]
            sims = index[vec_lsi]

            if use_soft_cosine_similarity:
                sims = enumerate(sims)

            similarity_scores_as_tuples = [(str(app_ids[i]), sim) for (i, sim) in sims]
            similarity_scores = reformat_similarity_scores_for_doc2vec(similarity_scores_as_tuples)
        else:
            if use_spacy:
                similarity_scores = {}
                for app_id in steam_tokens:
                    reference_sentence = steam_tokens[app_id]
                    spacy_reference = Doc(nlp.vocab, words=reference_sentence)
                    similarity_scores[app_id] = spacy_query.similarity(spacy_reference)
            else:
                query_sentence = filter_out_words_not_in_vocabulary(query, index2word_set)

                similarity_scores = {}

                counter = 0
                num_games = len(steam_tokens)

                for app_id in steam_tokens:
                    counter += 1

                    if (counter % 1000) == 0:
                        print('[{}/{}] appID = {} ({})'.format(counter, num_games, app_id, game_names[app_id]))

                    reference_sentence = steam_tokens[app_id]
                    reference_sentence = filter_out_words_not_in_vocabulary(reference_sentence, index2word_set)

                    try:
                        similarity_scores[app_id] = wv.n_similarity(query_sentence, reference_sentence)
                    except ZeroDivisionError:
                        similarity_scores[app_id] = 0

        similar_app_ids = print_most_similar_sentences(similarity_scores,
                                                       num_items_displayed=num_items_displayed,
                                                       verbose=False)

        matches_as_app_ids.append(similar_app_ids)

    print_ranking(query_app_ids, matches_as_app_ids, only_print_banners=True)

    return
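
# A minimal sketch of how main() might be invoked; the argument values are
# illustrative assumptions, not settings from the original experiments.
if __name__ == '__main__':
    # Model no. 9 is 'word2vec'; with use_spacy=True, similarities come from
    # spaCy's en_core_web_lg vectors instead of a freshly trained Word2Vec model.
    main(chosen_model_no=9, num_items_displayed=10, use_spacy=True)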