def walid_similarity_query(self, answer: str, key: str):
    if len(answer) == 0 or len(key) == 0:
        return False
    if self.model_ready:
        documents = [answer, key]
        if self.verbose:
            print(f'{len(documents)} documents loaded and ready to preprocess')
        corpus = [self.preprocess(document) for document in documents]
        if self.verbose:
            print(f'{len(corpus)} documents loaded into corpus')
        dictionary = Dictionary(corpus)
        tfidf = TfidfModel(dictionary=dictionary)
        similarity_matrix = SparseTermSimilarityMatrix(
            self.similarity_index, dictionary, tfidf)
        answer_bow = dictionary.doc2bow(self.preprocess(answer))
        key_bow = dictionary.doc2bow(self.preprocess(key))
        # Measure soft cosine similarity between the two bag-of-words vectors.
        scores = similarity_matrix.inner_product(answer_bow, key_bow,
                                                 normalized=True)
        return scores
    else:
        raise NotReadyError('Word embedding model is not ready.')
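# Usage sketch for the method above. The enclosing class is not part of the
# snippet, so ``SimilarityChecker`` is a hypothetical stand-in assumed to
# provide model_ready, verbose, preprocess(), and similarity_index:
checker = SimilarityChecker()  # hypothetical enclosing class
score = checker.walid_similarity_query(
    answer='the treaty was signed in 1848',
    key='the agreement was signed in 1848')
print('soft cosine similarity = %.4f' % score)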
def W2VH():
    # Collect lowercased tokens from the Brown corpus (mystery category) and
    # split the token list into two halves to compare against each other.
    docbrown = [w.lower() for w in brown.words(categories='mystery')]
    docbrown1, docbrown2 = (docbrown[:len(docbrown) // 2],
                            docbrown[len(docbrown) // 2:])
    stop_words = stopwords.words('english')
    docbrown1 = [w for w in docbrown1 if w not in stop_words]
    docbrown2 = [w for w in docbrown2 if w not in stop_words]
    documents = [docbrown1, docbrown2]
    dictionary = corpora.Dictionary(documents)
    docbrown1 = dictionary.doc2bow(docbrown1)
    docbrown2 = dictionary.doc2bow(docbrown2)
    # Toy embedding model trained on gensim's common_texts; Brown terms missing
    # from its vocabulary only contribute exact-match similarity.
    model = Word2Vec(common_texts, vector_size=20, min_count=1)  # `vector_size` in gensim>=4 (formerly `size`)
    termsim_index = WordEmbeddingSimilarityIndex(model.wv)
    similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)
    similarity = similarity_matrix.inner_product(docbrown1, docbrown2,
                                                 normalized=True)
    print('= %.4f' % similarity)
class Similarity:
    def __init__(self):
        self.stop_words = stopwords.words('english')
        self.w2v_model = api.load("glove-wiki-gigaword-50")
        self.similarity_index = WordEmbeddingSimilarityIndex(self.w2v_model)

    def make_document(self, headline, articles):
        temp = []
        headline = [w for w in headline.lower().split()
                    if w not in self.stop_words]
        for article in articles:
            article = [w for w in article.lower().split()
                       if w not in self.stop_words]
            temp.append(article)
        self.documents = [headline] + temp
        dictionary = corpora.Dictionary(self.documents)
        self.similarity_matrix = SparseTermSimilarityMatrix(
            self.similarity_index, dictionary)
        headline = dictionary.doc2bow(headline)
        articles = [dictionary.doc2bow(i) for i in temp]
        similarities = []
        for i in articles:
            similarities.append(self.get_similarity(headline, i))
        return similarities

    def get_similarity(self, s1, s2):
        return self.similarity_matrix.inner_product(s1, s2, normalized=True)
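# A minimal usage sketch for the Similarity class above; the headline and
# article strings are illustrative data, not from the original code:
sim = Similarity()
headline = 'Central bank raises interest rates'
articles = [
    'The central bank announced a rate hike on Tuesday.',
    'A local team won the championship game last night.',
]
scores = sim.make_document(headline, articles)
print(scores)  # one soft cosine score per article; higher means more related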
class Word2VecSeqVect(BasicSequenceVectorization):

    def __init__(self, params):
        super().__init__(params)
        self.new_model = gensim.models.Word2Vec.load(
            params['path_to_trained_model'])
        self.new_model.init_sims(replace=True)  # Normalize the vectors in the word2vec model.
        # Compute cosine similarities between word embeddings and retrieve the
        # closest word embeddings by cosine similarity for a given word embedding.
        self.similarity_index = WordEmbeddingSimilarityIndex(self.new_model.wv)
        # Build a term similarity matrix used to compute the Soft Cosine Measure.
        self.similarity_matrix = SparseTermSimilarityMatrix(
            self.similarity_index, self.dictionary)
        self.dict_distance_dispatcher = {
            DistanceMetric.COS: self.cos_scipy,
            SimilarityMetric.Pearson: self.pearson_abs_scipy,
            DistanceMetric.WMD: self.wmd_gensim,
            DistanceMetric.SCM: self.scm_gensim
        }

    def wmd_gensim(self, sentence_a, sentence_b):
        wmd = self.new_model.wv.wmdistance(sentence_a, sentence_b)
        return [wmd, self.wmd_similarity(wmd)]

    def wmd_similarity(self, dist):
        return 1. / (1. + float(dist))  # Associated similarity

    def scm_gensim(self, sentence_a, sentence_b):
        '''Compute the Soft Cosine Measure with Gensim.'''
        # Convert the sentences into bag-of-words vectors.
        sentence_1 = self.dictionary.doc2bow(sentence_a)
        sentence_2 = self.dictionary.doc2bow(sentence_b)
        # Return the inner product between vectors expressed in a non-orthogonal
        # normalized basis, where the dot product between the basis vectors is
        # given by the sparse term similarity matrix.
        scm_similarity = self.similarity_matrix.inner_product(
            sentence_1, sentence_2, normalized=True)
        return [1 - scm_similarity, scm_similarity]

    def distance(self, metric_list, link):
        '''Iterate over the metrics.'''
        # The computation of sentences could be moved into wmd_gensim if it
        # cannot be generalized for the remaining metrics.
        sentence_a = self.df_source[self.df_source['ids'].str.contains(
            link[0])]['text'].values[0].split()
        sentence_b = self.df_target[self.df_target['ids'].str.contains(
            link[1])]['text'].values[0].split()
        dist = [self.dict_distance_dispatcher[metric](sentence_a, sentence_b)
                for metric in metric_list]
        logging.info("Computed distances or similarities " + str(link) + str(dist))
        return functools.reduce(lambda a, b: a + b, dist)  # Always return a list
def calculate_soft_cosine_similarity(self, topic_models, sentences, *args, **kwargs):
    topic_claim_relations = {topic: [] for topic in topic_models}
    documents = []
    for topic in topic_models:
        documents.append(topic.lower().split())
    for sentence in sentences:
        documents.append(sentence.lower().split())
    dictionary = corpora.Dictionary(documents)
    w2v_model = api.load("glove-wiki-gigaword-100")
    similarity_index = WordEmbeddingSimilarityIndex(w2v_model)
    similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary)
    stop_words = stopwords.words('english')
    for sentence in sentences:
        best_cosine_result = 0
        matched_topic = None
        normal_sentence = sentence
        sentence = [w for w in sentence.lower().split() if w not in stop_words]
        sentence_bow = dictionary.doc2bow(sentence)
        for topic in topic_models:
            topic_model = [w for w in topic.lower().split()
                           if w not in stop_words]
            topic_model_bow = dictionary.doc2bow(topic_model)
            similarity = similarity_matrix.inner_product(
                topic_model_bow, sentence_bow, normalized=True)
            print('similarity = %.4f' % similarity)
            if similarity > best_cosine_result:
                best_cosine_result = similarity
                matched_topic = topic
        # Only assign the sentence to its best topic if the similarity is high enough.
        if matched_topic is not None and best_cosine_result > 0.3:
            topic_claim_relations[matched_topic].append(normal_sentence)
    return topic_claim_relations
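# Illustrative call for the method above (hypothetical data; ``analyzer``
# stands in for an instance of the enclosing class, which the snippet
# does not show):
topic_models = ['climate change policy', 'sports results']
sentences = ['The government announced new emissions targets.',
             'The home team lost by two goals last night.']
relations = analyzer.calculate_soft_cosine_similarity(topic_models, sentences)
# -> {'climate change policy': [...], 'sports results': [...]}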
def calculate_softcosine_w2v(test_data):
    data = [i.split() for i in (test_data.text).tolist()]
    dictionary = corpora.Dictionary(data)
    corpus = [dictionary.doc2bow(d) for d in data]
    similarity_index = WordEmbeddingSimilarityIndex(w2v_model)
    similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary)
    # Pairwise soft cosine similarity matrix over all documents.
    softsim_w2v_matrix = np.empty(shape=(len(data), len(data))) * np.nan
    for d1 in range(len(data)):
        for d2 in range(len(data)):
            softsim_w2v_matrix[d1, d2] = similarity_matrix.inner_product(
                corpus[d1], corpus[d2], normalized=True)
    doc_sim_max_index, doc_sim_max_values = calculate_max_similarity(
        softsim_w2v_matrix)
    softsim_w2v_df = export_result(test_data, doc_sim_max_index,
                                   doc_sim_max_values, 'softsim_w2v')
    print("Soft cosine similarity with w2v vectors has been calculated!")
    return softsim_w2v_df
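# Illustrative call for the function above. The original snippet relies on a
# module-level ``w2v_model`` plus ``calculate_max_similarity`` and
# ``export_result`` helpers defined elsewhere in the project; the model load
# below is an assumption for the sketch.
import pandas as pd
import gensim.downloader as api

w2v_model = api.load('glove-wiki-gigaword-50')
test_data = pd.DataFrame({'text': ['the cat sat on the mat',
                                   'a cat rested on a rug',
                                   'stocks fell sharply today']})
softsim_df = calculate_softcosine_w2v(test_data)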
corpus_neg_reviews = [neg_dictionary.doc2bow(text)
                      for text in list(df_negative_sentences['review_sentence_cleaned'])]
corpus_neg_topics = [neg_dictionary.doc2bow(text) for text in topics_cleaned]

# build similarity matrix of word embeddings
print('Building similarity matrix of word embeddings. Might take a few minutes...')
termsim_index = WordEmbeddingSimilarityIndex(fasttext_model300)
similarity_matrix = SparseTermSimilarityMatrix(termsim_index, neg_dictionary)
print('done')

# compute soft cosine similarity between sentences and topics
print('Computing soft cosine similarity between sentences and topics. Might take a few minutes...')
neg_data_topics = []
for review_item in corpus_neg_reviews:
    review_item_topics = []
    for topic in corpus_neg_topics:
        review_item_topics.append(
            similarity_matrix.inner_product(review_item, topic, normalized=True))
    neg_data_topics.append(review_item_topics)
print('done')

# extract topic with highest soft cosine similarity
# I set a minimum threshold (0.10) that needs to be reached in order to assign a topic.
# If above-threshold topics are within 0.01, I assign -1 (i.e. no main topic)
neg_data_closest_topic = []
cossim_threshold = 0.10
for review_item_topic_list in neg_data_topics:
    if max(review_item_topic_list) > cossim_threshold:
        review_item_array = np.array(review_item_topic_list)
        sorted_review_item_array = sorted(review_item_array, reverse=True)
        num_topics = 1
        # The original snippet is truncated below; this completion follows the
        # rule stated in the comment above (count topics within 0.01 of the best).
        for item in sorted_review_item_array[1:]:
            if sorted_review_item_array[0] - item < 0.01:
                num_topics += 1
        if num_topics > 1:
            neg_data_closest_topic.append(-1)  # several near-ties: no main topic
        else:
            neg_data_closest_topic.append(int(np.argmax(review_item_array)))
    else:
        neg_data_closest_topic.append(-1)  # below threshold: no topic assigned
class Word2VecSeqVect(BasicSequenceVectorization):

    def __init__(self, params):
        super().__init__(params)
        self.new_model = gensim.models.Word2Vec.load(
            params['path_to_trained_model'])
        self.new_model.init_sims(replace=True)  # Normalize the vectors in the word2vec model.
        # Compute cosine similarities between word embeddings and retrieve the
        # closest word embeddings by cosine similarity for a given word embedding.
        self.similarity_index = WordEmbeddingSimilarityIndex(self.new_model.wv)
        # Build a term similarity matrix used to compute the Soft Cosine Measure.
        self.similarity_matrix = SparseTermSimilarityMatrix(
            self.similarity_index, self.dictionary)
        self.dict_distance_dispatcher = {
            DistanceMetric.COS: self.cos_scipy,
            SimilarityMetric.Pearson: self.pearson_abs_scipy,
            DistanceMetric.WMD: self.wmd_gensim,
            DistanceMetric.SCM: self.scm_gensim,
            EntropyMetric.MSI_I: self.msi
        }

    def wmd_gensim(self, sentence_a, sentence_b):
        wmd = self.new_model.wv.wmdistance(sentence_a, sentence_b)
        return [wmd, self.wmd_similarity(wmd)]

    def wmd_similarity(self, dist):
        return 1. / (1. + float(dist))  # Associated similarity

    def scm_gensim(self, sentence_a, sentence_b):
        '''Compute the Soft Cosine Measure with Gensim.'''
        # Convert the sentences into bag-of-words vectors.
        sentence_1 = self.dictionary.doc2bow(sentence_a)
        sentence_2 = self.dictionary.doc2bow(sentence_b)
        # Return the inner product between vectors expressed in a non-orthogonal
        # normalized basis, where the dot product between the basis vectors is
        # given by the sparse term similarity matrix.
        scm_similarity = self.similarity_matrix.inner_product(
            sentence_1, sentence_2, normalized=True)
        return [1 - scm_similarity, scm_similarity]

    def msi(self, sentence_a, sentence_b):
        '''@danaderp Minimum Shared Information'''
        token_counts_1 = self.get_cnts(sentence_a, self.vocab)
        token_counts_2 = self.get_cnts(sentence_b, self.vocab)
        logging.info('token count processed')
        # Minimum shared tokens
        token_counts = {token: min(token_counts_1[token], token_counts_2[token])
                        for token in self.vocab}
        alphabet = list(set(token_counts.keys()))
        frequencies = self.get_freqs(token_counts)
        logging.info('frequencies processed')
        if not frequencies:  # the list is empty
            entropies = float('nan')
            extropies = float('nan')
        else:
            scalar_distribution = dit.ScalarDistribution(alphabet, frequencies)
            logging.info('scalar_distribution processed')
            entropies = dit.shannon.entropy(scalar_distribution)
            logging.info('entropies processed')
            extropies = dit.other.extropy(scalar_distribution)
            logging.info('extropies processed')
        return [entropies, extropies]

    def distance(self, metric_list, link):
        '''Iterate over the metrics.'''
        # The computation of sentences could be moved into wmd_gensim if it
        # cannot be generalized for the remaining metrics.
        ids = self.params['system_path_config']['names'][0]  # fixed: was the global `parameters`
        txt = self.params['system_path_config']['names'][1]
        if self.params['system_path_config']['prep'] == Preprocessing.conv:
            # conventional preprocessing
            sentence_a = self.df_source[self.df_source[ids].str.contains(
                link[0])][txt].values[0].split()
            sentence_b = self.df_target[self.df_target[ids].str.contains(
                link[1])][txt].values[0].split()
        elif self.params['system_path_config']['prep'] == Preprocessing.bpe:
            sentence_a = eval(self.df_source[self.df_source[ids].str.contains(
                link[0])][txt].values[0])
            sentence_b = eval(self.df_target[self.df_target[ids].str.contains(
                link[1])][txt].values[0])
        dist = [self.dict_distance_dispatcher[metric](sentence_a, sentence_b)
                for metric in metric_list]
        logging.info("Computed distances or similarities " + str(link) + str(dist))
        return functools.reduce(lambda a, b: a + b, dist)  # Always return a list

    # TODO: substitute this block in the future by importing an information science module
    def get_cnts(self, toks, vocab):
        '''@danaderp Count tokens within ONE document.'''
        cnt = Counter(vocab)
        for tok in toks:
            cnt[tok] += 1
        return cnt

    def get_freqs(self, dict_token_counts):
        num_tokens = sum(dict_token_counts.values())  # number of subwords inside the document
        if num_tokens == 0.0:
            frequencies = []
            logging.info('---------------> NO SHARED INFORMATION <-------------------------')
        else:
            frequencies = [dict_token_counts[token] / num_tokens
                           for token in dict_token_counts]
        return frequencies
def test_inner_product(self):
    """Test the inner product."""
    matrix = SparseTermSimilarityMatrix(
        UniformTermSimilarityIndex(self.dictionary, term_similarity=0.5),
        self.dictionary)

    # check that zero vectors work as expected
    vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
    vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
    self.assertEqual(0.0, matrix.inner_product([], vec2))
    self.assertEqual(0.0, matrix.inner_product(vec1, []))
    self.assertEqual(0.0, matrix.inner_product([], []))
    self.assertEqual(0.0, matrix.inner_product([], vec2, normalized=True))
    self.assertEqual(0.0, matrix.inner_product(vec1, [], normalized=True))
    self.assertEqual(0.0, matrix.inner_product([], [], normalized=True))

    # check that real-world vectors work as expected
    vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
    vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
    expected_result = 0.0
    expected_result += 2 * 1.0 * 1  # government * s_{ij} * government
    expected_result += 2 * 0.5 * 1  # government * s_{ij} * holiday
    expected_result += 1 * 0.5 * 1  # denied * s_{ij} * government
    expected_result += 1 * 0.5 * 1  # denied * s_{ij} * holiday
    result = matrix.inner_product(vec1, vec2)
    self.assertAlmostEqual(expected_result, result, places=5)

    vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
    vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
    expected_result = matrix.inner_product(vec1, vec2)
    expected_result /= math.sqrt(matrix.inner_product(vec1, vec1))
    expected_result /= math.sqrt(matrix.inner_product(vec2, vec2))
    result = matrix.inner_product(vec1, vec2, normalized=True)
    self.assertAlmostEqual(expected_result, result, places=5)

    # check that real-world (vector, corpus) pairs work as expected
    vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
    vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
    expected_result = 0.0
    expected_result += 2 * 1.0 * 1  # government * s_{ij} * government
    expected_result += 2 * 0.5 * 1  # government * s_{ij} * holiday
    expected_result += 1 * 0.5 * 1  # denied * s_{ij} * government
    expected_result += 1 * 0.5 * 1  # denied * s_{ij} * holiday
    expected_result = numpy.full((1, 2), expected_result)
    result = matrix.inner_product(vec1, [vec2] * 2)
    self.assertTrue(isinstance(result, numpy.ndarray))
    self.assertTrue(numpy.allclose(expected_result, result))

    vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
    vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
    expected_result = matrix.inner_product(vec1, vec2)
    expected_result /= math.sqrt(matrix.inner_product(vec1, vec1))
    expected_result /= math.sqrt(matrix.inner_product(vec2, vec2))
    expected_result = numpy.full((1, 2), expected_result)
    result = matrix.inner_product(vec1, [vec2] * 2, normalized=True)
    self.assertTrue(isinstance(result, numpy.ndarray))
    self.assertTrue(numpy.allclose(expected_result, result))

    # check that real-world (corpus, vector) pairs work as expected
    vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
    vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
    expected_result = 0.0
    expected_result += 2 * 1.0 * 1  # government * s_{ij} * government
    expected_result += 2 * 0.5 * 1  # government * s_{ij} * holiday
    expected_result += 1 * 0.5 * 1  # denied * s_{ij} * government
    expected_result += 1 * 0.5 * 1  # denied * s_{ij} * holiday
    expected_result = numpy.full((3, 1), expected_result)
    result = matrix.inner_product([vec1] * 3, vec2)
    self.assertTrue(isinstance(result, numpy.ndarray))
    self.assertTrue(numpy.allclose(expected_result, result))

    vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
    vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
    expected_result = matrix.inner_product(vec1, vec2)
    expected_result /= math.sqrt(matrix.inner_product(vec1, vec1))
    expected_result /= math.sqrt(matrix.inner_product(vec2, vec2))
    expected_result = numpy.full((3, 1), expected_result)
    result = matrix.inner_product([vec1] * 3, vec2, normalized=True)
    self.assertTrue(isinstance(result, numpy.ndarray))
    self.assertTrue(numpy.allclose(expected_result, result))

    # check that real-world corpora work as expected
    vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
    vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
    expected_result = 0.0
    expected_result += 2 * 1.0 * 1  # government * s_{ij} * government
    expected_result += 2 * 0.5 * 1  # government * s_{ij} * holiday
    expected_result += 1 * 0.5 * 1  # denied * s_{ij} * government
    expected_result += 1 * 0.5 * 1  # denied * s_{ij} * holiday
    expected_result = numpy.full((3, 2), expected_result)
    result = matrix.inner_product([vec1] * 3, [vec2] * 2)
    self.assertTrue(isinstance(result, scipy.sparse.csr_matrix))
    self.assertTrue(numpy.allclose(expected_result, result.todense()))

    vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
    vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
    expected_result = matrix.inner_product(vec1, vec2)
    expected_result /= math.sqrt(matrix.inner_product(vec1, vec1))
    expected_result /= math.sqrt(matrix.inner_product(vec2, vec2))
    expected_result = numpy.full((3, 2), expected_result)
    result = matrix.inner_product([vec1] * 3, [vec2] * 2, normalized=True)
    self.assertTrue(isinstance(result, scipy.sparse.csr_matrix))
    self.assertTrue(numpy.allclose(expected_result, result.todense()))
# a term similarity matrix using the embeddings.
#
# .. Important::
#   The embeddings we have chosen here require a lot of memory.
#
import gensim.downloader as api
model = api.load('word2vec-google-news-300')

from gensim.similarities import SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex
termsim_index = WordEmbeddingSimilarityIndex(model)
termsim_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary, tfidf)

###############################################################################
# So let's compute SCM using the ``inner_product`` method.
#
similarity = termsim_matrix.inner_product(sentence_obama, sentence_president, normalized=(True, True))
print('similarity = %.4f' % similarity)

###############################################################################
# Let's try the same thing with two completely unrelated sentences.
# Notice that the similarity is smaller.
#
similarity = termsim_matrix.inner_product(sentence_obama, sentence_orange, normalized=(True, True))
print('similarity = %.4f' % similarity)

###############################################################################
#
# References
# ----------
#
# 1. Grigori Sidorov et al. *Soft Similarity and Soft Cosine Measure: Similarity of Features in Vector Space Model*, 2014.
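###############################################################################
# For reference, here is a self-contained sketch of the same pipeline on toy
# data (not part of the original tutorial; it uses the small
# ``glove-wiki-gigaword-50`` vectors so it runs quickly, and the two sentences
# are illustrative):
#
import gensim.downloader as api
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex

glove = api.load('glove-wiki-gigaword-50')
texts = [['obama', 'speaks', 'media', 'illinois'],
         ['president', 'greets', 'press', 'chicago']]
toy_dictionary = Dictionary(texts)
toy_tfidf = TfidfModel(dictionary=toy_dictionary)
toy_index = WordEmbeddingSimilarityIndex(glove)
toy_matrix = SparseTermSimilarityMatrix(toy_index, toy_dictionary, toy_tfidf)
bow1, bow2 = (toy_dictionary.doc2bow(t) for t in texts)
print('similarity = %.4f' % toy_matrix.inner_product(bow1, bow2, normalized=(True, True)))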
def retrieve_paragraphs(self, file_name, question_input, similarity_threshold):
    # Read the file paragraph by paragraph.
    with open(file_name, "r", encoding="utf-8") as fp:
        documents = fp.readlines()
    # Append the question to the end of the document so that everything is
    # pre-processed at once.
    documents.append(question_input)
    # Convert everything to lower case for better processing.
    documents = [doc.lower() for doc in documents]
    # Remove stopwords.
    stop_words = set(stopwords.words('english'))
    documents_array = []
    self.remove_stopwords(documents_array, stop_words, documents)
    # Prepare a dictionary and a corpus: a Dictionary is a mapping between
    # tokens and their integer ids.
    dictionary = corpora.Dictionary(documents_array)
    # Convert the sentences into bag-of-words vectors.
    sentences = []
    self.convert_sentences(sentences, dictionary, documents_array)
    # Another way of computing the similarity matrix:
    # similarity_index = WordEmbeddingSimilarityIndex(self.loaded_model)
    similarity_index = WordEmbeddingSimilarityIndex.load('sparse_model.pkl')
    similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary)

    result_matrix = []
    result_dict = OrderedDict()
    question_bow = sentences[-1]
    for index in range(len(sentences) - 1):
        result_matrix.append(
            similarity_matrix.inner_product(question_bow, sentences[index],
                                            normalized=True))
    for index, result in enumerate(result_matrix):
        result_dict[index] = result
    result_dict = OrderedDict(
        sorted(result_dict.items(), key=itemgetter(1), reverse=True))
    # Indices of the five most similar paragraphs.
    para_index = list(result_dict.keys())[1:6]
    print(" -- FILE NAME: ", file_name)
    similarity_value = list(result_dict.values())[1]
    count = 0
    doc_paragraph = ''
    for doc in documents:
        if doc != '\n':
            if count in para_index:
                print(" -- This is the paragraph: ")
                print(doc)
                doc_paragraph = doc_paragraph + ' ' + doc
            count += 1
    answer = "There is no answer for this document!"
    print(" -- This is the span of words:")
    if similarity_value > similarity_threshold:
        predictor = Predictor.from_path(
            "https://storage.googleapis.com/allennlp-public-models/bidaf-elmo-model-2018.11.30-charpad.tar.gz")
        prediction = predictor.predict(question=question_input,
                                       passage=doc_paragraph)
        answer = prediction["best_span_str"]
    else:
        print("\033[44;33m%s!\033[m" % answer)
        print("------------------------")
    doc_paragraph = doc_paragraph.rstrip("\n")
    wrapper = textwrap.TextWrapper(width=190)
    word_list = wrapper.fill(text=doc_paragraph)
    word_list = word_list.replace(answer, '\033[44;33m{}\033[m'.format(answer))
    print(word_list)
    return word_list
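# Hypothetical invocation of the method above (assumes an instance ``qa`` of
# the enclosing class, with its remove_stopwords/convert_sentences helpers and
# the saved 'sparse_model.pkl' index available, plus a newline-separated
# text file):
span = qa.retrieve_paragraphs('document.txt',
                              'When was the treaty signed?',
                              similarity_threshold=0.3)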