from os import path

from gensim import corpora, models
from gensim.matutils import cossim

# Project-local imports are assumed here; the exact module paths are not
# shown in this snippet: sent2stokens_wostop, create_lists, print_results,
# LoadEmbeddings, avg_embedding_similarity, CorpusReader, myConstants.


def tfidf_sim(self, train_data, body_dict, threshold):
    '''
    :param train_data: a list of training samples of type ['headline', 'bodyID', 'stance']
    :param body_dict: a dictionary of values containing {bodyID: 'bodyText'}
    :param threshold: used to distinguish between similar and not similar
    '''
    bodyText_list = list(body_dict.values())
    bodyIds_index = {k: index for index, k in enumerate(body_dict.keys())}

    # Tokenize the bodies (stopwords removed) and build a TF-IDF model
    # over the bag-of-words corpus of all body texts.
    bodyText_w = [sent2stokens_wostop(text) for text in bodyText_list]
    vocab = corpora.Dictionary(bodyText_w)
    corporaBody_bow = [vocab.doc2bow(text) for text in bodyText_w]
    tfidf_model = models.TfidfModel(corporaBody_bow)

    unrelated, related, y_true, y_pred = [], [], [], []
    for headline, bodyID, stance in train_data:
        headline_bow = vocab.doc2bow(sent2stokens_wostop(headline))
        headline_tfidf = tfidf_model[headline_bow]
        corporaBody_tfidf = tfidf_model[corporaBody_bow[bodyIds_index[bodyID]]]
        sim = cossim(headline_tfidf, corporaBody_tfidf)
        unrelated, related, y_true, y_pred = create_lists(
            sim, stance, threshold, [unrelated, related, y_true, y_pred])

    print_results([unrelated, related, y_true, y_pred], self.model_type)
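# --- Example (not part of the original module) ----------------------------
# A minimal, self-contained sketch of the gensim TF-IDF pipeline that
# tfidf_sim() builds on. The toy corpus and the plain str.split() tokenizer
# stand in for the FNC-1 bodies and sent2stokens_wostop(); everything else
# is the standard gensim API.
def _tfidf_pipeline_demo():
    from gensim import corpora, models
    from gensim.matutils import cossim

    bodies = ["the cat sat on the mat", "stocks fell sharply on monday"]
    tokenized = [body.split() for body in bodies]

    vocab = corpora.Dictionary(tokenized)
    bow_corpus = [vocab.doc2bow(tokens) for tokens in tokenized]
    tfidf = models.TfidfModel(bow_corpus)

    headline_vec = tfidf[vocab.doc2bow("cat on a mat".split())]
    # The first body shares the informative terms "cat" and "mat" with the
    # headline, so it should score higher than the second.
    return [cossim(headline_vec, tfidf[bow]) for bow in bow_corpus]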
def order_by_tf_id_rank(self, headline, sentences, number_of_sentences):
    headline_bow = self.vocab.doc2bow(sent2stokens_wostop(headline))
    headline_tfidf = self.tfidf_model[headline_bow]

    # Replace newlines with blanks, since the punkt tokenizer does not
    # recognize ".\n" as a sentence boundary.
    sentences = sentences.replace('\n', ' ')

    # Score each sentence of the body against the headline.
    scored_sentences = []
    for sentence in self.tokenizer.tokenize(sentences):
        # Apply the TF-IDF transform so both vectors live in the same space.
        sentence_tfidf = self.tfidf_model[self.vocab.doc2bow(sent2stokens_wostop(sentence))]
        sim = cossim(headline_tfidf, sentence_tfidf)
        scored_sentences.append([sentence, sim])
    sorted_sentences = sorted(scored_sentences, key=lambda scored: scored[1], reverse=True)

    # Concatenate the number_of_sentences highest-scoring sentences.
    sentences_string = ""
    for current_sentence_number, (sentence, _) in enumerate(sorted_sentences, start=1):
        sentences_string += sentence + ' '
        if current_sentence_number == number_of_sentences:
            break
    return sentences_string
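# order_by_tf_id_rank() expects self.tokenizer to be an NLTK punkt sentence
# tokenizer. One plausible setup (an assumption; the original construction is
# not shown in this snippet):
def _make_sentence_tokenizer():
    import nltk
    nltk.download('punkt', quiet=True)  # fetch the model once; no-op afterwards
    return nltk.data.load('tokenizers/punkt/english.pickle')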
def avg_embed(self, train_data, body_dict, threshold):
    # Load the pre-trained word embeddings used for the averaged-embedding
    # similarity baseline.
    embeddings = LoadEmbeddings(filepath=self.embeddPath, data_path=self.embeddData,
                                vocab_size=self.vocab_size, embedding_size=self.embedding_size)

    bodyText_list = list(body_dict.values())
    bodyIds_index = {k: index for index, k in enumerate(body_dict.keys())}
    bodyText_w = [sent2stokens_wostop(text) for text in bodyText_list]

    unrelated, related, y_true, y_pred = [], [], [], []
    for headline, bodyID, stance in train_data:
        headline_w = sent2stokens_wostop(headline)
        body_w = bodyText_w[bodyIds_index[bodyID]]
        sim = avg_embedding_similarity(embeddings, self.embedding_size,
                                       ' '.join(headline_w), ' '.join(body_w))
        unrelated, related, y_true, y_pred = create_lists(
            sim, stance, threshold, [unrelated, related, y_true, y_pred])

    print_results([unrelated, related, y_true, y_pred], self.model_type)
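# avg_embedding_similarity() is a project-local helper. A plausible reading of
# what it computes (an assumption, not this project's exact implementation):
# average the word vectors of each text and take the cosine of the centroids.
def _avg_embedding_similarity_sketch(word_vectors, embedding_size, text_a, text_b):
    """word_vectors: any mapping from token to a numpy array of shape
    (embedding_size,), e.g. a dict or a gensim KeyedVectors instance."""
    import numpy as np

    def centroid(text):
        vecs = [word_vectors[w] for w in text.split() if w in word_vectors]
        return np.mean(vecs, axis=0) if vecs else np.zeros(embedding_size)

    a, b = centroid(text_a), centroid(text_b)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(np.dot(a, b) / denom) if denom else 0.0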
def generate_tf_idf_corpora(self):
    data_path = "%s/../data/fnc-1" % path.dirname(path.dirname(path.abspath(__file__)))
    reader = CorpusReader(data_path)
    body_dict = reader.load_body("train_bodies.csv")

    bodyText_list = list(body_dict.values())
    bodyText_w = [sent2stokens_wostop(text) for text in bodyText_list]

    # Cache the dictionary and TF-IDF model on the instance so that
    # order_by_tf_id_rank() can reuse them.
    self.vocab = corpora.Dictionary(bodyText_w)
    corporaBody_bow = [self.vocab.doc2bow(text) for text in bodyText_w]
    self.tfidf_model = models.TfidfModel(corporaBody_bow)
def generate_tf_idf_corpora(self):
    # Variant that reads the article bodies from the shared myConstants
    # dataset object instead of loading train_bodies.csv from disk.
    body_dict = myConstants.d.articles

    bodyText_list = list(body_dict.values())
    bodyText_w = [sent2stokens_wostop(text) for text in bodyText_list]

    self.vocab = corpora.Dictionary(bodyText_w)
    corporaBody_bow = [self.vocab.doc2bow(text) for text in bodyText_w]
    self.tfidf_model = models.TfidfModel(corporaBody_bow)
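# Both variants of generate_tf_idf_corpora() rebuild the dictionary and the
# TF-IDF model on every run. gensim can persist both, so a cached setup could
# look like the sketch below (the file names here are made up):
def _load_or_build_tfidf(bodyText_w, vocab_path='fnc.dict', model_path='fnc.tfidf'):
    from os import path
    from gensim import corpora, models

    if path.exists(vocab_path) and path.exists(model_path):
        vocab = corpora.Dictionary.load(vocab_path)
        tfidf_model = models.TfidfModel.load(model_path)
    else:
        vocab = corpora.Dictionary(bodyText_w)
        tfidf_model = models.TfidfModel([vocab.doc2bow(text) for text in bodyText_w])
        vocab.save(vocab_path)
        tfidf_model.save(model_path)
    return vocab, tfidf_model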