def _lexrank(cid):
    """
    Run LexRank on all sentences from all documents in a cluster.

    :param cid:
    :return: rank_records
    """
    _, processed_sents = dataset_parser.cid2sents(cid)  # 2d lists, docs => sents
    flat_processed_sents = list(itertools.chain(*processed_sents))  # 1d sent list

    lxr = LexRank(processed_sents, stopwords=STOPWORDS['en'])
    scores = lxr.rank_sentences(flat_processed_sents,
                                threshold=None,
                                fast_power_method=True)

    sid2score = dict()
    abs_idx = 0
    for doc_idx, doc in enumerate(processed_sents):
        for sent_idx, sent in enumerate(doc):
            sid = config.SEP.join((str(doc_idx), str(sent_idx)))
            sid2score[sid] = scores[abs_idx]
            abs_idx += 1

    sid_score_list = rank_sent.sort_sid2score(sid2score)
    rank_records = rank_sent.get_rank_records(sid_score_list,
                                              sents=processed_sents,
                                              flat_sents=False)
    return rank_records
def summary_nmf_method(file_folder, sumLen):
    sent_path = os.path.join(file_folder, 'sent_list.pkl')
    sent_list = joblib.load(sent_path)
    docs_path = os.path.join(file_folder, 'docs_list.pkl')
    docs = joblib.load(docs_path)
    n = len(sent_list)
    if n > 1:
        GRS_sen = get_grs_score(file_folder)
        surface_score = get_surface_score(docs)
        # p = pagerank(docs)
        lxr = LexRank(docs)
        lx = lxr.rank_sentences(docs, threshold=None, fast_power_method=True)
        lxr_score = np.array(lx)
        maxLex = lxr_score.max()
        lxr_score = (100 * lxr_score) / maxLex

        total_score = []
        for i in range(n):
            t_sum = float(GRS_sen[i]) + float(surface_score[i]) + float(lxr_score[i])
            total_score.append(t_sum)

        copy_score = total_score.copy()
        top_list = get_top_list(copy_score, sumLen)

        summary_final = '<h3>Total Sentences: ' + str(len(total_score)) + '</h3>'
        summary_final += '<h3>Selected Sentences: ' + str(sumLen) + '</h3>'
        for i in range(n):
            if total_score[i] in top_list:
                summary_final += ('<p style="color:#00ff00">' + sent_list[i] +
                                  '<br>' + str(total_score[i]) + '</p>')
            else:
                summary_final += ('<p style="color:#ff0000">' + sent_list[i] +
                                  '<br>' + str(total_score[i]) + '</p>')
        os.remove(sent_path)
        os.remove(docs_path)
        return summary_final
    elif n == 1:
        os.remove(sent_path)
        os.remove(docs_path)
        return sent_list[0]
    else:
        os.remove(sent_path)
        os.remove(docs_path)
        return 'No adequate sentences found for summary.'
async def on_post(self, req, resp):
    data = await req.media()
    text = data["text"]
    doc = nlp(text)

    results = []
    start_id = 0
    end_id = 0
    documents = []
    for sent_id, sent in enumerate(doc.sents):
        tokens = []
        ent_id = -1  # guard so the trailing 'O' span below is well-defined for sentences without entities
        for ent_id, ent in enumerate(sent.ents):
            # Add the span from the start of the sentence up to the first entity
            end_id = ent.start_char
            if start_id != end_id:
                tmp = {
                    'text': text[start_id:end_id],
                    'label': 'O'
                }
                tokens.append(tmp)
            start_id = ent.start_char
            end_id = ent.end_char
            tmp = {
                'id': f"{sent_id}_{ent_id}",
                'text': text[start_id:end_id],
                'label': ent.label_
            }
            tokens.append(tmp)
            start_id = ent.end_char
        # Add the span from the last entity to the end of the sentence
        tokens.append({
            'id': f"{sent_id}_{ent_id+1}",
            'text': text[start_id:sent.end_char],
            'label': 'O'
        })
        start_id = sent.end_char
        tmp = {
            'id': sent_id,
            'sent': tokens
        }
        results.append(tmp)

        extract_tokens = []
        for token in sent:
            # 'PROPN' is the spaCy tag for proper nouns
            if token.pos_ in ['PROPN', 'NOUN', 'VERB', 'ADJ']:
                extract_tokens.append(token.lemma_)
        documents.append(" ".join(extract_tokens))

    lexrank = LexRank(documents)
    scores = lexrank.rank_sentences(documents, threshold=0.0)
    ranking = pd.Series(scores).rank(method="min", ascending=False).tolist()
    for i in range(len(ranking)):
        results[i]['rank'] = int(ranking[i])

    resp.media = results
def train(self):
    documents = []
    documents_dir = Path(self.training_dir)
    for file_path in documents_dir.files('*.txt'):
        with file_path.open(mode='rt', encoding='utf-8') as fp:
            documents.append(fp.readlines())
    self.lxr = LexRank(documents, stopwords=STOPWORDS['en'])
def get_ranked_sentences_lexrank(filepath):
    raw_text = list()
    raw_text.append(read_file_to_array(filepath))
    lxr = LexRank(raw_text, stopwords=STOPWORDS['en'])

    sentences = sent_tokenize(read_file_to_string(filepath))
    summary_sentences = lxr.get_summary(sentences,
                                        summary_size=floor(len(sentences) / 2),
                                        threshold=.1)
    # return summary_sentences
    return sentences
def rank_sentences(cluster_paths: List[str],
                   noise_sentences: List[str]) -> List[str]:
    final_sentences = cluster_paths.copy()
    final_sentences.extend(noise_sentences)

    ranker = LexRank(final_sentences)
    sentence_scores = ranker.rank_sentences(final_sentences)

    scored_sentences = {}
    for idx, score in enumerate(sentence_scores):
        scored_sentences[final_sentences[idx]] = score

    # Note: sorted() without reverse=True returns ascending order,
    # i.e. the lowest-scored sentences come first.
    sorted_scored_sentences = sorted(scored_sentences, key=scored_sentences.get)
    return sorted_scored_sentences
def condense(self, percent):
    # automatically sets percent to condense by if not specified
    if not percent:
        percent = self.get_optimal_condense_percent()

    # calculates number of sentences to return based on input
    num_sentences = int(len(self.sentences) * percent)
    if num_sentences < 1:
        num_sentences = 1
    elif num_sentences > len(self.sentences):
        num_sentences = len(self.sentences)

    # create corpus from docs
    dirname = os.path.dirname(__file__)
    # checks if dumped json exists; if yes, loads that
    if os.path.isfile(os.path.join(dirname, 'corpus.json')):
        with open(os.path.join(dirname, 'corpus.json'), 'r') as f:
            documents = json.load(f)
    # otherwise, creates new corpus based on files in training_data directory
    else:
        documents = make_corpus_from_files('training_data')

    lxr = LexRank(documents, stopwords=STOPWORDS['en'])

    # create summary
    sentences_to_return = lxr.get_summary(self.sentences,
                                          summary_size=num_sentences)

    # joins sentences to make text body
    # list for each paragraph
    output = [[] for _ in self.paragraphs]
    # copies self.paragraphs to prevent destructive edits
    paragraphs = [paragraph[:] for paragraph in self.paragraphs]
    for sentence in sentences_to_return:
        for i, paragraph in enumerate(paragraphs):
            if sentence in paragraph:
                output[i].append(sentence)
                paragraph.remove(sentence)
                break

    # joins paragraph sentences with spaces
    output = [' '.join(paragraph) for paragraph in output]
    # joins paragraphs with newlines if paragraphs aren't empty
    output = '\n\n'.join([x for x in output if x.strip() != ''])

    return output
def extract_summary(self):
    data = read_json(self.full_path)
    articles = []
    abstracts = []
    for item in data:
        articles.append(item['article'])
        abstracts.append([item['abstract']])

    lxr = LexRank(articles, stopwords=STOPWORDS['en'])
    summaries = [lxr.get_summary(x,
                                 summary_size=self.summary_size,
                                 threshold=self.threshold)
                 for x in tqdm(articles, desc="LexRank:")]

    res = test_rouge(summaries, abstracts, self.processors)
    return res
def build_sim_items_e2e_tfidf_with_lexrank(cid, query, max_ns_doc=None,
                                           retrieved_dp=None, rm_dialog=True):
    """
    Initialize LexRank with document-wise organized sentences to get true IDF.

    :param cid:
    :param query:
    :param max_ns_doc:
    :param retrieved_dp:
    :param rm_dialog:
    :return:
    """
    if retrieved_dp:
        original_sents, processed_sents = load_retrieved_sentences(
            retrieved_dp=retrieved_dp, cid=cid)
    else:
        if 'tdqfs' in config.test_year:
            original_sents, processed_sents = dataset_parser.cid2sents_tdqfs(cid)
        else:
            original_sents, processed_sents = dataset_parser.cid2sents(
                cid, rm_dialog=rm_dialog, max_ns_doc=max_ns_doc)  # 2d lists, docs => sents

    lxr = LexRank(processed_sents, stopwords=STOPWORDS['en'])

    doc_sents = list(itertools.chain(*processed_sents))  # 1d sent list
    doc_sents = copy.deepcopy(doc_sents)  # avoid affecting the original doc_sents list
    doc_sents.append(query)  # the query is appended as the last "sentence"

    sim_mat = lxr.get_tfidf_similarity_matrix(sentences=doc_sents)
    doc_sim_mat = sim_mat[:-1, :-1]  # sentence-to-sentence similarities
    rel_scores = sim_mat[-1, :-1]  # query-to-sentence similarities
    # logger.info('doc_sim_mat: {}, rel_scores: {}'.format(doc_sim_mat.shape, rel_scores.shape))

    sim_items = {
        'doc_sim_mat': doc_sim_mat,
        'rel_scores': rel_scores,
        'processed_sents': processed_sents,
        'original_sents': original_sents,
    }
    return sim_items
def test_lexrank():
    documents = []
    documents_dir = DATA_ROOT / 'bbc_politics'
    document_files = documents_dir.files()

    for file_path in document_files:
        with gzip.open(file_path, mode='rt', encoding='utf-8') as fp:
            documents.append(fp.readlines())

    lxr = LexRank(
        documents,
        stopwords=STOPWORDS['en'],
        keep_numbers=False,
        keep_emails=False,
        include_new_words=True,
    )

    sentences = [
        'One of David Cameron\'s closest friends and Conservative allies, '
        'George Osborne rose rapidly after becoming MP for Tatton in 2001.',
        'Michael Howard promoted him from shadow chief secretary to the '
        'Treasury to shadow chancellor in May 2005, at the age of 34.',
        'Mr Osborne took a key role in the election campaign and has been at '
        'the forefront of the debate on how to deal with the recession and '
        'the UK\'s spending deficit.',
        'Even before Mr Cameron became leader the two were being likened to '
        'Labour\'s Blair/Brown duo. The two have emulated them by becoming '
        'prime minister and chancellor, but will want to avoid the spats.',
        'Before entering Parliament, he was a special adviser in the '
        'agriculture department when the Tories were in government and later '
        'served as political secretary to William Hague.',
        'The BBC understands that as chancellor, Mr Osborne, along with the '
        'Treasury will retain responsibility for overseeing banks and '
        'financial regulation.',
        'Mr Osborne said the coalition government was planning to change the '
        'tax system \"to make it fairer for people on low and middle '
        'incomes\", and undertake \"long-term structural reform\" of the '
        'banking sector, education and the welfare state.',
    ]

    summary = lxr.get_summary(sentences, discretize=False)
    assert summary == [sentences[5]]
class Rank():
    def __init__(self, training_dir=None):
        self.training_dir = training_dir
        self.lxr = None

    def train(self):
        documents = []
        documents_dir = Path(self.training_dir)
        for file_path in documents_dir.files('*.txt'):
            with file_path.open(mode='rt', encoding='utf-8') as fp:
                documents.append(fp.readlines())
        self.lxr = LexRank(documents, stopwords=STOPWORDS['en'])

    def remove_new_line(self, text):
        """ Removes all newline chars in text """
        return text.replace('\n', ' ')

    def setence_tokenize(self, text):
        """ sentence tokenize """
        return sent_tokenize(text)

    def sort_rankings(self, scores_sents):
        idx = np.argsort(scores_sents[0])[::-1]
        scores = np.array(scores_sents[0])[idx]
        sentences = np.array(scores_sents[1])[idx]
        return list(zip(scores, sentences))

    def rank(self, text):
        sentences = text
        sentences = self.remove_new_line(sentences)
        sentences = self.setence_tokenize(sentences)
        sentences = self.lxr.get_summary(sentences, summary_size=6, threshold=.85)
        scores_cont = self.lxr.rank_sentences(
            sentences,
            threshold=None,
            fast_power_method=True,
        )
        return self.sort_rankings((scores_cont, sentences))
def test_lexrank():
    documents = []
    documents_dir = DATA_ROOT / 'bbc_politics'
    document_files = documents_dir.files()

    for file_path in document_files:
        with gzip.open(file_path, mode='rt', encoding='utf-8') as fp:
            documents.append(fp.readlines())

    lxr = LexRank(
        documents,
        stopwords=STOPWORDS['en'],
        keep_numbers=False,
        keep_emails=False,
        include_new_words=True,
    )

    sentences = [
        'One of David Cameron\'s closest friends and Conservative allies, '
        'George Osborne rose rapidly after becoming MP for Tatton in 2001.',
        'Michael Howard promoted him from shadow chief secretary to the '
        'Treasury to shadow chancellor in May 2005, at the age of 34.',
        'Mr Osborne took a key role in the election campaign and has been at '
        'the forefront of the debate on how to deal with the recession and '
        'the UK\'s spending deficit.',
        'Even before Mr Cameron became leader the two were being likened to '
        'Labour\'s Blair/Brown duo. The two have emulated them by becoming '
        'prime minister and chancellor, but will want to avoid the spats.',
        'Before entering Parliament, he was a special adviser in the '
        'agriculture department when the Tories were in government and later '
        'served as political secretary to William Hague.',
        'The BBC understands that as chancellor, Mr Osborne, along with the '
        'Treasury will retain responsibility for overseeing banks and '
        'financial regulation.',
        'Mr Osborne said the coalition government was planning to change the '
        'tax system \"to make it fairer for people on low and middle '
        'incomes\", and undertake \"long-term structural reform\" of the '
        'banking sector, education and the welfare state.',
    ]

    summary = lxr.get_summary(sentences, threshold=None)
    assert summary == [sentences[5]]
def lex_rank_summarize(article):
    sentences = sent_tokenize(article)
    summary = LexRank(sentences).get_summary(sentences,
                                             summary_size=int(len(sentences) / 2),
                                             threshold=.1)
    return "\n".join(summary)
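The helper above estimates the LexRank IDF statistics from the sentences of the single article being summarized. Most of the other examples in this collection fit LexRank on a separate background corpus first and then summarize new sentences; the sketch below is a hypothetical variant along those lines, where background_docs is a placeholder for any list of documents (each a list of sentences) and is not part of the original code.

from lexrank import LexRank, STOPWORDS
from nltk.tokenize import sent_tokenize


def lex_rank_summarize_with_corpus(article, background_docs):
    # background_docs: illustrative placeholder; a list of documents,
    # each given as a list of sentences, used only to estimate IDF.
    sentences = sent_tokenize(article)
    lxr = LexRank(background_docs, stopwords=STOPWORDS['en'])
    summary = lxr.get_summary(sentences,
                              summary_size=max(1, len(sentences) // 2),
                              threshold=.1)
    return "\n".join(summary)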
def __init__(self, all_messages):
    self.documents = {}
    for stream, data in all_messages.items():
        for topic, messages in data["topics"].items():
            text = "\n".join(
                self.clean_content(message) for message in messages)
            self.documents[(stream, topic)] = [
                sentence.strip()
                for sentence in text.splitlines()
                if sentence.strip()
            ]
    self.summarizer = LexRank(
        self.documents.values(),
        keep_emails=True,
        keep_urls=True,
        stopwords=STOPWORDS["en"],
    )
def summarize(text_file, cat, summary_size, threshold):
    if cat not in ['business', 'entertainment', 'politics', 'sport', 'tech']:
        raise ValueError(
            'category must be one of business, entertainment, politics, sport, tech')
    if summary_size <= 0:
        raise ValueError('number of summary sentences must be greater than zero')
    if not 0.0 < threshold < 1.0:
        raise ValueError('summary threshold must be between 0 and 1')

    # load parsing dataset
    documents = []
    documents_dir = Path('bbc/' + cat)
    for file_path in documents_dir.files('*.txt'):
        with file_path.open(mode='rt', encoding='utf-8') as fp:
            documents.append(fp.readlines())

    # initialize LexRank with dataset
    lxr = LexRank(documents, stopwords=STOPWORDS['en'])

    # Read STT result
    sentences = []
    with open(text_file, 'r') as f:
        sentences.extend(f.read().splitlines())

    # Spelling correction
    for index in range(len(sentences)):
        tmp = ''
        for word in sentences[index].split():
            tmp += spell(word)
            tmp += ' '
        sentences[index] = tmp

    # Non-English word removal
    for index in range(len(sentences)):
        sentences[index] = " ".join(
            w for w in nltk.wordpunct_tokenize(sentences[index])
            if w.lower() in words or not w.isalpha())

    # Write the STT transcription after correction
    with open('output_text/' + text_file[:-4] + '_corrected.txt', 'a',
              encoding='utf-8') as f:
        for sentence in sentences:
            f.write(sentence + '\n')

    # get summary with classical LexRank algorithm
    summary = lxr.get_summary(sentences, summary_size, threshold)
    with open('output_text/' + text_file[:-4] + '_sum.txt', 'a',
              encoding='utf-8') as f:
        for sentence in summary:
            f.write(sentence + '\n')
def _make_lexrank_obj(self, stemming=True):
    idf_docs = [
        doc for summ in self.summarizers for doc in summ.topic.docs
    ]
    if stemming:
        idf_docs = [Summarizer._stemming(doc) for doc in idf_docs]
    lxr = LexRank(idf_docs, stopwords=STOPWORDS['en'])
    # print(lxr._calculate_idf())
    return lxr
def summary_nmf_method(file_folder, sumLen):
    sent_path = os.path.join(file_folder, 'sent_list.pkl')
    sent_list = joblib.load(sent_path)
    docs_path = os.path.join(file_folder, 'docs_list.pkl')
    docs = joblib.load(docs_path)
    n = len(sent_list)
    if n > 1:
        GRS_sen = get_grs_score(file_folder)
        surface_score = get_surface_score(docs)
        # p = pagerank(docs)
        lxr = LexRank(docs)
        lx = lxr.rank_sentences(docs, threshold=None, fast_power_method=True)
        lxr_score = np.array(lx)
        maxLex = lxr_score.max()
        lxr_score = (100 * lxr_score) / maxLex

        total_score = []
        for i in range(n):
            t_sum = float(GRS_sen[i]) + float(surface_score[i]) + float(lxr_score[i])
            total_score.append(t_sum)

        copy_score = total_score.copy()
        top_list = get_top_list(copy_score, sumLen)

        summary_final = ''
        for i in range(n):
            if total_score[i] in top_list:
                summary_final += sent_list[i] + ' \n '
        return summary_final
    elif n == 1:
        return sent_list[0]
    else:
        return 'No adequate sentences found for summary.'
def lexrank_summarize(corpus):
    list_of_summarization = []
    documents = [
        split_sentences(sample.replace("story_separator_special_tag", "\n"))
        for sample in corpus
    ]
    print("[" + "Document Size: " + str(len(documents)) + "]")
    print("[" + time.strftime("%H:%M:%S", time.localtime()) + "]",
          "Begin building LexRank model...")
    lxr = LexRank(documents, stopwords=STOPWORDS['en'])
    print("[" + time.strftime("%H:%M:%S", time.localtime()) + "]",
          "LexRank model successfully built...")

    for i in range(len(documents)):
        sample = documents[i]
        summary = lxr.get_summary(sample, summary_size=len(sample))
        articles = corpus[i].split("story_separator_special_tag")
        words_counter = 0
        summary_counter = 0
        tmp_summary = [[] for _ in range(len(articles))]

        while words_counter < 500 and summary_counter < len(summary):
            flag = 0
            for j in range(len(articles)):
                if summary[summary_counter] in articles[j]:
                    tmp_summary[j].append(summary[summary_counter])
                    words_counter += len(summary[summary_counter].split(" "))
                    flag = 1
            if flag == 0:
                print("[Error] Summary not in original sample.",
                      summary[summary_counter], i)
            summary_counter += 1
        # print("words_counter, summary_counter, total summary",
        #       words_counter, summary_counter, len(summary))

        for k in range(len(tmp_summary)):
            tmp_summary[k] = " newline_char ".join(tmp_summary[k])
        list_of_summarization.append(
            " story_separator_special_tag ".join(tmp_summary))

        if i % 100 == 0:
            print("------")
            print(i)
            print("------")
        # if i == 100:
        #     break

    return list_of_summarization
def init_lexrank(review_path):
    """Pass the reviews text file (reviews only)"""
    global tokenizer, lxr
    reviews = []
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    review_file = Path(review_path)
    with review_file.open(mode='rt', encoding='utf-8') as fp:
        reviews.append(fp.readlines())
    lxr = LexRank(reviews, stopwords=STOPWORDS['en'])
    return
def _make_lexrank_obj(self):
    idf_docs = [doc for summ in self.summarizers for doc in summ.topic.docs]
    # print('idf_docs before length: {}'.format(len(idf_docs)))
    seen = set()
    idf_docs = [doc for doc in idf_docs
                if doc.id not in seen and not seen.add(doc.id)]  # uniqify list of docs
    # print('idf_docs after length: {}'.format(len(idf_docs)))
    if STEMMING:
        idf_docs = [Summarizer._stemming(doc) for doc in idf_docs]
    lxr = LexRank(idf_docs, stopwords=STOPWORDS['en'])
    # print(lxr._calculate_idf())
    return lxr
def _score_graph_initially(sim_mat, rel_vec, cid, damp, abs2sid=None):
    # todo: check if feeding placeholder documents to init LexRank does no harm
    # _, processed_sents = dataset_parser.cid2sents(cid, rm_dialog=rm_dialog)  # 2d lists, docs => sents
    # lxr = LexRank(processed_sents, stopwords=STOPWORDS['en'])
    doc_place_holder = [['test sentence 1', 'test sentence 2'],
                        ['test sentence 3']]
    lxr = LexRank(doc_place_holder, stopwords=STOPWORDS['en'])

    params = {
        'similarity_matrix': sim_mat,
        'threshold': None,
        'fast_power_method': True,
        'rel_vec': rel_vec,
        'damp': damp,
    }
    scores = lxr.rank_sentences_with_sim_mat(**params)

    sid2score = dict()
    for abs_idx, sc in enumerate(scores):
        sid2score[abs2sid[abs_idx]] = sc
    return sid2score
def getResume(sentences, summary_size, threshold):
    documents = []
    documents_dir = Path('./db')
    stopwords = {}
    stopwords_dir = Path('./static/stopwords-id.txt')

    for file_path in documents_dir.files('*.txt'):
        with file_path.open(mode='rt', encoding='utf-8', errors='ignore') as fp:
            documents.append(fp.readlines())

    # get the stopwords
    with stopwords_dir.open(mode='rt', encoding='utf-8') as stopFile:
        stopwords['id'] = set(stopFile.readlines())
        stopFile.close()

    lxr = LexRank(documents, stopwords=stopwords['id'])
    summary = lxr.get_summary(sentences,
                              summary_size=int(summary_size),
                              threshold=threshold)
    return summary
class Summarizer:
    def __init__(self, all_messages):
        self.documents = {}
        for stream, data in all_messages.items():
            for topic, messages in data["topics"].items():
                text = "\n".join(
                    self.clean_content(message) for message in messages)
                self.documents[(stream, topic)] = [
                    sentence.strip()
                    for sentence in text.splitlines()
                    if sentence.strip()
                ]
        self.summarizer = LexRank(
            self.documents.values(),
            keep_emails=True,
            keep_urls=True,
            stopwords=STOPWORDS["en"],
        )

    def get_summary(self, stream, topic, show_url_list=True):
        document = self.documents[(stream, topic)]
        threshold = 0.03
        summary_size = 2 if len(document) > 5 else 1
        fast_power_method = True
        lex_scores = self.summarizer.rank_sentences(
            document,
            threshold=threshold,
            fast_power_method=fast_power_method)
        sorted_ix = np.argsort(lex_scores)[::-1]
        url_list = self.get_url_list(document) if show_url_list else []
        return (
            [document[i] for i in sorted(sorted_ix[:summary_size])],
            url_list,
        )

    @staticmethod
    def get_url_list(document):
        links = [
            "[{link}]({link})".format(link=plain_link) if plain_link else md_link
            for sentence in document
            for (md_link, plain_link) in LINK_RE.findall(sentence)
        ]
        return links

    @staticmethod
    def clean_content(message):
        content = message["content"]
        return PUNCTUATION_RE.sub("\\1\n", content)
from lexrank import STOPWORDS, LexRank
from path import Path

documents = []
documents_dir = Path('bbc/politics')

for file_path in documents_dir.files('*.txt'):
    with file_path.open(mode='rt', encoding='utf-8') as fp:
        documents.append(fp.readlines())

lxr = LexRank(documents, stopwords=STOPWORDS['en'])

sentences = [
    'One of David Cameron\'s closest friends and Conservative allies, '
    'George Osborne rose rapidly after becoming MP for Tatton in 2001.',
    'Michael Howard promoted him from shadow chief secretary to the '
    'Treasury to shadow chancellor in May 2005, at the age of 34.',
    'Mr Osborne took a key role in the election campaign and has been at '
    'the forefront of the debate on how to deal with the recession and '
    'the UK\'s spending deficit.',
    'Even before Mr Cameron became leader the two were being likened to '
    'Labour\'s Blair/Brown duo. The two have emulated them by becoming '
    'prime minister and chancellor, but will want to avoid the spats.',
    'Before entering Parliament, he was a special adviser in the '
    'agriculture department when the Tories were in government and later '
    'served as political secretary to William Hague.',
#!/usr/bin/python3
# -*- coding: utf-8 -*-

from lexrank import LexRank
from path import Path
import sys

documents = []
documents_dir = Path('Dataset')

for file_path in documents_dir.files('*.txt'):
    with file_path.open(mode='rt', encoding='utf-8') as fp:
        x = fp.readlines()
        i = x.index('\n')
        x = x[:i]
        documents.append(x)

lxr = LexRank(documents)

with open('Dataset/' + sys.argv[1], 'r') as f:
    sentences = list(f)

# get summary with classical LexRank algorithm
summary = lxr.get_summary(sentences, summary_size=3, threshold=0.15)
print(summary[0], end='')
print(summary[1], end='')
print(summary[2], end='')
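A brief usage note on the script above: it expects the name of a text file inside Dataset/ as its only command-line argument, keeps only the lines before the first blank line of each corpus file when building the IDF model, and prints the three highest-ranked sentences. For example, running it as "python3 script.py article.txt" would summarize Dataset/article.txt (both file names here are illustrative, not from the source).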
import os
from shutil import copyfile

log("load reference doc")
documents = []
document_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "bbc")
for dirPath, dirNames, fileNames in os.walk(document_dir):
    for f in fileNames:
        try:
            with open(os.path.join(dirPath, f), "rt", encoding="utf8") as file:
                documents.append(file.readlines())
        except Exception as e:
            log("path: %s%s" % (dirPath, f))

lxr = LexRank(documents, stopwords=STOPWORDS['en'])


# Build the Wikipedia comparison file and copy it into the designated folder
def reference_file(file_group, task_number, sim_type, i):
    ref = [
        "brexit", "missile", "brexit", "brexit", "brexit", "brexit",
        "catalan", "catalan", "crimea", "crimea", "gravitational",
        "gravitational", "brexit", "hk", "catalan", "sewol", "syria",
        "syria", "turkish"
    ]
    wiki_file = ref[file_group]

    if not os.path.exists("lexrank/{sim_type}/reference".format(sim_type=sim_type)):
        os.mkdir("lexrank/{sim_type}/reference".format(sim_type=sim_type))
class ImportanceEstimationModel(object):

    def load_data(self, train_path, dev_path, test_path):
        train_data = json.load(open(train_path))
        dev_data = json.load(open(dev_path))
        test_data = json.load(open(test_path))

        self.train_data = train_data
        self.test_data = test_data
        self.dev_data = dev_data

        return self.train_data, self.dev_data, self.test_data

    def print_train_sample(self):
        sample = random.choice(self.train_data)
        claims = sample[0]
        conc = sample[1]
        print('Conclusion:', conc)
        print('Claims:')
        for idx, claim in enumerate(claims):
            print(idx + 1, '.', claim)

    def num_of_pos_tags_feature(self, claim):
        claim_annotated = self.nlp_parser.sentences_to_tags([claim])
        claim_pos_tags = set([x[1] for x in claim_annotated[0]])
        return len(claim_pos_tags)

    def num_of_ne_feature(self, claim):
        named_entities = self.nlp_parser.extract_named_entities(claim)
        return len(named_entities)

    def _build_tfidf_model(self, texts):
        tfidf = TfidfVectorizer()
        tfidf_model = tfidf.fit(texts)
        return tfidf_model

    def _build_lexrank_model(self, texts):
        self.lxrank = LexRank(texts, stopwords=STOPWORDS['en'])

    def _sentiment_features(self, claim):
        claim_words = nltk.word_tokenize(claim)
        num_of_positive_words = 0
        num_of_negative_words = 0
        num_of_neutral_words = 0
        for word in claim_words:
            synsets = list(swn.senti_synsets(word))
            if len(synsets) == 0:
                num_of_neutral_words += 1
            else:
                syn = synsets[0]
                if syn.pos_score() > syn.neg_score():
                    num_of_positive_words += 1
                elif syn.pos_score() < syn.neg_score():
                    num_of_negative_words += 1
                else:
                    num_of_neutral_words += 1
        return num_of_positive_words, num_of_negative_words, num_of_neutral_words

    def _num_of_words_feature(self, claim):
        claim_words = nltk.word_tokenize(claim)
        return len(claim_words)

    def _tfidf_features(self, claim):
        claim_words = nltk.word_tokenize(claim)
        # Avg. tfidf
        tfidf_vector = self.tfidf_model.transform([claim])
        avg_tfidf_feature = np.sum(tfidf_vector.toarray()) / len(claim_words)
        max_tfidf_feature = np.max(tfidf_vector.toarray())
        return avg_tfidf_feature, max_tfidf_feature

    def _claim_features(self, claim, claims_text):
        # Number of words
        num_of_words_feature = self._num_of_words_feature(claim['text'])
        # Avg. Max. tfidf
        avg_tfidf_feature, max_tfidf_feature = self._tfidf_features(claim['text'])
        # Number of positive/negative/neutral words
        num_of_positive_words, num_of_negative_words, num_of_neutral_words = \
            self._sentiment_features(claim['text'])
        # Number of POS tags and number of named entities
        poss = set([p['type'] for p in claim['pos']])
        num_of_pos_tags = len(poss)
        num_of_ne = len(claim['named_entities'])

        return [num_of_words_feature, avg_tfidf_feature, max_tfidf_feature,
                num_of_positive_words, num_of_negative_words, num_of_neutral_words,
                num_of_ne, num_of_pos_tags]

    def _instance_features(self, claims):
        claims_sents = [claim['text'] for claim in claims]
        claims_text = ' '.join(claims_sents)
        claims_centroidness_scores = self.lxrank.rank_sentences(
            claims_sents, threshold=None, fast_power_method=False)
        # rank_sentences yields one centroidness score per claim; append it to
        # the hand-crafted features as a single extra dimension
        claims_features = [
            self._claim_features(claim, claims_text) + [claims_centroidness_scores[i]]
            for i, claim in enumerate(claims)
        ]
        return np.atleast_2d(claims_features)

    def instance_scores(self, claims, summary):
        claims_labels = []
        for claim in claims:
            claim_tokens = set(nltk.word_tokenize(claim['text']))
            summary_tokens = set(nltk.word_tokenize(summary))
            shared_tokens = claim_tokens.intersection(summary_tokens)
            # overlap_ratio = len(shared_tokens)/(len(claim_tokens) + len(summary_tokens))
            claims_labels.append(len(shared_tokens))
        return claims_labels

    def feature_representation(self, data):
        # 1. Build a tf-idf model over the training data
        arguments = [
            ' '.join([claim['text'] for claim in argument['claims']])
            for argument in data
        ]
        self.tfidf_model = self._build_tfidf_model(arguments)

        arguments = [[claim['text'] for claim in argument['claims']]
                     for argument in data]
        self.lxrank_model = self._build_lexrank_model(arguments)

        # 2. Encode training data into features
        self.train_X = []
        self.train_Y = []
        for argument in data:
            claims = argument['claims']
            conclusion = argument['conclusion']['text']

            claims_vectors = self._instance_features(claims)
            claims_scores = self.instance_scores(claims, conclusion)

            for claim_vector, claim_label in zip(claims_vectors, claims_scores):
                self.train_X.append(claim_vector)
                self.train_Y.append(claim_label)

        self.train_X = np.array(self.train_X)
        self.train_Y = np.array(self.train_Y)

        # Normalize claims_scores into [0, 1]
        labels_scaler = MinMaxScaler()
        labels_scaler.fit(self.train_Y.reshape(-1, 1))
        self.train_Y = labels_scaler.transform(
            self.train_Y.reshape(-1, 1)).reshape(-1)

        return self.train_X, self.train_Y

    def train_svr(self, train_X, train_Y):
        svr_params = {'C': [0.001, 0.1, 1.0, 10, 100]}
        svr = SVR()
        clf = GridSearchCV(svr, svr_params, cv=5,
                           scoring='neg_mean_absolute_error',
                           return_train_score=False)
        clf.fit(train_X, train_Y)
        best_ridge = clf.best_estimator_
        self.best_ridge = best_ridge
        return clf.best_score_

    def kendalltau_evaluation(self, test_data):
        from scipy import stats
        total_tau = 0
        for sample in test_data:
            claims = sample[0]
            conclusion = sample[1]

            # Predict scores of each claim
            claims_vectors = self._instance_features(claims)
            ground_truth_scores = self.instance_scores(claims, conclusion)
            ground_pred_scores = self.best_ridge.predict(claims_vectors)

            tau, _ = stats.kendalltau(ground_truth_scores, ground_pred_scores)
            total_tau += tau

        return total_tau / len(test_data)

    def score_data(self, data):
        for sample in data:
            claims = sample['claims']
            conclusion = sample['conclusion']['text']

            # Predict scores of each claim
            claims_vectors = self._instance_features(claims)
            claims_scores = self.best_ridge.predict(claims_vectors)

            for claim, score in zip(claims, claims_scores):
                claim['imprtance_score'] = score

            sample['claims'] = claims
        return data

    def mrr_evaluation(self, test_data):
        mrr_value = 0
        for sample in test_data:
            claims = sample['claims']
            conclusion = sample['conclusion']['text']

            # Predict scores of each claim
            claims_vectors = self._instance_features(claims)
            claims_labels = self.instance_scores(claims, conclusion)
            claims_labels = [c_score > 0 for c_score in claims_labels]
            claims_scores = self.best_ridge.predict(claims_vectors)

            # Sort claims based on the score
            scores_labels_list = list(zip(claims_scores, claims_labels))
            sorted_claims = sorted(scores_labels_list, key=lambda x: -x[0])

            rank = 1
            for x in sorted_claims:
                if x[1]:
                    break
                rank += 1
            mrr_value += 1 / rank

        return mrr_value / len(test_data)
from .search_resources import final_resources, wiki_summary
from .unfurling import OPG
from .keyword_ner_search_query import NER

# Run the following command in terminal to connect to redis channel
# docker run -p 6379:6379 -d redis:5

print('loading dataset and initializing...')
documents = []
documents_dir = Path('/home/pranshu/GAMR/gamr/meetingmode/total')
for file_path in documents_dir.files('*.txt'):
    with file_path.open(mode='rt', encoding='latin1') as fp:
        documents.append(fp.readlines())

lxr = LexRank(documents, stopwords=STOPWORDS['en'])
print('dataset load done!')
print('server is running!')


class StudyConsumer(AsyncWebsocketConsumer):
    async def connect(self):
        self.room_name = self.scope['url_route']['kwargs']['username']
        self.room_group_name = 'study_%s' % self.room_name

        # Join room group
        await self.channel_layer.group_add(self.room_group_name,
                                           self.channel_name)
        await self.accept()
    '../gen_backend/backend/cliche_sentences.txt')
horror_sentences = get_filler_sentences(
    '../gen_backend/backend/horror_sentences.txt')
romance_sentences = get_filler_sentences(
    '../gen_backend/backend/romance_sentences.txt')
violence_sentences = get_filler_sentences(
    '../gen_backend/backend/violence_sentences.txt')

num_boring = (((int(boring_rating) / 5) * 50) / 100) * len(boring_sentences)
num_cliche = (((int(cliche_rating) / 5) * 50) / 100) * len(boring_sentences)
num_horror = (((int(horror_rating) / 5) * 50) / 100) * len(boring_sentences)
num_romance = (((int(romance_rating) / 5) * 50) / 100) * len(boring_sentences)
num_violence = (((int(violence_rating) / 5) * 50) / 100) * len(boring_sentences)

lxr = LexRank(paraphrase_summary, stopwords=STOPWORDS['en'])
boring_scores_cont = lxr.rank_sentences(boring_sentences,
                                        threshold=None,
                                        fast_power_method=True)
cliche_scores_cont = lxr.rank_sentences(cliche_sentences,
                                        threshold=None,
                                        fast_power_method=True)
horror_scores_cont = lxr.rank_sentences(horror_sentences,
                                        threshold=None,
                                        fast_power_method=True)
romance_scores_cont = lxr.rank_sentences(romance_sentences,
                                         threshold=None,
                                         fast_power_method=True)
violence_scores_cont = lxr.rank_sentences(violence_sentences,
                                          threshold=None,
                                          fast_power_method=True)
def _build_lexrank_model(self, texts):
    self.lxrank = LexRank(texts, stopwords=STOPWORDS['en'])
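For context, here is a minimal, self-contained sketch of how a LexRank model built this way is typically queried afterwards, mirroring the rank_sentences calls used elsewhere in this collection. The toy corpus and sentences below are illustrative placeholders, not part of the original project.

from lexrank import LexRank, STOPWORDS

# Illustrative background corpus: each document is a list of sentences.
toy_corpus = [
    ['Cats are small domesticated mammals.', 'Many cats sleep during the day.'],
    ['Dogs are loyal companions.', 'Some dogs are trained to herd sheep.'],
]
lxr = LexRank(toy_corpus, stopwords=STOPWORDS['en'])

# One continuous LexRank score per input sentence; higher means more central.
scores = lxr.rank_sentences(
    ['Cats are small domesticated mammals.', 'Dogs are loyal companions.'],
    threshold=None,
    fast_power_method=True,
)
print(scores)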
from lexrank import LexRank
from lexrank.mappings.stopwords import STOPWORDS
from path import Path

documents = []
paths = Path('./bbc-fulltext/bbc').glob('politics/')
# print(paths)
for category in paths:
    for file_path in category.files('*.txt'):
        with file_path.open(mode='r', encoding='utf-8') as file:
            documents.append(file.readlines())
# print(documents[:5])

lxr = LexRank(documents, stopwords=STOPWORDS['en'])

# url = 'https://www.nytimes.com/2021/03/18/opinion/anti-asian-american-violence.html'
# article = Article(url)
# article.download()
# article.parse()
# text = article.text

text = '''
The grim reality of modern American life is that each new mass killing leads to a fevered study of motives and meaning. Was the latest shooter motivated by racism, misogyny, religion, revenge or some combination thereof? Those are not questions that members of a healthy society should routinely be forced to ask or answer. After eight people — including six people of Asian descent and seven women — were shot to death in Georgia this week, a deputy sheriff chalked the killings up to the suspect’s confessed “sex addiction,” adding that “yesterday was a really bad day” for the alleged shooter. That diagnosis was met with the skepticism it deserved: The same deputy promoted the sale of anti-Asian T-shirts that referred to the coronavirus as an import from “Chy-na.” It’s difficult to disentangle the vile pathologies that lead a man to take so many innocent lives. It’s also impossible to ignore the context in which the murders were committed and the impact that the tragedy has had on communities across America. In an analysis of nearly 4,000 hate-related incidents targeting Asian-Americans documented this year and last, nearly 70 percent of the victims were women, according to a report by the group Stop AAPI Hate. New York was the second state behind California in the total number of incidents documented by the group.