def summarize(filename, option):
    #LexicalChain(os.getcwd()+'/../filename')
    summary = textrank(filename, original=option, words=100)
    op_name = 'summary' + filename[4:]
    text_file = open('../RougeEval/syssum/' + option + '/' + op_name, "w")
    text_file.write(summary)
    text_file.close()
    return
def doMagic(userId, channelId, token):
    # Get all unseen messages
    data = getMessageData(channelId, token)
    messageTexts = []
    for message in data:
        txt = message['text'].decode('utf-8', 'ignore')
        messageTexts.append(txt)

    # Group into clusters
    msgs = generateListOfMessages(data)
    clusterIndicies = getClusters(msgs)
    labels = clusterIndicies.keys()
    messageClusters = []
    for label in labels:
        indicies = clusterIndicies.get(label)
        cluster = []
        for index in indicies:
            cluster.append(messageTexts[index])
        messageClusters.append(cluster)

    # Find important clusters
    res = []
    for cluster in messageClusters:
        document = ""
        for message in cluster:
            document += message + " "
        text_ranks = textrank(document)
        numMessages = math.ceil(len(text_ranks) * .3)
        importantMessagesInCluster = []
        for i in range(int(numMessages)):
            item = {"text": text_ranks[i][1]}
            importantMessagesInCluster.append(item)
        res.append(importantMessagesInCluster)
    return res
def summarize(text, topn=None):
    stop_words = load_stopwords('data/stopwords.txt')
    sentences = tokenize_into_sentences(text)
    topn = len(sentences) // 3 if not topn else topn
    print("Generating top {} most relevant sentences out of {} total sentences".
          format(topn, len(sentences)))
    sentences_processed = list(map(process_text, sentences))
    remove_stop = partial(remove_stopwords, stop_words)
    sentences_tokenized = [
        sentence
        for sentence in map(lambda x: remove_stop(x.split()), sentences_processed)
        if sentence
    ]
    matrix, ranks = textrank(sentences_tokenized)
    res = []
    for tup in ranks[:topn]:
        idx = tup[0]
        res.append((sentences[idx], tup[1]))
    return res
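# None of the snippets in this collection define the textrank() helper they
# call. The function below is a minimal, illustrative sketch (an assumption,
# not the original project's implementation) of the contract used by
# summarize() above: it takes tokenized sentences and returns
# (similarity_matrix, ranks), where ranks is a list of (sentence_index, score)
# pairs sorted by descending score. Similarity is word overlap with a +1
# log-length smoothing; ranking is the standard PageRank power iteration.
import numpy as np

def textrank_sketch(tokenized_sentences, damping=0.85, iterations=50):
    """Rank sentences by a PageRank-style random walk over word overlap."""
    n = len(tokenized_sentences)
    sim = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            if i == j:
                continue
            a, b = set(tokenized_sentences[i]), set(tokenized_sentences[j])
            denom = np.log(len(a) + 1) + np.log(len(b) + 1)
            if denom > 0:
                sim[i, j] = len(a & b) / denom
    # Row-normalise so each row is a probability distribution over neighbours.
    row_sums = sim.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0] = 1.0
    transition = sim / row_sums
    # Power iteration (the PageRank update used by TextRank).
    scores = np.ones(n) / n
    for _ in range(iterations):
        scores = (1 - damping) / n + damping * transition.T.dot(scores)
    ranks = sorted(enumerate(scores), key=lambda t: -t[1])
    return sim, ranks

# e.g.: matrix, ranks = textrank_sketch([['cats', 'sit'], ['dogs', 'sit'], ['birds', 'fly']])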
def sentence_weight(sentence, word, nodes):
    # NOTE: the original snippet begins mid-function; this enclosing signature
    # and the sentenceWeight initialisation are inferred from the driver code
    # further below, which calls sentence_weight(sentence, posword, nodes).
    sentenceWeight = np.zeros(len(sentence))
    for i, x in enumerate(word):
        l = len(sentence[i].split(' '))
        #print(l)
        sums = 0
        j = 0
        for y in x:
            sums += nodes[nodes[:, 0] == y, 2][0]
            j += 1
        if j > 0:
            #sentenceWeight[i]=sums/j  ### average based
            #sentenceWeight[i]=sums  ### score based
            #sentenceWeight[i]=sums/(1+math.log10(j))  ### log based
            sentenceWeight[i] = sums / l  ### sentence average based
        #print(sums)
    sentence = [[i, sentence[i], sentenceWeight[i]] for i, x in enumerate(word)]
    #sentence = np.sort(np.array(sentence,dtype=object),axis=-0)
    sentence = sorted(sentence, key=lambda x: -x[2])
    sentence = np.array(sentence)
    return sentence


if __name__ == "__main__":
    path = "data/body/"
    filens = os.listdir(path)
    stops = stopwords.words('english')
    sentence, word = preprocess(path + filens[0])
    nodes = textrank(word)
    generate_summary_bylength(sentence, word, nodes, filens[0])
def summarize(text):
    # SPLIT TO PARAGRAPHS
    pre_paragraphs = text.split('\n')
    paragraphs = []
    for i, p in enumerate(pre_paragraphs):
        if not re.match(r'^\s*$', p) and (i == len(pre_paragraphs) - 1
                                          or re.match(r'^\s*$', pre_paragraphs[i + 1])):
            paragraphs.append(p)
    # print(f'Num of paragraphs: {len(paragraphs)}')
    # for i, p in enumerate(paragraphs):
    #     print(f'par#{i+1}: {p}')

    # SPLIT TO SENTENCES
    sentences = separator.separate(text)
    print(f'Num of sentences: {len(sentences)}')
    for i, s in enumerate(sentences):
        print(f'#{i+1}: {s}')

    # TOKENIZE
    stem = False
    if stem:
        tokenized_sentences = [[
            czech_stemmer.cz_stem(word, aggressive=False) for word in sentence
        ] for sentence in tokenize(sentences)]
    else:
        tokenized_sentences = tokenize(sentences)

    # REMOVE STOPWORDS
    tokenized_sentences_without_stopwords = remove_stop_words(
        tokenized_sentences, keep_case=False)
    sentences_without_stopwords_case = remove_stop_words(
        sentences, keep_case=True, is_tokenized=False, return_tokenized=False)
    print('===Sentences without stopwords===')
    for i, s in enumerate(tokenized_sentences_without_stopwords):
        print(f'''#{i+1}: {' '.join(s)}''')
    print('===Sentences without stopwords CASE===')
    for i, s in enumerate(sentences_without_stopwords_case):
        print(f'''#{i+1}: {s}''')

    # POS-TAG
    tagged_sentences = pos_tag(sentences_without_stopwords_case)
    print('=====Tagged_sentences=====')
    for i, s in enumerate(tagged_sentences):
        print(f'''#{i+1}: {s}''')

    # 1. THEMATICITY FEATURE
    thematicity_feature_scores = thematicity_feature(
        tokenized_sentences_without_stopwords)

    # 2. SENTENCE POSITION FEATURE - NOTE: shitty!
    sentence_position_scores = sentence_position_feature(len(sentences))

    # 3. SENTENCE LENGTH FEATURE
    sentence_length_scores = sentence_length_feature(tokenized_sentences)

    # 4. SENTENCE PARAGRAPH POSITION FEATURE

    # 5. PROPER_NOUN FEATURE
    proper_noun_scores = proper_noun_feature(tagged_sentences)

    # 6. NUMERALS FEATURE
    numerals_scores = numerals_feature(tokenized_sentences)

    # 7. NAMED ENTITIES FEATURE - very similar to PROPER_NOUN FEATURE

    # 8. TF_ISF FEATURE - NOTE: TextRank instead of TS_ISF ??? ts_isf_orig is meh
    tf_isf_scores = tf_isf_orig_feature(tokenized_sentences_without_stopwords)

    # 9. CENTROID SIMILARITY FEATURE
    centroid_similarity_scores = centroid_similarity_feature(
        sentences, tf_isf_scores)

    # 10. UPPER-CASE FEATURE (not in the paper)
    upper_case_scores = upper_case_feature(tokenized_sentences)

    # 11. QUOTES FEATURE (not in the paper)
    quotes_scores = quotes_feature(sentences)

    # 12. REFERENCES FEATURE (not in the paper)
    references_scores = references_feature(tokenized_sentences)

    # 13. TEXTRANK FEATURE (not in the paper)
    textrank_scores = textrank.textrank(tokenized_sentences, True, '4-1-0.0001')

    feature_matrix = []
    feature_matrix.append(thematicity_feature_scores)
    feature_matrix.append(sentence_position_scores)
    feature_matrix.append(sentence_length_scores)
    feature_matrix.append(proper_noun_scores)
    feature_matrix.append(numerals_scores)
    feature_matrix.append(tf_isf_scores)
    feature_matrix.append(centroid_similarity_scores)
    feature_matrix.append(upper_case_scores)

    features = [
        ' thema', 'sen_pos', 'sen_len', ' propn', ' num', ' tf_isf',
        'cen_sim', ' upper'
    ]
    feature_matrix_2 = np.zeros((len(sentences), len(features)))
    for i in range(len(features)):
        for j in range(len(sentences)):
            feature_matrix_2[j][i] = feature_matrix[i][j]

    feature_sum = []
    for i in range(len(np.sum(feature_matrix_2, axis=1))):
        feature_sum.append(np.sum(feature_matrix_2, axis=1)[i])

    print('=====Scores=====')
    print(35 * ' ', end='|')
    for f in features:
        print(f, end='|')
    print()
    for i, s in enumerate(sentences):
        print(f'#{"{:2d}".format(i + 1)}: {s[:30]}', end='|')
        for f_s in feature_matrix:
            print('{: .4f}'.format(round(f_s[i], 4)), end='|')
        print('{: .4f}'.format(round(feature_sum[i], 4)))

    print('Training rbm...')
    rbm_trained = rbm.test_rbm(dataset=feature_matrix_2,
                               learning_rate=0.1,
                               training_epochs=14,
                               batch_size=5,
                               n_chains=5,
                               n_hidden=len(features))
    # another implementation of rbm, from sklearn
    # rbm2 = BernoulliRBM(n_components=len(features), n_iter=14, batch_size=5, learning_rate=0.1)
    # rbm_trained = rbm2.fit_transform(feature_matrix_2)
    # print(rbm_trained)
    rbm_trained_sums = np.sum(rbm_trained, axis=1)

    print('=====RBM Enhanced Scores=====')
    print(35 * ' ', end='|')
    for f in features:
        print(f, end='|')
    print()
    for i, s in enumerate(sentences):
        print(f'#{"{:2d}".format(i + 1)}: {s[:30]}', end='|')
        for f_s in rbm_trained[i]:
            print('{: .4f}'.format(round(f_s, 4)), end='|')
        print('{: .4f}'.format(round(rbm_trained_sums[i], 4)))

    enhanced_feature_sum = []
    feature_sum = []
    for i in range(len(np.sum(rbm_trained, axis=1))):
        enhanced_feature_sum.append([np.sum(rbm_trained, axis=1)[i], i])
        feature_sum.append([np.sum(feature_matrix_2, axis=1)[i], i])
    print(f'enhanced_feature_sum: {enhanced_feature_sum}')
    print(f'feature_sum: {feature_sum}')

    enhanced_feature_sum.sort(key=lambda x: x[0])
    feature_sum.sort(key=lambda x: -1 * x[0])
    print('=====Sorted=====')
    print(f'enhanced_feature_sum: {enhanced_feature_sum}')
    print(f'feature_sum: {feature_sum}')

    # print('=====The text=====')
    # for x in range(len(sentences)):
    #     print(sentences[x])

    extracted_sentences_rbm = []
    extracted_sentences_rbm.append([sentences[0], 0])
    extracted_sentences_simple = []
    extracted_sentences_simple.append([sentences[0], 0])

    summary_length = max(min(round(len(sentences) / 4), 12), 3)  # length between 3-12 sentences
    for x in range(summary_length):
        if enhanced_feature_sum[x][1] != 0:
            extracted_sentences_rbm.append([
                sentences[enhanced_feature_sum[x][1]], enhanced_feature_sum[x][1]
            ])
        if feature_sum[x][1] != 0:
            extracted_sentences_simple.append(
                [sentences[feature_sum[x][1]], feature_sum[x][1]])

    extracted_sentences_rbm.sort(key=lambda x: x[1])
    extracted_sentences_simple.sort(key=lambda x: x[1])

    final_text_rbm = ''
    for i in range(len(extracted_sentences_rbm)):
        final_text_rbm += extracted_sentences_rbm[i][0] + '\n'
    final_text_simple = ''
    for i in range(len(extracted_sentences_simple)):
        final_text_simple += extracted_sentences_simple[i][0] + '\n'

    print('=====Extracted Final Text RBM=====')
    print(final_text_rbm)
    print()
    print('=====Extracted Final Text simple=====')
    print(final_text_simple)
    return final_text_rbm
def download_articles_from_url(api_url, download_directory):
    # Use the API URL to get a list of articles
    api_req = requests.get(api_url)
    article_list = api_req.text.split('\n')
    # Shuffle the article list to avoid being blocked
    random.shuffle(article_list)
    # Creates a Simple Summarizer for summarizing articles
    ss = summarize.SimpleSummarizer()
    for articleURL in article_list:
        if (articles.find_one({ "url": articleURL }) == None):
            #print 'Trying: ' + articleURL.encode('ascii', 'ignore')
            if (validate_url(articleURL) is False):
                continue
            # For each URL, assign its md5 as a unique identifier
            m = hashlib.md5()
            m.update(articleURL)
            code = m.hexdigest()
            first_level = code[0:2]
            second_level = code[2:4]
            # This code also becomes the filename for the full file path
            articleFileDirectory = download_directory + first_level + "/" + second_level + "/"
            articleFilePath = articleFileDirectory + code
            # TODO: Parse title from article
            # Download full article and use full-text (if available) for keyword extraction
            fullArticleText = download_article_file(articleURL, articleFileDirectory, code)
            if (fullArticleText is not None):
                keyword_set = textrank(fullArticleText)
                #articleFeatures = get_article_features(fullArticleText, articleURL)
                articleFeatures = None
                guessed_date = guess_date(fullArticleText)
                summaryText = ss.summarize(fullArticleText, 5)  # 2nd input is number of lines in summary
            else:
                guessed_date = ""  # TODO: Fix
                print "ERROR: Full article text not available"
                #keyword_set = textrank(summaryText)
                #articleFeatures = get_article_features(summaryText, articleURL)
                articleFeatures = None
                continue
            keywords = list(keyword_set)
            print "Downloaded: " + articleURL.encode('ascii', 'ignore')
            processed_date = datetime.now().strftime("%Y-%m-%d")
            if (guessed_date is not None):
                publish_date = guessed_date
            else:
                publish_date = processed_date
            article = [{
                #"q": query,  # TODO: Fix
                "_id": code,
                "c": code,
                "f": articleFeatures,
                "pubd": publish_date,
                "procd": processed_date,
                "url": articleURL,
                #"t": articleTitle,  # TODO: Fix
                "abs": summaryText,  # TODO: Fix
                #"sr": articleSource,  # TODO: Fix
                "k": keywords,
                "fp": articleFilePath,
                "m": None
            }]
            # Write article to MongoDB collection
            try:
                article_id = articles.insert(article)
            except MongoException.DuplicateKey:
                print "Duplicate key: " + code
            #print "Inserted into articles: " + articleTitle.encode('ascii', 'ignore')
            title = ''  # TODO: Fix
            abstract = ''  # TODO: Fix
            json_str = mk_es_json(code, fullArticleText, articleURL, title, abstract, publish_date)
            #print json_str
            index = 'article'
            index_type = 'text'
            es_url = 'http://localhost:9200'
            r = post_to_elastic_search(es_url, index, index_type, code, json_str)
            print r
def parse_webpages(php_directory, term, option, excludes):
    api_base_url = AAFTER_URL
    api_args = "&wt=xml&fl=*,score"
    file_name = (term + '-' + option).replace(" ", "_").replace("/", "_")
    webpageFileDirectory = php_directory + file_name + "--webpages" + "/"
    url_term = urllib2.quote('"' + term + '" ' + option + ' ' + excludes)
    try:
        api_url = api_base_url + url_term + api_args
        #print "Downloading XML from " + api_url
        xml_response = urllib2.urlopen(api_url)
    except urllib2.HTTPError:
        print "ERROR: HTTPError at " + api_url.encode('ascii', 'ignore')
        xml_response = ""
    except urllib2.URLError:
        print "ERROR: URLError at " + api_url.encode('ascii', 'ignore')
        xml_response = ""

    # Parse the XML responses
    xml_tree = etree.parse(xml_response)
    query = xml_tree.xpath("//response/lst[@name='responseHeader']/lst[@name='params']/str[@name='q']/text()")
    num_result = xml_tree.xpath("//response/result")[0].attrib['numFound']
    # Each website result will be stored as a list
    titles = xml_tree.xpath("//response/result/doc/str[@name='name']/text()")
    urls = xml_tree.xpath("//response/result/doc/str[@name='url_s']/text()")
    scores = xml_tree.xpath("//response/result/doc/float[@name='score']/text()")
    # Count the number of urls passed in XML and use that as the basis for how many results are on the page
    url_count = xml_tree.xpath("count(//response/result/doc/str[@name='url_s'])")

    # Separate lists (a chained assignment would alias all three names to one list)
    meta_descriptions, meta_keywords, summaries = [], [], []
    # Add summary and meta information from Subhankar's API
    # Use loop to avoid IndexError if field does not exist
    for i in range(len(urls)):
        try:
            md = xml_tree.xpath("//response/result/doc/arr[@name='features']/str[1]/text()")[i]
            meta_descriptions.append(md)
        except IndexError:
            meta_descriptions.append("")
        try:
            mk = xml_tree.xpath("//response/result/doc/arr[@name='features']/str[2]/text()")[i]
            meta_keywords.append(mk)
        except IndexError:
            meta_keywords.append("")
        try:
            s = xml_tree.xpath("//response/result/doc/arr[@name='features']/str[3]/text()")[i]
            summaries.append(s)
        except IndexError:
            summaries.append("")

    for i in range(len(urls)):
        # Check to see if webpage has already been inserted. If it has, don't do anything
        if (webpages.find_one({ "url": urls[i] }) == None):
            fullWebpageText = None
            #code = base64.urlsafe_b64encode(os.urandom(18))
            m = hashlib.md5()
            m.update(urls[i])
            code = m.hexdigest()
            webpageFilePath = webpageFileDirectory + code
            # Download full webpage and use full-text (if available) for keyword extraction
            # If a directory for files doesn't exist, create it
            dir = os.path.dirname(webpageFileDirectory)
            if not os.path.isdir(dir):
                #print "Created directory: " + dir
                os.makedirs(dir)
            try:
                #fullWebpage = urllib2.urlopen(urls[i])
                #print "Opening website URL: " + str(urls[i])
                #fullWebpageHTML = fullWebpage.read()
                # Use boilerpipe to clean text
                extractor = Extractor(extractor='ArticleExtractor', url=urls[i])
                #fullWebpageHTML = extractor.getHTML()
                fullWebpageText = extractor.getText()
                # Use lxml's HTML cleaner to remove markup
                #htmltree = lxml.html.fromstring(fullWebpageText)
                #cleaner = lxml.html.clean.Cleaner(remove_unknown_tags=True)
                #cleaned_tree = cleaner.clean_html(htmltree)
                #fullWebpageText = cleaned_tree.text_content()
                outfile = open(webpageFilePath, 'w+')
                outfile.write(fullWebpageText.encode('ascii', 'ignore'))
                outfile.close()
            except urllib2.HTTPError:
                print "HTTPError: Webpage file download skipped: " + urls[i]
                return None
            except urllib2.URLError:
                print "URLError: Webpage file download skipped: " + urls[i]
                return None
            except UnicodeDecodeError:
                print "UnicodeDecodeError: Webpage file download skipped: " + urls[i]
                return None
            except lxml.etree.ParserError:
                print "lxml.etree.ParserError: Webpage file download skipped: " + urls[i]
                return None
            except LookupError:
                print "LookupError: Webpage file download skipped: " + urls[i]
                return None
            if (fullWebpageText is not None):
                keyword_set = textrank(fullWebpageText)
            else:
                keyword_set = textrank(summaries[i])
            keywords = list(keyword_set)
            webpage = [{
                "q": query,
                "nr": num_result,
                "url": urls[i],
                "t": titles[i],
                "c": code,
                "md": meta_descriptions[i],
                "mk": meta_keywords[i],
                "abs": summaries[i],
                "s": scores[i],
                "k": keywords,
                "f": webpageFilePath
            }]
            webpage_id = webpages.insert(webpage)
def parse_news_articles(php_directory, download_directory, file_name, query):
    # Note: Assumes that path is stored as <query>.php/
    inpath = php_directory + file_name + "/"
    file_list = [f for f in listdir(inpath) if isfile(join(inpath, f))]
    # For each file, get the article Titles and URLs
    for file in file_list:
        # Clear out any variables from last file
        articleURL = articleTitle = articleSource = summaryText = keywords = score = code = ""
        try:
            intext = open(inpath + file, 'r').read()
            html = etree.HTML(intext)
        except lxml.etree.XMLSyntaxError:
            print "ERROR: XMLSyntaxError when reading " + inpath + file
            break
        for element in html.iter():
            if (element.tag == "p" and element.text == "News Result"):
                # Do nothing
                pass
            elif (element.tag == "a"):
                articleURL = element.attrib["href"]
                articleTitle = element.text
            elif (element.tag == "br"):
                if (element.tail != None):
                    summaryText = element.tail
            elif (element.tag == "strong"):
                if (element.tail != "\n"):
                    articleSource = element.tail
            elif (element.tag == "p"):
                # Check to see if article already exists using URL. If it exists, don't do anything
                if (articles.find_one({ "url": articleURL }) is not None):
                    print "INFO: Duplicate article found"
                else:
                    print "Processing: " + articleURL
                    # For each URL, assign its md5 as a unique identifier
                    #code = base64.urlsafe_b64encode(os.urandom(18))
                    m = hashlib.md5()
                    m.update(articleURL)
                    code = m.hexdigest()
                    first_level = code[0:2]
                    second_level = code[2:4]
                    # This code also becomes the filename for the full file path
                    #articleFileDirectory = php_directory + file + "--news/"
                    articleFileDirectory = download_directory + first_level + "/" + second_level + "/"
                    articleFilePath = articleFileDirectory + code
                    # Download full article and use full-text (if available) for keyword extraction
                    fullArticleText = download_article_file(articleURL, articleFileDirectory, code)
                    if (fullArticleText is not None):
                        keyword_set = textrank(fullArticleText)
                        #articleFeatures = get_article_features(fullArticleText, articleURL)
                        articleFeatures = None
                        guessed_date = guess_date(fullArticleText)
                    else:
                        keyword_set = textrank(summaryText)
                        #articleFeatures = get_article_features(summaryText, articleURL)
                        articleFeatures = None
                        guessed_date = guess_date(summaryText)
                    keywords = list(keyword_set)
                    processed_date = datetime.now().strftime("%Y-%m-%d")
                    if (guessed_date is not None):
                        publish_date = guessed_date
                    else:
                        publish_date = processed_date
                    article = [{
                        "q": query,
                        "c": code,
                        "f": articleFeatures,
                        "pubd": publish_date,
                        "procd": processed_date,
                        "url": articleURL,
                        "t": articleTitle,
                        "abs": summaryText,
                        "sr": articleSource,
                        "k": keywords,
                        "fp": articleFilePath,
                        "m": None
                    }]
                    # Write article to MongoDB collection
                    try:
                        article_id = articles.insert(article)
                    except MongoException.DuplicateKey:
                        print "Duplicate key: " + code
                    #print "Inserted into articles: " + articleTitle.encode('ascii', 'ignore')
                    if (fullArticleText is None):
                        fullArticleText = summaryText
                    # Insert into ElasticSearch
                    json_str = mk_es_json(code, fullArticleText, articleURL, articleTitle, summaryText, publish_date)
                    #print json_str
                    index = 'article'
                    index_type = 'text'
                    es_url = 'http://localhost:9200'
                    r = post_to_elastic_search(es_url, index, index_type, code, json_str)
                    print r
def results(algo=None):
    print("algorithm:", algo if algo in ALGOS else None)
    samples = 500  # up to 2000
    print("sample size:", samples)
    keys = glob.glob('Inspec/keys/*.key')
    res = [0] * samples
    if algo == 'textrank':
        # load a spaCy model, depending on language, scale, etc.
        nlp = spacy.load("en_core_web_sm")
        # add PyTextRank to the spaCy pipeline
        tr = pytextrank.TextRank()
        nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
    elif algo == 'sentiment_pos' or algo == 'sentiment_pos_tfidf':
        sid = SentimentIntensityAnalyzer()
    for i, key in enumerate(keys[:samples]):
        # get actual keywords
        key_file = open(key)
        whitespace = re.compile(r"\s+")
        # remove whitespace and convert to lowercase
        actual = [
            whitespace.sub(" ", w).strip().lower() for w in key_file.readlines()
        ]
        # get text document corresponding to current key
        num = re.findall(r'\d+', key)[0]
        doc = 'Inspec/docsutf8/{}.txt'.format(num)
        # get extracted keywords
        if algo == 'rake':
            extracted = rake(doc)
        elif algo == 'textrank':
            extracted = textrank(doc, nlp)
        elif algo == 'window':
            extracted = window(doc)
        elif algo == 'window_w_tf_idf':
            extracted = window_w_tf_idf(doc)
        elif algo == 'tf_idf':
            extracted = tf_idf(doc)
        elif algo == 'sentiment_pos':
            extracted = sentiment_pos(doc, sid)
        elif algo == 'sentiment_pos_tfidf':
            extracted = sentiment_pos_tfidf(doc, sid)
        else:
            extracted = extract(doc)
        # calculate results
        tp = len(set(extracted).intersection(set(actual)))  # number of true positives
        precision = tp / len(extracted)
        recall = tp / len(actual)
        f_measure = (2 * precision * recall) / (precision + recall) if precision + recall else 0
        res[i] = (precision, recall, f_measure)
    # calculate average results
    avg_res = [sum(x) / len(x) for x in zip(*res)]
    print("precision: {}, recall: {}, F-measure: {}".format(*avg_res))
def linkedin_summary(user):
    text = ''
    for line in user['ln']:
        text += ' ' + line
    return textrank(text)
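# Illustrative usage only (not from the original project): linkedin_summary()
# expects a dict whose 'ln' key holds the profile's text lines and returns
# whatever the shared textrank() helper produces for the concatenated text.
# The profile content below is made up.
example_user = {
    'ln': [
        'Software engineer with ten years of experience in search and NLP.',
        'Built ranking and summarization pipelines for large news corpora.',
    ]
}
# print(linkedin_summary(example_user))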
def summarize(text):
    # SPLIT TO PARAGRAPHS
    pre_paragraphs = text.split('\n')
    paragraphs = []
    for i, p in enumerate(pre_paragraphs):
        if not re.match(r'^\s*$', p) and (i == len(pre_paragraphs) - 1
                                          or re.match(r'^\s*$', pre_paragraphs[i + 1])):
            paragraphs.append(p)
    # print(f'Num of paragraphs: {len(paragraphs)}')
    # for i, p in enumerate(paragraphs):
    #     print(f'par#{i+1}: {p}')

    # SPLIT TO SENTENCES
    sentences = separator.separate(text)
    print(f'Num of sentences: {len(sentences)}')
    for i, s in enumerate(sentences):
        print(f'#{i+1}: {s}')

    # TOKENIZE
    stem = False
    if stem:
        tokenized_sentences = [[
            czech_stemmer.cz_stem(word, aggressive=True) for word in sentence
        ] for sentence in tokenize(sentences)]
    else:
        tokenized_sentences = tokenize(sentences)

    # REMOVE STOPWORDS
    tokenized_sentences_without_stopwords = remove_stop_words(
        tokenized_sentences, keep_case=False)
    sentences_without_stopwords_case = remove_stop_words(
        sentences, keep_case=True, is_tokenized=False, return_tokenized=False)
    print('===Sentences without stopwords===')
    for i, s in enumerate(tokenized_sentences_without_stopwords):
        print(f'''#{i+1}: {' '.join(s)}''')
    print('===Sentences without stopwords CASE===')
    for i, s in enumerate(sentences_without_stopwords_case):
        print(f'''#{i+1}: {s}''')

    # POS-TAG
    tagged_sentences = pos_tag(sentences_without_stopwords_case)
    print('=====Tagged_sentences=====')
    for i, s in enumerate(tagged_sentences):
        print(f'''#{i+1}: {s}''')

    counter = 0
    summary_length = max(min(round(len(sentences) / 4), 15), 3)  # length between 3-15 sentences
    ranked_sentence_indexes = textrank.textrank(tokenized_sentences, True, '3-1-0.0001')
    print(f'ranked_sentence_indexes: {ranked_sentence_indexes}')

    # summary = ''
    # # add 1st sentence always
    # summary += f'{sentences[0]}\n'
    # counter += 1
    # ranked_sentence_indexes.remove(0)
    #
    # # add also 2nd sentence if it is in top 50%
    # if 1 in ranked_sentence_indexes[:len(ranked_sentence_indexes) // 2]:
    #     summary += f'{sentences[1]}\n'
    #     counter += 1
    #     ranked_sentence_indexes.remove(1)
    #
    # for sentence_index in sorted(ranked_sentence_indexes[:summary_length - counter]):
    #     if counter == summary_length:
    #         break
    #     summary += f'{sentences[sentence_index]}\n'
    #     counter += 1
    # summary += f'::::: Sentences in original: {len(sentences)}. Sentences in summary: {summary_length}. :::::'

    # add 1st sentence always
    summary = []
    summary.append(sentences[0])
    counter += 1
    ranked_sentence_indexes.remove(0)

    # add also 2nd sentence if it is in top 50%
    if 1 in ranked_sentence_indexes[:len(ranked_sentence_indexes) // 2]:
        summary.append(sentences[1])
        counter += 1
        ranked_sentence_indexes.remove(1)

    for sentence_index in sorted(ranked_sentence_indexes[:summary_length - counter]):
        if counter == summary_length:
            break
        summary.append(sentences[sentence_index])
        counter += 1

    return summary
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-q", "--query", nargs='?', default='Airbus Subsidies',
                        type=str, help='query')
    args = parser.parse_args()

    corpus_path = 'apnews_sen/apnews_sen.dat'
    with open(corpus_path, 'r') as corpus:
        combined_document = corpus.read()
        corpus.seek(0)
        documents = corpus.readlines()
    combined_document = combined_document[:1000]
    documents = documents[:1695]
    print(len(documents))
    print(len(set(documents)))
    N_docs = len(documents)

    # run BM25
    searcher = Searcher('apnews-config.toml')
    search_results = searcher.search(args.query, num_results=N_docs)
    dupe_dict = dict()
    for (doc_id, _) in search_results:
        if doc_id in dupe_dict:
            print('oh no')
            return
        dupe_dict[doc_id] = True
    combined_document = searcher.get_stringified_list(search_results)

    # run textrank from law__--less
    tokenized_sentences = tokenizer.remove_stopwords_and_clean(combined_document)
    # M_adj = graph_builder.create_sentence_adj_matrix(tokenized_sentences).astype(float)
    # word_model = ModelGen.train_model(tokenized_sentences)
    word_model = gensim.models.doc2vec.Doc2Vec.load('model/apnews_sen_model.model')
    graph_model = DocumentGraph.DocumentGraph(tokenized_sentences, word_model)
    M_adj = graph_model.similarity_matrix
    M_adj = M_adj / np.sum(M_adj, axis=1)
    eigen_vectors = np.array(textrank.textrank(M_adj, d=.85))
    scores = textrank.get_sentence_scores(tokenized_sentences, eigen_vectors)
    assert (len(combined_document) == len(scores))
    print(scores)

    all_scores = np.ndarray((scores.shape[0], 2))
    all_scores[:, 1] = scores
    all_scores[:, 0] = [result[1] for result in search_results]
    z_scores = (all_scores - np.mean(all_scores, axis=0)) / np.std(all_scores, axis=0)
    averaged_scores = np.mean(z_scores, axis=1)

    # https://stackoverflow.com/questions/6618515/sorting-list-based-on-values-from-another-list
    sorted_docs = [
        doc for (avg_score, doc)
        in sorted(zip(averaged_scores, combined_document), reverse=True)
    ]
    print(sorted_docs)
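# The block above fuses BM25 scores and TextRank scores by converting each
# score column to z-scores and averaging them. The snippet below is a tiny,
# self-contained illustration of that fusion step with made-up numbers (not
# project data), included only to make the normalisation explicit.
import numpy as np

bm25 = np.array([12.0, 7.5, 3.2])     # hypothetical BM25 scores for 3 documents
trank = np.array([0.41, 0.22, 0.37])  # hypothetical TextRank scores for the same documents
all_scores = np.column_stack([bm25, trank])
z = (all_scores - all_scores.mean(axis=0)) / all_scores.std(axis=0)
fused = z.mean(axis=1)                # per-document combined score
order = np.argsort(-fused)            # best documents first
print(order, fused[order])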
# NOTE: this snippet begins mid-statement; the else below closes an
# `if os.path.exists(outDir):` block like the commented-out dataset-2 setup
# at the end of the file.
else:
    os.makedirs(outDir)

path = "data/body/"
filens = os.listdir(path)
stops = stopwords.words('english')
for idf, filen in enumerate(filens):
    '''
    idf=0
    filen=filens[idf]
    '''
    print(idf)
    fpath = path + filen
    sentence, word, textlength, posword = preprocess(fpath, stops)
    #nodes = textrank(word)
    nodes = textrank(posword)
    #sentences = sentence_weight(sentence,word,nodes)
    sentences = sentence_weight(sentence, posword, nodes)
    #generate_summary_bylength(sentences,word,nodes,filen, outDir,l)
    generate_summary_bycompression(sentences, filen, outDir, l, textlength)
    #generate_summary_bylength_mmr(sentences,word,nodes,filen, outDir,l,0.7)
    #generate_summary_bycompression_mmr(sentences,word,nodes,filen, outDir,l,textlength,0.9)

# dataset 2
# l = [70,80,90,95,98]
# #l = [50,100,150,200]
# outDir = 'data2/data2-sys-summary-sen_avg-compression-300/'
# if os.path.exists(outDir):
#     shutil.rmtree(outDir)
#     os.makedirs(outDir)
# else: