def summarize(text, n_sentences, sep='\n'):
    '''
    Args:
        text (str or file): text itself or file in memory of text
        n_sentences (int): number of sentences to include in summary

    Kwargs:
        sep (str): separator to join summary sentences

    Returns:
        (str) n_sentences-long, automatically-produced summary of text
    '''
    if isinstance(text, str):
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    elif isinstance(text, io.IOBase):  # Python 3: io.IOBase replaces the removed built-in `file`
        parser = PlaintextParser.from_file(text, Tokenizer(LANGUAGE))
    else:
        raise TypeError('text must be either str or file')
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    # honour the documented `sep` kwarg instead of hard-coding '\n'
    return sep.join(str(s) for s in summarizer(parser.document, n_sentences))
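# A minimal usage sketch for summarize() above (an assumption, not part of the
# original snippet): it presumes these sumy imports, with Summarizer bound to
# LsaSummarizer, and a module-level LANGUAGE constant.
import io
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.utils import get_stop_words

LANGUAGE = "english"

sample = ("Sumy picks the most salient sentences from a text. "
          "Several algorithms such as LSA and LexRank are available. "
          "Here we ask for a two-sentence summary joined by spaces.")
print(summarize(sample, 2, sep=' '))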
def test_split_into_words(self):
    sentences1 = PlaintextParser.from_string("One, two two. Two. Three.", Tokenizer("english")).document.sentences
    self.assertEqual(["One", "two", "two", "Two", "Three"], _split_into_words(sentences1))
    sentences2 = PlaintextParser.from_string("two two. Two. Three.", Tokenizer("english")).document.sentences
    self.assertEqual(["two", "two", "Two", "Three"], _split_into_words(sentences2))
def summarize_with_info(self, corpus, length, algorithm):
    parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))
    if algorithm == "textrank":
        summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lexrank":
        summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "luhn":
        summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "edmundson":
        summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
        summarizer.bonus_words = parser.significant_words
        summarizer.stigma_words = parser.stigma_words
    elif algorithm == "kl":
        summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lsa":
        summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "sumbasic":
        summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "random":
        summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
    else:
        # NotImplemented is a constant, not an exception; raise the error class
        raise NotImplementedError("Summary algorithm is not available")
    summarizer.stop_words = get_stop_words(self.LANGUAGE)
    return summarizer(parser.document, length)
def summarizeFile(inputFile):
    summarizer = LsaSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("english")

    url = findURLS(inputFile)
    urlContent = ''
    if url is not None:
        if url[-1] == '.':
            url = url[:-1]
        try:
            parser = HtmlParser.from_url(url, Tokenizer("english"))
            for sentence in summarizer(parser.document, 3):
                urlContent += str(sentence) + '\n'
        except Exception:  # swallow fetch/parse failures; fall back to file content only
            urlContent = ''

    content = inputFile.read()
    parser = PlaintextParser.from_string(content, Tokenizer(LANGUAGE))
    summary = ''
    try:
        for sentence in summarizer(parser.document, SENTENCES_COUNT_1):
            summary += str(sentence) + '\n'
    except AssertionError:
        return None

    if url is not None:
        return summary + urlContent
    return summary
def get_summary(source_text, compression_factor):
    """
    Given some input source_text, returns its summary based on the
    chosen compression factor.
    """
    summary = {
        'source_text': source_text,
        'compression_factor': compression_factor,
        'summary': '',
        'success': False
    }
    parser = PlaintextParser.from_string(source_text, Tokenizer("english"))
    summ_algo = LexRankSummarizer()
    # approximate the sentence count by counting periods, then compress
    final_line_num = int(source_text.count('.') / compression_factor)
    try:
        raw_summary = summ_algo(parser.document, final_line_num)
        for sentence in raw_summary:
            summary['summary'] += str(sentence) + ' '
    except Exception:  # avoid a bare except, but still swallow summarizer failures
        pass
    summary['success'] = (len(summary['summary']) != 0)
    return summary
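# Hedged example for get_summary() above; the sumy imports are an assumption
# about what the snippet's module already provides. Note that counting '.' only
# approximates the sentence count (abbreviations inflate it).
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

result = get_summary("One. Two. Three. Four. Five. Six.", 3)
if result['success']:
    print(result['summary'])  # int(6 / 3) = 2 sentences expected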
def test_get_word_ngrams(self):
    sentences = PlaintextParser.from_string("This is a test.", Tokenizer("english")).document.sentences
    correct_ngrams = [("This", "is"), ("is", "a"), ("a", "test")]
    found_ngrams = _get_word_ngrams(2, sentences)
    for ngram in correct_ngrams:
        self.assertTrue(ngram in found_ngrams)
def _firstK_score(storyName, highlightName):
    parser = PlaintextParser.from_file(storyName, Tokenizer(LANGUAGE))
    geneSen = parser.document.sentences[:SENTENCES_COUNT]
    refSen = PlaintextParser.from_file(highlightName, Tokenizer(LANGUAGE)).document.sentences
    try:
        return evaluate(geneSen, refSen)
    except Exception as e:
        print(storyName)  # Python 3 print function instead of the Python 2 statement
        print(e)
        raise
def summarize(self, corpus, length, algorithm):
    parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))
    if algorithm == "textrank":
        summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lexrank":
        summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "luhn":
        summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "edmundson":
        summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "kl":
        summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lsa":
        summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "sumbasic":
        summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "random":
        summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
    else:
        raise NotImplementedError("Summary algorithm is not available")
    summarizer.stop_words = get_stop_words(self.LANGUAGE)
    # str(sentence) is the public way to get a sentence's text (avoids ._text)
    summary = " ".join(str(obj) for obj in summarizer(parser.document, length))
    return summary
def kl_rank_sum(path, K):
    # NOTE: despite the name, this uses LexRank, not KL-sum
    parser = PlaintextParser.from_file(path, Tokenizer("english"))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, K)  # K = number of sentences
    return summary
def summarize(corpus, length, algorithm):
    summarizer = None
    summary = ("No compatible summarizer was selected, please use one of these: "
               "textrank, lexrank, luhn, edmundson*, kl, lsa, sumbasic, random "
               "(* doesn't work yet)")
    algorithm = algorithm.lower()
    try:
        parser = PlaintextParser.from_string(corpus, Tokenizer(LANGUAGE))
        if algorithm == "textrank":
            summarizer = TextRankSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "lexrank":
            summarizer = LexRankSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "luhn":
            summarizer = LuhnSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "edmundson":
            summarizer = EdmundsonSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "kl":
            summarizer = KLSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "lsa":
            summarizer = LsaSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "sumbasic":
            summarizer = SumBasicSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "random":
            summarizer = RandomSummarizer(Stemmer(LANGUAGE))
        if summarizer:
            summarizer.stop_words = get_stop_words(LANGUAGE)
            summary = " ".join(str(obj) for obj in summarizer(parser.document, length))
        return summary
    except Exception as e:
        return str(e)
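# Usage sketch for the dispatcher above, assuming LANGUAGE = "english" and the
# sumy summarizer classes imported under the names the snippet uses:
text = "Cats purr when content. Dogs bark at strangers. Birds sing at dawn."
print(summarize(text, 1, "TextRank"))   # algorithm is lower-cased internally
print(summarize(text, 1, "unknown"))    # falls through to the fallback message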
def summarize(string, summary_length=1, language="english"):
    # all-caps text makes the plaintext parser treat lines as headings,
    # leaving nothing to summarize, so lower-case it first
    string = string.lower() if string.isupper() else string
    parser = PlaintextParser.from_string(string, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    return ". ".join(str(sentence) for sentence in summarizer(parser.document, summary_length))
def sumrise(text, sentences=5):
    if validators.url(text):
        text = web2text.getwebtxt(text)
    parser = PlaintextParser.from_string(text, Tokenizer('english'))
    summarizer = LsaSummarizer()
    # join the sentences; str() of the raw result tuple would give an ugly repr
    summary = ' '.join(str(s) for s in summarizer(parser.document, sentences))
    return summary
def summarize(self, extracted_refs, facet_results, max_length=250, mode='citance'):
    '''
    Summarizes the extracted references based on community detection

    Args:
        extracted_refs(list) -- results of the method.run (e.g. simple.py)
        facet_results(dict) -- facets for each extracted reference
            Look at data/task1b_results1.json
        max_length(int) -- maximum length of the summary
        mode(str) -- can be citance, reference
    '''
    citances = defaultdict(list)
    summarizer = LexRankSummarizer(Stemmer('english'))
    summary = defaultdict(lambda: defaultdict(list))
    for t in extracted_refs:
        citances[t[0]['topic']].append(
            {'refs': t[0]['sentence'],
             'citance': self.clean_citation(t[0]['citation_text'])})

    for topic, citance in citances.items():
        # Create graph of citation similarities
        vectorizer = TfidfVectorizer(
            tokenizer=self.tokenize, min_df=1, max_df=len(citances) * .9)
        cit_vectors = vectorizer.fit_transform(
            [e['citance'] for e in citance]).toarray()
        cit_dict = {i: v for i, v in enumerate(cit_vectors)}
        cits = []
        for e in cit_dict:  # vector (numpy array)
            for e1 in cit_dict:
                if e != e1:
                    simil = self.cossim(cit_dict[e], cit_dict[e1])
                    if simil > 0.1:
                        cits.append((e, e1, simil))
        G = nx.Graph()
        G.add_weighted_edges_from(cits)
        part = community.best_partition(G)
        clusters = defaultdict(list)
        tokenize = SentTokenizer(offsets=False)
        for k, v in part.items():
            clusters[v].extend(tokenize(citance[k]['refs']))
        # clusters includes ref sentences that belong in each cluster
        # Find the most salient sentence in each cluster
        sal_in_cluster = {}  # salient sentences for each cluster
        for i in clusters:
            parser = PlaintextParser.from_string(
                ' '.join(clusters[i]).replace('\\', ''), Tokenizer('english'))
            summ = summarizer(parser.document, 5)
            # 5 is the number of sentences returned by LexRank
            sal_in_cluster[i] = [str(s) for s in summ]
        # The most salient sentences in each cluster
        summary[topic.upper()] = \
            self.pick_from_cluster(sal_in_cluster, max_length, weighted=False)
    return summary
def summary():
    max_sent = 10
    language = 'english'
    url = request.form['summary']
    tokenizer = Tokenizer(language)
    article = alt_extract(url)
    parser = PlaintextParser.from_string(article, tokenizer)
    summary = summarizer(parser, max_sent, language).decode('utf-8')
    return render_template('summary.html', url=url, summary=summary)
def summarize(self, extracted_refs, facet_results, max_length=250):
    '''
    Summarizes the extracted references based on the facet results

    Args:
        extracted_refs(list) -- results of the method.run (e.g. simple.py)
        facet_results(dict) -- facets for each extracted reference
            Look at data/task1b_results1.json
        max_length(int) -- maximum length of the summary
    '''
    summaries = defaultdict(lambda: defaultdict(list))
    for t in extracted_refs:
        topic = t[0]['topic']
        citance = t[0]['citance_number']
        if isinstance(t[0]['sentence'][0], list):
            logger.warning('Unexpected, should check')
        summaries[topic.upper()][
            facet_results[topic.upper()][str(citance)]['SVM_LABEL']
        ].append([t[0]['citation_text']])
    summarizer = TextRankSummarizer(Stemmer('english'))
    final_summ = defaultdict(lambda: defaultdict(dict))
    ret_summ = defaultdict(list)
    counts = defaultdict(lambda: defaultdict(dict))
    for t in summaries:
        for facet in summaries[t]:
            if len(summaries[t][facet]) > 1:
                summs = list(itertools.chain.from_iterable(summaries[t][facet]))
                parser = PlaintextParser.from_string(' '.join(summs), Tokenizer('english'))
                summ = summarizer(parser.document, max_length)
                final_summ[t][facet] = [str(sent) for sent in summ]
                counts[t][facet] = len(final_summ[t][facet])
            else:
                final_summ[t][facet] = self.s_t(summaries[t][facet][0])
        # round-robin over facets until the word budget is filled, then trim
        i = 0
        while self.w_t.count_words(ret_summ[t]) < max_length:
            for fct in final_summ[t]:
                if i < len(final_summ[t][fct]):
                    ret_summ[t].append(final_summ[t][fct][i])
            i += 1
        while self.w_t.count_words(ret_summ[t]) > max_length:
            ret_summ[t].pop()
    return ret_summ
def summarize(text):
    total = ""
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        total += str(sentence) + " "  # separate sentences; they ran together before
    return total.strip()
def getSummary(self, num_sentences):
    lex_rank = LexRankSummarizer()
    text = str(self.bpLargGetText())
    parser = PlaintextParser.from_string(text, Tokenizer('english'))
    summary = lex_rank(parser.document, num_sentences)
    sentences = []
    for sent in summary:
        sentences.append(str(sent))
    return sentences
def summary(text, summarizer_class):
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = summarizer_class(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        yield sentence
def get_summary(self, text):
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, 3)  # summarize the document with 3 sentences
    result = ""
    for sentence in summary:
        result += " " + str(sentence)
    return result
def summarizeText(self, body, numSentences=10):
    """Summarizes body of text to numSentences"""
    parser = PlaintextParser.from_string(body, Tokenizer(self.LANG))
    stemmer = Stemmer(self.LANG)
    summarizer = SumySummarizer(stemmer)
    summarizer.stop_words = get_stop_words(self.LANG)
    # str(sentence) is already text in Python 3; no .decode('utf-8') needed
    summary = ' '.join(str(sentence) for sentence in summarizer(parser.document, numSentences))
    return summary
def summarize(filename, num_sentences):
    with open(filename, "r") as myfile:
        data = myfile.read()
    parser = PlaintextParser.from_string(data, Tokenizer('english'))
    summarizer = LsaSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("english")
    summary = ""
    for sentence in summarizer(parser.document, num_sentences):
        # strip quotes; in Python 3 str(sentence) replaces __unicode__().encode(...)
        summary += str(sentence).replace('"', '').replace("'", '').strip() + " "
    return summary
def summarize(content):
    parser = PlaintextParser.from_string(content.body, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    text = '\n'.join(
        [str(sentence) for sentence in summarizer(parser.document, COUNT)]
    )
    summary = Summary(content=content, summary=text)
    summary.save()
def test_real_example(self):
    """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
    parser = PlaintextParser.from_string(
        load_resource("snippets/prevko.txt"),
        Tokenizer("czech")
    )
    summarizer = LsaSummarizer(Stemmer("czech"))
    summarizer.stop_words = get_stop_words("czech")
    sentences = summarizer(parser.document, 2)
    self.assertEqual(len(sentences), 2)
def test_article_example():
    """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
    parser = PlaintextParser.from_string(
        load_resource("articles/prevko_cz_1.txt"),
        Tokenizer("czech")
    )
    summarizer = LsaSummarizer(Stemmer("czech"))
    summarizer.stop_words = get_stop_words("czech")
    sentences = summarizer(parser.document, 20)
    assert len(sentences) == 20
def summarize(text):
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    result = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        result += str(sentence) + " "
    return result
def test_issue_5_sigma_can_multiply_matrix_v(self):
    """Source: https://github.com/miso-belica/sumy/issues/5"""
    parser = PlaintextParser.from_string(
        load_resource("articles/sigma_can_multiply_matrix_v.txt"),
        Tokenizer("english")
    )
    summarizer = LsaSummarizer(english_stemmer)
    summarizer.stop_words = get_stop_words("english")
    sentences = summarizer(parser.document, 20)
    self.assertEqual(len(sentences), 20)
def summarize(text, size=2):
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, size)
    summarize_text = ""
    for sentence in summary:
        summarize_text += str(sentence) + " "
    return summarize_text.strip()
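# Quick self-contained check of the LexRank summarize() above; the imports are
# an assumption about what the snippet's module already provides:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

text = ("LexRank scores sentences by centrality in a similarity graph. "
        "Highly central sentences are kept. Peripheral ones are dropped.")
print(summarize(text, size=2))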
def lex_rank_sum(path, L):
    output = []
    parser = PlaintextParser.from_file(path, Tokenizer("english"))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, L)  # L = number of sentences
    for sentence in summary:
        # option for writing to a summary output file
        output.append(str(sentence))
    return output
def _summ_score(storyName, highlightName):
    parser = PlaintextParser.from_file(storyName, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    geneSen = summarizer(parser.document, SENTENCES_COUNT)
    refSen = PlaintextParser.from_file(highlightName, Tokenizer(LANGUAGE)).document.sentences
    try:
        return evaluate(geneSen, refSen)
    except Exception as e:
        print(storyName)  # Python 3 print function instead of the Python 2 statement
        print(e)
        raise
def get_summary(text, max_sentences=5):
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    stemmer = Stemmer("english")
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words("english")
    summary = []
    for sentence in summarizer(parser.document, max_sentences):
        # drop non-ASCII chars; decode back to str so we don't store b'...' reprs
        summary.append(str(sentence).encode('ascii', 'ignore').decode('ascii'))
    return summary
def main(args=None):
    summarizer, document, items_count, reference_summary = handle_arguments()
    evaluated_sentences = summarizer(document, items_count)
    reference_document = PlaintextParser.from_string(reference_summary, Tokenizer(language))
    reference_sentences = reference_document.document.sentences
    for name, evaluate_document, evaluate in AVAILABLE_EVALUATIONS:
        if evaluate_document:
            result = evaluate(evaluated_sentences, document.sentences)
        else:
            result = evaluate(evaluated_sentences, reference_sentences)
        print("%s: %f" % (name, result))
    return 0
def __init__(self, transcript_file_path, summary_number):
    """
    Input a transcript_file_path in the form of a string and a summary_number
    denoting the number of sentences requested in the summary.
    """
    self.transcript_file = transcript_file_path
    with open(self.transcript_file, "r") as f:
        full_transcript_text = f.read()
    self.tokenized_transcript = sent_tokenize(full_transcript_text)
    LANGUAGE = "english"  # sumy expects lower-case language names
    parser = PlaintextParser.from_file(self.transcript_file, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    self.summary = summarizer(parser.document, summary_number)
def summarize(self, fields):
    """ returns the summary of a hit to facilitate building a bulk update """
    assert self.content_field in fields
    content = fields[self.content_field][0]
    language = fields[self.lang_field][0] if self.lang_field in fields else 'en'
    language = LANGUAGE_MAP[language]
    parser = PlaintextParser.from_string(content, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = LexRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    sentences = [str(s) for s in summarizer(parser.document, self.count)]
    summary = ' '.join(sentences)
    return summary
def extract_summary_keywords(trend, urls, titles):
    total_articles_content = extract_text(urls)
    keywords = extract_keywords_from_all_text(total_articles_content, titles)
    current_path = os.path.dirname(os.path.realpath(__file__))
    # os.path.join instead of a hard-coded '\\' separator, for portability
    current_path = os.path.join(current_path, trend + '.txt')
    with open(current_path, 'w') as the_file:
        the_file.write(total_articles_content)
    parser = PlaintextParser.from_file(current_path, Tokenizer(LANGUAGE))
    os.remove(current_path)
    sentences = ''
    for sentence in summarizer(parser.document, 12):
        sentences = sentences + ' ' + str(sentence)
    replaced_syn = replacesynonym(sentences)
    matches = tool.check(sentences)
    correct_summary = language_check.correct(sentences, matches)
    return correct_summary, keywords
def summarize():
    """ Returns summary of articles """
    text = request.form['text']
    # for URLs use: parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    final = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        final.append(str(sentence))
    length = len(final)
    return render_template('paraphrase.html', report=final, length=length)
def summy_lex_rank_process_article_file(file_path):
    sents = []
    with io.open(file_path, 'r', encoding='utf-8') as article_file:
        for line in article_file:
            if line.find('@highlight') != -1:
                break
            line = line.strip()
            sents.extend(sent_tokenize(line))
    parser = PlaintextParser.from_string(' '.join(sents), Tokenizer('english'))
    summarizer = LexRankSummarizer()
    # summarize the document with NUM_SUM_SENTS sentences
    sums = summarizer(parser.document, NUM_SUM_SENTS)
    res_list = []
    for summary in sums:
        res_list.append(str(summary))
    return res_list
def test_document_is_all_in_upper_case():
    """
    When all words are in upper case, the plaintext parser treats the first
    line as a heading and the LexRank algorithm raises
    "ZeroDivisionError: float division by zero" because there is no sentence
    to summarize.
    See https://github.com/miso-belica/sumy/issues/25
    """
    parser = PlaintextParser.from_string(
        "JUST WRITING SOME TEXT. TO TEST CASE. WITH ZERO SENTENCES RETURNED. FROM TOKENIZER.",
        Tokenizer("english")
    )
    summarizer = LexRankSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("english")
    sentences = summarizer(parser.document, 20)
    assert len(sentences) == 0
def get_summary(self):
    # The text has one sentence per line, so split on newlines
    sentences = [t for t in self._text.split('\n')]
    for i in range(1):
        print(sentences[i])

    # Build the morphological analyzer
    analyzer = Analyzer(
        [UnicodeNormalizeCharFilter(),
         RegexReplaceCharFilter(r'[(\)「」、。]', ' ')],  # replace (, ), 「, 」, 、 and 。 with spaces
        JanomeTokenizer(),
        [POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
         ExtractAttributeFilter('base_form')]  # keep only base forms of nouns, adjectives, adverbs and verbs
    )

    # Join the extracted words with spaces.
    # The trailing '。' lets tinysegmenter split the text back into sentences later.
    corpus = [' '.join(analyzer.analyze(s)) + '。' for s in sentences]

    # Tokenize the joined corpus again with tinysegmenter
    parser = PlaintextParser.from_string(''.join(corpus), Tokenizer('japanese'))

    # Extract a two-sentence summary with LexRank
    summarizer = LexRankSummarizer()
    # a space is recognised as a token, so exclude it via the stop words
    summarizer.stop_words = [' ']
    self.summary = summarizer(document=parser.document, sentences_count=2)

    # Print the original sentences
    for sentence in self.summary:
        print(sentences[corpus.index(str(sentence))])
def summarize(selected_text, n=3):
    # a plaintext parser is chosen here; other parsers are available for HTML etc.
    from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer
    # LexRank is chosen here; other algorithms are also built in
    from sumy.summarizers.lex_rank import LexRankSummarizer

    output = ''
    parser = PlaintextParser(selected_text, Tokenizer("english"))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, n)  # summarize the document with n sentences
    for sentence in summary:
        output += str(sentence)
    return output
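# Since the function above carries its own imports, a sketch of calling it only
# needs sumy installed (plus NLTK punkt data for the English tokenizer):
print(summarize("First point. Second point. Third point. Fourth point.", n=2))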
def generating_transcript(para):
    temp = ""  # accumulator; must be initialised before the loop appends to it
    for i in range(len(para)):
        with open("f1.txt", "w+") as f1:
            f1.write(para[i].text + "\n")
        parser = PlaintextParser.from_file("f1.txt", Tokenizer("english"))
        summarizer = LexRankSummarizer()
        summary = summarizer(parser.document, 2)
        for sentence in summary:
            if len(str(sentence)) > 30:
                temp += str(sentence) + "\n\n"
    with open("f1.txt", "w+") as f1:
        f1.write(temp)
    return temp
def boring():
    boringStuff = request.form['boringstuff']
    parser = PlaintextParser.from_string(boringStuff, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    # form values are strings; the summarizer expects an integer sentence count
    sentencesCount = int(request.form['sentences']) if request.form['sentences'] else 3
    sentences = summarizer(parser.document, sentencesCount)
    return render_template('index.html', sentences=sentences,
                           sentencesCount=sentencesCount, boringStuff=boringStuff)
def models_LUHN_LEX_LSA_2(dataframe):
    ## Candidate models:
    # Bag of Words
    # FastText
    # word2vec
    # LDA (topic extra)
    # skip-thoughts
    # doc2vec
    # LSTM
    LANGUAGE = "english"
    stop = get_stop_words(LANGUAGE)
    size = len(dataframe)
    stemmer = Stemmer(LANGUAGE)
    for i in range(0, size):
        article = dataframe.loc[i, "post_content"]
        parser = PlaintextParser.from_string(article, Tokenizer(LANGUAGE))
        summarizerLUHN = LUHN(stemmer)
        summarizerLUHN.stop_words = stop
        summarizerLEX = LEX(stemmer)
        summarizerLEX.stop_words = stop
        summarizerLSA = LSA(stemmer)
        summarizerLSA.stop_words = stop
        # summarize each document with one sentence per algorithm
        LUHNsentence = summarizerLUHN(parser.document, 1)
        LEXsentence = summarizerLEX(parser.document, 1)
        LSAsentence = summarizerLSA(parser.document, 1)
        for sentence1 in LUHNsentence:
            LUHNsummary = sentence1
        for sentence2 in LEXsentence:
            LEXsummary = sentence2
        for sentence3 in LSAsentence:
            LSAsummary = sentence3
        dataframe.loc[i, "LUHN"] = LUHNsummary
        dataframe.loc[i, "LEX"] = LEXsummary
        dataframe.loc[i, "LSA"] = LSAsummary
def summarize(self, excerpt: str, len_s: int) -> str:
    parser = PlaintextParser.from_string(excerpt, Tokenizer('english'))
    document = parser.document
    dictionary = self.summarizer._create_dictionary(document)
    if dictionary is None:
        return excerpt
    words_count = len(dictionary)
    sentences_count = len(document.sentences)
    if words_count < sentences_count:
        return excerpt
    sents = self.summarizer(parser.document, len_s)
    return ' '.join(str(s) for s in sents)
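# Why the guards above matter, sketched on a degenerate input: four sentences
# built from a single distinct word give LSA fewer terms than sentences, so
# the method returns the excerpt unchanged rather than risk a failure.
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer

doc = PlaintextParser.from_string("Go. Go. Go. Go.", Tokenizer('english')).document
print(len(doc.sentences), "sentences, but only one distinct word")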
def run_single(self, document):
    parser = PlaintextParser.from_string(document, Tokenizer(self.language))
    document = parser.document
    self.summarizer._ensure_dependencies_installed()
    sentences_words = [self.summarizer._to_words_set(s) for s in document.sentences]
    if not sentences_words:
        return tuple()
    tf_metrics = self.summarizer._compute_tf(sentences_words)
    idf_metrics = self.summarizer._compute_idf(sentences_words)
    matrix = self.summarizer._create_matrix(
        sentences_words, self.summarizer.threshold, tf_metrics, idf_metrics)
    scores = self.summarizer.power_method(matrix, self.summarizer.epsilon)
    return list(map(str, document.sentences)), list(scores)
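# Usage sketch for run_single(): the raw LexRank scores let a caller rank
# sentences instead of truncating. `Ranker` is a hypothetical stand-in for
# whatever class owns run_single (it must expose .language and a sumy
# LexRankSummarizer as .summarizer); it is not from the original snippet.
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lex_rank import LexRankSummarizer

class Ranker:
    language = 'english'
    summarizer = LexRankSummarizer(Stemmer('english'))
    run_single = run_single  # reuse the function above as a method

sentences, scores = Ranker().run_single("One fact. Another fact. A third fact.")
top2 = [s for s, _ in sorted(zip(sentences, scores), key=lambda p: p[1], reverse=True)[:2]]
print(top2)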
def videoFunction(search, video_sentence_count, no_video):
    results = YoutubeSearch(search, max_results=no_video).to_json()
    results = json.loads(results)
    filtered_results = []
    for result in results['videos']:
        v = result['id']
        transcript_flag = True
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(v)
        except Exception:  # avoid a bare except
            print("no transcripts")
            transcript_flag = False
        if transcript_flag:
            try:
                transcript = transcript_list.find_manually_created_transcript(
                    ['en', 'en-UK', 'en-US'])
            except Exception:
                print("no transcript")
                transcript_flag = False
        if transcript_flag:
            transcript_proto = transcript.fetch()
            final_transcript = ''
            for obj in transcript_proto:
                final_transcript += obj['text'] + ' '
            key_words = keywords(final_transcript, ratio=0.1)
            parser = PlaintextParser.from_string(final_transcript, Tokenizer("english"))
            stemmer = Stemmer(LANGUAGE)
            summarizer = Summarizer(stemmer)
            summarizer.stop_words = get_stop_words(LANGUAGE)
            summary = ''
            for sentence in summarizer(parser.document, video_sentence_count):
                summary += str(sentence)
            result['summary'] = summary
            result['id'] = result['id'].split('&')[0]
            result['keywords'] = key_words
            filtered_results.append(result)
    return filtered_results
def test_real_example():
    parser = PlaintextParser.from_string(
        "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením. "
        "Přerostly až v reparát z jazyka na konci školního roku. "
        "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, což se chlapci ani trochu nelíbilo. "
        "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě "
        "o rok mladších dětí budoval vedoucí pozici. "
        "Dost razantně. Fyzickou převahu měl, takže to nedalo až tak moc práce.",
        Tokenizer("czech"))
    summarizer = LuhnSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("czech")
    returned = summarizer(parser.document, 2)
    assert list(map(to_unicode, returned)) == [
        "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením.",
        "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě o rok mladších dětí budoval vedoucí pozici.",
    ]
def response():
    if request.method == 'POST':
        text_org = request.json['foo']
        text = json.loads(json.dumps(text_org))
        # escape the brackets; an unescaped ']' ends the character class early
        text = re.sub(r'[^A-Za-z0-9()\[\]]', ' ', str(text))
        text = text.lower()
        if len(text.split()) <= 3:
            return 'please give some more sentences.'
        else:
            parser = PlaintextParser.from_string(text, Tokenizer('english'))
            sum_1 = summarizer(parser.document, 10)
            # list.append returns None, so collect the text first, then join
            sum_lex = [str(sent) for sent in sum_1]
            resp = ' '.join(sum_lex)
            return resp
def summarize():
    SENTENCES_COUNT = numOfSent.get()
    parser = PlaintextParser.from_file(fileName.cget("text"), Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    outputFile = open("C://Users//rakesh chandra//Desktop//ATS//output.txt", 'w')
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        outputFile.write("-> ")
        outputFile.write(str(sentence))
        outputFile.write("\n \n")
    outputFile.close()  # close (and flush) before opening in the default viewer
    os.startfile(fileName.cget("text"))
    os.startfile("C://Users//rakesh chandra//Desktop//ATS//output.txt")
def summarization(text, alg="lexrank"):
    parser = PlaintextParser.from_string("".join(text), Tokenizer("japanese"))
    if alg == "lexrank":
        summarizer = LexRankSummarizer()
    elif alg == "textrank":
        summarizer = TextRankSummarizer()
    elif alg == "lsa":
        summarizer = LsaSummarizer()
    else:
        # raise a proper built-in error instead of Exception("IllegalArgumentException")
        raise ValueError("unknown summarization algorithm: %s" % alg)
    # a space is recognised as a token, so exclude it via the stop words
    summarizer.stop_words = [" "]
    abst = summarizer(document=parser.document, sentences_count=5)
    return ["".join(x.words) for x in abst]
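# Hedged usage sketch for summarization() above: assumes sumy can tokenize
# Japanese (the tinysegmenter extra) and that each list element is one sentence.
text = ["メロスは激怒した。",
        "必ず、かの邪智暴虐の王を除かなければならぬと決意した。",
        "メロスには政治がわからぬ。"]
print(summarization(text, alg="lexrank"))  # returns up to five joined-word sentences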