def aggregate_file(interval, file_name, start=None):
    """
    Aggregate the data within the windows of time.

    interval: time in seconds to aggregate data
    file_name: which file to record
    start: start time to record data; if none is given, it starts from the beginning.
    returns: array of the aggregated data in each interval
    """
    if start is None:
        start = get_start_time_for(file_name)
    start = datetime.strptime(start, TIME_FORMAT)
    summaries = [Summarizer() for _ in range(10)]
    with open(file_name, 'r') as data:
        headers = data.readline().strip().lower().split(',')
        for line in data:
            args = line.strip().split(',')
            time = datetime.strptime(args[0], TIME_FORMAT)
            window = int((time - start).total_seconds() / interval)
            if window < 0:
                continue
            # Grow the list only until it covers the current window.
            while window >= len(summaries):
                summaries.append(Summarizer())
            item = dict(zip(headers, args))
            summaries[window].add(item)
    return [s for s in summaries if s.used]
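# A minimal usage sketch for aggregate_file (the file name and the 5-minute
# interval are illustrative; assumes Summarizer, TIME_FORMAT and
# get_start_time_for from this module):
if __name__ == '__main__':
    windows = aggregate_file(300, 'capture.featureset.csv')
    print('{} non-empty windows'.format(len(windows)))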
def summarization():
    try:
        json_data = request.get_json(force=True)
        text = json_data.get('text')
        method = json_data.get('method', 'T5')
        pretrained = json_data.get('pretrained', 't5-large')
        # Initialize the Summarizer and run it on the input text.
        s = Summarizer(method=method, pretrained=pretrained)
        pred = s.summarize(text)
        if isinstance(pred, dict):
            summary = pred['summary']
            data = dict()
            data['text'] = text
            data['summary'] = summary
            data['summary_len'] = Utility.get_doc_length(summary)
            data['text_len'] = Utility.get_doc_length(text)
            return json.dumps(data)
        else:
            return json.dumps(pred)
    except Exception as e:
        return {"Error": str(e)}
def Summ(spacy_model, body, ratio, coreference, greedyness, min_length, max_length, bert_model):
    # Load the Spanish BERT custom model and tokenizer
    # (output_hidden_states must be set to True).
    custom_config = AutoConfig.from_pretrained(pretrained_model_name_or_path=bert_model)
    custom_config.output_hidden_states = True
    custom_tokenizer = AutoTokenizer.from_pretrained(bert_model)
    custom_model = AutoModel.from_pretrained(bert_model, config=custom_config)

    # Summarization
    if coreference:
        # With coreference resolution
        handler = CoreferenceHandler(spacy_model=spacy_model, greedyness=greedyness)
        model = Summarizer(custom_model=custom_model,
                           custom_tokenizer=custom_tokenizer,
                           sentence_handler=handler)
        result = model(body, min_length=min_length)
    else:
        # Without coreference resolution
        model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
        result = model(body, ratio=ratio, min_length=min_length, max_length=max_length)
    summarized_text = ''.join(result)
    return summarized_text
def test_run():
    if path.exists("keys"):
        api = Summarizer("Manchester", "manchester", "keys")
        assert api.keyToVideo() == 0
    else:
        with open("data.json", "r") as f:
            return json.load(f)
def get_weight_sentence(document):
    text = document.text
    theme = document.theme
    lang = document.language
    nlp = nlp_it
    if lang == 'italian':
        model = Summarizer(sentence_handler=SentenceHandler(language=Italian))
        args = get_words(text, nlp)
    else:
        model = Summarizer(sentence_handler=SentenceHandler(language=Russian))
        args = get_words_for_ru(text)
    document.args = str(args)
    document.save()
    sentences = sent_tokenize(text)
    if lang == 'italian':
        sorted_sentences = get_sorted_sentence(sentences, nlp, text, args, theme, lang)
    else:
        sorted_sentences = get_sorted_sentence_for_ru(sentences, text, args, theme, lang)
    note = generate(sentences, sorted_sentences)
    note_with_ml = model(text)
    note_item = Note(document_id=document, text=note, text_for_algo="", text_for_ml=note_with_ml)
    note_item.save()
def post():
    try:
        url = request.args.get('url')
        summarizer = Summarizer(url)
        return render_template('post.html',
                               text=Markup(summarizer.summarize()),
                               img=summarizer.article.top_image,
                               title=summarizer.title), 200
    except:
        raise InvalidUsage('Oops, something went wrong. Please try again',
                           status_code=404)
def process_text(text):
    """
    Tries to get the summary, intro, and keywords.
    Returns is_valid indicating whether the text was okay.
    :param text:
    :return:
    """
    is_valid = True
    keywords = []
    summarizer = Summarizer(text)
    try:
        summary = summarizer.summarize()
        keywords = summarizer.keywords(number_keywords=number_keywords)
        phrases = summarizer.key_noun_phrases(number_noun_phrases=number_noun_phrase)
        phrases = [phrase.replace(" ", "_") for phrase in phrases]
        keywords = keywords + phrases
    except SummaryException:
        summary = [""]
        is_valid = False
    summary = " ".join(summary)
    return is_valid, summary, keywords
def inference(_id, corrector, segmenter):
    res = getBodyTopic(_id)
    body = res['text']
    topic = res['topic']
    username = res['username']
    length = res['length']
    body = preProcessTranscript(body, corrector)
    entities = entityRecog(_id)
    # topic = preProcessTranscript(topic)
    meeting_sentiment = sentiment(_id)
    # body = correct_sentence__test(body, corrector, segmenter)
    model = Summarizer()
    data = model.get_summary(body, topic)
    size = len(data)
    result = ""
    if size < length:
        result = [result + data[i]['sentence'] for i in range(len(data))]
    else:
        result = [result + data[i]['sentence'] for i in range(length)]
    result = " ".join(result)
    res = saveSummary(_id, result, _id, username, length, meeting_sentiment, entities, topic)
    return {"_id": str(res.inserted_id)}
def summarize():
    url = request.args.get('url')
    if url:
        summarizer = Summarizer(url)
        summary = summarizer.summarize()
        response = {'title': summarizer.title,
                    'summary': summary,
                    'img': summarizer.article.top_image}
        return jsonify(response)
    raise InvalidUsage('Oops, something went wrong. Please try again', status_code=404)
class TextTeaser(object):
    def __init__(self):
        self.summarizer = Summarizer()

    def summarize(self, title, text, category="Undefined", source="Undefined", count=5):
        result = self.summarizer.summarize(text, title, source, category)
        result = self.summarizer.sortSentences(result[:count])
        result = [res['sentence'] for res in result]
        return result
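# A minimal usage sketch for the TextTeaser wrapper above; the title and body
# text are illustrative placeholders, and the textteaser-style Summarizer is
# assumed to be importable as in the class definition.
teaser = TextTeaser()
sample_title = "Example article title"
sample_text = "First sentence of the article. Second sentence with more detail. A closing remark."
for sentence in teaser.summarize(sample_title, sample_text, count=2):
    print(sentence)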
def test_generate_summary(self):
    """Various test cases for summarizer.generate_summary."""
    summarizer_simple = Summarizer(self.simple_dataframe, self.config)
    output_df_simple = summarizer_simple.generate_summary()
    self.assertEqual(len(output_df_simple), 3)
    self.assertIn(1, output_df_simple['Size'].values)
    self.assertIn('test', output_df_simple['Text'].values)
    self.assertIn('test2', output_df_simple['Text'].values)
    self.assertIn('test3', output_df_simple['Text'].values)
    self.assertIn('', output_df_simple['ClassLines'].values)

    summarizer_stack_lines = Summarizer(self.stack_lines_dataframe, self.config)
    output_df_stack_lines = summarizer_stack_lines.generate_summary()
    self.assertIn('some.class.java', output_df_stack_lines['ClassLines'].values)
    self.assertIn('some.class2.java', output_df_stack_lines['ClassLines'].values)
    self.assertIn('some.class3.java', output_df_stack_lines['ClassLines'].values)
    self.assertIn('', output_df_stack_lines['Text'].values)

    summarizer_multi_cluster = Summarizer(self.multi_cluster_dataframe, self.config)
    output_df_multi_cluster = summarizer_multi_cluster.generate_summary()
    self.assertEqual(len(output_df_multi_cluster), 2)
    self.assertIn(2, output_df_multi_cluster['Size'].values)
    self.assertIn(1, output_df_multi_cluster['Size'].values)
def pre_processing(list_lineInDoc, caseIdx,
                   output_path="./output/train_data_preprocessed.txt",
                   separator=DEFAULT_SEPARATOR, write_valid=True, logPrint=False):
    '''
    To Jeff: Implement here.
    '''
    if logPrint:
        print("")
        print("\tPre-processing law case [%d]" % caseIdx)
    if caseIdx == 0 and logPrint:
        print("")
        print("\t\t1.Remove \\n, \\t")
        print("\t\t2.Remove Punctuations.")
        print("\t\t3.Split Words.")
        print("\t\t4.Remove StopWords.")
        print("\t\t5.Lemmatize Words.")

    ## 1. Remove '\n', '\t', etc. and make it readable.
    for idx in range(len(list_lineInDoc)):
        list_lineInDoc[idx] = " ".join(list_lineInDoc[idx].replace("\n", " ").split())

    ## 2. Analyse catchphrase and sentences.
    catchphrase = list_lineInDoc[0]
    sentences = list_lineInDoc[1:]
    summarizer = Summarizer()
    result = summarizer.summarize(sentences, catchphrase)

    ## 3. Save result.
    if write_valid:
        with open(output_path, 'a') as f:
            f.write(separator + str(caseIdx))
            f.write('\n')
            for i in range(0, len(result)):
                word_stat_line = result[i][0]
                for j in range(0, len(word_stat_line)):
                    f.write(str(word_stat_line[j]['word']))
                    f.write(":")
                    f.write(str(word_stat_line[j]['count']))
                    f.write(" ")
                f.write('\n')
    return result
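# A minimal usage sketch for pre_processing (the document lines are
# illustrative; assumes DEFAULT_SEPARATOR and the word-statistics Summarizer
# used above are defined in this module):
doc_lines = ["catchphrase: negligence of duty\n",
             "First sentence of the case body.\n",
             "Second sentence of the case body.\n"]
stats = pre_processing(doc_lines, caseIdx=0, write_valid=False, logPrint=True)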
class TextTeaser(object):
    def __init__(self):
        self.summarizer = Summarizer()

    def summarize(self, text, count):
        result = self.summarizer.summarize(text)
        # print(result)
        result = self.summarizer.sortSentences(result[:count])
        result = [res['sentence'] for res in result]
        return result
class nlps(object):
    def __init__(self):
        self.summarizer = Summarizer()

    def summarize(self, title, text, category="Undefined", source="Undefined", count=5):
        result = self.summarizer.summarize(text, title, source, category)
        result = self.summarizer.sortSentences(result[:count])
        result = [res['sentence'] for res in result]
        return result
def worker():
    i = 0
    while True:
        item = q.get()
        if item is None:
            print("Break! Item is None")
            break
        api = Summarizer(item[0], item[1], "keys")
        api.keyToVideo()
        i += 1
        print("-----Task{0}----".format(i))
        q.task_done()
def summary_of_detection(filename, model, use_bots=False, use_attack=False,
                         sample=False, use_ahead=False, steps_ahead=1, trees=50,
                         norm_and_standardize=False):
    """General call to train and test any model under any features.

    Args:
        filename: Aggregated file to get features and labels from.
        model: Abbreviated model name to use (rf: Random Forest, dl: Deep Learning,
            dt: Decision Trees).
        use_bots: Use bot type as the label (only general-*.featureset.csv files
            have this information).
        use_attack: Use attack type as the label (only all-*.featureset.csv files
            have this information).
        sample: Sample the featureset so that there is an equal number of attack
            and normal labels.
        use_ahead: Works only on files aggregated with the `--use_separator` flag
            turned on.
        steps_ahead: Only used if use_ahead is true. The number of time windows
            ahead to skip to get the new label for the current feature set.
        trees: The number of trees to use in the Random Forest.
        norm_and_standardize: Whether to normalize and standardize the numerical
            features.

    Returns:
        The accuracy, precision, recall, and f1_score of the model.
    """
    if use_ahead:
        xtrain, xtest, ytrain, ytest = get_ahead_feature_labels(
            filename, Summarizer().features, steps_ahead)
    else:
        xtrain, xtest, ytrain, ytest = get_features_labels_from(
            filename, Summarizer().features, use_bots, use_attack,
            sample=sample, norm_and_standardize=norm_and_standardize)
    if model == 'rf':
        clf = rf_train(xtrain, ytrain, use_attack, use_ahead, trees=trees)
    elif model == 'dt':
        clf = dt_train(xtrain, ytrain)
    elif model == 'dl':
        if use_bots:
            # Multi-label needs to be converted into a sparse array for Tensorflow.
            ytrain = to_tf_labels(ytrain)
            ytest = to_tf_labels(ytest)
        clf = dl_train(xtrain, ytrain, use_bots)
        return dl_test(clf, xtrain, ytrain, use_bots)
    if use_attack:
        return [x['micro'] for x in test(clf, xtest, ytest, use_bots, use_attack)]
    return test(clf, xtest, ytest, use_bots, use_ahead=use_ahead)
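# A minimal usage sketch (the featureset file name is illustrative; assumes the
# training/testing helpers above and that test() returns the accuracy,
# precision, recall and f1_score described in the docstring):
acc, precision, recall, f1 = summary_of_detection('all-10s.featureset.csv', 'rf',
                                                  sample=True, trees=100)
print(acc, precision, recall, f1)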
class ReviewSummaryAction(Action):
    summarizer = Summarizer()

    @staticmethod
    def review_summarization(reviews):
        message = "\n"
        for review in reviews:
            result = ReviewSummaryAction.summarizer(review["text"])
            full = ''.join(result)
            if full != '':
                message += ("\n-----\n" + full + " - " + review["author_name"]
                            + ", " + review["relative_time_description"])
        return message

    def name(self) -> Text:
        return "review_summary_action"

    def run(self, dispatcher: CollectingDispatcher, tracker: Tracker,
            domain: Dict[Text, Any]) -> List:
        facility_details = tracker.get_slot("facility_details")
        if facility_details is None:
            error_message = "You have to choose a facility first"
            dispatcher.utter_message(error_message)
            return []
        if "reviews" not in facility_details:
            review_message = ATMOSPHERE_DICT.get("review_messages")[-1]
            dispatcher.utter_message(review_message)
        else:
            summary_message = ReviewSummaryAction.review_summarization(
                facility_details["reviews"])
            print(summary_message)
            dispatcher.utter_message(summary_message)
def main():
    # Ask the user how many summaries they want.
    print("Welcome to CFS's Summary generator. This pulls articles from NLTK's "
          "Reuters corpus and summarizes them\n")
    try:
        numarts = int(input("How many articles do you wish us to summarize?: "))
    except Exception:
        print("Uh oh, looks like that wasn't a number... \n Please try again")
        return
    # Prevent users from asking for negative numbers or more than 5 at a time.
    if numarts < 1:
        print("I understand why you wouldn't want to read summaries of financial "
              "articles, but come on, at least one? =[ ")
        return
    if numarts > 5:
        print("Okay, do you really want to read more than five bland Reuters "
              "article summaries at a time?")
        print("Please try a more sensible number")
        return
    print()
    # Create our summarizer.
    cfs = Summarizer()
    getSummaries(cfs, numarts)
def summarize(body):
    model = Summarizer()
    result = model(body, min_length=20)
    full = ''.join(result)
    return full
def load_summarizer():
    try:
        summ = pickle.load(open('./models/summarizer.pkl', 'rb'))
    except:
        summ = Summarizer()
        pickle.dump(summ, open('./models/summarizer.pkl', 'wb'))
    return summ
def text_summarization(stt_file_path):
    """
    Take the speech-to-text output file path, apply BERT extractive text
    summarization, and return the text summary file path.
    """
    try:
        # Read data from the input file (sumtext.txt).
        inputfile = open(stt_file_path, "r")
        datainput = inputfile.read()
        print("Summarization processing ......")
        model = Summarizer()
        result = model(datainput, min_length=10)
        output_summary = ''.join(result)
        # Write data to the output file (bert-extractive-summarizer.txt).
        filename = 'summary-bert' + str(uuid.uuid4()) + '.txt'  # generate a unique file name
        out_path = 'upload/text_summarization/' + filename
        path = os.path.join(MEDIA_ROOT, out_path)
        file = open(path, "a")
        file.write(output_summary)
        file.close()
        print('success summary >>>>>>')
        return out_path
    # pylint: disable=bare-except
    except:
        print("An exception occurred in text summarization")
def load_summarizer():
    try:
        model = pickle.load(open('./models/summarizer.pkl', 'rb'))
    except:
        model = Summarizer()
        pickle.dump(model, open('./models/summarizer.pkl', 'wb'))
    return model
def load(self):
    try:
        logging.info('Loading BERTExtractiveSummarizer...')
        BERTExtractiveSummarizer.model = Summarizer()
    except Exception as e:
        raise ModelFailedLoadException(
            f'Failed to load BERTExtractiveSummarizer with {str(e)}')
def get_summarizer_model(self):
    if self.summarizer_model is None:
        nlp = spacy.load('en_core_web_md', disable=['ner', 'parser', 'tagger'])
        nlp.add_pipe(nlp.create_pipe('sentencizer'))
        self.summarizer_model = Summarizer()
    return self.summarizer_model
def bert_summarizer(data):
    summarizer_bert = Summarizer()
    summary_bert = summarizer_bert(data, min_length=30, max_length=140)
    print('Bert for Text - Summarization')
    summary = ''.join(summary_bert)
    rouge_scores = rouge.get_scores(summary, data)
    return summary, rouge_scores
def test_multi_hidden_concat(summarizer_multi_hidden: Summarizer, passage):
    summarizer_multi_hidden.hidden_concat = True
    res = summarizer_multi_hidden(passage, num_sentences=5, min_length=40, max_length=500)
    assert len(res) > 10
def texte_summarizer(input_texte, ratio=0.5):
    """
    INPUT:
        input_texte: str - the text to summarize
        ratio: float (between 0 and 1) - reduction ratio of the text;
            if ratio = 0.5 we keep 50% of the sentences of the original text
    OUTPUT:
        dico: dict - {'summary_text': result} where result is the summarized text
    BODY:
        Load the NLP model to use from the static/model folder, then apply it
        to the text to summarize.
    """
    try:
        custom_config = AutoConfig.from_pretrained("static/model/camembert-base")
        custom_config.output_hidden_states = True
        custom_tokenizer = AutoTokenizer.from_pretrained("static/model/camembert-base")
        custom_model = AutoModel.from_pretrained("static/model/camembert-base",
                                                 config=custom_config)
        model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
        result = model(input_texte, ratio=ratio)
        dico = {'summary_text': result}
    except:
        import_bert()
        dico = texte_summarizer(input_texte, ratio)
    return dico
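# A minimal usage sketch (the input string is illustrative; assumes the
# camembert-base files have been downloaded to static/model, as handled by
# import_bert above):
resume = texte_summarizer("Texte long à résumer. Une deuxième phrase. Une troisième phrase.",
                          ratio=0.5)
print(resume['summary_text'])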
def load(model: str, tokenizer: BertTokenizer, device=None) -> Summarizer:
    config = BertConfig.from_pretrained(model)
    config.output_hidden_states = True
    bert_model = BertModel.from_pretrained(model, config=config)
    if device is not None:
        bert_model = bert_model.to(device)
    return Summarizer(custom_model=bert_model, custom_tokenizer=tokenizer)
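# A minimal usage sketch (the model name is illustrative; assumes the
# transformers BertTokenizer import used above and the load() helper it feeds):
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
summarizer = load('bert-base-uncased', tokenizer)
print(summarizer("A long passage to summarize. Another supporting sentence. A final remark.",
                 min_length=10))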
def main():
    # Parse the JSON arguments.
    try:
        config_args = parse_args()
    except:
        print("Add a config file using '--config file_name.json'")
        exit(1)

    # Create the experiment directories.
    _, config_args.summary_dir, config_args.checkpoint_dir = create_experiment_dirs(
        config_args.experiment_dir)

    # Reset the default Tensorflow graph.
    tf.reset_default_graph()

    # Tensorflow specific configuration.
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # Data loading.
    # The batch size is equal to 1 when testing to simulate the real experiment.
    data_batch_size = config_args.batch_size if config_args.train_or_test == "train" else 1
    data = DataLoader(data_batch_size, config_args.shuffle)
    print("Loading Data...")
    config_args.img_height, config_args.img_width, config_args.num_channels, \
        config_args.train_data_size, config_args.test_data_size = data.load_data()
    print("Data loaded\n\n")

    # Model creation.
    print("Building the model...")
    model = ShuffleNet(config_args)
    print("Model is built successfully\n\n")

    # Summarizer creation.
    summarizer = Summarizer(sess, config_args.summary_dir)
    # Train class.
    trainer = Train(sess, model, data, summarizer)

    if config_args.train_or_test == 'train':
        try:
            print("FLOPs for batch size = " + str(config_args.batch_size) + "\n")
            calculate_flops()
            print("Training...")
            trainer.train()
            print("Training Finished\n\n")
        except KeyboardInterrupt:
            trainer.save_model()
    elif config_args.train_or_test == 'test':
        print("FLOPs for single inference \n")
        calculate_flops()
        # This can be 'val' or 'test' or even 'train' according to the needs.
        print("Testing...")
        trainer.test('val')
        print("Testing Finished\n\n")
    else:
        raise ValueError("Train or Test options only are allowed")
def extSumm(full):
    orgi = full
    model = Summarizer(model='distilbert-base-uncased')  # can adjust parameters
    # summa = model(orgi, ratio=0.3, min_length=60)  # can adjust parameters
    summa = model(orgi, ratio=0.2, min_length=60)
    return summa
def run():
    parser = argparse.ArgumentParser(description='Process and summarize lectures')
    parser.add_argument('-path', dest='path', default=None,
                        help='File path of lecture')
    parser.add_argument('-lang', dest='lang', default='en',
                        help='Language supported : en,fr')
    parser.add_argument('-model', dest='model', default='bert-large-uncased', help='')
    parser.add_argument('-num-sentences', dest='num_sentences', default=-1,
                        help='Will return X sentences')
    parser.add_argument('-ratio', dest='ratio', default=-1,
                        help='Will return a ratio of sentences from the text length '
                             '(0.2 is a good value)')
    parser.add_argument('-hidden', dest='hidden', default=-2,
                        help='Which hidden layer to use from Bert')
    parser.add_argument('-reduce-option', dest='reduce_option', default='mean',
                        help='How to reduce the hidden layer from bert')
    parser.add_argument('-greedyness', dest='greedyness', default=0.45,
                        help='Greedyness of the NeuralCoref model')

    args = parser.parse_args()

    if not args.path:
        raise RuntimeError("Must supply text path.")

    with open(args.path) as d:
        text_data = d.read()

    spacy_model = sentence_handlers[args.lang]
    model = Summarizer(model=args.model,
                       hidden=args.hidden,
                       reduce_option=args.reduce_option,
                       sentence_handler=CoreferenceHandler(spacy_model=spacy_model))

    if int(args.num_sentences) >= 0:
        result = model(text_data, num_sentences=int(args.num_sentences))
    elif float(args.ratio) >= 0.0:
        result = model(text_data, ratio=float(args.ratio))
    else:
        result = model(text_data)
    print(result)
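# Example invocation of the CLI defined above (the script and lecture file
# names are illustrative placeholders):
#
#     python lecture_summarizer.py -path lecture_transcript.txt -lang en -ratio 0.2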
def Summary(self):
    summ = dict.fromkeys(self.clust[0].unique())
    # Will store the name of the cluster based on the article name.
    subtitles = []
    # Will store the url of the main article summarizing the cluster.
    urls = []
    model = Summarizer()
    # Create the summary by aggregating all the bodies of the articles.
    for c in summ.keys():
        df2 = self.clust[self.clust[0] == c]
        subtitles = subtitles + [self.results.loc[df2.index[0]]['title']]
        urls = urls + [self.results.loc[df2.index[0]]['url']]
        full_text = ' '.join(self.results.loc[df2.index]['body'].unique())
        full_text = full_text.replace('\n', ' ')
        length = len(full_text)
        if length < 500:
            target = 200
            ratiol = min(1, target / length)
        else:
            target = min(self.max_len + 150 * (len(df2)),
                         length / len(df2) * (0.15 + 0.05 * len(df2)))
            ratiol = min(1, target / length)
        summ[c] = model(full_text, min_length=60, ratio=ratiol)
    self.summary = summ
    self.subtitles = subtitles
    self.urls = urls
async def process(ctx):
    file_path = Formatter(ctx.channel.id).getInputFilePath(
        int(open("counter.txt", 'r').readline()))
    input_text = ''.join(open(file_path, 'r').readlines())
    output = await Summarizer().summarize(ctx, input_text)
    Summarizer().save_summaries(output, int(open("counter.txt", 'r').readline()))
# -*- coding: utf-8 -*-
from goose import Goose
from goose.text import StopWordsChinese
from summarizer import Summarizer
import json

g = Goose({'stopwords_class': StopWordsChinese})
url = 'http://sports.sina.com.cn/nba/2013-10-29/00086855748.shtml'
article = g.extract(url=url)

title = article.title
print title

text = article.cleaned_text
print text

summary = Summarizer()
summary_list = summary.summarize(title, text)
print 'summary is below:'
for sentence in summary_list:
    print sentence

summary_json = json.dumps(summary_list)
print summary_json
def getInput():
    with open('input.txt') as file:
        content = file.readlines()
    # Remove unnecessary \n
    content = [c.replace('\n', '') for c in content if c != '\n']
    title = content[0]
    text = content[-(len(content) - 1):]
    return {'title': title, 'text': ' '.join(text)}


# #####################
input = getInput()
input['text'] = input['text'].decode("ascii", "ignore")
input['text'] = " ".join(input['text'].replace("\n", " ").split())

summarizer = Summarizer()
result = summarizer.summarize(input['text'], input['title'], 'Undefined', 'Undefined')
result = summarizer.sortScore(result)
result = summarizer.sortSentences(result[:30])

print 'Summary:'
for r in result:
    print r['sentence']
    # print r['totalScore']
    # print r['order']
def __init__(self):
    self.summarizer = Summarizer()
if __name__ == '__main__':
    mongo_client = MongoClient()
    db = mongo_client.g_project_data
    coll = db.test_data
    summary_list, article_list = get_summaries_and_articles(coll)
    mongo_client.close()

    idf = unpickle('idf')
    vocab = unpickle('vocab')
    count = CountVectorizer(vocabulary=vocab, stop_words='english')

    summarizer_multi = Summarizer(vocab=vocab, idf=idf, vectorizer=count,
                                  scoring='multi_Tfidf')
    summarizer_single = Summarizer(vocab=vocab, idf=idf, vectorizer=count,
                                   scoring='single_Tfidf')
    summarizer_sig = Summarizer(vocab=vocab, idf=idf, vectorizer=count,
                                scoring='significance')
    summarizer_sim = Summarizer(vocab=vocab, idf=idf, vectorizer=count,
                                scoring='similarity')
    summarizer_rand = Summarizer(vocab=vocab, idf=idf, vectorizer=count,
                                 scoring='random')

    multi_r2 = []
    multi_reduction2 = []
    single_r2 = []
    single_reduction2 = []
    sig_r2 = []
    sig_reduction2 = []
    sim_r2 = []
    sim_reduction2 = []
    rand_r2 = []
    rand_reduction2 = []