def prepare_text_for_ext_summarizer(self) -> str:
    """
    Process the embedded text and prepare it for the extractive summarizer.
    Also ensures enough information is available in the text itself for
    highlighting.

    :return: string containing the final text
    """
    logging.info('Prepare pdf text for extractive summarization')
    text = self.get_text_with_embedded_info()
    final_text = ""
    info = ""
    for sentence in get_sentences(text):
        final_sentence = sentence
        # carry the most recent '@@@id_' marker into sentences that lack one
        if not final_sentence.startswith('@@@id_'):
            final_sentence = info + final_sentence
        id_tokens = [
            word + ' '
            for word in final_sentence.split()
            if word.startswith('@@@id_')
        ]
        if id_tokens:  # guard: a sentence may carry no id marker at all
            info = id_tokens[-1]
        final_text += final_sentence + '\n'
    return final_text

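# A minimal sketch of the sentinel-carrying step above, assuming the embedded
# text tags each source block with an '@@@id_<n>' marker (the sample input is
# hypothetical). Sentences that lose their marker during sentence splitting
# inherit the most recent one, so every output line stays traceable for
# highlighting.
def _demo_carry_id_markers():
    from gensim.summarization.textcleaner import get_sentences
    text = "@@@id_1 First point. It continues here. @@@id_2 Second point."
    info = ""
    for sentence in get_sentences(text):
        if not sentence.startswith('@@@id_'):
            sentence = info + sentence
        ids = [w + ' ' for w in sentence.split() if w.startswith('@@@id_')]
        if ids:
            info = ids[-1]
        print(sentence)
    # -> @@@id_1 First point.
    # -> @@@id_1 It continues here.
    # -> @@@id_2 Second point.
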
def prepare_tweet_for_analysis(tweet):
    processed_tweet = []
    for line in get_sentences(tweet):
        processed_tweet.append(' '.join(tp.preprocess_text(line)))
    return '. '.join(processed_tweet)

import itertools

from gensim.summarization.textcleaner import get_sentences


def lead_3(article, word_count):
    """
    Extract a lead-3 summary (the first three sentences) and return it as a
    string. `word_count` is accepted for interface compatibility but unused.
    """
    sentences = get_sentences(article["text"])
    return " ".join(itertools.islice(sentences, 3))

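# Hedged usage sketch for lead_3 above; the article dict shape follows the
# function's own access pattern, and word_count is passed as None since it
# is unused.
example_article = {"text": "One fish. Two fish. Red fish. Blue fish."}
print(lead_3(example_article, None))
# expected: "One fish. Two fish. Red fish."
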
def basic_preprocessing(
        text,
        sents=False,
        lower=False,
        stem=False,
        min_token_len=3,
        min_sent_len=4,
        remove_stops=False,
        stops=STOPWORDS,
        filters=['strip_multiple_whitespaces', 'strip_punctuation']):
    # EDT export specific
    text = text.replace('\x00', '')
    text = text.replace('\r\n', '\n')
    # note: filters will be applied in order
    if sents:
        sents = get_sentences(text)
    else:
        sents = [text]
    for s in sents:
        s = s.strip()
        if lower:
            s = s.lower()
        if stem:
            s = stem_text(s)
        for f in filters:
            s = funcs[f](s)
        # naive word tokenization
        s = s.split()
        tmp = []
        for t in s:
            t = t.strip()
            if not t:
                continue
            if remove_stops and stops:
                if t not in stops:
                    tmp.append(t)
            else:
                tmp.append(t)
        s = tmp
        if len(s) < min_sent_len:
            yield list()
        else:
            yield s

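# Hedged usage sketch for basic_preprocessing above: with sents=True each
# input sentence comes back as a token list, and sentences with fewer than
# min_sent_len tokens come back empty. `funcs` is assumed to map filter
# names to the matching gensim.parsing.preprocessing functions, and the
# exact tokens depend on gensim's STOPWORDS.
for tokens in basic_preprocessing(
        "The quick brown fox jumps over the lazy dog. Too short.",
        sents=True, lower=True, remove_stops=True):
    print(tokens)
# expected roughly:
# ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']
# []
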
def text8_raw_sentences(length):
    '''
    May want to replace the lower-frequency words.
    '''
    from gensim.summarization import textcleaner
    with open("text") as text_file:
        text = text_file.read()
    sentences = list(textcleaner.get_sentences(text))
    if length != -1:
        return sentences[:length]
    return sentences

def get_data(articles):
    id2abstract = {}            # document id -> abstract
    id2title = {}               # document id -> title
    id2authors = {}             # document id -> authors
    sentences_processed = []    # list of pre-processed sentences
    id2sentences = {}           # document id -> original sentences, lower-cased
    for article in articles:
        id = article['paper_id']
        title = article['metadata']['title']
        authors = article['metadata']['authors']
        bodytext = clean_text(article['body_text'])
        abstract = consolidate_text(article['abstract'])
        sentences = []
        for para in bodytext:
            for sentence in get_sentences(para):
                # remove extra whitespace before the period:
                # "how are you ." becomes "how are you."
                if sentence[-1] == '.':
                    sentence = sentence[:-1].rstrip() + '.'
                sentences_processed.append(
                    preprocess_string(sentence, CUSTOM_FILTERS))
                sentences.append(sentence.lower())
            # append a newline to the last sentence to indicate the
            # paragraph break
            if sentences:
                sentences[-1] = sentences[-1] + '\n'
        id2sentences[id] = sentences
        # should probably apply custom filters to the abstract as well..
        id2abstract[id] = abstract
        id2title[id] = title
        id2authors[id] = authors
    return [
        id2abstract, id2title, id2authors, sentences_processed, id2sentences
    ]

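# Hypothetical input shape for get_data above, following the CORD-19 JSON
# layout its field accesses suggest; clean_text and consolidate_text are
# project helpers assumed to return a list of paragraph strings and a single
# abstract string respectively.
sample_article = {
    'paper_id': 'abc123',
    'metadata': {'title': 'A sample study', 'authors': ['A. Author']},
    'body_text': [],   # raw body blocks, consumed by clean_text
    'abstract': [],    # raw abstract blocks, consumed by consolidate_text
}
# id2abstract, id2title, id2authors, processed, id2sents = get_data([sample_article])
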
def insert_many(TWEETS_FILE, db_session):
    with open(TWEETS_FILE, 'r') as f:
        for i, line in enumerate(f.readlines()):
            line = line.strip("\n")
            if line == "":
                continue
            if len(line) <= 274:
                logging.info(f"Line no. {i} processed with length {len(line)}.\n{line}")
                db_session.add(Tweet(line, timestamp()))
            else:
                logging.info(f"Line no. {i} too long ({len(line)}):\n{line}")
                # get_sentences returns a generator, so materialize it
                # before taking len()
                new_lines = list(get_sentences(line))
                num_total = len(new_lines)
                # number each chunk ("1/3", "2/3", ...); the counter must
                # not shadow the outer `i`
                for j, chunk in enumerate(new_lines, start=1):
                    chunk = chunk + f" {j}/{num_total}"
                    # assumption: each numbered chunk is stored as its own
                    # Tweet, mirroring the short-line branch above
                    db_session.add(Tweet(chunk, timestamp()))

def __iter__(self):
    files = [f for f in os.listdir(self.dirname) if 'tar.bz2' in f]
    for f in files:
        print(f)
        with tarfile.open(path.join(self.dirname, f), 'r:bz2') as t:
            print('opened file', f)
            for info in t:
                if '.txt' in info.name:
                    print('about to read in text from ' + info.name)
                    year = int(info.name.split('/')[1])
                    if self.start <= year <= self.end:
                        print('extracting file')
                        tf = t.extractfile(info)
                        # decode the archive bytes to text; plain str()
                        # would produce "b'...'"
                        document = tf.read().decode('utf-8', errors='ignore')
                        print('read file')
                        # get the sentences; this is pretty noisy data
                        # currently and could use some preprocessing
                        for sentence in get_sentences(document):
                            print('about to preprocess')
                            preprocessed = utils.simple_preprocess(sentence)
                            print(preprocessed)
                            yield preprocessed

    # assumed: summary_out styling mirrors input_out below
    summary_out = div(style=styles(
        font_family="sans-serif",
        line_height="1.5",
        font_size=px(16),
    ))
    summaries = []
    if isinstance(summary, list):
        summaries = summary
    elif isinstance(summary, str):
        summaries = summary.split("\n")
    summary_colors = {}
    for num, summary_item in enumerate(summaries):
        color = COLORS[num % len(COLORS)]
        summary_out(annotate(summary_item, "", color))
        summary_colors[summary_item] = color
    input_out = div(style=styles(
        font_family="sans-serif",
        line_height="1.5",
        font_size=px(16),
    ))
    input_texts = textcleaner.get_sentences(text)
    for input_text in input_texts:
        if input_text in summaries:
            color = summary_colors.get(input_text)
            input_out(annotate(input_text, "", color))
        else:
            input_out(input_text)
    input_offset_col.write(HTML_WRAPPER.format(input_out),
                           unsafe_allow_html=True)
    result_offset_col.write(
        """<div style="display: block; float:left;">{}</div>""".format(
            str(summary_out)),
        unsafe_allow_html=True)

def send_for_transcription():
    global transcription
    global app_data
    if request.method == 'POST':
        if request.form['process_btn'] == "Start":
            print(request.form['file_type'])
            print(app_data['file_type'])
            app_data['loading'] = False
            app_data['file_type'] = request.form['file_type']
            if 'file_address' not in request.files:
                print('No files found.')
                return 'No files'
            file = request.files['file_address']
            app_data['file_address'] = file
            filename = secure_filename(file.filename)
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
            # upload the file to s3 bucket - to be done
            # create custom vocabulary, if any
            custom_vocab_name = 'default_custom_vocab'
            app_data['vocab'] = str(request.form['vocab']).strip()
            print(app_data['vocab'])
            if app_data['vocab']:
                custom_vocab_name = str(
                    "custom_vocab_"
                    + str(datetime.datetime.now().strftime("%y-%m-%d-%H.%M.%S")))
                custom_vocab_status = create_custom_vocabulary(
                    custom_vocab_name, app_data['vocab'])
                if custom_vocab_status == 'READY':
                    app_data['vocab_status'] = 'Custom Vocabulary Ready'
                else:
                    app_data['vocab_status'] = 'Custom Vocabulary Creation Failed'
            # get transcription
            transcription_job_name = str(
                "transcribe-job-"
                + str(datetime.datetime.now().strftime("%y-%m-%d-%H.%M.%S")))
            if app_data['file_type'] in (None, ''):
                print('no file type')
            else:
                transcription_result = transcribe_audio(
                    transcription_job_name, '', app_data['file_type'],
                    custom_vocab_name, file.filename)
                transcription = transcription_result['transcription']
                transcription_sent_count = len(list(get_sentences(transcription)))
                # summarize down to at most 10 sentences
                threshold_sent_count = min(10, transcription_sent_count)
                ratio = threshold_sent_count / transcription_sent_count
                app_data['ratio'] = int(round((1 - ratio) * 100))
                app_data['summary'] = summarize_text(transcription, ratio)
                app_data['num_speaker'] = transcription_result['num_speaker']
        if request.form['process_btn'] == "Go":
            app_data['ratio'] = request.form['ratio']
            summ_ratio = (100 - float(request.form['ratio'])) / 100
            if summ_ratio < 0.05:
                summ_ratio = 0.1
            app_data['summary'] = summarize_text(transcription, summ_ratio)
            print(app_data['vocab'])
    return redirect(url_for('launch_app'))

from typing import Iterator

from gensim.summarization.textcleaner import get_sentences
from gensim.utils import simple_preprocess


def create_training_sentences(training_pages: Iterator[str]):
    for page in training_pages:
        for sentence in get_sentences(page):
            yield simple_preprocess(sentence)

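# Hedged usage sketch: feeding create_training_sentences into Word2Vec.
# Word2Vec needs a restartable iterable, so the generator is materialized
# first. The sample pages are hypothetical, and gensim < 4.0 is assumed
# (gensim.summarization was removed in 4.0; 3.x Word2Vec takes `size`).
from gensim.models import Word2Vec

pages = ["First page. It has two sentences.", "Second page here."]
training_sentences = list(create_training_sentences(pages))
model = Word2Vec(training_sentences, size=50, min_count=1)
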
from gensim.summarization.textcleaner import get_sentences, replace_abbreviations


def getSentences(text, replaceAbbreviations=False):
    if replaceAbbreviations:
        text = replace_abbreviations(text)
    return list(get_sentences(text))

from gensim.models.doc2vec import TaggedDocument
from gensim.summarization.textcleaner import get_sentences
from gensim.utils import simple_preprocess


def create_corpus(text):
    text = text.replace('\n', '')
    for i, sentence in enumerate(get_sentences(text)):
        # TaggedDocument is a (words, tags) pair; a third positional
        # argument raises a TypeError
        yield TaggedDocument(simple_preprocess(sentence), [i])

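# Hedged usage sketch: training a small Doc2Vec model on the corpus above
# (gensim < 4.0 assumed, as elsewhere; the sample text is hypothetical).
from gensim.models.doc2vec import Doc2Vec

corpus = list(create_corpus("First sentence. Second sentence. Third one."))
model = Doc2Vec(corpus, vector_size=50, min_count=1, epochs=10)
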
def num_sentences(text):
    if not text:
        return 0
    return len(list(get_sentences(text)))

def get_sentences_from_text(text):
    '''
    Returns a generator of sentences.
    '''
    return get_sentences(text)

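# get_sentences comes from gensim.summarization.textcleaner (gensim < 4.0)
# and yields sentences lazily, which is why num_sentences above materializes
# it with list(). A quick illustration:
from gensim.summarization.textcleaner import get_sentences

print(list(get_sentences("Hello there. How are you? Fine!")))
# -> ['Hello there.', 'How are you?', 'Fine!']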