Example #1
    def prepare_text_for_ext_summarizer(self) -> str:
        """
    This method process the embed text and prepare it to feed in extractive summarizer
    It also ensure enough information is avaialble in text itself for highlighting

    :return: string consist final_text 
    """

        logging.info('Prepare pdf text for extractive summarization')

        text = self.get_text_with_embedded_info()
        final_text = ""
        info = ""

        for sentence in get_sentences(text):
            final_sentence = sentence

            # carry the most recent '@@@id_' marker over to sentences that lack one
            if not final_sentence.startswith('@@@id_'):
                final_sentence = info + final_sentence

            # remember the last '@@@id_' marker in this sentence (if any) for the next pass
            id_markers = [
                word + ' ' for word in final_sentence.split()
                if word.startswith('@@@id_')
            ]
            if id_markers:
                info = id_markers[-1]
            final_text += final_sentence + '\n'

        return final_text
Example #2
def prepare_tweet_for_analysis(tweet):
    processed_tweet = []
    temp_tweet = ''
    for line in get_sentences(tweet):
        temp_tweet = ' '.join(tp.preprocess_text(line))
        processed_tweet.append(temp_tweet)

    return '. '.join(processed_tweet)
Example #3
def lead_3(article, word_count):
    """
    Extract lead-3 summary and return as dictionary.

    returns d :: { 'system': ... }
    """
    sentences = get_sentences(article["text"])
    summary = " ".join(itertools.islice(sentences, 3))
    return summary
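
# Usage sketch (not from the original project): assumes itertools and get_sentences are
# imported as in the snippet above, and that `article` is a dict with a "text" key,
# which is all lead_3() relies on.
article = {"text": "First sentence. Second sentence. Third sentence. Fourth sentence."}
print(lead_3(article, word_count=None))  # prints the first three sentences joined by spaces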
Example #4
def basic_preprocessing(
        text,
        sents=False,
        lower=False,
        stem=False,
        min_token_len=3,
        min_sent_len=4,
        remove_stops=False,
        stops=STOPWORDS,
        filters=['strip_multiple_whitespaces', 'strip_punctuation']):
    # EDT export specific
    text = text.replace('\x00', '')
    text = text.replace('\r\n', '\n')

    # note: filters will be applied in order
    if sents:
        sents = get_sentences(text)
    else:
        sents = [text]

    for s in sents:
        s = s.strip()
        if lower:
            s = s.lower()
        if stem:
            s = stem_text(s)

        for f in filters:
            s = funcs[f](s)

        # naive word tokenization
        s = s.split()
        tmp = []
        for t in s:
            t = t.strip()
            if not t:
                continue
            if remove_stops and stops:
                if t not in stops:
                    tmp.append(t)
            else:
                tmp.append(t)
        s = tmp

        if len(s) < min_sent_len:
            yield list()
        else:
            yield s
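
# Sketch (not from the original project): basic_preprocessing() above assumes a
# module-level `funcs` mapping from filter names to callables, plus STOPWORDS and
# stem_text. One plausible setup using gensim's built-in filters:
from gensim.parsing.preprocessing import (
    STOPWORDS,
    stem_text,
    strip_multiple_whitespaces,
    strip_punctuation,
)

funcs = {
    'strip_multiple_whitespaces': strip_multiple_whitespaces,
    'strip_punctuation': strip_punctuation,
}

# Example call; sents=True additionally requires get_sentences, as used throughout this page.
tokens = list(basic_preprocessing("Some raw EDT export text. It has two sentences!",
                                  sents=True, lower=True, remove_stops=True))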
Example #5
def text8_raw_sentences(length):
    '''
    may want to replace the lower frequency words
    '''
    from gensim.summarization import textcleaner

    with open("text") as text_file:
        text = text_file.read()

    sentences = list(textcleaner.get_sentences(text))

    if length != -1:
        return sentences[:length]
    else:
        return sentences
Example #6
def get_data(articles):
    # document id -> abstract
    id2abstract = {}
    # document id -> title
    id2title = {}
    # document id -> authors
    id2authors = {}
    # list of pre-processed sentences
    sentences_processed = []
    # document id -> list of original sentences, converted to lower case
    id2sentences = {}
    for article in articles:
        id = article['paper_id']
        title = article['metadata']['title']
        authors = article['metadata']['authors']
        bodytext = clean_text(article['body_text'])
        abstract = consolidate_text(article['abstract'])
        sentences = []
        for para in bodytext:
            for sentence in get_sentences(para):
                # remove extra whitespace before the final period,
                # e.g. "how are you  ." becomes "how are you."
                if sentence.endswith('.'):
                    sentence = sentence[:-1].rstrip() + '.'
                sentences_processed.append(
                    preprocess_string(sentence, CUSTOM_FILTERS))
                sentences.append(sentence.lower())
            # append a newline at the end of the last sentence to indicate paragraph break
            if len(sentences) != 0:
                last_sentence = sentences[-1]
                last_sentence = last_sentence + '\n'
                sentences[-1] = last_sentence

        id2sentences.update({id: sentences})
        # should probably apply custom filters to the abstract as well..
        id2abstract.update({id: abstract})
        id2title.update({id: title})
        id2authors.update({id: authors})

    return [
        id2abstract, id2title, id2authors, sentences_processed, id2sentences
    ]
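
# Sketch (not from the original project): get_data() above assumes a CUSTOM_FILTERS
# chain for gensim's preprocess_string (clean_text() and consolidate_text() are project
# helpers that are not shown here). One plausible filter chain:
from gensim.parsing.preprocessing import (
    preprocess_string, strip_punctuation, strip_multiple_whitespaces, remove_stopwords)

CUSTOM_FILTERS = [lambda x: x.lower(), strip_punctuation,
                  strip_multiple_whitespaces, remove_stopwords]

# The returned list is positional, so callers typically unpack it in order:
# id2abstract, id2title, id2authors, sentences_processed, id2sentences = get_data(articles)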
Example #7
def insert_many(TWEETS_FILE, db_session):
    with open(TWEETS_FILE, 'r') as f:
        for i, line in enumerate(f.readlines()):
            line = line.strip("\n")
            if line != "":
                if len(line) <= 274:
                    logging.info(f"Line no. {i} processed with length {len(line)}.\n{line}")
                    db_session.add(Tweet(line, timestamp()))

                else:
                    logging.info(f"Line no. {i} too long ({len(line)}):\n{line}")
                    new_lines = get_sentences(line)
                    num_total = len(new_lines)

                    new_tweets = []
                    for i, line in enumerate(new_lines, start=1):
                        line = line + f" {str(i)}/{str(num_total)}"
                        if i == 1:
                            # assumed second argument, matching Tweet(line, timestamp()) above
                            new_tweets.append(Tweet(line, timestamp()))
Example #8
    def __iter__(self):
        files = [f for f in os.listdir(self.dirname) if 'tar.bz2' in f]
        for f in files:
            print(f)
            with tarfile.open(path.join(self.dirname, f), 'r:bz2') as t:
                print('opened file', f)
                for info in t:
                    if '.txt' in info.name:
                        print('about to read in text from ' + info.name)
                        year = int(info.name.split('/')[1])
                        if self.start <= year <= self.end:
                            print('extracting file')
                            tf = t.extractfile(info)
                            document = tf.read()
                            print('read file')
                            for sentence in get_sentences(str(document)):
                                # Get the sentences
                                # This is pretty noisy data currently and could use
                                # some preprocessing
                                print('about to preprocess')
                                preprocessed = utils.simple_preprocess(
                                    sentence)
                                print(preprocessed)
                                yield preprocessed
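
# Usage sketch (not from the original project): because __iter__ above is a generator
# method, an instance of its class is a restartable streaming corpus and can be fed
# directly to Word2Vec. The class name and constructor below are hypothetical; only
# dirname, start and end are implied by the attributes used in __iter__.
from gensim.models import Word2Vec

corpus = TarTextCorpus(dirname='data/', start=1990, end=2000)  # hypothetical class/args
model = Word2Vec(sentences=corpus, min_count=5)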
Example #9
    # NOTE: the source listing is truncated above this point; summary_out is assumed
    # to be a div(...) container built the same way as input_out below.
    summary_out = div(style=styles(
        font_family="sans-serif",
        line_height="1.5",
        font_size=px(16),
    ))

    summaries = []
    if isinstance(summary, list):
        summaries = summary
    elif isinstance(summary, str):
        summaries = summary.split("\n")

    summary_colors = {}
    for num, summary_item in enumerate(summaries):
        color = COLORS[num%len(COLORS)]
        summary_out(annotate(summary_item, "", color))
        summary_colors[summary_item] = color

    input_out = div(style=styles(
        font_family="sans-serif",
        line_height="1.5",
        font_size=px(16),
    ))

    input_texts = textcleaner.get_sentences(text)
    for input_text in input_texts:
        if input_text in summaries:
            color = summary_colors.get(input_text)
            input_out(annotate(input_text, "", color))
        else:
            input_out(input_text)

    input_offset_col.write(HTML_WRAPPER.format(input_out), unsafe_allow_html=True)
    result_offset_col.write("""<div style="display: block; float:left;">{}</div>""".format(str(summary_out)), unsafe_allow_html=True)
Example #10
def send_for_transcription():
    global transcription
    global app_data
    if request.method == 'POST':
        if request.form['process_btn'] == "Start":
            print(request.form['file_type'])
            print(app_data['file_type'])
            app_data['loading'] = False

            app_data['file_type'] = request.form['file_type']

            if 'file_address' not in request.files:
                print('No files found.')
                return 'No files'
            file = request.files['file_address']
            app_data['file_address'] = file
            #print(file)
            #print(file.filename)

            filename = secure_filename(file.filename)
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))

            # upload the file to s3 bucket - to be done

            # create vocab if any
            custom_vocab_name = 'default_custom_vocab'
            app_data['vocab'] = str(request.form['vocab']).strip()
            print(app_data['vocab'])
            if app_data['vocab']:
                custom_vocab_name = str(
                    "custom_vocab_" +
                    str(datetime.datetime.now().strftime("%y-%m-%d-%H.%M.%S")))
                custom_vocab_status = create_custom_vocabulary(
                    custom_vocab_name, app_data['vocab'])
                if custom_vocab_status == 'READY':
                    app_data['vocab_status'] = 'Custom Vocabulary Ready'
                else:
                    app_data[
                        'vocab_status'] = 'Custom Vocabulary Creation Failed'

            # get transcription

            transcription_job_name = str(
                "transcribe-job-" +
                str(datetime.datetime.now().strftime("%y-%m-%d-%H.%M.%S")))

            #print(transcription_job_name)
            #print(app_data['file_type'])

            if app_data['file_type'] is None or app_data['file_type'] == '':
                print('no file type')
            else:
                transcription_result = transcribe_audio(
                    transcription_job_name, '', app_data['file_type'],
                    custom_vocab_name, file.filename)
                transcription = transcription_result['transcription']

                transcription_sent_count = len(
                    list(get_sentences(transcription)))
                threshold_sent_count = min(10, transcription_sent_count)
                ratio = threshold_sent_count / transcription_sent_count
                app_data['ratio'] = int(round(((1 - ratio) * 100)))

                text_summary = summarize_text(transcription, ratio)

                app_data['summary'] = text_summary
                app_data['num_speaker'] = transcription_result['num_speaker']

        if request.form['process_btn'] == "Go":
            app_data['ratio'] = request.form['ratio']
            summ_ratio = (100 - float(request.form['ratio'])) / 100
            if summ_ratio < 0.05:
                summ_ratio = 0.1

            app_data['summary'] = summarize_text(transcription, summ_ratio)

        print(app_data['vocab'])
        return redirect(url_for('launch_app'))
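
# Worked example (illustrative values only) of the ratio arithmetic used above:
transcription_sent_count = 50                             # sentences found by get_sentences()
threshold_sent_count = min(10, transcription_sent_count)  # -> 10
ratio = threshold_sent_count / transcription_sent_count   # -> 0.2, i.e. keep ~20% of sentences
print(int(round((1 - ratio) * 100)))                      # -> 80, the value stored in app_data['ratio']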
Example #11
def create_training_sentences(training_pages: Iterator[str]):
    for page in training_pages:
        for sentence in get_sentences(page):
            yield simple_preprocess(sentence)
Example #12
def getSentences(text, replaceAbbreviations=False):
    if replaceAbbreviations:
        text = replace_abbreviations(text)
    return list(get_sentences(text))
Example #13
def create_corpus(text):
    text = text.replace('\n', '')
    for i, sentence in enumerate(get_sentences(text)):
        yield TaggedDocument(simple_preprocess(sentence), [i], sentence)
Example #14
def num_sentences(text):
    if not text:
        return 0
    return len(list(get_sentences(text)))
Example #15
def get_sentences_from_text(text):
    ''' returns a generator of sentences '''

    return get_sentences(text)
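
# Minimal end-to-end sketch (not from any of the projects above). Every example on
# this page relies on the same helper; gensim.summarization was removed in gensim 4.0,
# so this assumes a gensim 3.x install.
from gensim.summarization.textcleaner import get_sentences

text = "Gensim splits text into sentences. Each sentence is yielded lazily. Three in total."
for sentence in get_sentences(text):
    print(sentence)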