Example #1
def run_loop(context, card, card_tag):
    list_of_sentences = []
    list_of_paragraphs = []
    if granularity_level == "Sent":
        for paragraph in segmenter.analyze(card):
            for sentence in paragraph:  ## sentence level summarization
                set_str = ""
                for token in sentence:
                    set_str += token.spacing
                    set_str += token.value
                list_of_sentences.append(set_str)
        word_list = embed(card_tag, list_of_sentences, 0, 0)
    elif granularity_level == "Paragraph":
        for paragraph in segmenter.analyze(card):
            set_str = ""
            for sentence in paragraph:  ## paragraph level summarization
                for token in sentence:
                    set_str += token.spacing
                    set_str += token.value
            list_of_paragraphs.append(set_str)
        word_list = embed(card_tag, list_of_paragraphs, 0, 0)
    elif granularity_level == "Word":
        card_as_sentence = Sentence(card)
        card_words, card_words_org = create_ngram(context, card, card_tag)
        word_list = embed(card_tag, card_as_sentence, card_words,
                          card_words_org)
    #print(word_list)
    return word_list
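Every example in this listing walks the same three-level structure: segmenter.analyze(text) yields paragraphs, each paragraph yields sentences, and each sentence yields tokens exposing value, spacing, and offset. As a minimal, self-contained sketch of that traversal (assuming only that the syntok package is installed), the hypothetical helper below rebuilds each sentence exactly as it appeared in the input, using the same spacing + value concatenation seen above and in Examples #11 and #13:

from syntok import segmenter

def split_sentences(text):
    # Walk paragraphs -> sentences -> tokens and rebuild each sentence
    # verbatim from each token's leading whitespace plus its value.
    sentences = []
    for paragraph in segmenter.analyze(text):
        for sentence in paragraph:
            sentences.append(
                ''.join(token.spacing + token.value for token in sentence).lstrip())
    return sentences

The trailing .lstrip() mirrors Example #8, which drops the whitespace carried by the first token of each sentence.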
Example #2
    def tokenize_text(self, text):
        paragraphs = []
        for paragraph in segmenter.analyze(text):
            sentences = []
            for sentence in paragraph:
                sentences.append(' '.join([token.value for token in sentence]))
            paragraphs.append('\n'.join(sentences))
        return '\n\n'.join(paragraphs)
Example #3
def sentenceify(text):
    # Flatten paragraphs into a single list of sentences, preserving the
    # original spacing; skip over-long sentences and ones without any letters.
    sentences = []
    for paragraph in segmenter.analyze(text):
        for sentence in paragraph:
            if len(sentence) >= MAX_SENT_LEN:
                continue
            joined = ''.join(t.spacing + t.value for t in sentence)
            if any(ch.isalpha() for ch in joined):
                sentences.append(joined)
    return sentences
Example #4
    def test_analyze(self):
        offset = 0

        for paragraph in segmenter.analyze(DOCUMENT):
            for sentence in paragraph:
                for token in sentence:
                    if token.value:
                        offset = DOCUMENT.index(token.value, offset)
                        self.assertEqual(offset, token.offset, repr(token))
                        offset += len(token.value)
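The test above relies on each token's offset being the index of token.value in the original document. Under that same assumption, sentence-level character spans can be read directly off the first and last token of each sentence; Example #12 below does this through the private _offset attribute, while the hypothetical sketch here uses the public offset shown in the test:

from syntok import segmenter

def sentence_spans(text):
    # (start, end) character offsets of each sentence within the original
    # text: start of the first token, end of the last token's value.
    spans = []
    for paragraph in segmenter.analyze(text):
        for sentence in paragraph:
            spans.append((sentence[0].offset,
                          sentence[-1].offset + len(sentence[-1].value)))
    return spans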
Example #5
def readability_stats(dataframe, row, i, current_column, new_column,
                      readability_group, readability_measure):
    this_comment = row[current_column]
    tokenized = '\n\n'.join('\n'.join(' '.join(token.value
                                               for token in sentence)
                                      for sentence in paragraph)
                            for paragraph in segmenter.analyze(this_comment))
    this_result = readability.getmeasures(tokenized, lang='en')
    c.df[dataframe].at[
        i, new_column] = this_result[readability_group][readability_measure]
Example #6
def syntok_ssplit(text: str, ignore_newlines=True) -> Iterator[Tuple[str, int, int]]:
    if ignore_newlines:
        # remove only single newlines, assume multiples are paragraph breaks
        text = ' '.join(re.split(r'(?<!\n)\n(?!\n)', text))
    start = 0
    for paragraph in syntok_segmenter.analyze(text):
        for sentence in paragraph:
            sentence = ' '.join(tok.value for tok in sentence)
            end = start + len(sentence)
            yield sentence, start, end
            start = end
Example #7
def run_readability(texts):
    out = []
    for text in texts:
        tokenized = '\n\n'.join(
            '\n'.join(
                ' '.join(token.value for token in sentence)
                for sentence in paragraph)
            for paragraph in segmenter.analyze(text))
        results = readability.getmeasures(tokenized, lang='en')
        data = {}
        for key in results:
            data[key.replace(' ', '')] = dict(results[key])
        out.append(data)
    return out
Example #8
    def sentence_tokenize(
        self,
        text: str,
    ) -> List[str]:
        """Split a text into sentences using syntok package

        Args:
            text: text to be split
        """
        lst_sentences = []
        for paragraph in segmenter.analyze(text):
            for sentence in paragraph:
                sentence = "".join(map(str, sentence)).lstrip()
                sentence = self.postprocess(sentence)
                lst_sentences.append(sentence)
        return lst_sentences
Example #9
def run_loop(context, card, card_tag):
    list_of_sentences = []
    if sent_level:
        for paragraph in segmenter.analyze(card):
            for sentence in paragraph:  ## sentence level summarization
                set_str = ""
                for token in sentence:
                    set_str += token.spacing
                    set_str += token.value
                list_of_sentences.append(set_str)
        word_list = embed(card_tag, list_of_sentences, 0, 0)
    else:
        card_as_sentence = Sentence(card)
        card_words, card_words_org = create_ngram(context, card, card_tag)
        word_list = embed(card_tag, card_as_sentence, card_words,
                          card_words_org)
    return word_list
Example #10
def add_doc(writer, path, processed_doc_path):
    with open(path, "r") as fileobj:
        content = fileobj.read()

    # tokenize
    tokenized_str = ''
    for sent in [sent for para in segmenter.analyze(content) for sent in para]:
        tokens = tokenizer.convert_ids_to_tokens(
            tokenizer([t.value for t in sent], is_pretokenized=True, add_special_tokens=False)['input_ids'])
        tokenized_str += ' '.join(tokens) + '\n'

    filename = os.path.basename(path)
    with open(processed_doc_path, 'w') as out:
        print(tokenized_str, file=out)

    writer.add_document(filename=filename, content=tokenized_str)
Example #11
def sentences(text: str) -> StringGenerator:
    """
    Get the sentences of a document.

    Parameters
    ----------
    text : str
        The text to tokenize.

    Returns
    -------
    Generator of str
        The sentences, one after the other.
    """
    for paragraph in segmenter.analyze(text):
        for sentence in paragraph:
            orig_sentence = ""
            for t in sentence:
                orig_sentence += t.spacing + t.value
            yield orig_sentence
Example #12
tree_left = strip_ns_prefix(etree.parse(sys.argv[1], parser))
tree_right = strip_ns_prefix(etree.parse(sys.argv[2], parser))
(lstring, lsegments) = readtree(tree_left)
(rstring, rsegments) = readtree(tree_right)

for i in range(len(lstring)):
    if lstring[i] != rstring[i]:
        print("abort: normalized text missmatches on char index " + i,
              file=sys.stderr)
        sys.exit(1)

string = lstring
sentences = [{
    'start': sentence[0]._offset,
    'end': sentence[-1]._offset + len(sentence[-1].value)
} for paragraph in segmenter.analyze(string) for sentence in paragraph]

# TODO implement alignments
alignment_scores = [{
    'name': 'random',
    'scores': [random.random() for s in sentences]
}, {
    'name': 'random2',
    'scores': [random.random() for s in sentences]
}]

output = {
    'sentences': sentences,
    'alignment_scores': alignment_scores,
    'text': string,
    'left_segments': {seg['id']: seg
Example #13
        for i in sentence:
            sen += i
        sentence = sen
        sentence = sentence.split('?')
        sen = ''
        for i in sentence:
            sen += i
        sentence = sen
        sentence = sentence.split(',')
        sen = ''
        for i in sentence:
            sen += i
        sentence = sen
        sentence += ' '
        expected.append(sentence)
print(expected)
print()
document = ''
random.shuffle(expected)
for phrase in expected:
    document += phrase

for paragraph in segmenter.analyze(document):
    for sentence in paragraph:
        for token in sentence:
            # exactly reproduce the input
            # and do not remove "imperfections"
            print(token.spacing, token.value, sep='', end='')
    print()
    print("\n")  # reinsert paragraph separators
Example #14
def predict():
    # Works only for a single sample
    if request.method == 'POST':
        if not request.form.getlist('review'):
            return render_template(
                'home.html',
                predicted='You need to enter a review for the restaurant')
        if not request.form.getlist('star'):
            return render_template(
                'home.html',
                predicted=
                'You need to give a number of stars for the restaurant')
        text = request.form.getlist('review')[0]
        stars = request.form.getlist('star')[0]

        tokenized = '\n\n'.join('\n'.join(' '.join(token.value
                                                   for token in sentence)
                                          for sentence in paragraph)
                                for paragraph in segmenter.analyze(text))
        nested_feature_dict = readability.getmeasures(tokenized, lang='en')
        new_cols = {"stars": int(stars)}
        for k in nested_feature_dict.keys():
            for nested_key in nested_feature_dict[k].keys():
                new_cols[k + " " +
                         nested_key] = nested_feature_dict[k][nested_key]
        df = pd.DataFrame(new_cols, index=[0])
        remove_cols = [
            'readability grades Kincaid', 'readability grades ARI',
            'readability grades FleschReadingEase',
            'sentence info characters_per_word', 'sentence info syll_per_word',
            'sentence info words_per_sentence', 'sentence info characters',
            'sentence info syllables', 'sentence info long_words'
        ]
        df.drop(columns=remove_cols, inplace=True)

        def get_tag(pos_tag):
            if pos_tag.startswith('J'):
                return wordnet.ADJ
            elif pos_tag.startswith('R'):
                return wordnet.ADV
            elif pos_tag.startswith('V'):
                return wordnet.VERB
            else:
                return wordnet.NOUN

        def clean_review(review):
            # lower case and remove special characters\whitespaces
            review = re.sub(r'[^a-zA-Z\s]', '', review, flags=re.I | re.A)
            review = review.lower()
            review = review.strip()

            # tokenize document
            tokens = wpt.tokenize(review)
            # filter stopwords out of document
            review = [token for token in tokens if token not in stop_words]

            #get POS tags for the review
            pos_tags = pos_tag(review)

            # lemmatize review
            review = [
                WordNetLemmatizer().lemmatize(t[0], get_tag(t[1]))
                for t in pos_tags
            ]

            # re-create document from filtered tokens
            review = ' '.join(review)
            return review

        sid = SentimentIntensityAnalyzer()
        sentiments = sid.polarity_scores(text)
        sentiments_df = json_normalize(sentiments)
        df["compound"] = sentiments_df["compound"]

        df_scaled = scaler.transform(df)
        df_transformed = pca_transformer.transform(df_scaled)
        prediction = rf_model.predict(
            df_transformed)  # runs globally loaded model on the data
        print(prediction)
        return render_template('home.html',
                               predicted=round(np.exp(prediction[0]), 2))
    return render_template('home.html', predicted='Error')
Example #15
    def __init__(self, text):
        tokenized = '\n\n'.join('\n'.join(' '.join(token.value
                                                   for token in sentence)
                                          for sentence in paragraph)
                                for paragraph in segmenter.analyze(text))
        self.results = readability.getmeasures(tokenized, lang='en')