Ejemplo n.º 1
0
def ts():
    """Build a ``TextStats`` object over a fixed three-paragraph speech.

    The sample text is stripped of leading and trailing whitespace before
    being wrapped in an English-language ``Doc``; interior line indentation
    is left untouched.

    Returns:
        The ``text_stats.TextStats`` instance computed from that document.
    """
    speech = """
    Mr. Speaker, 480,000 Federal employees are working without pay, a form of involuntary servitude; 280,000 Federal employees are not working, and they will be paid. Virtually all of these workers have mortgages to pay, children to feed, and financial obligations to meet.
    Mr. Speaker, what is happening to these workers is immoral, is wrong, and must be rectified immediately. Newt Gingrich and the Republican leadership must not continue to hold the House and the American people hostage while they push their disastrous 7-year balanced budget plan. The gentleman from Georgia, Mr. Gingrich, and the Republican leadership must join Senator Dole and the entire Senate and pass a continuing resolution now, now to reopen Government.
    Mr. Speaker, that is what the American people want, that is what they need, and that is what this body must do.
    """
    return text_stats.TextStats(Doc(speech.strip(), lang="en"))
Ejemplo n.º 2
0
 def setUp(self):
     """Prepare the shared fixtures: an English ``Doc`` and its ``TextStats``."""
     parsed = Doc(TEXT, lang='en')
     self.doc = parsed
     # The statistics object is built from the same document stored above.
     self.ts = text_stats.TextStats(parsed)
Ejemplo n.º 3
0
def ts():
    """Return a ``TextStats`` object computed from the module-level ``TEXT``.

    ``TEXT`` is wrapped in a ``Doc`` with ``lang='en'`` before the
    statistics object is created.
    """
    return text_stats.TextStats(Doc(TEXT, lang='en'))
Ejemplo n.º 4
0
 def setUp(self):
     """Prepare the shared fixtures using the ``en_core_web_sm`` language."""
     parsed = Doc(TEXT, lang='en_core_web_sm')
     self.doc = parsed
     # The statistics object is built from the same document stored above.
     self.ts = text_stats.TextStats(parsed)
Ejemplo n.º 5
0
def score_summary_2(summary_text):
    """Score a summarized piece of text.

    The score is a weighted sum of six components, then boosted by the
    number of sentences containing first-person markers and reduced by a
    penalty for demonstrative words in the opening sentence:

    * average ``similarity.word_movers`` value between adjacent non-blank
      paragraphs (weight 0.25),
    * ``person_overlap`` (0.20) and ``anaphor_overlap`` (0.15),
    * one minus the largest within-paragraph spread of sentiment
      polarity (0.10),
    * relative distance of the automated readability index from
      ``EBERT_READABILITY`` (0.05),
    * mean sentence subjectivity (0.25).

    Args:
        summary_text: The summary as one string. Paragraphs are delimited
            by newline characters.

    Returns:
        The combined numeric score.

    Components that cannot be computed contribute ``0`` instead of raising:
    the mean subjectivity when no sentences are found, the thesis penalty
    when there is no non-blank sentence among the first two, and the
    paragraph similarity when there is no pair of adjacent non-blank
    paragraphs (for example a single-paragraph summary).
    """
    blank = (' ', '', '\n')
    thesis_markers = (" this ", " This ", " here ", " Here")
    author_markers = ("I ", "I'd", "My")

    # Parse once; the same document feeds sentence splitting and the
    # readability statistics.
    summary_doc = NLP(summary_text)
    sentences = [sent.text for sent in summary_doc.sents]

    # Relative distance of the readability index from the baseline.
    # NOTE(review): a larger distance *raises* the score as written, while
    # the original comment said "close to ebert baseline" -- confirm intent.
    readability = abs(
        text_stats.TextStats(summary_doc).automated_readability_index -
        EBERT_READABILITY) / EBERT_READABILITY

    # Coh Metrix Indices
    anaphor_score = anaphor_overlap(summary_text)
    person_score = person_overlap(summary_text)

    # More subjective is better. Guard against an empty sentence list so
    # the mean does not divide by zero.
    if sentences:
        subjectivity = sum(
            TextBlob(sentence).sentiment[1]
            for sentence in sentences) / len(sentences)
    else:
        subjectivity = 0

    # The thesis is the first sentence, or the second one when the first is
    # blank. Slicing keeps this safe for summaries with fewer than two
    # sentences.
    # NOTE(review): " Here" has no trailing space unlike the other markers;
    # kept as-is to preserve scoring -- confirm whether that is intended.
    thesis = next((s for s in sentences[:2] if s not in blank), None)
    if thesis is None:
        thesis_penalty = 0
    else:
        thesis_penalty = sum(marker in thesis for marker in thesis_markers)

    # Prefer expressions from the author: count sentences containing any
    # first-person marker.
    author_count = sum(
        1 for sentence in sentences
        if any(marker in sentence for marker in author_markers))

    # Similarity between each pair of adjacent, non-blank paragraphs.
    paragraphs = summary_text.split('\n')
    inter_paragraph_similarities = [
        similarity.word_movers(NLP(previous), NLP(current))
        for previous, current in zip(paragraphs, paragraphs[1:])
        if previous not in blank and current not in blank
    ]

    # Sentiment within a paragraph should be similar: find the largest
    # polarity spread across the '.'-separated pieces of any paragraph.
    max_diff = 0
    for paragraph in paragraphs:
        # str.split always yields at least one piece, so min/max are safe.
        polarities = [
            TextBlob(piece).sentiment[0] for piece in paragraph.split('.')
        ]
        max_diff = max(max_diff, max(polarities) - min(polarities))
    max_diff = 1 - max_diff

    # With no comparable paragraph pairs there is nothing to average; the
    # similarity component then contributes nothing instead of dividing by
    # zero.
    if inter_paragraph_similarities:
        avg_similarity = sum(inter_paragraph_similarities) / len(
            inter_paragraph_similarities)
    else:
        avg_similarity = 0.0

    # Make score
    score = (0.25 * avg_similarity) + \
            (0.20 * person_score) + \
            (0.15 * anaphor_score) + \
            (0.1 * max_diff) + \
            (0.05 * readability) + \
            (0.25 * subjectivity)
    # boost by person count
    score = score * (1 + (0.1 * author_count))
    score = score - (0.2 * thesis_penalty)

    return score
Ejemplo n.º 6
0
def ari(doc):
    """Compute a readability score from word, sentence and character counts.

    The value is ``0.4 * words/sentences + 6 * chars/words - 27.4``, using
    the counts reported by ``ts.TextStats(doc)``. Presumably this is a
    variant of the Automated Readability Index -- confirm the coefficients
    against the intended formula.

    Args:
        doc: Document accepted by ``ts.TextStats``.

    Returns:
        The computed score.
    """
    stats = ts.TextStats(doc)
    sentence_length_term = 0.4 * stats.n_words / stats.n_sents
    word_length_term = 6 * stats.n_chars / stats.n_words
    return sentence_length_term + word_length_term - 27.4
Ejemplo n.º 7
0
def flesch(doc):
    """Compute a Flesch-style reading-ease score for ``doc``.

    The value is ``206.835 - 1.015 * words/sentences - 84.6 *
    syllables/words``, using the counts reported by ``ts.TextStats(doc)``.

    Args:
        doc: Document accepted by ``ts.TextStats``.

    Returns:
        The computed score.
    """
    stats = ts.TextStats(doc)
    sentence_length_term = 1.015 * stats.n_words / stats.n_sents
    syllable_term = 84.6 * stats.n_syllables / stats.n_words
    return 206.835 - sentence_length_term - syllable_term