Example #1
def get_textacy_name_entities(text,
                              article_id,
                              drop_determiners=True,
                              exclude_types='numeric'):
    '''Get named entities using textacy.
    ## NOT USED IN THE PROJECT
    text: full_text or summary (raw string or pre-built spaCy Doc)
    article_id: string, article id (name of the json file)
    Return a pandas DataFrame with two columns: the named entity text and its
    entity label (plus an article_id column when article_id is given).
    '''

    en = textacy.load_spacy_lang("en_core_web_sm", disable=("parser", ))
    if isinstance(text, str):  # if raw string
        doc = textacy.make_spacy_doc(text, lang=en)
    elif isinstance(text, Doc):  # if pre-created spacy doc
        doc = text
    else:
        doc = textacy.make_spacy_doc("NA", lang=en)

    nes = textacy.extract.entities(
        doc, drop_determiners=drop_determiners,
        exclude_types=exclude_types)  # nes is a generator
    ne_list = []
    ne_label_list = []

    for ne in nes:
        ne_list.append(ne)
        ne_label_list.append(ne.label_)

    data = pd.DataFrame(data={'text': ne_list, 'label': ne_label_list})
    data = data.drop_duplicates(keep='first')
    if article_id is not None:  # store article ID for csv
        data['article_id'] = article_id
    return data
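
A minimal usage sketch for the function above (not from the original project): it assumes textacy and pandas as pd are imported, spacy.tokens.Doc is available, the en_core_web_sm model is installed, and the sample text and article id are made up.
# Hypothetical call for illustration only.
sample = "Apple is looking at buying a U.K. startup for one billion dollars."
entities = get_textacy_name_entities(sample, article_id="article_0001")
print(entities)  # DataFrame with 'text', 'label' and 'article_id' columns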
Example #2
def textacy_featurize(transcript):
    features=list()
    labels=list()

    # use Spacy doc
    try:
        doc = textacy.make_spacy_doc(transcript)
    except Exception:
        # spaCy English model not available yet; download it and retry
        os.system('python3 -m spacy download en')
        doc = textacy.make_spacy_doc(transcript)
    
    ts = textacy.TextStats(doc)
    uniquewords=ts.n_unique_words
    features.append(uniquewords)
    labels.append('uniquewords')

    mfeatures=ts.basic_counts
    features=features+list(mfeatures.values())
    labels=labels+list(mfeatures)

    kincaid=ts.flesch_kincaid_grade_level
    features.append(kincaid)
    labels.append('flesch_kincaid_grade_level')

    readability=ts.readability_stats
    features=features+list(readability.values())
    labels=labels+list(readability)
    
    return features, labels
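
A small usage sketch (an illustrative assumption, not part of the original code), presuming textacy and os are imported and an English spaCy model is installed.
# The transcript string below is made up; the result pairs feature names with values.
features, labels = textacy_featurize("The quick brown fox jumps over the lazy dog.")
print(dict(zip(labels, features)))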
Example #3
def test_chunk_size(self, text, chunk_size, en_core_web_sm):
    doc_full = make_spacy_doc(text, en_core_web_sm)
    doc_chunked = make_spacy_doc(text,
                                 en_core_web_sm,
                                 chunk_size=chunk_size)
    assert isinstance(doc_chunked, spacy.tokens.Doc)
    assert len(doc_full.text) == len(doc_chunked.text)
    assert len(doc_full) == len(doc_chunked)
Example #4
def p_complexity(args):
    correct_meta_labels = create_meta_labels(
        os.path.join(args.meta_outputs_dir, CORRECT_META_FILE))
    incorrect_meta_labels = create_meta_labels(
        os.path.join(args.meta_outputs_dir, INCORRECT_META_FILE))

    correct_outputs = torch.load(os.path.join(args.test_outputs_dir,
                                              CORRECT + LAYER_NAME),
                                 map_location='cpu')
    incorrect_outputs = torch.load(os.path.join(args.test_outputs_dir,
                                                INCORRECT + LAYER_NAME),
                                   map_location='cpu')

    correct_cmplx = []
    incorrect_cmplx = []

    for output in progressbar.progressbar(correct_outputs):
        psg = output['metadata'][0]['original_passage']
        psg = " ".join(w for w in nltk.wordpunct_tokenize(psg)
                       if w.lower() in WORDS or not w.isalpha())
        try:
            doc = textacy.make_spacy_doc(psg)
            ts = TextStats(doc)
            cur_cmplx = ts.readability_stats['flesch_kincaid_grade_level']
        except Exception:
            cur_cmplx = 0
        correct_cmplx.append(cur_cmplx)

    for output in progressbar.progressbar(incorrect_outputs):
        psg = output['metadata'][0]['original_passage']
        try:
            doc = textacy.make_spacy_doc(psg)
            ts = TextStats(doc)
            cur_cmplx = ts.readability_stats['flesch_kincaid_grade_level']
        except Exception:
            cur_cmplx = 0
        incorrect_cmplx.append(cur_cmplx)

    correct_cmplx_dict = {
        'Complexity': correct_cmplx,
        'Meta Prediction': correct_meta_labels,
        'Base Network Prediction': [1] * len(correct_cmplx)
    }
    incorrect_cmplx_dict = {
        'Complexity': incorrect_cmplx,
        'Meta Prediction': incorrect_meta_labels,
        'Base Network Prediction': [0] * len(incorrect_cmplx)
    }

    correct_cmplx_df = pd.DataFrame.from_dict(correct_cmplx_dict)
    incorrect_cmplx_df = pd.DataFrame.from_dict(incorrect_cmplx_dict)

    return correct_cmplx_df, incorrect_cmplx_df
Example #5
def _exec_pipeline_for_sub_corpus(self, normalize_texts, batch_id, docs):
    # Internal function to enable multi-threaded pipeline execution
    sub_corpus = textacy.Corpus(self.nlp)
    for doc in docs:
        if doc['text']:
            if normalize_texts:
                spacy_doc = textacy.make_spacy_doc(
                    (normalize(self.language, doc['text']), {'celex': doc['celex']}), self.nlp)
            else:
                # pass (text, metadata) as a single record, matching the branch above
                spacy_doc = textacy.make_spacy_doc(
                    (doc['text'], {'celex': doc['celex']}), self.nlp)
            sub_corpus.add_doc(spacy_doc)
    return sub_corpus
Example #6
def get_sentiment(data, tags, sentence, verbose):
    if len(tags) > 0:
        ct = 0
        for title in data.titles:
            for tag in tags:
                if title == tag:
                    click.echo(tag)
                    content = data.documents[ct]
            ct += 1
        interview = Content(content)
        doc = textacy.make_spacy_doc(interview.doc)

        ## Sentiment
        s = Sentiment()

        if sentence is True:
            for sentence in doc.sents:
                if len(sentence) > 3:
                    sent = s.sentiment_analyzer_scores(sentence.text)
                    if verbose:
                        click.echo("{:-<40} {}\n".format(
                            sent["sentence"], str(sent["score"])))
                    click.echo(s.sentiment())

        else:
            sent = s.sentiment_analyzer_scores(doc.text)
            if verbose:
                click.echo("{:-<40} {}\n".format(sent["sentence"],
                                                 str(sent["score"])))
            click.echo(s.sentiment())
        return s.sentiment()
    else:
        all_interviews = Content(data.content)
        doc = textacy.make_spacy_doc(all_interviews.doc)

        ## Sentiment
        s = Sentiment()
        if sentence is True:
            for sentence in doc.sents:
                if len(sentence) > 3:
                    sent = s.sentiment_analyzer_scores(sentence.text)
                    if verbose:
                        click.echo("{:-<40} {}\n".format(
                            sent["sentence"], str(sent["score"])))
                    click.echo(s.sentiment())

        else:
            sent = s.sentiment_analyzer_scores(doc.text)
            if verbose:
                click.echo("{:-<40} {}\n".format(sent["sentence"],
                                                 str(sent["score"])))
            click.echo(s.sentiment())
        return s.sentiment()
Example #7
def q_complexity(args):
    correct_meta_labels = create_meta_labels(
        os.path.join(args.meta_outputs_dir, CORRECT_META_FILE))
    incorrect_meta_labels = create_meta_labels(
        os.path.join(args.meta_outputs_dir, INCORRECT_META_FILE))

    correct_outputs = torch.load(os.path.join(args.test_outputs_dir,
                                              CORRECT + LAYER_NAME),
                                 map_location='cpu')
    incorrect_outputs = torch.load(os.path.join(args.test_outputs_dir,
                                                INCORRECT + LAYER_NAME),
                                   map_location='cpu')

    correct_cmplx = []
    incorrect_cmplx = []

    for output in progressbar.progressbar(correct_outputs):
        q = ' '.join(output['metadata'][0]['question_tokens'])
        try:
            doc = textacy.make_spacy_doc(q)
            ts = TextStats(doc)
            cur_cmplx = ts.readability_stats['flesch_kincaid_grade_level']
        except Exception:
            cur_cmplx = 0
        correct_cmplx.append(cur_cmplx)

    for output in progressbar.progressbar(incorrect_outputs):
        q = ' '.join(output['metadata'][0]['question_tokens'])
        try:
            doc = textacy.make_spacy_doc(q)
            ts = TextStats(doc)
            cur_cmplx = ts.readability_stats['flesch_kincaid_grade_level']
        except Exception:
            cur_cmplx = 0
        incorrect_cmplx.append(cur_cmplx)

    correct_cmplx_dict = {
        'Complexity': correct_cmplx,
        'Meta Prediction': correct_meta_labels,
        'Base Network Prediction': [1] * len(correct_cmplx)
    }
    incorrect_cmplx_dict = {
        'Complexity': incorrect_cmplx,
        'Meta Prediction': incorrect_meta_labels,
        'Base Network Prediction': [0] * len(incorrect_cmplx)
    }

    correct_cmplx_df = pd.DataFrame.from_dict(correct_cmplx_dict)
    incorrect_cmplx_df = pd.DataFrame.from_dict(incorrect_cmplx_dict)

    return correct_cmplx_df, incorrect_cmplx_df
Example #8
def analyzeSubLevel(input_text: str):
    """
    :Returns: highest CEFR of word in input_text, flesh_kincade_level, number of words
    """

    if (not (isinstance(input_text, str)) or (len(input_text) <= 0)):
        return ""

    if (this.cefr_data is None):
        this.cefr_data = loadCefrList()

    # TBD make static
    if (this.spacy_en is None):
        this.spacy_en = loadSpacyLangEn()

    # normalize text with NLP
    input_text = processText(input_text)

    doc = textacy.make_spacy_doc(input_text, lang=this.spacy_en)
    ts = textacy.TextStats(doc)

    flesh_kincade_level = calcFleshKincadeGrade(ts.n_words,
                                                ts.n_syllables / ts.n_words)

    # store words of text lowercase in list
    words: list = [item.lower() for item in input_text.split()]
    max_level = getMaxWordLevelForWordsSet(set(words), this.cefr_data)

    return max_level, flesh_kincade_level, ts.n_words
Example #9
def test_empty_stats():
    text = ""
    doc = make_spacy_doc(text, lang=SPACY_MODEL)
    stats = nlp.compute_stats(doc)
    assert stats.counts.sentences == 0
    assert stats.counts.words == 0
    assert stats.readibility is None
Example #10
def get_flesch(text):
    doc = textacy.make_spacy_doc(text, lang=en)
    ts = TextStats(doc)
    try:
        return ts.flesch_kincaid_grade_level
    except ZeroDivisionError:
        return (11.8 * ts.n_syllables) + (0.39 * ts.n_words) - 15.59
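
A hedged usage sketch: the snippet relies on a module-level `en` pipeline and a TextStats import that it does not define itself, so the setup below is an assumption.
# Assumed setup for the module-level pipeline used by get_flesch.
en = textacy.load_spacy_lang("en_core_web_sm")
print(get_flesch("Short sentences with short words tend to score low."))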
Example #11
def txt_to_df(txt_lst, captions_clm_name="captions"):
    """
    Transform a list of texts to a df with some stats to reshape the number of captions
    """
    # Load the language model for textacy
    en = en_core_web_sm.load()

    captions_lst = list()
    for txt in txt_lst:

        doc = textacy.make_spacy_doc(txt, lang=en)

        ts = textacy.text_stats.TextStats(doc)
        df = pd.DataFrame({
            "n_chars": [ts.n_chars],
            captions_clm_name: doc.text
        })

        captions_lst.append(df)

    # Concat all df's into captions df for easy sorting and manipulation
    captions_df = pd.concat(captions_lst, ignore_index=True)
    captions_df['sent_order'] = captions_df.index

    return captions_df
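
A brief usage sketch (illustrative only), assuming textacy, pandas as pd, and en_core_web_sm are importable as in the snippet.
captions = ["A dog runs across the field.",
            "Two children play near a fountain in the park."]
df = txt_to_df(captions)
print(df[["n_chars", "captions", "sent_order"]])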
Example #12
def augment_document(self, doc):
    try:
        doc = textacy.make_spacy_doc(doc, lang="en_core_web_sm")
        doc = self.augmenter.apply_transforms(doc)
        return str(doc)
    except Exception:
        return str(doc)
Example #13
def main(text,
         dmodels,
         snormalize='lemma',
         sngrams=(1, 2, 3, 4, 5, 6),
         sinclude_pos=('NOUN', 'PROPN', 'ADJ'),
         swindow_size=1500,
         stopn=1.,
         sidf=None,
         verbose=False):
    # identify language
    language = textacy.lang_utils.identify_lang(text)
    if verbose: print('[info] language = "%s"' % language)
    # load language model
    nlp = textacy.load_spacy_lang(dmodels[language], disable=("parser", ))
    # create documents
    doc = textacy.make_spacy_doc(text, lang=nlp)
    # model launch
    keywords = textacy.ke.sgrank(
        doc,
        normalize=snormalize,  #normalize = None, #normalize = 'lower', 
        ngrams=sngrams,
        include_pos=sinclude_pos,
        window_size=swindow_size,
        topn=stopn,
        idf=sidf)
    # return
    return keywords
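
A usage sketch for the keyword extractor above; the model mapping is a hypothetical assumption (only English is assumed to be installed), and it presumes a textacy version that still provides textacy.ke.sgrank and textacy.lang_utils.
dmodels = {"en": "en_core_web_sm"}  # hypothetical language-to-model mapping
sample = "Natural language processing lets computers extract key terms from raw text."
print(main(sample, dmodels, stopn=5, verbose=True))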
Example #14
def analyze_post(post, debug=False):
    "Perform NLP analysis"

    counters = PerfCounters()
    nlp = create_objdict()

    # clean fields
    counters.start('cleanup')
    clean_fields = generate_clean_fields(post)
    nlp.clean_fields = clean_fields
    counters.stop('cleanup')

    # creating spacy docs
    counters.start('make_spacy_docs')
    all_cleaned_content = ' '.join([clean_fields.title, clean_fields.category,
                                    " ".join(clean_fields.tags),
                                    clean_fields.abstract, clean_fields.text])

    # overall terms
    cleaned_doc = make_spacy_doc(all_cleaned_content, lang=SPACY_MODEL)

    # title terms
    title_doc = make_spacy_doc(clean_fields.title, lang=SPACY_MODEL)

    # for statistics
    text_doc = make_spacy_doc(post.text, lang=SPACY_MODEL)

    counters.stop('make_spacy_docs')

    # terms extraction
    counters.start('extract_key_terms')
    nlp.terms = extract_key_terms(cleaned_doc, num_terms=NUM_TERMS,
                                  algo=TERM_EXTRACTOR_ALGO, ngrams=NGRAMS)

    # !note: we restrict ngrams to one as we only want the lemmatized top terms.
    nlp.title_terms = extract_key_terms(title_doc, num_terms=NUM_TERMS,
                                        algo=TERM_EXTRACTOR_ALGO, ngrams=1)

    counters.stop('extract_key_terms')

    # text stats
    counters.start('text_stats')
    nlp.stats = compute_stats(text_doc)
    counters.stop('text_stats')
    if debug:
        counters.report()
    return nlp
Example #15
def spacy_doc():
    text = (
        "Democrats might know that they stand against Trump's policies, but coming up with their own plan is harder than you think. "
        "For a long time, the party's top echelon has been captive to free trade orthodoxy. "
        "Since Bill Clinton, the theory of the case among the Democratic Party's elite has been the more globalization, the better — with mostly a deaf ear turned to the people and places most badly affected. "
        "Worse, their response to globalization's excesses has been: "
        "Here's a new trade deal, much better than the last one.")
    return make_spacy_doc(text, lang="en")
Example #16
def get_noun_phrases(text, nlp):
    doc = textacy.make_spacy_doc(text, lang='en_core_web_sm')
    # print([chunk.text for chunk in doc.noun_chunks])
    noun_phrase = [chunk.text for chunk in doc.noun_chunks] # noun phrase
    single_noun = [word.text for word in doc if (word.pos_ in ["NOUN", "ADJ", "ADV"] and not word.is_stop)] # single NOUN, ADJ, ADV

    all_noun_phrases = list(set(noun_phrase + single_noun))
    return all_noun_phrases
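
An illustrative call (not from the source): note that the nlp argument is accepted but the snippet loads its own en_core_web_sm pipeline internally; spacy and textacy are assumed to be imported.
nlp = spacy.load("en_core_web_sm")  # passed through, though unused by the snippet itself
print(get_noun_phrases("The quick brown fox jumps over the lazy dog.", nlp))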
Example #17
def detect_verb_phrases(sentence, return_as_string: bool = True):
    pattern = r"(<VERB>?<ADV>*<VERB>+)"
    doc = make_spacy_doc(data=sentence, lang="en_core_web_sm")
    verb_phrases = pos_regex_matches(doc=doc, pattern=pattern)

    if return_as_string:
        return " ".join([c.text for c in verb_phrases])
    return verb_phrases
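
A hedged usage sketch: pos_regex_matches comes from older textacy releases, so this assumes such a version is installed and imported alongside make_spacy_doc.
# Expected to return the matched verb phrases joined into a single string.
print(detect_verb_phrases("She has been quietly reading and will keep reading."))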
Example #18
def test_single_sentence_doc():
    doc = textacy.make_spacy_doc(
        "This is a document with a single sentence.",
        lang="en_core_web_sm",
    )
    result = kt.scake(doc)
    assert isinstance(result, list)
    assert len(result) > 0
Example #19
def make_corpus(df: pd.DataFrame, col_name: str,
                min_token_count: int) -> textacy.Corpus:
    spacy_records = df[col_name].apply(
        lambda x: textacy.make_spacy_doc(x, lang="en"))
    long_records = [
        record for record in spacy_records if len(record) >= min_token_count
    ]
    corpus = textacy.Corpus("en", data=list(long_records))
    return corpus
Example #20
def ts():
    text = """
    Mr. Speaker, 480,000 Federal employees are working without pay, a form of involuntary servitude; 280,000 Federal employees are not working, and they will be paid. Virtually all of these workers have mortgages to pay, children to feed, and financial obligations to meet.
    Mr. Speaker, what is happening to these workers is immoral, is wrong, and must be rectified immediately. Newt Gingrich and the Republican leadership must not continue to hold the House and the American people hostage while they push their disastrous 7-year balanced budget plan. The gentleman from Georgia, Mr. Gingrich, and the Republican leadership must join Senator Dole and the entire Senate and pass a continuing resolution now, now to reopen Government.
    Mr. Speaker, that is what the American people want, that is what they need, and that is what this body must do.
    """.strip()
    doc = make_spacy_doc(text, lang="en")
    ts_ = text_stats.TextStats(doc)
    return ts_
Example #21
def test_terms():
    text = "the quick fox and the cat. The turtle and the rabbit."
    doc = make_spacy_doc(text, lang=SPACY_MODEL)
    terms = nlp.extract_key_terms(doc, num_terms=5)
    terms = [t[0] for t in terms]  # remove scores
    assert 'fox' in terms
    assert 'cat' in terms
    assert 'turtle' in terms
    assert 'rabbit' in terms
Example #22
def ts_en():
    text = (
        "Many years later, as he faced the firing squad, Colonel Aureliano Buendía was "
        "to remember that distant afternoon when his father took him to discover ice. "
        "At that time Macondo was a village of twenty adobe houses, built on the bank "
        "of a river of clear water that ran along a bed of polished stones, which were "
        "white and enormous, like prehistoric eggs. The world was so recent that many "
        "things lacked names, and in order to indicate them it was necessary to point."
    )
    return textacy.TextStats(textacy.make_spacy_doc(text, lang="en"))
Example #23
def generate_categories(data, tags, num):
    q = Qrmine()

    if len(tags) > 0:
        ct = 0
        for title in data.titles:
            for tag in tags:
                if title == tag:
                    click.echo(tag)
                    content = data.documents[ct]
            ct += 1
        interview = Content(content)
        doc = textacy.make_spacy_doc(interview.doc)
        return q.print_categories(doc, num)

    else:
        all_interviews = Content(data.content)
        doc = textacy.make_spacy_doc(all_interviews.doc)
        return q.print_categories(doc, num)
Example #24
def test_two_term_behavhior():
    """Case when there is less than 3 words and rank algo can't be used.
    """
    text = "search page"
    doc = make_spacy_doc(text, lang=SPACY_MODEL)
    terms = nlp.extract_key_terms(doc, num_terms=5)
    assert 'search' == terms[0][0]
    assert 0.5 == terms[0][1]
    assert 'page' == terms[1][0]
    assert 0.5 == terms[1][1]
Example #25
def ts_es():
    text = (
        "Muchos años después, frente al pelotón de fusilamiento, el coronel Aureliano "
        "Buendía había de recordar aquella tarde remota en que su padre lo llevó a "
        "conocer el hielo. Macondo era entonces una aldea de veinte casas de barro y "
        "cañabrava construidas a la orilla de un río de aguas diáfanas que se precipitaban "
        "por un lecho de piedras pulidas, blancas y enormes como huevos prehistóricos. "
        "El mundo era tan reciente, que muchas cosas carecían de nombre, y para "
        "mencionarlas había que señalarlas con el dedo.")
    return textacy.TextStats(textacy.make_spacy_doc(text, lang="es"))
Example #26
def preText(text, pos_bow, neg_bow):
    # parameter:
    # text : takes a sentence, string
    # pos_bow: positive bag of words, list
    # neg_bow: negative bag of words, list

    # return:
    # score_word_sim : similarity score for all verbs, float
    # score_bow: score for bag of words implementation, float

    # recognize verb pattern
    pattern = [{
        "POS": "VERB",
        "OP": "*"
    }, {
        "POS": "ADV",
        "OP": "*"
    }, {
        "POS": "VERB",
        "OP": "+"
    }, {
        "POS": "PART",
        "OP": "*"
    }]

    # extract verb pattern
    doc = textacy.make_spacy_doc(text, lang='en_core_web_lg')
    verbs = textacy.extract.matches(doc, pattern)
    score_word_sim = 0.0
    score_bow = 0.0
    for verb in verbs:
        # singularize verb, e.g. "likes" to "like"
        singularized_verb = singularize(verb.text)
        score_word_sim += wordSimilarity(pos_bow, neg_bow, singularized_verb)
        # apply bag of words to the singularized verb
        score_bow += pos_bow.count(str(singularized_verb))
        score_bow -= neg_bow.count(str(singularized_verb))

    # aggregate all verb similarity
    if score_word_sim > 0.5:
        score_word_sim = 1.0
    elif score_word_sim < -0.5:
        score_word_sim = -1.0
    else:
        score_word_sim = 0.0

    # aggregate the count with bag of words
    if score_bow > 0.5:
        score_bow = 1.0
    elif score_bow < -0.5:
        score_bow = -1.0
    else:
        score_bow = 0.0

    return score_word_sim, score_bow
Example #27
def get_nouns(sent):
    about_talk_text = (get_useCase(sent)[0])
    about_talk_doc = textacy.make_spacy_doc(about_talk_text,
                                            lang='en_core_web_sm')
    # Extract noun phrases to show which nouns are involved
    chunks = []
    for chunk in about_talk_doc.noun_chunks:
        print(chunk)
        chunks.append(chunk)
    #print(chunks)
    return chunks
Example #28
def test_invalid_data(self):
    invalid_contents = [
        b"This is an English sentence in bytes.",
        {
            "content": "This is an English sentence as dict value."
        },
        True,
    ]
    for invalid_content in invalid_contents:
        with pytest.raises(TypeError):
            _ = make_spacy_doc(invalid_content)
Example #29
def remove_det(phrase, nlp):
    pattern = [
        {'POS': 'DET', 'OP': '+'},
        {'POS': 'ADJ', 'OP': '*'},
        {'POS': 'NOUN', "OP": '+'}
    ]
    doc = textacy.make_spacy_doc(phrase, lang='en_core_web_sm')
    # if phrase.startswith("an ") or phrase.startswith("a ") or phrase.startswith("the "):
    matches = [phs.text for phs in textacy.extract.matches(doc, pattern)]
    if len(matches) != 0 and phrase.find(matches[0]) == 0: # matched phrase should be in the beginning
        return phrase.split(" ",1)[1]
    return phrase
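
An illustrative call (an assumption, not from the source project): it presumes textacy and spacy are imported and en_core_web_sm is installed.
nlp = spacy.load("en_core_web_sm")  # accepted but unused by the snippet itself
print(remove_det("the big dog", nlp))        # expected: "big dog"
print(remove_det("dogs in the park", nlp))   # no match at position 0, returned unchanged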
Example #30
def doc():
    lang = textacy.load_spacy_lang("en_core_web_sm")
    text = (
        "Many years later, as he faced the firing squad, Colonel Aureliano Buendía was "
        "to remember that distant afternoon when his father took him to discover ice. "
        "At that time Macondo was a village of twenty adobe houses, built on the bank "
        "of a river of clear water that ran along a bed of polished stones, which were "
        "white and enormous, like prehistoric eggs. The world was so recent that many "
        "things lacked names, and in order to indicate them it was necessary to point."
    )
    meta = {"author": "Gabriel García Márquez", "title": "Cien años de soledad"}
    return textacy.make_spacy_doc((text, meta), lang=lang)
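
A short usage note (a sketch, assuming a textacy version that registers the `meta` extension on spaCy Doc objects): the metadata passed to make_spacy_doc as a (text, meta) record can be read back from the returned doc.
d = doc()
print(d._.meta["author"])  # "Gabriel García Márquez"
print(len(d), "tokens")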