Example #1
def get_textacy_name_entities(text,
                              article_id,
                              drop_determiners=True,
                              exclude_types='numeric'):
    '''Get Named Entities using textacy
    ## NOT USED IN THE PROJECT
    text: full_text or summary
    article_id: string, article id (names of json files)
    Return a pd dataframe with two columns: named entities and entities category
    '''

    en = textacy.load_spacy_lang("en_core_web_sm", disable=("parser", ))
    if isinstance(text, str):  # if raw string
        doc = textacy.make_spacy_doc(text, lang=en)
    elif isinstance(text, Doc):  # if pre-created spacy doc
        doc = text
    else:
        doc = textacy.make_spacy_doc("NA", lang=en)

    nes = textacy.extract.entities(
        doc, drop_determiners=drop_determiners,
        exclude_types=exclude_types)  # nes is a generator
    ne_list = []
    ne_label_list = []

    for ne in nes:
        ne_list.append(ne)
        ne_label_list.append(ne.label_)

    data = pd.DataFrame(data={'text': ne_list, 'label': ne_label_list})
    data = data.drop_duplicates(keep='first')
    if article_id is not None:  # store article ID for csv
        data['article_id'] = article_id
    return data
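A minimal usage sketch (not part of the original example), assuming textacy, pandas, spaCy's en_core_web_sm model, and the imports the function relies on; the sample text and article id below are made up:

import pandas as pd
import textacy
from spacy.tokens import Doc

ner_df = get_textacy_name_entities(
    "Apple opened a new office in Berlin in March 2019.",
    article_id="article_0001")
print(ner_df)  # columns: text, label, article_id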
Example #2
def main(text,
         dmodels,
         snormalize='lemma',
         sngrams=(1, 2, 3, 4, 5, 6),
         sinclude_pos=('NOUN', 'PROPN', 'ADJ'),
         swindow_size=1500,
         stopn=1.,
         sidf=None,
         verbose=False):
    # identify language
    language = textacy.lang_utils.identify_lang(text)
    if verbose: print('[info] language = "%s"' % language)
    # load language model
    nlp = textacy.load_spacy_lang(dmodels[language], disable=("parser", ))
    # create documents
    doc = textacy.make_spacy_doc(text, lang=nlp)
    # model launch
    keywords = textacy.ke.sgrank(
        doc,
        normalize=snormalize,  # alternatives: normalize=None or normalize='lower'
        ngrams=sngrams,
        include_pos=sinclude_pos,
        window_size=swindow_size,
        topn=stopn,
        idf=sidf)
    # return
    return keywords
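A quick usage sketch (not from the original source); the dmodels mapping and sample text are assumptions, the named spaCy models must be installed, and the textacy version must still expose lang_utils and ke.sgrank as used above:

import textacy

dmodels = {"en": "en_core_web_sm", "fr": "fr_core_news_sm"}
sample = "Textacy builds on spaCy to provide higher-level text analysis tools."
keywords = main(sample, dmodels, stopn=10, verbose=True)
print(keywords)  # list of (keyterm, score) tuples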
Example #3
def spacy_lang():
    spacy_lang = load_spacy_lang("en_core_web_sm")
    spacy_lang.add_pipe("textacy_text_stats", last=True)

    yield spacy_lang

    # remove component after running these tests
    spacy_lang.remove_pipe("textacy_text_stats")
Example #4
def spacy_doc():
    spacy_lang = load_spacy_lang("en_core_web_sm")
    text = ("The unit tests aren't going well. "
            "I love Python, but I don't love backwards incompatibilities. "
            "No programmers were permanently damaged for textacy's sake. "
            "Thank God for Stack Overflow.")
    spacy_doc = spacy_lang(text.strip())
    return spacy_doc
Example #5
 def test_to_tokenized_text_nosents(self):
     spacy_lang = load_spacy_lang("en")
     with spacy_lang.disable_pipes("parser"):
         doc = spacy_lang("This is sentence #1. This is sentence #2.")
     tokenized_text = doc._.to_tokenized_text()
     assert isinstance(tokenized_text, list)
     assert len(tokenized_text) == 1
     assert isinstance(tokenized_text[0], list)
     assert isinstance(tokenized_text[0][0], str)
Example #6
def spacy_lang():
    spacy_lang = load_spacy_lang("en")
    text_stats_component = components.TextStatsComponent()
    spacy_lang.add_pipe(text_stats_component, after="parser")

    yield spacy_lang

    # remove component after running these tests
    spacy_lang.remove_pipe("textacy_text_stats")
Example #7
def spacy_doc():
    spacy_lang = load_spacy_lang("en_core_web_sm")
    text = (
        "Two weeks ago, I was in Kuwait participating in an I.M.F. (International Monetary Fund) seminar for Arab educators. "
        "For 30 minutes, we discussed the impact of technology trends on education in the Middle East. "
        "And then an Egyptian education official raised his hand and asked if he could ask me a personal question: \"I heard Donald Trump say we need to close mosques in the United States,\" he said with great sorrow. "
        "\"Is that what we want our kids to learn?\"")
    spacy_doc = spacy_lang(text)
    return spacy_doc
Example #8
 def test_corpus_init_docs(self):
     limit = 3
     spacy_lang = load_spacy_lang("en")
     texts = DATASET.texts(limit=limit)
     docs = [spacy_lang(text) for text in texts]
     corpus = Corpus("en", data=docs)
     assert len(corpus) == len(corpus.docs) == limit
     assert all(doc.vocab is corpus.spacy_lang.vocab for doc in corpus)
     assert all(doc1 is doc2 for doc1, doc2 in zip(docs, corpus))
Example #9
    def __init__(self, language):
        if language not in ("en", "de"):
            raise ValueError("Language not supported")
        self.language = language

        config = configparser.ConfigParser()
        config.read("config.ini")
        self.threads = int(config.get("analysis", "threads"))

        if self.language == "en":
            # pip install https://blackstone-model.s3-eu-west-1.amazonaws.com/en_blackstone_proto-0.0.1.tar.gz
            # Use Blackstone model which has been trained on english legal texts (https://github.com/ICLRandD/Blackstone)
            self.nlp = textacy.load_spacy_lang("en_blackstone_proto", disable=("textcat"))
            if not ("sentence_segmenter" or "CompoundCases") in self.nlp.pipe_names:
                # Use a custom sentence segmenter for better tokenization
                sentence_segmenter = SentenceSegmenter(self.nlp.vocab, CONCEPT_PATTERNS)
                self.nlp.add_pipe(sentence_segmenter, before="parser")
                # https://github.com/ICLRandD/Blackstone#compound-case-reference-detection
                compound_pipe = CompoundCases(self.nlp)
                self.nlp.add_pipe(compound_pipe)
            else:
                print("Please only instantiate this class only once per language.")
            stanza.download("en", processors="tokenize, sentiment", logging_level="WARN")
            self.stanza_nlp = stanza.Pipeline(lang="en", processors="tokenize, sentiment",
                                              tokenize_pretokenized=True, logging_level="WARN")
        else:
            # python -m spacy download de_core_news_md
            self.nlp = textacy.load_spacy_lang("de_core_news_md", disable=("textcat"))
            # Textacy caches loaded pipeline components. So do not add them again if they are already present.
            if not ("sentence_segmenter" or "spacyiwnlp") in self.nlp.pipe_names:
                iwnlp = spaCyIWNLP(lemmatizer_path='data/IWNLP.Lemmatizer_20181001.json', ignore_case=True)
                self.nlp.add_pipe(iwnlp)
                sentence_segmenter = SentenceSegmenter(self.nlp.vocab, CONCEPT_PATTERNS)
                self.nlp.add_pipe(sentence_segmenter, before="parser")
            else:
                print("Please only instantiate this class only once per language.")
            stanza.download("de", processors="tokenize, sentiment", logging_level="WARN")
            self.stanza_nlp = stanza.Pipeline(lang="de", processors="tokenize, sentiment",
                                              tokenize_pretokenized=True, logging_level="WARN")

        self.corpus = None
Example #10
def doc():
    nlp = load_spacy_lang("en_core_web_sm")
    text = (
        "Many years later, as he faced the firing squad, Colonel Aureliano Buendía was "
        "to remember that distant afternoon when his father took him to discover ice. "
        "At that time Macondo was a village of twenty adobe houses, built on the bank "
        "of a river of clear water that ran along a bed of polished stones, which were "
        "white and enormous, like prehistoric eggs. The world was so recent that many "
        "things lacked names, and in order to indicate them it was necessary to point."
    )
    return nlp(text)
Example #11
def doc():
    lang = textacy.load_spacy_lang("en_core_web_sm")
    text = (
        "Many years later, as he faced the firing squad, Colonel Aureliano Buendía was "
        "to remember that distant afternoon when his father took him to discover ice. "
        "At that time Macondo was a village of twenty adobe houses, built on the bank "
        "of a river of clear water that ran along a bed of polished stones, which were "
        "white and enormous, like prehistoric eggs. The world was so recent that many "
        "things lacked names, and in order to indicate them it was necessary to point."
    )
    meta = {"author": "Gabriel García Márquez", "title": "Cien años de soledad"}
    return textacy.make_spacy_doc((text, meta), lang=lang)
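A short follow-up (not in the original): when make_spacy_doc receives a (text, metadata) tuple, textacy attaches the metadata to the doc, which can be read back through the doc._.meta extension (exact extension registration varies by textacy version):

d = doc()
print(d._.meta["author"])  # "Gabriel García Márquez"
print(d._.meta["title"])   # "Cien años de soledad"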
Example #12
    def get_textacy_doc(text):
        """
        Gets document of textacy library
        :param text: Text of which textacy doc to get
        :return: tuple Textacy doc, Processed text
        """
        en = textacy.load_spacy_lang(NLPService._WORD_MODEL_NAME,
                                     disable=('parser', ))
        processed_text = textacy.preprocess_text(text,
                                                 lowercase=True,
                                                 no_punct=True)

        return textacy.make_spacy_doc(processed_text, lang=en), processed_text
Example #13
 def test_invalid_data_lang_combo(self):
     spacy_lang = load_spacy_lang("en")
     combos = (
         (spacy_lang("Hello, how are you my friend?"), "es"),
         (spacy_lang("Hello, how are you my friend?"), True),
         ("This is an English sentence.", True),
         (("This is an English sentence.", {
             "foo": "bar"
         }), True),
     )
     for data, lang in combos:
         with pytest.raises((ValueError, TypeError)):
             _ = make_spacy_doc(data, lang=lang)
Example #14
def docs():
    lang = textacy.load_spacy_lang("en_core_web_sm")
    texts = [
        "Mary had a little lamb. Its fleece was white as snow.",
        "Everywhere that Mary went the lamb was sure to go.",
        # "It followed her to school one day, which was against the rule.",
        # "It made the children laugh and play to see a lamb at school.",
        # "And so the teacher turned it out, but still it lingered near.",
        # "It waited patiently about until Mary did appear.",
        # "Why does the lamb love Mary so? The eager children cry.",
        # "Mary loves the lamb, you know, the teacher did reply.",
    ]
    return [textacy.make_spacy_doc(text, lang=lang) for text in texts]
Example #15
def test_to_gensim(spacy_doc):
    spacy_lang = load_spacy_lang("en")
    result = export.docs_to_gensim(
        [spacy_doc], spacy_lang.vocab,
        filter_stops=True, filter_punct=True, filter_nums=True,
    )
    assert isinstance(result[0], str)
    assert isinstance(result[1], list)
    assert isinstance(result[1][0], list)
    assert isinstance(result[1][0][0], tuple)
    assert (
        isinstance(result[1][0][0][0], int)
        and isinstance(result[1][0][0][1], int)
    )
Example #16
 def test_corpus_add(self, corpus):
     spacy_lang = load_spacy_lang("en")
     datas = (
         "This is an english sentence.",
         ("This is an english sentence.", {"foo": "bar"}),
         spacy_lang("This is an english sentence."),
         ["This is one sentence.", "This is another sentence."],
         [("This is sentence #1.", {"foo": "bar"}), ("This is sentence #2.", {"bat": "baz"})],
         [spacy_lang("This is sentence #1"), spacy_lang("This is sentence #2")],
     )
     n_docs = corpus.n_docs
     for data in datas:
         corpus.add(data)
         assert corpus.n_docs > n_docs
         n_docs = corpus.n_docs
Example #17
 def __init__(self):
     self._min_occurrence_for_topic = 2
     self._common_verbs = 10
     # create an empty corpus
     self._en = textacy.load_spacy_lang('en_core_web_sm', disable=('parser',))
     self._corpus = textacy.Corpus(lang=self._en)
     self._content = None
     self._model = None
     self._numdocs = 0
     self._numtopics = 0
     self._terms = None
     self._doc_term_matrix = None
     self._doc_topic_matrix = None
     self._vectorizer = Vectorizer(tf_type='linear', apply_idf=True, idf_type='smooth',
                                   norm='l2', min_df=3, max_df=0.95, max_n_terms=100000)
Example #18
def tokenized_docs():
    texts = [
        "Mary had a little lamb. Its fleece was white as snow.",
        "Everywhere that Mary went the lamb was sure to go.",
        "It followed her to school one day, which was against the rule.",
        "It made the children laugh and play to see a lamb at school.",
        "And so the teacher turned it out, but still it lingered near.",
        "It waited patiently about until Mary did appear.",
        "Why does the lamb love Mary so? The eager children cry.",
        "Mary loves the lamb, you know, the teacher did reply.",
    ]
    nlp = textacy.load_spacy_lang("en_core_web_sm")
    docs = list(nlp.pipe(texts))
    tokenized_docs = [
        [term.text.lower() for term in extract.terms(doc, ngs=1)] for doc in docs
    ]
    return tokenized_docs
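These tokenized docs are in the iterable-of-token-lists form that textacy's Vectorizer (compare Example #17) consumes. A brief sketch of building a document-term matrix from them, assuming the newer import path (older releases expose Vectorizer under textacy.vsm instead):

from textacy.representations import Vectorizer

vectorizer = Vectorizer(tf_type="linear", idf_type="smooth", norm="l2")
doc_term_matrix = vectorizer.fit_transform(tokenized_docs())
print(doc_term_matrix.shape)       # (n_docs, n_terms)
print(vectorizer.terms_list[:10])  # vocabulary in column order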
Example #19
def test_bad_filters():
    bad_filters = (
        {"lang": "xx"},
        {"lang": ["en", "un"]},
    )
    for bad_filter in bad_filters:
        with pytest.raises(ValueError):
            list(DATASET.texts(**bad_filter))
    bad_filters = (
        {"lang": True},
        {"lang": textacy.load_spacy_lang("en_core_web_sm")},
    )
    for bad_filter in bad_filters:
        with pytest.raises(TypeError):
            list(DATASET.texts(**bad_filter))
Example #20
 def test_bad_name(self, name):
     with pytest.raises(OSError):
         _ = load_spacy_lang(name)
Example #21
 def test_disable_hashability(self, kwargs):
     with pytest.raises(TypeError):
         _ = load_spacy_lang("en_core_web_sm", **kwargs)
Example #22
 def test_load_model_kwargs(self, kwargs):
     assert isinstance(
         load_spacy_lang("en_core_web_sm", **kwargs),
         spacy.language.Language,
     )
Example #23
 def test_load_model(self, name):
     assert isinstance(load_spacy_lang(name), spacy.language.Language)
Example #24
def langs():
    return (
        "en_core_web_sm",
        load_spacy_lang("en_core_web_sm"),
        lambda text: "en_core_web_sm",
    )
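These are the three forms textacy accepts for a lang argument: a model name, an already-loaded pipeline, or a callable that maps text to a model name. A quick check sketch (not part of the original fixture), assuming en_core_web_sm is installed:

import textacy

for lang in langs():
    doc = textacy.make_spacy_doc("A short English sentence.", lang=lang)
    print(type(lang).__name__, doc.lang_)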
Example #25
def en_core_web_sm():
    return load_spacy_lang("en_core_web_sm")
Example #26
from bs4 import BeautifulSoup
import click
import logging
import os
from pathlib import Path
from dotenv import find_dotenv, load_dotenv

import textacy
import textacy.keyterms
import ftfy

import entities

# Load English tokenizer, tagger, parser, NER and word vectors
en = textacy.load_spacy_lang("en_core_web_lg")
patterns = [
    {
        "label": "ORG",
        "pattern": [{
            "lower": "european"
        }, {
            "lower": "central"
        }, {
            "lower": "bank"
        }]
    },
    {
        "label": "ORG",
        "pattern": [{
            "lower": "bank"
        }, {
Example #27
def spacy_doc():
    text = "I would have lived in peace. But my enemies brought me war."
    spacy_lang = load_spacy_lang("en")
    spacy_doc = spacy_lang(text)
    return spacy_doc
Example #28
def clustering_analysis(input=None,
                        algorithm="s",
                        n_key_float=0.75,
                        n_grams="1,2,3,4",
                        cutoff=10,
                        threshold=0.5):
    if algorithm != "t" and algorithm != "s":
        return "Specify an algorithm! (t)extrank or (s)grank"

    alldata = []
    for curline in input:
        alldata.append(curline["message"])

    # the cumulative tally of common keywords
    word_keyterm_cummula = defaultdict(int)
    # the mapping of journals to the common keywords
    word_keyterm_journals = defaultdict(list)

    en = textacy.load_spacy_lang("en_core_web_sm", disable=("parser", ))
    for item in alldata:
        msgid = item.split(' ')[0]
        curline = item.replace(msgid, '').strip()
        curdoc = textacy.make_spacy_doc(curline.lower(), lang=en)
        curdoc_ranks = []
        if algorithm == "t":
            if n_key_float > 0.0 and n_key_float < 1.0:
                curdoc_ranks = textacy.keyterms.textrank(
                    curdoc, normalize="lemma", n_keyterms=n_key_float)
            else:
                # assumption: treat n_key_float as an absolute keyterm count here ('n_key' was undefined)
                curdoc_ranks = textacy.keyterms.textrank(
                    curdoc, normalize="lemma", n_keyterms=int(n_key_float))
        elif algorithm == "s":
            ngram_str = set(n_grams.split(','))
            ngram = []
            for gram in ngram_str:
                ngram.append(int(gram))
            curdoc_ranks = textacy.keyterms.sgrank(curdoc,
                                                   window_width=1500,
                                                   ngrams=ngram,
                                                   normalize="lower",
                                                   n_keyterms=n_key_float)

        for word in curdoc_ranks:
            word_keyterm_cummula[word[0]] += 1
            word_keyterm_journals[word[0]].append((msgid, word[1]))
            if len(word_keyterm_journals[word[0]]) > 10:
                # cap the per-keyword list: drop entries for the lowest-scoring message id
                newlist = []
                min_pair = word_keyterm_journals[word[0]][0]
                for pair in word_keyterm_journals[word[0]]:
                    if pair[1] < min_pair[1]:
                        min_pair = pair
                for pair in word_keyterm_journals[word[0]]:
                    if pair[0] != min_pair[0]:
                        newlist.append(pair)
                word_keyterm_journals[word[0]] = newlist

    word_keyterm_cummula_sorted = sorted(word_keyterm_cummula.items(),
                                         key=lambda val: val[1],
                                         reverse=True)

    quint = 0
    quint_printout = ""
    for entry in word_keyterm_cummula_sorted[:cutoff]:
        quint_printout += entry[0] + ","
        quint += 1
    quint_printout = quint_printout[:-1]
    #print(quint_printout)
    return quint_printout
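A hypothetical invocation sketch; the records and message ids are made up, the function expects an iterable of dicts whose "message" value begins with a message id, and it relies on the older textacy.keyterms module:

records = [
    {"message": "j001 Gradient descent converges slowly on ill-conditioned problems."},
    {"message": "j002 Stochastic methods scale keyterm extraction to large corpora."},
]
top_terms = clustering_analysis(input=records, algorithm="s", cutoff=5)
print(top_terms)  # comma-separated string of the most frequent keyterms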
Example #29
 def test_corpus_init_lang(self):
     assert isinstance(Corpus("en"), Corpus)
     assert isinstance(Corpus(load_spacy_lang("en")), Corpus)
     for bad_lang in (b"en", None):
         with pytest.raises(TypeError):
             Corpus(bad_lang)
Example #30
 def test_corpus_init_no_parser(self):
     spacy_lang = load_spacy_lang("en", disable=("parser", ))
     corpus = Corpus(spacy_lang,
                     data=(spacy_lang("This is a sentence in a doc."), ))
     assert len(corpus) == 1
     assert corpus.n_sents == 0