Example #1
def get_textacy_name_entities(text,
                              article_id,
                              drop_determiners=True,
                              exclude_types='numeric'):
    '''Get Named Entities using textacy
    ## NOT USED IN THE PROJECT
    text: full_text or summary
    article_id: string, article id (names of json files)
    Return a pd dataframe with two columns: named entities and entities category
    '''

    en = textacy.load_spacy_lang("en_core_web_sm", disable=("parser", ))
    if isinstance(text, str):  # if raw string
        doc = textacy.make_spacy_doc(text, lang=en)
    elif isinstance(text, Doc):  # if pre-created spacy doc
        doc = text
    else:
        doc = textacy.make_spacy_doc("NA", lang=en)

    nes = textacy.extract.entities(
        doc, drop_determiners=drop_determiners,
        exclude_types=exclude_types)  # nes is a generator
    ne_list = []
    ne_label_list = []

    for ne in nes:
        ne_list.append(ne)
        ne_label_list.append(ne.label_)

    data = pd.DataFrame(data={'text': ne_list, 'label': ne_label_list})
    data = data.drop_duplicates(keep='first')
    if article_id is not None:  # store article ID for csv
        data['article_id'] = article_id
    return data
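A minimal usage sketch (not part of the original example), assuming textacy, pandas, spaCy's en_core_web_sm model, and the imports the function relies on; the sample text and article id below are made up:

import pandas as pd
import textacy
from spacy.tokens import Doc

ner_df = get_textacy_name_entities(
    "Apple opened a new office in Berlin in March 2019.",
    article_id="article_0001")
print(ner_df)  # columns: text, label, article_id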
Example #2
def main(text,
         dmodels,
         snormalize='lemma',
         sngrams=(1, 2, 3, 4, 5, 6),
         sinclude_pos=('NOUN', 'PROPN', 'ADJ'),
         swindow_size=1500,
         stopn=1.,
         sidf=None,
         verbose=False):
    # identify language
    language = textacy.lang_utils.identify_lang(text)
    if verbose: print('[info] language = "%s"' % language)
    # load language model
    nlp = textacy.load_spacy_lang(dmodels[language], disable=("parser", ))
    # create documents
    doc = textacy.make_spacy_doc(text, lang=nlp)
    # model launch
    keywords = textacy.ke.sgrank(
        doc,
        normalize=snormalize,  # alternatives: normalize=None or normalize='lower'
        ngrams=sngrams,
        include_pos=sinclude_pos,
        window_size=swindow_size,
        topn=stopn,
        idf=sidf)
    # return
    return keywords
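A quick usage sketch (not from the original source); the dmodels mapping and sample text are assumptions, the named spaCy models must be installed, and the textacy version must still expose lang_utils and ke.sgrank as used above:

import textacy

dmodels = {"en": "en_core_web_sm", "fr": "fr_core_news_sm"}
sample = "Textacy builds on spaCy to provide higher-level text analysis tools."
keywords = main(sample, dmodels, stopn=10, verbose=True)
print(keywords)  # list of (keyterm, score) tuples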
Example #3
def spacy_lang():
    spacy_lang = load_spacy_lang("en_core_web_sm")
    spacy_lang.add_pipe("textacy_text_stats", last=True)

    yield spacy_lang

    # remove component after running these tests
    spacy_lang.remove_pipe("textacy_text_stats")
Example #4
def spacy_doc():
    spacy_lang = load_spacy_lang("en_core_web_sm")
    text = ("The unit tests aren't going well. "
            "I love Python, but I don't love backwards incompatibilities. "
            "No programmers were permanently damaged for textacy's sake. "
            "Thank God for Stack Overflow.")
    spacy_doc = spacy_lang(text.strip())
    return spacy_doc
Example #5
 def test_to_tokenized_text_nosents(self):
     spacy_lang = load_spacy_lang("en")
     with spacy_lang.disable_pipes("parser"):
         doc = spacy_lang("This is sentence #1. This is sentence #2.")
     tokenized_text = doc._.to_tokenized_text()
     assert isinstance(tokenized_text, list)
     assert len(tokenized_text) == 1
     assert isinstance(tokenized_text[0], list)
     assert isinstance(tokenized_text[0][0], str)
Example #6
def spacy_lang():
    spacy_lang = load_spacy_lang("en")
    text_stats_component = components.TextStatsComponent()
    spacy_lang.add_pipe(text_stats_component, after="parser")

    yield spacy_lang

    # remove component after running these tests
    spacy_lang.remove_pipe("textacy_text_stats")
Example #7
def spacy_doc():
    spacy_lang = load_spacy_lang("en_core_web_sm")
    text = (
        "Two weeks ago, I was in Kuwait participating in an I.M.F. (International Monetary Fund) seminar for Arab educators. "
        "For 30 minutes, we discussed the impact of technology trends on education in the Middle East. "
        "And then an Egyptian education official raised his hand and asked if he could ask me a personal question: \"I heard Donald Trump say we need to close mosques in the United States,\" he said with great sorrow. "
        "\"Is that what we want our kids to learn?\"")
    spacy_doc = spacy_lang(text)
    return spacy_doc
Example #8
 def test_corpus_init_docs(self):
     limit = 3
     spacy_lang = load_spacy_lang("en")
     texts = DATASET.texts(limit=limit)
     docs = [spacy_lang(text) for text in texts]
     corpus = Corpus("en", data=docs)
     assert len(corpus) == len(corpus.docs) == limit
     assert all(doc.vocab is corpus.spacy_lang.vocab for doc in corpus)
     assert all(doc1 is doc2 for doc1, doc2 in zip(docs, corpus))
Example #9
    def __init__(self, language):
        if language not in ("en", "de"):
            raise ValueError("Language not supported")
        self.language = language

        config = configparser.ConfigParser()
        config.read("config.ini")
        self.threads = int(config.get("analysis", "threads"))

        if self.language == "en":
            # pip install https://blackstone-model.s3-eu-west-1.amazonaws.com/en_blackstone_proto-0.0.1.tar.gz
            # Use Blackstone model which has been trained on english legal texts (https://github.com/ICLRandD/Blackstone)
            self.nlp = textacy.load_spacy_lang("en_blackstone_proto", disable=("textcat"))
            if not ("sentence_segmenter" or "CompoundCases") in self.nlp.pipe_names:
                # Use a custom sentence segmenter for better tokenization
                sentence_segmenter = SentenceSegmenter(self.nlp.vocab, CONCEPT_PATTERNS)
                self.nlp.add_pipe(sentence_segmenter, before="parser")
                # https://github.com/ICLRandD/Blackstone#compound-case-reference-detection
                compound_pipe = CompoundCases(self.nlp)
                self.nlp.add_pipe(compound_pipe)
            else:
                print("Please only instantiate this class only once per language.")
            stanza.download("en", processors="tokenize, sentiment", logging_level="WARN")
            self.stanza_nlp = stanza.Pipeline(lang="en", processors="tokenize, sentiment",
                                              tokenize_pretokenized=True, logging_level="WARN")
        else:
            # python -m spacy download de_core_news_md
            self.nlp = textacy.load_spacy_lang("de_core_news_md", disable=("textcat"))
            # Textacy caches loaded pipeline components. So do not add them again if they are already present.
            if not ("sentence_segmenter" or "spacyiwnlp") in self.nlp.pipe_names:
                iwnlp = spaCyIWNLP(lemmatizer_path='data/IWNLP.Lemmatizer_20181001.json', ignore_case=True)
                self.nlp.add_pipe(iwnlp)
                sentence_segmenter = SentenceSegmenter(self.nlp.vocab, CONCEPT_PATTERNS)
                self.nlp.add_pipe(sentence_segmenter, before="parser")
            else:
                print("Please only instantiate this class only once per language.")
            stanza.download("de", processors="tokenize, sentiment", logging_level="WARN")
            self.stanza_nlp = stanza.Pipeline(lang="de", processors="tokenize, sentiment",
                                              tokenize_pretokenized=True, logging_level="WARN")

        self.corpus = None
Example #10
def doc():
    nlp = load_spacy_lang("en_core_web_sm")
    text = (
        "Many years later, as he faced the firing squad, Colonel Aureliano Buendía was "
        "to remember that distant afternoon when his father took him to discover ice. "
        "At that time Macondo was a village of twenty adobe houses, built on the bank "
        "of a river of clear water that ran along a bed of polished stones, which were "
        "white and enormous, like prehistoric eggs. The world was so recent that many "
        "things lacked names, and in order to indicate them it was necessary to point."
    )
    return nlp(text)
Example #11
def doc():
    lang = textacy.load_spacy_lang("en_core_web_sm")
    text = (
        "Many years later, as he faced the firing squad, Colonel Aureliano Buendía was "
        "to remember that distant afternoon when his father took him to discover ice. "
        "At that time Macondo was a village of twenty adobe houses, built on the bank "
        "of a river of clear water that ran along a bed of polished stones, which were "
        "white and enormous, like prehistoric eggs. The world was so recent that many "
        "things lacked names, and in order to indicate them it was necessary to point."
    )
    meta = {"author": "Gabriel García Márquez", "title": "Cien años de soledad"}
    return textacy.make_spacy_doc((text, meta), lang=lang)
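A short follow-up (not in the original): when make_spacy_doc receives a (text, metadata) tuple, textacy attaches the metadata to the doc, which can be read back through the doc._.meta extension (exact extension registration varies by textacy version):

d = doc()
print(d._.meta["author"])  # "Gabriel García Márquez"
print(d._.meta["title"])   # "Cien años de soledad"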
Example #12
    def get_textacy_doc(text):
        """
        Gets document of textacy library
        :param text: Text of which textacy doc to get
        :return: tuple Textacy doc, Processed text
        """
        en = textacy.load_spacy_lang(NLPService._WORD_MODEL_NAME,
                                     disable=('parser', ))
        processed_text = textacy.preprocess_text(text,
                                                 lowercase=True,
                                                 no_punct=True)

        return textacy.make_spacy_doc(processed_text, lang=en), processed_text
Example #13
 def test_invalid_data_lang_combo(self):
     spacy_lang = load_spacy_lang("en")
     combos = (
         (spacy_lang("Hello, how are you my friend?"), "es"),
         (spacy_lang("Hello, how are you my friend?"), True),
         ("This is an English sentence.", True),
         (("This is an English sentence.", {
             "foo": "bar"
         }), True),
     )
     for data, lang in combos:
         with pytest.raises((ValueError, TypeError)):
             _ = make_spacy_doc(data, lang=lang)
Example #14
def docs():
    lang = textacy.load_spacy_lang("en_core_web_sm")
    texts = [
        "Mary had a little lamb. Its fleece was white as snow.",
        "Everywhere that Mary went the lamb was sure to go.",
        # "It followed her to school one day, which was against the rule.",
        # "It made the children laugh and play to see a lamb at school.",
        # "And so the teacher turned it out, but still it lingered near.",
        # "It waited patiently about until Mary did appear.",
        # "Why does the lamb love Mary so? The eager children cry.",
        # "Mary loves the lamb, you know, the teacher did reply.",
    ]
    return [textacy.make_spacy_doc(text, lang=lang) for text in texts]
Example #15
def test_to_gensim(spacy_doc):
    spacy_lang = load_spacy_lang("en")
    result = export.docs_to_gensim(
        [spacy_doc], spacy_lang.vocab,
        filter_stops=True, filter_punct=True, filter_nums=True,
    )
    assert isinstance(result[0], str)
    assert isinstance(result[1], list)
    assert isinstance(result[1][0], list)
    assert isinstance(result[1][0][0], tuple)
    assert (
        isinstance(result[1][0][0][0], int)
        and isinstance(result[1][0][0][1], int)
    )
Example #16
 def test_corpus_add(self, corpus):
     spacy_lang = load_spacy_lang("en")
     datas = (
         "This is an english sentence.",
         ("This is an english sentence.", {"foo": "bar"}),
         spacy_lang("This is an english sentence."),
         ["This is one sentence.", "This is another sentence."],
         [("This is sentence #1.", {"foo": "bar"}), ("This is sentence #2.", {"bat": "baz"})],
         [spacy_lang("This is sentence #1"), spacy_lang("This is sentence #2")],
     )
     n_docs = corpus.n_docs
     for data in datas:
         corpus.add(data)
         assert corpus.n_docs > n_docs
         n_docs = corpus.n_docs
Example #17
 def __init__(self):
     self._min_occurrence_for_topic = 2
     self._common_verbs = 10
     # create an empty corpus
     self._en = textacy.load_spacy_lang('en_core_web_sm', disable=('parser',))
     self._corpus = textacy.Corpus(lang=self._en)
     self._content = None
     self._model = None
     self._numdocs = 0
     self._numtopics = 0
     self._terms = None
     self._doc_term_matrix = None
     self._doc_topic_matrix = None
     self._vectorizer = Vectorizer(tf_type='linear', apply_idf=True, idf_type='smooth',
                                   norm='l2', min_df=3, max_df=0.95, max_n_terms=100000)
Example #18
def tokenized_docs():
    texts = [
        "Mary had a little lamb. Its fleece was white as snow.",
        "Everywhere that Mary went the lamb was sure to go.",
        "It followed her to school one day, which was against the rule.",
        "It made the children laugh and play to see a lamb at school.",
        "And so the teacher turned it out, but still it lingered near.",
        "It waited patiently about until Mary did appear.",
        "Why does the lamb love Mary so? The eager children cry.",
        "Mary loves the lamb, you know, the teacher did reply.",
    ]
    nlp = textacy.load_spacy_lang("en_core_web_sm")
    docs = list(nlp.pipe(texts))
    tokenized_docs = [
        [term.text.lower() for term in extract.terms(doc, ngs=1)] for doc in docs
    ]
    return tokenized_docs
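These tokenized docs are in the iterable-of-token-lists form that textacy's Vectorizer (compare Example #17) consumes. A brief sketch of building a document-term matrix from them, assuming the newer import path (older releases expose Vectorizer under textacy.vsm instead):

from textacy.representations import Vectorizer

vectorizer = Vectorizer(tf_type="linear", idf_type="smooth", norm="l2")
doc_term_matrix = vectorizer.fit_transform(tokenized_docs())
print(doc_term_matrix.shape)       # (n_docs, n_terms)
print(vectorizer.terms_list[:10])  # vocabulary in column order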
Example #19
def test_bad_filters():
    bad_filters = (
        {"lang": "xx"},
        {"lang": ["en", "un"]},
    )
    for bad_filter in bad_filters:
        with pytest.raises(ValueError):
            list(DATASET.texts(**bad_filter))
    bad_filters = (
        {"lang": True},
        {"lang": textacy.load_spacy_lang("en_core_web_sm")},
    )
    for bad_filter in bad_filters:
        with pytest.raises(TypeError):
            list(DATASET.texts(**bad_filter))
Example #20
 def test_bad_name(self, name):
     with pytest.raises(OSError):
         _ = load_spacy_lang(name)
Example #21
 def test_disable_hashability(self, kwargs):
     with pytest.raises(TypeError):
         _ = load_spacy_lang("en_core_web_sm", **kwargs)
Example #22
 def test_load_model_kwargs(self, kwargs):
     assert isinstance(
         load_spacy_lang("en_core_web_sm", **kwargs),
         spacy.language.Language,
     )
Example #23
 def test_load_model(self, name):
     assert isinstance(load_spacy_lang(name), spacy.language.Language)
Example #24
def langs():
    return (
        "en_core_web_sm",
        load_spacy_lang("en_core_web_sm"),
        lambda text: "en_core_web_sm",
    )
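These are the three forms textacy accepts for a lang argument: a model name, an already-loaded pipeline, or a callable that maps text to a model name. A quick check sketch (not part of the original fixture), assuming en_core_web_sm is installed:

import textacy

for lang in langs():
    doc = textacy.make_spacy_doc("A short English sentence.", lang=lang)
    print(type(lang).__name__, doc.lang_)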
Example #25
def en_core_web_sm():
    return load_spacy_lang("en_core_web_sm")
Example #26
from bs4 import BeautifulSoup
import click
import logging
import os
from pathlib import Path
from dotenv import find_dotenv, load_dotenv

import textacy
import textacy.keyterms
import ftfy

import entities

# Load English tokenizer, tagger, parser, NER and word vectors
en = textacy.load_spacy_lang("en_core_web_lg")
patterns = [
    {
        "label": "ORG",
        "pattern": [{
            "lower": "european"
        }, {
            "lower": "central"
        }, {
            "lower": "bank"
        }]
    },
    {
        "label": "ORG",
        "pattern": [{
            "lower": "bank"
        }, {
Example #27
def spacy_doc():
    text = "I would have lived in peace. But my enemies brought me war."
    spacy_lang = load_spacy_lang("en")
    spacy_doc = spacy_lang(text)
    return spacy_doc
Example #28
def clustering_analysis(input=None,
                        algorithm="s",
                        n_key_float=0.75,
                        n_grams="1,2,3,4",
                        cutoff=10,
                        threshold=0.5):
    if algorithm != "t" and algorithm != "s":
        return "Specify an algorithm! (t)extrank or (s)grank"

    alldata = []
    for curline in input:
        alldata.append(curline["message"])

    # the cumulative tally of common keywords
    word_keyterm_cummula = defaultdict(int)
    # the mapping of journals to the common keywords
    word_keyterm_journals = defaultdict(list)

    en = textacy.load_spacy_lang("en_core_web_sm", disable=("parser", ))
    for item in alldata:
        msgid = item.split(' ')[0]
        curline = item.replace(msgid, '').strip()
        curdoc = textacy.make_spacy_doc(curline.lower(), lang=en)
        curdoc_ranks = []
        if algorithm == "t":
            if n_key_float > 0.0 and n_key_float < 1.0:
                curdoc_ranks = textacy.keyterms.textrank(
                    curdoc, normalize="lemma", n_keyterms=n_key_float)
            else:
                # assumption: treat n_key_float as an absolute keyterm count here ('n_key' was undefined)
                curdoc_ranks = textacy.keyterms.textrank(
                    curdoc, normalize="lemma", n_keyterms=int(n_key_float))
        elif algorithm == "s":
            ngram_str = set(n_grams.split(','))
            ngram = []
            for gram in ngram_str:
                ngram.append(int(gram))
            curdoc_ranks = textacy.keyterms.sgrank(curdoc,
                                                   window_width=1500,
                                                   ngrams=ngram,
                                                   normalize="lower",
                                                   n_keyterms=n_key_float)

        for word in curdoc_ranks:
            word_keyterm_cummula[word[0]] += 1
            word_keyterm_journals[word[0]].append((msgid, word[1]))
            if len(word_keyterm_journals[word[0]]) > 10:
                # cap the per-keyword list: drop entries for the lowest-scoring message id
                newlist = []
                min_pair = word_keyterm_journals[word[0]][0]
                for pair in word_keyterm_journals[word[0]]:
                    if pair[1] < min_pair[1]:
                        min_pair = pair
                for pair in word_keyterm_journals[word[0]]:
                    if pair[0] != min_pair[0]:
                        newlist.append(pair)
                word_keyterm_journals[word[0]] = newlist

    word_keyterm_cummula_sorted = sorted(word_keyterm_cummula.items(),
                                         key=lambda val: val[1],
                                         reverse=True)

    quint = 0
    quint_printout = ""
    for entry in word_keyterm_cummula_sorted[:cutoff]:
        quint_printout += entry[0] + ","
        quint += 1
    quint_printout = quint_printout[:-1]
    #print(quint_printout)
    return quint_printout
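A hypothetical invocation sketch; the records and message ids are made up, the function expects an iterable of dicts whose "message" value begins with a message id, and it relies on the older textacy.keyterms module:

records = [
    {"message": "j001 Gradient descent converges slowly on ill-conditioned problems."},
    {"message": "j002 Stochastic methods scale keyterm extraction to large corpora."},
]
top_terms = clustering_analysis(input=records, algorithm="s", cutoff=5)
print(top_terms)  # comma-separated string of the most frequent keyterms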
Example #29
 def test_corpus_init_lang(self):
     assert isinstance(Corpus("en"), Corpus)
     assert isinstance(Corpus(load_spacy_lang("en")), Corpus)
     for bad_lang in (b"en", None):
         with pytest.raises(TypeError):
             Corpus(bad_lang)
Example #30
 def test_corpus_init_no_parser(self):
     spacy_lang = load_spacy_lang("en", disable=("parser", ))
     corpus = Corpus(spacy_lang,
                     data=(spacy_lang("This is a sentence in a doc."), ))
     assert len(corpus) == 1
     assert corpus.n_sents == 0