Example #1
 def test_lang_callable(self):
     def dumb_detect_language(text):
         return 'en'
     self.assertIsInstance(
         Doc('This is an English sentence.', lang=dumb_detect_language), Doc)
     self.assertIsInstance(
         Doc('This is an English sentence.', lang=lambda x: 'en'), Doc)
Example #2
def test_lang_callable():
    def dumb_detect_language(text):
        return "en"

    assert isinstance(
        Doc("This is an English sentence.", lang=dumb_detect_language), Doc
    )
    assert isinstance(Doc("This is an English sentence.", lang=lambda x: "en"), Doc)
Example #3
 def setUp(self):
     with open(
             os.path.join(os.path.dirname(__file__), 'corpus_txt_test.txt'),
             'rb') as f:
         texts = [t.decode('utf-8') for t in f.readlines()]
         self.corpus = Corpus(u'en', texts=texts)
     self.text = u"disease drop due economic disease else"
     self.doc = Doc(self.text)
     self.w2v = {
         w: 5 * np.random.random_sample((300, )) - 2
         for w in self.text.split()
     }
Example #4
 def setUp(self):
     self.spacy_doc = Doc('This is an English-language document.')
     self.n_chars = 2855
     self.n_syllables = 857
     self.n_words = 441
     self.n_polysyllable_words = 104
     self.n_sents = 21
Example #5
def test_doc_save_and_load(tmpdir, doc):
    filepath = str(tmpdir.join('test_doc_save_and_load.pkl'))
    doc.save(filepath)
    new_doc = Doc.load(filepath)
    assert isinstance(new_doc, Doc)
    assert len(new_doc) == len(doc)
    assert new_doc.lang == doc.lang
    assert new_doc.metadata == doc.metadata
Example #6
 def test_doc_save_and_load(self):
     filepath = os.path.join(self.tempdir, 'test_doc_save_and_load.pkl')
     self.doc.save(filepath)
     new_doc = Doc.load(filepath)
     self.assertIsInstance(new_doc, Doc)
     self.assertEqual(len(new_doc), len(self.doc))
     self.assertEqual(new_doc.lang, self.doc.lang)
     self.assertEqual(new_doc.metadata, self.doc.metadata)
Example #7
 def test_invalid_content(self):
     invalid_contents = [
         b'This is an English sentence in bytes.',
         {'content': 'This is an English sentence as dict value.'},
         True,
         ]
     for invalid_content in invalid_contents:
         with self.assertRaises(ValueError):
             Doc(invalid_content)
Example #8
def test_invalid_content():
    invalid_contents = [
        b"This is an English sentence in bytes.",
        {"content": "This is an English sentence as dict value."},
        True,
    ]
    for invalid_content in invalid_contents:
        with pytest.raises(ValueError):
            _ = Doc(invalid_content)
Example #9
def ts():
    text = """
    Mr. Speaker, 480,000 Federal employees are working without pay, a form of involuntary servitude; 280,000 Federal employees are not working, and they will be paid. Virtually all of these workers have mortgages to pay, children to feed, and financial obligations to meet.
    Mr. Speaker, what is happening to these workers is immoral, is wrong, and must be rectified immediately. Newt Gingrich and the Republican leadership must not continue to hold the House and the American people hostage while they push their disastrous 7-year balanced budget plan. The gentleman from Georgia, Mr. Gingrich, and the Republican leadership must join Senator Dole and the entire Senate and pass a continuing resolution now, now to reopen Government.
    Mr. Speaker, that is what the American people want, that is what they need, and that is what this body must do.
    """
    doc = Doc(text.strip(), lang="en")
    ts_ = text_stats.TextStats(doc)
    return ts_
Example #10
def test_invalid_content():
    invalid_contents = [
        b'This is an English sentence in bytes.',
        {
            'content': 'This is an English sentence as dict value.'
        },
        True,
    ]
    for invalid_content in invalid_contents:
        with pytest.raises(ValueError):
            _ = Doc(invalid_content)
Example #11
def extract_entities(data):
    tokens = []
    doc = Doc(data, lang="en_core_web_md")
    res = extract.named_entities(doc, include_types=["PERSON", "ORG", "LOC"])

    for r in res:
        tokens.append(str(r[0]))

    if len(tokens) == 0:
        tokens = ["empty"]

    return tokens
Example #12
def extract_keyterms(data):
    tokens = []
    doc = Doc(data, lang="en_core_web_md")
    res = keyterms.sgrank(doc, n_keyterms=100)

    for r in res:
        tokens.append(str(r[0]))

    if len(tokens) == 0:
        tokens = ["empty"]

    return tokens
Example #13
 def test_corpus_init_docs(self):
     limit = 3
     texts, metadatas = fileio.split_record_fields(
         DATASET.records(limit=limit), 'text')
     docs = [Doc(text, lang='en', metadata=metadata)
             for text, metadata in zip(texts, metadatas)]
     corpus = Corpus('en', docs=docs)
     self.assertEqual(len(corpus.docs), limit)
     self.assertTrue(
         all(doc.spacy_vocab is corpus.spacy_vocab for doc in corpus))
     for i in range(limit):
         self.assertEqual(corpus[i].metadata, docs[i].metadata)
     corpus = Corpus(
         'en', docs=docs, metadatas=({'foo': 'bar'} for _ in range(limit)))
     for i in range(limit):
         self.assertEqual(corpus[i].metadata, {'foo': 'bar'})
Example #14
def test_corpus_init_docs():
    limit = 3
    texts, metadatas = io.split_records(DATASET.records(limit=limit), "text")
    docs = [
        Doc(text, lang="en", metadata=metadata)
        for text, metadata in zip(texts, metadatas)
    ]
    corpus = Corpus("en", docs=docs)
    assert len(corpus.docs) == limit
    assert all(doc.spacy_vocab is corpus.spacy_vocab for doc in corpus)
    for i in range(limit):
        assert corpus[i].metadata == docs[i].metadata
    corpus = Corpus("en",
                    docs=docs,
                    metadatas=({
                        "foo": "bar"
                    } for _ in range(limit)))
    for i in range(limit):
        assert corpus[i].metadata == {"foo": "bar"}
Example #15
def test_corpus_init_docs():
    limit = 3
    texts, metadatas = io.split_records(DATASET.records(limit=limit), 'text')
    docs = [
        Doc(text, lang='en', metadata=metadata)
        for text, metadata in zip(texts, metadatas)
    ]
    corpus = Corpus('en', docs=docs)
    assert len(corpus.docs) == limit
    assert all(doc.spacy_vocab is corpus.spacy_vocab for doc in corpus)
    for i in range(limit):
        assert corpus[i].metadata == docs[i].metadata
    corpus = Corpus('en',
                    docs=docs,
                    metadatas=({
                        'foo': 'bar'
                    } for _ in range(limit)))
    for i in range(limit):
        assert corpus[i].metadata == {'foo': 'bar'}
Example #16
def extract_triples(text):
    doc = Doc(text, lang='en_core_web_sm')
    for item in ext.subject_verb_object_triples(doc):
        if span_is_interesting(item[0]) or span_is_interesting(item[2]):
            #print(item)
            exp_sub = subject_to_subtree(item[0])
            sub = ' '.join([x.text for x in item[0]])
            verb = ' '.join([x.text for x in item[1]])
            obj = ' '.join([x.text for x in item[2]])
            exp_obj = subject_to_subtree(item[2])

            sub_pn = ""
            if span_is_interesting(item[0]):
                sub_pn = sub

            obj_pn = ""
            if span_is_interesting(item[2]):
                obj_pn = obj

            print([sub_pn, exp_sub, verb, obj_pn, exp_obj])
Example #17
class TestTfidfEmbeddingVectorizer(unittest.TestCase):
    def setUp(self):
        with open(
                os.path.join(os.path.dirname(__file__), 'corpus_txt_test.txt'),
                'rb') as f:
            texts = [t.decode('utf-8') for t in f.readlines()]
            self.corpus = Corpus(u'en', texts=texts)
        self.text = u"disease drop due economic disease else"
        self.doc = Doc(self.text)
        self.w2v = {
            w: 5 * np.random.random_sample((300, )) - 2
            for w in self.text.split()
        }

    def test_tfidf_vectorizer(self):
        vectorizer = TfidfEmbeddingVectorizer(self.w2v, self.corpus)
        vectorizer.fit()
        tokenized_doc = [
            list(
                self.doc.to_terms_list(ngrams=1,
                                       named_entities=True,
                                       as_strings=True))
        ]
        tfidf_doc = vectorizer.vectorizer.transform(tokenized_doc)
        v = tfidf_doc[:,
                      vectorizer.vectorizer.vocabulary_terms['drop']].toarray(
                      )[0]
        doc_term_matrix = vectorizer.doc_term_matrix
        vectorizer.save(
            os.path.join(os.path.dirname(__file__),
                         'test_doc_term_matrix.npz'))
        vectorizer.load(os.path.join(os.path.dirname(__file__),
                                     'test_doc_term_matrix.npz'),
                        force=True)
        self.assertAlmostEqual(np.asscalar(v), 0.42063495, delta=0.05)
        self.assertEqual(vectorizer.transform(self.doc).shape, (300, ))
        self.assertTrue(
            np.allclose(doc_term_matrix.toarray(),
                        vectorizer.doc_term_matrix.toarray()))
Example #18
    def run_custom_task(self, temp_file, mongo_client: MongoClient):
        print('run custom task')
        n_num = self.get_integer('n', default=2)
        filter_stops = self.get_boolean('filter_stops', default=True)
        filter_punct = self.get_boolean('filter_punct', default=True)
        filter_nums = self.get_boolean('filter_nums', default=False)
        lemmas = self.get_boolean('lemmas', default=True)
        limit_to_termset = self.get_boolean('limit_to_termset', default=False)
        termset = self.pipeline_config.terms
        if not termset:
            termset = list()
        lower_termset = [x.lower() for x in termset]

        for doc in self.docs:
            ngrams = list()
            cln_txt = self.get_document_text(doc, clean=True)
            t_doc = Doc(preprocess_text(cln_txt, lowercase=True))
            res = extract.ngrams(t_doc,
                                 n_num,
                                 filter_stops=filter_stops,
                                 filter_punct=filter_punct,
                                 filter_nums=filter_nums)
            for r in res:
                if lemmas:
                    text = r.lemma_
                else:
                    text = r.text

                if limit_to_termset:
                    for t in lower_termset:
                        if text == t or t in text:
                            ngrams.append({'text': text, 'count': 1})
                else:
                    ngrams.append({'text': text, 'count': 1})
            self.write_multiple_result_data(temp_file, mongo_client, doc,
                                            ngrams)
Example #19
 def doc(self):
     # NOTE: Should cache this somehow if we want to use for more than one
     # thing.
     return Doc(self.text(), lang="en")
Example #20
def doc(text):
    spacy_lang = cache.load_spacy('en')
    return Doc(text.strip(), lang=spacy_lang)
Example #21
 def setUp(self):
     self.doc = Doc(TEXT, lang='en')
     self.ts = text_stats.TextStats(self.doc)
Example #22
 def setUp(self):
     self.doc = Doc(TEXT.strip(), lang='en_core_web_sm')
Example #23
class DocMethodsTestCase(unittest.TestCase):
    def setUp(self):
        self.doc = Doc(TEXT.strip(), lang='en_core_web_sm')

    def test_n_tokens_and_sents(self):
        self.assertEqual(self.doc.n_tokens, 241)
        self.assertEqual(self.doc.n_sents, 8)

    def test_term_count(self):
        self.assertEqual(self.doc.count('statistical'), 3)
        self.assertEqual(self.doc.count('machine learning'), 2)
        self.assertEqual(self.doc.count('foo'), 0)

    def test_tokenized_text(self):
        tokenized_text = self.doc.tokenized_text
        self.assertIsInstance(tokenized_text, list)
        self.assertIsInstance(tokenized_text[0], list)
        self.assertIsInstance(tokenized_text[0][0], compat.unicode_)
        self.assertEqual(len(tokenized_text), self.doc.n_sents)

    def test_pos_tagged_text(self):
        pos_tagged_text = self.doc.pos_tagged_text
        self.assertIsInstance(pos_tagged_text, list)
        self.assertIsInstance(pos_tagged_text[0], list)
        self.assertIsInstance(pos_tagged_text[0][0], tuple)
        self.assertIsInstance(pos_tagged_text[0][0][0], compat.unicode_)
        self.assertEqual(len(pos_tagged_text), self.doc.n_sents)

    def test_to_terms_list(self):
        full_terms_list = list(self.doc.to_terms_list(as_strings=True))
        full_terms_list_ids = list(self.doc.to_terms_list(as_strings=False))
        self.assertEqual(len(full_terms_list), len(full_terms_list_ids))
        self.assertIsInstance(full_terms_list[0], compat.unicode_)
        self.assertIsInstance(full_terms_list_ids[0], int)
        self.assertNotEqual(
            full_terms_list[0],
            list(self.doc.to_terms_list(as_strings=True, normalize=False))[0])
        self.assertLess(len(list(self.doc.to_terms_list(ngrams=False))),
                        len(full_terms_list))
        self.assertLess(len(list(self.doc.to_terms_list(ngrams=1))),
                        len(full_terms_list))
        self.assertLess(len(list(self.doc.to_terms_list(ngrams=(1, 2)))),
                        len(full_terms_list))
        self.assertLess(len(list(self.doc.to_terms_list(ngrams=False))),
                        len(full_terms_list))

    def test_to_bag_of_words(self):
        bow = self.doc.to_bag_of_words(weighting='count')
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], int_types)
        self.assertIsInstance(list(bow.values())[0], int)
        bow = self.doc.to_bag_of_words(weighting='binary')
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], int_types)
        self.assertIsInstance(list(bow.values())[0], int)
        for value in list(bow.values())[0:10]:
            self.assertLess(value, 2)
        bow = self.doc.to_bag_of_words(weighting='freq')
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], int_types)
        self.assertIsInstance(list(bow.values())[0], float)
        bow = self.doc.to_bag_of_words(as_strings=True)
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], compat.unicode_)
Example #24
 def test_invalid_content_lang_combo(self):
     spacy_lang = data.load_spacy('en_core_web_sm')
     with self.assertRaises(ValueError):
         Doc(spacy_lang('Hola, cómo estás mi amigo?'), lang='es')
Example #25
 def test_invalid_lang(self):
     invalid_langs = [b'en', ['en', 'en_core_web_sm'], True]
     for invalid_lang in invalid_langs:
         with self.assertRaises(ValueError):
             Doc('This is an English sentence.', lang=invalid_lang)
Example #26
 def test_lang_spacylang(self):
     spacy_lang = data.load_spacy('en_core_web_sm')
     self.assertIsInstance(
         Doc('This is an English sentence.', lang=spacy_lang), Doc)
Example #27
 def test_lang_str(self):
     self.assertIsInstance(Doc('This is an English sentence.', lang='en'),
                           Doc)
Example #28
 def test_spacydoc_content(self):
     spacy_lang = data.load_spacy('en_core_web_sm')
     spacy_doc = spacy_lang('This is an English sentence.')
     self.assertIsInstance(Doc(spacy_doc), Doc)
Example #29
 def test_unicode_content(self):
     self.assertIsInstance(Doc('This is an English sentence.'), Doc)
Example #30
ec_data = ec_data[[
    "Date", "Quarter", "Company", "Participants", "AnalystCompany",
    "AnalystName", "QuestionOrder", "Tag", "Question"
]]

docs = Corpus(
    lang=en,
    docs=ec_data.apply(lambda x: Doc(content=' '.join([
        token for token in preprocess_text(text=x['Question'],
                                           lowercase=True,
                                           no_punct=True,
                                           no_contractions=True,
                                           no_accents=True,
                                           no_currency_symbols=True,
                                           no_numbers=True).split(' ')
        if len(token) > 2
    ]),
                                     lang=en,
                                     metadata={
                                         'Quarter': x['Quarter'],
                                         'Company': x['Company'],
                                         'QuestionOrder': x['QuestionOrder'],
                                         'Analyst': x["AnalystName"],
                                         'Tag': x['Tag']
                                     }),
                       axis=1).tolist())
tokenized_docs = [
    list(
        doc.to_terms_list(ngrams=(1),
                          as_strings=True,
                          normalize='lemma',
                          drop_determiners=True)) for doc in docs
]