def test_pos_extract_when_pos_includes_is_jj(
        mary_had_a_little_lamb_corpus: textacy_api.Corpus):
    """Check lemma extraction when only the ``ADJ`` part of speech is included.

    The first document produced by the pipeline must equal
    ``["little", "white"]``.
    """
    pipeline = textacy_api.ExtractPipeline(mary_had_a_little_lamb_corpus,
                                           target='lemma')
    adjective_pipeline = pipeline.pos(include_pos=('ADJ', ))

    # Materialise every document so the first one can be indexed.
    documents = list(adjective_pipeline.process())

    assert ["little", "white"] == documents[0]
def test_pos_extract_when_pos_includes_is_noun(
        mary_had_a_little_lamb_corpus: textacy_api.Corpus):
    """Check lemma extraction when ``NOUN`` and ``PROPN`` are included.

    The first document produced by the pipeline must equal
    ``["Mary", "lamb", "fleece", "snow"]``.
    """
    noun_tags = ('NOUN', 'PROPN')

    pipeline = textacy_api.ExtractPipeline(
        corpus=mary_had_a_little_lamb_corpus, target='lemma')
    noun_pipeline = pipeline.pos(include_pos=noun_tags)

    # Materialise every document so the first one can be indexed.
    documents = list(noun_pipeline.process())

    assert ["Mary", "lamb", "fleece", "snow"] == documents[0]
def test_pos_extract_when_a_more_complexed_filter(
        mary_had_a_little_lamb_corpus: textacy_api.Corpus):
    """Check lemma extraction through a long chain of pipeline stages.

    Every stage is applied in the order listed below; the first document
    produced must equal ``["MAR4", "LAMB", "FLEECE", "SNOW"]``.
    """
    pipeline = textacy_api.ExtractPipeline(mary_had_a_little_lamb_corpus,
                                           target='lemma')

    # One stage per statement, in the same order as a single fluent chain.
    pipeline = pipeline.pos(include_pos=('NOUN', 'PROPN'))
    pipeline = pipeline.remove_stopwords(extra_stopwords=[])
    pipeline = pipeline.ingest(filter_nums=True, filter_punct=True)
    pipeline = pipeline.infrequent_word_filter(1)
    pipeline = pipeline.frequent_word_filter(100)
    pipeline = pipeline.min_character_filter(2)
    pipeline = pipeline.substitute(subst_map={'Mary': 'mar4'})
    pipeline = pipeline.predicate(predicate=lambda x: True)
    pipeline = pipeline.transform(transformer=lambda x: x.upper())

    # Materialise every document so the first one can be indexed.
    documents = list(pipeline.process())

    assert ["MAR4", "LAMB", "FLEECE", "SNOW"] == documents[0]
def test_vectorizer(mary_had_a_little_lamb_corpus: textacy_api.Corpus):  # pylint: disable=redefined-outer-name
    """Vectorize extracted lemmas and verify the vocabulary and the matrix.

    Lower-cased ``NOUN``/``PROPN`` lemmas are extracted from the corpus,
    wrapped as ``(filename, tokens)`` pairs and passed to
    ``CorpusVectorizer.fit_transform``. The resulting ``token2id`` mapping
    and the dense form of ``data`` are compared with the expected values.
    """
    # A plain ndarray is used because NumPy discourages ``np.matrix``.
    # Rows appear to be documents and columns token ids, matching
    # ``expected_token2id`` below.
    expected_dtm = np.array([
        [0, 0, 1, 1, 0, 1, 0, 0, 1, 0],
        [0, 0, 0, 1, 0, 1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 1, 1, 0, 0],
        [1, 0, 0, 1, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
        [1, 0, 0, 1, 1, 1, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 1, 0, 0, 0, 1],
    ])
    expected_token2id = {
        'mary': 5,
        'lamb': 3,
        'fleece': 2,
        'snow': 8,
        'school': 7,
        'day': 1,
        'rule': 6,
        'child': 0,
        'teacher': 9,
        'love': 4,
    }

    # POS, number and punctuation filtering are supplied through
    # ``extract_opts``, so no separate ``.ingest(...)`` stage is chained here.
    opts: textacy_api.ExtractPipeline.ExtractOpts = textacy_api.ExtractPipeline.ExtractOpts(
        include_pos=('NOUN', 'PROPN'), filter_nums=True, filter_punct=True)
    terms = (
        textacy_api.ExtractPipeline(
            mary_had_a_little_lamb_corpus, target='lemma', extract_opts=opts)
        .remove_stopwords(extra_stopwords=[])
        .min_character_filter(2)
        .transform(transformer=lambda x: x.lower())
        .process())

    # Pair each token list with a synthetic file name.
    document_terms = ((f'document_{i}.txt', tokens)
                      for i, tokens in enumerate(terms))
    vectorizer = CorpusVectorizer()

    v_corpus: VectorizedCorpus = vectorizer.fit_transform(
        document_terms, already_tokenized=True)

    assert v_corpus is not None

    assert expected_token2id == v_corpus.token2id

    # ``array_equal`` also compares shapes, so a mismatched matrix cannot
    # pass through broadcasting as an element-wise ``==`` could.
    assert np.array_equal(expected_dtm, np.asarray(v_corpus.data.todense()))
Exemple #5
0
    def _create_extract_pipeline(self, corpus):
        """Build an extract pipeline for ``corpus`` from the widget values.

        Settings are read from ``self.corpus_widgets``. When the
        ``substitute_terms`` widget value is ``True``, a substitution stage
        reading from ``self.substitution_filename`` is appended.

        Returns the configured ``textacy_api.ExtractPipeline``.
        """
        widgets = self.corpus_widgets

        # One stage per statement; widget values are read in the same order
        # as they would be in a single fluent chain.
        extractor = textacy_api.ExtractPipeline(
            corpus, target=widgets.normalize.value)
        extractor = extractor.ingest(
            as_strings=True,
            include_pos=widgets.include_pos.value,
            filter_stops=widgets.filter_stops.value,
            filter_punct=True,
        )
        extractor = extractor.frequent_word_filter(
            max_doc_freq=widgets.max_doc_freq.value)
        extractor = extractor.infrequent_word_filter(
            min_freq=widgets.min_freq.value)
        extractor = extractor.remove_stopwords(
            extra_stopwords=set(widgets.stop_words.value))

        # Strict identity check: only the literal ``True`` enables substitution.
        if widgets.substitute_terms.value is not True:
            return extractor

        return extractor.substitute(subst_map=None,
                                    filename=self.substitution_filename)