def test_pos_extract_when_pos_includes_is_jj(
        mary_had_a_little_lamb_corpus: textacy_api.Corpus):
    terms = (
        textacy_api.ExtractPipeline(mary_had_a_little_lamb_corpus, target='lemma')
        .pos(include_pos=('ADJ',))
        .process()
    )
    terms = list(terms)
    assert ["little", "white"] == terms[0]
def test_pos_extract_when_pos_includes_is_noun(
        mary_had_a_little_lamb_corpus: textacy_api.Corpus):
    terms = (
        textacy_api.ExtractPipeline(corpus=mary_had_a_little_lamb_corpus, target='lemma')
        .pos(include_pos=('NOUN', 'PROPN'))
        .process()
    )
    terms = list(terms)
    assert ["Mary", "lamb", "fleece", "snow"] == terms[0]
def test_pos_extract_with_a_more_complex_filter(
        mary_had_a_little_lamb_corpus: textacy_api.Corpus):
    terms = (
        textacy_api.ExtractPipeline(mary_had_a_little_lamb_corpus, target='lemma')
        .pos(include_pos=('NOUN', 'PROPN'))
        .remove_stopwords(extra_stopwords=[])
        .ingest(filter_nums=True, filter_punct=True)
        .infrequent_word_filter(1)    # min_freq=1
        .frequent_word_filter(100)    # max_doc_freq=100
        .min_character_filter(2)      # drop single-character tokens
        .substitute(subst_map={'Mary': 'mar4'})      # replace 'Mary' with 'mar4'
        .predicate(predicate=lambda x: True)         # keep-all predicate (no-op here)
        .transform(transformer=lambda x: x.upper())  # uppercase all tokens
        .process()
    )
    terms = list(terms)
    assert ["MAR4", "LAMB", "FLEECE", "SNOW"] == terms[0]
def test_vectorizer(mary_had_a_little_lamb_corpus: textacy_api.Corpus):  # pylint: disable=redefined-outer-name
    expected_dtm = np.matrix([
        [0, 0, 1, 1, 0, 1, 0, 0, 1, 0],
        [0, 0, 0, 1, 0, 1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 1, 1, 0, 0],
        [1, 0, 0, 1, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
        [1, 0, 0, 1, 1, 1, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 1, 0, 0, 0, 1],
    ])

    opts: textacy_api.ExtractPipeline.ExtractOpts = textacy_api.ExtractPipeline.ExtractOpts(
        include_pos=('NOUN', 'PROPN'), filter_nums=True, filter_punct=True)

    terms = (
        textacy_api.ExtractPipeline(mary_had_a_little_lamb_corpus, target='lemma', extract_opts=opts)
        .remove_stopwords(extra_stopwords=[])
        # .ingest(filter_nums=True, filter_punct=True)
        .min_character_filter(2)
        .transform(transformer=lambda x: x.lower())
        .process()
    )

    document_terms = ((f'document_{i}.txt', tokens) for i, tokens in enumerate(terms))

    vectorizer = CorpusVectorizer()
    v_corpus: VectorizedCorpus = vectorizer.fit_transform(document_terms, already_tokenized=True)

    assert v_corpus is not None
    assert {
        'mary': 5,
        'lamb': 3,
        'fleece': 2,
        'snow': 8,
        'school': 7,
        'day': 1,
        'rule': 6,
        'child': 0,
        'teacher': 9,
        'love': 4,
    } == v_corpus.token2id
    assert (expected_dtm == v_corpus.data.todense()).all()
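# The column order of `expected_dtm` above is given by `token2id` (e.g. column 5
# is 'mary', column 3 is 'lamb'). A minimal sketch of how one such row can be
# re-derived by hand; `_bow_row_example` is a hypothetical helper shown for
# illustration only, not part of the CorpusVectorizer API.
def _bow_row_example(tokens, token2id):
    """Count `tokens` into a bag-of-words row ordered by `token2id`."""
    row = np.zeros(len(token2id), dtype=int)
    for token in tokens:
        row[token2id[token]] += 1
    return row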
def _create_extract_pipeline(self, corpus):
    gui = self.corpus_widgets
    pipeline = (
        textacy_api.ExtractPipeline(corpus, target=gui.normalize.value)
        .ingest(
            as_strings=True,
            include_pos=gui.include_pos.value,
            filter_stops=gui.filter_stops.value,
            filter_punct=True,
        )
        .frequent_word_filter(max_doc_freq=gui.max_doc_freq.value)
        .infrequent_word_filter(min_freq=gui.min_freq.value)
        .remove_stopwords(extra_stopwords=set(gui.stop_words.value))
    )

    if gui.substitute_terms.value is True:
        pipeline = pipeline.substitute(subst_map=None, filename=self.substitution_filename)

    return pipeline
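# Usage sketch (hypothetical caller code, shown for context only): the pipeline
# built above is lazy, so the caller drives extraction by iterating
# `process()`, which yields one token sequence per document:
#
#     pipeline = self._create_extract_pipeline(corpus)
#     for tokens in pipeline.process():
#         ...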