def test_noun_chunks_min_freq(spacy_doc):
    expected = ['I', 'we', 'he', 'I', 'we', 'he', 'we']
    observed = [
        nc.text
        for nc in extract.noun_chunks(spacy_doc, drop_determiners=True, min_freq=2)
    ]
    assert observed == expected
def test_noun_chunks_min_freq(self):
    expected = ['I', 'we', 'he', 'I', 'we', 'he', 'we']
    observed = [
        nc.text
        for nc in extract.noun_chunks(self.spacy_doc, drop_determiners=True, min_freq=2)
    ]
    self.assertEqual(observed, expected)
def select_extractors(use_unigrams=False):
    """
    Extractors for alignment.

    :return: list of Extractor objects to use for text-text alignment

    note: the ngram extractors below filter out stopwords and number words/symbols
    """
    noun_chunk_extractor = Extractor(
        lambda doc: list(filter(lambda x: len(x) > 3, list(noun_chunks(doc)))))
    tetragram_extractor = Extractor(
        lambda doc: list(ngrams(doc, 4, filter_stops=True, filter_nums=True)))
    trigram_extractor = Extractor(
        lambda doc: list(ngrams(doc, 3, filter_stops=True, filter_nums=True)))
    bigram_extractor = Extractor(
        lambda doc: list(ngrams(doc, 2, filter_stops=False, filter_nums=False)))
    unigram_extractor = Extractor(
        lambda doc: list(ngrams(doc, 1, filter_stops=False, filter_nums=False)))

    extractor_list = [
        noun_chunk_extractor,
        tetragram_extractor,
        trigram_extractor,
        bigram_extractor,
    ]
    if use_unigrams:
        extractor_list.append(unigram_extractor)

    return extractor_list
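# A minimal usage sketch for select_extractors() above, assuming the enclosing
# module provides the Extractor wrapper and the noun_chunks()/ngrams() helpers
# it relies on; nothing else here comes from the original code. By default the
# function returns four extractors, plus a fifth (unigrams) when requested.
extractors = select_extractors(use_unigrams=True)
assert len(extractors) == 5   # noun chunks, 4-grams, 3-grams, 2-grams, 1-grams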
def noun_chunks(self, **kwargs):
    """
    Extract an ordered sequence of noun phrases from doc, optionally
    filtering by frequency and dropping leading determiners.

    .. seealso:: :func:`extract.noun_chunks() <textacy.extract.noun_chunks>`
       for all function kwargs.
    """
    return extract.noun_chunks(self.spacy_doc, **kwargs)
def test_noun_chunks(spacy_doc):
    expected = [
        'I', 'Kuwait', 'I.M.F. seminar', 'Arab educators', '30 minutes',
        'we', 'impact', 'technology trends', 'education', 'Middle East'
    ]
    observed = [
        nc.text for nc in extract.noun_chunks(spacy_doc, drop_determiners=True)
    ][:10]
    assert observed == expected
def test_noun_chunks(self):
    expected = [
        'I', 'Kuwait', 'I.M.F. seminar', 'Arab educators', '30 minutes',
        'we', 'impact', 'technology trends', 'education', 'Middle East',
        'Egyptian education official', 'his hand', 'he', 'personal question',
        'I', 'Donald Trump', 'we', 'mosques', 'United States', 'he',
        'great sorrow', 'what', 'we', 'our kids'
    ]
    observed = [
        nc.text
        for nc in extract.noun_chunks(self.spacy_doc, drop_determiners=True)
    ]
    self.assertEqual(observed, expected)
def test_noun_chunks_determiner(self):
    expected = [
        'I', 'Kuwait', 'an I.M.F. seminar', 'Arab educators', '30 minutes',
        'we', 'the impact', 'technology trends', 'education', 'the Middle East',
        'an Egyptian education official', 'his hand', 'he', 'a personal question',
        'I', 'Donald Trump', 'we', 'mosques', 'the United States', 'he',
        'great sorrow', 'what', 'we', 'our kids'
    ]
    observed = [
        nc.text
        for nc in extract.noun_chunks(self.spacy_doc, drop_determiners=False)
    ]
    self.assertEqual(observed, expected)
def noun_chunks(self, **kwargs):
    """
    Extract an ordered sequence of noun phrases from doc, optionally
    filtering by frequency and dropping leading determiners.

    Args:
        **kwargs:
            drop_determiners (bool, optional): remove leading determiners
                (e.g. "the") from phrases (e.g. "the quick brown fox" =>
                "quick brown fox")
            min_freq (int, optional): remove chunks that occur in `doc`
                fewer than `min_freq` times

    Yields:
        ``spacy.Span``: the next noun chunk, in order of appearance
        in the document

    .. seealso:: :func:`extract.noun_chunks() <textacy.extract.noun_chunks>`
    """
    for nc in extract.noun_chunks(self.spacy_doc, **kwargs):
        yield nc
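# Hedged, self-contained sketch of the delegation pattern used by the method
# above: a minimal stand-in class (MiniDoc is an assumed name, not from the
# original code) that wraps a parsed spaCy doc on `self.spacy_doc` and yields
# noun chunks via textacy. Assumes the "en_core_web_sm" model is installed.
import spacy
from textacy import extract

_nlp = spacy.load("en_core_web_sm")

class MiniDoc:
    def __init__(self, text):
        self.spacy_doc = _nlp(text)

    def noun_chunks(self, **kwargs):
        # delegate to textacy, exactly as in the method above
        for nc in extract.noun_chunks(self.spacy_doc, **kwargs):
            yield nc

doc = MiniDoc("The quick brown fox jumped over the lazy dog.")
print([nc.text for nc in doc.noun_chunks(drop_determiners=True)])
# expected output (model-dependent): ['quick brown fox', 'lazy dog']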
def grammars( carrel, grammar, query, noun, lemma, sort, count ) :

    """Extract sentence fragments from <carrel> where fragments are one of:

    \b
      nouns - all the nouns and noun chunks
      quotes - things people say
      svo - fragments in the form of subject-verb-object (the default)
      sss - a more advanced version of svo; fragments that begin with an
        entity, co-occur with a verb, and are followed by a phrase

    This is very useful for listing more complete ideas from a text.

    Examples:

    \b
      rdr grammars homer
      rdr grammars -g nouns homer
      rdr grammars -g sss -n hector -l be homer"""

    # require
    from textacy import extract
    from os import system
    from re import search

    # sanity check
    checkForCarrel( carrel )

    # initialize
    doc = carrel2doc( carrel )

    # get the features; svo
    if grammar == 'svo' :

        # do the work
        features = list( extract.subject_verb_object_triples( doc ) )

        # simplify the result
        items = []
        for feature in features :
            subject = [ token.text_with_ws for token in feature.subject ]
            verb = [ token.text_with_ws for token in feature.verb ]
            object = [ token.text_with_ws for token in feature.object ]
            items.append( '\t'.join( [ ''.join( subject ), ''.join( verb ), ''.join( object ) ] ) )

        # done
        features = items

    # quotes
    elif grammar == 'quotes' :

        # do the work
        features = list( extract.direct_quotations( doc ) )

        # simplify the result
        items = []
        for feature in features :

            # parse and stringify
            speaker = [ token.text_with_ws for token in feature.speaker ]
            cue = [ token.text_with_ws for token in feature.cue ]
            content = feature.content.text_with_ws
            items.append( '\t'.join( [ ''.join( speaker ), ''.join( cue ), content ] ) )

        # done
        features = items

    # noun chunks
    elif grammar == 'nouns' :

        # do the work and simplify the result
        features = list( extract.noun_chunks( doc ) )
        features = [ feature.text for feature in features ]

    # semi-structured sentences
    elif grammar == 'sss' :

        # sanity check
        if not noun :
            click.echo( "Error: When specifying sss, the -n option is required. See 'rdr grammars --help'.", err=True )
            exit()

        # do the work
        features = list( extract.semistructured_statements( doc, entity=noun, cue=lemma ) )

        # simplify the result
        items = []
        for feature in features :
            entity = [ token.text_with_ws for token in feature.entity ]
            cue = [ token.text_with_ws for token in feature.cue ]
            fragment = [ token.text_with_ws for token in feature.fragment ]
            items.append( '\t'.join( [ ''.join( entity ), ''.join( cue ), ''.join( fragment ) ] ) )

        # done
        features = items

    # filter, conditionally
    if query : features = [ feature for feature in features if ( search( query, feature ) ) ]

    # sort, conditionally
    if sort : features.sort()

    # count, conditionally
    if count :

        # initialize a dictionary and process each feature
        items = {}
        for feature in features :

            # update the dictionary
            if feature in items : items[ feature ] += 1
            else : items[ feature ] = 1

        # sort the dictionary; return the features
        features = sorted( items.items(), key=lambda x:x[ 1 ], reverse=True )

        # process each feature, again
        items = []
        for feature in features :

            # create a record and update
            record = str( feature[ 1 ] ) + '\t' + feature[ 0 ]
            items.append( record )

        # done
        features = items

    # output
    for feature in features : click.echo( feature )
def test_determiner(self, spacy_doc):
    result = list(extract.noun_chunks(spacy_doc, drop_determiners=False))
    assert all(isinstance(span, Span) for span in result)
    assert any(span[0].pos_ == "DET" for span in result)
def test_default(self, spacy_doc):
    result = list(extract.noun_chunks(spacy_doc))
    assert all(isinstance(span, Span) for span in result)
def extract_noun_chunks(doc, min_freq=1):
    return extract.noun_chunks(doc, drop_determiners=True, min_freq=min_freq)
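# Self-contained sketch of calling textacy's extract.noun_chunks() directly,
# as the helper above does. The sample sentence is illustrative only, and the
# exact chunks returned depend on the installed spaCy model.
import spacy
from textacy import extract

nlp = spacy.load("en_core_web_sm")
doc = nlp("The lazy dog ignored the quick brown fox, and the lazy dog slept.")

# drop_determiners strips the leading "the"; min_freq=2 keeps only chunks
# whose (lowercased) text occurs at least twice among the extracted chunks
print([nc.text for nc in extract.noun_chunks(doc, drop_determiners=True, min_freq=2)])
# expected output (model-dependent): ['lazy dog', 'lazy dog']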
def test_min_freq(self, spacy_doc):
    text = spacy_doc.text.lower()
    result = list(extract.noun_chunks(spacy_doc, drop_determiners=True, min_freq=2))
    assert all(text.count(span.lower_) >= 2 for span in result)
def noun_chunks(s):
    s = unicode(s)
    doc = nlp(s)
    return list(extr.noun_chunks(doc))
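# Hedged Python 3 equivalent of the helper above: the original relies on the
# Python 2 `unicode` builtin and on `nlp`/`extr` being defined elsewhere in
# its module, so those are set up explicitly here (the function name and model
# choice are assumptions for illustration).
import spacy
from textacy import extract as extr

nlp = spacy.load("en_core_web_sm")

def noun_chunks_py3(s):
    doc = nlp(str(s))
    return list(extr.noun_chunks(doc))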