def semistructured_statements(self, entity, **kwargs):
    """
    Extract "semi-structured statements" from doc, each as a (entity, cue, fragment)
    triple. This is similar to subject-verb-object triples.

    Args:
        entity (str): a noun or noun phrase of some sort (e.g. "President Obama",
            "global warming", "Python")
        **kwargs:
            cue (str, optional): verb lemma with which `entity` is associated
                (e.g. "talk about", "have", "write")
            ignore_entity_case (bool, optional): if True, entity matching is
                case-independent
            min_n_words (int, optional): min number of tokens allowed in a
                matching fragment
            max_n_words (int, optional): max number of tokens allowed in a
                matching fragment

    Yields:
        (``spacy.Span`` or ``spacy.Token``, ``spacy.Span`` or ``spacy.Token``, ``spacy.Span``):
            where each element is a matching (entity, cue, fragment) triple

    .. seealso:: :func:`extract.semistructured_statements() <textacy.extract.semistructured_statements>`
    """
    for sss in extract.semistructured_statements(self.spacy_doc, entity, **kwargs):
        yield sss
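A minimal usage sketch for the method above, assuming an older textacy release in which documents are wrapped by ``textacy.Doc`` (the constructor and the sample text here are illustrative assumptions, not part of the library's documented behavior):

# Hedged usage sketch: the textacy.Doc constructor and the sample sentence are
# assumptions for illustration; the method itself yields (entity, cue, fragment)
# triples as documented above.
import textacy

doc = textacy.Doc(
    "President Obama said that climate change is a global challenge.", lang="en")
for entity, cue, fragment in doc.semistructured_statements("President Obama", cue="say"):
    # entity and cue may be a Span or a Token; fragment is a Span
    print(entity.text, "|", cue.text, "|", fragment.text)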
from typing import List

import spacy
from textacy.extract import semistructured_statements


def get_statements(parsed_doc: spacy.tokens.Doc, possible_subjects: List[str]) -> List[tuple]:
    statements = []
    for subject in possible_subjects:
        statements.extend(list(semistructured_statements(parsed_doc, subject)))
    return statements
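A brief, hedged sketch of calling the helper above; the spaCy model name and sample text are assumptions, and the older tuple-yielding textacy API is assumed so each statement unpacks directly into its three parts:

# Hedged usage sketch for get_statements(); the model name and sample text are
# assumptions, and each statement is assumed to unpack as (entity, cue, fragment).
import spacy

nlp = spacy.load("en_core_web_sm")
parsed = nlp("Obama said the economy was improving. Critics said otherwise.")
for entity, cue, fragment in get_statements(parsed, ["Obama", "Critics"]):
    print(entity, cue, fragment)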
def test_semistructured_statements(spacy_doc):
    expected = (
        "we",
        "discussed",
        "the impact of technology trends on education in the Middle East",
    )
    observed = next(
        extract.semistructured_statements(spacy_doc, "we", cue="discuss"))
    assert isinstance(observed, tuple) and len(observed) == 3
    assert all(isinstance(obj, (Span, Token)) for obj in observed)
    assert all(obs.text == exp for obs, exp in compat.zip_(observed, expected))
def semistructured_statements(self, entity, **kwargs):
    """
    Extract "semi-structured statements" from doc, each as a (entity, cue, fragment)
    triple. This is similar to subject-verb-object triples.

    Args:
        entity (str): a noun or noun phrase of some sort (e.g. "President Obama",
            "global warming", "Python")

    .. seealso:: :func:`extract.semistructured_statements() <textacy.extract.semistructured_statements>`
        for all function kwargs.
    """
    return extract.semistructured_statements(self.spacy_doc, entity, **kwargs)
def test_extract_functionality(doc):
    bigrams = list(
        extract.ngrams(doc, 2, filter_stops=True, filter_punct=True, filter_nums=False))[:10]
    for bigram in bigrams:
        assert isinstance(bigram, SpacySpan)
        assert len(bigram) == 2

    trigrams = list(
        extract.ngrams(doc, 3, filter_stops=True, filter_punct=True, min_freq=2))[:10]
    for trigram in trigrams:
        assert isinstance(trigram, SpacySpan)
        assert len(trigram) == 3

    nes = list(
        extract.named_entities(doc, drop_determiners=False, exclude_types='numeric'))[:10]
    for ne in nes:
        assert isinstance(ne, SpacySpan)
        assert ne.label_
        assert ne.label_ != 'QUANTITY'

    pos_regex_matches = list(
        extract.pos_regex_matches(doc, constants.POS_REGEX_PATTERNS['en']['NP']))[:10]
    for match in pos_regex_matches:
        assert isinstance(match, SpacySpan)

    stmts = list(extract.semistructured_statements(doc, 'I', cue='be'))[:10]
    for stmt in stmts:
        assert isinstance(stmt, list)
        assert isinstance(stmt[0], compat.unicode_)
        assert len(stmt) == 3

    kts = keyterms.textrank(doc, n_keyterms=10)
    for keyterm in kts:
        assert isinstance(keyterm, tuple)
        assert isinstance(keyterm[0], compat.unicode_)
        assert isinstance(keyterm[1], float)
        assert keyterm[1] > 0.0
def test_semistructured_statements(sss_doc, entity, cue, fragment_len_range, exp):
    obs = list(
        extract.semistructured_statements(
            sss_doc, entity=entity, cue=cue, fragment_len_range=fragment_len_range))
    assert all(
        hasattr(sss, attr) for sss in obs for attr in ["entity", "cue", "fragment"])
    obs_text = [
        ([tok.text for tok in e], [tok.text for tok in c], [tok.text for tok in f])
        for e, c, f in obs
    ]
    assert obs_text == exp
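Outside the test, the same triples can be consumed by attribute name, which is what the assertions above rely on. A hedged sketch follows; the spaCy model name and sample sentence are assumptions for illustration only:

# Hedged sketch: iterate the triples and read their entity/cue/fragment
# attributes, mirroring what the test above asserts. The model name and
# sample text are assumptions, not part of the test fixture.
import spacy
from textacy import extract

nlp = spacy.load("en_core_web_sm")
doc = nlp("Burton said that the outage was not his fault.")
for sss in extract.semistructured_statements(doc, entity="Burton", cue="say"):
    print([tok.text for tok in sss.entity],
          [tok.text for tok in sss.cue],
          [tok.text for tok in sss.fragment])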
def test_extract_functionality(doc):
    bigrams = list(
        extract.ngrams(doc, 2, filter_stops=True, filter_punct=True, filter_nums=False))[:10]
    for bigram in bigrams:
        assert isinstance(bigram, Span)
        assert len(bigram) == 2

    trigrams = list(
        extract.ngrams(doc, 3, filter_stops=True, filter_punct=True, min_freq=2))[:10]
    for trigram in trigrams:
        assert isinstance(trigram, Span)
        assert len(trigram) == 3

    nes = list(
        extract.entities(doc, drop_determiners=False, exclude_types="numeric"))[:10]
    for ne in nes:
        assert isinstance(ne, Span)
        assert ne.label_
        assert ne.label_ != "QUANTITY"

    pos_regex_matches = list(
        extract.pos_regex_matches(doc, constants.POS_REGEX_PATTERNS["en"]["NP"]))[:10]
    for match in pos_regex_matches:
        assert isinstance(match, Span)

    stmts = list(extract.semistructured_statements(doc, "I", cue="be"))[:10]
    for stmt in stmts:
        assert isinstance(stmt, list)
        assert isinstance(stmt[0], compat.unicode_)
        assert len(stmt) == 3

    kts = textacy.ke.textrank(doc, topn=10)
    for keyterm in kts:
        assert isinstance(keyterm, tuple)
        assert isinstance(keyterm[0], compat.unicode_)
        assert isinstance(keyterm[1], float)
        assert keyterm[1] > 0.0
def test_extract_functionality(doc):
    bigrams = list(
        extract.ngrams(doc, 2, filter_stops=True, filter_punct=True, filter_nums=False))[:10]
    for bigram in bigrams:
        assert isinstance(bigram, Span)
        assert len(bigram) == 2

    trigrams = list(
        extract.ngrams(doc, 3, filter_stops=True, filter_punct=True, min_freq=2))[:10]
    for trigram in trigrams:
        assert isinstance(trigram, Span)
        assert len(trigram) == 3

    nes = list(
        extract.entities(doc, drop_determiners=False, exclude_types="numeric"))[:10]
    for ne in nes:
        assert isinstance(ne, Span)
        assert ne.label_
        assert ne.label_ != "QUANTITY"

    regex_matches = list(extract.regex_matches(doc, r"Mr\. Speaker"))[:10]
    for match in regex_matches:
        assert isinstance(match, Span)

    stmts = list(extract.semistructured_statements(doc, entity="I", cue="be"))[:10]
    for stmt in stmts:
        assert isinstance(stmt, list)
        assert isinstance(stmt[0], str)
        assert len(stmt) == 3

    kts = kt.textrank(doc, topn=10)
    for keyterm in kts:
        assert isinstance(keyterm, tuple)
        assert isinstance(keyterm[0], str)
        assert isinstance(keyterm[1], float)
        assert keyterm[1] > 0.0
import spacy
import textacy
from textacy import extract


def get_semi_structured_statements(doc, entity, cue='be'):
    # accept either a textacy.Doc wrapper or a plain spaCy Doc
    possible_docs = (textacy.Doc, spacy.tokens.Doc)
    assert isinstance(doc, possible_docs), "Only {} are supported".format(possible_docs)
    assert isinstance(entity, str)
    assert isinstance(cue, str)
    return extract.semistructured_statements(doc, entity, cue)
def grammars(carrel, grammar, query, noun, lemma, sort, count):
    """Extract sentence fragments from <carrel> where fragments are one of:

    \b
      nouns - all the nouns and noun chunks
      quotes - things people say
      svo - fragments in the form of subject-verb-object (the default)
      sss - a more advanced version of svo; fragments begin with an entity,
        co-occur with a verb, and are followed by a phrase

    This is very useful for the purposes of listing more complete ideas from a text.

    Examples:

    \b
      rdr grammars homer
      rdr grammars -g nouns homer
      rdr grammars -g sss -n hector -l be homer"""

    # require
    from textacy import extract
    from os import system
    from re import search

    # sanity check
    checkForCarrel(carrel)

    # initialize
    doc = carrel2doc(carrel)

    # get the features; svo
    if grammar == 'svo':

        # do the work
        features = list(extract.subject_verb_object_triples(doc))

        # simplify the result
        items = []
        for feature in features:
            subject = [token.text_with_ws for token in feature.subject]
            verb = [token.text_with_ws for token in feature.verb]
            object = [token.text_with_ws for token in feature.object]
            items.append('\t'.join([''.join(subject), ''.join(verb), ''.join(object)]))

        # done
        features = items

    # quotes
    elif grammar == 'quotes':

        # do the work
        features = list(extract.direct_quotations(doc))

        # simplify the result
        items = []
        for feature in features:

            # parse and stringify
            speaker = [token.text_with_ws for token in feature.speaker]
            cue = [token.text_with_ws for token in feature.cue]
            content = feature.content.text_with_ws
            items.append('\t'.join([''.join(speaker), ''.join(cue), content]))

        # done
        features = items

    # noun chunks
    elif grammar == 'nouns':

        # do the work and simplify the result
        features = list(extract.noun_chunks(doc))
        features = [feature.text for feature in features]

    # semi-structured sentences
    elif grammar == 'sss':

        # sanity check
        if not noun:
            click.echo("Error: When specifying sss, the -n option is required. See 'rdr grammars --help'.", err=True)
            exit()

        # do the work
        features = list(extract.semistructured_statements(doc, entity=noun, cue=lemma))

        # simplify the result
        items = []
        for feature in features:
            entity = [token.text_with_ws for token in feature.entity]
            cue = [token.text_with_ws for token in feature.cue]
            fragment = [token.text_with_ws for token in feature.fragment]
            items.append('\t'.join([''.join(entity), ''.join(cue), ''.join(fragment)]))

        # done
        features = items

    # filter, conditionally
    if query:
        features = [feature for feature in features if search(query, feature)]

    # sort, conditionally
    if sort:
        features.sort()

    # count, conditionally
    if count:

        # initialize a dictionary and process each feature
        items = {}
        for feature in features:

            # update the dictionary
            if feature in items:
                items[feature] += 1
            else:
                items[feature] = 1

        # sort the dictionary; return the features
        features = sorted(items.items(), key=lambda x: x[1], reverse=True)

        # process each feature, again
        items = []
        for feature in features:

            # create a record and update
            record = str(feature[1]) + '\t' + feature[0]
            items.append(record)

        # done
        features = items

    # output
    for feature in features:
        click.echo(feature)