from spacy.symbols import NOUN, VERB
from spacy.tokens import Span as SpacySpan

# The helper functions below ship with textacy; the exact module path varies
# by version (``textacy.spacy_utils`` in older releases,
# ``textacy.spacier.utils`` in newer ones).
from textacy.spacy_utils import (
    get_main_verbs_of_sent,
    get_objects_of_verb,
    get_span_for_compound_noun,
    get_span_for_verb_auxiliaries,
    get_subjects_of_verb,
)


def simple_subjects_and_objects(self, verb):
    """
    Extract all simple subjects and objects for a given verb.
    Uses textacy's get_objects_of_verb and get_subjects_of_verb methods.

    param: verb    a spaCy Token
    return: a list of the verb's objects and subjects (spaCy Tokens or Spans)
    """
    verb_objects = get_objects_of_verb(verb)
    verb_subjects = get_subjects_of_verb(verb)
    verb_objects.extend(verb_subjects)
    return verb_objects
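
# Minimal usage sketch: ``self`` is unused above, so the method can be
# exercised unbound for a quick check. The model name "en_core_web_sm" and the
# sample sentence are assumptions; any spaCy model with a dependency parser works.
def _demo_simple_subjects_and_objects():
    import spacy

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("The earthquake displaced thousands of residents.")
    verb = next(tok for tok in doc if tok.pos_ == "VERB")
    # prints the verb's objects followed by its subjects
    print(simple_subjects_and_objects(None, verb))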
def subject_verb_object_triples(doc):
    """
    Extract an ordered sequence of subject-verb-object (SVO) triples from a
    spacy-parsed doc. Note that this only works for SVO languages.

    Args:
        doc (``spacy.Doc`` or ``spacy.Span``): either a spacy document or
            a sentence thereof

    Yields:
        (``spacy.Span``, ``spacy.Span``, ``spacy.Span``): the next 3-tuple
        from ``doc`` representing a (subject, verb, object) triple,
        in order of appearance
    """
    # TODO: What to do about questions, where it may be VSO instead of SVO?
    # TODO: What about non-adjacent verb negations?
    # TODO: What about object (noun) negations?
    try:
        sents = doc.sents
    except AttributeError:
        sents = [doc]

    for sent in sents:
        start_i = sent[0].i

        verbs = get_main_verbs_of_sent(sent)
        for verb in verbs:
            subjs = get_subjects_of_verb(verb)
            if not subjs:
                continue
            objs = get_objects_of_verb(verb)
            if not objs:
                continue

            # add adjacent auxiliaries to verbs, for context
            # and add compounds to compound nouns
            verb_span = get_span_for_verb_auxiliaries(verb)
            verb = sent[verb_span[0] - start_i:verb_span[1] - start_i + 1]
            for subj in subjs:
                subj = sent[get_span_for_compound_noun(subj)[0] - start_i:subj.i - start_i + 1]
                for obj in objs:
                    if obj.pos == NOUN:
                        span = get_span_for_compound_noun(obj)
                    elif obj.pos == VERB:
                        span = get_span_for_verb_auxiliaries(obj)
                    else:
                        span = (obj.i, obj.i)
                    obj = sent[span[0] - start_i:span[1] - start_i + 1]

                    yield (subj, verb, obj)
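
# Minimal usage sketch for subject_verb_object_triples. The model name
# "en_core_web_sm" and the sample text are assumptions; any spaCy model with a
# dependency parser works. (Note: the modified variant further below shares
# this name and would shadow it if both definitions live in one module.)
def _demo_subject_verb_object_triples():
    import spacy

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("The floods destroyed hundreds of homes. Aid agencies sent supplies.")
    for subj, verb, obj in subject_verb_object_triples(doc):
        print(subj.text, "|", verb.text, "|", obj.text)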
def verb_relevance(self, verb, article):
    """
    Check a verb for relevance by:
    1. Comparing to structure term lemmas
    2. Comparing to person term lemmas
    3. Looking for special cases such as 'leave homeless'
    """
    # Handle eviction first: 'forced eviction' would otherwise be picked up
    # by the structure-term 'elif' below.
    if 'eviction' in [obj.lemma_ for obj in get_objects_of_verb(verb)]:
        verb_objects = get_objects_of_verb(verb)
        for verb_object in verb_objects:
            if verb_object.text in ('eviction', 'evictions'):
                return self.structure_unit_lemmas + self.person_unit_lemmas, Fact(
                    verb, article[verb.i:verb_object.i + 1],
                    verb.lemma_ + " " + "eviction", "term")
    elif verb.lemma_ in self.joint_term_lemmas:
        return self.structure_unit_lemmas + self.person_unit_lemmas, Fact(
            verb, verb, verb.lemma_, "term")
    elif verb.lemma_ in self.structure_term_lemmas:
        return self.structure_unit_lemmas, Fact(verb, verb, verb.lemma_, "term")
    elif verb.lemma_ in self.person_term_lemmas:
        return self.person_unit_lemmas, Fact(verb, verb, verb.lemma_, "term")
    elif verb.lemma_ in ('leave', 'render', 'become'):
        children = verb.children
        obj_predicate = None
        for child in children:
            if child.dep_ in ('oprd', 'dobj', 'acomp'):
                obj_predicate = child
        if obj_predicate:
            if obj_predicate.lemma_ in self.structure_term_lemmas:
                return self.structure_unit_lemmas, Fact(
                    verb, article[verb.i:obj_predicate.i + 1],
                    'leave ' + obj_predicate.lemma_, "term")
            elif obj_predicate.lemma_ in self.person_term_lemmas:
                return self.person_unit_lemmas, Fact(
                    verb, article[verb.i:obj_predicate.i + 1],
                    'leave ' + obj_predicate.lemma_, "term")
    elif verb.lemma_ == 'affect' and self.article_relevance(article):
        return self.structure_unit_lemmas + self.person_unit_lemmas, Fact(
            verb, verb, verb.lemma_, "term")
    elif verb.lemma_ in ('fear', 'assume'):
        verb_objects = get_objects_of_verb(verb)
        if verb_objects:
            verb_object = verb_objects[0]
            if verb_object.lemma_ in self.person_term_lemmas:
                return self.person_unit_lemmas, Fact(
                    verb, article[verb.i:verb_object.i + 1],
                    verb.lemma_ + " " + verb_object.text, "term")
            elif verb_object.lemma_ in self.structure_term_lemmas:
                return self.structure_unit_lemmas, Fact(
                    verb, article[verb.i:verb_object.i + 1],
                    verb.lemma_ + " " + verb_object.text, "term")
    elif verb.lemma_ == 'claim':
        verb_objects = get_objects_of_verb(verb)
        for verb_object in verb_objects:
            if verb_object.text == 'lives':
                return self.person_unit_lemmas, Fact(
                    verb, article[verb.i:verb_object.i + 1],
                    verb.lemma_ + " " + "lives", "term")
    return None, None
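
# Minimal driver sketch for verb_relevance. ``Fact`` and the term/unit lemma
# attributes live on the surrounding class in the real project; everything
# stubbed below (the namedtuple, the term lists, the model name) is an
# illustrative assumption so the call can be exercised in isolation.
def _demo_verb_relevance():
    import spacy
    from collections import namedtuple
    from types import SimpleNamespace

    # stand-in for the project's Fact class, only if none is already defined
    globals().setdefault("Fact", namedtuple("Fact", ["token", "span", "term", "type"]))
    stub = SimpleNamespace(
        joint_term_lemmas=[],
        structure_term_lemmas=["destroy", "demolish"],  # illustrative only
        person_term_lemmas=["displace", "evacuate"],    # illustrative only
        structure_unit_lemmas=["house", "home"],
        person_unit_lemmas=["person", "people", "family"],
        article_relevance=lambda article: False,
    )
    nlp = spacy.load("en_core_web_sm")
    doc = nlp("The cyclone destroyed fifty houses.")
    verb = next(tok for tok in doc if tok.lemma_ == "destroy")
    unit_lemmas, fact = verb_relevance(stub, verb, doc)
    print(unit_lemmas, fact)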
def subject_verb_object_triples(doc):
    """
    Extract an ordered sequence of subject-verb-object (SVO) triples from a
    spacy-parsed doc. Note that this only works for SVO languages.

    This is a modified variant of textacy's function of the same name: it
    relies on locally modified helpers (e.g. ``get_main_verbs_of_sent``
    returning ``{'token': ...}`` dicts) and yields extra fields.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc`` or ``spacy.Span``)

    Yields:
        Tuple of (subject span, verb dict, object span, similarity score,
        subject entity type, object entity type), in order of appearance.
    """
    # TODO: Rewrite rules based on http://www.anthology.aclweb.org/W/W12/W12-0702.pdf
    # TODO: Think about relative clauses (that-of), e.g. "products that include"
    # TODO: What to do about questions, where it may be VSO instead of SVO?
    # TODO: What about non-adjacent verb negations?
    # TODO: What about object (noun) negations?
    if isinstance(doc, SpacySpan):
        sents = [doc]
    else:  # textacy.Doc or spacy.Doc
        sents = doc.sents

    for sent in sents:
        start_i = sent[0].i

        verbs_init = get_main_verbs_of_sent(sent)
        list_candidates = []
        verb_tmp_token = None
        for verb_init in verbs_init:
            if verb_init['token'] != verb_tmp_token:
                verb_tmp_token = verb_init['token']
                subjs = get_subjects_of_verb(verb_init['token'], sent)
                if not subjs:
                    continue
                verbs = get_span_for_verb_auxiliaries(verb_init['token'], start_i, sent)
                list_candidates.append((subjs, verbs))

        for subjs, verbs in list_candidates:
            for verb in verbs:
                objs = get_objects_of_verb(verb['token'])
                if not objs:
                    continue

                # add compounds to compound nouns for subjects and objects
                for subj in subjs:
                    subj_type = subj.ent_type_
                    subj = sent[get_span_for_compound_noun(subj)[0] - start_i:subj.i - start_i + 1]
                    for obj in objs:
                        obj_type = obj.ent_type_
                        if obj.pos != VERB:  # nouns, proper nouns, pronouns, etc.
                            span = get_span_for_compound_noun(obj)
                        else:
                            # auxiliaries are deliberately not expanded here:
                            # span = get_span_for_verb_auxiliaries(obj, start_i, sent)
                            span = (obj.i, obj.i)
                        obj = sent[span[0] - start_i:span[1] - start_i + 1]
                        # similarity is symmetric, so this simply doubles the value
                        score = subj.similarity(obj) + obj.similarity(subj)

                        yield (subj, verb, obj, score, subj_type, obj_type)
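
# Usage sketch for the modified extractor above. It assumes the locally
# modified helpers (the dict-returning get_main_verbs_of_sent, the
# two-argument get_subjects_of_verb, the three-argument
# get_span_for_verb_auxiliaries) are defined in this module in place of the
# textacy imports at the top, and that the loaded model ships word vectors,
# since ``similarity`` drives the score ("en_core_web_md" is an assumption):
#
#     import spacy
#     nlp = spacy.load("en_core_web_md")
#     doc = nlp("The agency built new shelters for displaced families.")
#     for subj, verb, obj, score, s_type, o_type in subject_verb_object_triples(doc):
#         print(subj.text, verb['token'].text, obj.text, round(score, 3))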