Code example #1
0
 def simple_subjects_and_objects(self, verb):
     """
     Collect every simple subject and object attached to *verb*.

     Uses Textacy's ``get_objects_of_verb`` and ``get_subjects_of_verb``
     helpers.
     param: verb     A Spacy Token
     return: A list of the verb's objects followed by its subjects
             (Spacy Tokens or Spans)
     """
     return get_objects_of_verb(verb) + get_subjects_of_verb(verb)
Code example #2
0
def subject_verb_object_triples(doc):
    """
    Extract an ordered sequence of subject-verb-object (SVO) triples from a
    spacy-parsed doc. Note that this only works for SVO languages.

    Args:
        doc (``spacy.Doc`` or ``spacy.Span``): either a spacy document
            or a sentence thereof

    Yields:
        (``spacy.Span``, ``spacy.Span``, ``spacy.Span``): the next 3-tuple from ``doc``
            representing a (subject, verb, object) triple, in order of apperance

    # TODO: What to do about questions, where it may be VSO instead of SVO?
    # TODO: What about non-adjacent verb negations?
    # TODO: What about object (noun) negations?
    """
    # a Span has no ``.sents``; treat it as a single sentence
    try:
        sentences = doc.sents
    except AttributeError:
        sentences = [doc]

    for sentence in sentences:
        offset = sentence[0].i

        for main_verb in get_main_verbs_of_sent(sentence):
            subjects = get_subjects_of_verb(main_verb)
            if not subjects:
                continue
            objects = get_objects_of_verb(main_verb)
            if not objects:
                continue

            # widen the verb with its adjacent auxiliaries, for context
            vspan = get_span_for_verb_auxiliaries(main_verb)
            verb_span = sentence[vspan[0] - offset: vspan[1] - offset + 1]

            for subject in subjects:
                # widen the subject to include its compound-noun tokens
                subj_start = get_span_for_compound_noun(subject)[0]
                subj_span = sentence[subj_start - offset: subject.i - offset + 1]

                for obj in objects:
                    # nouns get their compound span, verbs their auxiliaries;
                    # anything else stays a single token
                    if obj.pos == NOUN:
                        bounds = get_span_for_compound_noun(obj)
                    elif obj.pos == VERB:
                        bounds = get_span_for_verb_auxiliaries(obj)
                    else:
                        bounds = (obj.i, obj.i)
                    obj_span = sentence[bounds[0] - offset: bounds[1] - offset + 1]

                    yield (subj_span, verb_span, obj_span)
Code example #3
0
    def verb_relevance(self, verb, article):
        """
        Checks a verb for relevance by:
        1. Comparing to structure term lemmas
        2. Comparing to person term lemmas
        3. Looking for special cases such as 'leave homeless'

        param: verb     A Spacy Token for the candidate verb
        param: article  The Spacy Doc/Span containing the verb; sliced
                        (``article[verb.i:...]``) to build Fact spans
        return: (unit_lemmas, Fact) on a match, otherwise (None, None)
        """
        # Fetch the verb's objects once; the original called
        # get_objects_of_verb repeatedly in several branches.
        verb_objects = get_objects_of_verb(verb)

        # case for eviction first because we have 'forced eviction' case which would be picked by the 'elif' below
        if 'eviction' in [obj.lemma_ for obj in verb_objects]:
            for verb_object in verb_objects:
                if verb_object.text == 'eviction' or verb_object.text == 'evictions':
                    return self.structure_unit_lemmas + self.person_unit_lemmas, Fact(
                        verb, article[verb.i:verb_object.i + 1],
                        verb.lemma_ + " " + "eviction", "term")
        elif verb.lemma_ in self.joint_term_lemmas:
            return self.structure_unit_lemmas + self.person_unit_lemmas, Fact(
                verb, verb, verb.lemma_, "term")
        elif verb.lemma_ in self.structure_term_lemmas:
            return self.structure_unit_lemmas, Fact(verb, verb, verb.lemma_,
                                                    "term")
        elif verb.lemma_ in self.person_term_lemmas:
            return self.person_unit_lemmas, Fact(verb, verb, verb.lemma_,
                                                 "term")

        elif verb.lemma_ in ('leave', 'render', 'become'):
            # look for an object predicate, e.g. "left [them] homeless"
            children = verb.children
            obj_predicate = None
            for child in children:
                if child.dep_ in ('oprd', 'dobj', 'acomp'):
                    obj_predicate = child
            if obj_predicate:
                # NOTE(review): the term is always prefixed 'leave ' even for
                # 'render'/'become' — presumably intentional normalization
                if obj_predicate.lemma_ in self.structure_term_lemmas:
                    return self.structure_unit_lemmas, Fact(
                        verb, article[verb.i:obj_predicate.i + 1],
                        'leave ' + obj_predicate.lemma_, "term")

                elif obj_predicate.lemma_ in self.person_term_lemmas:
                    return self.person_unit_lemmas, Fact(
                        verb, article[verb.i:obj_predicate.i + 1],
                        'leave ' + obj_predicate.lemma_, "term")

        elif verb.lemma_ == 'affect' and self.article_relevance(article):
            return self.structure_unit_lemmas + self.person_unit_lemmas, Fact(
                verb, verb, verb.lemma_, "term")

        elif verb.lemma_ in ('fear', 'assume'):
            # only the first object is considered for these verbs
            if verb_objects:
                verb_object = verb_objects[0]
                if verb_object.lemma_ in self.person_term_lemmas:
                    return self.person_unit_lemmas, Fact(
                        verb, article[verb.i:verb_object.i + 1],
                        verb.lemma_ + " " + verb_object.text, "term")

                elif verb_object.lemma_ in self.structure_term_lemmas:
                    return self.structure_unit_lemmas, Fact(
                        verb, article[verb.i:verb_object.i + 1],
                        verb.lemma_ + " " + verb_object.text, "term")

        elif verb.lemma_ == 'claim':
            # e.g. "claimed lives"
            for verb_object in verb_objects:
                if verb_object.text == 'lives':
                    return self.person_unit_lemmas, Fact(
                        verb, article[verb.i:verb_object.i + 1],
                        verb.lemma_ + " " + "lives", "term")

        return None, None
Code example #4
0
def subject_verb_object_triples(doc):
    """
    Extract an ordered sequence of subject-verb-object (SVO) candidates from a
    spacy-parsed doc, with a similarity score and entity types. Note that this
    only works for SVO languages.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc`` or ``spacy.Span``)

    Yields:
        Tuple[``spacy.Span``, dict, ``spacy.Span``, float, str, str]: the next
            (subject, verb, object, score, subject_ent_type, object_ent_type)
            tuple from ``doc``, in order of appearance. ``score`` is the sum of
            the two pairwise span similarities; the entity types come from
            ``ent_type_`` of the original subject/object head tokens.
    """
    # TODO: Rewrite rules based on http://www.anthology.aclweb.org/W/W12/W12-0702.pdf
    # TODO: Think about relative clauses (that-of) e.g., products that include
    # TODO: What to do about questions, where it may be VSO instead of SVO?
    # TODO: What about non-adjacent verb negations?
    # TODO: What about object (noun) negations?
    if isinstance(doc, SpacySpan):
        sents = [doc]
    else:  # textacy.Doc or spacy.Doc
        sents = doc.sents

    for sent in sents:
        start_i = sent[0].i
        verbs_init = get_main_verbs_of_sent(sent)
        list_candidates = []
        verb_tmp_token = None
        for verb_init in verbs_init:
            # skip consecutive duplicate entries for the same verb token
            if verb_init['token'] != verb_tmp_token:
                verb_tmp_token = verb_init['token']
                subjs = get_subjects_of_verb(verb_init['token'], sent)
                if not subjs:
                    continue
                verbs = get_span_for_verb_auxiliaries(verb_init['token'],
                                                      start_i, sent)
                list_candidates.append((subjs, verbs))

        for subjs, verbs in list_candidates:

            for verb in verbs:
                objs = get_objects_of_verb(verb['token'])
                if not objs:
                    continue
                for subj in subjs:
                    subj_type = subj.ent_type_
                    # widen the subject to include its compound-noun tokens
                    subj = sent[get_span_for_compound_noun(subj)[0] -
                                start_i:subj.i - start_i + 1]

                    for obj in objs:
                        obj_type = obj.ent_type_
                        # Verbs keep a single-token span; everything else is
                        # widened to its compound noun. (The original
                        # if/elif/else chain had an unreachable final branch.)
                        if obj.pos == VERB:
                            span = (obj.i, obj.i)
                        else:
                            span = get_span_for_compound_noun(obj)

                        obj = sent[span[0] - start_i:span[1] - start_i + 1]
                        score = subj.similarity(obj) + obj.similarity(subj)
                        yield (subj, verb, obj, score, subj_type, obj_type)