Example #1
# Module context (assumed; this snippet comes from textacy's extract module):
# the ``get_*`` helpers live in textacy.spacy_utils, and NOUN/VERB are spaCy's
# part-of-speech symbol constants.
from spacy.parts_of_speech import NOUN, VERB
from textacy.spacy_utils import (get_main_verbs_of_sent, get_objects_of_verb,
                                 get_span_for_compound_noun,
                                 get_span_for_verb_auxiliaries,
                                 get_subjects_of_verb)


def subject_verb_object_triples(doc):
    """
    Extract an ordered sequence of subject-verb-object (SVO) triples from a
    spacy-parsed doc. Note that this only works for SVO languages.

    Args:
        doc (``spacy.Doc`` or ``spacy.Span``): either a spacy document
            or a sentence thereof

    Yields:
        (``spacy.Span``, ``spacy.Span``, ``spacy.Span``): the next 3-tuple from ``doc``
            representing a (subject, verb, object) triple, in order of appearance
    """
    # TODO: What to do about questions, where it may be VSO instead of SVO?
    # TODO: What about non-adjacent verb negations?
    # TODO: What about object (noun) negations?
    try:
        sents = doc.sents
    except AttributeError:
        sents = [doc]

    for sent in sents:
        start_i = sent[0].i

        verbs = get_main_verbs_of_sent(sent)
        for verb in verbs:
            subjs = get_subjects_of_verb(verb)
            if not subjs:
                continue
            objs = get_objects_of_verb(verb)
            if not objs:
                continue

            # add adjacent auxiliaries to verbs, for context
            # and add compounds to compound nouns
            verb_span = get_span_for_verb_auxiliaries(verb)
            verb = sent[verb_span[0] - start_i: verb_span[1] - start_i + 1]
            for subj in subjs:
                subj = sent[get_span_for_compound_noun(subj)[0] - start_i: subj.i - start_i + 1]
                for obj in objs:
                    if obj.pos == NOUN:
                        span = get_span_for_compound_noun(obj)
                    elif obj.pos == VERB:
                        span = get_span_for_verb_auxiliaries(obj)
                    else:
                        span = (obj.i, obj.i)
                    obj = sent[span[0] - start_i: span[1] - start_i + 1]

                    yield (subj, verb, obj)
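
A minimal usage sketch for this example, assuming a spaCy 1.x-era English
pipeline (the model name 'en' and the sample sentence are illustrative):

    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'The cat chased the mouse.')
    for subj, verb, obj in subject_verb_object_triples(doc):
        # each item is a (subject, verb, object) triple of spacy.Span objects
        print(subj.text, verb.text, obj.text)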
Example #2
# Module context (assumed; this snippet comes from textacy's extract module):
# ``REPORTING_VERBS`` is textacy's set of reporting-verb lemmas (e.g. 'say').
import re

import textacy
from cytoolz import itertoolz
from textacy import spacy_utils
from textacy.spacy_utils import get_span_for_compound_noun, get_subjects_of_verb


def direct_quotations(doc):
    """
    Baseline, not-great attempt at direct quotation extraction (no indirect
    or mixed quotations) using rules and patterns. English only.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)

    Yields:
        (``spacy.Span``, ``spacy.Token``, ``spacy.Span``): next quotation in ``doc``
            represented as a (speaker, reporting verb, quotation) 3-tuple

    Notes:
        Loosely inspired by Krestel, Bergler, Witte. "Minding the Source: Automatic
        Tagging of Reported Speech in Newspaper Articles".

    TODO: Better approach would use ML, but needs a training dataset.
    """
    if isinstance(doc, textacy.Doc):
        if doc.lang != 'en':
            raise NotImplementedError('sorry, English-language texts only :(')
        doc = doc.spacy_doc
    quote_end_punct = {',', '.', '?', '!'}
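    # find the character offsets of paired quote marks -- plain double quotes
    # ("...") and Penn Treebank-style quotes (''...'' and ``...'') -- then map
    # them to token positions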
    quote_indexes = set(itertoolz.concat(
        (m.start(), m.end() - 1) for m in re.finditer(r"(\".*?\")|(''.*?'')|(``.*?'')", doc.string)))
    quote_positions = list(itertoolz.partition(
        2, sorted(tok.i for tok in doc if tok.idx in quote_indexes)))
    sents = list(doc.sents)
    sent_positions = [(sent.start, sent.end) for sent in sents]

    for q0, q1 in quote_positions:
        quote = doc[q0: q1 + 1]

        # we're only looking for direct quotes, not indirect or mixed
        if not any(char in quote_end_punct for char in quote.text[-4:]):
            continue

        # get adjacent sentences
        candidate_sent_indexes = []
        for i, (s0, s1) in enumerate(sent_positions):

            if s0 <= q1 + 1 and s1 > q1:
                candidate_sent_indexes.append(i)
            elif s0 < q0 and s1 >= q0 - 1:
                candidate_sent_indexes.append(i)

        for si in candidate_sent_indexes:
            sent = sents[si]

            # get any reporting verbs
            rvs = [tok for tok in sent
                   if spacy_utils.preserve_case(tok) is False and
                   tok.lemma_ in REPORTING_VERBS and
                   tok.pos_ == 'VERB' and
                   not any(oq0 <= tok.i <= oq1 for oq0, oq1 in quote_positions)]

            # get target offset against which to measure distances of NEs
            if rvs:
                if len(rvs) == 1:
                    rv = rvs[0]
                else:
                    min_rv_dist = 1000
                    for rv_candidate in rvs:
                        rv_dist = min(abs(rv_candidate.i - qp) for qp in (q0, q1))
                        if rv_dist < min_rv_dist:
                            rv = rv_candidate
                            min_rv_dist = rv_dist
                        else:
                            break
            else:
                # TODO: do we have no other recourse?!
                continue

            try:
                # rv_subj = _find_subjects(rv)[0]
                rv_subj = get_subjects_of_verb(rv)[0]
            except IndexError:
                continue
            # if rv_subj.text in {'he', 'she'}:
            #     for ne in named_entities(doc, good_ne_types={'PERSON'}):
            #         if ne.start < rv_subj.i:
            #             speaker = ne
            #         else:
            #             break
            # else:
            span = get_span_for_compound_noun(rv_subj)
            speaker = doc[span[0]: span[1] + 1]

            yield (speaker, rv, quote)
            break
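
A minimal usage sketch for this version, which also accepts a textacy.Doc
wrapper; the constructor signature is assumed from the textacy era this
snippet targets:

    doc = textacy.Doc(u'"I heartily agree," she said.', lang='en')
    for speaker, verb, quote in direct_quotations(doc):
        # speaker is a Span, verb a Token, quote the quoted Span
        print(speaker.text, verb.lemma_, quote.text)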
Example #3
def direct_quotations(doc):
    """
    Baseline, not-great attempt at direct quotation extraction (no indirect
    or mixed quotations) using rules and patterns. English only.

    Args:
        doc (``spacy.Doc``)

    Yields:
        (``spacy.Span``, ``spacy.Token``, ``spacy.Span``): next quotation in ``doc``
            represented as a (speaker, reporting verb, quotation) 3-tuple

    Notes:
        Loosely inspired by Krestel, Bergler, Witte. "Minding the Source: Automatic
        Tagging of Reported Speech in Newspaper Articles".

    TODO: Better approach would use ML, but needs a training dataset.
    """
    quote_end_punct = {',', '.', '?', '!'}
    quote_indexes = set(itertoolz.concat(
        (m.start(), m.end() - 1) for m in re.finditer(r"(\".*?\")|(''.*?'')|(``.*?'')", doc.string)))
    quote_positions = list(itertoolz.partition(
        2, sorted(tok.i for tok in doc if tok.idx in quote_indexes)))
    sents = list(doc.sents)
    sent_positions = [(sent.start, sent.end) for sent in sents]

    for q0, q1 in quote_positions:
        quote = doc[q0: q1 + 1]

        # we're only looking for direct quotes, not indirect or mixed
        if not any(char in quote_end_punct for char in quote.text[-4:]):
            continue

        # get adjacent sentences
        candidate_sent_indexes = []
        for i, (s0, s1) in enumerate(sent_positions):

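            # keep a sentence if it overlaps the end of the quote (first branch)
            # or its start (second branch); the attribution usually sits there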
            if s0 <= q1 + 1 and s1 > q1:
                candidate_sent_indexes.append(i)
            elif s0 < q0 and s1 >= q0 - 1:
                candidate_sent_indexes.append(i)

        for si in candidate_sent_indexes:
            sent = sents[si]

            # get any reporting verbs
            rvs = [tok for tok in sent
                   if spacy_utils.preserve_case(tok) is False
                   and tok.lemma_ in REPORTING_VERBS
                   and tok.pos_ == 'VERB'
                   and not any(oq0 <= tok.i <= oq1 for oq0, oq1 in quote_positions)]

            # get target offset against which to measure distances of NEs
            if rvs:
                if len(rvs) == 1:
                    rv = rvs[0]
                else:
                    min_rv_dist = 1000
                    for rv_candidate in rvs:
                        rv_dist = min(abs(rv_candidate.i - qp) for qp in (q0, q1))
                        if rv_dist < min_rv_dist:
                            rv = rv_candidate
                            min_rv_dist = rv_dist
                        else:
                            break
            else:
                # TODO: do we have no other recourse?!
                continue

            try:
                # rv_subj = _find_subjects(rv)[0]
                rv_subj = get_subjects_of_verb(rv)[0]
            except IndexError:
                continue
            # if rv_subj.text in {'he', 'she'}:
            #     for ne in named_entities(doc, good_ne_types={'PERSON'}):
            #         if ne.start < rv_subj.i:
            #             speaker = ne
            #         else:
            #             break
            # else:
            span = get_span_for_compound_noun(rv_subj)
            speaker = doc[span[0]: span[1] + 1]

            yield (speaker, rv, quote)
            break
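
A minimal usage sketch for this variant, which takes a plain spacy.Doc
directly (the model name 'en' is an assumption; this snippet targets a spaCy
version where ``Doc.string`` exists):

    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'"It works now," the engineer said.')
    for speaker, verb, quote in direct_quotations(doc):
        print(speaker.text, verb.text, quote.text)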
Example #4
def subject_verb_object_triples(doc):
    """
    Extract an ordered sequence of subject-verb-object (SVO) triples from a
    spacy-parsed doc. Note that this only works for SVO languages.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc`` or ``spacy.Span``)

    Yields:
        Tuple: the next (subject, verb, object, similarity score, subject
            entity type, object entity type) 6-tuple from ``doc``, in order
            of appearance
    """
    # TODO: Rewrite rules based on http://www.anthology.aclweb.org/W/W12/W12-0702.pdf
    # TODO: Think about relative clauses (that-of) e.g., products that include
    # TODO: What to do about questions, where it may be VSO instead of SVO?
    # TODO: What about non-adjacent verb negations?
    # TODO: What about object (noun) negations?
    if isinstance(doc, SpacySpan):
        sents = [doc]
    else:  # textacy.Doc or spacy.Doc
        sents = doc.sents

    for sent in sents:
        start_i = sent[0].i
        verbs_init = get_main_verbs_of_sent(sent)
        list_candidates = []
        verb_tmp_token = None
        for verb_init in verbs_init:
            if verb_init['token'] != verb_tmp_token:
                verb_tmp_token = verb_init['token']
                subjs = get_subjects_of_verb(verb_init['token'], sent)
                if not subjs:
                    continue
                verbs = get_span_for_verb_auxiliaries(verb_init['token'],
                                                      start_i, sent)
                list_candidates.append((subjs, verbs))

        for subjs, verbs in list_candidates:

            for verb in verbs:
                objs = get_objects_of_verb(verb['token'])
                if not objs:
                    continue
                # add adjacent auxiliaries to verbs, for context
                # and add compounds to compound nouns
                for subj in subjs:
                    subj_type = subj.ent_type_
                    subj = sent[get_span_for_compound_noun(subj)[0] -
                                start_i:subj.i - start_i + 1]

                    for obj in objs:
                        obj_type = obj.ent_type_
                        if obj.pos != VERB:
                            # compound-noun span for NOUN, PROPN, etc.
                            span = get_span_for_compound_noun(obj)
                        else:
                            # originally get_span_for_verb_auxiliaries(obj, start_i, sent)
                            span = (obj.i, obj.i)

                        obj = sent[span[0] - start_i:span[1] - start_i + 1]
                        # similarity() is symmetric, so this is 2x the cosine similarity
                        score = subj.similarity(obj) + obj.similarity(subj)
                        yield (subj, verb, obj, score, subj_type, obj_type)
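
A minimal call sketch for this fork; note the helper signatures differ from
Example #1 (verbs are dicts with a 'token' key, and get_subjects_of_verb /
get_span_for_verb_auxiliaries take extra arguments), so it is not a drop-in
replacement, and ``similarity()`` only gives meaningful scores with a model
that has word vectors:

    for subj, verb, obj, score, subj_type, obj_type in subject_verb_object_triples(doc):
        # ``verb`` is whatever the modified get_span_for_verb_auxiliaries yields
        print(subj.text, obj.text, score, subj_type or '-', obj_type or '-')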