Example #1
0
 def test_preserve_case(self):
     """Check that preserve_case flags exactly the expected tokens in the doc."""
     # 39 tokens total; only these positions should be preserved-case.
     expected = [0] * 39
     for idx in (11, 34, 36, 37):
         expected[idx] = 1
     observed = [int(spacy_utils.preserve_case(tok)) for tok in self.spacy_doc]
     self.assertEqual(observed, expected)
Example #2
0
def test_preserve_case(spacy_doc):
    """Check that preserve_case flags exactly the expected tokens in the doc."""
    # 39 tokens total; only these positions should be preserved-case.
    expected = [0] * 39
    for idx in (11, 34, 36, 37):
        expected[idx] = 1
    observed = [int(spacy_utils.preserve_case(tok)) for tok in spacy_doc]
    assert observed == expected
Example #3
0
def direct_quotations(doc):
    """
    Baseline, not-great attempt at direction quotation extraction (no indirect
    or mixed quotations) using rules and patterns. English only.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)

    Yields:
        (``spacy.Span``, ``spacy.Token``, ``spacy.Span``): next quotation in ``doc``
            represented as a (speaker, reporting verb, quotation) 3-tuple

    Notes:
        Loosely inspired by Krestel, Bergler, Witte. "Minding the Source: Automatic
        Tagging of Reported Speech in Newspaper Articles".

    TODO: Better approach would use ML, but needs a training dataset.
    """
    # unwrap a textacy.Doc into the underlying spacy.Doc; the rules below
    # are English-specific, so bail out early for any other language
    if isinstance(doc, textacy.Doc):
        if doc.lang != 'en':
            raise NotImplementedError('sorry, English-language texts only :(')
        doc = doc.spacy_doc
    quote_end_punct = {',', '.', '?', '!'}
    # character offsets of the opening and closing quote marks of every
    # "..." / ''...'' / ``...'' pair in the raw text
    # NOTE(review): ``doc.string`` is the old (pre-2.x) spaCy API — confirm
    # the spaCy version this targets
    quote_indexes = set(itertoolz.concat(
        (m.start(), m.end() - 1) for m in re.finditer(r"(\".*?\")|(''.*?'')|(``.*?'')", doc.string)))
    # pair up the tokens sitting at those char offsets as (open, close) token indexes
    quote_positions = list(itertoolz.partition(
        2, sorted(tok.i for tok in doc if tok.idx in quote_indexes)))
    sents = list(doc.sents)
    sent_positions = [(sent.start, sent.end) for sent in sents]

    for q0, q1 in quote_positions:
        # span covering the quotation, including both quote-mark tokens
        quote = doc[q0: q1 + 1]

        # we're only looking for direct quotes, not indirect or mixed
        # (a direct quote ends with punctuation just inside the closing mark)
        if not any(char in quote_end_punct for char in quote.text[-4:]):
            continue

        # get adjacent sentences
        candidate_sent_indexes = []
        for i, (s0, s1) in enumerate(sent_positions):

            # sentence overlapping / immediately following the quote's end...
            if s0 <= q1 + 1 and s1 > q1:
                candidate_sent_indexes.append(i)
            # ...or one overlapping / immediately preceding the quote's start
            elif s0 < q0 and s1 >= q0 - 1:
                candidate_sent_indexes.append(i)

        for si in candidate_sent_indexes:
            sent = sents[si]

            # get any reporting verbs
            # (lower-cased verbs whose lemma is in the REPORTING_VERBS lexicon,
            # excluding tokens that fall inside any quotation span)
            rvs = [tok for tok in sent
                   if spacy_utils.preserve_case(tok) is False and
                   tok.lemma_ in REPORTING_VERBS and
                   tok.pos_ == 'VERB' and
                   not any(oq0 <= tok.i <= oq1 for oq0, oq1 in quote_positions)]

            # get target offset against which to measure distances of NEs
            if rvs:
                if len(rvs) == 1:
                    rv = rvs[0]
                else:
                    # pick the reporting verb closest (by token index) to either
                    # end of the quote
                    min_rv_dist = 1000
                    for rv_candidate in rvs:
                        rv_dist = min(abs(rv_candidate.i - qp) for qp in (q0, q1))
                        if rv_dist < min_rv_dist:
                            rv = rv_candidate
                            min_rv_dist = rv_dist
                        else:
                            # NOTE(review): early break assumes distances are
                            # monotonic over ``rvs`` — confirm this is intended
                            break
            else:
                # TODO: do we have no other recourse?!
                continue

            try:
                # rv_subj = _find_subjects(rv)[0]
                rv_subj = get_subjects_of_verb(rv)[0]
            except IndexError:
                # reporting verb has no syntactic subject -> no speaker to name
                continue
    #         if rv_subj.text in {'he', 'she'}:
    #             for ne in named_entities(doc, good_ne_types={'PERSON'}):
    #                 if ne.start < rv_subj.i:
    #                     speaker = ne
    #                 else:
    #                     break
    #         else:
            # widen the subject token to its full compound-noun span
            span = get_span_for_compound_noun(rv_subj)
            speaker = doc[span[0]: span[1] + 1]

            yield (speaker, rv, quote)
            # one triple per quote: stop at the first sentence that yields one
            break
Example #4
0
def direct_quotations(doc):
    """
    Baseline, not-great attempt at direction quotation extraction (no indirect
    or mixed quotations) using rules and patterns. English only.

    Args:
        doc (``spacy.Doc``)

    Yields:
        (``spacy.Span``, ``spacy.Token``, ``spacy.Span``): next quotation in ``doc``
            represented as a (speaker, reporting verb, quotation) 3-tuple

    Notes:
        Loosely inspired by Krestel, Bergler, Witte. "Minding the Source: Automatic
        Tagging of Reported Speech in Newspaper Articles".

    TODO: Better approach would use ML, but needs a training dataset.
    """
    quote_end_punct = {',', '.', '?', '!'}
    # character offsets of the opening and closing quote marks of every
    # "..." / ''...'' / ``...'' pair in the raw text
    # NOTE(review): ``doc.string`` is the old (pre-2.x) spaCy API — confirm
    # the spaCy version this targets
    quote_indexes = set(itertoolz.concat(
        (m.start(), m.end() - 1) for m in re.finditer(r"(\".*?\")|(''.*?'')|(``.*?'')", doc.string)))
    # pair up the tokens sitting at those char offsets as (open, close) token indexes
    quote_positions = list(itertoolz.partition(
        2, sorted(tok.i for tok in doc if tok.idx in quote_indexes)))
    sents = list(doc.sents)
    sent_positions = [(sent.start, sent.end) for sent in sents]

    for q0, q1 in quote_positions:
        # span covering the quotation, including both quote-mark tokens
        quote = doc[q0: q1 + 1]

        # we're only looking for direct quotes, not indirect or mixed
        # (a direct quote ends with punctuation just inside the closing mark)
        if not any(char in quote_end_punct for char in quote.text[-4:]):
            continue

        # get adjacent sentences
        candidate_sent_indexes = []
        for i, (s0, s1) in enumerate(sent_positions):

            # sentence overlapping / immediately following the quote's end...
            if s0 <= q1 + 1 and s1 > q1:
                candidate_sent_indexes.append(i)
            # ...or one overlapping / immediately preceding the quote's start
            elif s0 < q0 and s1 >= q0 - 1:
                candidate_sent_indexes.append(i)

        for si in candidate_sent_indexes:
            sent = sents[si]

            # get any reporting verbs
            # (lower-cased verbs whose lemma is in the REPORTING_VERBS lexicon,
            # excluding tokens that fall inside any quotation span)
            rvs = [tok for tok in sent
                   if spacy_utils.preserve_case(tok) is False
                   and tok.lemma_ in REPORTING_VERBS
                   and tok.pos_ == 'VERB'
                   and not any(oq0 <= tok.i <= oq1 for oq0, oq1 in quote_positions)]

            # get target offset against which to measure distances of NEs
            if rvs:
                if len(rvs) == 1:
                    rv = rvs[0]
                else:
                    # pick the reporting verb closest (by token index) to either
                    # end of the quote
                    min_rv_dist = 1000
                    for rv_candidate in rvs:
                        rv_dist = min(abs(rv_candidate.i - qp) for qp in (q0, q1))
                        if rv_dist < min_rv_dist:
                            rv = rv_candidate
                            min_rv_dist = rv_dist
                        else:
                            # NOTE(review): early break assumes distances are
                            # monotonic over ``rvs`` — confirm this is intended
                            break
            else:
                # TODO: do we have no other recourse?!
                continue

            try:
                # rv_subj = _find_subjects(rv)[0]
                rv_subj = get_subjects_of_verb(rv)[0]
            except IndexError:
                # reporting verb has no syntactic subject -> no speaker to name
                continue
    #         if rv_subj.text in {'he', 'she'}:
    #             for ne in named_entities(doc, good_ne_types={'PERSON'}):
    #                 if ne.start < rv_subj.i:
    #                     speaker = ne
    #                 else:
    #                     break
    #         else:
            # widen the subject token to its full compound-noun span
            span = get_span_for_compound_noun(rv_subj)
            speaker = doc[span[0]: span[1] + 1]

            yield (speaker, rv, quote)
            # one triple per quote: stop at the first sentence that yields one
            break
Example #5
0
 def test_preserve_case(self):
     """Check that preserve_case flags exactly the expected tokens in the doc."""
     # 44 tokens total; only these positions should be preserved-case.
     expected = [0] * 44
     for idx in (11, 20, 39, 41, 42):
         expected[idx] = 1
     observed = [int(spacy_utils.preserve_case(tok)) for tok in self.spacy_doc]
     self.assertEqual(observed, expected)