Example #1
def split(key, seq):
    """
    Split a sequence by a key function.

    :param key: Key function.
    :param seq: Sequence to split.
    """
    items = (tuple(v) for k, v in groupby(seq, key))
    items = partition(2, items)
    items = ((v1[-1], v2) for v1, v2 in items)
    yield from items
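A minimal usage sketch of split (values are illustrative; assumes groupby is itertools.groupby and partition is the toolz/cytoolz partition used throughout these examples):

seq = ['a', 1, 2, 'b', 3, 4]  # hypothetical data: strings act as separators
list(split(lambda x: isinstance(x, str), seq))
# -> [('a', (1, 2)), ('b', (3, 4))]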
Example #2
    def __iter__(self):
        if not self.filepath:
            raise OSError("{} database dump file {} not found; "
                          "has the dataset been downloaded yet?".format(
                              self.project, self.filepath))

        is_bad_category = is_bad_category_funcs.get(self.project,
                                                    {}).get(self.lang)
        bad_wl_starts = _bad_wiki_link_starts.get(self.project,
                                                  {}).get(self.lang, tuple())

        lines = tio.read_json(self.filepath, mode="rb", lines=True)
        for index, source in itertoolz.partition(2, lines):
            if source.get("namespace") != self.namespace:
                continue
            # split opening text from main body text, if available
            opening_text = source.get("opening_text")
            text = source.get("text")
            if opening_text and text and text.startswith(opening_text):
                text = opening_text + "\n\n" + text[len(opening_text):].strip()
            # do minimal cleaning of categories and wiki links, if available
            if is_bad_category:
                categories = tuple(cat for cat in source.get("category", [])
                                   if not is_bad_category(cat))
            else:
                categories = tuple(source.get("category", []))
            wiki_links = tuple(
                wl for wl in source.get("outgoing_link", [])
                if not any(wl.startswith(bwls) for bwls in bad_wl_starts))
            yield {
                "page_id": index["index"]["_id"],
                "title": source["title"],
                "text": text,
                "headings": tuple(source.get("heading", [])),
                "wiki_links": wiki_links,
                "ext_links": tuple(
                    urllib.parse.unquote_plus(el)
                    for el in source.get("external_link", [])),
                "categories": categories,
                "dt_created": source.get("create_timestamp"),
                "n_incoming_links": source.get("incoming_links"),
                "popularity_score": source.get("popularity_score"),
            }
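The pairing above relies on partition(2, lines) walking an Elasticsearch-style dump in which each page's index-metadata record is immediately followed by its source record. A minimal sketch with hypothetical records:

from cytoolz import itertoolz  # assumption: the same itertoolz used above

lines = [
    {"index": {"_id": "11"}}, {"title": "Page A", "namespace": 0},
    {"index": {"_id": "22"}}, {"title": "Page B", "namespace": 0},
]
for index, source in itertoolz.partition(2, lines):
    print(index["index"]["_id"], source["title"])
# -> 11 Page A
# -> 22 Page B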
Example #3
    def assign(self, *args, **kwargs):
        df = self
        for name, value in sorted(chain(partition(2, args), kwargs.items()),
                                  key=itemgetter(0)):
            # TODO if isinstance(value, np.ndarray):
            #     value = self.ctx.array(value)
            # TODO elif isinstance(value, pd.DataFrame):
            # ...
            # TODO elif not isinstance(value, Dataset):
            #     value = self.ctx.collection(value)
            def assign_values(part, values):
                extended = part.assign(**{name: ensure_collection(values)})
                return extended

            src = df.zip_partitions(value, assign_values)
            df = DistributedDataFrame(src, df.index.copy(), df.columns.copy())
            df.columns.append(pd.Series([Column(df, name)], [name]))
        return df
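In assign, partition(2, args) is what lets positional arguments alternate as name/value pairs alongside keyword arguments. A small sketch of just that pairing step (hypothetical values):

from itertools import chain
from operator import itemgetter

from toolz import partition  # assumption: the same partition used above

args = ('b', [4, 5, 6], 'a', [1, 2, 3])
kwargs = {'c': [7, 8, 9]}
sorted(chain(partition(2, args), kwargs.items()), key=itemgetter(0))
# -> [('a', [1, 2, 3]), ('b', [4, 5, 6]), ('c', [7, 8, 9])]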
Example #4
def direct_quotations(doc):
    """
    Baseline, not-great attempt at direct quotation extraction (no indirect
    or mixed quotations) using rules and patterns. English only.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)

    Yields:
        (``spacy.Span``, ``spacy.Token``, ``spacy.Span``): next quotation in ``doc``
        represented as a (speaker, reporting verb, quotation) 3-tuple

    Notes:
        Loosely inspired by Krestel, Bergler, Witte. "Minding the Source: Automatic
        Tagging of Reported Speech in Newspaper Articles".

    TODO: Better approach would use ML, but needs a training dataset.
    """
    if hasattr(doc, 'spacy_doc'):
        doc_lang = doc.lang
        doc = doc.spacy_doc
    else:
        doc_lang = doc.vocab.lang
    if doc_lang != 'en':
        raise NotImplementedError('sorry, English-language texts only :(')
    quote_end_punct = {',', '.', '?', '!'}
    quote_indexes = set(
        itertoolz.concat((m.start(), m.end() - 1) for m in re.finditer(
            r"(\".*?\")|(''.*?'')|(``.*?'')", doc.string)))
    quote_positions = list(
        itertoolz.partition(
            2, sorted(tok.i for tok in doc if tok.idx in quote_indexes)))
    sents = list(doc.sents)
    sent_positions = [(sent.start, sent.end) for sent in sents]

    for q0, q1 in quote_positions:
        quote = doc[q0:q1 + 1]

        # we're only looking for direct quotes, not indirect or mixed
        if not any(char in quote_end_punct for char in quote.text[-4:]):
            continue

        # get adjacent sentences
        candidate_sent_indexes = []
        for i, (s0, s1) in enumerate(sent_positions):

            if s0 <= q1 + 1 and s1 > q1:
                candidate_sent_indexes.append(i)
            elif s0 < q0 and s1 >= q0 - 1:
                candidate_sent_indexes.append(i)

        for si in candidate_sent_indexes:
            sent = sents[si]

            # get any reporting verbs
            rvs = [
                tok for tok in sent
                if spacy_utils.preserve_case(tok) is False
                and tok.lemma_ in constants.REPORTING_VERBS
                and tok.pos_ == 'VERB'
                and not any(oq0 <= tok.i <= oq1 for oq0, oq1 in quote_positions)
            ]

            # get target offset against which to measure distances of NEs
            if rvs:
                if len(rvs) == 1:
                    rv = rvs[0]
                else:
                    min_rv_dist = 1000
                    for rv_candidate in rvs:
                        rv_dist = min(
                            abs(rv_candidate.i - qp) for qp in (q0, q1))
                        if rv_dist < min_rv_dist:
                            rv = rv_candidate
                            min_rv_dist = rv_dist
                        else:
                            break
            else:
                # TODO: do we have no other recourse?!
                continue

            try:
                # rv_subj = _find_subjects(rv)[0]
                rv_subj = spacy_utils.get_subjects_of_verb(rv)[0]
            except IndexError:
                continue
    #         if rv_subj.text in {'he', 'she'}:
    #             for ne in named_entities(doc, good_ne_types={'PERSON'}):
    #                 if ne.start < rv_subj.i:
    #                     speaker = ne
    #                 else:
    #                     break
    #         else:
            span = spacy_utils.get_span_for_compound_noun(rv_subj)
            speaker = doc[span[0]:span[1] + 1]

            yield (speaker, rv, quote)
            break
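In both quotation extractors on this page, partition(2, ...) pairs the sorted quote-mark token indices into (opening, closing) spans. A tiny illustration with made-up indices:

from cytoolz import itertoolz  # assumption: same import as the examples above

qtok_idxs = [3, 9, 15, 22]  # hypothetical token indices of quote marks
list(itertoolz.partition(2, qtok_idxs))
# -> [(3, 9), (15, 22)]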
Example #5
def test_partition():
    assert list(partition(2, [1, 2, 3, 4])) == [(1, 2), (3, 4)]
    assert list(partition(3, range(7))) == [(0, 1, 2), (3, 4, 5)]
    assert list(partition(3, range(4), pad=-1)) == [(0, 1, 2),
                                                    (3, -1, -1)]
    assert list(partition(2, [])) == []
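As the range(7) case shows, partition silently drops an incomplete trailing group unless a pad value is supplied; one more assertion in the same style (any pad value works, e.g. None):

assert list(partition(3, range(7), pad=None)) == [(0, 1, 2), (3, 4, 5), (6, None, None)]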
Example #7
def direct_quotations(doc):
    """
    Baseline, not-great attempt at direct quotation extraction (no indirect
    or mixed quotations) using rules and patterns. English only.

    Args:
        doc (``spacy.Doc``)

    Yields:
        (``spacy.Span``, ``spacy.Token``, ``spacy.Span``): next quotation in ``doc``
            represented as a (speaker, reporting verb, quotation) 3-tuple

    Notes:
        Loosely inspired by Krestel, Bergler, Witte. "Minding the Source: Automatic
        Tagging of Reported Speech in Newspaper Articles".

    TODO: Better approach would use ML, but needs a training dataset.
    """
    quote_end_punct = {',', '.', '?', '!'}
    quote_indexes = set(itertoolz.concat(
        (m.start(), m.end() - 1) for m in re.finditer(r"(\".*?\")|(''.*?'')|(``.*?'')", doc.string)))
    quote_positions = list(itertoolz.partition(
        2, sorted(tok.i for tok in doc if tok.idx in quote_indexes)))
    sents = list(doc.sents)
    sent_positions = [(sent.start, sent.end) for sent in sents]

    for q0, q1 in quote_positions:
        quote = doc[q0: q1 + 1]

        # we're only looking for direct quotes, not indirect or mixed
        if not any(char in quote_end_punct for char in quote.text[-4:]):
            continue

        # get adjacent sentences
        candidate_sent_indexes = []
        for i, (s0, s1) in enumerate(sent_positions):

            if s0 <= q1 + 1 and s1 > q1:
                candidate_sent_indexes.append(i)
            elif s0 < q0 and s1 >= q0 - 1:
                candidate_sent_indexes.append(i)

        for si in candidate_sent_indexes:
            sent = sents[si]

            # get any reporting verbs
            rvs = [tok for tok in sent
                   if spacy_utils.preserve_case(tok) is False
                   and tok.lemma_ in REPORTING_VERBS
                   and tok.pos_ == 'VERB'
                   and not any(oq0 <= tok.i <= oq1 for oq0, oq1 in quote_positions)]

            # get target offset against which to measure distances of NEs
            if rvs:
                if len(rvs) == 1:
                    rv = rvs[0]
                else:
                    min_rv_dist = 1000
                    for rv_candidate in rvs:
                        rv_dist = min(abs(rv_candidate.i - qp) for qp in (q0, q1))
                        if rv_dist < min_rv_dist:
                            rv = rv_candidate
                            min_rv_dist = rv_dist
                        else:
                            break
            else:
                # TODO: do we have no other recourse?!
                continue

            try:
                # rv_subj = _find_subjects(rv)[0]
                rv_subj = get_subjects_of_verb(rv)[0]
            except IndexError:
                continue
    #         if rv_subj.text in {'he', 'she'}:
    #             for ne in named_entities(doc, good_ne_types={'PERSON'}):
    #                 if ne.start < rv_subj.i:
    #                     speaker = ne
    #                 else:
    #                     break
    #         else:
            span = get_span_for_compound_noun(rv_subj)
            speaker = doc[span[0]: span[1] + 1]

            yield (speaker, rv, quote)
            break
Example #8
def direct_quotations(doc: Doc) -> Iterable[DQTriple]:
    """
    Extract direct quotations with an attributable speaker from a document
    using simple rules and patterns. Does not extract indirect or mixed quotations!

    Args:
        doc

    Yields:
        Next direct quotation in ``doc`` as a (speaker, cue, content) triple.

    Notes:
        Loosely inspired by Krestel, Bergler, Witte. "Minding the Source: Automatic
        Tagging of Reported Speech in Newspaper Articles".
    """
    # TODO: train a model to do this instead, maybe similar to entity recognition
    try:
        _reporting_verbs = constants.REPORTING_VERBS[doc.lang_]
    except KeyError:
        raise ValueError(
            f"direct quotation extraction is not implemented for lang='{doc.lang_}', "
            f"only {sorted(constants.REPORTING_VERBS.keys())}")
    qtok_idxs = [tok.i for tok in doc if tok.is_quote]
    if len(qtok_idxs) % 2 != 0:
        raise ValueError(
            f"{len(qtok_idxs)} quotation marks found, indicating an unclosed quotation; "
            "given the limitations of this method, it's safest to bail out "
            "rather than guess which quotation is unclosed")
    qtok_pair_idxs = list(itertoolz.partition(2, qtok_idxs))
    for qtok_start_idx, qtok_end_idx in qtok_pair_idxs:
        content = doc[qtok_start_idx:qtok_end_idx + 1]
        cue = None
        speaker = None
        # filter quotations by content
        if (
                # quotations should have at least a couple tokens
                # excluding the first/last quotation mark tokens
                len(content) < 4
                # filter out titles of books and such, if possible
                or all(tok.is_title for tok in content
                       # if tok.pos in {NOUN, PROPN}
                       if not (tok.is_punct or tok.is_stop))
                # TODO: require closing punctuation before the quotation mark?
                # content[-2].is_punct is False
        ):
            continue
        # get window of adjacent/overlapping sentences
        window_sents = (
            sent for sent in doc.sents
            # these boundary cases are a subtle bit of work...
            if ((
                sent.start < qtok_start_idx and sent.end >= qtok_start_idx - 1
            ) or (sent.start <= qtok_end_idx + 1 and sent.end > qtok_end_idx)))
        # get candidate cue verbs in window
        cue_cands = [
            tok for sent in window_sents for tok in sent
            if (tok.pos == VERB and tok.lemma_ in _reporting_verbs
                # cue verbs must occur *outside* any quotation content
                and not any(qts_idx <= tok.i <= qte_idx
                            for qts_idx, qte_idx in qtok_pair_idxs))
        ]
        # sort candidates by proximity to quote content
        cue_cands = sorted(
            cue_cands,
            key=lambda cc: min(abs(cc.i - qtok_start_idx),
                               abs(cc.i - qtok_end_idx)),
        )
        for cue_cand in cue_cands:
            if cue is not None:
                break
            for speaker_cand in cue_cand.children:
                if speaker_cand.dep in _ACTIVE_SUBJ_DEPS:
                    cue = expand_verb(cue_cand)
                    speaker = expand_noun(speaker_cand)
                    break
        if content and cue and speaker:
            yield DQTriple(
                speaker=sorted(speaker, key=attrgetter("i")),
                cue=sorted(cue, key=attrgetter("i")),
                content=content,
            )
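A hedged usage sketch for this last version (assumes a spaCy English pipeline such as en_core_web_sm is installed and that direct_quotations and its helpers are importable as defined above):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumption: the model has been downloaded
doc = nlp('"The results look promising," said the lead author.')
for speaker, cue, content in direct_quotations(doc):
    # speaker and cue are lists of tokens; content is the quoted span
    print([tok.text for tok in speaker], [tok.text for tok in cue], content.text)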