from itertools import groupby

# assuming toolz's partition; any (n, seq) -> n-tuples chunker would do here
from toolz import partition


def split(key, seq):
    """
    Split a sequence by a key function.

    :param key: Key function.
    :param seq: Sequence to split.
    """
    # group consecutive elements by key, pair up adjacent groups, then yield
    # (last element of first group, second group) 2-tuples
    items = (tuple(v) for k, v in groupby(seq, key))
    items = partition(2, items)
    items = ((v1[-1], v2) for v1, v2 in items)
    yield from items
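
# A quick usage sketch (hypothetical data): with a key function that marks
# delimiter-like elements, split() yields (delimiter, chunk) pairs, assuming
# the sequence starts with a delimiter run and alternates from there.
is_header = lambda line: line.startswith("#")
lines = ["# one", "a", "b", "# two", "c"]
assert list(split(is_header, lines)) == [("# one", ("a", "b")), ("# two", ("c",))]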

def __iter__(self):
    if not self.filepath:
        raise OSError(
            "{} database dump file {} not found; "
            "has the dataset been downloaded yet?".format(self.project, self.filepath)
        )
    is_bad_category = is_bad_category_funcs.get(self.project, {}).get(self.lang)
    bad_wl_starts = _bad_wiki_link_starts.get(self.project, {}).get(self.lang, tuple())
    lines = tio.read_json(self.filepath, mode="rb", lines=True)
    for index, source in itertoolz.partition(2, lines):
        if source.get("namespace") != self.namespace:
            continue
        # split opening text from main body text, if available
        opening_text = source.get("opening_text")
        text = source.get("text")
        if opening_text and text and text.startswith(opening_text):
            text = opening_text + "\n\n" + text[len(opening_text):].strip()
        # do minimal cleaning of categories and wiki links, if available
        if is_bad_category:
            categories = tuple(
                cat for cat in source.get("category", [])
                if not is_bad_category(cat)
            )
        else:
            categories = tuple(source.get("category", []))
        wiki_links = tuple(
            wl for wl in source.get("outgoing_link", [])
            if not any(wl.startswith(bwls) for bwls in bad_wl_starts)
        )
        yield {
            "page_id": index["index"]["_id"],
            "title": source["title"],
            "text": text,
            "headings": tuple(source.get("heading", [])),
            "wiki_links": wiki_links,
            "ext_links": tuple(
                urllib.parse.unquote_plus(el)
                for el in source.get("external_link", [])
            ),
            "categories": categories,
            "dt_created": source.get("create_timestamp"),
            "n_incoming_links": source.get("incoming_links"),
            "popularity_score": source.get("popularity_score"),
        }
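
# A minimal sketch of the record layout the loop above relies on (hypothetical
# field values): the JSON-lines dump alternates Elasticsearch-style index
# metadata lines with page source lines, so itertoolz.partition(2, ...)
# recovers (index, source) pairs.
from cytoolz import itertoolz

dump_lines = [
    {"index": {"_type": "page", "_id": "12"}},               # metadata line
    {"namespace": 0, "title": "Anarchism", "text": "..."},   # page source line
]
(index, source), = itertoolz.partition(2, dump_lines)
assert index["index"]["_id"] == "12" and source["title"] == "Anarchism"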

def assign(self, *args, **kwargs):
    df = self
    # positional arguments are flat (name, value) pairs; merge them with
    # kwargs and apply in sorted-by-name order
    for name, value in sorted(
        chain(partition(2, args), kwargs.items()), key=itemgetter(0)
    ):
        # TODO if isinstance(value, np.ndarray):
        #     value = self.ctx.array(value)
        # TODO elif isinstance(value, pd.DataFrame):
        #     ...
        # TODO elif not isinstance(value, Dataset):
        #     value = self.ctx.collection(value)

        def assign_values(part, values, name=name):
            # bind name via a default argument: partitions may be evaluated
            # lazily, after the loop variable has moved on
            extended = part.assign(**{name: ensure_collection(values)})
            return extended

        src = df.zip_partitions(value, assign_values)
        df = DistributedDataFrame(src, df.index.copy(), df.columns.copy())
        df.columns.append(pd.Series([Column(df, name)], [name]))
    return df
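
# The (name, value) normalization used above, shown in isolation (a runnable
# sketch assuming toolz's partition, which matches the pairing behavior used
# here): positional args are consumed as flat name/value pairs, merged with
# kwargs, then sorted by column name so assignment order is deterministic.
from itertools import chain
from operator import itemgetter

from toolz import partition


def normalize_assignments(*args, **kwargs):
    return sorted(chain(partition(2, args), kwargs.items()), key=itemgetter(0))


assert normalize_assignments("b", [4, 5], a=[1, 2]) == [("a", [1, 2]), ("b", [4, 5])]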

def direct_quotations(doc):
    """
    Baseline, not-great attempt at direct quotation extraction (no indirect
    or mixed quotations) using rules and patterns. English only.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)

    Yields:
        (``spacy.Span``, ``spacy.Token``, ``spacy.Span``): next quotation in ``doc``
        represented as a (speaker, reporting verb, quotation) 3-tuple

    Notes:
        Loosely inspired by Krestel, Bergler, Witte. "Minding the Source: Automatic
        Tagging of Reported Speech in Newspaper Articles".

    TODO: Better approach would use ML, but needs a training dataset.
    """
    if hasattr(doc, 'spacy_doc'):
        doc_lang = doc.lang
        doc = doc.spacy_doc
    else:
        doc_lang = doc.vocab.lang
    if doc_lang != 'en':
        raise NotImplementedError('sorry, English-language texts only :(')
    quote_end_punct = {',', '.', '?', '!'}
    quote_indexes = set(itertoolz.concat(
        (m.start(), m.end() - 1)
        for m in re.finditer(r"(\".*?\")|(''.*?'')|(``.*?'')", doc.string)))
    quote_positions = list(itertoolz.partition(
        2, sorted(tok.i for tok in doc if tok.idx in quote_indexes)))
    sents = list(doc.sents)
    sent_positions = [(sent.start, sent.end) for sent in sents]
    for q0, q1 in quote_positions:
        quote = doc[q0:q1 + 1]
        # we're only looking for direct quotes, not indirect or mixed
        if not any(char in quote_end_punct for char in quote.text[-4:]):
            continue
        # get adjacent sentences
        candidate_sent_indexes = []
        for i, (s0, s1) in enumerate(sent_positions):
            if s0 <= q1 + 1 and s1 > q1:
                candidate_sent_indexes.append(i)
            elif s0 < q0 and s1 >= q0 - 1:
                candidate_sent_indexes.append(i)
        for si in candidate_sent_indexes:
            sent = sents[si]
            # get any reporting verbs
            rvs = [
                tok for tok in sent
                if spacy_utils.preserve_case(tok) is False
                and tok.lemma_ in constants.REPORTING_VERBS
                and tok.pos_ == 'VERB'
                and not any(oq0 <= tok.i <= oq1 for oq0, oq1 in quote_positions)
            ]
            # get target offset against which to measure distances of NEs
            if rvs:
                if len(rvs) == 1:
                    rv = rvs[0]
                else:
                    min_rv_dist = 1000
                    for rv_candidate in rvs:
                        rv_dist = min(abs(rv_candidate.i - qp) for qp in (q0, q1))
                        if rv_dist < min_rv_dist:
                            rv = rv_candidate
                            min_rv_dist = rv_dist
                        else:
                            break
            else:
                # TODO: do we have no other recourse?!
                continue
            try:
                # rv_subj = _find_subjects(rv)[0]
                rv_subj = spacy_utils.get_subjects_of_verb(rv)[0]
            except IndexError:
                continue
            # if rv_subj.text in {'he', 'she'}:
            #     for ne in named_entities(doc, good_ne_types={'PERSON'}):
            #         if ne.start < rv_subj.i:
            #             speaker = ne
            #         else:
            #             break
            # else:
            span = spacy_utils.get_span_for_compound_noun(rv_subj)
            speaker = doc[span[0]:span[1] + 1]
            yield (speaker, rv, quote)
            break
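
# Standalone check of the quote-boundary regex used above (hypothetical text):
# each match covers one quoted span; its first and last character offsets are
# collected so the enclosing tokens can be located via tok.idx.
import re

text = 'He said, "Go home." Then she replied, "No way."'
spans = [
    (m.start(), m.end() - 1, m.group())
    for m in re.finditer(r"(\".*?\")|(''.*?'')|(``.*?'')", text)
]
assert spans == [(9, 18, '"Go home."'), (38, 46, '"No way."')]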

from toolz import partition


def test_partition():
    assert list(partition(2, [1, 2, 3, 4])) == [(1, 2), (3, 4)]
    # without a pad value, an incomplete trailing group is dropped
    assert list(partition(3, range(7))) == [(0, 1, 2), (3, 4, 5)]
    # with a pad value, the trailing group is filled out instead
    assert list(partition(3, range(4), pad=-1)) == [(0, 1, 2), (3, -1, -1)]
    assert list(partition(2, [])) == []
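
# For contrast with the padding behavior tested above: toolz's partition_all
# keeps the incomplete trailing group rather than dropping or padding it.
from toolz import partition_all

assert list(partition_all(3, range(7))) == [(0, 1, 2), (3, 4, 5), (6,)]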

def direct_quotations(doc):
    """
    Baseline, not-great attempt at direct quotation extraction (no indirect
    or mixed quotations) using rules and patterns. English only.

    Args:
        doc (``spacy.Doc``)

    Yields:
        (``spacy.Span``, ``spacy.Token``, ``spacy.Span``): next quotation in ``doc``
        represented as a (speaker, reporting verb, quotation) 3-tuple

    Notes:
        Loosely inspired by Krestel, Bergler, Witte. "Minding the Source: Automatic
        Tagging of Reported Speech in Newspaper Articles".

    TODO: Better approach would use ML, but needs a training dataset.
    """
    quote_end_punct = {',', '.', '?', '!'}
    quote_indexes = set(itertoolz.concat(
        (m.start(), m.end() - 1)
        for m in re.finditer(r"(\".*?\")|(''.*?'')|(``.*?'')", doc.string)))
    quote_positions = list(itertoolz.partition(
        2, sorted(tok.i for tok in doc if tok.idx in quote_indexes)))
    sents = list(doc.sents)
    sent_positions = [(sent.start, sent.end) for sent in sents]
    for q0, q1 in quote_positions:
        quote = doc[q0:q1 + 1]
        # we're only looking for direct quotes, not indirect or mixed
        if not any(char in quote_end_punct for char in quote.text[-4:]):
            continue
        # get adjacent sentences
        candidate_sent_indexes = []
        for i, (s0, s1) in enumerate(sent_positions):
            if s0 <= q1 + 1 and s1 > q1:
                candidate_sent_indexes.append(i)
            elif s0 < q0 and s1 >= q0 - 1:
                candidate_sent_indexes.append(i)
        for si in candidate_sent_indexes:
            sent = sents[si]
            # get any reporting verbs
            rvs = [
                tok for tok in sent
                if spacy_utils.preserve_case(tok) is False
                and tok.lemma_ in REPORTING_VERBS
                and tok.pos_ == 'VERB'
                and not any(oq0 <= tok.i <= oq1 for oq0, oq1 in quote_positions)
            ]
            # get target offset against which to measure distances of NEs
            if rvs:
                if len(rvs) == 1:
                    rv = rvs[0]
                else:
                    min_rv_dist = 1000
                    for rv_candidate in rvs:
                        rv_dist = min(abs(rv_candidate.i - qp) for qp in (q0, q1))
                        if rv_dist < min_rv_dist:
                            rv = rv_candidate
                            min_rv_dist = rv_dist
                        else:
                            break
            else:
                # TODO: do we have no other recourse?!
                continue
            try:
                # rv_subj = _find_subjects(rv)[0]
                rv_subj = get_subjects_of_verb(rv)[0]
            except IndexError:
                continue
            # if rv_subj.text in {'he', 'she'}:
            #     for ne in named_entities(doc, good_ne_types={'PERSON'}):
            #         if ne.start < rv_subj.i:
            #             speaker = ne
            #         else:
            #             break
            # else:
            span = get_span_for_compound_noun(rv_subj)
            speaker = doc[span[0]:span[1] + 1]
            yield (speaker, rv, quote)
            break
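
# Standalone sketch of the nearest-verb selection above (hypothetical token
# positions): among candidate indices, pick the one closest to either quote
# boundary. (The loop above also breaks early once distances stop shrinking.)
q0, q1 = 10, 18
candidates = [5, 22, 40]
nearest = min(candidates, key=lambda i: min(abs(i - q0), abs(i - q1)))
assert nearest == 22  # distance 4 to q1, vs. 5 and 22 for the others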

def direct_quotations(doc: Doc) -> Iterable[DQTriple]:
    """
    Extract direct quotations with an attributable speaker from a document
    using simple rules and patterns. Does not extract indirect or mixed quotations!

    Args:
        doc

    Yields:
        Next direct quotation in ``doc`` as a (speaker, cue, content) triple.

    Notes:
        Loosely inspired by Krestel, Bergler, Witte. "Minding the Source: Automatic
        Tagging of Reported Speech in Newspaper Articles".
    """
    # TODO: train a model to do this instead, maybe similar to entity recognition
    try:
        _reporting_verbs = constants.REPORTING_VERBS[doc.lang_]
    except KeyError:
        raise ValueError(
            f"direct quotation extraction is not implemented for lang='{doc.lang_}', "
            f"only {sorted(constants.REPORTING_VERBS.keys())}"
        )
    qtok_idxs = [tok.i for tok in doc if tok.is_quote]
    if len(qtok_idxs) % 2 != 0:
        raise ValueError(
            f"{len(qtok_idxs)} quotation marks found, indicating an unclosed quotation; "
            "given the limitations of this method, it's safest to bail out "
            "rather than guess which quotation is unclosed"
        )
    qtok_pair_idxs = list(itertoolz.partition(2, qtok_idxs))
    for qtok_start_idx, qtok_end_idx in qtok_pair_idxs:
        content = doc[qtok_start_idx:qtok_end_idx + 1]
        cue = None
        speaker = None
        # filter quotations by content
        if (
            # quotations should have at least a couple tokens
            # excluding the first/last quotation mark tokens
            len(content) < 4
            # filter out titles of books and such, if possible
            or all(
                tok.is_title
                for tok in content
                # if tok.pos in {NOUN, PROPN}
                if not (tok.is_punct or tok.is_stop)
            )
            # TODO: require closing punctuation before the quotation mark?
            # content[-2].is_punct is False
        ):
            continue
        # get window of adjacent/overlapping sentences
        window_sents = (
            sent
            for sent in doc.sents
            # these boundary cases are a subtle bit of work...
            if (
                (sent.start < qtok_start_idx and sent.end >= qtok_start_idx - 1)
                or (sent.start <= qtok_end_idx + 1 and sent.end > qtok_end_idx)
            )
        )
        # get candidate cue verbs in window
        cue_cands = [
            tok
            for sent in window_sents
            for tok in sent
            if (
                tok.pos == VERB
                and tok.lemma_ in _reporting_verbs
                # cue verbs must occur *outside* any quotation content
                and not any(
                    qts_idx <= tok.i <= qte_idx
                    for qts_idx, qte_idx in qtok_pair_idxs
                )
            )
        ]
        # sort candidates by proximity to quote content
        cue_cands = sorted(
            cue_cands,
            key=lambda cc: min(abs(cc.i - qtok_start_idx), abs(cc.i - qtok_end_idx)),
        )
        for cue_cand in cue_cands:
            if cue is not None:
                break
            for speaker_cand in cue_cand.children:
                if speaker_cand.dep in _ACTIVE_SUBJ_DEPS:
                    cue = expand_verb(cue_cand)
                    speaker = expand_noun(speaker_cand)
                    break
        if content and cue and speaker:
            yield DQTriple(
                speaker=sorted(speaker, key=attrgetter("i")),
                cue=sorted(cue, key=attrgetter("i")),
                content=content,
            )
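
# Hedged usage sketch, assuming spaCy with an installed English pipeline
# (e.g. "en_core_web_sm") and that direct_quotations() is importable from the
# surrounding module; the exact output depends on the model.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp('"I brought snacks," she said.')
for speaker, cue, content in direct_quotations(doc):
    print([tok.text for tok in speaker], [tok.text for tok in cue], content.text)
# e.g.: ['she'] ['said'] "I brought snacks,"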