def gen_tasks(tree: _ElementTree) -> Iterator[Task]:
    """Yield one Task per list item of the exported HTML tree."""
    for task_node in tree.findall('/body/ul/li'):
        task_dict = dict(node_to_dict(task_node))
        recurrence = None
        if task_dict.get('Recurrence info'):
            assert isinstance(task_dict, dict)
            recnode = cast(Dict[str, str], task_dict['Recurrence info'])
            recurrence = Recurrence(
                frequency=cast(Frequency, recnode['Frequency']),
                start=ensure(parse_timestamp_ms(recnode['Start'])),
                end=ensure(parse_timestamp_ms(recnode['End'])),
                hour=int(recnode['Hour of day to fire']),
                every=maybe(recnode.get('Every'), int) or 1,
                weekday_num=maybe(recnode.get('Weekday number'), int),
                day_of_month=maybe(recnode.get('Day number of month'), parse_day_num),
                day_of_week=maybe(recnode.get('Day of week'), lambda x: cast(Weekday, x)),
                month=maybe(recnode.get('Month of year'), lambda x: cast(Month, x)),
            )
        simple_fields = cast(Dict[str, str], task_dict)
        task = Task(
            title=simple_fields['Title'],
            created=ensure(parse_timestamp_ms(simple_fields['Created time'])),
            state=cast(State, simple_fields['State']),
            due=maybe(simple_fields.get('Due date'), parse_timestamp_ms),
            recurrence=recurrence,
        )
        print(task)
        yield task
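# Hypothetical usage sketch (not part of the original module; the file path is
# illustrative): drive gen_tasks from an exported HTML task list parsed with lxml.
def _demo_gen_tasks(path: str = "tasks.html") -> None:
    from lxml.etree import parse

    for task in gen_tasks(parse(path)):
        print(task.title, task.state, task.due)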
def get_mentions(tree: etree._ElementTree) -> ty.Dict[ty.Tuple[str, str], Mention]:
    """Extract the mentions from an ANCOR-TEI document."""
    mentions = tree.xpath(
        (
            './tei:standOff/tei:annotation[@tei:type="coreference"]'
            '/tei:spanGrp[@tei:subtype="mention"]/tei:span'
        ),
        namespaces=NSMAP,
    )
    if not mentions:
        raise ValueError("`tree` has no mention spans")
    features = get_fs(tree)
    texts_lst = tree.findall(f"{TEI}text")
    if not texts_lst:
        raise ValueError(
            "Attempting to extract mentions from a document without a text"
        )
    tokens_id_store = {
        xmlid(elt): elt for text in texts_lst for elt in text.iter(*TOKEN_TAGS)
    }
    res = {}
    for m_elt in mentions:
        try:
            m = Mention.from_urs(m_elt, tokens_id_store.get, features.get)
        except ValueError as e:
            # `m` is unbound when parsing fails, so refer to the source element
            logger.warning(f"Skipping span {xmlid(m_elt)}: {e}")
            continue
        if m.span_type not in MENTION_TYPES:
            if m.span_type in IGNORED_MENTION_TYPES:
                logger.debug(
                    f"Ignoring span {m.identifier!r} with mention type {m.span_type!r}"
                )
            else:
                logger.warning(
                    f"Span {m.identifier!r} has an invalid mention type ({m.span_type!r})"
                )
            continue
        res[(xmlid(m.targets[0]), xmlid(m.targets[-1]))] = m
    return res
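# Hypothetical usage sketch (not part of the original module; the file path is
# illustrative): list the mention spans of an ANCOR-TEI document, keyed by the
# xml ids of their first and last tokens.
def _demo_get_mentions(path: str = "document.tei.xml") -> None:
    from lxml import etree

    for (start_id, end_id), mention in get_mentions(etree.parse(path)).items():
        print(start_id, end_id, mention.span_type)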
def scan_fields(tree: _ElementTree) -> List[Dict[str, Set[str]]]:
    """Collect the distinct values seen for every task, location and recurrence field.

    Returns [task_fields, recurrence_fields, location_fields].
    """
    task_fields: Dict[str, Set[str]] = {}
    location_fields: Dict[str, Set[str]] = {}
    recurrence_fields: Dict[str, Set[str]] = {}
    for task_node in tree.findall('/body/ul/li'):
        task_dict = dict(node_to_dict(task_node))
        for key, val in task_dict.items():
            if key not in ('Recurrence info', 'Location'):
                task_fields.setdefault(key, set()).add(cast(str, val))
        if task_dict.get('Location'):
            location_dict = cast(Dict[str, str], task_dict['Location'])
            for key, val in location_dict.items():
                location_fields.setdefault(key, set()).add(val)
        if task_dict.get('Recurrence info'):
            recurrence_dict = cast(Dict[str, str], task_dict['Recurrence info'])
            for key, val in recurrence_dict.items():
                recurrence_fields.setdefault(key, set()).add(val)
    task_fields = chop(task_fields)
    location_fields = chop(location_fields)
    recurrence_fields = chop(recurrence_fields)
    return [task_fields, recurrence_fields, location_fields]
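# Hypothetical usage sketch (not part of the original module; the file path is
# illustrative): report every distinct value observed per field group.
def _demo_scan_fields(path: str = "tasks.html") -> None:
    from lxml.etree import parse

    task_fields, recurrence_fields, location_fields = scan_fields(parse(path))
    for group, fields in [
        ("task", task_fields),
        ("recurrence", recurrence_fields),
        ("location", location_fields),
    ]:
        for key, values in sorted(fields.items()):
            print(f"{group}.{key}: {sorted(values)}")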
def test__parse_node(valid_nodl: etree._ElementTree):
    nodes = valid_nodl.findall('node')
    node = nodl._parsing._v1._parsing._parse_node(nodes[1])
    assert node.actions and node.parameters and node.services and node.topics
def spans_from_doc(
    doc: etree._ElementTree,
    min_width: int = 1,
    max_width: int = 26,
    context: ty.Tuple[int, int] = (10, 10),
    length_buckets: ty.Optional[ty.Sequence[int]] = (1, 2, 3, 4, 5, 7, 15, 32, 63),
) -> ty.Iterable[MentionFeaturesDict]:
    """
    Return all the text spans of `#doc`, with their mention type, definiteness
    and anaphoricity (for spans that are not mentions, all of these are `None`).
    """
    w_pos = get_w_pos(doc)
    units = get_mentions(doc)
    nlp = spacy.load("fr_core_news_lg")
    nlp.tokenizer = CustomTokenizer(nlp.vocab)
    for utterance in doc.findall(".//tei:u", namespaces=NSMAP):
        content: ty.List[etree._Element] = list(utterance.iter(*TOKEN_TAGS))
        processed_utterance = nlp([t.text for t in content])
        ent_dict = {(e[0], e[-1]): e.label_ for e in processed_utterance.ents}
        noun_chunks = sorted(processed_utterance.noun_chunks)
        spans = generate_spans_with_context(
            zip(content, ty.cast(ty.Iterable[spacy.tokens.Token], processed_utterance)),
            min_width,
            max_width,
            *context,
        )
        for left_context_t, span_t, right_context_t in spans:
            # FIXME: Dirty way to split out spacy processing
            left_context, processed_left = (
                zip(*left_context_t) if left_context_t else ([], [])
            )
            right_context, processed_right = (
                zip(*right_context_t) if right_context_t else ([], [])
            )
            span, processed_span = zip(*span_t)
            start_id, end_id = xmlid(span[0]), xmlid(span[-1])
            mention = units.get((start_id, end_id))
            pos = [w.pos_ for w in (*processed_left, *processed_span, *processed_right)]
            lemma = [
                w.lemma_ for w in (*processed_left, *processed_span, *processed_right)
            ]
            morph = [
                morph_from_tag(w.tag_)
                for w in (*processed_left, *processed_span, *processed_right)
            ]
            left_context = [w.text for w in left_context]
            right_context = [w.text for w in right_context]
            # Pad with sentinel values when the span sits at an utterance boundary
            if len(left_context) < context[0]:
                left_context.insert(0, "<start>")
                pos.insert(0, "<start>")
                lemma.insert(0, "<start>")
                morph.insert(0, [])
            if len(right_context) < context[1]:
                right_context.append("<end>")
                pos.append("<end>")
                lemma.append("<end>")
                morph.append([])
            content = [w.text for w in span]
            length = (
                int(np.digitize(len(content), bins=length_buckets, right=True))
                if length_buckets is not None
                else len(content)
            )
            entity_type = ent_dict.get((processed_span[0], processed_span[-1]))
            chunk_inclusion = span_inclusion(processed_span, noun_chunks)
            # Mention-specific fields are None for spans that are not gold mentions
            yield {
                "content": content,
                "left_context": left_context,
                "right_context": right_context,
                "length": length,
                "type": mention.span_type if mention is not None else None,
                "new": mention.features.get("NEW", "_") if mention is not None else None,
                "def": mention.features.get("DEF", "_") if mention is not None else None,
                "id": mention.identifier if mention is not None else None,
                "start": w_pos[start_id],
                "end": w_pos[end_id],
                "pos": pos,
                "lemma": lemma,
                "morph": morph,
                "entity_type": entity_type,
                "chunk_inclusion": chunk_inclusion,
            }
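# Hypothetical usage sketch (not part of the original module; the file path is
# illustrative): count candidate spans that are gold mentions vs. the rest.
def _demo_spans_from_doc(path: str = "document.tei.xml") -> None:
    from lxml import etree

    n_mentions = n_other = 0
    for span in spans_from_doc(etree.parse(path)):
        if span["type"] is None:
            n_other += 1
        else:
            n_mentions += 1
    print(f"{n_mentions} mention spans, {n_other} other spans")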