def sent_rm_pos(sent):
    """Filter out single-token annotations whose FinnPOS properties match.

    For every ``./annotations/annotation`` element under ``sent`` that
    covers exactly one token, the properties of that token are looked up in
    the result of ``get_finnpos_analys(sent)`` and tested against each
    predicate in the enclosing-scope ``to_remove``.  Annotations for which
    any predicate returns a truthy value are left out of the list handed to
    ``trim_anns``; annotations spanning more than one token are always kept.

    NOTE(review): ``trim_anns(anns, new_anns)`` presumably removes from the
    sentence every annotation that is not in ``new_anns`` -- confirm
    against its definition.
    """
    analyses = get_finnpos_analys(sent)
    anns = sent.xpath("./annotations/annotation")
    kept = []
    for ann in anns:
        tok, tok_len = get_ann_pos(ann)
        if tok_len == 1:
            # Element 1 of the per-token analysis is what the predicates
            # in ``to_remove`` are applied to.
            props = analyses[tok][1]
            if any(match(props) for match in to_remove):
                continue
        kept.append(ann)
    trim_anns(anns, kept)
def sent_span_dom(sent):
    """Select annotations per token position via ``greedy_max_span``.

    Annotations under ``sent`` are grouped by their starting token (as
    reported by ``get_ann_pos``) into ``{token: [(token_length, ann), ...]}``
    and that mapping is passed to ``greedy_max_span``.  When the
    enclosing-scope flag ``sup_only`` is truthy, only annotations for which
    ``HasSupportTournament.rank`` is truthy take part in the selection, and
    all remaining annotations are appended to the selection result
    afterwards so that they are passed to ``trim_anns`` as well.

    NOTE(review): ``trim_anns(anns, new_anns)`` presumably drops the
    annotations missing from ``new_anns`` -- confirm against its definition.
    """
    anns = sent.xpath("./annotations/annotation")

    by_token = {}
    for ann in anns:
        # Equivalent to skipping when ``sup_only and not rank(ann)``;
        # ``rank`` is only evaluated when ``sup_only`` is truthy.
        if not sup_only or HasSupportTournament.rank(ann):
            tok, tok_len = get_ann_pos(ann)
            by_token.setdefault(tok, []).append((tok_len, ann))

    new_anns = greedy_max_span(by_token)

    if sup_only:
        # Annotations excluded from the selection above are kept as they are.
        for ann in anns:
            if HasSupportTournament.rank(ann):
                continue
            new_anns.append(ann)

    trim_anns(anns, new_anns)
def sent_rm_ambg(sent):
    """Filter out annotations whose span is shared with another annotation.

    The span of each ``./annotations/annotation`` element under ``sent`` is
    obtained from ``get_ann_pos``.  Spans are counted, and only annotations
    whose span occurs exactly once are included in the list handed to
    ``trim_anns``.

    NOTE(review): ``trim_anns(anns, new_anns)`` presumably removes the
    annotations that are absent from ``new_anns`` -- confirm against its
    definition.
    """
    anns = sent.xpath("./annotations/annotation")

    # First pass: how many annotations claim each span.
    occurrences = {}
    for ann in anns:
        span = get_ann_pos(ann)
        occurrences[span] = occurrences.get(span, 0) + 1

    # Second pass: keep only the annotations that are alone on their span.
    unambiguous = [ann for ann in anns if occurrences[get_ann_pos(ann)] < 2]
    trim_anns(anns, unambiguous)
def sent_span_dom(sent):
    """Select annotations per character position via ``greedy_max_span``.

    For each ``./annotations/annotation`` element under ``sent`` the start
    offset is ``int(get_ann_pos_dict(ann)["char"])`` and the end offset is
    the start plus the length of the ``anchor`` attribute.  The "span" of
    an annotation is the number of annotation start offsets (its own
    included) lying in the closed interval ``[start, end]``.  Annotations
    are grouped as ``{start: [(span, ann), ...]}``, the mapping is passed
    to ``greedy_max_span`` and the result is handed to ``trim_anns``.

    NOTE(review): ``trim_anns(anns, new_anns)`` presumably removes the
    annotations that are absent from ``new_anns`` -- confirm against its
    definition.
    """
    # Imported locally because the module's import block is not visible
    # from this function.
    from bisect import bisect_left, bisect_right

    anns = sent.xpath("./annotations/annotation")

    # Resolve every start offset once and reuse it for both the sorted
    # offset list and the per-annotation grouping below.
    anchored = [(int(get_ann_pos_dict(ann)["char"]), ann) for ann in anns]
    starts = sorted(start for start, _ in anchored)

    char_positions = {}
    for cur_start, ann in anchored:
        end = cur_start + len(ann.attrib["anchor"])
        # ``starts`` is sorted, so the offsets within [cur_start, end] form
        # one contiguous run: it begins at the first occurrence of
        # ``cur_start`` and stops just before the first offset > ``end``.
        first_idx = bisect_left(starts, cur_start)
        span = bisect_right(starts, end) - first_idx
        char_positions.setdefault(cur_start, []).append((span, ann))

    new_anns = greedy_max_span(char_positions)
    trim_anns(anns, new_anns)