def unified_test_dev_split(inf, ingoldf, keyin, goldkeyin, outf, keyout):
    """
    Removes the sentences in the gold file from inf (writing the remainder to
    outf) and drops their instances' keys from keyin (writing the remainder
    to keyout).
    """
    gold_sent_iter = peekable(iter_sentences(ingoldf))
    rm_inst_ids = []

    def sent_rm_gold(sent):
        gold_sent = gold_sent_iter.peek(None)
        if gold_sent is not None and gold_sent.attrib["id"] == sent.attrib["id"]:
            for instance in sent.xpath("./instance"):
                rm_inst_ids.append(instance.attrib["id"])
            next(gold_sent_iter)
            return BYPASS

    transform_sentences(inf, sent_rm_gold, outf)

    def next_rm():
        try:
            return rm_inst_ids.pop(0)
        except IndexError:
            return None

    rm_id = next_rm()
    for line in keyin:
        if rm_id == line.split()[0]:
            rm_id = next_rm()
            continue
        keyout.write(line)
    assert len(rm_inst_ids) == 0 and rm_id is None
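# The key filtering above assumes one key line per instance with the instance
# id as its first whitespace-separated field. Because rm_inst_ids is consumed
# front-to-back with next_rm(), this relies on the key file following corpus
# order; the final assert checks that every collected id was in fact seen.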
def iter_sentences_opensubs18_man_ann(stream):
    # XXX: This assumes a 1-1 imdb subtitle correspondence -- which should be
    # the case near the beginning where the man-ann takes place, but should be
    # fixed in general
    for sent in iter_sentences(stream):
        sources, imdb, sent_id = sent.attrib["id"].split("; ")
        sent_id = "stiff.{:010d}.000.{:08d}".format(int(imdb), int(sent_id))
        yield sent_id, sent
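# For example, a manual-annotation sentence id of the form (made-up values)
#
#   "OpenSubtitles2018; 1234567; 42"
#
# is normalised to the STIFF id "stiff.0001234567.000.00000042".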
def unigram(inf, keyout, wn):
    for sent in iter_sentences(inf):
        for instance in sent.xpath("instance"):
            inst_id = instance.attrib["id"]
            word, pos, lemmas = lemmas_from_instance(wn, instance)
            if not len(lemmas):
                sys.stderr.write("No lemma found for {} {}\n".format(word, pos))
                continue
            lemma = lemmas[0]
            write_lemma(keyout, inst_id, lemma)
def lex_ambg_hist_uni(inf, wn):
    hist = Counter()
    for sent in iter_sentences(inf):
        instances = sent.xpath("instance")
        for inst in instances:
            ambg = len(
                wn.lemmas(inst.attrib["lemma"], UNI_POS_WN_MAP[inst.attrib["pos"]])
            )
            hist[ambg] += 1
    return hist
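# `plot_train_entropies` below calls an `entropy(dist, insts)` helper that is
# imported elsewhere. A minimal sketch of the computation it presumably
# performs, assuming `dist` maps senses to counts and `insts` is their sum
# (`_entropy_sketch` is a hypothetical stand-in, not the canonical helper):
def _entropy_sketch(dist, insts):
    from math import log2

    # Shannon entropy of the sense distribution, in bits
    return -sum(
        (cnt / insts) * log2(cnt / insts) for cnt in dist.values() if cnt
    )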
def plot_train_entropies(
    eurosensetrainxml,
    eurosensetrainkey,
    stifftrainxml,
    stifftrainkey,
    semcorxml,
    semcorkey,
    outf,
):
    from statsmodels.sandbox.nonparametric import kernels
    from statsmodels.nonparametric.kde import bandwidths

    fig, (ax1, ax2, ax3) = pl.subplots(3, sharex=True, gridspec_kw={"hspace": 0.05})

    def add_to_data(data, dists):
        for dist in dists:
            insts = sum(dist.values())
            h = entropy(dist, insts)
            data.extend(h for _ in range(int(insts + 0.5)))

    # EuroSense/STIFF
    bw = None
    for inf, keyin, ax in [
        (eurosensetrainxml, eurosensetrainkey, ax1),
        (stifftrainxml, stifftrainkey, ax2),
    ]:
        data = []
        add_to_data(data, iter_dists_sup(inf, keyin))
        if bw is None:
            bw = bandwidths.select_bandwidth(data, "scott", kernels.Gaussian)
        sns.distplot(data, kde_kws=dict(bw=bw, gridsize=1000), ax=ax)
    # SemCor
    semcor_vocab = {}
    build_uni_sense_dist(iter_sentences(semcorxml), semcorkey, semcor_vocab)
    data = []
    add_to_data(data, semcor_vocab.values())
    # Plot
    sns.distplot(data, kde_kws=dict(bw=bw, gridsize=1000), ax=ax3)
    ax3.set_xlim(-0.1)
    ax3.set_xlabel("Entropy")
    ax1.set_ylabel("EuroSense instance density")
    ax2.set_ylabel("STIFF instance density")
    ax3.set_ylabel("SemCor instance density")
    fix_border(ax1)
    fix_border(ax2)
    fix_border(ax3)
    ax3.xaxis.set_minor_locator(MultipleLocator(0.1))
    fig.set_size_inches(441.0 / 72, 645.0 / 72)
    if outf:
        pl.savefig(outf, bbox_inches="tight")
    else:
        pl.show()
def unified_to_senseval(inf: IO, keyin: IO, outdir: str):
    """
    Converts from the unified format to a Senseval-3 -style format in
    individual files. The resulting files should be directly usable to train a
    single word model with ItMakesSense or can be gathered using
    senseval-gather. This is a scatter type operation.
    """
    out_files: Dict[str, str] = {}
    for sent_elem in iter_sentences(inf):
        for inst in sent_elem.xpath("instance"):
            lemma_str = inst.attrib["lemma"].lower()
            pos_str = inst.attrib["pos"]
            pos_chr = UNI_POS_WN_MAP[pos_str]
            lemma_pos = "{}.{}".format(lemma_str, pos_chr)
            # Write XML
            out_dir = pjoin(outdir, lemma_pos)
            if lemma_pos not in out_files:
                makedirs(out_dir, exist_ok=True)
                out_fn = pjoin(out_dir, "train.xml")
                out_f = open(out_fn, "w")
                lexical_sample_head(out_f)
                lexelt_head(lemma_str, pos_chr, out_f)
            else:
                out_fn = out_files[lemma_pos]
                out_f = open(out_fn, "a")
            with instance(inst, out_f):
                write_context(sent_elem, inst, out_f)
            out_f.close()
            # Write key file
            key_fn = pjoin(out_dir, "train.key")
            key_line = keyin.readline()
            key_id, key_synset = key_line.rstrip().split(" ", 1)
            assert key_id == inst.attrib["id"]
            if lemma_pos not in out_files:
                key_f = open(key_fn, "w")
            else:
                key_f = open(key_fn, "a")
            out_line = "{} {} {}\n".format(lemma_pos, key_id, key_synset)
            key_f.write(out_line)
            key_f.close()
            # Add to out_files
            if lemma_pos not in out_files:
                out_files[lemma_pos] = out_fn
    for out_fn in out_files.values():
        with open(out_fn, "a") as out_f:
            lexelt_foot(out_f)
            lexical_sample_foot(out_f)
def overlap_examples(inf):
    for sent in iter_sentences(inf):
        tok_lems = sent.xpath("./text[@id='zh-tok']")[0].text.split(" ")
        untok_lems = set()
        for ann in sent.xpath("./annotations/annotation[@lang='zh']"):
            anchor_positions = ann.attrib["anchor-positions"]
            for position in anchor_positions.split(" "):
                anchor = parse_qs_single(position)
                source = anchor["from-id"]
                if source == "zh-untok":
                    untok_lems.add(ann.attrib["lemma"])
        for untok_lem in untok_lems:
            if not any(untok_lem in tok_lem for tok_lem in tok_lems):
                print("Not a substring:", untok_lem)
                for text in sent.xpath("./text"):
                    print(text.text)
def sent_report(inf, report_cb, subtotal=None):
    sents = 0
    done = False
    try:
        # XXX: take into account token length for coverage
        for sent in iter_sentences(inf):
            yield sent
            sents += 1
            if subtotal is not None and sents % subtotal == 0:
                print(f"Report at {sents} sentences:")
                report_cb()
        done = True
    finally:
        if sents:
            if not done:
                print(f"Terminated early after {sents} sentences.")
            else:
                print(f"Finished after {sents} sentences.")
            report_cb()
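# sent_report is a generator wrapper: callers iterate over the sentences it
# yields while it takes care of periodic and final reporting. A usage sketch
# (report_stats is a hypothetical callback):
#
#   for sent in sent_report(inf, report_stats, subtotal=10000):
#       ...  # accumulate whatever report_stats prints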
def unified_to_ukb(inf, outf, extract_extra):
    from stiff.extract.fin import FinExtractor

    if extract_extra:
        extractor = FinExtractor()
    for sent_elem in iter_sentences(inf):
        bits = []
        for instance in sent_elem.xpath("instance"):
            id = instance.attrib["id"]
            lemma = instance.attrib["lemma"].lower()
            pos = UNI_POS_WN_MAP[instance.attrib["pos"]]
            bits.append(f"{lemma}#{pos}#{id}#1")
        if extract_extra:
            elems = sent_elem.xpath("wf|instance")
            toks = [node.text for node in elems]
            known_idxs = {
                idx for idx, elem in enumerate(elems) if elem.tag == "instance"
            }
            tagging = extractor.extract_toks(toks, list(fake_starts(toks)))
            for tok_idx, tok in enumerate(tagging.tokens):
                if tok_idx in known_idxs:
                    continue
                extra_id = 0
                lemma_poses = set()
                for tag in tok.tags:
                    for wn, lemma_obj in tag.lemma_objs:
                        lemma_name = lemma_obj.name().lower().strip()
                        if lemma_name == "":
                            continue
                        lemma_pos = lemma_obj.synset().pos()
                        if lemma_pos == "s":
                            lemma_pos = "a"
                        lemma_poses.add((lemma_name, lemma_pos))
                for lemma, pos in lemma_poses:
                    bits.append(f"{lemma}#{pos}#xT{tok_idx}N{extra_id}#0")
                    extra_id += 1
        if bits:
            outf.write(sent_elem.attrib["id"])
            outf.write("\n")
            outf.write(" ".join(bits))
            outf.write("\n")
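# The output follows UKB's context file format: one line giving the context
# id, then one line of whitespace-separated lemma#pos#id#flag entries, where
# flag 1 marks words to disambiguate and flag 0 marks context-only words.
# Illustrative output for one sentence (made-up ids; the xT{tok_idx}N{extra_id}
# pattern for extra lemmas comes from the code above):
#
#   sent1
#   dog#n#i1#1 run#v#xT3N0#0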
def lesk_pp(mean, inf, keyout, include_wfs, expand, exclude_cand, score_by):
    aggf = ALL_MEANS[mean]
    lesk_pp = LeskPP(numberbatch_multispace, aggf, False, expand)
    for sent_idx, sent in enumerate(iter_sentences(inf)):
        if include_wfs:
            instances = sent.xpath("instance|wf")
            sent = [inst.text for inst in instances]
            tagged_sent = sent_finnpos(sent)
        else:
            instances = sent.xpath("instance")
            tagged_sent = None
        sent_lemmas = []
        instance_ids = []
        # XXX: SHOULD add wfs too! (equiv to wn_filter)
        for idx, instance in enumerate(instances):
            if instance.tag == "wf":
                lemma_str = tagged_sent[idx][1]
                lemmas = []
            else:
                lemma_str, _pos, lemmas = lemmas_from_instance(fiwn_encnt, instance)
            sent_lemmas.append((lemma_str, lemmas))
            if instance.tag == "instance":
                instance_ids.append(instance.attrib["id"])
        disambg_order = sorted(
            (len(lemmas), idx)
            for idx, (lemma_str, lemmas) in enumerate(sent_lemmas)
            if len(lemmas) > 0
        )
        for ambiguity, lemma_idx in disambg_order:
            if ambiguity <= 1:
                continue
            lemma_str, lemmas = sent_lemmas[lemma_idx]
            # XXX: Should context_vec exclude the word being disambiguated
            context_vec = lesk_pp.mk_ctx_vec(
                sent_lemmas, *([lemma_idx] if exclude_cand else [])
            )
            if context_vec is None:
                logger.debug("No context vec, backing off to MFS")
                # Back off to MFS
                sent_lemmas[lemma_idx] = (lemma_str, [lemmas[0]])
            else:
                logger.debug(f"Got context vec {context_vec}")
                best_lemma = None
                best_score = -2
                for lemma in lemmas:
                    logger.debug(f"Considering lemma: {lemma}")
                    defn_vec = lesk_pp.mk_defn_vec(lemma)
                    logger.debug(f"Got defn_vec: {defn_vec}")
                    if defn_vec is None:
                        defn_ctx_score = 0
                    else:
                        defn_ctx_score = cosine_sim(defn_vec, context_vec)
                    try:
                        lemma_vec = mk_lemma_vec(lemma)
                    except KeyError:
                        # XXX: Is this reasonable, or should there be a penalty?
                        lemma_ctx_score = defn_ctx_score
                    else:
                        logger.debug(f"Got lemma_vec: {lemma_vec}")
                        lemma_ctx_score = cosine_sim(lemma_vec, context_vec)
                    if score_by == "both":
                        score = defn_ctx_score + lemma_ctx_score
                    elif score_by == "defn":
                        score = defn_ctx_score
                    elif score_by == "lemma":
                        score = lemma_ctx_score
                    else:
                        assert False
                    logger.debug(
                        f"Score: {score} ({defn_ctx_score} + {lemma_ctx_score})"
                    )
                    if score > best_score:
                        best_lemma = lemma
                        best_score = score
                sent_lemmas[lemma_idx] = (lemma_str, [best_lemma])
        instance_sent_lemmas = (x for x in sent_lemmas if len(x[1]) > 0)
        for (lemma_str, lemmas), inst_id in zip(instance_sent_lemmas, instance_ids):
            if lemmas[0] is None:
                continue
            write_lemma(keyout, inst_id, lemmas[0])
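# `cosine_sim` is imported elsewhere in this module; a minimal sketch of the
# standard computation it is assumed to perform (dense numpy vectors assumed;
# `_cosine_sim_sketch` is a hypothetical stand-in, not the canonical helper):
def _cosine_sim_sketch(a, b):
    import numpy as np

    denom = np.linalg.norm(a) * np.linalg.norm(b)
    # Treat zero vectors as having no similarity rather than dividing by zero
    return float(np.dot(a, b) / denom) if denom else 0.0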
def iter_sentences_eurosense(stream):
    for sent_elem in iter_sentences(stream):
        yield "eurosense.{:08d}".format(int(sent_elem.attrib["id"])), sent_elem
def unified_to_senseval(
    inf: IO,
    keyin: IO,
    outdir: str,
    exclude_word: List[str],
    write_tag: bool,
    synset_group: bool,
    filter_key: Optional[IO],
):
    """
    Converts from the unified format to a Senseval-3 -style format in
    individual files. The resulting files should be directly usable to train a
    single word model with ItMakesSense or can be gathered using
    senseval-gather. This is a scatter type operation.
    """

    def train_out(tag):
        if tag:
            return "train.tag.xml"
        else:
            return "train.xml"

    seen_keys: Set[str] = set()
    filter = None
    if filter_key is not None:
        filter = pickle.load(filter_key)
    for sent_elem in iter_sentences(inf):
        for inst in sent_elem.xpath("instance"):

            def read_key():
                key_line = keyin.readline()
                key_id, key_synset = key_line.rstrip().split(" ", 1)
                assert key_id == inst.attrib["id"]
                return key_id, key_synset

            lemma_str = inst.attrib["lemma"].lower()
            key_id, key_synset = read_key()
            if lemma_str in exclude_word:
                continue
            pos_str = inst.attrib["pos"]
            pos_chr = UNI_POS_WN_MAP[pos_str]
            lemma_pos = "{}.{}".format(lemma_str, pos_chr)
            if synset_group:
                group_keys = key_synset.split(" ")
            else:
                group_keys = [lemma_pos]
            for group_key in group_keys:
                if filter is not None and group_key not in filter:
                    continue
                new_group = group_key not in seen_keys
                seen_keys.add(group_key)
                # Make dir
                group_dir = pjoin(outdir, group_key)
                if new_group:
                    makedirs(group_dir, exist_ok=True)

                # Write XML
                def write_xml(tag=False):
                    out_fn = pjoin(group_dir, train_out(tag))
                    if new_group:
                        out_f = open(out_fn, "w")
                        lexical_sample_head(out_f)
                        if synset_group:
                            lexelt_synset_head(group_key, out_f)
                        else:
                            lexelt_head(lemma_str, pos_chr, out_f)
                    else:
                        out_f = open(out_fn, "a")
                    with instance(inst, out_f):
                        write_context(sent_elem, inst, out_f, write_tag=tag)
                    out_f.close()

                write_xml()
                if write_tag:
                    write_xml(True)
                # Write key file
                key_fn = pjoin(group_dir, "train.key")
                if new_group:
                    key_f = open(key_fn, "w")
                else:
                    key_f = open(key_fn, "a")
                out_line = "{} {} {}\n".format(lemma_pos, key_id, key_synset)
                key_f.write(out_line)
                key_f.close()
    for group_key in seen_keys:

        def write_foot(tag=False):
            out_fn = pjoin(outdir, group_key, train_out(tag))
            with open(out_fn, "a") as out_f:
                lexelt_foot(out_f)
                lexical_sample_foot(out_f)

        write_foot(False)
        if write_tag:
            write_foot(True)
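# With the default lemma.pos grouping, the scatter above produces one
# directory per group key (the lemma "dog" is illustrative):
#
#   outdir/
#       dog.n/
#           train.xml      # Senseval-3 style lexical sample
#           train.tag.xml  # only when write_tag is set
#           train.key      # lines of "lemma.pos instance-id sense-key"
#
# With synset_group set, directories are keyed by synset instead, and an
# instance is scattered into every synset listed on its key line.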
def cov(inf, subtotal=None):
    """
    Produce a report of how much of a corpus in Eurosense/STIFF format is
    covered by annotations.
    """
    source = Stream()
    header = pd.DataFrame(
        {"toks": [], "anns": [], "unambg_anns": [], "uniq_anns": [], "cover": []}
    )
    sdf = DataFrame(source, example=header)
    sums = {}
    for col in ["toks", "anns", "unambg_anns", "uniq_anns", "cover"]:
        sums[col] = getattr(sdf, col).sum().stream.gather().sink_to_list()
    ambg_source = Stream()
    ambg_header = pd.DataFrame({"ambg": []})
    ambg_sdf = DataFrame(ambg_source, example=ambg_header)
    ambg_hist = ambg_sdf.ambg.value_counts().stream.gather().sink_to_list()

    def print_cov():
        sents = len(sums["anns"])
        print("Coverage at {} sentences:".format(sents))
        print(
            "Total annotations, unique annotations, unambiguous annotations, "
            "tokens, tokens covered, proportion of tokens covered"
        )
        print(
            sums["anns"][-1],
            sums["uniq_anns"][-1],
            sums["unambg_anns"][-1],
            sums["toks"][-1],
            sums["cover"][-1],
            sums["cover"][-1] / sums["toks"][-1],
        )
        print(ambg_hist[-1])

    try:
        # XXX: take into account token length for coverage
        for idx, sent in enumerate(iter_sentences(inf)):
            toks = len(sent.xpath("text")[0].text.split(" "))
            anns = sent.xpath("annotations/annotation")
            num_anns = len(anns)
            ann_index = {}
            cov_map = [0] * toks
            for ann in anns:
                tok, tok_len = get_ann_pos(ann)
                ann_index.setdefault(tok, []).append(ann)
                for cov_idx in range(tok, tok + tok_len):
                    cov_map[cov_idx] += 1
            unambg_anns = 0
            uniq_anns = 0
            for ann_list in ann_index.values():
                ambg = len(ann_list)
                ambg_source.emit(pd.DataFrame({"ambg": [ambg]}))
                if ambg == 1:
                    unambg_anns += 1
                uniq_anns += 1
            source.emit(
                pd.DataFrame(
                    {
                        "toks": [toks],
                        "anns": [num_anns],
                        "unambg_anns": [unambg_anns],
                        "uniq_anns": [uniq_anns],
                        "cover": [toks - cov_map.count(0)],
                    }
                )
            )
            idx1 = idx + 1
            if subtotal is not None and idx1 % subtotal == 0:
                print_cov()
    finally:
        if len(sums["anns"]):
            print_cov()
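# Stream and DataFrame above appear to be the streamz/streamz.dataframe APIs
# (an assumption based on the emit/gather/sink_to_list usage): each sentence
# emits one row into a streaming frame, so the running sums and the ambiguity
# histogram stay current and print_cov can report subtotals on the fly.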