import sys
import os
import logging
import cPickle
from cPickle import dump
from os.path import splitext

import xml.etree.cElementTree as et

# Project-internal names used below (config, get_annotator, TransDict, Lookup,
# LookupKeepKeys, FreqScorer, RandScorer, DictUpperScorer, ModelUpperScorer,
# filter_functions, Draw) are assumed to be imported from elsewhere in this
# package; their module paths are not shown here.

log = logging.getLogger(__name__)


def lemmatize(infname, lang, outf=sys.stdout, sent_tag="seg",
              encoding="utf-8", replace_unknown_lemma=True):
    """
    Lemmatize reference translations in mteval format (may work for other
    formats too)

    Parameters
    ----------
    infname: str
        name of input file in mteval xml format
    lang: str
        two-letter language identifier
    outf: file or str, optional
        file or filename for output
    sent_tag: str, optional
        name of the xml tag that holds the sentence text
    encoding: str, optional
        character encoding for output (should be the same as that of the
        input)
    replace_unknown_lemma: bool, optional
        replace an unknown lemma by the word itself
    """
    annotator = get_annotator(lang, replace_unknown_lemma=replace_unknown_lemma)
    log.info("using annotator " + annotator.__class__.__name__)

    log.info("reading evaluation data from file " + infname)
    etree = et.ElementTree(file=infname)

    sentences = [sent_elem.text for sent_elem in etree.iter(sent_tag)]
    log.debug(u"input:\n" + u"\n".join(sentences))

    graph_list = annotator.annot_sentences(sentences)

    # replace the original sentence text by its space-separated lemmas
    for sent_elem, graph in zip(etree.iter(sent_tag), graph_list):
        lemma_text = " ".join(graph.source_lemmas())
        sent_elem.text = lemma_text

    log.info("writing lemmatized evaluation data to {0}".format(
        getattr(outf, "name", outf)))
    etree.write(outf, encoding=encoding)
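# A hypothetical invocation (the file names are made up for illustration):
#
#   lemmatize("newstest2011-ref.de.sgm", "de",
#             outf="lemma_newstest2011-ref.de.sgm")
#
# This writes a copy of the mteval file in which the text of every <seg>
# element has been replaced by its lemmatized form.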
def preprocess(data_set, lang_pair):
    """
    Annotate the source side of an evaluation data set, look up translations,
    attach baseline and upper-bound scores, and pickle the resulting
    translation graphs.
    """
    source_lang, target_lang = lang_pair.split("-")

    graphs_fname = config["eval"][data_set][lang_pair]["graphs_fname"]
    out_dir = os.path.dirname(graphs_fname)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    # annotate
    annotator = get_annotator(source_lang)
    graph_list = annotator.annot_xml_file(
        config["eval"][data_set][lang_pair]["src_fname"])

    # lookup translations
    trans_dict = TransDict.load(config["dict"][lang_pair]["pkl_fname"])
    lookup = Lookup(trans_dict)
    lookup(graph_list)

    # score most frequent translation
    freq_score = FreqScorer(config["count"]["lemma"][target_lang]["pkl_fname"])
    freq_score(graph_list)

    # dict upper scores
    lemma_ref_fname = \
        config["eval"][data_set][lang_pair]["lemma_ref_fname"]
    scorer = DictUpperScorer(lemma_ref_fname)
    scorer(graph_list)

    # model upper scores
    ambig_fname = config["sample"][lang_pair]["ambig_fname"]
    filter_func = filter_functions(source_lang)
    scorer = ModelUpperScorer(lemma_ref_fname, ambig_fname, filter_func)
    scorer(graph_list)

    # save graphs
    log.info("saving preprocessed graphs to " + graphs_fname)
    with open(graphs_fname, "wb") as outf:
        cPickle.dump(graph_list, outf)
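# A hypothetical call (the data set key is made up; it must exist under the
# "eval" section of the config):
#
#   preprocess("newstest2011", "en-de")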
def make_graphs():
    """
    Create annotated translation graphs with scores for random translation,
    most frequent translation and approximated maximum. Also create minimal
    translation dictionaries for these graphs, and drawings.
    """
    for lang_pair, src_fname, lemma_ref_fname in [
            ("en-de",
             "sample_newstest2011-src.en.sgm",
             "lemma_sample_newstest2011-ref.de.sgm"),
            ("de-en",
             "sample_out_de-en.src",
             "lemma_sample_out_de-en.ref")]:
        source_lang, target_lang = lang_pair.split("-")
        root_fname = splitext(src_fname)[0]

        # annotate
        annotator = get_annotator(source_lang)
        graphs = annotator.annot_xml_file(src_fname)

        # lookup
        dict_fname = config["dict"][lang_pair]["pkl_fname"]
        trans_dict = TransDict.load(dict_fname)
        lookup = LookupKeepKeys(trans_dict)
        lookup(graphs)

        # write pickle of minimal translation dict
        min_dict = lookup.get_minimal_trans_dict()
        min_dict_fname = "dict_" + root_fname + ".pkl"
        with open(min_dict_fname, "wb") as outf:
            dump(min_dict, outf)

        # score most frequent translation
        counts_fname = config["count"]["lemma"][target_lang]["pkl_fname"]
        freq_score = FreqScorer(counts_fname)
        freq_score(graphs)

        # score random translation
        rand_score = RandScorer()
        rand_score(graphs)

        # dict upper score
        maxscore = DictUpperScorer(lemma_ref_fname)
        maxscore(graphs)

        # model upper scores
        ambig_fname = config["sample"][lang_pair]["ambig_fname"]
        filter_func = filter_functions(source_lang)
        scorer = ModelUpperScorer(lemma_ref_fname, ambig_fname, filter_func)
        scorer(graphs)

        # draw graphs
        draw = Draw()
        draw(graphs, out_format="pdf",
             base_score_attrs=["dup_score", "mup_score", "freq_score",
                               "rand_score"],
             out_dir="_draw_" + lang_pair)

        # save graphs
        graphs_fname = "graphs_" + root_fname + ".pkl"
        with open(graphs_fname, "wb") as outf:
            dump(graphs, outf)
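# The pickled graphs can later be restored with cPickle, e.g. (the file name
# follows from the first source file listed above):
#
#   graphs = cPickle.load(open("graphs_sample_newstest2011-src.en.pkl", "rb"))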