def test_read_ref_trans(self):
    """
    Check that read_ref_trans parses the sample reference translations.

    Reads "lemma_sample_out_de-en.ref" from the configured test data
    directory and verifies the document ids, the segment ids of document
    "test", and the reference translations of segment "5".
    """
    refset = read_ref_trans(
        config["test_data_dir"] + "/lemma_sample_out_de-en.ref")

    # check doc ids
    # keys() is wrapped in list() because a dict view (Python 3) never
    # compares equal to a list; the order of the keys is still checked
    assert list(refset.keys()) == ["test"]

    # check seg ids
    assert list(refset["test"].keys()) == ['1', '2', '3', '4', '5']

    # check translations
    assert refset["test"]["5"] == [
        'all other aspect be secondary .',
        'the rest be secondary .',
        'All the rest be secondary .',
        'All the rest be secondary .',
        'the rest be secondary .']
def trans_diff(inf, score_attrs, ref_fname=None, colwidth=32,
               outf=codecs.getwriter('utf8')(sys.stdout)):
    """
    Report translation differences

    Outputs all cases where translations differ when selected on score_attr.
    If reference translations are provided, it also shows the reference
    translation sentences as well as a guess of the reference lemma(s) per
    source lemma.

    Parameters
    ----------
    inf: list or str
        list of TransGraph instances or filename of pickled graphs
    score_attrs: list of strings
        list of scoring attributes (at least two)
    ref_fname: str
        filename of reference translations in mteval xml format
    colwidth: int
        column width
    outf: file or str
        file or filename for output; a file opened here from a filename is
        closed before returning, a file object passed in is left open.
        The default writer wraps the sys.stdout that was current when this
        function was defined.

    Raises
    ------
    AssertionError
        if fewer than two scoring attributes are given

    Notes
    -----
    Does not support multi-word expressions.

    A candidate without a score is reported as "n/a" with the lemma
    "__NONE__".
    """
    assert len(score_attrs) > 1, "need at least two scoring attributes"

    # string type check that works on both Python 2 and Python 3
    try:
        string_types = basestring
    except NameError:
        string_types = str

    if isinstance(inf, string_types):
        try:
            import cPickle as pickle
        except ImportError:
            import pickle
        # NOTE(review): unpickling can execute arbitrary code; only load
        # graph files from trusted sources.
        # Binary mode is required for non-text pickle protocols, and the
        # context manager makes sure the file handle is released.
        with open(inf, "rb") as pickle_file:
            inf = pickle.load(pickle_file)

    # remember whether the output file is ours to close
    opened_outf = isinstance(outf, string_types)
    if opened_outf:
        outf = codecs.open(outf, "w", encoding="utf-8")

    try:
        no_cols = 1 + len(score_attrs)

        if ref_fname:
            ref_trans = read_ref_trans(ref_fname, flatten=True)
            ref_counts = read_ref_trans_counts(ref_fname, flatten=True)
            no_cols += 1

        # without reference translations no target lemma can match
        ref_lemmas = set()

        bar = no_cols * colwidth * u"=" + u"\n"
        subbar = no_cols * colwidth * u"-" + u"\n"

        for i, graph in enumerate(inf):
            diffs = graph_trans_diff(graph, score_attrs)

            if not diffs:
                continue

            # segment header
            outf.write(bar)
            outf.write(u"SEGMENT {} (id={})\n".format(
                graph.graph.get("n"), graph.graph.get("id")))
            outf.write(bar + u"\n")

            outf.write(u"SRC: {}\n".format(graph.source_string()))

            if ref_fname:
                for ref_sentence in ref_trans[i]:
                    outf.write(u"REF: {}\n".format(ref_sentence))

            outf.write(u"\n")

            # column headers
            outf.write(u"SRC LEMPOS:".ljust(colwidth))

            for attr in score_attrs:
                outf.write((attr.upper() + ":").ljust(colwidth))

            if ref_fname:
                outf.write(u"REF TRANS:".ljust(colwidth))

            outf.write(u"\n" + subbar)

            # one row per source node with differing translations
            for source_node, max_scores in diffs.items():
                if ref_fname:
                    ref_lemmas = get_ref_lemmas(graph, source_node,
                                                ref_counts[i])

                outf.write(graph.lempos(source_node).ljust(colwidth))

                for score, target_node in max_scores:
                    if score is not None:
                        target_lemma = graph.lemma(target_node)
                        score_str = u"{:.4f}".format(score)
                    else:
                        # a missing score cannot be formatted as a float
                        target_lemma = u"__NONE__"
                        score_str = u"n/a"

                    pair = u"{}: {}: {}".format(
                        "+" if target_lemma in ref_lemmas else "-",
                        score_str,
                        target_lemma)
                    outf.write(pair.ljust(colwidth))

                if ref_fname:
                    outf.write(", ".join(ref_lemmas or ["---"]))

                outf.write(u"\n")

            outf.write(u"\n")
    finally:
        if opened_outf:
            outf.close()