Esempio n. 1
0
def lemmatize(infname, lang, outf=sys.stdout, sent_tag="seg",
              encoding="utf-8", replace_unknown_lemma=True):
    """
    Lemmatize reference translations in mteval format (may work for other
    formats too)

    The text of every element tagged `sent_tag` is replaced by the
    space-separated source lemmas of the graph which the annotator returns
    for it. The modified tree is then written to `outf`.
    
    Parameters
    ----------
    infname: str
        name of inputfile in mteval xml format
    lang: str
        two-letter language identifier
    outf: file or str, optional
        file or filename for output
    sent_tag: str, optional
        tag of the xml elements whose text is lemmatized
    encoding: str, optional
        char encoding for output (should be the same as that of input)
    replace_unknown_lemma: bool, optional
        replace unknown lemma by word
    """
    annotator = get_annotator(lang, replace_unknown_lemma=replace_unknown_lemma)
    log.info("using annotator " + annotator.__class__.__name__)
    log.info("reading evaluation data from file " + infname)
    etree = et.ElementTree(file=infname)

    # Collect the sentence elements once, so that reading the text and
    # writing the lemmas back operate on exactly the same elements.
    sent_elems = list(etree.iter(sent_tag))
    # An element without text has text None, which would make the join
    # below fail; treat it as an empty sentence instead.
    # NOTE(review): assumes the annotator accepts empty sentences -- confirm
    sentences = [sent_elem.text or u"" for sent_elem in sent_elems]
    log.debug(u"input:\n" + u"\n".join(sentences))
    graph_list = annotator.annot_sentences(sentences)

    for sent_elem, graph in zip(sent_elems, graph_list):
        sent_elem.text = " ".join(graph.source_lemmas())

    # outf is either a file object (report its name) or a filename
    log.info("writing lemmatized evaluation data to {0}".format(
        getattr(outf, "name", outf)))

    etree.write(outf, encoding=encoding)
Esempio n. 2
0
def preprocess(data_set, lang_pair):
    """
    Annotate, translate and score the source text of an evaluation data set
    and save the resulting graphs as a pickle.

    All file names are read from the global `config`. The directory of the
    graphs file is created if it does not exist yet.

    Parameters
    ----------
    data_set: str
        key of the data set in config["eval"]
    lang_pair: str
        source and target language identifier joined by a hyphen
    """
    source_lang, target_lang = lang_pair.split("-")
    eval_config = config["eval"][data_set][lang_pair]
    graphs_fname = eval_config["graphs_fname"]
    out_dir = os.path.dirname(graphs_fname)
    # out_dir is empty when graphs_fname has no directory part, in which
    # case there is nothing to create; makedirs also creates missing parents
    if out_dir and not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # annotate
    annotator = get_annotator(source_lang)
    graph_list = annotator.annot_xml_file(eval_config["src_fname"])

    # lookup translations
    trans_dict = TransDict.load(config["dict"][lang_pair]["pkl_fname"])
    lookup = Lookup(trans_dict)
    lookup(graph_list)

    # score most frequent translation
    freq_score = FreqScorer(config["count"]["lemma"][target_lang]["pkl_fname"])
    freq_score(graph_list)

    # dict upper scores
    lemma_ref_fname = eval_config["lemma_ref_fname"]
    dict_upper_score = DictUpperScorer(lemma_ref_fname)
    dict_upper_score(graph_list)

    # model upper scores
    ambig_fname = config["sample"][lang_pair]["ambig_fname"]
    source_filter = filter_functions(source_lang)
    model_upper_score = ModelUpperScorer(lemma_ref_fname, ambig_fname,
                                         source_filter)
    model_upper_score(graph_list)

    # save graphs
    log.info("saving preprocessed graphs to " + graphs_fname)
    # close the file explicitly so the pickle is flushed to disk on return
    with open(graphs_fname, "wb") as graphs_file:
        cPickle.dump(graph_list, graphs_file)

    
Esempio n. 3
0
def make_graphs():
    """
    Create annotated translations graphs with scores for random translation,
    most frequent translation and approximated maximum. Also create minimal
    translation dictionaries for these graphs and drawings.

    For each of the hard-coded samples, the following is written relative
    to the current directory: "dict_<root>.pkl" (minimal translation dict),
    "graphs_<root>.pkl" (scored graphs) and drawings in "_draw_<lang_pair>",
    where <root> is the source file name without its extension.
    """
    samples = [
        ("en-de",
         "sample_newstest2011-src.en.sgm",
         "lemma_sample_newstest2011-ref.de.sgm"),
        ("de-en",
         "sample_out_de-en.src",
         "lemma_sample_out_de-en.ref"),
    ]

    for lang_pair, src_fname, lemma_ref_fname in samples:
        source_lang, target_lang = lang_pair.split("-")
        root_fname = splitext(src_fname)[0]

        # annotate
        annotator = get_annotator(source_lang)
        graphs = annotator.annot_xml_file(src_fname)

        # lookup
        dict_fname = config["dict"][lang_pair]["pkl_fname"]
        trans_dict = TransDict.load(dict_fname)
        lookup = LookupKeepKeys(trans_dict)
        lookup(graphs)

        # write pickle of minimal translation dict
        min_dict = lookup.get_minimal_trans_dict()
        min_dict_fname = "dict_" + root_fname + ".pkl"
        # close the file explicitly so the pickle is flushed to disk
        with open(min_dict_fname, "wb") as min_dict_file:
            dump(min_dict, min_dict_file)

        # score most frequent translation
        counts_fname = config["count"]["lemma"][target_lang]["pkl_fname"]
        freq_score = FreqScorer(counts_fname)
        freq_score(graphs)

        # score random translation
        rand_score = RandScorer()
        rand_score(graphs)

        # dict upper score
        maxscore = DictUpperScorer(lemma_ref_fname)
        maxscore(graphs)

        # model upper scores
        ambig_fname = config["sample"][lang_pair]["ambig_fname"]
        source_filter = filter_functions(source_lang)
        scorer = ModelUpperScorer(lemma_ref_fname, ambig_fname, source_filter)
        scorer(graphs)

        # draw graphs
        draw = Draw()
        draw(graphs, out_format="pdf",
             base_score_attrs=["dup_score", "mup_score", "freq_score",
                               "rand_score"],
             out_dir="_draw_" + lang_pair)

        # save graphs
        graphs_fname = "graphs_" + root_fname + ".pkl"
        with open(graphs_fname, "wb") as graphs_file:
            dump(graphs, graphs_file)