def make_extension_matrix(samp_hdfile, tdict_pkl_fname, reverse_tdict_pkl_fname):
    reverse_vocab = [lemma.decode("utf-8") for lemma in samp_hdfile["vocab"]]
    vocab = dict((lemma, i) for i, lemma in enumerate(reverse_vocab))
    assert len(reverse_vocab) == len(vocab)

    tdict = TransDict.load(tdict_pkl_fname)
    # disable POS mapping
    tdict.pos_map = None
    reverse_tdict = TransDict.load(reverse_tdict_pkl_fname)
    reverse_tdict.pos_map = None

    shape = len(vocab), len(vocab)
    log.info("making extension matrix as sparse lil_matrix {0}".format(shape))
    em = sp.lil_matrix(shape, dtype="int8")

    for i, target_lemma in enumerate(reverse_vocab):
        try:
            reverse_lookup = reverse_tdict.lookup_lemma(target_lemma)
        except KeyError:
            # vocab term not in reverse dict
            # FIXME: these terms should be removed from vocab
            continue

        log.debug(40 * "=")

        for _, source_lempos_list in reverse_lookup:
            for source_lempos in source_lempos_list:
                target_lempos_list = tdict.lookup_lempos(source_lempos)[1]
                for target_lempos in target_lempos_list:
                    # does not handle MWU, but vocab contains only atomic
                    # lemmas so far
                    ext_target_lemma = target_lempos.rsplit("/", 1)[0]
                    try:
                        j = vocab[ext_target_lemma]
                    except KeyError:
                        # oov
                        continue
                    log.debug(u"{0} --> {1} --> {2}".format(
                        target_lemma, source_lempos, ext_target_lemma))
                    # counting occurrences does not make a lot of sense,
                    # so assume boolean
                    em[i, j] = 1

        if log.isEnabledFor(logging.DEBUG):
            log.debug(u"{0} ==> {1}".format(
                target_lemma,
                ", ".join([str((reverse_vocab[j], count))
                           for j, count in zip(em.rows[i], em.data[i])])))

    log.info("converting to csr_matrix")
    return em.tocsr()
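# A minimal, self-contained sketch of the boolean extension-matrix pattern
# used in make_extension_matrix above, on a toy 3-lemma vocabulary. The toy
# "translation paths" (target lemma -> extended target lemma) are invented
# for illustration only; only scipy is required.
import scipy.sparse as sp

toy_vocab = {"haus": 0, "gebaeude": 1, "heim": 2}
toy_paths = [("haus", "gebaeude"), ("haus", "heim"), ("heim", "haus")]

toy_em = sp.lil_matrix((3, 3), dtype="int8")
for lemma, ext_lemma in toy_paths:
    # boolean entry: lemma i can be extended with lemma j
    toy_em[toy_vocab[lemma], toy_vocab[ext_lemma]] = 1
# lil_matrix is cheap to fill incrementally; csr_matrix is efficient
# for the row lookups that follow
toy_em = toy_em.tocsr()
assert toy_em[0, 1] == 1 and toy_em[0, 2] == 1 and toy_em[2, 0] == 1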
def make_new_vocab(sample_hdfile, tdict_pkl_fname):
    tdict = TransDict.load(tdict_pkl_fname)
    # disable POS mapping
    tdict.pos_map = None
    log.info("extracting target lemmas from translation dictionary")
    dict_target_lemmas = set()

    for target_lempos_list in tdict._lempos_dict.itervalues():
        for target_lempos in target_lempos_list:
            # skip MWU
            if not " " in target_lempos:
                target_lemma = target_lempos.rsplit("/", 1)[0]
                dict_target_lemmas.add(target_lemma)
    del tdict

    vocab = [t.decode("utf-8") for t in sample_hdfile["vocab"]]

    # select column numbers and corresponding target lemmas;
    # sorted order is required because the order of the column numbers
    # is relevant
    selection = [(i, lemma) for i, lemma in enumerate(vocab)
                 if lemma in dict_target_lemmas]
    columns_selector, filtered_vocab = zip(*selection)
    return columns_selector, filtered_vocab
def make_new_vocab(sample_hdfile, tdict_pkl_fname):
    tdict = TransDict.load(tdict_pkl_fname)
    # disable POS mapping
    tdict.pos_map = None
    log.info("extracting target lemmas from translation dictionary")
    dict_target_lemmas = set()

    for target_lempos_list in tdict._lempos_dict.itervalues():
        for target_lempos in target_lempos_list:
            # skip MWU
            if not " " in target_lempos:
                target_lemma = target_lempos.rsplit("/", 1)[0]
                dict_target_lemmas.add(target_lemma)
    del tdict

    vocab = [t.decode("utf-8") for t in sample_hdfile["vocab"][()]]
    org_size = len(vocab)
    log.info("original vocab size: {} lemmas".format(org_size))

    # select column numbers and corresponding target lemmas;
    # sorted order is required because the order of the column numbers
    # is relevant
    selection = [(i, lemma) for i, lemma in enumerate(vocab)
                 if lemma in dict_target_lemmas]
    columns_selector, filtered_vocab = zip(*selection)

    new_size = len(filtered_vocab)
    log.info("filtered vocab size: {} lemmas".format(new_size))
    reduction = (new_size / float(org_size)) * 100
    log.info("vocab reduced to {:.2f}% of original size".format(reduction))
    return columns_selector, filtered_vocab
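# Minimal sketch of the column-selection pattern that make_new_vocab
# returns: the selector produced by zip(*selection) can be used as a
# fancy index to drop matrix columns for out-of-dictionary lemmas.
# The toy data here is invented for illustration; only numpy is required.
import numpy as np

toy_vocab2 = ["a", "b", "c", "d"]
toy_keep = set(["a", "c"])
toy_selection = [(i, lemma) for i, lemma in enumerate(toy_vocab2)
                 if lemma in toy_keep]
toy_selector, toy_filtered_vocab = zip(*toy_selection)

counts = np.arange(8).reshape(2, 4)          # 2 samples x 4 vocab columns
filtered = counts[:, list(toy_selector)]     # keep only columns 0 and 2
assert filtered.shape == (2, 2)
assert toy_filtered_vocab == ("a", "c")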
def preprocess(data_set, lang_pair):
    source_lang, target_lang = lang_pair.split("-")
    graphs_fname = config["eval"][data_set][lang_pair]["graphs_fname"]
    out_dir = os.path.dirname(graphs_fname)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    # annotate
    annotator = get_annotator(source_lang)
    graph_list = annotator.annot_xml_file(
        config["eval"][data_set][lang_pair]["src_fname"])

    # lookup translations
    trans_dict = TransDict.load(config["dict"][lang_pair]["pkl_fname"])
    lookup = Lookup(trans_dict)
    lookup(graph_list)

    # score most frequent translation
    freq_score = FreqScorer(config["count"]["lemma"][target_lang]["pkl_fname"])
    freq_score(graph_list)

    # dict upper scores
    lemma_ref_fname = \
        config["eval"][data_set][lang_pair]["lemma_ref_fname"]
    scorer = DictUpperScorer(lemma_ref_fname)
    scorer(graph_list)

    # model upper scores
    ambig_fname = config["sample"][lang_pair]["ambig_fname"]
    filter_funcs = filter_functions(source_lang)
    scorer = ModelUpperScorer(lemma_ref_fname, ambig_fname, filter_funcs)
    scorer(graph_list)

    # save graphs
    log.info("saving preprocessed graphs to " + graphs_fname)
    cPickle.dump(graph_list, open(graphs_fname, "wb"))
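# Hedged usage sketch for preprocess(). The data set key "presemt" is
# borrowed from the config usage in prepare() below; it is an assumption
# that config["eval"]["presemt"][lang_pair] also defines the
# "graphs_fname", "src_fname" and "lemma_ref_fname" entries that
# preprocess() reads.
if __name__ == "__main__":
    for lang_pair in ("en-de", "de-en"):
        preprocess("presemt", lang_pair)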
def prepare(lang_pair):
    """
    Extract input text, create annotated graphs, look up translation
    candidates, perform frequency scoring and save pickled graphs to file.
    """
    source_lang, target_lang = lang_pair.split("-")

    # get text from input source
    xml_tree = et.ElementTree(
        file=config["eval"]["presemt"][lang_pair]["src_fname"])
    text = " ".join(seg.text.strip() for seg in xml_tree.iter("seg"))

    # annotate
    if source_lang == "en":
        annotator = TreeTaggerEnglish()
    elif source_lang == "de":
        annotator = TreeTaggerGerman()
    else:
        raise ValueError("unknown source language: " + source_lang)
    graph_list = annotator(text)

    # lookup
    trans_dict = TransDict.load(config["dict"][lang_pair]["pkl_fname"])
    lookup = Lookup(trans_dict)
    lookup(graph_list)

    # frequency scoring
    freq_score = FreqScorer(config["count"]["lemma"][target_lang]["pkl_fname"])
    freq_score(graph_list)

    # save
    if not os.path.exists(PREP_DIR):
        os.makedirs(PREP_DIR)
    pkl_fname = join(PREP_DIR, lang_pair + "_graphs.pkl")
    log.info("saving graphs to " + pkl_fname)
    cPickle.dump(graph_list, open(pkl_fname, "wb"),
                 protocol=cPickle.HIGHEST_PROTOCOL)
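# Self-contained sketch of the <seg> text-extraction step in prepare(),
# using an in-memory document instead of the configured source file.
# Uses Python 2's StringIO, matching the code above.
import xml.etree.ElementTree as et
from StringIO import StringIO

toy_doc = "<doc><seg> Hello world. </seg><seg>Second segment.</seg></doc>"
toy_tree = et.ElementTree(file=StringIO(toy_doc))
toy_text = " ".join(seg.text.strip() for seg in toy_tree.iter("seg"))
assert toy_text == "Hello world. Second segment."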
def setup_class(cls):
    dict_fname = config["dict"]["en-de"]["pkl_fname"]
    print "loading pickled dictionary from " + dict_fname
    cls.trans_dict = TransDict.load(dict_fname)
def setup_class(cls):
    dict_fname = config["dict"]["en-de"]["pkl_fname"]
    print "loading pickled dictionary from " + dict_fname
    cls.trans_dict = TransDict.load(dict_fname)
    # remove the POS mapping
    cls.trans_dict.pos_map = None
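# Hedged sketch of a test method that would live in the same test class as
# the setup_class above. With pos_map set to None, lookup_lempos() is
# assumed to receive the tagger's original POS tag unchanged. The lempos
# "house/nn" and the [1]-indexing for the target lempos list mirror the
# usage in make_extension_matrix, but are assumptions about the TransDict
# API, not a documented contract.
def test_lookup_lempos(self):
    target_lempos_list = self.trans_dict.lookup_lempos("house/nn")[1]
    assert isinstance(target_lempos_list, list)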
def make_graphs():
    """
    Create annotated translation graphs with scores for random translation,
    most frequent translation and approximated maximum. Also create minimal
    translation dictionaries and drawings for these graphs.
    """
    for lang_pair, src_fname, lemma_ref_fname in [
            ("en-de", "sample_newstest2011-src.en.sgm",
             "lemma_sample_newstest2011-ref.de.sgm"),
            ("de-en", "sample_out_de-en.src",
             "lemma_sample_out_de-en.ref")]:
        source_lang, target_lang = lang_pair.split("-")
        root_fname = splitext(src_fname)[0]

        # annotate
        annotator = get_annotator(source_lang)
        graphs = annotator.annot_xml_file(src_fname)

        # lookup
        dict_fname = config["dict"][lang_pair]["pkl_fname"]
        trans_dict = TransDict.load(dict_fname)
        lookup = LookupKeepKeys(trans_dict)
        lookup(graphs)

        # write pickle of minimal translation dict
        min_dict = lookup.get_minimal_trans_dict()
        min_dict_fname = "dict_" + root_fname + ".pkl"
        dump(min_dict, open(min_dict_fname, "wb"))

        # score most frequent translation
        counts_fname = config["count"]["lemma"][target_lang]["pkl_fname"]
        freq_score = FreqScorer(counts_fname)
        freq_score(graphs)

        # score random translation
        rand_score = RandScorer()
        rand_score(graphs)

        # dict upper score
        maxscore = DictUpperScorer(lemma_ref_fname)
        maxscore(graphs)

        # model upper scores
        ambig_fname = config["sample"][lang_pair]["ambig_fname"]
        filter_funcs = filter_functions(source_lang)
        scorer = ModelUpperScorer(lemma_ref_fname, ambig_fname, filter_funcs)
        scorer(graphs)

        # draw graphs
        draw = Draw()
        draw(graphs, out_format="pdf",
             base_score_attrs=["dup_score", "mup_score", "freq_score",
                               "rand_score"],
             out_dir="_draw_" + lang_pair)

        # save graphs
        graphs_fname = "graphs_" + root_fname + ".pkl"
        dump(graphs, open(graphs_fname, "wb"))
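# Minimal sketch of reading back the artifacts that make_graphs() writes:
# both the minimal translation dictionary and the scored graphs are plain
# pickles. The filenames mirror the "en-de" case above, whose root name
# is "sample_newstest2011-src.en" after splitext().
from cPickle import load

graphs = load(open("graphs_sample_newstest2011-src.en.pkl", "rb"))
min_dict = load(open("dict_sample_newstest2011-src.en.pkl", "rb"))
print "loaded {0} graphs".format(len(graphs))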