def ambig_map_sample(lang_pair, k=0, r=1.0, pos=frozenset()):
    """
    take a random sample from the original ambiguity map, either an absolute
    number or a fraction of its source entries, possibly restricted to
    particular POS tags

    lang_pair: key into config["sample"] from which the ambiguity map
        filename ("ambig_fname") is read
    k: absolute number of source entries to sample; 0 means all candidates;
        ignored when r < 1.0; values larger than the number of candidates
        are capped to that number
    r: fraction of the candidates to sample; only used when below 1.0
    pos: collection of tags; when non-empty, only source entries whose part
        after the last "/" is in pos are candidates

    returns the AmbiguityMap with its source_target_map replaced by the
    sampled entries
    """
    ambig_fname = config["sample"][lang_pair]["ambig_fname"]
    ambig_map = AmbiguityMap(ambig_fname)

    if pos:
        # source entries are split on their last "/" and the trailing part
        # is matched against the requested tags
        select = [sl for sl in ambig_map.source_iter()
                  if sl.rsplit("/", 1)[1] in pos]
    else:
        select = list(ambig_map.source_iter())

    if r < 1.0:
        # a fraction takes precedence over an absolute number
        k = int(round(len(select) * r))
    elif k == 0:
        k = len(select)

    # random.sample raises ValueError when asked for more items than the
    # population holds, which easily happens once pos narrows the candidates
    k = min(k, len(select))

    select = random.sample(select, k)
    ambig_map.source_target_map = {sl: ambig_map[sl] for sl in select}
    return ambig_map
def test_graphs(self):
    """
    build an ambiguity map restricted to the source lempos subset extracted
    from the sample graphs pickle; "absolut/adj" is the only lempos expected
    to be shared between the graphs and the ambiguity table
    """
    pickle_path = config["test_data_dir"] + "/graphs_sample_out_de-en.pkl"
    graph_lempos = AmbiguityMap.extract_source_lempos_subset(pickle_path)
    restricted = AmbiguityMap(self.ambig_fname, subset=graph_lempos)

    # exactly one source entry survives the restriction
    assert len(restricted) == 1

    source_lempos = "absolut/adj".decode("utf-8")
    expected_targets = [
        "absolute/jj".decode("utf-8"),
        "thorough/jj".decode("utf-8"),
        "total/jj".decode("utf-8"),
    ]
    assert restricted[source_lempos] == expected_targets
def test_subset(self):
    """
    restricting the map to a single source lempos should leave exactly two
    source-target pairs
    """
    wanted = {"köstlich/adj".decode("utf-8")}
    restricted = AmbiguityMap(self.ambig_fname, subset=wanted)
    pairs = list(restricted.source_target_pair_iter())
    assert len(pairs) == 2
def test_init_from_file(self):
    """
    loading the full ambiguity file should give eleven source-target pairs
    and the expected targets for "köstlich/adj"
    """
    full_map = AmbiguityMap(self.ambig_fname)
    pairs = list(full_map.source_target_pair_iter())
    assert len(pairs) == 11

    source_lempos = "köstlich/adj".decode("utf-8")
    expected_targets = [
        "delicious/jj".decode("utf-8"),
        "rich/jj".decode("utf-8"),
    ]
    assert full_map[source_lempos] == expected_targets