Example #1
0
def ambig_map_sample(lang_pair, k=0, r=1.0, pos=None):
    """
    Take a random sample from the original ambiguity map.

    The sample size is either an absolute number (k) or a fraction (r) of
    the candidate source entries, optionally restricted to particular POS
    tags.

    Parameters:
        lang_pair: key into config["sample"]; its "ambig_fname" entry names
            the ambiguity map file to load.
        k: absolute number of source entries to keep. 0 means "keep all
            candidates". Ignored when r < 1.0. Values larger than the
            number of candidates are clamped to that number.
        r: fraction of the candidates to keep; only takes effect when it is
            smaller than 1.0, in which case it overrides k.
        pos: optional collection of POS tags. When non-empty, only source
            entries whose tag (the text after the last "/") is in pos are
            candidates.

    Returns:
        The loaded AmbiguityMap with its source_target_map replaced by a
        dict holding only the sampled source entries.
    """
    ambig_fname = config["sample"][lang_pair]["ambig_fname"]
    ambig_map = AmbiguityMap(ambig_fname)

    if pos:
        # Source keys look like "lemma/tag"; compare only the tag part.
        candidates = [sl for sl in ambig_map.source_iter()
                      if sl.rsplit("/", 1)[1] in pos]
    else:
        candidates = list(ambig_map.source_iter())

    if r < 1.0:
        k = int(round(len(candidates) * r))
    elif k == 0:
        k = len(candidates)

    # random.sample raises ValueError when asked for more items than the
    # population holds (easy to hit once a POS filter shrinks the pool),
    # so never request more than is available.
    k = min(k, len(candidates))

    sampled = random.sample(candidates, k)

    # The new dict is built completely from the old map before it replaces
    # the old mapping.
    ambig_map.source_target_map = {sl: ambig_map[sl] for sl in sampled}

    return ambig_map
Example #2
0
 def test_graphs(self):
     # Restrict the ambiguity map to the source lempos extracted from the
     # sample graphs file; exactly one entry, "absolut/adj", should remain.
     pickle_path = config["test_data_dir"] +"/graphs_sample_out_de-en.pkl"
     lempos_subset = AmbiguityMap.extract_source_lempos_subset(pickle_path)
     restricted = AmbiguityMap(self.ambig_fname, subset=lempos_subset)
     assert len(restricted) == 1
     source_key = "absolut/adj".decode("utf-8")
     expected_targets = [target.decode("utf-8")
                         for target in ("absolute/jj",
                                        "thorough/jj",
                                        "total/jj")]
     assert restricted[source_key] == expected_targets
Example #3
0
 def test_subset(self):
     # Loading with a single-lempos subset should yield exactly two
     # source/target pairs.
     source_key = "köstlich/adj".decode("utf-8")
     restricted = AmbiguityMap(self.ambig_fname, subset={source_key})
     pairs = list(restricted.source_target_pair_iter())
     assert len(pairs) == 2
Example #4
0
 def test_init_from_file(self):
     # Loading the full file should give eleven source/target pairs and the
     # expected targets for "köstlich/adj".
     full_map = AmbiguityMap(self.ambig_fname)
     pairs = list(full_map.source_target_pair_iter())
     assert len(pairs) == 11
     source_key = "köstlich/adj".decode("utf-8")
     expected_targets = [target.decode("utf-8")
                         for target in ("delicious/jj", "rich/jj")]
     assert full_map[source_key] == expected_targets