additional = min(additional, len(lexicon) - len(source_words))

# we sample additional elements that are not already in source_words
random.seed(100)
lexicon = random.sample(list(lexicon.difference(source_words)), additional)

# load the source space
source_sp = Space.build(source_file, source_words.union(set(lexicon)))
source_sp.normalize()

print("Reading: %s" % target_file)
target_sp = Space.build(target_file)
target_sp.normalize()

print("Translating")
# translates all the elements loaded in the source space
mapped_source_sp = apply_tm(source_sp, tm)

print("Retrieving translations")
test_data = get_valid_data(source_sp, target_sp, test_data)

# turn test data into a dictionary (a word can have multiple translations)
gold = collections.defaultdict(set)
for k, v in test_data:
    gold[k].add(v)

score(mapped_source_sp, target_sp, gold, additional)

print("Printing mapped vectors: %s" % out_file)
np.savetxt("%s.vecs.txt" % out_file, mapped_source_sp.mat)
np.savetxt("%s.wds.txt" % out_file, mapped_source_sp.id2row, fmt="%s")
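
# A minimal sketch of the retrieval that score() is assumed to perform (the
# real implementation is defined elsewhere): rank every target word by cosine
# similarity to the mapped source vector and count a hit when a gold
# translation appears among the top k candidates. Both spaces are
# row-normalized above, so a plain dot product gives the cosine. The name
# precision_at_k_sketch, the default k, and the assumption that .mat is a
# dense NumPy array are illustrative, not part of the original code.
def precision_at_k_sketch(mapped_sp, target_sp, gold, k=5):
    sims = np.dot(mapped_sp.mat, target_sp.mat.T)  # cosine similarity matrix
    hits = 0
    for i, word in enumerate(mapped_sp.id2row):
        if word not in gold:
            continue  # skip padding words sampled via -c/--correction
        top = np.argsort(-sims[i])[:k]  # indices of the k nearest targets
        if any(target_sp.id2row[j] in gold[word] for j in top):
            hits += 1
    return hits / float(len(gold))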
def main(sys_argv):
    try:
        opts, argv = getopt.getopt(sys_argv[1:], "ho:c:l:m:1:2:t:a:v:",
                                   ["help", "output=", "correction=",
                                    "levenshtein=", "matrix=", "1=", "2=",
                                    "topK=", "alpha=", "verbosity="])
    except getopt.GetoptError as err:
        print(str(err))
        usage()
        sys.exit(1)

    out_file = "./translated_vecs"
    additional = None
    levcosts = {}
    for opt, val in opts:
        # print(opt + '=' + val)
        if opt in ("-o", "--output"):
            out_file = val
        elif opt in ("-l", "--levenshtein"):
            levcosts = u.readcosts(val)
        elif opt in ("-m", "--matrix"):
            tm_file = val
        elif opt == '-1':
            source_file = val
        elif opt == '-2':
            target_file = val
        elif opt in ("-c", "--correction"):
            try:
                additional = int(val)
            except ValueError:
                print("additional: %s" % val)
                usage(1)
        elif opt in ("-t", "--topK"):
            try:
                u.topK = int(val)
            except ValueError:
                print("topK: %s" % val)
                usage(1)
        elif opt in ("-v", "--verbosity"):
            try:
                u.verbosity = int(val)
            except ValueError:
                print("verbosity: %s" % val)
                usage(1)
        elif opt in ("-a", "--alpha"):
            try:
                u.alpha = float(val)
            except ValueError:
                print("alpha: %s" % val)
                usage(1)
        elif opt in ("-h", "--help"):
            usage(0)
        else:
            print("Unknown option: -%s %s" % (opt, val))
            usage(1)

    if len(argv) == 1:
        test_file = argv[0]
    else:
        print('Unused arguments:')
        print(argv)
        usage(1)

    # always log the parameters in the output, regardless of verbosity
    sys.stdout.write(sys_argv[0] + " ")
    for opt, val in opts:
        sys.stdout.write(opt + " " + val + " ")
    print(test_file)

    if u.verbosity > 1:
        print("Loading the translation matrix %s" % tm_file)
    tm = np.loadtxt(tm_file)

    if u.verbosity > 1:
        print("Reading the test data %s" % test_file)
    test_data = u.read_dict(test_file)

    # in the _source_ space we only need to load vectors for the words in the
    # test data; semantic spaces may contain additional words. ALL words in
    # the _target_ space are used as the search space.
    source_words, _ = zip(*test_data)
    source_words = set(source_words)

    if u.verbosity > 1:
        print("Reading: %s" % source_file)
    if not additional:
        source_sp = Space.build(source_file, source_words)
    else:
        # read all the words in the space
        with io.open(source_file, 'r', encoding='utf8') as f:
            lexicon = set([l.split(' ')[0] for l in f])
        # lexicon = set(np.loadtxt(source_file, skiprows=1, dtype=str,
        #                          comments=None, usecols=(0,)).flatten())

        # the max number of additional+test elements is bounded by the size
        # of the lexicon
        additional = min(additional, len(lexicon) - len(source_words))

        # we sample additional elements that are not already in source_words;
        # if nothing is left to sample, add no padding words at all
        random.seed(100)
        if additional > 0:
            lexicon = random.sample(list(lexicon.difference(source_words)),
                                    additional)
        else:
            lexicon = set()

        # load the source space
        source_sp = Space.build(source_file, source_words.union(set(lexicon)))

    source_sp.normalize()

    if u.verbosity > 1:
        print("Reading: %s" % target_file)
    target_sp = Space.build(target_file)
    target_sp.normalize()

    if u.verbosity > 1:
        print("Retrieving translations")
    test_data = u.get_valid_data(source_sp, target_sp, test_data)

    # turn test data into a dictionary (a word can have multiple translations)
    gold = collections.defaultdict(set)
    for k, v in test_data:
        gold[k].add(v)

    if u.verbosity > 1:
        print("Translating")
    # translates all the elements loaded in the source space
    source_sp = u.apply_tm(source_sp, tm)

    u.score(source_sp, target_sp, gold, additional, levcosts)

    print("Printing mapped vectors: %s" % out_file)
    np.savetxt("%s.vecs.txt" % out_file, source_sp.mat)
    # np.savetxt("%s.wds.txt" % out_file, source_sp.id2row, fmt="%s")  # no utf8
    with open("%s.wds.txt" % out_file, "w") as outf:
        for s in source_sp.id2row:
            print(s, file=outf)
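
# A minimal sketch of what u.apply_tm is assumed to do (the real helper lives
# in the utilities module imported as u): map every row of the source space
# through the learned linear translation matrix, so a source vector x becomes
# x . tm in the target space. The Space constructor signature used here is an
# assumption for illustration.
def apply_tm_sketch(sp, tm):
    mapped_mat = np.dot(sp.mat, tm)  # (n_words x d_src) . (d_src x d_tgt)
    return Space(mapped_mat, sp.id2row)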
def main(sys_argv):
    try:
        opts, argv = getopt.getopt(sys_argv[1:], "ho:c:",
                                   ["help", "output=", "correction="])
    except getopt.GetoptError as err:
        print(str(err))
        usage()
        sys.exit(1)

    out_file = "./translated_vecs"
    additional = None
    for opt, val in opts:
        if opt in ("-o", "--output"):
            out_file = val
        elif opt in ("-c", "--correction"):
            try:
                additional = int(val)
            except ValueError:
                usage(1)
        elif opt in ("-h", "--help"):
            usage(0)
        else:
            usage(1)

    if len(argv) == 4:
        tm_file = argv[0]
        test_file = argv[1]
        source_file = argv[2]
        target_file = argv[3]
    else:
        usage(1)

    print("Loading the translation matrix")
    tm = np.loadtxt(tm_file)

    print("Reading the test data")
    test_data = read_dict(test_file)

    # in the _source_ space we only need to load vectors for the words in the
    # test data; semantic spaces may contain additional words. ALL words in
    # the _target_ space are used as the search space.
    source_words, _ = zip(*test_data)
    source_words = set(source_words)

    print("Reading: %s" % source_file)
    if not additional:
        source_sp = Space.build(source_file, source_words)
    else:
        # read all the words in the space
        lexicon = set(np.loadtxt(source_file, skiprows=1, dtype=str,
                                 comments=None, usecols=(0,)).flatten())

        # the max number of additional+test elements is bounded by the size
        # of the lexicon
        additional = min(additional, len(lexicon) - len(source_words))

        # we sample additional elements that are not already in source_words
        random.seed(100)
        lexicon = random.sample(list(lexicon.difference(source_words)),
                                additional)

        # load the source space
        source_sp = Space.build(source_file, source_words.union(set(lexicon)))

    source_sp.normalize()

    print("Reading: %s" % target_file)
    target_sp = Space.build(target_file)
    target_sp.normalize()

    print("Translating")
    # translates all the elements loaded in the source space
    mapped_source_sp = apply_tm(source_sp, tm)

    print("Retrieving translations")
    test_data = get_valid_data(source_sp, target_sp, test_data)

    # turn test data into a dictionary (a word can have multiple translations)
    gold = collections.defaultdict(set)
    for k, v in test_data:
        gold[k].add(v)

    score(mapped_source_sp, target_sp, gold, additional)

    print("Printing mapped vectors: %s" % out_file)
    np.savetxt("%s.vecs.txt" % out_file, mapped_source_sp.mat)
    np.savetxt("%s.wds.txt" % out_file, mapped_source_sp.id2row, fmt="%s")
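
# Entry point plus an example invocation for this positional-argument variant
# of main(). The script name and data paths below are illustrative only:
#
#   python translate_test.py -c 1000 -o ./translated_vecs \
#       tm.txt test_dict.txt source.vecs.txt target.vecs.txt
#
if __name__ == "__main__":
    main(sys.argv)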