def train_translation_matrix(source_file, target_file, dict_file, out_file):
    """Train a translation matrix between the source and target languages.

    The word pairs in dict_file serve as anchor points, and the learned
    matrix is written to out_file.  Both language files must be in the
    word2vec C ASCII format.

    :param source_file: The name of the source language file
    :param target_file: The name of the target language file
    :param dict_file: The name of the file with the bilingual dictionary
    :param out_file: The name of the file to write the translation matrix to
    """
    def load_normalized_space(vec_file, vocab):
        # Only the vectors for the anchor words are needed; the semantic
        # spaces on disk contain many additional words.
        log.info("Reading: %s" % vec_file)
        space = Space.build(vec_file, vocab)
        space.normalize()
        return space

    log.info("Reading the training data")
    pairs = read_dict(dict_file)
    src_vocab, tgt_vocab = zip(*pairs)

    source_sp = load_normalized_space(source_file, set(src_vocab))
    target_sp = load_normalized_space(target_file, set(tgt_vocab))

    log.debug('Words in the source space: %s' % source_sp.row2id)
    log.debug('Words in the target space: %s' % target_sp.row2id)

    log.info("Learning the translation matrix")
    log.info("Training data: %s" % str(pairs))
    tm = train_tm(source_sp, target_sp, pairs)

    log.info("Printing the translation matrix")
    np.savetxt(out_file, tm)
def train_wrapper(seed_fn, source_fn, target_fn, reverse=False, mx_path=None, train_size=5000):
    """Train a translation matrix from a seed dictionary and two spaces.

    :param seed_fn: filename of the seed (bilingual) dictionary
    :param source_fn: filename of the source-language semantic space
    :param target_fn: filename of the target-language semantic space
    :param reverse: if True, swap the source/target columns of the seed file
    :param mx_path: basename for the output files; derived from the input
        filenames by default_output_fn when None
    :param train_size: number of seed pairs used for training
    :return: tuple (translation matrix, word pairs used for training)
    """
    logging.info("Training...")
    seed_trans = read_dict(seed_fn, reverse=reverse)

    # We only need to load the vectors for the words in the training data;
    # the semantic spaces contain additional words.
    # BUG FIX: .keys()/.values() instead of the Python-2-only iterkeys()/
    # itervalues(), which raise AttributeError on Python 3 (and the rest of
    # this file already uses Python 3 constructs).
    source_words = set(seed_trans.keys())
    target_words = set().union(*seed_trans.values())

    source_sp = Space.build(source_fn, lexicon=source_words)
    source_sp.normalize()
    target_sp = Space.build(target_fn, lexicon=target_words)
    target_sp.normalize()

    logging.info("Learning the translation matrix")
    tm, used_for_train = train_tm(source_sp, target_sp, seed_trans, train_size)

    mx_path = default_output_fn(mx_path, seed_fn, source_fn, target_fn)
    logging.info("Saving the translation matrix to {}".format(mx_path))
    np.save('{}.npy'.format(mx_path), tm)
    # BUG FIX: open in binary mode ('wb' — pickle requires a binary stream
    # on Python 3) and close the handle deterministically instead of
    # leaking the file object returned by the inline open().
    with open('{}.train_wds'.format(mx_path), mode='wb') as train_wds_file:
        pickle.dump(used_for_train, train_wds_file)
    return tm, used_for_train
def main(sys_argv):
    """Command-line entry point.

    Expects three positional arguments — the bilingual dictionary file,
    the source-language file and the target-language file — plus options
    -o/--output (output basename, default "./tm") and -h/--help.

    :param sys_argv: the full argument vector, typically sys.argv
    """
    try:
        opts, argv = getopt.getopt(sys_argv[1:], "ho:", ["help", "output="])
    except getopt.GetoptError as err:
        print(str(err))
        usage()
        sys.exit(1)

    out_file = "./tm"
    for opt, val in opts:
        if opt in ("-o", "--output"):
            out_file = val
        elif opt in ("-h", "--help"):
            usage(0)
        else:
            usage(1)

    if len(argv) == 3:
        dict_file = argv[0]
        source_file = argv[1]
        target_file = argv[2]
    else:
        # BUG FIX: this branch used to do print(str(err)), but `err` is only
        # bound inside the getopt except clause above, so a wrong argument
        # count raised NameError instead of reporting the real problem.
        print("Expected 3 arguments (dict_file, source_file, target_file), "
              "got %d" % len(argv))
        usage(1)

    print("Reading the training data")
    train_data = read_dict(dict_file)

    # We only need to load the vectors for the words in the training data;
    # the semantic spaces contain additional words.
    source_words, target_words = zip(*train_data)

    print("Reading: %s" % source_file)
    source_sp = Space.build(source_file, set(source_words))
    source_sp.normalize()

    print("Reading: %s" % target_file)
    target_sp = Space.build(target_file, set(target_words))
    target_sp.normalize()

    print("Learning the translation matrix")
    tm = train_tm(source_sp, target_sp, train_data)

    print("Printing the translation matrix")
    np.savetxt("%s.txt" % out_file, tm)
# NOTE(review): orphaned fragment — the tail of an older, Python-2 version of
# main() (print statements, no parentheses); its `def` header is missing from
# this chunk and the body has been collapsed onto a single line, so it cannot
# run as-is.  Kept byte-identical pending reconciliation with the Python-3
# main() above.  Even when restored, `print str(err)` in the else branch
# would reference `err`, which is presumably only bound by a getopt except
# clause elsewhere — a latent NameError; TODO confirm against the full file.
target_file = argv[2] dict_file = argv[0] else: print str(err) usage(1) print "Reading the training data" train_data = read_dict(dict_file) print train_data #we only need to load the vectors for the words in the training data #semantic spaces contain additional words source_words, target_words = zip(*train_data) print "Reading: %s" % source_file source_sp = Space.build(source_file, set(source_words)) source_sp.normalize() print "Reading: %s" % target_file target_sp = Space.build(target_file, set(target_words)) target_sp.normalize() print "Learning the translation matrix" tm = train_tm(source_sp, target_sp, train_data) print "Printing the translation matrix" np.savetxt("%s.txt" % out_file, tm) if __name__ == '__main__': main(sys.argv)
# NOTE(review): second orphaned Python-2 fragment — another variant of the
# tail of main() (differs from the fragment above only in the extra
# 'Training data' print), with its `def` header and the surrounding if/else
# missing from this chunk and the body collapsed onto a single line.  It is
# not runnable as-is; kept byte-identical pending reconciliation with the
# Python-3 main() above.  The leading `print str(err)` would reference a
# name bound only in a getopt except clause elsewhere — TODO confirm.
print str(err) usage(1) print "Reading the training data" train_data = read_dict(dict_file) #we only need to load the vectors for the words in the training data #semantic spaces contain additional words source_words, target_words = zip(*train_data) print "Reading: %s" % source_file source_sp = Space.build(source_file, set(source_words)) source_sp.normalize() print "Reading: %s" % target_file target_sp = Space.build(target_file, set(target_words)) target_sp.normalize() print "Learning the translation matrix" print "Training data: %s" % str(train_data) tm = train_tm(source_sp, target_sp, train_data) print "Printing the translation matrix" np.savetxt("%s.txt" % out_file, tm) if __name__ == '__main__': main(sys.argv)