def main():
    """Train a BRAE-ISOMAP model or run prediction, driven by ``sys.argv``.

    Command line::

        SCRIPT train   CONFIG [START_ITER] [END_ITER]
        SCRIPT predict CONFIG [NUM_PROCESS]

    ``START_ITER`` defaults to 0 and ``END_ITER`` to 25; ``NUM_PROCESS``
    defaults to 0.  The model file name is derived from the hyper-parameters
    read from ``CONFIG``.

    In ``train`` mode with ``START_ITER == 0`` the vocabularies, phrase lists
    and a freshly built model are pickled, together with the NumPy random
    state, to ``model/<train_name>.temp``.  Any other ``START_ITER`` reloads
    that checkpoint instead of rebuilding it.  Training then runs and the
    result is saved as ``model/<train_name>.model``.

    Exits with status 1 on a bad command line, and also (as before) right
    after writing the checkpoint when ``END_ITER == 1``.
    """
    if len(sys.argv) < 3 or sys.argv[1] not in ("train", "predict"):
        # sys.stderr is a file object; calling it raised TypeError instead of
        # printing the message.
        sys.stderr.write("usage: %s train|predict CONFIG "
                         "[START_ITER|NUM_PROCESS] [END_ITER]\n" % sys.argv[0])
        sys.exit(1)
    train_test = sys.argv[1]
    config_name = sys.argv[2]

    forced_decode_data = "data/brae.train.data"
    phrase_data_path = "data/phrase.list"
    src_count_path = "data/src.trans.data"
    tar_count_path = "data/tar.trans.data"

    brae_config = BRAEISOMAPConfig(config_name)
    train_name = "dim%d_lrec%f_lsem%f_ll2%f_alpha%f_beta%f_num%d_seed%d_batch%d_lr%f" % (
        brae_config.dim,
        brae_config.weight_rec,
        brae_config.weight_sem,
        brae_config.weight_l2,
        brae_config.alpha,
        brae_config.beta,
        brae_config.trans_num,
        brae_config.random_seed,
        brae_config.batch_size,
        brae_config.optimizer.param["lr"],
    )
    model_name = "model/%s" % train_name
    temp_model = model_name + ".temp"

    if train_test == "predict":
        num_process = int(sys.argv[3]) if len(sys.argv) > 3 else 0
        brae_predict(phrase_data_path, train_name + ".pred",
                     model_file="%s.model" % model_name,
                     bilinear=True, num_process=num_process)
        return

    # train_test == "train" (validated above)
    start_iter = int(sys.argv[3]) if len(sys.argv) > 3 else 0
    end_iter = int(sys.argv[4]) if len(sys.argv) > 4 else 25
    pre_logger("braeisomap_" + train_name)
    np.random.seed(brae_config.random_seed)

    if start_iter == 0:
        # Fresh run: build vocabularies and phrase data from the raw files.
        src_word_dict, tar_word_dict = read_phrase_pair_vocab(forced_decode_data)
        src_word_dict, tar_word_dict = add_trans_word_vocab(
            src_count_path, src_word_dict, tar_word_dict)
        # NOTE(review): the results are unpacked as (tar, src) but the
        # arguments are still passed as (src, tar), unlike the mirrored
        # read_trans_list call below which swaps both.  Confirm against
        # add_trans_word_vocab whether the arguments should be swapped too.
        tar_word_dict, src_word_dict = add_trans_word_vocab(
            tar_count_path, src_word_dict, tar_word_dict)
        src_word_dict = filter_vocab(src_word_dict, min_count=0)
        tar_word_dict = filter_vocab(tar_word_dict, min_count=0)

        src_phrases, tar_phrases, src_tar_pair = read_phrase_list(
            forced_decode_data, src_word_dict, tar_word_dict)
        src_phrases, tar_phrases = read_trans_list(
            src_count_path, src_phrases, tar_phrases, src_word_dict, tar_word_dict)
        tar_phrases, src_phrases = read_trans_list(
            tar_count_path, tar_phrases, src_phrases, tar_word_dict, src_word_dict)
        src_phrases = clean_text(src_phrases)
        tar_phrases = clean_text(tar_phrases)

        brae = pre_model(src_word_dict, tar_word_dict, brae_config, verbose=True)

        # Checkpoint everything needed to resume; the load order below must
        # match this dump order.
        with open(temp_model, 'wb') as fout:
            pickle.dump(src_phrases, fout)
            pickle.dump(tar_phrases, fout)
            pickle.dump(src_tar_pair, fout)
            pickle.dump(brae, fout)
            pickle.dump(np.random.get_state(), fout)
        if end_iter == 1:
            # Preprocess-only run.  NOTE(review): status 1 is kept from the
            # original even though nothing failed; wrappers may rely on it.
            sys.exit(1)
    else:
        # Resume: restore data, model and RNG state from the checkpoint.
        with open(temp_model, 'rb') as fin:
            src_phrases = pickle.load(fin)
            tar_phrases = pickle.load(fin)
            src_tar_pair = pickle.load(fin)
            brae = pickle.load(fin)
            np.random.set_state(pickle.load(fin))

    brae.train(src_phrases, tar_phrases, src_tar_pair, brae_config,
               model_name, start_iter, end_iter)
    brae.save_model("%s.model" % model_name)
def main():
    """Prepare and pickle the data used for hyper-parameter tuning.

    Command line::

        SCRIPT MIN_COUNT

    ``MIN_COUNT`` is only used to name the output files.  The function loads
    the English (target) and Chinese (source) word2vec word maps, reads the
    phrase pairs, paraphrase lists and translation lists, and writes three
    pickle files:

    * the word dictionaries (source, then target),
    * the cleaned source phrases, target phrases and source/target pairs,
    * the phrase-text to phrase-index tables (source, then target).

    Exits with status 1 when ``MIN_COUNT`` is missing.
    """
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s MIN_COUNT\n" % sys.argv[0])
        sys.exit(1)
    min_count = int(sys.argv[1])
    dim = 50

    # Alternative (non-tuning) data set, kept for reference:
    #   forced_decode_data = "data/brae.train.data"
    #   src_count_path = "data/src.trans.data"
    #   tar_count_path = "data/tar.trans.data"
    #   tar_para_path = "data/tar.para.data"
    #   src_para_path = "data/src.para.data"
    #   gbrae_data_name = "model/gbrae.data.min.count.%d.pkl" % min_count
    #   gbrae_dict_name = "model/gbrae.dict.min.count.%d.pkl" % min_count
    #   gbrae_phrase_dict_name = "model/gbrae.phrase.text.dict.pkl"
    forced_decode_data = "data/250w/tune_hyperparameter/tune.data"
    src_count_path = "data/250w/tune_hyperparameter/tune.data"
    # tar_count_path = "data/250w/phrase-table.filtered"
    tar_para_path = "data/250w/enBP_alignPhraProb.xml"
    src_para_path = "data/250w/chBP_alignPhraProb.xml"
    gbrae_data_name = "data/250w/tune_hyperparameter/gbrae.data.tune.min.count.%d.pkl" % min_count
    gbrae_dict_name = "data/250w/tune_hyperparameter/train/gbrae.dict.tune.min.count.%d.pkl" % min_count
    gbrae_phrase_dict_name = "data/250w/tune_hyperparameter/gbrae.tune.phrase.text.dict.pkl"

    # print("...") with a single string prints identically on Python 2 and 3.
    print("Load Word Dict ...")
    en_embedding_name = "data/embedding/en.token.dim%d.bin" % dim
    zh_embedding_name = "data/embedding/zh.token.dim%d.bin" % dim
    tar_word_dict = WordEmbedding.load_word2vec_word_map(en_embedding_name, binary=True, oov=True)
    src_word_dict = WordEmbedding.load_word2vec_word_map(zh_embedding_name, binary=True, oov=True)

    print("Load All Data ...")
    src_phrases, tar_phrases, src_tar_pair = read_phrase_list(
        forced_decode_data, src_word_dict, tar_word_dict)

    print("Load Para Data ...")
    src_phrases = read_para_list(src_para_path, src_phrases, src_word_dict)
    tar_phrases = read_para_list(tar_para_path, tar_phrases, tar_word_dict)

    print("Load Trans Data ...")
    src_phrases, tar_phrases = read_trans_list(
        src_count_path, src_phrases, tar_phrases, src_word_dict, tar_word_dict)
    # tar_phrases, src_phrases = read_trans_list(tar_count_path, tar_phrases, src_phrases,
    #                                            tar_word_dict, src_word_dict)

    # Build the text -> index tables before clean_text() runs, as the
    # original did.  A repeated text keeps its last index.
    src_phrase2id = dict()
    for index, phrase in enumerate(src_phrases):
        src_phrase2id[phrase[TEXT_INDEX]] = index
    tar_phrase2id = dict()
    for index, phrase in enumerate(tar_phrases):
        tar_phrase2id[phrase[TEXT_INDEX]] = index

    src_phrases = clean_text(src_phrases)
    tar_phrases = clean_text(tar_phrases)

    with open(gbrae_dict_name, 'wb') as fout:
        print("Write Word Dict ...")
        pickle.dump(src_word_dict, fout)
        pickle.dump(tar_word_dict, fout)
    with open(gbrae_data_name, 'wb') as fout:
        print("Write Source Phrases Data ...")
        pickle.dump(src_phrases, fout)
        print("Write Target Phrases Data ...")
        pickle.dump(tar_phrases, fout)
        pickle.dump(src_tar_pair, fout)
    with open(gbrae_phrase_dict_name, 'wb') as fout:
        print("Write Source Phrases Dictionary ...")
        pickle.dump(src_phrase2id, fout)
        print("Write Target Phrases Dictionary ...")
        pickle.dump(tar_phrase2id, fout)
def main():
    """Tune BRAE hyper-parameters on the train/dev/test splits.

    Command line::

        SCRIPT CONFIG [UNUSED] [START_ITER] [END_ITER]

    ``START_ITER`` defaults to 0 and ``END_ITER`` to 26.
    NOTE(review): the iteration bounds are read from ``sys.argv[3]`` and
    ``sys.argv[4]``, so ``sys.argv[2]`` is never read.  This looks inherited
    from the train/predict script; it is kept so existing invocations keep
    working.

    With ``START_ITER == 0`` the word maps are loaded, the model is built,
    the phrase list is read and a checkpoint is written to
    ``model/<train_name>.temp``.  Any other ``START_ITER`` resumes from that
    checkpoint.  The phrase-text to index tables needed by
    ``load_sub_data_pair`` are stored at the end of the checkpoint, because
    the phrase texts themselves are not available when resuming.

    Exits with status 1 on a missing ``CONFIG``, after a preprocess-only run
    (``END_ITER == 1``, as before), or when resuming from a checkpoint that
    lacks the phrase-id tables.
    """
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s CONFIG [UNUSED] [START_ITER] [END_ITER]\n" % sys.argv[0])
        sys.exit(1)
    config_name = sys.argv[1]

    forced_decode_data = "../gbrae/data/250w/tune_hyperparameter/tune.data"
    brae_config = BRAEConfig(config_name)
    train_data = "../gbrae/data/250w/tune_hyperparameter/train/tune.train"
    dev_data = "../gbrae/data/250w/tune_hyperparameter/dev/tune.dev"
    test_data = "../gbrae/data/250w/tune_hyperparameter/test/tune.test"
    train_name = "dim%d_lrec%f_lsem%f_ll2%f_alpha%f_seed%d_batch%d_min%d_lr%f" % (
        brae_config.dim,
        brae_config.weight_rec,
        brae_config.weight_sem,
        brae_config.weight_l2,
        brae_config.alpha,
        brae_config.random_seed,
        brae_config.batch_size,
        brae_config.min_count,
        brae_config.optimizer.param["lr"],
    )
    model_name = "model/%s" % train_name
    temp_model = model_name + ".temp"

    start_iter = int(sys.argv[3]) if len(sys.argv) > 3 else 0
    end_iter = int(sys.argv[4]) if len(sys.argv) > 4 else 26
    pre_logger("brae_" + train_name)
    np.random.seed(brae_config.random_seed)

    if start_iter == 0:
        # print("...") with a single string prints identically on Python 2 and 3.
        print("Load Dict ...")
        en_embedding_name = "../gbrae/data/embedding/en.token.dim%d.bin" % brae_config.dim
        zh_embedding_name = "../gbrae/data/embedding/zh.token.dim%d.bin" % brae_config.dim
        tar_word_dict = WordEmbedding.load_word2vec_word_map(en_embedding_name, binary=True, oov=True)
        src_word_dict = WordEmbedding.load_word2vec_word_map(zh_embedding_name, binary=True, oov=True)

        print("Compiling Model ...")
        brae = pre_model(src_word_dict, tar_word_dict, brae_config, verbose=True)

        print("Load All Data ...")
        src_phrases, tar_phrases, src_tar_pair = read_phrase_list(
            forced_decode_data, src_word_dict, tar_word_dict)
        src_train = [p[WORD_INDEX] for p in src_phrases]
        tar_train = [p[WORD_INDEX] for p in tar_phrases]

        # Phrase text -> position in the phrase list.  A repeated text keeps
        # its last index, as in the original loop.
        src_phrase2id = dict()
        for index, phrase in enumerate(src_phrases):
            src_phrase2id[phrase[TEXT_INDEX]] = index
        tar_phrase2id = dict()
        for index, phrase in enumerate(tar_phrases):
            tar_phrase2id[phrase[TEXT_INDEX]] = index

        print("Write Binary Data ...")
        with open(temp_model, 'wb') as fout:
            pickle.dump(src_train, fout)
            pickle.dump(tar_train, fout)
            pickle.dump(src_tar_pair, fout)
            pickle.dump(brae, fout)
            pickle.dump(np.random.get_state(), fout)
            # Appended after the original five records, so a reader that
            # only loads those five is unaffected.
            pickle.dump(src_phrase2id, fout)
            pickle.dump(tar_phrase2id, fout)
        if end_iter == 1:
            # Preprocess-only run.  NOTE(review): status 1 is kept from the
            # original even though nothing failed.
            sys.exit(1)
    else:
        # Resume.  The original rebuilt the id tables from src_phrases /
        # tar_phrases here, but those names only exist on the fresh-run path,
        # so every resume died with NameError.
        with open(temp_model, 'rb') as fin:
            src_train = pickle.load(fin)
            tar_train = pickle.load(fin)
            src_tar_pair = pickle.load(fin)
            brae = pickle.load(fin)
            np.random.set_state(pickle.load(fin))
            try:
                src_phrase2id = pickle.load(fin)
                tar_phrase2id = pickle.load(fin)
            except EOFError:
                # Checkpoint written before the tables were added.
                sys.stderr.write(
                    "%s has no phrase-id tables; rerun with START_ITER 0 "
                    "to rebuild it\n" % temp_model)
                sys.exit(1)

    train_pair = load_sub_data_pair(train_data, src_phrase2id, tar_phrase2id)
    dev_pair = load_sub_data_pair(dev_data, src_phrase2id, tar_phrase2id)
    test_pair = load_sub_data_pair(test_data, src_phrase2id, tar_phrase2id)

    brae.tune_hyper_parameter(src_train, tar_train, train_pair, dev_pair, test_pair,
                              brae_config, model_name,
                              start_iter=start_iter, end_iter=end_iter)
    brae.save_model("%s.tune.model" % model_name)
def main():
    """Train a BRAE model or run prediction, driven by ``sys.argv``.

    Command line::

        SCRIPT train   CONFIG [START_ITER] [END_ITER]
        SCRIPT predict CONFIG [NUM_PROCESS]

    ``START_ITER`` defaults to 0 and ``END_ITER`` to 26; ``NUM_PROCESS``
    defaults to 0.  The model file name is derived from the hyper-parameters
    read from ``CONFIG``.

    In ``train`` mode with ``START_ITER == 0`` the word maps are loaded, the
    model is built, the phrase list is read, and the word-index sequences,
    phrase pairs, model and NumPy random state are pickled to
    ``model/<train_name>.temp``.  Any other ``START_ITER`` reloads that
    checkpoint.  Training then runs and the result is saved as
    ``model/<train_name>.model``.

    Exits with status 1 on a bad command line, and also (as before) right
    after writing the checkpoint when ``END_ITER == 1``.
    """
    if len(sys.argv) < 3 or sys.argv[1] not in ("train", "predict"):
        # sys.stderr is a file object; calling it raised TypeError instead of
        # printing the message.
        sys.stderr.write("usage: %s train|predict CONFIG "
                         "[START_ITER|NUM_PROCESS] [END_ITER]\n" % sys.argv[0])
        sys.exit(1)
    train_test = sys.argv[1]
    config_name = sys.argv[2]

    forced_decode_data = "../gbrae/data/250w/phrase-table.filtered"
    phrase_data_path = "data/phrase.list"

    brae_config = BRAEConfig(config_name)
    train_name = "dim%d_lrec%f_lsem%f_ll2%f_alpha%f_seed%d_batch%d_min%d_lr%f" % (
        brae_config.dim,
        brae_config.weight_rec,
        brae_config.weight_sem,
        brae_config.weight_l2,
        brae_config.alpha,
        brae_config.random_seed,
        brae_config.batch_size,
        brae_config.min_count,
        brae_config.optimizer.param["lr"],
    )
    model_name = "model/%s" % train_name
    temp_model = model_name + ".temp"

    if train_test == "predict":
        num_process = int(sys.argv[3]) if len(sys.argv) > 3 else 0
        brae_predict(phrase_data_path, train_name + ".pred",
                     model_file="%s.model" % model_name,
                     num_process=num_process)
        return

    # train_test == "train" (validated above)
    start_iter = int(sys.argv[3]) if len(sys.argv) > 3 else 0
    end_iter = int(sys.argv[4]) if len(sys.argv) > 4 else 26
    pre_logger("brae_" + train_name)
    np.random.seed(brae_config.random_seed)

    if start_iter == 0:
        # print("...") with a single string prints identically on Python 2 and 3.
        print("Load Dict ...")
        en_embedding_name = "../gbrae/data/embedding/en.token.dim%d.bin" % brae_config.dim
        zh_embedding_name = "../gbrae/data/embedding/zh.token.dim%d.bin" % brae_config.dim
        tar_word_dict = WordEmbedding.load_word2vec_word_map(en_embedding_name, binary=True, oov=True)
        src_word_dict = WordEmbedding.load_word2vec_word_map(zh_embedding_name, binary=True, oov=True)

        print("Compiling Model ...")
        brae = pre_model(src_word_dict, tar_word_dict, brae_config, verbose=True)

        print("Load All Data ...")
        src_phrases, tar_phrases, src_tar_pair = read_phrase_list(
            forced_decode_data, src_word_dict, tar_word_dict)
        src_train = [p[WORD_INDEX] for p in src_phrases]
        tar_train = [p[WORD_INDEX] for p in tar_phrases]

        print("Write Binary Data ...")
        # Checkpoint everything needed to resume; the load order below must
        # match this dump order.
        with open(temp_model, 'wb') as fout:
            pickle.dump(src_train, fout)
            pickle.dump(tar_train, fout)
            pickle.dump(src_tar_pair, fout)
            pickle.dump(brae, fout)
            pickle.dump(np.random.get_state(), fout)
        if end_iter == 1:
            # Preprocess-only run.  NOTE(review): status 1 is kept from the
            # original even though nothing failed; wrappers may rely on it.
            sys.exit(1)
    else:
        # Resume: restore data, model and RNG state from the checkpoint.
        with open(temp_model, 'rb') as fin:
            src_train = pickle.load(fin)
            tar_train = pickle.load(fin)
            src_tar_pair = pickle.load(fin)
            brae = pickle.load(fin)
            np.random.set_state(pickle.load(fin))

    brae.train(src_train, tar_train, src_tar_pair, brae_config,
               model_name, start_iter, end_iter)
    brae.save_model("%s.model" % model_name)