def test_brae_corpus():
    source_phrase, target_phrase, src_tar_pair, src_word_dict, tar_word_dict = read_forced_decode("../data/fd.txt")
    config = BRAEConfig("../conf/brae.conf")
    src_embedding = WordEmbedding(src_word_dict, dim=50)
    tar_embedding = WordEmbedding(tar_word_dict, dim=50)
    brae = BilingualPhraseRAE(src_embedding, tar_embedding, config=config)
    brae.train_using_lbfgs(source_phrase, target_phrase, src_tar_pair)
def test_something(self):
    bin_word_map = WordEmbedding.load_word2vec_word_map("text.bin", binary=True, unicode_errors='replace')
    embedding = WordEmbedding(bin_word_map, filename="text.bin", unicode_errors='replace')
    self.assertEqual(True, True)
def pre_model(src_dict, tar_dict, config, verbose):
    if rand_word_init:
        src_embedding = WordEmbedding(src_dict, dim=config.dim)
        tar_embedding = WordEmbedding(tar_dict, dim=config.dim)
    else:
        src_embedding = WordEmbedding(src_dict, filename="data/zh.token.dim%d.bin" % config.dim, dim=config.dim)
        tar_embedding = WordEmbedding(tar_dict, filename="data/en.token.dim%d.bin" % config.dim, dim=config.dim)
    return BilingualPhraseRAEISOMAP(src_embedding, tar_embedding, config=config, verbose=verbose)
def pre_model(src_dict, tar_dict, config, verbose):
    if rand_word_init:
        src_embedding = WordEmbedding(src_dict, dim=config.dim)
        tar_embedding = WordEmbedding(tar_dict, dim=config.dim)
    else:
        en_embedding_name = "../gbrae/data/embedding/en.token.dim%d.bin" % config.dim
        zh_embedding_name = "../gbrae/data/embedding/zh.token.dim%d.bin" % config.dim
        src_embedding = WordEmbedding(src_dict, filename=zh_embedding_name, dim=config.dim)
        tar_embedding = WordEmbedding(tar_dict, filename=en_embedding_name, dim=config.dim)
    return BilingualPhraseRAE(src_embedding, tar_embedding, config=config, verbose=verbose)
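# Minimal usage sketch (not part of the original scripts): pre_model is normally
# driven by a BRAEConfig plus word dictionaries loaded from the word2vec binaries,
# as in the training mains further below. The config path is an illustrative placeholder.
def example_pre_model_usage():
    brae_config = BRAEConfig("conf/brae.conf")  # assumed config location
    en_name = "../gbrae/data/embedding/en.token.dim%d.bin" % brae_config.dim
    zh_name = "../gbrae/data/embedding/zh.token.dim%d.bin" % brae_config.dim
    tar_word_dict = WordEmbedding.load_word2vec_word_map(en_name, binary=True, oov=True)
    src_word_dict = WordEmbedding.load_word2vec_word_map(zh_name, binary=True, oov=True)
    return pre_model(src_word_dict, tar_word_dict, brae_config, verbose=True)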
def test_brae():
    np.random.seed(0)
    src_word_idx = {"a": 0, "b": 1, "c": 2, "d": 3, "e": 4}
    src_embedding = WordEmbedding(src_word_idx, dim=3)
    tar_word_idx = {"a": 0, "b": 1, "c": 2, "d": 3, "e": 4}
    tar_embedding = WordEmbedding(tar_word_idx, dim=3)
    src_pos, src_neg = [[1]], [[4]]
    tar_pos, tar_neg = [[1]], [[4]]
    brae = BilingualPhraseRAE(src_embedding, tar_embedding)
    src_pos_nodes, src_pos_seq = brae.source_encoder.generate_node_path(src_pos)
    src_neg_nodes, src_neg_seq = brae.source_encoder.generate_node_path(src_neg)
    tar_pos_nodes, tar_pos_seq = brae.target_encoder.generate_node_path(tar_pos)
    tar_neg_nodes, tar_neg_seq = brae.target_encoder.generate_node_path(tar_neg)
    print brae.compute_result_grad(src_pos_nodes, src_pos_seq, src_neg_nodes, src_neg_seq,
                                   tar_pos_nodes, tar_pos_seq, tar_neg_nodes, tar_neg_seq)
def bi_normal(seed):
    # pre_logger()
    np.random.seed(seed)
    train, dev, test, word_idx = read_sst(u"sst.bi.train", u"sst.bi.dev", u"sst.bi.test")
    # embedding = WordEmbedding(word_idx, filename=u"GoogleNews-vectors-negative300.bin")
    embedding_initializer = UniformInitializer(scale=0.1)
    weight_initializer = GlorotUniformInitializer()
    # embedding = WordEmbedding(word_idx, filename=u"imdb.50.bin", initializer=embedding_initializer)
    embedding = WordEmbedding(word_idx, dim=64, initializer=embedding_initializer)
    from src.recurrent import RecurrentClassifier
    classifier = RecurrentClassifier(embedding, recurrent_encoder=RecurrentNormEncoder,
                                     in_dim=embedding.dim, hidden_dim=64,
                                     initializer=weight_initializer, batch_size=64,
                                     num_label=2, pooling="final", activation="tanh")
    classifier.train(train, dev, test)
def test_rae_sentiment():
    # Note: the test split is reused as the dev set here.
    train, dev, test, word_idx = read_sst(u"E:\\Corpus\\mr\\mr.shuffle.train",
                                          u"E:\\Corpus\\mr\\mr.shuffle.test",
                                          u"E:\\Corpus\\mr\\mr.shuffle.test")
    embedding = WordEmbedding(word_idx, dim=3)  # fname=u"F:\\Corpus\\imdb.50.bin")
    classifier = PhraseRAEClassifier(embedding=embedding, n_out=2, uniform_range=0.01,
                                     normalize=False, weight_rec=0.001, weight_l2=0.01,
                                     dropout=0, verbose=True)
    classifier.fit(train, dev, test)
def pre_model(src_dict, tar_dict, config, verbose):
    if rand_word_init:
        src_embedding = WordEmbedding(src_dict, dim=config.dim)
        tar_embedding = WordEmbedding(tar_dict, dim=config.dim)
    else:
        en_embedding_name = "data/embedding/en.token.min%d.dim%d.bin" % (config.min_count, config.dim)
        zh_embedding_name = "data/embedding/zh.token.min%d.dim%d.bin" % (config.min_count, config.dim)
        src_embedding = WordEmbedding(src_dict, filename=zh_embedding_name, dim=config.dim)
        tar_embedding = WordEmbedding(tar_dict, filename=en_embedding_name, dim=config.dim)
    return BilingualPhraseRAEBiLinear(src_embedding, tar_embedding, config=config, verbose=verbose)
def pre_classifier(word_idx, embedding_name, labels_nums, word_dim, hidden_dims,
                   batch_size, dropout, act):
    hidden_dims = [int(hidden) for hidden in hidden_dims.split("_")]
    embedding_initializer = UniformInitializer(scale=0.1)
    weight_initializer = GlorotUniformInitializer()
    embedding = WordEmbedding(word_idx, dim=word_dim, filename=embedding_name, binary=True,
                              initializer=embedding_initializer, add_unknown_word=True)
    classifier = MultiTaskHierarchicalClassifier(embedding, in_dim=embedding.dim,
                                                 hidden_dims=hidden_dims,
                                                 initializer=weight_initializer,
                                                 batch_size=batch_size, dropout=dropout,
                                                 labels_nums=labels_nums, activation=act)
    return classifier
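# Illustrative call (not in the original file): hidden_dims is an underscore-separated
# string and labels_nums lists the label count per task. The embedding path and the
# concrete sizes below are placeholders, not values taken from the repository.
def example_pre_classifier_usage(word_idx):
    return pre_classifier(word_idx,
                          embedding_name="data/embedding/en.token.dim50.bin",  # assumed path
                          labels_nums=[2, 5],
                          word_dim=50,
                          hidden_dims="100_100",
                          batch_size=64,
                          dropout=0.5,
                          act="tanh")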
def test_cnn():
    import numpy as np
    np.random.seed(0)
    train, dev, test, word_idx = read_sst(u"C:\\Users\\roger\\NLP\\Corpus\\sst_bi\\sst.bi.train",
                                          u"C:\\Users\\roger\\NLP\\Corpus\\sst_bi\\sst.bi.dev",
                                          u"C:\\Users\\roger\\NLP\\Corpus\\sst_bi\\sst.bi.test")
    embedding = WordEmbedding(word_idx, dim=5)  # fname=u"F:\\Corpus\\GoogleNews-vectors-negative300.bin")
    classifier = ShallowCNNClassifier(embedding, n_out=2, verbose=True, weight_l2=0.001)
    classifier.fit(train, dev, test)
    acc, pred = classifier.test(test[0], test[1])
    print acc
def bi():
    pre_logger()
    train, dev, test, word_idx = read_sst(u"sst.bi.train", u"sst.bi.dev", u"sst.bi.test")
    # embedding = WordEmbedding(word_idx, filename=u"GoogleNews-vectors-negative300.bin")
    embedding_initializer = UniformInitializer(scale=0.1)
    weight_initializer = GlorotUniformInitializer()
    # embedding = WordEmbedding(word_idx, filename=u"imdb.50.bin", initializer=embedding_initializer)
    embedding = WordEmbedding(word_idx, dim=50, initializer=embedding_initializer)
    classifier = EmbeddingClassifier(embedding, in_dim=embedding.dim, hidden_dim=50,
                                     initializer=weight_initializer, batch_size=64,
                                     num_label=2, activation="tanh")
    classifier.train(train, dev, test)
def main():
    config_name = sys.argv[1]
    forced_decode_data = "../gbrae/data/250w/tune_hyperparameter/tune.data"
    brae_config = BRAEConfig(config_name)
    train_data = "../gbrae/data/250w/tune_hyperparameter/train/tune.train"
    dev_data = "../gbrae/data/250w/tune_hyperparameter/dev/tune.dev"
    test_data = "../gbrae/data/250w/tune_hyperparameter/test/tune.test"
    train_name = "dim%d_lrec%f_lsem%f_ll2%f_alpha%f_seed%d_batch%d_min%d_lr%f" % (
        brae_config.dim, brae_config.weight_rec, brae_config.weight_sem, brae_config.weight_l2,
        brae_config.alpha, brae_config.random_seed, brae_config.batch_size, brae_config.min_count,
        brae_config.optimizer.param["lr"])
    model_name = "model/%s" % train_name
    temp_model = model_name + ".temp"
    start_iter = int(sys.argv[3]) if len(sys.argv) > 3 else 0
    end_iter = int(sys.argv[4]) if len(sys.argv) > 4 else 26
    pre_logger("brae_" + train_name)
    np.random.seed(brae_config.random_seed)
    if start_iter == 0:
        print "Load Dict ..."
        en_embedding_name = "../gbrae/data/embedding/en.token.dim%d.bin" % brae_config.dim
        zh_embedding_name = "../gbrae/data/embedding/zh.token.dim%d.bin" % brae_config.dim
        tar_word_dict = WordEmbedding.load_word2vec_word_map(en_embedding_name, binary=True, oov=True)
        src_word_dict = WordEmbedding.load_word2vec_word_map(zh_embedding_name, binary=True, oov=True)
        print "Compiling Model ..."
        brae = pre_model(src_word_dict, tar_word_dict, brae_config, verbose=True)
        print "Load All Data ..."
        src_phrases, tar_phrases, src_tar_pair = read_phrase_list(forced_decode_data, src_word_dict, tar_word_dict)
        src_train = [p[WORD_INDEX] for p in src_phrases]
        tar_train = [p[WORD_INDEX] for p in tar_phrases]
        print "Write Binary Data ..."
        with open(temp_model, 'wb') as fout:
            pickle.dump(src_train, fout)
            pickle.dump(tar_train, fout)
            pickle.dump(src_tar_pair, fout)
            pickle.dump(brae, fout)
            pickle.dump(np.random.get_state(), fout)
        if end_iter == 1:
            exit(1)
    else:
        # Note: src_phrases / tar_phrases are only built in the start_iter == 0 branch;
        # resuming (start_iter > 0) will fail at the phrase2id loops below unless
        # the phrase lists are re-read.
        with open(temp_model, 'rb') as fin:
            src_train = pickle.load(fin)
            tar_train = pickle.load(fin)
            src_tar_pair = pickle.load(fin)
            brae = pickle.load(fin)
            np.random.set_state(pickle.load(fin))
    src_phrase2id = dict()
    tar_phrase2id = dict()
    for phrase, i in zip(src_phrases, xrange(len(src_phrases))):
        src_phrase2id[phrase[TEXT_INDEX]] = i
    for phrase, i in zip(tar_phrases, xrange(len(tar_phrases))):
        tar_phrase2id[phrase[TEXT_INDEX]] = i
    train_pair = load_sub_data_pair(train_data, src_phrase2id, tar_phrase2id)
    dev_pair = load_sub_data_pair(dev_data, src_phrase2id, tar_phrase2id)
    test_pair = load_sub_data_pair(test_data, src_phrase2id, tar_phrase2id)
    brae.tune_hyper_parameter(src_train, tar_train, train_pair, dev_pair, test_pair, brae_config,
                              model_name, start_iter=start_iter, end_iter=end_iter)
    brae.save_model("%s.tune.model" % model_name)
def main():
    min_count = int(sys.argv[1])
    dim = 50
    '''
    forced_decode_data = "data/brae.train.data"
    src_count_path = "data/src.trans.data"
    tar_count_path = "data/tar.trans.data"
    tar_para_path = "data/tar.para.data"
    src_para_path = "data/src.para.data"
    gbrae_data_name = "model/gbrae.data.min.count.%d.pkl" % min_count
    gbrae_dict_name = "model/gbrae.dict.min.count.%d.pkl" % min_count
    gbrae_phrase_dict_name = "model/gbrae.phrase.text.dict.pkl"
    '''
    forced_decode_data = "data/250w/tune_hyperparameter/tune.data"
    src_count_path = "data/250w/tune_hyperparameter/tune.data"
    # tar_count_path = "data/250w/phrase-table.filtered"
    tar_para_path = "data/250w/enBP_alignPhraProb.xml"
    src_para_path = "data/250w/chBP_alignPhraProb.xml"
    gbrae_data_name = "data/250w/tune_hyperparameter/gbrae.data.tune.min.count.%d.pkl" % min_count
    gbrae_dict_name = "data/250w/tune_hyperparameter/train/gbrae.dict.tune.min.count.%d.pkl" % min_count
    gbrae_phrase_dict_name = "data/250w/tune_hyperparameter/gbrae.tune.phrase.text.dict.pkl"
    print "Load Word Dict ..."
    en_embedding_name = "data/embedding/en.token.dim%d.bin" % dim
    zh_embedding_name = "data/embedding/zh.token.dim%d.bin" % dim
    tar_word_dict = WordEmbedding.load_word2vec_word_map(en_embedding_name, binary=True, oov=True)
    src_word_dict = WordEmbedding.load_word2vec_word_map(zh_embedding_name, binary=True, oov=True)
    print "Load All Data ..."
    src_phrases, tar_phrases, src_tar_pair = read_phrase_list(forced_decode_data, src_word_dict, tar_word_dict)
    print "Load Para Data ..."
    src_phrases = read_para_list(src_para_path, src_phrases, src_word_dict)
    tar_phrases = read_para_list(tar_para_path, tar_phrases, tar_word_dict)
    print "Load Trans Data ..."
    src_phrases, tar_phrases = read_trans_list(src_count_path, src_phrases, tar_phrases,
                                               src_word_dict, tar_word_dict)
    # tar_phrases, src_phrases = read_trans_list(tar_count_path, tar_phrases, src_phrases,
    #                                            tar_word_dict, src_word_dict)
    src_phrase2id = dict()
    tar_phrase2id = dict()
    for phrase, i in zip(src_phrases, xrange(len(src_phrases))):
        src_phrase2id[phrase[TEXT_INDEX]] = i
    for phrase, i in zip(tar_phrases, xrange(len(tar_phrases))):
        tar_phrase2id[phrase[TEXT_INDEX]] = i
    src_phrases = clean_text(src_phrases)
    tar_phrases = clean_text(tar_phrases)
    with open(gbrae_dict_name, 'wb') as fout:
        print "Write Word Dict ..."
        pickle.dump(src_word_dict, fout)
        pickle.dump(tar_word_dict, fout)
    with open(gbrae_data_name, 'wb') as fout:
        print "Write Source Phrases Data ..."
        pickle.dump(src_phrases, fout)
        print "Write Target Phrases Data ..."
        pickle.dump(tar_phrases, fout)
        pickle.dump(src_tar_pair, fout)
    with open(gbrae_phrase_dict_name, 'wb') as fout:
        print "Write Source Phrases Dictionary ..."
        pickle.dump(src_phrase2id, fout)
        print "Write Target Phrases Dictionary ..."
        pickle.dump(tar_phrase2id, fout)
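# Complementary sketch (an assumed helper, not part of the original script): the
# pickled files written by main() above can be read back in the same dump order.
def example_load_gbrae_pickles(gbrae_dict_name, gbrae_data_name, gbrae_phrase_dict_name):
    with open(gbrae_dict_name, 'rb') as fin:
        src_word_dict = pickle.load(fin)
        tar_word_dict = pickle.load(fin)
    with open(gbrae_data_name, 'rb') as fin:
        src_phrases = pickle.load(fin)
        tar_phrases = pickle.load(fin)
        src_tar_pair = pickle.load(fin)
    with open(gbrae_phrase_dict_name, 'rb') as fin:
        src_phrase2id = pickle.load(fin)
        tar_phrase2id = pickle.load(fin)
    return (src_word_dict, tar_word_dict, src_phrases, tar_phrases,
            src_tar_pair, src_phrase2id, tar_phrase2id)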
def main(_):
    phrase_file = FLAGS.phrase_file
    src_para_file = FLAGS.src_para
    tar_para_file = FLAGS.tar_para
    trans_file = FLAGS.trans_file
    src_phrase_list, tar_phrase_list, bi_phrase_list, src_word_idx, tar_word_idx = prepare_data(
        phrase_file, src_para_file, tar_para_file, trans_file)
    ssbrae_config = SSBRAEConfig(FLAGS.config_name)
    src_word_embedding = WordEmbedding(src_word_idx, dim=50, name="src_word_embedding")
    tar_word_embedding = WordEmbedding(tar_word_idx, dim=50, name="tar_word_embedding")
    sess = tf.Session()
    ssbrae_encoder = SSBRAEEncoder(
        src_word_embedding, tar_word_embedding, ssbrae_config.activation, ssbrae_config.normalize,
        ssbrae_config.weight_rec, ssbrae_config.weight_sem, ssbrae_config.weight_embedding,
        ssbrae_config.alpha, ssbrae_config.beta, ssbrae_config.max_src_len, ssbrae_config.max_tar_len,
        ssbrae_config.n_epoch, ssbrae_config.batch_size, ssbrae_config.dropout,
        ssbrae_config.optimizer_config, ssbrae_config.para, ssbrae_config.trans,
        ssbrae_config.para_num, ssbrae_config.trans_num, sess)
    # Hold out the last two batches of bilingual phrase pairs for validation and test.
    train_phrase_list = bi_phrase_list[:-2 * ssbrae_encoder.batch_size]
    valid_phrase_list = bi_phrase_list[-2 * ssbrae_encoder.batch_size:-ssbrae_encoder.batch_size]
    test_phrase_list = bi_phrase_list[-ssbrae_encoder.batch_size:]
    pre_logger("ssbrae")
    logger.info("Now train ssbrae encoder\n")
    for i in range(ssbrae_encoder.n_epoch):
        logger.info("Now train ssbrae encoder epoch %d\n" % i)
        start_time = time.time()
        losses = []
        train_phrase_index = get_train_sequence(train_phrase_list, ssbrae_encoder.batch_size)
        num_batches = int(len(train_phrase_index) / ssbrae_encoder.batch_size)
        for j in range(num_batches):
            (src_pos, tar_pos, src_neg, tar_neg, src_para, tar_para, src_para_weight, tar_para_weight,
             src_tar_trans, tar_src_trans, src_tar_trans_weight, tar_src_trans_weight) = ssbrae_encoder.get_batch(
                src_phrase_list, tar_phrase_list, train_phrase_list, train_phrase_index,
                src_word_idx, tar_word_idx, j)
            result = ssbrae_encoder.ssbrae_train_step(
                src_pos, tar_pos, src_neg, tar_neg, src_para, tar_para, src_para_weight, tar_para_weight,
                src_tar_trans, tar_src_trans, src_tar_trans_weight, tar_src_trans_weight)
            if ssbrae_encoder.para and ssbrae_encoder.trans:
                logger.info("train ssbrae_para epoch %d, step %d, total loss:%f, loss_l2: %f, loss_rec: %f,"
                            "loss_sem:%f, loss_para:%f, loss_trans:%f\n"
                            % (i, j, result[1], result[2], result[3], result[4], result[5], result[6]))
            elif ssbrae_encoder.para and not ssbrae_encoder.trans:
                logger.info("train ssbrae_para epoch %d, step %d, total loss:%f, loss_l2: %f, loss_rec: %f,"
                            "loss_sem:%f, loss_para:%f\n"
                            % (i, j, result[1], result[2], result[3], result[4], result[5]))
            elif ssbrae_encoder.trans and not ssbrae_encoder.para:
                logger.info("train ssbrae_para epoch %d, step %d, total loss:%f, loss_l2: %f, loss_rec: %f,"
                            "loss_sem:%f, loss_trans:%f\n"
                            % (i, j, result[1], result[2], result[3], result[4], result[5]))
            else:
                raise ValueError("No such configuration")
            losses.append(result[1:])
        use_time = time.time() - start_time
        valid_phrase_index = get_train_sequence(valid_phrase_list, ssbrae_encoder.batch_size)
        num_batches = int(len(valid_phrase_index) / ssbrae_encoder.batch_size)
        dev_loss = []
        for j in range(num_batches):
            (src_pos, tar_pos, src_neg, tar_neg, src_para, tar_para, src_para_weight, tar_para_weight,
             src_tar_trans, tar_src_trans, src_tar_trans_weight, tar_src_trans_weight) = ssbrae_encoder.get_batch(
                src_phrase_list, tar_phrase_list, valid_phrase_list, valid_phrase_index,
                src_word_idx, tar_word_idx, j)
            dev_loss.append(ssbrae_encoder.ssbrae_predict_step(
                src_pos, tar_pos, src_neg, tar_neg, src_para, tar_para, src_para_weight, tar_para_weight,
                src_tar_trans, tar_src_trans, src_tar_trans_weight, tar_src_trans_weight))
        logger.info("train ssbrae encoder epoch %d, use time:%d\n" % (i, use_time))
        ave_train_loss = np.average(losses, axis=0)
        ave_dev_loss = np.average(dev_loss, axis=0)
        if ssbrae_encoder.para and ssbrae_encoder.trans:
            logger.info("train: total loss:%f, l2 loss:%f, rec loss:%f, sem loss:%f, para loss:%f, trans loss:%f\n"
                        % (ave_train_loss[0], ave_train_loss[1], ave_train_loss[2], ave_train_loss[3],
                           ave_train_loss[4], ave_train_loss[5]))
            logger.info("dev: total loss:%f, l2 loss:%f, rec loss:%f, sem loss:%f, para loss:%f, trans loss:%f"
                        % (ave_dev_loss[0], ave_dev_loss[1], ave_dev_loss[2], ave_dev_loss[3],
                           ave_dev_loss[4], ave_dev_loss[5]))
        elif ssbrae_encoder.para and not ssbrae_encoder.trans:
            logger.info("train: total loss:%f, l2 loss:%f, rec loss:%f, sem loss:%f, para loss:%f\n"
                        % (ave_train_loss[0], ave_train_loss[1], ave_train_loss[2], ave_train_loss[3],
                           ave_train_loss[4]))
            logger.info("dev: total loss:%f, l2 loss:%f, rec loss:%f, sem loss:%f, para loss:%f"
                        % (ave_dev_loss[0], ave_dev_loss[1], ave_dev_loss[2], ave_dev_loss[3], ave_dev_loss[4]))
        elif ssbrae_encoder.trans and not ssbrae_encoder.para:
            logger.info("train: total loss:%f, l2 loss:%f, rec loss:%f, sem loss:%f, trans loss:%f\n"
                        % (ave_train_loss[0], ave_train_loss[1], ave_train_loss[2], ave_train_loss[3],
                           ave_train_loss[4]))
            logger.info("dev: total loss:%f, l2 loss:%f, rec loss:%f, sem loss:%f, trans loss:%f"
                        % (ave_dev_loss[0], ave_dev_loss[1], ave_dev_loss[2], ave_dev_loss[3], ave_dev_loss[4]))
        checkpoint_path = os.path.join(FLAGS.train_dir, "ssbare_encoder.epoch%d.ckpt" % i)
        # ssbrae_encoder.saver.save(ssbrae_encoder.sess, checkpoint_path, global_step=ssbrae_encoder.global_step)
        ssbrae_encoder.saver.save(ssbrae_encoder.sess, checkpoint_path)
    test_phrase_index = get_train_sequence(test_phrase_list, ssbrae_encoder.batch_size)
    num_batches = int(len(test_phrase_index) / ssbrae_encoder.batch_size)
    test_loss = []
    for j in range(num_batches):
        (src_pos, tar_pos, src_neg, tar_neg, src_para, tar_para, src_para_weight, tar_para_weight,
         src_tar_trans, tar_src_trans, src_tar_trans_weight, tar_src_trans_weight) = ssbrae_encoder.get_batch(
            src_phrase_list, tar_phrase_list, test_phrase_list, test_phrase_index,
            src_word_idx, tar_word_idx, j)
        test_loss.append(ssbrae_encoder.ssbrae_predict_step(
            src_pos, tar_pos, src_neg, tar_neg, src_para, tar_para, src_para_weight, tar_para_weight,
            src_tar_trans, tar_src_trans, src_tar_trans_weight, tar_src_trans_weight))
    ave_test_loss = np.average(test_loss, axis=0)
    if ssbrae_encoder.para and ssbrae_encoder.trans:
        logger.info("test: total loss:%f, l2 loss:%f, rec loss:%f, sem loss:%f, para loss:%f, trans loss:%f"
                    % (ave_test_loss[0], ave_test_loss[1], ave_test_loss[2], ave_test_loss[3],
                       ave_test_loss[4], ave_test_loss[5]))
    elif ssbrae_encoder.para and not ssbrae_encoder.trans:
        logger.info("test: total loss:%f, l2 loss:%f, rec loss:%f, sem loss:%f, para loss:%f"
                    % (ave_test_loss[0], ave_test_loss[1], ave_test_loss[2], ave_test_loss[3], ave_test_loss[4]))
    elif ssbrae_encoder.trans and not ssbrae_encoder.para:
        logger.info("test: total loss:%f, l2 loss:%f, rec loss:%f, sem loss:%f, trans loss:%f"
                    % (ave_test_loss[0], ave_test_loss[1], ave_test_loss[2], ave_test_loss[3], ave_test_loss[4]))
def main():
    train_test = sys.argv[1]
    if train_test not in ["train", "predict"]:
        sys.stderr.write("train or predict\n")
        exit(1)
    config_name = sys.argv[2]
    forced_decode_data = "../gbrae/data/250w/phrase-table.filtered"
    phrase_data_path = "data/phrase.list"
    brae_config = BRAEConfig(config_name)
    train_name = "dim%d_lrec%f_lsem%f_ll2%f_alpha%f_seed%d_batch%d_min%d_lr%f" % (
        brae_config.dim, brae_config.weight_rec, brae_config.weight_sem, brae_config.weight_l2,
        brae_config.alpha, brae_config.random_seed, brae_config.batch_size, brae_config.min_count,
        brae_config.optimizer.param["lr"])
    model_name = "model/%s" % train_name
    temp_model = model_name + ".temp"
    if train_test == "train":
        start_iter = int(sys.argv[3]) if len(sys.argv) > 3 else 0
        end_iter = int(sys.argv[4]) if len(sys.argv) > 4 else 26
        pre_logger("brae_" + train_name)
        np.random.seed(brae_config.random_seed)
        if start_iter == 0:
            print "Load Dict ..."
            en_embedding_name = "../gbrae/data/embedding/en.token.dim%d.bin" % brae_config.dim
            zh_embedding_name = "../gbrae/data/embedding/zh.token.dim%d.bin" % brae_config.dim
            tar_word_dict = WordEmbedding.load_word2vec_word_map(en_embedding_name, binary=True, oov=True)
            src_word_dict = WordEmbedding.load_word2vec_word_map(zh_embedding_name, binary=True, oov=True)
            print "Compiling Model ..."
            brae = pre_model(src_word_dict, tar_word_dict, brae_config, verbose=True)
            print "Load All Data ..."
            src_phrases, tar_phrases, src_tar_pair = read_phrase_list(forced_decode_data,
                                                                      src_word_dict, tar_word_dict)
            src_train = [p[WORD_INDEX] for p in src_phrases]
            tar_train = [p[WORD_INDEX] for p in tar_phrases]
            print "Write Binary Data ..."
            with open(temp_model, 'wb') as fout:
                pickle.dump(src_train, fout)
                pickle.dump(tar_train, fout)
                pickle.dump(src_tar_pair, fout)
                pickle.dump(brae, fout)
                pickle.dump(np.random.get_state(), fout)
            if end_iter == 1:
                exit(1)
        else:
            with open(temp_model, 'rb') as fin:
                src_train = pickle.load(fin)
                tar_train = pickle.load(fin)
                src_tar_pair = pickle.load(fin)
                brae = pickle.load(fin)
                np.random.set_state(pickle.load(fin))
        brae.train(src_train, tar_train, src_tar_pair, brae_config, model_name, start_iter, end_iter)
        brae.save_model("%s.model" % model_name)
    elif train_test == "predict":
        num_process = int(sys.argv[3]) if len(sys.argv) > 3 else 0
        brae_predict(phrase_data_path, train_name + ".pred", model_file="%s.model" % model_name,
                     num_process=num_process)
    else:
        sys.stderr.write("train or predict\n")
        exit(1)
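# Assumed invocation sketches for main() above (the script name is a placeholder).
# For "train" the optional arguments are the start and end iteration (defaults 0 and 26);
# for "predict" the optional argument is the number of worker processes.
#   python brae_main.py train conf/brae.conf 0 26
#   python brae_main.py train conf/brae.conf 5        # resume from the pickled temp model
#   python brae_main.py predict conf/brae.conf 4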
def main(_):
    phrase_file = FLAGS.phrase_file
    src_para_file = FLAGS.src_para_file
    tar_para_file = FLAGS.tar_para_file
    trans_file = FLAGS.trans_file
    src_phrase_list, tar_phrase_list, bi_phrase_list, src_word_idx, tar_word_idx = prepare_data(
        phrase_file, src_para_file, tar_para_file, trans_file)
    # src rae encoder
    src_config_name = FLAGS.src_config_name
    src_rae_config = BRAEConfig(src_config_name)
    src_embedding = WordEmbedding(src_word_idx, dim=50, name="src_embedding")
    sess = tf.Session()
    src_rae_encoder = RAEEncoder(src_rae_config.activation, src_embedding, src_rae_config.normalize,
                                 src_rae_config.weight_rec, src_rae_config.weight_embedding,
                                 src_rae_config.n_epoch, src_rae_config.max_src_len,
                                 src_rae_config.batch_size, src_rae_config.dropout,
                                 src_rae_config.optimizer_config, sess, name="rae_encoder")
    # tar rae encoder
    tar_config_name = FLAGS.tar_config_name
    tar_rae_config = BRAEConfig(tar_config_name)
    tar_embedding = WordEmbedding(tar_word_idx, dim=50, name="tar_embedding")
    tar_rae_encoder = RAEEncoder(tar_rae_config.activation, tar_embedding, tar_rae_config.normalize,
                                 tar_rae_config.weight_rec, tar_rae_config.weight_embedding,
                                 tar_rae_config.n_epoch, tar_rae_config.max_tar_len,
                                 tar_rae_config.batch_size, tar_rae_config.dropout,
                                 tar_rae_config.optimizer_config, sess, name="tar_rae_encoder")
    # Hold out the last two batches of source phrases for validation and test.
    train_phrase_list = src_phrase_list[:-2 * src_rae_config.batch_size]
    valid_phrase_list = src_phrase_list[-2 * src_rae_config.batch_size:-src_rae_config.batch_size]
    test_phrase_list = src_phrase_list[-src_rae_config.batch_size:]
    logger.info("Now train the src rae encoder:\n")
    for i in range(src_rae_encoder.n_epoch):
        logger.info("Now train src rae epoch %d\n" % (i + 1))
        start_time = time.time()
        src_train_index = get_train_sequence(train_phrase_list, src_rae_encoder.batch_size)
        batch_number = int(len(src_train_index) / src_rae_encoder.batch_size)
        losses = []
        for j in range(batch_number):
            inputs = src_rae_encoder.get_batch(train_phrase_list, src_train_index, j, src_word_idx)
            loss = src_rae_encoder.train_step(inputs)
            logging.info("src rae epoch %d, step %d, loss: %f\n" % (i, j, loss))
            losses.append(loss)
        src_valid_index = get_train_sequence(valid_phrase_list, src_rae_encoder.batch_size)
        valid_batches = int(len(src_valid_index) / src_rae_encoder.batch_size)
        dev_loss = []
        for j in range(valid_batches):
            inputs = src_rae_encoder.get_batch(valid_phrase_list, src_valid_index, j, src_word_idx)
            dev_loss.append(src_rae_encoder.predict_step(inputs))
        use_time = time.time() - start_time
        logger.info("src rae epoch %d, time: %d, train loss:%f, development loss:%f\n"
                    % (i, use_time, sess.run(tf.reduce_mean(losses)), sess.run(tf.reduce_mean(dev_loss))))
        checkpoint_path = os.path.join(FLAGS.train_dir, "src_rae.epoch%d.ckpt" % i)
        src_rae_encoder.saver.save(src_rae_encoder.sess, checkpoint_path,
                                   global_step=src_rae_encoder.global_step)
    src_test_index = get_train_sequence(test_phrase_list, src_rae_encoder.batch_size)
    test_batches = int(len(src_test_index) / src_rae_encoder.batch_size)
    test_loss = []
    for j in range(test_batches):
        inputs = src_rae_encoder.get_batch(test_phrase_list, src_test_index, j, src_word_idx)
        test_loss.append(src_rae_encoder.predict_step(inputs))
    logger.info("src test loss : %f\n" % sess.run(tf.reduce_mean(test_loss)))
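# Hedged sketch (an assumed helper, not in the original script): restore the most
# recent src_rae checkpoint written by main() above before running predictions.
# It relies only on FLAGS.train_dir and the RAEEncoder.saver attribute already used there.
def example_restore_src_rae(src_rae_encoder, sess):
    ckpt = tf.train.latest_checkpoint(FLAGS.train_dir)
    if ckpt is not None:
        src_rae_encoder.saver.restore(sess, ckpt)
    return ckpt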
def __init__(self, entity_index, relation_index, entity_dim=100, k=100,
             initializer=default_initializer, regularization_weight=0.0001):
    self.relation_num = len(relation_index)
    self.scorer = SingleLayerModel(entity_dim=entity_dim, relation_num=self.relation_num, k=k,
                                   initializer=UniformInitializer(scale=1 / np.sqrt(entity_dim * 2)))
    self.entity_embedding = WordEmbedding(entity_index, dim=entity_dim, initializer=initializer)
    self.regularization_weight = regularization_weight
    # Single-instance max-margin loss: (e1, e2) is a true pair, (e1, ec) a corrupted one.
    self.e1_index = T.lscalar()
    self.e2_index = T.lscalar()
    self.ec_index = T.lscalar()
    self.relation_index = T.lscalar()
    self.pos_score = self.scorer.score(self.entity_embedding[self.e1_index],
                                       self.entity_embedding[self.e2_index],
                                       self.relation_index)
    self.neg_score = self.scorer.score(self.entity_embedding[self.e1_index],
                                       self.entity_embedding[self.ec_index],
                                       self.relation_index)
    self.loss_max_margin = T.maximum(0.0, self.neg_score - self.pos_score + 1.0)
    # Mini-batch variant over vectors of entity and relation indices.
    self.e1_index_batch = T.lvector()
    self.e2_index_batch = T.lvector()
    self.ec_index_batch = T.lvector()
    self.relation_index_batch = T.lvector()
    self.pos_score_batch = self.scorer.score_batch(self.entity_embedding[self.e1_index_batch],
                                                   self.entity_embedding[self.e2_index_batch],
                                                   self.relation_index_batch)
    self.neg_score_batch = self.scorer.score_batch(self.entity_embedding[self.e1_index_batch],
                                                   self.entity_embedding[self.ec_index_batch],
                                                   self.relation_index_batch)
    self.loss_max_margin_batch = T.sum(T.maximum(0.0, self.neg_score_batch - self.pos_score_batch + 1.0))
    # Batch variant restricted to a single relation.
    self.pos_score_relation = self.scorer.score_one_relation(self.entity_embedding[self.e1_index_batch],
                                                             self.entity_embedding[self.e2_index_batch],
                                                             self.relation_index)
    self.neg_score_relation = self.scorer.score_one_relation(self.entity_embedding[self.e1_index_batch],
                                                             self.entity_embedding[self.ec_index_batch],
                                                             self.relation_index)
    self.loss_max_margin_relation = T.sum(T.maximum(0.0, self.neg_score_relation - self.pos_score_relation + 1.0))
    self.params = self.entity_embedding.params + self.scorer.params
    self.l2_norm = self.entity_embedding.l2_norm + self.scorer.l2_norm
    self.l2_loss = self.regularization_weight * self.l2_norm / 2
    sgd_optimizer = AdaDeltaOptimizer(lr=0.95, norm_lim=-1)
    self.loss = self.loss_max_margin + self.l2_loss
    updates = sgd_optimizer.get_update(self.loss, self.params)
    self.loss_batch = self.loss_max_margin_batch + self.l2_loss
    updates_batch = sgd_optimizer.get_update(self.loss_batch, self.params)
    grad_margin_relation = T.grad(self.loss_max_margin_relation, self.params)
    grad_l2 = T.grad(self.l2_loss, self.params)
    self.train_one_instance = theano.function(
        inputs=[self.e1_index, self.e2_index, self.ec_index, self.relation_index],
        outputs=[self.loss, self.loss_max_margin, self.l2_loss],
        updates=updates)
    self.score_one_instance = theano.function(
        inputs=[self.e1_index, self.e2_index, self.relation_index],
        outputs=[self.pos_score])
    self.train_batch_instance = theano.function(
        inputs=[self.e1_index_batch, self.e2_index_batch, self.ec_index_batch, self.relation_index_batch],
        outputs=[self.loss_batch, self.loss_max_margin_batch, self.l2_loss],
        updates=updates_batch)
    self.score_batch_instance = theano.function(
        inputs=[self.e1_index_batch, self.e2_index_batch, self.relation_index_batch],
        outputs=self.pos_score_batch)
    self.grad_relation_margin = theano.function(
        inputs=[self.e1_index_batch, self.e2_index_batch, self.ec_index_batch, self.relation_index],
        outputs=[self.loss_max_margin_relation] + grad_margin_relation)
    self.forward_relation_margin = theano.function(
        inputs=[self.e1_index_batch, self.e2_index_batch, self.ec_index_batch, self.relation_index],
        outputs=[self.loss_max_margin_relation])
    self.grad_l2 = theano.function(inputs=[], outputs=[self.l2_loss] + grad_l2)
    self.forward_l2 = theano.function(inputs=[], outputs=[self.l2_loss])
    self.score_relation_instance = theano.function(
        inputs=[self.e1_index_batch, self.e2_index_batch, self.relation_index],
        outputs=self.pos_score_relation)
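# Hedged usage sketch (not in the original class): the compiled Theano functions take
# int64 index arrays for the T.lvector inputs and plain Python ints for the T.lscalar
# inputs. The `model` instance and the indices below are illustrative placeholders.
#
# e1 = np.asarray([0, 1, 2], dtype='int64')   # head entity ids
# e2 = np.asarray([3, 4, 5], dtype='int64')   # true tail entity ids
# ec = np.asarray([5, 3, 4], dtype='int64')   # corrupted tail entity ids
# rel = np.asarray([0, 0, 1], dtype='int64')  # relation id per triple
# loss, margin_loss, l2_loss = model.train_batch_instance(e1, e2, ec, rel)
# scores = model.score_batch_instance(e1, e2, rel)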
def __init__(self, key_index, label_num, pretrain_name=None, encoder='lstm', word_dim=300,
             hidden='100_100', dropout=0.5, regularization_weight=0.0001,
             optimizer_name='adagrad', lr=0.1, norm_lim=-1, label2index_filename=None):
    self.label2index, self.index2label = self.load_label_index(label2index_filename, label_num)
    self.indexs = T.imatrix()   # (batch, max_len)
    self.golden = T.ivector()   # (batch, )
    self.max_len = T.iscalar()  # max length
    self.s1_mask = self.indexs[:, :self.max_len] > 0
    self.s1_mask = self.s1_mask * T.constant(1.0, dtype=theano.config.floatX)
    if pretrain_name is None:
        self.embedding = WordEmbedding(key_index, dim=word_dim,
                                       initializer=UniformInitializer(scale=0.01))
    else:
        self.embedding = WordEmbedding(key_index, filename=pretrain_name, normalize=False, binary=True)
        assert self.embedding.dim == word_dim
    self.word_embeddings = self.embedding[self.indexs[:, :self.max_len]]
    if type(hidden) is str:
        hidden_dims = [int(hid) for hid in hidden.split('_')]
    else:
        hidden_dims = [hidden]
    if encoder == 'lstm':
        encoder_layer = LSTMEncoder(in_dim=word_dim, hidden_dim=hidden_dims[0], pooling='final',
                                    prefix="LSTM_", dropout=dropout)
    elif encoder == 'bilstm':
        encoder_layer = BiLSTMEncoder(in_dim=word_dim, hidden_dim=hidden_dims[0], pooling='final',
                                      prefix="BiLSTM_", bidirection_shared=True, dropout=dropout)
    elif encoder == 'recurrent':
        encoder_layer = RecurrentEncoder(in_dim=word_dim, hidden_dim=hidden_dims[0], pooling='final',
                                         prefix="Recurrent_", dropout=dropout)
    elif encoder == 'birecurrent':
        encoder_layer = BiRecurrentEncoder(in_dim=word_dim, hidden_dim=hidden_dims[0], pooling='final',
                                           prefix="BiRecurrent_", bidirection_shared=True, dropout=dropout)
    elif encoder == 'gru':
        encoder_layer = GRUEncoder(in_dim=word_dim, hidden_dim=hidden_dims[0], pooling='final',
                                   prefix="GRU_", dropout=dropout)
    elif encoder == 'bigru':
        encoder_layer = BiGRUEncoder(in_dim=word_dim, hidden_dim=hidden_dims[0], pooling='final',
                                     prefix="BiGRU_", bidirection_shared=True, dropout=dropout)
    elif encoder == 'cbow':
        encoder_layer = CBOWLayer(in_dim=word_dim)
    elif encoder == 'cnn':
        encoder_layer = MultiFilterConvolutionLayer(in_dim=word_dim, hidden_dim=hidden_dims[0],
                                                    pooling='max', prefix="ConvLayer_",
                                                    kernel_sizes=CONV_FILTER_SIZES)
    else:
        raise NotImplementedError
    self.text_embedding = encoder_layer.forward_batch(self.word_embeddings, self.s1_mask)
    if len(hidden_dims) > 1:
        hidden_layer = MultiHiddenLayer(in_dim=encoder_layer.out_dim, hidden_dims=hidden_dims[1:],
                                        dropout=dropout, prefix='Full_Connected_Layer_')
        classifier_input = hidden_layer.forward_batch(self.text_embedding)
        classifier_input_dim = hidden_layer.out_dim
    else:
        classifier_input = self.text_embedding
        classifier_input_dim = encoder_layer.out_dim
    self.classifier = SoftmaxClassifier(classifier_input_dim, label_num, dropout=dropout)
    self.predict_loss = self.classifier.loss(classifier_input, self.golden)
    self.predict_prob = self.classifier.forward_batch(classifier_input)
    self.predict_label = T.argmax(self.predict_prob, axis=1)
    """Params in TextClassifier"""
    self.params = self.classifier.params + encoder_layer.params
    self.l2_norm = self.classifier.l2_norm + encoder_layer.l2_norm
    if len(hidden_dims) > 1:
        self.params += hidden_layer.params
        self.l2_norm += hidden_layer.l2_norm
    self.l2_loss = regularization_weight * self.l2_norm / 2
    self.loss = self.predict_loss + self.l2_loss
    """Optimizer and Loss"""
    if optimizer_name == 'adagrad':
        sgd_optimizer = AdaGradOptimizer(lr=lr, norm_lim=norm_lim)
    elif optimizer_name == 'adadelta':
        sgd_optimizer = AdaDeltaOptimizer(lr=lr, norm_lim=norm_lim)
    elif optimizer_name == 'sgd':
        sgd_optimizer = SGDOptimizer(lr=lr, norm_lim=norm_lim)
    elif optimizer_name == 'momentum':
        sgd_optimizer = SGDMomentumOptimizer(lr=lr, norm_lim=norm_lim)
    elif optimizer_name == 'adam':
        sgd_optimizer = AdamOptimizer(lr=lr, norm_lim=norm_lim)
    else:
        raise NotImplementedError
    self.train_indexs = T.ivector()
    self.train_data_x = shared_zero_matrix(shape=(5, 5), name="train_data_x", dtype=np.int32)
    self.train_data_y = shared_zero_matrix(shape=(5,), name="train_data_y", dtype=np.int32)
    self.model_params = self.params + self.embedding.params
    """Theano Function"""
    if EMBEDDING_LR > 0:
        # Optimize the embedding with its own SGD learning rate.
        embedding_updates = SGDOptimizer(lr=EMBEDDING_LR, norm_lim=-1).get_update(
            self.loss, self.embedding.params)
        updates = sgd_optimizer.get_update(self.loss, self.params, norm_exc_params=self.embedding.params)
        updates.update(embedding_updates)
    elif EMBEDDING_LR < 0:
        # Optimize Embedding using Global Optimizer
        self.params += self.embedding.params
        updates = sgd_optimizer.get_update(self.loss, self.params, norm_exc_params=self.embedding.params)
    else:
        # Fix Embedding
        updates = sgd_optimizer.get_update(self.loss, self.params, norm_exc_params=self.embedding.params)
    self.train_batch = theano.function(
        inputs=[self.train_indexs, self.max_len],
        outputs=[self.loss, self.predict_loss, self.l2_loss],
        updates=updates,
        givens=[(self.indexs, self.train_data_x[self.train_indexs]),
                (self.golden, self.train_data_y[self.train_indexs])])
    self.loss_batch = theano.function(
        inputs=[self.indexs, self.golden, self.max_len],
        outputs=[self.loss, self.predict_loss, self.l2_loss])
    self.pred_prob_batch = theano.function(
        inputs=[self.indexs, self.max_len],
        outputs=[self.predict_prob])
    self.pred_label_batch = theano.function(
        inputs=[self.indexs, self.max_len],
        outputs=[self.predict_label])
    self.get_l2_loss = theano.function(
        inputs=[],
        outputs=[self.l2_loss, self.l2_norm])
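# Hedged usage sketch (not in the original class): pred_label_batch expects an int32,
# zero-padded index matrix of shape (batch, max_len) plus the max length as an int.
# The `classifier` instance and the word ids below are illustrative placeholders.
#
# indexs = np.zeros((2, 10), dtype=np.int32)
# indexs[0, :3] = [4, 8, 15]   # word ids of the first sentence
# indexs[1, :2] = [16, 23]     # word ids of the second sentence
# labels, = classifier.pred_label_batch(indexs, 10)
# probs, = classifier.pred_prob_batch(indexs, 10)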