def __init__(self, args):
    self.num_updates = 0
    self.args = args
    self.word2vec = load_word2vec(args.word2vec_path)
    self._build_loader()
    self._build_model()
    self._build_optimizer()
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.step = tf.Variable(0, trainable=False)
    self.epoch = tf.Variable(0, trainable=False)
    embeds_path = os.path.join("data", "twitter_mf.clean.npy")
    if not os.path.isfile(embeds_path):
        word2vec = utils.load_word2vec()
        embedding_matrix = np.random.uniform(
            low=-1.0, high=1.0, size=(self.hparams.vocab_size, 300))
        with open(os.path.join("data", "twitter_mf.clean.vocab")) as f:
            for i, word in enumerate(f):
                word = word.strip()
                if word in word2vec:
                    embedding_matrix[i] = word2vec[word]
        np.save(embeds_path, embedding_matrix)
        del word2vec
    else:
        embedding_matrix = np.load(embeds_path)
    self.embed = tfkl.Embedding(
        self.hparams.vocab_size,
        300,
        embeddings_initializer=tf.initializers.constant(embedding_matrix),
    )
    self.embed.trainable = self.hparams.fine_tune_embeds
    self.encoder = self.make_encoder()
    self.encoder.add(tfkl.Dense(6, activation=tf.math.tanh))
def __init__(self, tokens, bos='__begin__', eos='__end__',
             unk='__unknown__', use_w2v=True, w2v_dim=300):
    self.token_to_idx = {token: i for i, token in enumerate(tokens)}
    self.use_w2v = use_w2v
    self.bos = bos
    self.eos = eos
    if unk is not None:
        self.unk = unk
        self.unk_idx = self.token_to_idx[unk]
    self.bos_idx = self.token_to_idx[bos]
    self.eos_idx = self.token_to_idx[eos]
    self.idx_to_token = {i: token for i, token in enumerate(tokens)}
    if use_w2v:
        word2vec = utils.load_word2vec(
            'ruwikiruscorpora_upos_skipgram_300_2_2018.vec')
        self.word2vec_tokens = np.zeros((len(tokens), w2v_dim))
        self.w2v_dim = w2v_dim
        for token, idx in self.token_to_idx.items():
            if token in word2vec:
                self.word2vec_tokens[idx] = word2vec[token]
            else:
                # Random vector for tokens missing from word2vec; `size=`
                # draws a w2v_dim-dimensional vector rather than a single
                # scalar centered at w2v_dim.
                self.word2vec_tokens[idx] = np.random.normal(size=w2v_dim)
def __init__(self, args, model):
    super(Model, self).__init__()
    self.embeddings_matrix = load_word2vec(args)
    self.embeddings = tf.keras.layers.Embedding(
        args.vocab_size,
        args.embedding_dim,
        weights=[self.embeddings_matrix],
        trainable=False)
    self.encoder = model
    self.dense = tf.keras.layers.Dense(
        units=args.vocab_size,
        activation='softmax',
        # input_shape=(args.max_seq_len, args.embedding_dim)
    )
    self.dense2 = tf.keras.layers.Dense(
        args.hidden_size,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
    self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12)
    self.act = tf.keras.layers.Activation(gelu)
    self.bias = self.add_weight(shape=(args.vocab_size, ),
                                initializer="zeros",
                                trainable=True,
                                name="bias")
    self.word_embeddings = self.add_weight(
        "weight",
        shape=[args.vocab_size, args.hidden_size],
        initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
    )
def save_or_load_embeds(embeds_path, vocab_path, vocab_size):
    """Load a cached embedding matrix, or build one from the vocabulary file."""
    should_save = False
    if not os.path.isfile(embeds_path):
        embedding_matrix = np.random.uniform(low=-1.0, high=1.0,
                                             size=(vocab_size, 300))
        should_save = True
    else:
        embedding_matrix = np.load(embeds_path)
    # Check if the vocab sizes match. If the saved matrix is missing words, a
    # new matrix file is needed.
    if len(embedding_matrix) < vocab_size:
        # Pad the cached matrix with random rows so every vocab index used in
        # the fill loop below is valid.
        extra = np.random.uniform(low=-1.0, high=1.0,
                                  size=(vocab_size - len(embedding_matrix), 300))
        embedding_matrix = np.concatenate([embedding_matrix, extra], axis=0)
        should_save = True
    if should_save:
        w2v = utils.load_word2vec()
        with open(vocab_path) as f:
            for i, word in enumerate(f):
                word = word.strip()
                if word in w2v:
                    embedding_matrix[i] = w2v[word]
        np.save(embeds_path, embedding_matrix)
    return embedding_matrix[:vocab_size]
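# Minimal usage sketch for save_or_load_embeds above. The vocab size is a
# placeholder assumption; the .npy/.vocab paths mirror the ones used in the
# snippets above, not values confirmed for this function.
def _build_twitter_embeds(vocab_size=20000):
    embeds = save_or_load_embeds(
        embeds_path=os.path.join("data", "twitter_mf.clean.npy"),
        vocab_path=os.path.join("data", "twitter_mf.clean.vocab"),
        vocab_size=vocab_size,
    )
    return embeds  # np.ndarray of shape (vocab_size, 300)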
def load_data(arg):
    if arg.glove:
        utils.log('Loading glove..', arg.id)
        word_embedding = utils.load_glove_vector()
    else:
        utils.log('Loading word2vec..', arg.id)
        word_embedding = utils.load_word2vec()
    return word_embedding
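# Hedged usage sketch for load_data above: the argparse flags are assumptions
# inferred from the attributes the function reads (`arg.glove`, `arg.id`),
# and it only succeeds if `utils` and its embedding files are available.
def _example_load_embeddings():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--glove', action='store_true')
    parser.add_argument('--id', default='run0')
    arg = parser.parse_args([])  # defaults fall through to the word2vec branch
    return load_data(arg)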
def __init__(self, args, model_roberta):
    super(Model_Roberta, self).__init__()
    self.encoder = model_roberta
    self.args = args
    self.embeddings_matrix = load_word2vec(args)
    self.embeddings = tf.keras.layers.Embedding(
        args.vocab.word_size(),
        args.embedding_dim,
        weights=[self.embeddings_matrix],
        trainable=False)
def train(ex_id, restore=False): global dataset output_path = ex_id + '/' print(output_path) utils.create_folder(output_path) train_data = utils.load_conll_data(dataset['train']) train_range = len(train_data) print('Train dataset: %d' % (train_range)) word2vec_model = utils.load_word2vec() global FLAGS, tf_config with tf.Graph().as_default(), tf.Session() as session: # with tf.Graph().as_default(), tf.Session(config=tf_config) as session: initializer = tf.random_uniform_initializer(-0.1, 0.1) with tf.variable_scope("RNN", reuse=None, initializer=initializer): utils.log("Building model.. ", ex_id) i_train = BiEncoderDecoderModel(is_training=True, FLAGS=FLAGS) start_epoch = 0 if restore: start_epoch = max(int(i) for i in os.listdir(arg.id)) - 1 if start_epoch > 0: print('Restoring model: %s...' % (start_epoch)) model_file_path = os.path.join(ex_id, str(start_epoch), 'model.ckpt') i_train.saver.restore(session, model_file_path) else: utils.log('No saved model, initialize all variables...', ex_id) tf.global_variables_initializer().run() else: tf.global_variables_initializer().run() for epoch in range(start_epoch + 1, 150): print("Epoch: %d" % (epoch), ex_id) epoch_output_path = os.path.join(output_path, str(epoch)) utils.create_folder(epoch_output_path) train_cost = 0.0 per = np.random.permutation(train_range) for i, index in enumerate(per): inputs, labels, name, sentence_len = get_sample( word2vec_model, train_data, index) start = time.time() cost, predicts, feature = i_train.train( session, inputs, labels, sentence_len) train_cost += cost if i % 100 == 0: print('Time: %f\r' % ((time.time() - start) / 100), end='') utils.update_epoch(epoch, i, ex_id) print('Train: ' + str(train_cost), ex_id) model_file_path = os.path.join(epoch_output_path, 'model.ckpt') i_train.save_model(session, model_file_path)
def read_word_embed(vocab):
    """Google word2vec, 300 dim."""
    existing_embed = utils.load_word2vec(WORD2VEC_FILE, vocab)
    word_embed = [None] * len(vocab)
    for word in vocab:
        index = vocab[word]
        try:
            embed = np.array(existing_embed[word], dtype=np.float32)
        except KeyError:
            embed = np.random.uniform(low=-0.25, high=0.25, size=[300]).tolist()
        word_embed[index] = embed
    return np.array(word_embed)
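# Minimal usage sketch for read_word_embed above. The toy vocabulary is an
# assumption for illustration only, and WORD2VEC_FILE must point at a real
# embedding file for utils.load_word2vec to succeed.
def _example_word_embed():
    vocab = {"<PAD>": 0, "factory": 1, "flower": 2}
    word_embed = read_word_embed(vocab)  # expected shape: (3, 300)
    return word_embed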
def train(self): steps_per_epoch = self.train_batcher.num_batches with tf.Session(config=tf_config) as sess: ### 1. create model and load parameters self.model = Model(self.config) ckpt = tf.train.get_checkpoint_state(FLAGS.ckpt_path) if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path): logger.info("Reading model parameters from %s" % ckpt.model_checkpoint_path) self.model.saver.restore(sess, ckpt.model_checkpoint_path) else: logger.info("Created model with fresh parameters.") sess.run(tf.global_variables_initializer()) if self.config["pre_emb"]: # load pre-trained word vec params # load word vec embed_weights = sess.run( self.model.char_lookup.read_value()) embed_weights = utils.load_word2vec( FLAGS.emb_path, self.id_2_ch, FLAGS.char_dim, embed_weights) sess.run(self.model.char_lookup.assign(embed_weights)) logger.info("Loaded pre-trained embedding.") ### 2. training logger.info(" => Start training...") loss = [] with tf.device("/gpu:0"): for i in range(FLAGS.epochs): for batch in self.train_batcher.iter_batch(shuffle=False): step, batch_loss = self.model.run_step() loss.append(batch_loss) if step % FLAGS.steps_check == 0: iteration = step // steps_per_epoch + 1 logger.info( ">>> Epoch:{}, iteration:{}, step:{}/{}, Batch mean loss:{:>9.6f}" .format(i + 1, iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss))) loss = [] # evaluate at dev set every epoch self.eval(sess, "dev", self.dev_bathcer) # save model if i % 8 == 0: self.model.save_model(sess, FLAGS.ckpt_path, name="train_ner.ckpt") logger.info("=> Model saved. ") # evaluate at test set self.eval(sess, "test", self.test_batcher)
def make_feature(ex_id, epoch):
    global dataset
    train_data = utils.load_conll_data_as_dict(dataset['train'])
    dev_data = utils.load_conll_data_as_dict(dataset['dev'])
    test_data = utils.load_conll_data_as_dict(dataset['test'])
    word2vec_model = utils.load_word2vec()
    global FLAGS, tf_config
    with tf.Graph().as_default(), tf.Session(config=tf_config) as session:
        initializer = tf.random_uniform_initializer(-0.1, 0.1)
        with tf.variable_scope("RNN", reuse=False, initializer=initializer):
            print('Building model..')
            i_test = BiEncoderDecoderModel(is_training=False)
        epoch_output_path = os.path.join(ex_id, epoch)
        print('Restoring model: %s...' % (epoch))
        model_file_path = os.path.join(epoch_output_path, 'model.ckpt')
        i_test.saver.restore(session, model_file_path)
        # Test model
        test_cost = 0.0
        print('Starting test..')
        start = time.time()
        for idx, data in enumerate([train_data, dev_data, test_data]):
            output = {}
            for name, sample in data.items():
                # Use the word2vec model loaded above for feature extraction.
                inputs, labels, _, sentence_len = get_sample(
                    word2vec_model, [sample], 0)
                cost, predicts, feature = i_test.test(
                    session, inputs, labels, sentence_len)
                test_cost += cost
                output[name] = predicts
            if idx == 0:
                print('Saving train feature')
                file_path = os.path.join(epoch_output_path, 'train.feature')
            elif idx == 1:
                print('Saving dev feature')
                file_path = os.path.join(epoch_output_path, 'dev.feature')
            else:
                print('Saving test feature')
                file_path = os.path.join(epoch_output_path, 'test.feature')
            save_output(file_path, output)
    print('DONE!')
def get_embedding(self, inputs, id_to_word):
    # embedding layer for input projection
    with tf.variable_scope("Embedding"), tf.device('/cpu:0'):
        if not self.params.pre_emb:
            embedding = tf.get_variable(
                "word_emb", [self.num_words, self.params.word_dim],
                initializer=init_ops.uniform_unit_scaling_initializer())
        else:
            print("load word2vec")
            embedding = tf.get_variable(
                "word_emb",
                dtype=tf.float32,
                initializer=np.asarray(load_word2vec(self.params.pre_emb,
                                                     id_to_word),
                                       dtype=np.float32))
        x = tf.nn.embedding_lookup(embedding, inputs)
    return x
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.step = tf.Variable(0, trainable=False)
    self.epoch = tf.Variable(0, trainable=False)
    # TODO: get rid of this, use `TextVectorization`.
    embeds_path = os.path.join("data", "twitter_mf.clean.npy")
    if not os.path.isfile(embeds_path):
        word2vec = utils.load_word2vec()
        embedding_matrix = np.random.uniform(
            low=-1.0, high=1.0, size=(self.hparams.vocab_size, 300))
        with open(os.path.join("data", "twitter_mf.clean.vocab")) as f:
            for i, word in enumerate(f):
                word = word.strip()
                if word in word2vec:
                    embedding_matrix[i] = word2vec[word]
        np.save(embeds_path, embedding_matrix)
        del word2vec
    else:
        embedding_matrix = np.load(embeds_path)
    # TODO: get rid of this, use `TextVectorization` and figure out how to
    # equivalently incorporate `fine_tune_embeds`.
    self.embed = tfkl.Embedding(
        self.hparams.vocab_size,
        300,
        embeddings_initializer=tf.initializers.constant(embedding_matrix),
    )
    self.embed.trainable = self.hparams.fine_tune_embeds
    self.encoder = self.make_encoder()
    # The "non-moral" axis is actually between 0 and 1, and only 1 when the
    # rest of the components are 0.
    if self.hparams.normalize_nonmoral:
        self.encoder.add(tfkl.Dense(5, activation=tf.math.tanh))
        self.encoder.add(tfkl.Lambda(half_sphere))
    else:
        self.encoder.add(tfkl.Dense(6, activation=tf.math.tanh))
def train(): with tf.device('/cpu:0'): x_text, y, pos1, pos2 = data_helpers.load_data(FLAGS.train_path) # 建立词映射表 # Example: x_text[k] = 'the e11 factory e12 products have included flower pots finnish rooster' # =>[1 2 4 3 5 6 7 8 9 10 11] # =>[1 2 4 3 5 6 7 8 9 10 11 0 0 ... 0 0] text_tokenizer = keras.preprocessing.text.Tokenizer() text_tokenizer.fit_on_texts(x_text) x_text = text_tokenizer.texts_to_sequences(x_text) x = keras.preprocessing.sequence.pad_sequences(x_text, FLAGS.max_sentence_length, padding='post') text_vocab_size = len(text_tokenizer.word_index) print("Text vocabulary size:{}".format(text_vocab_size)) print("x shape={0}".format(x.shape)) print("y shape={0}".format(y.shape)) print("") # 建立位置向量 # pos1[k] = ['32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55'] # => [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 # 0 0 0 0 0 0 0 0 0 0 0] pos_tokenizer = keras.preprocessing.text.Tokenizer() pos_tokenizer.fit_on_texts(pos1 + pos2) p1 = pos_tokenizer.texts_to_sequences(pos1) p2 = pos_tokenizer.texts_to_sequences(pos2) p1 = keras.preprocessing.sequence.pad_sequences(p1, FLAGS.max_sentence_length, padding='post') p2 = keras.preprocessing.sequence.pad_sequences(p2, FLAGS.max_sentence_length, padding='post') pos_vocab_size = len(pos_tokenizer.word_index) print("Position vocabulary size:{}".format(pos_vocab_size)) print("pos_1 shape={0}".format(p1.shape)) print("pos_2 shape={0}".format(p2.shape)) print("") # 随机打乱数据然后分为训练和测试数据 np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices] y_shuffled = y[shuffle_indices] p1_shuffled = p1[shuffle_indices] p2_shuffled = p2[shuffle_indices] dev_sample_index = -1 * int(float(len(y)) * FLAGS.dev_sample_percentage) x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[ dev_sample_index:] y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[ dev_sample_index:] p1_train, p1_dev = p1_shuffled[:dev_sample_index], p1_shuffled[ dev_sample_index:] p2_train, p2_dev = p2_shuffled[:dev_sample_index], p2_shuffled[ dev_sample_index:] print("Train/Dev split:{0}/{1}".format(len(y_train), len(y_dev))) with tf.Graph().as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) session_conf.gpu_options.allow_growth = FLAGS.gpu_allow_growth sess = tf.Session(config=session_conf) with sess.as_default(): cnn = TextCNN(x.shape[1], y.shape[1], text_vocab_size + 1, pos_vocab_size + 1, FLAGS.text_embedding_dim, FLAGS.pos_embedding_dim, list(map(int, FLAGS.filter_sizes.split(","))), FLAGS.num_filters, FLAGS.l2_reg_lambda) # 定义训练步骤 global_step = tf.Variable(0, trainable=False, name='global_step') optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate) # optimizer = tf.train.GradientDescentOptimizer(FLAGS.learning_rate) # op.minimize()的第一步 拆开以梯度修剪 gvs = optimizer.compute_gradients(cnn.loss) # 梯度修剪 capped_gvs = [(tf.clip_by_value(grad, -1.0, 1.0), var) for grad, var in gvs] train_op = optimizer.apply_gradients(capped_gvs, global_step=global_step) # 输出路径 timestamp = str(int(time.time())) out_dir = os.path.abspath( os.path.join(os.path.curdir, "runs", timestamp)) # 记录 loss_summary = tf.summary.scalar("loss", cnn.loss) accuracy_summary = tf.summary.scalar("accuracy", cnn.accuracy) # 训练记录 train_summary_op = tf.summary.merge( [loss_summary, accuracy_summary]) train_summary_dir = os.path.join(out_dir, "summaries", "train") train_summary_writer = tf.summary.FileWriter( train_summary_dir, sess.graph) 
# 验证记录 dev_summary_op = tf.summary.merge([loss_summary, accuracy_summary]) dev_summary_dir = os.path.join(out_dir, "summaries", "dev") dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph) # checkoutpoint 输出 checkpoint_dir = os.path.abspath( os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints) # 保存文本和位置的映射表 with open(os.path.join(out_dir, 'text_tokenizer.json'), 'w') as js: json.dump(text_tokenizer.word_index, js) with open(os.path.join(out_dir, 'pos_tokenizer.json'), 'w') as js: json.dump(pos_tokenizer.word_index, js) # 初始化所有参数 sess.run(tf.global_variables_initializer()) # 预训练 if FLAGS.embedding_path: Pretrained_W = utils.load_word2vec(FLAGS.embedding_path, FLAGS.text_embedding_dim, text_tokenizer) sess.run(cnn.W_text.assign(Pretrained_W)) print("Load Pretrained Embedding Success!") # 生成batch训练数据 data = list(zip(x_train, p1_train, p2_train, y_train)) batches = data_helpers.batch_iter(data, FLAGS.batch_size, FLAGS.num_epochs, True) best_f1 = 0.0 cnt_epoch = 0 cnt_batch = 0 for batch_and_per_batches in batches: batches_per_epoch = batch_and_per_batches[1] batch = batch_and_per_batches[0] cnt_batch = cnt_batch + 1 x_batch, p1_batch, p2_batch, y_batch = zip(*batch) feed_dic = { cnn.input_text: x_batch, cnn.input_p1: p1_batch, cnn.input_p2: p2_batch, cnn.input_y: y_batch, cnn.drop_keep_prob: FLAGS.dropout_keep_prob } _, step, summaries, loss, accuracy = sess.run( [ train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy ], feed_dict=feed_dic) train_summary_writer.add_summary(summaries, step) if cnt_batch == batches_per_epoch: cnt_epoch = cnt_epoch + 1 feed_dict = { cnn.input_text: x_dev, cnn.input_p1: p1_dev, cnn.input_p2: p2_dev, cnn.input_y: y_dev, cnn.drop_keep_prob: 1.0 } summaries, loss, accuracy, predictions = sess.run([ dev_summary_op, cnn.loss, cnn.accuracy, cnn.predictions ], feed_dict) dev_summary_writer.add_summary(summaries, step) f1 = f1_score(np.argmax(y_dev, 1), predictions, labels=np.array(range(1, 19)), average='macro') print("epoch {0} --- loss: {1} acc:{2} f1:{3}".format( cnt_epoch, loss, accuracy, f1)) if best_f1 < f1: best_f1 = f1 path = saver.save(sess, checkpoint_prefix + "-{:.3f}".format(best_f1), global_step=step) print("Model saved to {}".format(path)) cnt_batch = 0
def main(): parser = argparse.ArgumentParser(description="-----[Reinforced Visual Semantic Embedding ]-----") parser.add_argument('--dataset', default='digit', help='Dataset. (vse | mr | digit)') root_args = parser.parse_args(sys.argv[1:3]) dataset = root_args.dataset parser = argparse.ArgumentParser(description="-----[Reinforced Visual Semantic Embedding ]-----") if dataset == 'vse': # Common params, but specifying each under each dataset-if to make the default values different parser.add_argument("--hidden_size", default=512, type=int, help="Size of hidden layer in deep RL") parser.add_argument("--episodes", default=10000, type=int, help="number of episodes") parser.add_argument("--learning_rate_rl", default=0.1, type=float, help="learning rate") parser.add_argument('--margin', default=0.2, type=float, help='Rank loss margin.') parser.add_argument('--num_epochs', default=20, type=int, help='Number of reward calculation epochs.') parser.add_argument('--full_epochs', default=30, type=int, help='Number of training epochs.') parser.add_argument('--init_samples', default=224, type=int, help='number of random inital training data') parser.add_argument('--batch_size', default=128, type=int, help='Size of a training mini-batch.') parser.add_argument('--budget', default=1120, type=int, help='Our labeling budget') parser.add_argument('--selection_radius', default=32, type=int, help='Selection radius') parser.add_argument("--reward_threshold", default=0, type=float, help="Reward threshold") parser.add_argument('--scorefn', default='intra',type=str, help='Score FN for traditional active learning') parser.add_argument('--w2v', action='store_true', help='Use w2v embeddings') # VSE specific params parser.add_argument('--embed_size', default=1024, type=int, help='Dimensionality of the joint embedding.') parser.add_argument('--word_dim', default=300, type=int, help='Dimensionality of the word embedding.') parser.add_argument('--num_layers', default=1, type=int, help='Number of GRU layers.') parser.add_argument('--grad_clip', default=2., type=float, help='Gradient clipping threshold.') parser.add_argument('--crop_size', default=224, type=int, help='Size of an image crop as the CNN input.') parser.add_argument('--learning_rate_vse', default=.0002, type=float, help='Initial learning rate.') parser.add_argument('--lr_update', default=10, type=int, help='Number of epochs to update the learning rate.') parser.add_argument('--workers', default=10, type=int, help='Number of data loader workers.') parser.add_argument('--log_step', default=10, type=int, help='Number of steps to print and record the log.') parser.add_argument('--val_step', default=500, type=int, help='Number of steps to run validation.') parser.add_argument('--img_dim', default=4096, type=int, help='Dimensionality of the image embedding.') parser.add_argument('--cnn_type', default='vgg19',type=str, help="""The CNN used for image encoder(e.g. 
vgg19, resnet152)""") parser.add_argument('--topk', default=10, type=int, help='Topk similarity to use for state') parser.add_argument('--topk_image', default=0, type=int, help='Topk similarity images to use for state') parser.add_argument('--data_name', default='f8k_precomp', help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k') parser.add_argument('--measure', default='cosine', help='Similarity measure used (cosine|order)') parser.add_argument('--intra_caption', action='store_true', help='Include closest captions intra distance in state') parser.add_argument('--max_violation', action='store_true', help='Use max instead of sum in the rank loss.') parser.add_argument('--image_distance', action='store_true', help='Include image distance in the state ') parser.add_argument('--use_abs', action='store_true', help='Take the absolute value of embedding vectors.') parser.add_argument('--no_imgnorm', action='store_true', help='Do not normalize the image embeddings.') parser.add_argument('--finetune', action='store_true', help='Fine-tune the image encoder.') parser.add_argument('--use_restval', action='store_true', help='Use the restval data for training on MSCOCO.') # parser.add_argument('--resume', default='', type=str, metavar='PATH', help='path to latest checkpoint (default: none)') elif dataset == 'mr': parser.add_argument('--hidden_size', default=320, type=int, help='Size of hidden layer in deep RL') parser.add_argument('--episodes', default=10000, type=int, help='Number of episodes') parser.add_argument('--learning_rate_rl', default=0.1, type=float, help='learning rate') parser.add_argument('--margin', default=0.2, type=float, help='Rank loss margin.') parser.add_argument('--num_epochs', default=15, type=int, help='Number of training epochs.') parser.add_argument('--full_epochs', default=15, type=int, help='Number of training epochs.') parser.add_argument('--init_samples', default=480, type=int, help='Number of random inital training data') parser.add_argument('--batch_size', default=128, type=int, help='Size of a training mini-batch.') parser.add_argument('--budget', default=5, type=int, help='Our labeling budget') parser.add_argument('--selection_radius', default=32, type=int, help='Selection radius') parser.add_argument('--reward_threshold', default=0, type=float, help='Reward threshold') parser.add_argument('--w2v', action='store_true', help='Use w2v embeddings') elif dataset == 'digit': parser.add_argument('--hidden_size', default=10, type=int, help='Size of hidden layer in deep RL') parser.add_argument('--episodes', default=10000, type=int, help='Number of episodes') parser.add_argument('--learning_rate_rl', default=0.1, type=float, help='learning rate') parser.add_argument('--margin', default=0.2, type=float, help='Rank loss margin.') parser.add_argument('--num_epochs', default=50, type=int, help='Number of training epochs.') parser.add_argument('--full_epochs', default=50, type=int, help='Number of training epochs.') parser.add_argument('--init_samples', default=5, type=int, help='Number of random inital training data') parser.add_argument('--batch_size', default=128, type=int, help='Size of a training mini-batch.') parser.add_argument('--budget', default=30, type=int, help='Our labeling budget') parser.add_argument('--selection_radius', default=1, type=int, help='Selection radius') parser.add_argument("--reward_threshold", default=0, type=float, help="Reward threshold") parser.add_argument('--w2v', action='store_true', help='Use w2v embeddings') elif dataset == 'mnist': 
parser.add_argument('--hidden_size', default=320, type=int, help='Size of hidden layer in deep RL') parser.add_argument('--episodes', default=10000, type=int, help='Number of episodes') parser.add_argument('--learning_rate_rl', default=0.1, type=float, help='Learning rate') parser.add_argument('--margin', default=0.2, type=float, help='Rank loss margin.') parser.add_argument('--num_epochs', default=15, type=int, help='Number of training epochs.') parser.add_argument('--full_epochs', default=15, type=int, help='Number of training epochs.') parser.add_argument('--init_samples', default=480, type=int, help='number of random inital training data') parser.add_argument('--batch_size', default=128, type=int, help='Size of a training mini-batch.') parser.add_argument('--budget', default=224, type=int, help='Our labeling budget') parser.add_argument('--selection_radius', default=32, type=int, help='Selection radius') parser.add_argument('--reward_threshold', default=0, type=float, help='Reward threshold') parser.add_argument('--w2v', action='store_true', help='Use w2v embeddings') elif dataset == 'test': parser.add_argument("--hidden_size", default=4, type=int, help="Size of hidden layer in deep RL") parser.add_argument("--episodes", default=10000, type=int, help="number of episodes") parser.add_argument("--learning_rate_rl", default=0.1, type=float, help="learning rate") parser.add_argument('--budget', default=50, type=int, help='Our labeling budget') parser.add_argument('--init_samples', default=0, type=int, help='number of random inital training data') parser.add_argument('--num_epochs', default=0, type=int, help='Number of training epochs.') parser.add_argument('--full_epochs', default=0, type=int, help='Number of training epochs.') parser.add_argument("--reward_threshold", default=0.6, type=float, help="Reward threshold") parser.add_argument('--w2v', action='store_true', help='Use w2v embeddings') # Global params all datasets use parser.add_argument('--data_path', default='/data/stud/jorgebjorn/data', type=str, help='Dir path to datasets') parser.add_argument('--vocab_path', default='/data/stud/jorgebjorn/data/vocab/',type=str, help='Dir path to saved vocabulary pickle files.') parser.add_argument('--batch_size_rl', default=32, type=int, help='Size of a training mini-batch.') parser.add_argument('--device', default=0, type=int, help='Which gpu to use') parser.add_argument('--log', default='no', type=str, help='Choose between: no, external, local, visdom') parser.add_argument('--agent', default='dqn', type=str, help='Type of reinforcement agent. 
(dqn | policy, actor_critic)') parser.add_argument('--c', default='', type=str, help='Comment in logfile') parser.add_argument('--gamma', default=0, type=float, help='Discount factor') parser.add_argument('--load_model_name',default='', type=str, help='Path to existing RL model') parser.add_argument('--reset_train', action='store_true', help='Ensure the training is always done in train mode (Not recommended).') parser.add_argument('--no_cuda', action='store_true', help='Disable cuda') parser.add_argument('--reward_clip', action='store_true', help='Give positive actions +1 and negative actions -1 reward') parser.add_argument('--train_shuffle', action='store_true', help='Shuffle active train set every time') params = parser.parse_args(sys.argv[3:]) params.actions = 2 params.dataset = dataset params.logger_name = '{}_{}_{}_{}_{}_{}'.format(getpass.getuser(), datetime.datetime.now().strftime("%d-%m-%y_%H:%M"), dataset, params.agent, params.c, str(uuid.uuid4())[:4]) params.external_log_url = 'http://logserver.duckdns.org:5000' if torch.cuda.is_available(): torch.cuda.set_device(params.device) params.cuda = (not params.no_cuda) and torch.cuda.is_available() params.pid = os.getpid() for arg in vars(params): opt[arg] = vars(params)[arg] # sending tensorboard logs to external server if params.log == "external": global_logger["lg"] = external_logger() # saving tensorboard logs local elif params.log == "local": global_logger["lg"] = local_logger() elif params.log == 'visdom': global_logger["lg"] = visdom_logger() global_logger["lg"].parameters_summary() # no logging at all, for testing purposes. else: global_logger["lg"] = no_logger() container = importlib.import_module('datasets.{}'.format(dataset)) model = container.model load_data = container.load_data train_data, dev_data, test_data = load_data() data["train"] = train_data data["dev"] = dev_data data["test"] = test_data if params.w2v: load_word2vec() from train import train train(model)
def train(): with tf.device('/cpu:0'): train_text, train_y, train_e1, train_e2, train_pos1, train_pos2, train_rw, train_rw_pos, train_rw_cate = data_helpers.load_data_and_labels( FLAGS.train_path) with tf.device('/cpu:0'): test_text, test_y, test_e1, test_e2, test_pos1, test_pos2, test_rw, test_rw_pos, test_rw_cate = data_helpers.load_data_and_labels( FLAGS.test_path) #words = data_helpers.relation_words([train_between_e, test_between_e]) #train_relation_words_between_entity = data_helpers.relation_words_between_entity(train_between_e, words) #test_relation_words_between_entity = data_helpers.relation_words_between_entity(test_between_e, words) # Build vocabulary # Example: x_text[3] = "A misty <e1>ridge</e1> uprises from the <e2>surge</e2>." # ['a misty ridge uprises from the surge <UNK> <UNK> ... <UNK>'] # => # [27 39 40 41 42 1 43 0 0 ... 0] # dimension = MAX_SENTENCE_LENGTH vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor( FLAGS.max_sentence_length) vocab_processor.fit(train_text + test_text) train_x = np.array(list(vocab_processor.transform(train_text))) test_x = np.array(list(vocab_processor.transform(test_text))) train_text = np.array(train_text) test_text = np.array(test_text) print("\nText Vocabulary Size: {:d}".format( len(vocab_processor.vocabulary_))) print("train_x = {0}".format(train_x.shape)) print("train_y = {0}".format(train_y.shape)) print("test_x = {0}".format(test_x.shape)) print("test_y = {0}".format(test_y.shape)) vocab_processor2 = tf.contrib.learn.preprocessing.VocabularyProcessor(6) vocab_processor2.fit(train_rw + test_rw) train_rw_x = np.array(list(vocab_processor2.transform(train_rw))) test_rw_x = np.array(list(vocab_processor2.transform(test_rw))) train_rw_text = np.array(train_rw) test_rw_text = np.array(test_rw) vocab_processor2pos = tf.contrib.learn.preprocessing.VocabularyProcessor(6) vocab_processor2pos.fit(train_rw_pos + test_rw_pos) train_rw_pos_x = np.array(list( vocab_processor2pos.transform(train_rw_pos))) test_rw_pos_x = np.array(list(vocab_processor2pos.transform(test_rw_pos))) train_rw_pos_text = np.array(train_rw_pos) test_rw_pos_text = np.array(test_rw_pos) # Example: pos1[3] = [-2 -1 0 1 2 3 4 999 999 999 ... 999] # [95 96 97 98 99 100 101 999 999 999 ... 999] # => # [11 12 13 14 15 16 21 17 17 17 ... 
17] # dimension = MAX_SENTENCE_LENGTH pos_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor( FLAGS.max_sentence_length) pos_vocab_processor.fit(train_pos1 + train_pos2 + test_pos1 + test_pos2) train_p1 = np.array(list(pos_vocab_processor.transform(train_pos1))) train_p2 = np.array(list(pos_vocab_processor.transform(train_pos2))) test_p1 = np.array(list(pos_vocab_processor.transform(test_pos1))) test_p2 = np.array(list(pos_vocab_processor.transform(test_pos2))) print("\nPosition Vocabulary Size: {:d}".format( len(pos_vocab_processor.vocabulary_))) print("train_p1 = {0}".format(train_p1.shape)) print("test_p1 = {0}".format(test_p1.shape)) print("") with tf.Graph().as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) session_conf.gpu_options.allow_growth = FLAGS.gpu_allow_growth sess = tf.Session(config=session_conf) with sess.as_default(): model = EntityAttentionLSTM( sequence_length=train_x.shape[1], rw_length=6, num_classes=train_y.shape[1], vocab_size=len(vocab_processor.vocabulary_), rw_vocab_size=len(vocab_processor2.vocabulary_), rw_pos_vocab_size=len(vocab_processor2pos.vocabulary_), embedding_size=FLAGS.embedding_size, pos_vocab_size=len(pos_vocab_processor.vocabulary_), pos_embedding_size=FLAGS.pos_embedding_size, hidden_size=FLAGS.hidden_size, num_heads=FLAGS.num_heads, attention_size=FLAGS.attention_size, use_elmo=(FLAGS.embeddings == 'elmo'), l2_reg_lambda=FLAGS.l2_reg_lambda) # Define Training procedure global_step = tf.Variable(0, name="global_step", trainable=False) optimizer = tf.train.AdadeltaOptimizer(FLAGS.learning_rate, FLAGS.decay_rate, 1e-6) gvs = optimizer.compute_gradients(model.loss) capped_gvs = [(tf.clip_by_value(grad, -1.0, 1.0), var) for grad, var in gvs] train_op = optimizer.apply_gradients(capped_gvs, global_step=global_step) # Output directory for models and summaries timestamp = str(int(time.time())) out_dir = os.path.abspath( os.path.join(os.path.curdir, "runs", timestamp)) print("\nWriting to {}\n".format(out_dir)) # Logger logger = Logger(out_dir) # Summaries for loss and accuracy loss_summary = tf.summary.scalar("loss", model.loss) acc_summary = tf.summary.scalar("accuracy", model.accuracy) # Train Summaries train_summary_op = tf.summary.merge([loss_summary, acc_summary]) train_summary_dir = os.path.join(out_dir, "summaries", "train") train_summary_writer = tf.summary.FileWriter( train_summary_dir, sess.graph) # Checkpoint directory. 
Tensorflow assumes this directory already exists so we need to create it checkpoint_dir = os.path.abspath( os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints) # Write vocabulary vocab_processor.save(os.path.join(out_dir, "vocab")) vocab_processor2.save(os.path.join(out_dir, "rw_vocab")) vocab_processor2pos.save(os.path.join(out_dir, "rw_pos_vocab")) pos_vocab_processor.save(os.path.join(out_dir, "pos_vocab")) # Initialize all variables sess.run(tf.global_variables_initializer()) if FLAGS.embeddings == "word2vec": pretrain_W = utils.load_word2vec( 'resource/GoogleNews-vectors-negative300.bin', FLAGS.embedding_size, vocab_processor) sess.run(model.W_text.assign(pretrain_W)) print("Success to load pre-trained word2vec model!\n") elif FLAGS.embeddings == "glove100": pretrain_W = utils.load_glove('resource/glove.6B.100d.txt', FLAGS.embedding_size, vocab_processor) sess.run(model.W_text.assign(pretrain_W)) print("Success to load pre-trained glove100 model!\n") elif FLAGS.embeddings == "glove300": pretrain_W = utils.load_glove('resource/glove.840B.300d.txt', FLAGS.embedding_size, vocab_processor) pretrain_rw_W = utils.load_glove( 'resource/glove.840B.300d.txt', FLAGS.embedding_size, vocab_processor2) sess.run(model.W_text.assign(pretrain_W)) sess.run(model.W_rw_text.assign(pretrain_rw_W)) print("Success to load pre-trained glove300 model!\n") # Generate batches train_batches = data_helpers.batch_iter( list( zip(train_x, train_y, train_text, train_e1, train_e2, train_p1, train_p2, train_rw_x, train_rw_text, train_rw_pos_x, train_rw_pos_text, train_rw_cate)), FLAGS.batch_size, FLAGS.num_epochs) # Training loop. For each batch... best_f1 = 0.0 # For save checkpoint(model) for train_batch in train_batches: train_bx, train_by, train_btxt, train_be1, train_be2, train_bp1, train_bp2, train_brw_x, train_brw_text, train_brw_pos_x, train_brw_pos_text, train_brw_cate = zip( *train_batch) feed_dict = { model.input_x: train_bx, model.input_y: train_by, model.input_text: train_btxt, model.input_e1: train_be1, model.input_e2: train_be2, model.input_p1: train_bp1, model.input_p2: train_bp2, model.input_rw_x: train_brw_x, ######## model.input_rw_text: train_brw_text, ########## model.input_rw_pos_x: train_brw_pos_x, ####### model.input_rw_pos_text: train_brw_pos_text, ####### model.input_rw_cate: train_brw_cate, ########### model.emb_dropout_keep_prob: FLAGS.emb_dropout_keep_prob, model.rnn_dropout_keep_prob: FLAGS.rnn_dropout_keep_prob, model.dropout_keep_prob: FLAGS.dropout_keep_prob } _, step, summaries, loss, accuracy = sess.run([ train_op, global_step, train_summary_op, model.loss, model.accuracy ], feed_dict) train_summary_writer.add_summary(summaries, step) # Training log display if step % FLAGS.display_every == 0: logger.logging_train(step, loss, accuracy) # Evaluation if step % FLAGS.evaluate_every == 0: print("\nEvaluation:") # Generate batches test_batches = data_helpers.batch_iter(list( zip(test_x, test_y, test_text, test_e1, test_e2, test_p1, test_p2, test_rw_x, test_rw_text, test_rw_pos_x, test_rw_pos_text, test_rw_cate)), FLAGS.batch_size, 1, shuffle=False) # Training loop. For each batch... 
losses = 0.0 accuracy = 0.0 predictions = [] iter_cnt = 0 for test_batch in test_batches: test_bx, test_by, test_btxt, test_be1, test_be2, test_bp1, test_bp2, test_brw_x, test_brw_text, test_brw_pos_x, test_brw_pos_text, test_brw_cate = zip( *test_batch) feed_dict = { model.input_x: test_bx, model.input_y: test_by, model.input_text: test_btxt, model.input_e1: test_be1, model.input_e2: test_be2, model.input_p1: test_bp1, model.input_p2: test_bp2, model.input_rw_x: test_brw_x, ######## model.input_rw_text: test_brw_text, ########## model.input_rw_pos_x: test_brw_pos_x, ####### model.input_rw_pos_text: test_brw_pos_text, ####### model.input_rw_cate: test_brw_cate, ######### model.emb_dropout_keep_prob: 1.0, model.rnn_dropout_keep_prob: 1.0, model.dropout_keep_prob: 1.0 } loss, acc, pred = sess.run( [model.loss, model.accuracy, model.predictions], feed_dict) losses += loss accuracy += acc predictions += pred.tolist() iter_cnt += 1 losses /= iter_cnt accuracy /= iter_cnt predictions = np.array(predictions, dtype='int') logger.logging_eval(step, loss, accuracy, predictions) # Model checkpoint if best_f1 < logger.best_f1: best_f1 = logger.best_f1 path = saver.save(sess, checkpoint_prefix + "-{:.3g}".format(best_f1), global_step=step) print("Saved model checkpoint to {}\n".format(path))
test = utils.padding(test, token2idx, tag2idx, maxlen)
train = utils._to_tensor(train, tf.int32)
dev = utils._to_tensor(dev, tf.int32)
test = utils._to_tensor(test, tf.int32)

# to batch
train_ds = tf.data.Dataset.from_tensor_slices(train).shuffle(10000).batch(
    batch_size)
dev_ds = tf.data.Dataset.from_tensor_slices(dev).shuffle(2000).batch(
    batch_size * 2)
test_ds = tf.data.Dataset.from_tensor_slices(test).shuffle(2000).batch(
    batch_size * 2)

embedding_pretrained = utils.load_word2vec('data/embeddings/wiki_100.utf8',
                                           token2idx, embed_dim,
                                           'data/embeddings/embed_mat.npy')
model = LSTM_CRF(len(token2idx), embed_dim, maxlen, len(tag2idx),
                 rnn_hiden_size, embedding_pretrained)
optimizer = tf.keras.optimizers.Adam(lr=0.003)
run.training(model, train_ds, dev_ds, epochs, optimizer)
run.evaluate(model, test_ds, data_name="test set")

# # save model
# print("\nsave model...")
# model.save_weights('model saved/')

# # load model
# print("load model...")
# model.load_weights('model saved/')
        label = [label0, label1][label]
        text = " ".join(tokens)
        if len(text) >= 125:
            text = text[:125] + "..."
        meta_file.write(f"{label}\t{text}\n")
        embedding_matrix.append(avg_embed(acc))
    return list(map(avg_embed, vals))


def norm_dist(u, v):
    return np.linalg.norm(u / np.linalg.norm(u) - v / np.linalg.norm(v))


if __name__ == "__main__":
    w2v = utils.load_word2vec()
    avg_embed = lambda vs: sum(vs) / len(vs)
    if not os.path.exists("projector"):
        os.makedirs("projector")
    stormfront_vals = [[], []]
    twitter_vals = [[], []]
    with open(os.path.join("projector", "metadata.tsv"), "w") as f:
        f.write("label\tcontent\n")
        # for tokens, label in utils.stormfront_gen():
        #     acc = []
        #     for token in tokens:
        #         if token in w2v:
        #             stormfront_vals[label].append(w2v[token])
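# Minimal sanity-check sketch for norm_dist above, assuming numpy is imported
# as np as in the surrounding snippet: both inputs are unit-normalized first,
# so parallel vectors have distance 0 regardless of magnitude and orthogonal
# unit vectors have distance sqrt(2).
def _check_norm_dist():
    u = np.array([1.0, 0.0])
    assert norm_dist(u, 5.0 * u) == 0.0
    assert abs(norm_dist(u, np.array([0.0, 3.0])) - np.sqrt(2.0)) < 1e-12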
def train(): with tf.device('/cpu:0'): x_text, y, pos1, pos2, x_text_clean, sentence_len = data_helpers.load_data_and_labels( FLAGS.train_path) # Build vocabulary # Example: x_text[3] = "A misty <e1>ridge</e1> uprises from the <e2>surge</e2>." # ['a misty ridge uprises from the surge <UNK> <UNK> ... <UNK>'] # => # [27 39 40 41 42 1 43 0 0 ... 0] # dimension = FLAGS.max_sentence_length # print("text:",x_text) text_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor( FLAGS.max_sentence_length) x = np.array(list(text_vocab_processor.fit_transform(x_text))) #token # pretrain_W = utils.load_word2vec(FLAGS.embedding_path, FLAGS.text_embedding_dim, text_vocab_processor) # print("pretrain_w:",pretrain_W) # print(pretrain_W.shape) #(19151,300) print("Text Vocabulary Size: {:d}".format( len(text_vocab_processor.vocabulary_))) # print("vocabulary:", text_vocab_processor.vocabulary_._reverse_mapping) # with open("vocabulary.txt","w",encoding="utf-8") as f: # f.write(str(x)) print("x = {0}".format(x.shape)) #(8000,90) print("y = {0}".format(y.shape)) #(8000,19) print("") # Example: pos1[3] = [-2 -1 0 1 2 3 4 999 999 999 ... 999] # [95 96 97 98 99 100 101 999 999 999 ... 999] # => # [11 12 13 14 15 16 21 17 17 17 ... 17] # dimension = MAX_SENTENCE_LENGTH pos_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor( FLAGS.max_sentence_length) pos_vocab_processor.fit(pos1 + pos2) #fit # print("pos vocab position:", pos_vocab_processor) p1 = np.array(list(pos_vocab_processor.transform(pos1))) #tokens # print("p1:", p1) p2 = np.array(list(pos_vocab_processor.transform(pos2))) print("Position Vocabulary Size: {:d}".format( len(pos_vocab_processor.vocabulary_))) # with open("position.txt", "w", encoding="utf-8") as f: # f.write(str(x)) print("position_1 = {0}".format(p1.shape)) #(8000,90) print("position_2 = {0}".format(p2.shape)) #(8000,90) print("") # Randomly shuffle data to split into train and test(dev) np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) #len(y)=8000 x_shuffled = x[shuffle_indices] p1_shuffled = p1[shuffle_indices] p2_shuffled = p2[shuffle_indices] y_shuffled = y[shuffle_indices] # print(x_shuffled, p1_shuffled,p2_shuffled,y_shuffled) # Split train/test set # TODO: This is very crude, should use cross-validation dev_sample_index = -1 * int( FLAGS.dev_sample_percentage * float(len(y))) #x_train=7200, x_dev =800 x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[ dev_sample_index:] p1_train, p1_dev = p1_shuffled[:dev_sample_index], p1_shuffled[ dev_sample_index:] p2_train, p2_dev = p2_shuffled[:dev_sample_index], p2_shuffled[ dev_sample_index:] y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[ dev_sample_index:] print("Train/Dev split: {:d}/{:d}\n".format(len(y_train), len(y_dev))) # print(x_train) # print(np.array(x_train)) # print(x_dev) # print(np.array(x_dev)) with tf.Graph().as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) session_conf.gpu_options.allow_growth = FLAGS.gpu_allow_growth sess = tf.Session(config=session_conf) with sess.as_default(): cnn = TextCNN( sequence_length=x_train.shape[1], #90 num_classes=y_train.shape[1], #19 text_vocab_size=len(text_vocab_processor.vocabulary_), #19151 text_embedding_size=FLAGS.text_embedding_size, #300 pos_vocab_size=len(pos_vocab_processor.vocabulary_), #162 pos_embedding_size=FLAGS.pos_embedding_dim, #50 filter_sizes=list(map( int, FLAGS.filter_sizes.split(","))), #2,3,4,5 
num_filters=FLAGS.num_filters, #128 l2_reg_lambda=FLAGS.l2_reg_lambda) #1e-5 # Define Training procedure global_step = tf.Variable(0, name="global_step", trainable=False) optimizer = tf.train.AdadeltaOptimizer(FLAGS.learning_rate, FLAGS.decay_rate, 1e-6) gvs = optimizer.compute_gradients(cnn.loss) capped_gvs = [(tf.clip_by_value(grad, -1.0, 1.0), var) for grad, var in gvs] train_op = optimizer.apply_gradients(capped_gvs, global_step=global_step) # Output directory for models and summaries timestamp = str(int(time.time())) out_dir = os.path.abspath( os.path.join(os.path.curdir, "runs", timestamp)) print("Writing to {}\n".format(out_dir)) # Summaries for loss and accuracy loss_summary = tf.summary.scalar("loss", cnn.loss) acc_summary = tf.summary.scalar("accuracy", cnn.accuracy) # Train Summaries train_summary_op = tf.summary.merge([loss_summary, acc_summary]) train_summary_dir = os.path.join(out_dir, "summaries", "train") train_summary_writer = tf.summary.FileWriter( train_summary_dir, sess.graph) # Dev summaries dev_summary_op = tf.summary.merge([loss_summary, acc_summary]) dev_summary_dir = os.path.join(out_dir, "summaries", "dev") dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph) # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it checkpoint_dir = os.path.abspath( os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints) # Write vocabulary text_vocab_processor.save(os.path.join(out_dir, "text_vocab")) pos_vocab_processor.save(os.path.join(out_dir, "pos_vocab")) # Initialize all variables sess.run(tf.global_variables_initializer()) # FLAGS._sess =sess print("shape:", x_train.shape) print(y_train.shape) # Pre-trained word2vec if FLAGS.embedding_path: pretrain_W = utils.load_word2vec(FLAGS.embedding_path, FLAGS.text_embedding_size, text_vocab_processor) sess.run(cnn.W_text.assign(pretrain_W)) print("Success to load pre-trained word2vec model!\n") # Generate batches batches = data_helpers.batch_iter( list(zip(x_train, p1_train, p2_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs) print(batches) # Training loop. For each batch... 
best_f1 = 0.0 # For save checkpoint(model) for batch in batches: x_batch, p1_batch, p2_batch, y_batch = zip(*batch) # Train feed_dict = { cnn.input_text: x_batch, cnn.input_p1: p1_batch, cnn.input_p2: p2_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: FLAGS.dropout_keep_prob } # print(len(x_batch)) # print(len(y_batch)) _, step, summaries, loss, accuracy = sess.run([ train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy ], feed_dict) train_summary_writer.add_summary(summaries, step) # Training log display if step % FLAGS.display_every == 0: time_str = datetime.datetime.now().isoformat() print("{}: step {}, loss {:g}, acc {:g}".format( time_str, step, loss, accuracy)) # Evaluation if step % FLAGS.evaluate_every == 0: print("\nEvaluation:") feed_dict = { cnn.input_text: x_dev, cnn.input_p1: p1_dev, cnn.input_p2: p2_dev, cnn.input_y: y_dev, cnn.dropout_keep_prob: 1.0 } summaries, loss, accuracy, predictions, text_expand_shape, pos_shape, embedding_size_shape, embedding_shape, text_shape = sess.run( [ dev_summary_op, cnn.loss, cnn.accuracy, cnn.predictions, cnn.text_expand_shape, cnn.pos_expand_shape, cnn.embedding_size_shape, cnn.embedd_shape, cnn.text_shape ], feed_dict) dev_summary_writer.add_summary(summaries, step) time_str = datetime.datetime.now().isoformat() f1 = f1_score(np.argmax(y_dev, axis=1), predictions, labels=np.array(range(1, 19)), average="macro") precision = tf.metrics.precision(np.argmax(y_dev, axis=1), predictions, weights=1) recall = tf.metrics.recall(np.argmax(y_dev, axis=1), predictions, weights=1) print("{}: step {}, loss {:g}, acc {:g}".format( time_str, step, loss, accuracy)) print( "[UNOFFICIAL] (2*9+1)-Way Macro-Average F1 Score (excluding Other): {:g}\n" .format(f1)) # print("text_embedded_shape:", text_shape) # print("text_embedd_extend:", text_expand_shape) # print("pos-embedd_extend:", pos_shape) # print("embedding_size:", embedding_shape) # print("embedding_size_shape", embedding_size_shape) print("predit:", predictions) print(predictions.shape) print("y_dev:", y_dev) print(y_dev.shape) # Model checkpoint if best_f1 < f1: best_f1 = f1 path = saver.save(sess, checkpoint_prefix + "-{:.3g}".format(best_f1), global_step=step) print("Saved model checkpoint to {}\n".format(path))
def main(gpu, path_corpus, path_config, path_word2vec): MAX_EPOCH = 50 EVAL = 200 MAX_LENGTH = 70 COUNTS_CACHE = "./cache/counts.pkl" config = utils.Config(path_config) word_dim = config.getint("word_dim") state_dim = config.getint("state_dim") grad_clip = config.getfloat("grad_clip") weight_decay = config.getfloat("weight_decay") batch_size = config.getint("batch_size") sample_size = config.getint("sample_size") print "[info] CORPUS: %s" % path_corpus print "[info] CONFIG: %s" % path_config print "[info] PRE-TRAINED WORD EMBEDDINGS: %s" % path_word2vec print "[info] WORD DIM: %d" % word_dim print "[info] STATE DIM: %d" % state_dim print "[info] GRADIENT CLIPPING: %f" % grad_clip print "[info] WEIGHT DECAY: %f" % weight_decay print "[info] BATCH SIZE: %d" % batch_size path_save_head = os.path.join(config.getpath("snapshot"), "rnnlm.%s.%s" % ( os.path.basename(path_corpus), os.path.splitext(os.path.basename(path_config))[0])) print "[info] SNAPSHOT: %s" % path_save_head sents_train, sents_val, vocab, ivocab = \ utils.load_corpus(path_corpus=path_corpus, max_length=MAX_LENGTH) #counts = None #print("[info] Load word counter") #if os.path.exists(COUNTS_CACHE): # print("[info] Found cache of counter") # counts = pickle.load(open(COUNTS_CACHE, "rb")) # if len(counts) != len(vocab): # counts = None #if counts is None: # counts = Counter() # for sent in list(sents_train) + list(sents_val): # counts += Counter(sent) # pickle.dump(counts, open(COUNTS_CACHE, "wb")) #cs = [counts[w] for w in range(len(counts))] if path_word2vec is not None: word2vec = utils.load_word2vec(path_word2vec, word_dim) initialW = utils.create_word_embeddings(vocab, word2vec, dim=word_dim, scale=0.001) else: initialW = None cuda.get_device(gpu).use() model = models.CXT_BLSTM( vocab_size=len(vocab), word_dim=word_dim, state_dim=state_dim, initialW=initialW, EOS_ID=vocab["<EOS>"]) model.to_gpu(gpu) opt = optimizers.SMORMS3() opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(grad_clip)) opt.add_hook(chainer.optimizer.WeightDecay(weight_decay)) # sampler = utils.RandomSampler(cs, sample_size) #print "[info] Evaluating on the validation sentences ..." 
#loss_data = evaluate(model, sents_val, ivocab, word_dim, sampler) #print "[validation] iter=0, epoch=0, loss=%f" \ # % (loss_data) it = 0 n_train = len(sents_train) vocab_size = model.vocab_size for epoch in xrange(1, MAX_EPOCH+1): perm = np.random.permutation(n_train) for data_i in xrange(0, n_train, batch_size): if data_i + batch_size > n_train: break words = sents_train[perm[data_i:data_i+batch_size]] xs, ms = utils.make_batch(words, train=True, tail=False, mask=True) ys = model.forward(xs=xs, ms=ms, train=True) words_without_edge = [w[1:-1] for w in words] xs_without_edge, ms_without_edge = utils.make_batch(words_without_edge, train=True, tail=False, mask=True) masked_ys = [] for y, m in zip(ys, ms_without_edge): m_ext = F.broadcast_to(F.reshape(m, (batch_size, 1)), (batch_size, vocab_size)) masked_ys.append(y*m_ext) #ts = model.embed_words(xs_without_edge, ms_without_edge, train=True) # BOS, EOSは除く # T : バッチの中の最大長 # N : バッチサイズ # |D|: word_dim ys = F.concat(masked_ys, axis=0) # (TN, |V|) ts = F.concat(xs_without_edge, axis=0) # (TN, |D|) ys = F.reshape(ys, (-1, vocab_size)) # (TN, |D|) ts = F.reshape(ts, (-1,)) # (TN,) loss = F.softmax_cross_entropy(ys, ts) acc = F.accuracy(ys, ts, ignore_label=-1) model.zerograds() loss.backward() loss.unchain_backward() opt.update() it += 1 loss_data = float(cuda.to_cpu(loss.data)) perp = math.exp(loss_data) acc_data = float(cuda.to_cpu(acc.data)) print "[training] iter=%d, epoch=%d (%d/%d=%.03f%%), perplexity=%f, accuracy=%.2f%%" \ % (it, epoch, data_i+batch_size, n_train, float(data_i+batch_size)/n_train*100, perp, acc_data*100) if it % EVAL == 0: print "[info] Evaluating on the validation sentences ..." loss_data, acc_data = evaluate(model, sents_val, ivocab, word_dim) perp = math.exp(loss_data) print "[validation] iter=%d, epoch=%d, perplexity=%f, accuracy=%.2f%%" \ % (it, epoch, perp, acc_data*100) serializers.save_npz(path_save_head + ".iter_%d.epoch_%d.model" % (it, epoch), model) # utils.save_word2vec(path_save_head + ".iter_%d.epoch_%d.vectors.txt" % (it, epoch), # utils.extract_word2vec(model, vocab)) print "[info] Saved." print "[info] Done."
        return support_set, support_label, query_set, query_label

    def next_batch(self, B, N, K, Q):
        support_set = []
        support_label = []
        query_set = []
        label = []
        for one in range(B):
            cur_support, cur_support_label, cur_query, cur_label = self.next_one(
                N, K, Q)
            support_set.append(cur_support)
            support_label.append(cur_support_label)
            query_set.append(cur_query)
            label.append(cur_label)
        support = np.stack(support_set, 0)
        support_label = np.stack(support_label, 0)
        query = np.stack(query_set, 0)
        label = np.stack(label, 0)
        return support, support_label, query, label


if __name__ == "__main__":
    import utils
    vocab, embedding = utils.load_word2vec('data/tencent_embedding.txt')
    data_loader = DataLoader('data/sample_data.json', vocab)
    data_loader.next_batch(4, 5, 5, 5)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', dest='epochs', type=int, default=10)
    parser.add_argument('--output', dest='output', type=str,
                        default='./nn_result.csv')
    parser.add_argument('--log', dest='log', type=str)
    parser.add_argument('--w2v', dest='w2v', type=str)
    parser.add_argument('--freeze', dest='freeze', action='store_true',
                        default=False)
    parser.add_argument('--model', dest='model', type=str,
                        choices=['CNN', 'RNN'], default='CNN')
    args = parser.parse_args()

    if args.log is not None:
        handler = logging.FileHandler(args.log)
        handler.setLevel(logging.INFO)
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    logger.info(args)
    logger.info('device = {}'.format(device))

    tsv_train = pd.read_csv('./data/train.tsv', sep='\t')
    num = len(tsv_train)
    num_val = int(num * 0.1)
    num_train = num - num_val
    tsv_val = tsv_train[num_train:]
    tsv_train = tsv_train[:num_train]
    x_train = tsv_train['Phrase'].values
    y_train = tsv_train['Sentiment'].values
    x_val = tsv_val['Phrase'].values
    y_val = tsv_val['Sentiment'].values

    if args.w2v is None:
        tokenizer = Tokenizer(text=x_train)
    else:
        from utils import load_word2vec
        w2v = load_word2vec(args.w2v)
        words = list(w2v.keys())
        tokenizer = Tokenizer(words=words)
        vecs = list(w2v.values())
        vecs.insert(0, [.0] * embedding_dim)
        vecs.insert(1, [.0] * embedding_dim)

    vocab_size = len(tokenizer.vocabulary_)
    logger.info('vocab_size = {}'.format(vocab_size))

    train_dataset = MyDataset(x_train, y_train, seq_length, tokenizer)
    train_dl = DataLoader(train_dataset, shuffle=True, batch_size=128)
    val_dataset = MyDataset(x_val, y_val, seq_length, tokenizer)
    val_dl = DataLoader(val_dataset, shuffle=False, batch_size=128)

    net = TextCNN(seq_length, vocab_size, embedding_dim, num_classes) if args.model == 'CNN' \
        else TextRNN(seq_length, vocab_size, embedding_dim, num_classes)
    if args.w2v is not None:
        # Embedding.from_pretrained is a classmethod that returns a new layer,
        # so assign the result back instead of discarding it.
        net.embed = torch.nn.Embedding.from_pretrained(torch.tensor(vecs),
                                                       freeze=args.freeze)
    net.to(device)
    train(net, train_dl, val_dl, args.epochs, device)

    tsv_test = pd.read_csv('./data/test.tsv', sep='\t')
    x_test = tsv_test['Phrase'].values
    test_dataset = MyDataset(x_test, seq_length=seq_length, tokenizer=tokenizer)
    test_dl = DataLoader(test_dataset, shuffle=False, batch_size=128)
    y_test = []
    net.eval()  # disable dropout for inference
    with torch.no_grad():
        for data in test_dl:
            x = data.to(device)
            out = net(x)
            y_test += out.argmax(1).cpu().tolist()
    tsv_test['Sentiment'] = y_test
    tsv_test[['PhraseId', 'Sentiment']].to_csv(args.output, index=False)
def main(gpu, path_corpus, path_config, path_word2vec): MAX_EPOCH = 50 EVAL = 200 MAX_LENGTH = 70 config = utils.Config(path_config) model_name = config.getstr("model") word_dim = config.getint("word_dim") state_dim = config.getint("state_dim") grad_clip = config.getfloat("grad_clip") weight_decay = config.getfloat("weight_decay") batch_size = config.getint("batch_size") print "[info] CORPUS: %s" % path_corpus print "[info] CONFIG: %s" % path_config print "[info] PRE-TRAINED WORD EMBEDDINGS: %s" % path_word2vec print "[info] MODEL: %s" % model_name print "[info] WORD DIM: %d" % word_dim print "[info] STATE DIM: %d" % state_dim print "[info] GRADIENT CLIPPING: %f" % grad_clip print "[info] WEIGHT DECAY: %f" % weight_decay print "[info] BATCH SIZE: %d" % batch_size path_save_head = os.path.join( config.getpath("snapshot"), "rnnlm.%s.%s" % (os.path.basename(path_corpus), os.path.splitext(os.path.basename(path_config))[0])) print "[info] SNAPSHOT: %s" % path_save_head sents_train, sents_val, vocab, ivocab = \ utils.load_corpus(path_corpus=path_corpus, max_length=MAX_LENGTH) if path_word2vec is not None: word2vec = utils.load_word2vec(path_word2vec, word_dim) initialW = utils.create_word_embeddings(vocab, word2vec, dim=word_dim, scale=0.001) else: initialW = None cuda.get_device(gpu).use() if model_name == "rnn": model = models.RNN(vocab_size=len(vocab), word_dim=word_dim, state_dim=state_dim, initialW=initialW, EOS_ID=vocab["<EOS>"]) elif model_name == "lstm": model = models.LSTM(vocab_size=len(vocab), word_dim=word_dim, state_dim=state_dim, initialW=initialW, EOS_ID=vocab["<EOS>"]) elif model_name == "gru": model = models.GRU(vocab_size=len(vocab), word_dim=word_dim, state_dim=state_dim, initialW=initialW, EOS_ID=vocab["<EOS>"]) elif model_name == "bd_lstm": model = models.BD_LSTM(vocab_size=len(vocab), word_dim=word_dim, state_dim=state_dim, initialW=initialW, EOS_ID=vocab["<EOS>"]) else: print "[error] Unknown model name: %s" % model_name sys.exit(-1) model.to_gpu(gpu) opt = optimizers.SMORMS3() opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(grad_clip)) opt.add_hook(chainer.optimizer.WeightDecay(weight_decay)) print "[info] Evaluating on the validation sentences ..." 
    loss_data, acc_data = evaluate(model, model_name, sents_val, ivocab)
    perp = math.exp(loss_data)
    print "[validation] iter=0, epoch=0, perplexity=%f, accuracy=%.2f%%" \
        % (perp, acc_data*100)

    it = 0
    n_train = len(sents_train)
    vocab_size = model.vocab_size
    for epoch in xrange(1, MAX_EPOCH + 1):
        perm = np.random.permutation(n_train)
        for data_i in xrange(0, n_train, batch_size):
            if data_i + batch_size > n_train:
                break
            words = sents_train[perm[data_i:data_i + batch_size]]
            if model_name == "bd_lstm":
                xs, ms = utils.make_batch(words, train=True, tail=False, mask=True)
                ys = model.forward(xs=xs, ms=ms, train=True)
            else:
                xs = utils.make_batch(words, train=True, tail=False)
                ys = model.forward(ts=xs, train=True)
            ys = F.concat(ys, axis=0)
            ts = F.concat(xs, axis=0)
            ys = F.reshape(ys, (-1, vocab_size))  # (TN, |V|)
            ts = F.reshape(ts, (-1,))             # (TN,)
            loss = F.softmax_cross_entropy(ys, ts)
            acc = F.accuracy(ys, ts, ignore_label=-1)
            model.zerograds()
            loss.backward()
            loss.unchain_backward()
            opt.update()
            it += 1
            loss_data = float(cuda.to_cpu(loss.data))
            perp = math.exp(loss_data)
            acc_data = float(cuda.to_cpu(acc.data))
            print "[training] iter=%d, epoch=%d (%d/%d=%.03f%%), perplexity=%f, accuracy=%.2f%%" \
                % (it, epoch, data_i+batch_size, n_train,
                   float(data_i+batch_size)/n_train*100, perp, acc_data*100)
            if it % EVAL == 0:
                print "[info] Evaluating on the validation sentences ..."
                loss_data, acc_data = evaluate(model, model_name, sents_val, ivocab)
                perp = math.exp(loss_data)
                print "[validation] iter=%d, epoch=%d, perplexity=%f, accuracy=%.2f%%" \
                    % (it, epoch, perp, acc_data*100)
                serializers.save_npz(
                    path_save_head + ".iter_%d.epoch_%d.model" % (it, epoch), model)
                utils.save_word2vec(
                    path_save_head + ".iter_%d.epoch_%d.vectors.txt" % (it, epoch),
                    utils.extract_word2vec(model, vocab))
                print "[info] Saved."

    print "[info] Done."
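# utils.create_word_embeddings is called above with (vocab, word2vec, dim,
# scale) but is not shown in this file. A plausible sketch, assuming vocab is
# a word -> index dict and word2vec a word -> vector dict: initialise every
# row with small random noise (scaled by `scale`) and overwrite the rows of
# words that have a pre-trained vector. This is an assumption, not the
# project's actual implementation.
import numpy as np

def create_word_embeddings_sketch(vocab, word2vec, dim, scale=0.001):
    initialW = np.random.uniform(-scale, scale, (len(vocab), dim)).astype(np.float32)
    for word, idx in vocab.items():
        if word in word2vec:
            initialW[idx] = word2vec[word]
    return initialW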
def main():
    parser = argparse.ArgumentParser(description="-----[CNN-classifier]-----")
    parser.add_argument("--similarity", default=0.0, type=float,
                        help="similarity threshold")
    parser.add_argument("--similarity_representation", default="W2V",
                        help="similarity representation. Available methods: CNN, AUTOENCODER, W2V")
    parser.add_argument("--mode", default="train",
                        help="train: train (with test) a model / test: test saved models")
    parser.add_argument("--model", default="cnn",
                        help="Type of model to use. Default: CNN. Available models: CNN, RNN")
    parser.add_argument("--embedding", default="w2v",
                        help="available embeddings: random, w2v")
    parser.add_argument("--dataset", default="MR",
                        help="available datasets: MR, TREC")
    parser.add_argument("--encoder", default=None,
                        help="Path to encoder model file")
    parser.add_argument("--decoder", default=None,
                        help="Path to decoder model file")
    parser.add_argument('--batch-size', type=int, default=32,
                        help='batch size for training [default: 32]')
    parser.add_argument('--selection-size', type=int, default=32,
                        help='selection size for selection function [default: 32]')
    parser.add_argument("--save_model", default="F",
                        help="whether to save the model or not (T/F)")
    parser.add_argument("--early_stopping", default="F",
                        help="whether to apply early stopping (T/F)")
    parser.add_argument("--epoch", default=100, type=int,
                        help="maximum number of epochs")
    parser.add_argument("--learning_rate", default=0.1, type=float,
                        help="learning rate")
    parser.add_argument("--dropout_embed", default=0.2, type=float,
                        help="Dropout embed probability. Default: 0.2")
    parser.add_argument("--dropout_model", default=0.4, type=float,
                        help="Dropout model probability. Default: 0.4")
    parser.add_argument('--device', type=int, default=0,
                        help='CUDA device to run on')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disable the gpu')
    parser.add_argument("--scorefn", default="entropy",
                        help="available scoring functions: entropy, random, egl")
    parser.add_argument('--average', type=int, default=1,
                        help='Number of runs to average [default: 1]')
    parser.add_argument('--hnodes', type=int, default=256,
                        help='Number of nodes in the hidden layer(s)')
    parser.add_argument('--hlayers', type=int, default=1,
                        help='Number of hidden layers')
    parser.add_argument('--weight_decay', type=float, default=1e-5,
                        help='Value of weight_decay')
    parser.add_argument('--no-log', action='store_true', default=False,
                        help='Disable logging')
    parser.add_argument('--data_path', default='/data/stud/jorgebjorn/data', type=str,
                        help='Dir path to datasets')
    parser.add_argument('--c', default='', type=str,
                        help='Comment for the run')
    options = parser.parse_args()

    params["DATA_PATH"] = options.data_path  # TODO rewrite?
    getattr(utils, "read_{}".format(options.dataset))()
    data["vocab"] = sorted(
        list(set([w for sent in data["train_x"] + data["dev_x"] + data["test_x"]
                  for w in sent])))
    data["classes"] = sorted(list(set(data["train_y"])))
    data["word_to_idx"] = {w: i for i, w in enumerate(data["vocab"])}

    params_local = {
        "SIMILARITY_THRESHOLD": options.similarity,
        "SIMILARITY_REPRESENTATION": options.similarity_representation,
        "DATA_PATH": options.data_path,
        "MODEL": options.model,
        "EMBEDDING": options.embedding,
        "DATASET": options.dataset,
        "SAVE_MODEL": bool(options.save_model == "T"),
        "EARLY_STOPPING": bool(options.early_stopping == "T"),
        "EPOCH": options.epoch,
        "LEARNING_RATE": options.learning_rate,
        "MAX_SENT_LEN": max([len(sent) for sent in
                             data["train_x"] + data["dev_x"] + data["test_x"]]),
        "SELECTION_SIZE": options.selection_size,
        "BATCH_SIZE": options.batch_size,
        "WORD_DIM": 300,
        "VOCAB_SIZE": len(data["vocab"]),
        "CLASS_SIZE": len(data["classes"]),
        "FILTERS": [3, 4, 5],
        "FILTER_NUM": [100, 100, 100],
        "DROPOUT_EMBED": options.dropout_embed,
        "DROPOUT_MODEL": options.dropout_model,
        "DEVICE": options.device,
        "NO_CUDA": options.no_cuda,
        "SCORE_FN": options.scorefn,
        "N_AVERAGE": options.average,
        "HIDDEN_SIZE": options.hnodes,
        "HIDDEN_LAYERS": options.hlayers,
        "WEIGHT_DECAY": options.weight_decay,
        "LOG": not options.no_log,
        "ENCODER": options.encoder,
        "DECODER": options.decoder,
        "C": options.c,
    }
    for key in params_local:
        params[key] = params_local[key]

    if params["LOG"]:
        logger_name = 'SS/{}_{}_{}_{}_{}'.format(
            getpass.getuser(),
            datetime.datetime.now().strftime("%d-%m-%y_%H:%M"),
            options.dataset,
            params["C"],
            str(uuid.uuid4())[:4])
        global_logger["lg"] = VisdomLogger(
            logger_name,
            "{}_{}".format(params["SIMILARITY_THRESHOLD"],
                           params["SIMILARITY_REPRESENTATION"]))
        # global_logger["lg"].parameters_summary()
        print("visdom logger OK")
        # quit()

    params["CUDA"] = (not params["NO_CUDA"]) and torch.cuda.is_available()
    del params["NO_CUDA"]
    if params["CUDA"]:
        torch.cuda.set_device(params["DEVICE"])

    if params["EMBEDDING"] == "w2v":
        # The returned vectors are not assigned here; load_word2vec presumably
        # caches them on the utils side.
        utils.load_word2vec()

    encoder = rnnae.EncoderRNN()
    # decoder = rnnae.DecoderRNN()
    decoder = rnnae.AttnDecoderRNN()
    feature_extractor = CNN2()

    if params["ENCODER"] is not None:
        print("Loading encoder")
        encoder.load_state_dict(torch.load(params["ENCODER"]))
    if params["DECODER"] is not None:
        print("Loading decoder")
        decoder.load_state_dict(torch.load(params["DECODER"]))

    if params["CUDA"]:
        encoder, decoder, feature_extractor = \
            encoder.cuda(), decoder.cuda(), feature_extractor.cuda()

    models["ENCODER"] = encoder
    models["DECODER"] = decoder
    models["FEATURE_EXTRACTOR"] = feature_extractor

    print("=" * 20 + "INFORMATION" + "=" * 20)
    for key, value in params.items():
        print("{}: {}".format(key.upper(), value))

    if params["EMBEDDING"] == "random" and params["SIMILARITY_THRESHOLD"] > 0:
        print("********** WARNING *********")
        print("Random embeddings make the similarity threshold have no effect.\n")

    print("=" * 20 + "TRAINING STARTED" + "=" * 20)
    train.active_train()
    print("=" * 20 + "TRAINING FINISHED" + "=" * 20)
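# data["word_to_idx"] and params["MAX_SENT_LEN"] built above are typically
# used to turn token lists into fixed-length index sequences before they are
# fed to the CNN. The helper below is only a sketch of that step, not code
# from the project; using len(word_to_idx) as the padding index is an
# assumption.
def sents_to_indices_sketch(sents, word_to_idx, max_len):
    pad_idx = len(word_to_idx)  # assumed: one extra row reserved for padding
    batch = []
    for sent in sents:
        idxs = [word_to_idx[w] for w in sent if w in word_to_idx][:max_len]
        idxs += [pad_idx] * (max_len - len(idxs))
        batch.append(idxs)
    return batch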
def train():
    with tf.device('/cpu:0'):
        train_text, train_y, train_pos1, train_pos2, train_x_text_clean, train_sentence_len = \
            data_helpers.load_data_and_labels(FLAGS.train_path)
    with tf.device('/cpu:0'):
        test_text, test_y, test_pos1, test_pos2, test_x_text_clean, test_sentence_len = \
            data_helpers.load_data_and_labels(FLAGS.test_path)

    # Build vocabulary
    # Example: x_text[3] = "A misty <e1>ridge</e1> uprises from the <e2>surge</e2>."
    #          ['a misty ridge uprises from the surge <UNK> <UNK> ... <UNK>']
    #          => [27 39 40 41 42 1 43 0 0 ... 0]
    # dimension = FLAGS.max_sentence_length
    vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
        FLAGS.max_sentence_length)
    vocab_processor.fit(train_text + test_text)
    train_x = np.array(list(vocab_processor.transform(train_text)))
    test_x = np.array(list(vocab_processor.transform(test_text)))
    train_text = np.array(train_text)
    print("train_text", train_text[0:2])
    test_text = np.array(test_text)
    print("\nText Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("train_x = {0}".format(train_x.shape))  # (8000, 90)
    print("train_y = {0}".format(train_y.shape))  # (8000, 19)
    print("test_x = {0}".format(test_x.shape))    # (2717, 90)
    print("test_y = {0}".format(test_y.shape))    # (2717, 19)

    # Example: pos1[3] = [-2 -1 0 1 2 3 4 999 999 999 ... 999]
    #                    [95 96 97 98 99 100 101 999 999 999 ... 999]
    #          => [11 12 13 14 15 16 21 17 17 17 ... 17]
    # dimension = MAX_SENTENCE_LENGTH
    pos_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
        FLAGS.max_sentence_length)
    pos_vocab_processor.fit(train_pos1 + train_pos2 + test_pos1 + test_pos2)
    train_p1 = np.array(list(pos_vocab_processor.transform(train_pos1)))
    train_p2 = np.array(list(pos_vocab_processor.transform(train_pos2)))
    test_p1 = np.array(list(pos_vocab_processor.transform(test_pos1)))
    test_p2 = np.array(list(pos_vocab_processor.transform(test_pos2)))
    print("\nPosition Vocabulary Size: {:d}".format(len(pos_vocab_processor.vocabulary_)))
    print("train_p1 = {0}".format(train_p1.shape))  # (8000, 90)
    print("test_p1 = {0}".format(test_p1.shape))    # (2717, 90)
    print("")

    # Randomly shuffle data to split into train and dev
    # np.random.seed(10)
    # shuffle_indices = np.random.permutation(np.arange(len(y)))  # len(y) = 8000
    # x_shuffled = x[shuffle_indices]
    # p1_shuffled = p1[shuffle_indices]
    # p2_shuffled = p2[shuffle_indices]
    # y_shuffled = y[shuffle_indices]

    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    # dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))  # x_train = 7200, x_dev = 800
    # x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    # p1_train, p1_dev = p1_shuffled[:dev_sample_index], p1_shuffled[dev_sample_index:]
    # p2_train, p2_dev = p2_shuffled[:dev_sample_index], p2_shuffled[dev_sample_index:]
    # y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    # print("Train/Dev split: {:d}/{:d}\n".format(len(y_train), len(y_dev)))

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextCNN(
                sequence_length=FLAGS.max_sentence_length,  # 90
                num_classes=train_y.shape[1],               # 19
                text_vocab_size=len(vocab_processor.vocabulary_),     # 19151
                text_embedding_size=FLAGS.text_embedding_size,        # 300
                pos_vocab_size=len(pos_vocab_processor.vocabulary_),  # 162
                pos_embedding_size=FLAGS.pos_embedding_dim,           # 50
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),  # 2,3,4,5
                num_filters=FLAGS.num_filters,      # 128
                l2_reg_lambda=FLAGS.l2_reg_lambda,  # 1e-5
                use_elmo=(FLAGS.embeddings == 'elmo'))

            # Define training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdadeltaOptimizer(FLAGS.learning_rate, FLAGS.decay_rate, 1e-6)
            gvs = optimizer.compute_gradients(cnn.loss)
            capped_gvs = [(tf.clip_by_value(grad, -1.0, 1.0), var) for grad, var in gvs]
            train_op = optimizer.apply_gradients(capped_gvs, global_step=global_step)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
            print("\nWriting to {}\n".format(out_dir))

            # Logger
            logger = Logger(out_dir)

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # Checkpoint directory. TensorFlow assumes this directory already exists, so create it.
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)

            # Write vocabularies
            vocab_processor.save(os.path.join(out_dir, "vocab"))
            pos_vocab_processor.save(os.path.join(out_dir, "pos_vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            if FLAGS.embeddings == "word2vec":
                pretrain_W = utils.load_word2vec(
                    'resource/GoogleNews-vectors-negative300.bin',
                    FLAGS.embedding_size, vocab_processor)
                sess.run(cnn.W_text.assign(pretrain_W))
                print("Successfully loaded pre-trained word2vec embeddings!\n")
            elif FLAGS.embeddings == "glove100":
                pretrain_W = utils.load_glove('resource/glove.6B.100d.txt',
                                              FLAGS.embedding_size, vocab_processor)
                sess.run(cnn.W_text.assign(pretrain_W))
                print("Successfully loaded pre-trained glove100 embeddings!\n")
            elif FLAGS.embeddings == "glove300":
                pretrain_W = utils.load_glove('resource/glove.840B.300d.txt',
                                              FLAGS.embedding_size, vocab_processor)
                sess.run(cnn.W_text.assign(pretrain_W))
                print("Successfully loaded pre-trained glove300 embeddings!\n")

            # Generate batches
            train_batches = data_helpers.batch_iter(
                list(zip(train_x, train_y, train_text, train_p1, train_p2)),
                FLAGS.batch_size, FLAGS.num_epochs)
            # Training loop. For each batch...
            best_f1 = 0.0  # best evaluation F1 so far, used to decide when to checkpoint
            for train_batch in train_batches:
                train_bx, train_by, train_btxt, train_bp1, train_bp2 = zip(*train_batch)
                feed_dict = {
                    cnn.input_text: train_bx,
                    cnn.input_y: train_by,
                    cnn.input_x_text: list(train_btxt),
                    cnn.input_p1: train_bp1,
                    cnn.input_p2: train_bp2,
                    cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, summaries, loss, accuracy = sess.run(
                    [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                    feed_dict)
                train_summary_writer.add_summary(summaries, step)

                # Training log display
                if step % FLAGS.display_every == 0:
                    logger.logging_train(step, loss, accuracy)

                # Evaluation
                if step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    # Generate batches
                    test_batches = data_helpers.batch_iter(
                        list(zip(test_x, test_y, test_text, test_p1, test_p2)),
                        FLAGS.batch_size, 1, shuffle=False)
                    # Evaluation loop. For each batch...
                    losses = 0.0
                    accuracy = 0.0
                    predictions = []
                    iter_cnt = 0
                    for test_batch in test_batches:
                        test_bx, test_by, test_btxt, test_bp1, test_bp2 = zip(*test_batch)
                        feed_dict = {
                            cnn.input_text: test_bx,
                            cnn.input_y: test_by,
                            cnn.input_x_text: list(test_btxt),
                            cnn.input_p1: test_bp1,
                            cnn.input_p2: test_bp2,
                            cnn.dropout_keep_prob: 1.0
                        }
                        loss, acc, pred = sess.run(
                            [cnn.loss, cnn.accuracy, cnn.predictions], feed_dict)
                        losses += loss
                        accuracy += acc
                        predictions += pred.tolist()
                        iter_cnt += 1
                    losses /= iter_cnt
                    accuracy /= iter_cnt
                    predictions = np.array(predictions, dtype='int')
                    # Log the averaged loss over all evaluation batches, not the last batch's loss.
                    logger.logging_eval(step, losses, accuracy, predictions)

                    # Model checkpoint
                    if best_f1 < logger.best_f1:
                        best_f1 = logger.best_f1
                        path = saver.save(sess,
                                          checkpoint_prefix + "-{:.3g}".format(best_f1),
                                          global_step=step)
                        print("Saved model checkpoint to {}\n".format(path))
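# data_helpers.batch_iter is used above with (data, batch_size, num_epochs,
# shuffle) but is not defined in this file. A generator along these lines is
# a common pattern for that signature; treat it as a sketch under those
# assumptions, not the project's actual implementation.
import numpy as np

def batch_iter_sketch(data, batch_size, num_epochs, shuffle=True):
    """Yield lists of examples of size <= batch_size for num_epochs passes over data."""
    n = len(data)
    num_batches = (n + batch_size - 1) // batch_size
    for _ in range(num_epochs):
        order = np.random.permutation(n) if shuffle else np.arange(n)
        for b in range(num_batches):
            batch_idx = order[b * batch_size:(b + 1) * batch_size]
            yield [data[i] for i in batch_idx]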
import PySimpleGUI as sg
from utils import text_file, Pdf, load_word2vec, retrieve, print_output_sents
import random

word2vec = load_word2vec()

# List of all file names for display in '-FILE LIST-'
files_list = []
# Dictionary of all Pdf objects, keyed by filename, so the same Pdf object is
# not recreated every time another pdf is uploaded
pdf_obj = {}
# Dictionary of all text file objects
txt_obj = {}
# For printing colored output, so results from different documents can be told apart
cprint = sg.cprint
# List of possible background colors for cprint to use
background_colors = ['yellow', 'orange']


def make_win1():
    file_list_column = [
        [sg.Text('Upload file'),
         sg.In(size=(25, 1), enable_events=True, key='-FILES-'),
         sg.FileBrowse()],
        [sg.Checkbox('word2vec', key='-WORD2VEC-')],
        [sg.Listbox(values=[], size=(40, 20), enable_events=True,
                    select_mode='multiple', key='-FILE LIST-')]]