def __init__(self, model_name, max_length, curdir):
    """Load the intent/slot dataset, encode it with a BERT tokenizer, and
    build + compile the joint intent-detection / slot-filling model.

    Args:
        model_name: pretrained model identifier passed to BertTokenizer.
        max_length: maximum token-sequence length used for every encoding.
        curdir: base directory handed to load_prepare_dataset.
    """
    # NOTE(review): intent_names is unpacked but never used in this method.
    df_train, df_valid, df_test, intent_names, \
        self.intent_map, self.slot_map = load_prepare_dataset(curdir)
    # Y's: map string intent labels to integer class ids.
    self.intent_train = df_train["intent_label"].map(
        self.intent_map).values
    self.intent_valid = df_valid["intent_label"].map(
        self.intent_map).values
    self.intent_test = df_test["intent_label"].map(self.intent_map).values
    tokenizer = BertTokenizer.from_pretrained(model_name)
    self.curdir = curdir
    # X's: tokenize the word sequences into fixed-length model inputs.
    print('Encoding data...')
    self.encoded_train = encode_dataset(tokenizer, df_train["words"],
                                        max_length)
    self.encoded_valid = encode_dataset(tokenizer, df_valid["words"],
                                        max_length)
    self.encoded_test = encode_dataset(tokenizer, df_test["words"],
                                       max_length)
    # Per-token slot labels aligned with the tokenizer's sub-word split.
    self.slot_train = encode_token_labels(df_train["words"],
                                          df_train["word_labels"], tokenizer,
                                          self.slot_map, max_length)
    self.slot_valid = encode_token_labels(df_valid["words"],
                                          df_valid["word_labels"], tokenizer,
                                          self.slot_map, max_length)
    self.slot_test = encode_token_labels(df_test["words"],
                                         df_test["word_labels"], tokenizer,
                                         self.slot_map, max_length)
    self.intent_model = SlotIntentDetectorModelBase(
        intent_num_labels=len(self.intent_map),
        slot_num_labels=len(self.slot_map))
    opt = Adam(learning_rate=3e-5, epsilon=1e-08)
    # Two output heads (intent, slots), each trained with sparse CE on logits.
    losses = [
        SparseCategoricalCrossentropy(from_logits=True),
        SparseCategoricalCrossentropy(from_logits=True)
    ]
    metrics = [SparseCategoricalAccuracy('accuracy')]
    self.intent_model.compile(optimizer=opt, loss=losses, metrics=metrics)
def predict(self, data_dir):
    """Encode the dataset under *data_dir*, restore the best saved
    parameters into the graph, and return argmax class predictions.

    NOTE(review): `save_dir` and `desc` are neither parameters nor
    attributes — presumably module-level globals; confirm against the
    enclosing module.
    """
    teX1, teX2, _ = encode_dataset(self.text_encoder, atec(data_dir))
    teX, teM = self.transform_roc(teX1, teX2)
    self.build_graph()
    # Restore the best checkpoint written during training.
    self.sess.run([
        p.assign(ip) for p, ip in zip(
            self.params,
            joblib.load(os.path.join(save_dir, desc, 'best_params.jl')))
    ])
    # Predicted class = argmax over the logits axis.
    pred_fn = lambda x: np.argmax(x, 1)
    predictions = pred_fn(self.iter_predict(teX, teM))
    return predictions
def predict_sub_instances(text_encoder, sub_instances):
    """Predict labels for a list of premise/hypothesis sub-instances.

    Relies on the module-level names `dataset`, `pred_fns`,
    `label_decoders`, `transform_entailment` and `iter_predict`.
    Returns a list of predictions, decoded to dataset-specific labels
    when a decoder is registered, else raw class indices.
    """
    global dataset
    # Empty input: nothing to predict.
    if not len(sub_instances):
        return []
    prems, hyps, ys = zip(*[(sub["premise"], sub["hypothesis"], sub["label"])
                            for sub in sub_instances])
    test_set = encode_dataset([(prems, hyps, ys)], encoder=text_encoder)
    (tst_p, tst_h, teY) = test_set[0]
    teX, teM = transform_entailment(tst_p, tst_h)
    pred_fn = pred_fns[dataset]
    label_decoder = label_decoders[dataset]
    predictions = pred_fn(iter_predict(teX, teM))
    # Map raw class indices back to dataset labels when available.
    if label_decoder is not None:
        predictions = [label_decoder[prediction] for prediction in predictions]
    return predictions
# Set up result logging and the BPE text encoder.
logger = ResultLogger(path=os.path.join(
    log_dir, '{}.jsonl'.format(desc)), **args.__dict__)
text_encoder = TextEncoder(encoder_path, bpe_path)
encoder = text_encoder.encoder
n_vocab = len(text_encoder.encoder)
# Register special tokens after the base vocabulary.
encoder['_start_'] = len(encoder)
encoder['_delimiter_'] = len(encoder)
encoder['_classify_'] = len(encoder)
clf_token = encoder['_classify_']
n_special = 3
# Per-segment budget: half the context minus start/delimiter tokens.
max_len = n_ctx // 2 - 2
if dataset == 'rocstories':
    (trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), \
        (teX1, teX2, teX3) = encode_dataset(
            rocstories(data_dir, n_valid=n_valid), encoder=text_encoder)
    n_y = 2
    # Shrink n_ctx to the longest (context + longer ending) pair, +3 specials.
    n_ctx = min(max(
        [len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
         for x1, x2, x3 in zip(trX1, trX2, trX3)]
        + [len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
           for x1, x2, x3 in zip(vaX1, vaX2, vaX3)]
        + [len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
           for x1, x2, x3 in zip(teX1, teX2, teX3)]
    ) + 3, n_ctx)
    vocab = n_vocab + n_special + n_ctx
    trX, trM = transform_roc(trX1, trX2, trX3)
    vaX, vaM = transform_roc(vaX1, vaX2, vaX3)
    # Test transforms only needed when producing a submission.
    if submit:
        teX, teM = transform_roc(teX1, teX2, teX3)
# Device selection and run logging.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print("device", device, "n_gpu", n_gpu)
logger = ResultLogger(path=os.path.join(log_dir, '{}log.json'.format(desc)),
                      **args.__dict__)
text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
encoder = text_encoder.encoder
n_vocab = len(text_encoder.encoder)
print("Encoding dataset...")
dataLoader = DataLoader()
# Test split carries no labels, hence the 1-tuple (teX, ).
((trX, trY), (vaX, vaY),
 (teX, )) = encode_dataset(*dataLoader.veracity(data_dir, topic=topic),
                           encoder=text_encoder)
# Special tokens appended after the base vocabulary.
encoder['_start_'] = len(encoder)
encoder['_classify_'] = len(encoder)
clf_token = encoder['_classify_']
n_special = 2
# NOTE(review): max_len is derived from the *initial* n_ctx, which is then
# shrunk just below — confirm this ordering is intentional.
max_len = n_ctx - 2
# Define maximum context as the minimum of [512, x] where x is the max sentence length
n_ctx = min(
    max([len(x[:max_len]) for x in trX] + [len(x[:max_len]) for x in vaX] +
        [len(x[:max_len]) for x in teX]) + 3, n_ctx)
vocab = n_vocab + n_special + n_ctx
training_engine = TrainingEngine()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() logger.info("device {} n_gpu {}".format(device, n_gpu)) res_logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)), **args.__dict__) text_encoder = TextEncoder(args.encoder_path, args.bpe_path) encoder = text_encoder.encoder n_vocab = len(text_encoder.encoder) logger.info("Encoding dataset...") ((trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), (teX1, teX2, teX3)) = encode_dataset(*rocstories(data_dir, n_valid=args.n_valid), encoder=text_encoder) encoder['_start_'] = len(encoder) encoder['_delimiter_'] = len(encoder) encoder['_classify_'] = len(encoder) clf_token = encoder['_classify_'] n_special = 3 max_len = n_ctx // 2 - 2 n_ctx = min( max([ len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(trX1, trX2, trX3) ] + [ len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(vaX1, vaX2, vaX3) ] + [
for step in ins['text'] ] v_passage.append(text) v_ing.append(ins['ing']) v_gold.append(ins['gold']) v_all_ings.append(ins['all_ings']) dataset = ( tlm_passage, tlm_ing, ), (t_ing, t_gold), (v_ing, v_gold) (( trlmX1, trlmX2, ), (trX2, trY), (teX2, teY)) = encode_dataset(*dataset, encoder=text_encoder) trX1 = encode_dataset_whole(t_passage, encoder=text_encoder) teX1 = encode_dataset_whole(v_passage, encoder=text_encoder) trX3 = encode_dataset_whole(t_all_ings, encoder=text_encoder) teX3 = encode_dataset_whole(v_all_ings, encoder=text_encoder) print(n_ctx) vocab = n_vocab + n_special + n_ctx trlmX, trlmM = transform_recipe_whole_just_recipe(trlmX1, trlmX2, trlmX2) trlmX, valmX = trlmX[:-lmval], trlmX[-lmval:] trlmM, valmM = trlmM[:-lmval], trlmM[-lmval:] trX, trM = transform_recipe_whole(trX1, trX2, trX3) trX, vaX = trX[:-taskval], trX[-taskval:]
log_dir = args.log_dir
submission_dir = args.submission_dir
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print("device", device, "n_gpu", n_gpu)
logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)),
                      **args.__dict__)
text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
encoder = text_encoder.encoder
n_vocab = len(text_encoder.encoder)
print("Encoding dataset...")
# Single-sequence task: one X per split; test split has no labels.
((trX1, trY), (vaX1, vaY),
 teX1) = encode_dataset(*checkpoint5(data_dir, n_valid=args.n_valid),
                        encoder=text_encoder)
#print(trX1)
# Special tokens appended after the base vocabulary.
encoder['_start_'] = len(encoder)
encoder['_delimiter_'] = len(encoder)
encoder['_classify_'] = len(encoder)
clf_token = encoder['_classify_']
n_special = 3
max_len = n_ctx // 2 - 2
# Shrink the context to the longest encoded example (+2 for start/classify).
n_ctx = min(
    max([len(x1[:max_len]) for x1 in trX1] +
        [len(x1[:max_len]) for x1 in vaX1] +
        [len(x1[:max_len]) for x1 in teX1]) + 2, n_ctx)
vocab = n_vocab + n_special + n_ctx
trX, trM = transform_checkpoint5(trX1)
data_dir = args.data_dir
log_dir = args.log_dir
submission_dir = args.submission_dir
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print("device", device, "n_gpu", n_gpu)
logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)),
                      **args.__dict__)
text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
encoder = text_encoder.encoder
n_vocab = len(text_encoder.encoder)
start_time = time.time()
print("Encoding dataset...")
(trX, vaX) = encode_dataset(*getData(data_dir, n_valid=args.n_valid),
                            encoder=text_encoder)
# NOTE(review): n_special is 3 but only two special tokens are registered
# here — confirm against the model's embedding layout.
encoder['_start_'] = len(encoder)
encoder['_delimiter_'] = len(encoder)
n_special = 3
max_len = n_ctx // 2 - 2
# Context length is hard-coded rather than derived from the data.
n_ctx = 626 * 2 + 4
vocab = n_vocab + n_special + n_ctx
print(vocab)
trX, trM = transform_roc(trX)
vaX, vaM = transform_roc(vaX)
n_train = len(trX)
n_valid = len(vaX)
# Effective batch size scales with the number of GPUs (min 1).
n_batch_train = args.n_batch * max(n_gpu, 1)
data_dir = args.data_dir
log_dir = args.log_dir
submission_dir = args.submission_dir
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print("device", device, "n_gpu", n_gpu)
logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)),
                      **args.__dict__)
text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
encoder = text_encoder.encoder
n_vocab = len(text_encoder.encoder)
print("Encoding dataset...")
firstsent, secondsent = getData(data_dir, n_valid=args.n_valid)
# BPE-encode both sentence collections.
firstbpe, secondbpe = encode_dataset(*(firstsent, secondsent),
                                     encoder=text_encoder)
# NOTE(review): n_special is 3 but only two special tokens are registered
# here — confirm against the model's embedding layout.
encoder['_start_'] = len(encoder)
encoder['_delimiter_'] = len(encoder)
n_special = 3
max_len = n_ctx // 2 - 2
# Context length is hard-coded rather than derived from the data.
n_ctx = 1256
vocab = n_vocab + n_special + n_ctx
n_train = len(firstsent)
n_valid = len(secondsent)
# Effective batch size scales with the number of GPUs (min 1).
n_batch_train = args.n_batch * max(n_gpu, 1)
n_updates_total = (n_train // n_batch_train) * args.n_iter
dh_model = LMModel(args, vocab, n_ctx)
def train(self):
    """Build the multi-GPU TF1 training graph, fine-tune on the ATEC pair
    data, and checkpoint the best parameters by validation F1.

    NOTE(review): reads many module-level globals (n_ctx, n_gpu, lr, b1,
    b2, e, lm_coef, max_grad_norm, data_dir, save_dir, desc, n_iter,
    pre_load, new_model, n_embd, sess) — confirm against the module.
    """
    global_step = tf.train.get_or_create_global_step()
    # Fixed-batch placeholders feed the multi-GPU path; the [None, ...]
    # placeholders serve variable-sized evaluation batches.
    X_train = tf.placeholder(tf.int32, [self.n_batch_train, 2, n_ctx, 2])
    M_train = tf.placeholder(tf.float32, [self.n_batch_train, 2, n_ctx])
    X = tf.placeholder(tf.int32, [None, 2, n_ctx, 2])
    M = tf.placeholder(tf.float32, [None, 2, n_ctx])
    Y_train = tf.placeholder(tf.int32, [self.n_batch_train])
    Y = tf.placeholder(tf.int32, [None])
    #self.train, self.logits, self.clf_losses, self.lm_losses = self.mgpu_train(self.X_train, self.M_train, self.Y_train)
    xs = [X_train, M_train, Y_train]
    gpu_ops = []
    gpu_grads = []
    # Shard each tensor along the batch axis, one shard per GPU tower.
    xs = (tf.split(x, n_gpu, 0) for x in xs)
    optimizer = tf.train.AdamOptimizer(learning_rate=lr,
                                       beta1=b1,
                                       beta2=b2,
                                       epsilon=e)
    for i, xs in enumerate(zip(*xs)):
        # Reuse variables on every tower after the first.
        do_reuse = True if i > 0 else None
        with tf.device(assign_to_gpu(i, "/gpu:0")), tf.variable_scope(
                tf.get_variable_scope(), reuse=do_reuse):
            logits, clf_losses, lm_losses = self.model(*xs,
                                                       train=True,
                                                       reuse=do_reuse)
            # Optionally mix in the language-model auxiliary loss.
            if lm_coef > 0:
                train_loss = tf.reduce_mean(
                    clf_losses) + lm_coef * tf.reduce_mean(lm_losses)
            else:
                train_loss = tf.reduce_mean(clf_losses)
            raw_grads_and_vars = optimizer.compute_gradients(train_loss)
            # Clip every gradient by global norm before cross-tower averaging.
            grads_and_vars = [(tf.clip_by_global_norm([gv[0]],
                                                      max_grad_norm)[0][0],
                               gv[1]) for gv in raw_grads_and_vars]
            gpu_grads.append(grads_and_vars)
            gpu_ops.append([logits, clf_losses, lm_losses])
    # Concatenate per-tower outputs back into full-batch tensors.
    ops = [tf.concat(op, 0) for op in zip(*gpu_ops)]
    logits, clf_losses, lm_losses = ops
    grads = average_grads(gpu_grads)
    train_op = optimizer.apply_gradients(grads, global_step=global_step)
    clf_loss = tf.reduce_mean(clf_losses)
    saver = tf.train.Saver(max_to_keep=5)
    self.params = find_trainable_variables('model_lm')
    self.eval_mgpu_logits, self.eval_mgpu_clf_losses, self.eval_mgpu_lm_losses = self.mgpu_predict(
        X_train, M_train, Y_train)
    self.eval_logits, self.eval_clf_losses, self.eval_lm_losses = self.model(
        X, M, Y, train=False, reuse=True)
    self.eval_clf_loss = tf.reduce_mean(self.eval_clf_losses)
    self.eval_mgpu_clf_loss = tf.reduce_mean(self.eval_mgpu_clf_losses)
    summary_op = tf.get_collection(tf.GraphKeys.SUMMARIES)

    def trva_split(data, index):
        # Select the rows of `data` at the given positions.
        return [data[i] for i in index]

    x1, x2, y = encode_dataset(self.text_encoder, atec(data_dir))
    valid_index = np.load('data/valid_index.npy')
    # The paraphrase file stores four stacked copies of the data; replicate
    # the validation split into each quarter.
    if data_dir == 'data/para.tsv':
        valid_index = np.concatenate([
            valid_index, valid_index + len(y) // 4,
            valid_index + len(y) // 2, valid_index + 3 * len(y) // 4
        ])
    valid_index = valid_index.tolist()
    # Train indices = complement of the validation indices.
    train_index = list(set(valid_index) ^ set(range(len(y))))
    trX1, trX2, trY = trva_split(x1, train_index), trva_split(
        x2, train_index), trva_split(y, train_index)
    vaX1, vaX2, vaY = trva_split(x1, valid_index), trva_split(
        x2, valid_index), trva_split(y, valid_index)
    trX, trM = self.transform_roc(trX1, trX2)
    vaX, vaM = self.transform_roc(vaX1, vaX2)
    n_train = len(trY)
    n_valid = len(vaY)
    self.n_updates_total = (n_train // self.n_batch_train) * n_iter
    self.build_graph()
    if pre_load:
        # Load published pretrained weights and splice in fresh embeddings
        # for the special tokens before the positional rows.
        shapes = json.load(open('model/params_shapes.json'))
        offsets = np.cumsum([np.prod(shape) for shape in shapes])
        init_params = [
            np.load('model/params_{}.npy'.format(n)) for n in range(10)
        ]
        init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
        init_params = [
            param.reshape(shape) for param, shape in zip(init_params, shapes)
        ]
        init_params[0] = init_params[0][:+n_ctx]
        init_params[0] = np.concatenate([
            init_params[1],
            (np.random.randn(self.n_special, n_embd) * 0.02).astype(
                np.float32), init_params[0]
        ], 0)
        del init_params[1]
        # n_transfer == -1 means transfer nothing; otherwise transfer the
        # embedding block plus 12 tensors per transformer layer.
        if self.n_transfer == -1:
            self.n_transfer = 0
        else:
            self.n_transfer = 1 + self.n_transfer * 12
        self.sess.run([
            p.assign(ip) for p, ip in zip(self.params[:self.n_transfer],
                                          init_params[:self.n_transfer])
        ])
    if not new_model:
        print('loading old model')
        self.load()
        print('load success')
    n_updates = 0
    n_epochs = 0
    self.save(os.path.join(save_dir, desc, 'best_params.jl'))
    self.best_score = 0

    def log():
        """Evaluate on train-head and validation, log, and checkpoint on
        improved validation F1."""

        def iter_apply(Xs, Ms, Ys):
            # Accumulate concatenated logits and summed (size-weighted) loss.
            fns = [
                lambda x: np.concatenate(x, 0), lambda x: float(np.sum(x))
            ]
            results = []
            for xmb, mmb, ymb in iter_data((Xs, Ms, Ys),
                                           n_batch=self.n_batch_train,
                                           truncate=False,
                                           verbose=True):
                n = len(xmb)
                # Full batches go through the multi-GPU eval path; the
                # ragged final batch uses the variable-size placeholders.
                if n == self.n_batch_train:
                    res = sess.run(
                        [self.eval_mgpu_logits, self.eval_mgpu_clf_loss], {
                            X_train: xmb,
                            M_train: mmb,
                            Y_train: ymb
                        })
                else:
                    res = sess.run([self.eval_logits, self.eval_clf_loss], {
                        X: xmb,
                        M: mmb,
                        Y: ymb
                    })
                res = [r * n for r in res]
                results.append(res)
            results = zip(*results)
            return [fn(res) for res, fn in zip(results, fns)]

        # global best_score
        tr_logits, tr_cost = iter_apply(trX[:n_valid], trM[:n_valid],
                                        trY[:n_valid])
        va_logits, va_cost = iter_apply(vaX, vaM, vaY)
        tr_cost = tr_cost / len(trY[:n_valid])
        va_cost = va_cost / n_valid
        tr_f1 = f1_score(trY[:n_valid], np.argmax(tr_logits, 1)) * 100.
        va_f1 = f1_score(vaY, np.argmax(va_logits, 1)) * 100.
        self.logger.log(n_epochs=n_epochs,
                        n_updates=n_updates,
                        tr_cost=tr_cost,
                        va_cost=va_cost,
                        tr_f1=tr_f1,
                        va_f1=va_f1)
        print('%d %d %.3f %.3f %.2f %.2f' %
              (n_epochs, n_updates, tr_cost, va_cost, tr_f1, va_f1))
        score = va_f1
        # Checkpoint whenever validation F1 improves.
        if score > self.best_score:
            self.best_score = score
            self.save(os.path.join(save_dir, desc, 'best_params.jl'))

    # NOTE(review): the loop below feeds self.clf_loss/self.train and
    # self.X_train etc., while this method built local clf_loss/train_op and
    # placeholders — presumably build_graph() sets the attributes; confirm.
    for i in range(n_iter):
        for xmb, mmb, ymb in iter_data(
            (shuffle(trX, trM, trY, random_state=np.random)),
                n_batch=self.n_batch_train,
                truncate=True,
                verbose=True):
            cost, _ = self.sess.run([self.clf_loss, self.train], {
                self.X_train: xmb,
                self.M_train: mmb,
                self.Y_train: ymb
            })
            n_updates += 1
            # Periodic evaluation every 1000 updates.
            if n_updates % 1000 == 0:
                log()
        n_epochs += 1
        log()
def ccc_train(self):
    """Distributed (parameter-server/worker) language-model pre-training.

    NOTE(review): reads many module-level globals (bootstrap_host,
    bootstrap_port, num_ps, tfrecord_filename, n_ctx, n_iter, n_gpu, lr,
    b1, b2, e, max_grad_norm, save_dir, valid_dir, steps_to_validate) —
    confirm against the module. Indentation of the worker section was
    reconstructed; verify scoping against the original file.
    """
    # Resolve hostnames and ports of other nodes
    host, hosts = client(bootstrap_host, bootstrap_port)
    # Create a cluster and identify the job name and task of this node
    cluster = tf.train.ClusterSpec({
        'ps': hosts[:num_ps],
        'worker': hosts[num_ps:]
    })
    task = hosts.index(host)
    job_name = ('ps', 'worker')[task >= num_ps]
    task = cluster.job_tasks(job_name).index(host)
    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    server = tf.train.Server(cluster,
                             job_name=job_name,
                             task_index=task,
                             config=tf_config)
    if job_name == 'ps':
        # create a shared queue on the parameter server which is visible on /job:ps/task:%d
        with tf.device('/job:ps/task:%d' % task):
            queue = tf.FIFOQueue(cluster.num_tasks('worker'),
                                 tf.int32,
                                 shared_name='done_queue%d' % task)
        # wait for the queue to be filled: one "done" token per worker.
        with tf.Session(server.target) as sess:
            for i in range(cluster.num_tasks('worker')):
                sess.run(queue.dequeue())
                print('ps:%d received "done" from worker:%d' % (task, i))
            print('ps:%d quitting' % task)
    elif job_name == 'worker':
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device='/job:worker/task:%d' % task,
                    cluster=cluster)):
            global_step = tf.train.get_or_create_global_step()
            # Batched token-id sequences from TFRecords, padded to n_ctx.
            sentences = self.batched_data(tfrecord_filename,
                                          self.single_example_parser,
                                          self.n_batch_train,
                                          padded_shapes=tf.Dimension(n_ctx),
                                          num_epochs=n_iter)
            sentences = tf.cast(sentences, tf.int32)
            max_len = tf.shape(sentences)[1]  #sentences.get_shape()[1]
            xmb = tf.reshape(sentences, [self.n_batch_train, 1, max_len, 1])
            # Mask = 1 where a real (non-zero) token is present.
            M_train = tf.cast(
                tf.reshape(tf.sign(xmb), [self.n_batch_train, 1, max_len]),
                tf.float32)
            # Position ids live after the vocabulary + special tokens.
            positions = tf.reshape(tf.range(
                self.n_vocab + self.n_special,
                self.n_vocab + self.n_special + max_len),
                                   shape=[1, 1, max_len, 1])
            #tf.constant(np.arange(self.n_vocab + self.n_special, self.n_vocab + self.n_special + max_len),shape=[1, 1, max_len, 1])
            positions = tf.tile(positions, [self.n_batch_train, 1, 1, 1])
            # Last axis packs (token id, position id) pairs.
            X_train = tf.concat([xmb, positions], axis=3)
            optimizer = tf.train.AdamOptimizer(learning_rate=lr,
                                               beta1=b1,
                                               beta2=b2,
                                               epsilon=e)
            gpu_grads = []
            gpu_loss = []
            gpu_ppl = []
            xs = [X_train, M_train]
            # Shard the batch across GPU towers.
            xs = (tf.split(x, n_gpu, 0) for x in xs)
            for i, xs in enumerate(zip(*xs)):
                do_reuse = True if i > 0 else None
                with tf.device(assign_to_gpu(i)), tf.variable_scope(
                        tf.get_variable_scope(), reuse=do_reuse):
                    lm_losses = self.model(*xs, train=True, num_ps=num_ps)
                    # Perplexity = mean of e^loss over the tower.
                    train_ppl_single = tf.reduce_mean(math.e**lm_losses)
                    train_loss_single = tf.reduce_mean(lm_losses)
                    gpu_loss.append(train_loss_single)
                    gpu_ppl.append(train_ppl_single)
                    # NOTE(review): a second AdamOptimizer is created per
                    # tower, shadowing the one above — confirm intended.
                    optimizer = tf.train.AdamOptimizer(learning_rate=lr,
                                                       beta1=b1,
                                                       beta2=b2,
                                                       epsilon=e)
                    raw_grads_and_vars = optimizer.compute_gradients(
                        train_loss_single)
                    grads_and_vars = [
                        (tf.clip_by_global_norm([gv[0]],
                                                max_grad_norm)[0][0], gv[1])
                        for gv in raw_grads_and_vars
                    ]
                    gpu_grads.append(grads_and_vars)
            train_ppl = tf.reduce_mean(gpu_ppl)
            train_loss = tf.reduce_mean(gpu_loss)
            grads = average_grads(gpu_grads)
            train_op = optimizer.apply_gradients(grads,
                                                 global_step=global_step)
            saver = tf.train.Saver(max_to_keep=5)
            # Validation placeholders (variable batch size).
            X = tf.placeholder(tf.int32, [None, 1, n_ctx, 2])
            M = tf.placeholder(tf.float32, [None, 1, n_ctx])
            valid_lm_losses = self.model(X, M, train=False, reuse=True)
            valid_ppl = tf.reduce_mean(math.e**valid_lm_losses)
            valid_loss = tf.reduce_mean(valid_lm_losses)
            self.params = find_trainable_variables('model_lm')
            tf.summary.scalar('train_loss', train_loss)
            #tf.summary.scalar('valid_loss', valid_loss)
            tf.summary.scalar('train_ppl', train_ppl)
            #tf.summary.scalar('valid_ppl', valid_ppl)
            summary_op = tf.summary.merge_all()
            done_ops = []
            # create a shared queue on the worker which is visible on /job:ps/task:%d
            for i in range(cluster.num_tasks('ps')):
                with tf.device('/job:ps/task:%d' % i):
                    with tf.name_scope('done_queue'):
                        done_queue = tf.FIFOQueue(
                            cluster.num_tasks('worker'),
                            tf.int32,
                            shared_name='done_queue' + str(i))
                        done_ops.append(done_queue.enqueue(task))
            scaffold = tf.train.Scaffold(saver=saver)
            summary_hook = tf.train.SummarySaverHook(save_steps=1000,
                                                     output_dir=save_dir,
                                                     summary_op=summary_op)
            hooks = [
                summary_hook,
                # tf.train.CheckpointSaverHook(save_secs=600, checkpoint_dir=save_dir, saver=saver),
                tf.train.StopAtStepHook(last_step=1000000),
                tf.train.LoggingTensorHook(
                    {
                        'step': global_step,
                        'train_loss': train_loss,
                        'ppl': train_ppl
                    },
                    every_n_iter=100),
                # Signals the parameter servers to shut down when done.
                tf.train.FinalOpsHook([done_ops])
            ]
            valid_data = pre_train_valid(valid_dir)
            vaX1 = encode_dataset(self.text_encoder,
                                  pre_train(valid_data))[0]
            vaX, vaM = self.transform_roc(vaX1)
            with tf.train.MonitoredTrainingSession(
                    master=server.target,
                    is_chief=(task == 0),
                    hooks=hooks,
                    save_checkpoint_secs=600,
                    checkpoint_dir=save_dir,
                    scaffold=scaffold) as sess:
                coord = tf.train.Coordinator()
                threads = tf.train.start_queue_runners(sess=sess,
                                                       coord=coord)
                try:
                    while not coord.should_stop():
                        ppl, loss, _, step = sess.run([
                            train_ppl, train_loss, train_op, global_step
                        ])  #,options=run_options, run_metadata=run_metadata)
                        # Periodic validation + raw-parameter dump.
                        if step % steps_to_validate == 0:
                            va_cost = []
                            va_ppl = []
                            for xm, mm in iter_data(
                                (vaX, vaM),
                                    n_batch=self.n_batch_train,
                                    truncate=False,
                                    verbose=True):
                                ps = sess.run(self.params)
                                joblib.dump(ps,
                                            save_dir + 'model_lm.params',
                                            protocol=2)
                                res, ppl = sess.run(
                                    [valid_loss, valid_ppl], {
                                        X: xm,
                                        M: mm
                                    })
                                va_cost.append(np.sum(res))
                                va_ppl.append(np.sum(ppl))
                            va_cost = np.average(va_cost)
                            va_ppl = np.average(va_ppl)
                            tf.logging.info(
                                '=========n_steps:\t%d valid_cost:\t%.3f valid ppl:\t%.3f=========='
                                % (step, va_cost, va_ppl))
                except tf.errors.OutOfRangeError:
                    print('Epochs Complete!')
                finally:
                    coord.request_stop()
                    coord.join(threads)
print(args)
# Promote parsed CLI args to module-level globals.
globals().update(args.__dict__)
random.seed(seed)
np.random.seed(seed)
tf.set_random_seed(seed)
logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)),
                      **args.__dict__)
text_encoder = TextEncoder(encoder_path, bpe_path)
encoder = text_encoder.encoder
n_vocab = len(text_encoder.encoder)
# data preprocess: 5 inputs per example (passage + question + 4 options).
(trX1, trX2, trX3, trX4, trX5, trY), \
    (vaX1, vaX2, vaX3, vaX4, vaX5, vaY), \
    (teX1, teX2, teX3, teX4, teX5, teY) = encode_dataset(text_encoder,
                                                         race(data_dir))
n_y = 4
encoder['_start_'] = len(encoder)
encoder['_delimiter_'] = len(encoder)
encoder['_classify_'] = len(encoder)
clf_token = encoder['_classify_']
n_special = 3
# Dynamic context sizing disabled; full n_ctx is used instead.
# max_len = n_ctx//2-2
# n_ctx = min(max([len(x1[:max_len])+max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(trX1, trX2, trX3)]+[len(x1[:max_len])+max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(vaX1, vaX2, vaX3)]+[len(x1[:max_len])+max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(teX1, teX2, teX3)])+3, n_ctx)
max_len = n_ctx
trX, trM = transform_race(trX1, trX2, trX3, trX4, trX5)
vaX, vaM = transform_race(vaX1, vaX2, vaX3, vaX4, vaX5)
# Test transforms only needed when producing a submission.
if submit:
    teX, teM = transform_race(teX1, teX2, teX3, teX4, teX5)
n_train = len(trY)
print(args)
# Promote parsed CLI args to module-level globals.
globals().update(args.__dict__)
random.seed(seed)
np.random.seed(seed)
tf.set_random_seed(seed)
logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)),
                      **args.__dict__)
text_encoder = TextEncoder(encoder_path, bpe_path)
encoder = text_encoder.encoder
n_vocab = len(text_encoder.encoder)
# data preprocess: eval-only splits (valid, test, middle-school test,
# high-school test), 5 inputs per example.
(vaX1, vaX2, vaX3, vaX4, vaX5, vaY), \
    (teX1, teX2, teX3, teX4, teX5, teY), \
    (m_teX1, m_teX2, m_teX3, m_teX4, m_teX5, m_teY), \
    (h_teX1, h_teX2, h_teX3, h_teX4, h_teX5, h_teY) = encode_dataset(
        text_encoder, race(data_dir, is_train=False))
n_y = 4
encoder['_start_'] = len(encoder)
encoder['_delimiter_'] = len(encoder)
encoder['_classify_'] = len(encoder)
clf_token = encoder['_classify_']
n_special = 3
# Dynamic context sizing disabled; full n_ctx is used instead.
# max_len = n_ctx//2-2
# n_ctx = min(max([len(x1[:max_len])+max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(trX1, trX2, trX3)]+[len(x1[:max_len])+max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(vaX1, vaX2, vaX3)]+[len(x1[:max_len])+max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(teX1, teX2, teX3)])+3, n_ctx)
max_len = n_ctx
vaX, vaM = transform_race(vaX1, vaX2, vaX3, vaX4, vaX5)
teX, teM = transform_race(teX1, teX2, teX3, teX4, teX5)
m_teX, m_teM = transform_race(m_teX1, m_teX2, m_teX3, m_teX4, m_teX5)
h_teX, h_teM = transform_race(h_teX1, h_teX2, h_teX3, h_teX4, h_teX5)
n_valid = len(vaY)
submission_dir = args.submission_dir device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() print("device", device, "n_gpu", n_gpu) logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)), **args.__dict__) text_encoder = TextEncoder(args.encoder_path, args.bpe_path) encoder = text_encoder.encoder n_vocab = len(text_encoder.encoder) print("Encoding dataset...") ((trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), (teX1, teX2, teX3)) = encode_dataset(*anli(data_dir, n_valid=args.n_valid), encoder=text_encoder) encoder['_start_'] = len(encoder) encoder['_delimiter_'] = len(encoder) encoder['_classify_'] = len(encoder) clf_token = encoder['_classify_'] n_special = 3 max_len = n_ctx // 2 - 2 n_ctx = min( max([ len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(trX1, trX2, trX3) ] + [ len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(vaX1, vaX2, vaX3) ] + [
v_ing.append('ok') v_gold.append([0]) v_all_ings.append(['a','b']) ''' for ins in val_file: text= [step.replace('-lrb-','(').replace('-rrb-', ')') for step in ins['ve_$replaced_text']] v_passage.append(text) v_ing.append(ins['ing']) v_gold.append(ins['gold']) v_all_ings.append(ins['all_ings']) #print(tlm_passage[0]) a = (tlm_passage, tlm_ing,), (t_ing,t_gold),(v_ing,v_gold) ((trlmX1, trlmX2,),(trX2, trY),(vaX2, vaY)) = encode_dataset(*a,encoder = text_encoder) #trlmX1 = encode_dataset_whole(tlm_passage, encoder = text_encoder) trX1 = encode_dataset_whole(t_passage, encoder = text_encoder) vaX1 = encode_dataset_whole(v_passage, encoder = text_encoder) print(vaX1[0][1]) trX3 = encode_dataset_whole(t_all_ings, encoder = text_encoder) vaX3 = encode_dataset_whole(v_all_ings, encoder = text_encoder) n_batch_train = args.n_batch * max(n_gpu, 1) print(n_ctx) vocab = n_vocab + n_special + n_ctx trlmX, trlmM = transform_recipe_whole_just_recipe(trlmX1, trlmX2,trlmX2) trlmX, valmX = trlmX[:-lmval], trlmX[-lmval:] trlmM, valmM = trlmM[:-lmval], trlmM[-lmval:] trX, trM = transform_recipe_whole(trX1, trX2, trX3)
dataset = args.dataset device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() print("device", device, "n_gpu", n_gpu) log_file = os.path.join(log_dir, '{}.jsonl'.format(dataset)) logger = ResultLogger(path=log_file, **args.__dict__) text_encoder = TextEncoder(args.encoder_path, args.bpe_path) encoder = text_encoder.encoder n_vocab = len(text_encoder.encoder) print("Encoding dataset...") ((trX, trY), (vaX, vaY), (teX, teY)) = encode_dataset( *preprocess_fns[dataset](data_dir, sentence_pair=args.sentence_pair), encoder=text_encoder, skip_preprocess=args.skip_preprocess) encoder['_start_'] = len(encoder) if args.sentence_pair or args.force_delimiter: encoder['_delimiter_'] = len(encoder) encoder['_classify_'] = len(encoder) clf_token = encoder['_classify_'] n_special = 2 + int('_delimiter_' in encoder) if args.sentence_pair: max_len = n_ctx // 2 - 2 else: max_len = n_ctx - n_special if not args.force_max_ctx: if args.sentence_pair: n_ctx = min( sum(
parser.add_argument('--e', type=float, default=1e-8)
args = parser.parse_args()
print(args)
# Promote parsed CLI args to module-level globals.
globals().update(args.__dict__)
random.seed(seed)
np.random.seed(seed)
tf.set_random_seed(seed)
#tf.random.set_seed(seed)
logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)),
                      **args.__dict__)
text_encoder = TextEncoder(encoder_path, bpe_path)
encoder = text_encoder.encoder
n_vocab = len(text_encoder.encoder)
(trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), \
    (teX1, teX2, teX3) = encode_dataset(rocstories(data_dir),
                                        encoder=text_encoder)
n_y = 2
# Special tokens appended after the base vocabulary.
encoder['_start_'] = len(encoder)
encoder['_delimiter_'] = len(encoder)
encoder['_classify_'] = len(encoder)
clf_token = encoder['_classify_']
n_special = 3
# Per-segment budget: half the context minus start/delimiter tokens.
max_len = n_ctx//2-2
# Shrink n_ctx to the longest (context + longer ending) pair, +3 specials.
n_ctx = min(max(
    [len(x1[:max_len])+max(len(x2[:max_len]), len(x3[:max_len]))
     for x1, x2, x3 in zip(trX1, trX2, trX3)]
    + [len(x1[:max_len])+max(len(x2[:max_len]), len(x3[:max_len]))
       for x1, x2, x3 in zip(vaX1, vaX2, vaX3)]
    + [len(x1[:max_len])+max(len(x2[:max_len]), len(x3[:max_len]))
       for x1, x2, x3 in zip(teX1, teX2, teX3)])+3, n_ctx)
trX, trM = transform_roc(trX1, trX2, trX3)
vaX, vaM = transform_roc(vaX1, vaX2, vaX3)
# Test transforms only needed when producing a submission.
if submit:
    teX, teM = transform_roc(teX1, teX2, teX3)
n_train = len(trY)
n_valid = len(vaY)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() print("device", device, "n_gpu", n_gpu) # logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)), **args.__dict__) logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc))) text_encoder = TextEncoder(encoder_path, bpe_path) encoder = text_encoder.encoder n_vocab = len(text_encoder.encoder) print("Encoding dataset...") ((trX, trY), (vaX, vaY), _) = encode_dataset(*imdb(data_dir, n_train=100, n_valid=1000), encoder=text_encoder) encoder['_start_'] = len(encoder) encoder['_delimiter_'] = len(encoder) encoder['_classify_'] = len(encoder) clf_token = encoder['_classify_'] n_special = 3 max_len = n_ctx - 2 vocab = n_vocab + n_special + n_ctx def transform_imdb(X): n_batch = len(X) xmb = np.zeros((n_batch, n_ctx, 2), dtype=np.int32) mmb = np.zeros((n_batch, n_ctx), dtype=np.float32) start = encoder['_start_']
encoder = text_encoder.encoder
n_vocab = len(text_encoder.encoder)
# Register special tokens after the base vocabulary.
encoder['_start_'] = len(encoder)
encoder['_delimiter_'] = len(encoder)
encoder['_classify_'] = len(encoder)
clf_token = encoder['_classify_']
n_special = 3
# Per-segment budget: half the context minus start/delimiter tokens.
max_len = n_ctx // 2 - 2
if dataset == 'rocstories':
    (trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), \
        (teX1, teX2, teX3) = encode_dataset(
            rocstories(data_dir, n_valid=n_valid), encoder=text_encoder)
    n_y = 2
    # Shrink n_ctx to the longest (context + longer ending) pair, +3 specials.
    n_ctx = min(
        max([
            len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
            for x1, x2, x3 in zip(trX1, trX2, trX3)
        ] + [
            len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
            for x1, x2, x3 in zip(vaX1, vaX2, vaX3)
        ] + [
            len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
            for x1, x2, x3 in zip(teX1, teX2, teX3)
        ]) + 3, n_ctx)
    vocab = n_vocab + n_special + n_ctx
    trX, trM = transform_roc(trX1, trX2, trX3)
# torch.device object used throughout this script TODO add gpu setting device = torch.device("cuda" if torch.cuda.is_available() else "cpu") logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)), **args.__dict__) text_encoder = TextEncoder(args.encoder_path, args.bpe_path) encoder = text_encoder.encoder n_vocab = len(text_encoder.encoder) print("Encoding dataset...") (trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), (teX1, teX2, teX3) = encode_dataset(rocstories(data_dir, n_valid=args.n_valid), encoder=text_encoder) n_y = 2 encoder['_start_'] = len(encoder) encoder['_delimiter_'] = len(encoder) encoder['_classify_'] = len(encoder) clf_token = encoder['_classify_'] n_special = 3 max_len = n_ctx // 2 - 2 n_ctx = min( max([ len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(trX1, trX2, trX3) ] + [ len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(vaX1, vaX2, vaX3)
val_gold.append(int(ins['gold'])) #train_context[0] = '_delimiter_2' train_context[0] = 'Abercrombie' print(train_context[0]) print('\n') print(train_op1[0]) print('\n') a = (train_context, train_op1, train_op2, train_op3, train_op4, train_gold), (val_context, val_op1, val_op2, val_op3, val_op4, val_gold) print("Encoding dataset...") ((trX1, trX2, trX3, trX4, trX5, trY), (vaX1, vaX2, vaX3, vaX4, vaX5, vaY)) = encode_dataset(*a, encoder=text_encoder) print(trX1[0]) print('\n') print(trX2[0]) print('\n') clf_token = encoder['_classify_'] n_special = 5 max_len = 510 n_ctx = min( max([ max(len(x2[:max_len]), len(x3[:max_len]), len(x4[:max_len]), len(x5[:max_len])) for x1, x2, x3, x4, x5 in zip(trX1, trX2, trX3, trX4, trX5) ] + [
    desc: None,
}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print("device", device, "n_gpu", n_gpu)
logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)),
                      **args.__dict__)
text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
encoder = text_encoder.encoder
n_vocab = len(text_encoder.encoder)
print("Encoding dataset...")
# Test split carries no labels, hence the 1-tuple (teX, ).
((trX, trY), (vaX, vaY),
 (teX, )) = encode_dataset(*multiclas(data_dir, n_valid=args.n_valid),
                           encoder=text_encoder)
# Special tokens appended after the base vocabulary.
encoder['_start_'] = len(encoder)
encoder['_classify_'] = len(encoder)
clf_token = encoder['_classify_']
n_special = 2
# NOTE(review): max_len uses the *initial* n_ctx; n_ctx is shrunk below.
max_len = n_ctx - 2
# Define maximum context as the minimum of [512, x] where x is the max sentence length
n_ctx = min(
    max([len(x[:max_len]) for x in trX] + [len(x[:max_len]) for x in vaX] +
        [len(x[:max_len]) for x in teX]) + 3, n_ctx)
vocab = n_vocab + n_special + n_ctx
trX, trM = transform_sst2(trX)
elif X1[i][j] == '[[/HL]]': hl = 0 else: X += [X1[i][j]] if hl == 1: H += [hl1t] else: H += [hl2t] H1 += [H] X1n += [X] return X1n, H1 (trX1, trX2, trX3, trX4, trX5, trX6, trX7, trX8, trX9, trY), (vaX1, vaX2, vaX3, vaX4, vaX5, vaX6, vaX7, vaX8, vaX9, vaY), (teX1, teX2, teX3, teX4, teX5, teX6, teX7, teX8, teX9) = encode_dataset(race(data_dir), encoder=text_encoder) encoder['_start_'] = len(encoder) encoder['_delimiter_'] = len(encoder) encoder['_classify_'] = len(encoder) encoder['_hl1_'] = len(encoder) encoder['_hl2_'] = len(encoder) clf_token = encoder['_classify_'] trX1, trH1 = get_hl(trX1, encoder['_hl1_'], encoder['_hl2_']) vaX1, vaH1 = get_hl(vaX1, encoder['_hl1_'], encoder['_hl2_']) teX1, teH1 = get_hl(teX1, encoder['_hl1_'], encoder['_hl2_']) trX7, trH7 = get_hl(trX7, encoder['_hl1_'], encoder['_hl2_']) vaX7, vaH7 = get_hl(vaX7, encoder['_hl1_'], encoder['_hl2_']) teX7, teH7 = get_hl(teX7, encoder['_hl1_'], encoder['_hl2_']) trX8, trH8 = get_hl(trX8, encoder['_hl1_'], encoder['_hl2_'])
elif X1[i][j] == '[[/SQ]]': sq = 0 else: X += [X1[i][j]] if sq == 1: S += [s1t] else: S += [s2t] S1 += [S] X1n += [X] return X1n, S1 (trX1, trX2, trX3, trX4, trX5, trY), (vaX1, vaX2, vaX3, vaX4, vaX5, vaY), (teX1, teX2, teX3, teX4, teX5) = encode_dataset(dream(data_dir), encoder=text_encoder) encoder['_start_'] = len(encoder) encoder['_delimiter_'] = len(encoder) encoder['_classify_'] = len(encoder) encoder['_speaker1_'] = len(encoder) encoder['_speaker2_'] = len(encoder) clf_token = encoder['_classify_'] trX1, trS1 = get_speaker(trX1, encoder['_speaker1_'], encoder['_speaker2_']) vaX1, vaS1 = get_speaker(vaX1, encoder['_speaker1_'], encoder['_speaker2_']) teX1, teS1 = get_speaker(teX1, encoder['_speaker1_'], encoder['_speaker2_']) n_special = len(encoder)
data_dir = args.data_dir
log_dir = args.log_dir
submission_dir = args.submission_dir
topic = args.topic
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print("device", device, "n_gpu", n_gpu)
logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)),
                      **args.__dict__)
text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
encoder = text_encoder.encoder
n_vocab = len(text_encoder.encoder)
print("Encoding dataset...")
# Test split carries no labels, hence the 1-tuple (teX, ).
((trX, trY), (vaX, vaY),
 (teX, )) = encode_dataset(*stance(data_dir, topic=topic),
                           encoder=text_encoder)
# Special tokens appended after the base vocabulary.
encoder['_start_'] = len(encoder)
encoder['_classify_'] = len(encoder)
clf_token = encoder['_classify_']
n_special = 2
# NOTE(review): max_len uses the *initial* n_ctx; n_ctx is shrunk below.
max_len = n_ctx - 2
# Define maximum context as the minimum of [512, x] where x is the max sentence length
n_ctx = min(max(
    [len(x[:max_len]) for x in trX]
    + [len(x[:max_len]) for x in vaX]
    + [len(x[:max_len]) for x in teX]
) + 3, n_ctx)
vocab = n_vocab + n_special + n_ctx
trX, trM = transform_stance(trX)
def encode(self, text_sequence, max_length):
    """Encode ``text_sequence`` with this instance's tokenizer.

    Thin wrapper around the module-level ``encode_dataset`` helper; sequences
    are tokenized and padded/truncated to ``max_length``.
    """
    tokenizer = self.tokenizer
    encoded = encode_dataset(tokenizer, text_sequence, max_length)
    return encoded
args = parser.parse_args() print(args) globals().update(args.__dict__) random.seed(seed) np.random.seed(seed) tf.set_random_seed(seed) logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)), **args.__dict__) text_encoder = TextEncoder(encoder_path, bpe_path) encoder = text_encoder.encoder n_vocab = len(text_encoder.encoder) (trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), (teX1, teX2, teX3) = encode_dataset(rocstories(data_dir), encoder=text_encoder) n_y = 2 encoder['_start_'] = len(encoder) encoder['_delimiter_'] = len(encoder) encoder['_classify_'] = len(encoder) clf_token = encoder['_classify_'] n_special = 3 max_len = n_ctx // 2 - 2 n_ctx = min( max([ len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(trX1, trX2, trX3) ] + [ len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(vaX1, vaX2, vaX3) ] + [
print("device", device, "n_gpu", n_gpu) # Initialize the logger: logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)), **args.__dict__) # Initialize the text encoder with the vocabulary and encoder file: text_encoder = TextEncoder(encoder_path, bpe_path) encoder = text_encoder.encoder n_vocab = len(text_encoder.encoder) # Encode the different datasets using the text encoders: print("Encoding dataset...") ((trX, trY), (vaX, vaY), (teX, )) = encode_dataset(*stance(data_dir, train_file, test_file), encoder=text_encoder) encoder['_start_'] = len(encoder) encoder['_classify_'] = len(encoder) clf_token = encoder['_classify_'] n_special = 2 max_len = n_ctx - 2 # Define maximum context as the minimum of [512, x] where x is the max sentence length: n_ctx = min( max([len(x[:max_len]) for x in trX] + [len(x[:max_len]) for x in vaX] + [len(x[:max_len]) for x in teX]) + 3, n_ctx) # Apply word embedding on the training and validation datasets: vocab = n_vocab + n_special + n_ctx
# --- Script-level setup for the "ruoyao" sentence-pair classification task. ---
print(args)
# Promote every CLI argument to a module-level global (legacy style, kept as-is).
globals().update(args.__dict__)
# Seed all RNGs for reproducibility.
random.seed(seed)
np.random.seed(seed)
tf.set_random_seed(seed)
logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)), **args.__dict__)
text_encoder = TextEncoder(encoder_path, bpe_path)
encoder = text_encoder.encoder
n_vocab = len(text_encoder.encoder)
# Encode train/valid/test splits; unlike the rocstories variant, this task
# keeps test labels (teY) as well.
(trX1,trX2,trY), (vaX1, vaX2, vaY), (teX1, teX2, teY) = encode_dataset(ruoyao(data_dir), encoder=text_encoder)
n_y = 2  # binary classification
# Register the special control tokens at the end of the vocabulary.
encoder['_start_'] = len(encoder)
encoder['_delimiter_'] = len(encoder)
encoder['_classify_'] = len(encoder)
clf_token = encoder['_classify_']
n_special = 3  # number of special tokens registered above
# Each example packs two truncated sentences plus the 3 special tokens into one context.
max_len = n_ctx//2-2
n_ctx = min(max([len(x1[:max_len])+len(x2[:max_len]) for x1, x2 in zip(trX1, trX2)]+[len(x1[:max_len])+len(x2[:max_len]) for x1, x2 in zip(vaX1, vaX2)]+[len(x1[:max_len])+len(x2[:max_len]) for x1, x2 in zip(teX1, teX2)])+3, n_ctx)
# Build the padded model inputs and masks for each split.
trX, trM = transform_ruoyao(trX1, trX2)
vaX, vaM = transform_ruoyao(vaX1, vaX2)
if submit:
    # The test inputs are only materialized when a submission is requested.
    teX, teM = transform_ruoyao(teX1, teX2)
n_train = len(trY)
n_valid = len(vaY)
def train(self):
    """Build the multi-GPU TF-1.x training graph for the ATEC paraphrase task
    and run the training loop inside a MonitoredTrainingSession.

    Relies heavily on module-level globals (n_ctx, n_gpu, lr, b1, b2, e,
    lm_coef, max_grad_norm, pre_load, lm_dir, data_dir, save_dir, n_iter,
    iter_data, shuffle, f1_score, ...) and on instance state
    (self.n_batch_train, self.model, self.mgpu_predict, self.text_encoder,
    self.transform_roc). Side effects: sets self.params, self.eval_* tensors
    and self.n_updates_total, and writes checkpoints/summaries to save_dir.
    """
    global_step = tf.train.get_or_create_global_step()
    # Fixed-batch placeholders for training, unsized ones for evaluation.
    # Input shape: [batch, 2 siblings, n_ctx positions, 2 (token id, position id)]
    # — NOTE(review): inferred from the transform_roc naming; confirm layout.
    X_train = tf.placeholder(tf.int32, [self.n_batch_train, 2, n_ctx, 2])
    M_train = tf.placeholder(tf.float32, [self.n_batch_train, 2, n_ctx])
    X = tf.placeholder(tf.int32, [None, 2, n_ctx, 2])
    M = tf.placeholder(tf.float32, [None, 2, n_ctx])
    Y_train = tf.placeholder(tf.int32, [self.n_batch_train])
    Y = tf.placeholder(tf.int32, [None])
    xs = [X_train, M_train, Y_train]
    gpu_ops = []
    gpu_grads = []
    # Split each training tensor along the batch axis, one shard per GPU.
    xs = (tf.split(x, n_gpu, 0) for x in xs)
    optimizer = tf.train.AdamOptimizer(learning_rate=lr, beta1=b1, beta2=b2, epsilon=e)
    for i, xs in enumerate(zip(*xs)):
        # Reuse variables on every tower after the first one.
        do_reuse = True if i > 0 else None
        with tf.device(assign_to_gpu(i, "/gpu:0")), tf.variable_scope(
                tf.get_variable_scope(), reuse=do_reuse):
            logits, clf_losses, lm_losses = self.model(*xs, train=True, reuse=do_reuse)
            # Optionally mix in the language-modeling auxiliary loss.
            if lm_coef > 0:
                train_loss = tf.reduce_mean(clf_losses) + lm_coef * tf.reduce_mean(lm_losses)
            else:
                train_loss = tf.reduce_mean(clf_losses)
            raw_grads_and_vars = optimizer.compute_gradients(train_loss)
            # Each gradient is clipped by its own norm (a one-element "global"
            # norm), not by the joint norm across all variables.
            grads_and_vars = [(tf.clip_by_global_norm([gv[0]], max_grad_norm)[0][0], gv[1])
                              for gv in raw_grads_and_vars]
            gpu_grads.append(grads_and_vars)
            gpu_ops.append([logits, clf_losses, lm_losses])
    # Re-assemble per-tower outputs into full-batch tensors.
    ops = [tf.concat(op, 0) for op in zip(*gpu_ops)]
    logits, clf_losses, lm_losses = ops
    grads = average_grads(gpu_grads)
    train_op = optimizer.apply_gradients(grads, global_step=global_step)
    clf_loss = tf.reduce_mean(clf_losses)
    saver = tf.train.Saver(max_to_keep=5)
    self.params = find_trainable_variables('model_lm')
    if pre_load:
        # Ops that copy pretrained LM weights into the graph variables.
        restore_op = [p.assign(ip) for p, ip in zip(
            self.params, joblib.load(lm_dir + '/model_lm.params'))]
    # Evaluation heads: multi-GPU path for full batches, single path otherwise.
    self.eval_mgpu_logits, self.eval_mgpu_clf_losses, self.eval_mgpu_lm_losses = self.mgpu_predict(
        X_train, M_train, Y_train)
    self.eval_logits, self.eval_clf_losses, self.eval_lm_losses = self.model(
        X, M, Y, train=False, reuse=True)
    self.eval_clf_loss = tf.reduce_mean(self.eval_clf_losses)
    self.eval_mgpu_clf_loss = tf.reduce_mean(self.eval_mgpu_clf_losses)
    summary_op = tf.get_collection(tf.GraphKeys.SUMMARIES)

    def trva_split(data, index):
        # Select the rows of `data` at the given index positions.
        return [data[i] for i in index]

    # Encode the ATEC sentence pairs and split into train/valid by a fixed,
    # precomputed validation index.
    x1, x2, y = encode_dataset(self.text_encoder, atec(data_dir))
    valid_index = np.load('data/valid_index.npy')
    if data_dir == 'data/para.tsv':
        # This file apparently stores 4 stacked copies/quarters of the data;
        # replicate the validation index into each quarter.
        valid_index = np.concatenate([
            valid_index, valid_index + len(y) // 4,
            valid_index + len(y) // 2, valid_index + 3 * len(y) // 4])
    valid_index = valid_index.tolist()
    # Training indices = everything not in the validation set (symmetric diff).
    train_index = list(set(valid_index) ^ set(range(len(y))))
    trX1, trX2, trY = trva_split(x1, train_index), trva_split(
        x2, train_index), trva_split(y, train_index)
    vaX1, vaX2, vaY = trva_split(x1, valid_index), trva_split(
        x2, valid_index), trva_split(y, valid_index)
    trX, trM = self.transform_roc(trX1, trX2)
    vaX, vaM = self.transform_roc(vaX1, vaX2)
    n_train = len(trY)
    n_valid = len(vaY)
    self.n_updates_total = (n_train // self.n_batch_train) * n_iter

    def log():
        # Evaluate loss/F1 on a training subsample and the validation set;
        # reads `sess`, `n_epochs`, `n_updates` from the enclosing scope at
        # call time (only called after they are bound).
        def iter_apply(Xs, Ms, Ys):
            # Returns [concatenated logits, summed loss], batch-weighted.
            fns = [lambda x: np.concatenate(x, 0), lambda x: float(np.sum(x))]
            results = []
            for xmb, mmb, ymb in iter_data((Xs, Ms, Ys), n_batch=self.n_batch_train,
                                           truncate=False, verbose=True):
                n = len(xmb)
                if n == self.n_batch_train:
                    # Full batch: use the multi-GPU evaluation path.
                    res = sess.run([self.eval_mgpu_logits, self.eval_mgpu_clf_loss],
                                   {X_train: xmb, M_train: mmb, Y_train: ymb})
                else:
                    # Ragged final batch: use the variable-size single path.
                    res = sess.run([self.eval_logits, self.eval_clf_loss],
                                   {X: xmb, M: mmb, Y: ymb})
                # Weight by batch size so the later sum/len gives a true mean.
                res = [r * n for r in res]
                results.append(res)
            results = zip(*results)
            return [fn(res) for res, fn in zip(results, fns)]

        tr_logits, tr_cost = iter_apply(trX[:n_valid], trM[:n_valid], trY[:n_valid])
        va_logits, va_cost = iter_apply(vaX, vaM, vaY)
        tr_cost = tr_cost / len(trY[:n_valid])
        va_cost = va_cost / n_valid
        tr_f1 = f1_score(trY[:n_valid], np.argmax(tr_logits, 1)) * 100.
        va_f1 = f1_score(vaY, np.argmax(va_logits, 1)) * 100.
        tf.logging.info(
            '%d %d %.3f %.3f %.2f %.2f'
            % (n_epochs, n_updates, tr_cost, va_cost, tr_f1, va_f1))

    # Session scaffolding: periodic console logging, summaries and checkpoints.
    scaffold = tf.train.Scaffold(saver=saver)
    log_hook = tf.train.LoggingTensorHook(
        {'step': global_step, 'train_loss': clf_loss}, every_n_iter=100)
    summary_hook = tf.train.SummarySaverHook(save_steps=100, output_dir=save_dir,
                                             summary_op=summary_op)
    hooks = [summary_hook, log_hook]
    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True  # don't grab all GPU memory up front
    n_epochs = 0
    with tf.train.MonitoredTrainingSession(hooks=hooks, save_checkpoint_secs=600,
                                           checkpoint_dir=save_dir, scaffold=scaffold,
                                           config=tf_config) as sess:
        if pre_load:
            sess.run(restore_op)  # load pretrained LM weights
        for i in range(n_iter):
            # One epoch: iterate shuffled full batches (truncate=True drops the ragged tail).
            for xmb, mmb, ymb in iter_data(
                    (shuffle(trX, trM, trY, random_state=np.random)),
                    n_batch=self.n_batch_train, truncate=True, verbose=True):
                cost, _, n_updates = sess.run(
                    [clf_loss, train_op, global_step],
                    {X_train: xmb, M_train: mmb, Y_train: ymb})
                if n_updates % 100 == 0:
                    log()
            n_epochs += 1
        log()