def init():
    # Seed both CPU and GPU RNGs; fall back to a random seed when none is given.
    if args.seed:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)
    else:
        seed = np.random.randint(2**31)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)  # was args.seed, which is None on this branch
    torch.backends.cudnn.benchmark = True

    batch_size = args.batch_size
    use_chars = args.char_dim > 0
    data = Data_preprocessor()
    data = data.preprocess(data_dir=args.data_dir,
                           max_example=args.max_example,
                           no_training_set=False,
                           use_chars=use_chars)

    training_batch_loader = DataLoader(data.training, batch_size, shuffle=True)
    validation_batch_loader = DataLoader(data.validation, batch_size, shuffle=False)
    testing_batch_loader = DataLoader(data.testing, batch_size, shuffle=False)

    print("loading word2vec file")
    embed_path = os.path.join(args.data_dir, args.embed_file)
    embed_init, embed_dim = load_word2vec_embeddings(data.dictionary[0], embed_path)
    print("Embedding dimension: {}".format(embed_dim))

    model = GAReader(args.n_layers, data.vocab_size, data.n_chars, args.drop_out,
                     args.gru_size, embed_init, embed_dim, args.train_emb,
                     args.char_dim, args.use_feat, args.gating_fn)
    model.cuda()
    optimizer = torch.optim.Adam(
        params=filter(lambda p: p.requires_grad, model.parameters()),
        lr=args.init_learning_rate)
    criterion = nn.CrossEntropyLoss().cuda()
    return (model, optimizer, criterion, training_batch_loader,
            validation_batch_loader, testing_batch_loader)
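# A minimal usage sketch for init() above. It assumes the GAReader forward pass
# accepts the unpacked batch tensors and returns candidate logits; the batch
# layout (answers last) is hypothetical, not taken from the source.
def train_one_epoch_sketch():
    (model, optimizer, criterion, train_loader,
     valid_loader, test_loader) = init()
    model.train()
    for batch in train_loader:
        *inputs, answers = batch          # hypothetical batch layout
        logits = model(*inputs)           # candidate scores per example
        loss = criterion(logits, answers)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()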
def test(args):
    use_chars = args.char_dim > 0
    # load data
    dp = data_preprocessor()
    data = dp.preprocess(question_dir=args.data_dir,
                         no_training_set=True,
                         max_example=args.max_example,
                         use_chars=use_chars)
    idx_to_word = dict([(v, k) for (k, v) in data.dictionary[0].items()])
    # build minibatch loader (note: this evaluates on the validation split)
    test_batch_loader = minibatch_loader(data.validation, args.batch_size,
                                         shuffle=False)
    with tf.device('/device:GPU:0'):
        model = GAReader(args.n_layers, data.vocab_size, data.n_chars,
                         args.gru_size, 100, args.train_emb, args.char_dim,
                         args.use_feat, args.gating_fn, save_attn=True)
    with tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                          allow_soft_placement=True)) as sess:
        model.restore(sess, args.save_dir, args.ckpt)
        logging.info('-' * 50)
        logging.info("Start testing...")
        test_writer = tf.summary.FileWriter('logs/test', sess.graph)
        model.validate(sess, test_batch_loader, write_results=True)
def main(save_path, params):
    nhidden = params['nhidden']
    dropout = params['dropout']
    word2vec = params['word2vec']
    dataset = params['dataset']
    nlayers = params['nlayers']
    train_emb = params['train_emb']
    char_dim = params['char_dim']
    use_feat = params['use_feat']
    gating_fn = params['gating_fn']
    ent_setup = params['ent_setup']  # ent, ent-anonym, no-ent
    data_path = params['data_path']
    # save settings
    shutil.copyfile('config.py', '%s/config.py' % save_path)

    use_chars = char_dim > 0
    if dataset == "clicr":
        dp = DataPreprocessor.DataPreprocessorClicr()
        data = dp.preprocess(data_path,
                             ent_setup=ent_setup,
                             no_training_set=False,
                             use_chars=use_chars)
    elif dataset == "clicr_novice":
        dp = DataPreprocessor.DataPreprocessorNovice()
        data = dp.preprocess(data_path,
                             ent_setup=ent_setup,
                             no_training_set=False,
                             use_chars=use_chars)
    else:
        dp = DataPreprocessor.DataPreprocessor()
        data = dp.preprocess(data_path, no_training_set=False, use_chars=use_chars)

    print("building minibatch loaders ...")
    batch_loader_train = MiniBatchLoader.MiniBatchLoader(data.training, BATCH_SIZE, sample=1.0)
    batch_loader_val = MiniBatchLoader.MiniBatchLoader(data.validation, BATCH_SIZE)

    print("building network ...")
    W_init, embed_dim = Helpers.load_word2vec_embeddings(data.dictionary[0], word2vec)
    m = GAReader.Model(nlayers, data.vocab_size, data.num_chars, W_init,
                       nhidden, embed_dim, dropout, train_emb, char_dim,
                       use_feat, gating_fn)

    print("training ...")
    num_iter = 0
    max_acc = 0.
    deltas = []
    logger = open(save_path + '/log', 'a')

    if os.path.isfile('%s/best_model.p' % save_path):
        print('loading previously saved model')
        m.load_model('%s/best_model.p' % save_path)
    else:
        print('saving init model')
        m.save_model('%s/model_init.p' % save_path)
        print('loading init model')
        m.load_model('%s/model_init.p' % save_path)

    for epoch in range(NUM_EPOCHS):
        estart = time.time()
        new_max = False

        for dw, dt, qw, qt, a, m_dw, m_qw, tt, tm, c, m_c, cl, fnames in batch_loader_train:
            loss, tr_acc, probs = m.train(dw, dt, qw, qt, c, a, m_dw, m_qw, tt, tm, m_c, cl)
            message = "Epoch %d TRAIN loss=%.4e acc=%.4f elapsed=%.1f" % (
                epoch, loss, tr_acc, time.time() - estart)
            print(message)
            logger.write(message + '\n')

            num_iter += 1
            if num_iter % VALIDATION_FREQ == 0:
                total_loss, total_acc, n, n_cand = 0., 0., 0, 0.
                for dw, dt, qw, qt, a, m_dw, m_qw, tt, tm, c, m_c, cl, fnames in batch_loader_val:
                    outs = m.validate(dw, dt, qw, qt, c, a, m_dw, m_qw, tt, tm, m_c, cl)
                    loss, acc, probs = outs[:3]
                    bsize = dw.shape[0]
                    total_loss += bsize * loss
                    total_acc += bsize * acc
                    n += bsize

                val_acc = total_acc / n
                if val_acc > max_acc:
                    max_acc = val_acc
                    m.save_model('%s/best_model.p' % save_path)
                    new_max = True
                message = "Epoch %d VAL loss=%.4e acc=%.4f max_acc=%.4f" % (
                    epoch, total_loss / n, val_acc, max_acc)
                print(message)
                logger.write(message + '\n')

        message = "After Epoch %d: Train acc=%.4f, Val acc=%.4f" % (epoch, tr_acc, val_acc)
        print(message)
        logger.write(message + '\n')

        # learning schedule
        if epoch >= 2:
            m.anneal()
        # stopping criterion: stop once an epoch passes with no new best validation accuracy
        if not new_max:
            break
    logger.close()
def train(args):
    use_chars = args.char_dim > 0
    # load data
    dp = data_preprocessor()
    data = dp.preprocess(question_dir=args.data_dir,
                         no_training_set=False,
                         max_example=args.max_example,
                         use_chars=use_chars)
    # build minibatch loaders
    train_batch_loader = minibatch_loader(data.training, args.batch_size, sample=1.0)
    valid_batch_loader = minibatch_loader(data.validation, args.batch_size, shuffle=False)
    test_batch_loader = minibatch_loader(data.test, args.batch_size, shuffle=False)

    if not args.resume:
        logging.info("loading word2vec file ...")
        embed_init, embed_dim = load_word2vec_embeddings(data.dictionary[0], args.embed_file)
        logging.info("embedding dim: {}".format(embed_dim))
        logging.info("initialize model ...")
        model = GAReader(args.n_layers, data.vocab_size, data.n_chars,
                         args.gru_size, embed_dim, args.train_emb,
                         args.char_dim, args.use_feat, args.gating_fn)
        model.build_graph(args.grad_clip, embed_init)
        init = tf.global_variables_initializer()
        saver = tf.train.Saver(tf.global_variables())
    else:
        model = GAReader(args.n_layers, data.vocab_size, data.n_chars,
                         args.gru_size, 100, args.train_emb,
                         args.char_dim, args.use_feat, args.gating_fn)

    with tf.Session() as sess:
        # training phase
        if not args.resume:
            sess.run(init)
            if args.init_test:
                logging.info('-' * 50)
                logging.info("Initial test ...")
                best_loss, best_acc = model.validate(sess, valid_batch_loader)
            else:
                best_acc = 0.
        else:
            model.restore(sess, args.save_dir)
            saver = tf.train.Saver(tf.global_variables())
            best_acc = 0.  # was missing: without this, the resume path hits a NameError below

        logging.info('-' * 50)
        lr = args.init_learning_rate
        logging.info("Start training ...")
        for epoch in range(args.n_epoch):
            start = time.time()
            it = loss = acc = n_example = 0
            if epoch >= 2:
                lr /= 2
            for dw, dt, qw, qt, a, m_dw, m_qw, tt, \
                    tm, c, m_c, cl, fnames in train_batch_loader:
                loss_, acc_ = model.train(sess, dw, dt, qw, qt, a,
                                          m_dw, m_qw, tt, tm, c, m_c, cl,
                                          fnames, args.drop_out, lr)
                loss += loss_
                acc += acc_
                it += 1
                n_example += dw.shape[0]
                if it % args.print_every == 0 or \
                        it % len(train_batch_loader) == 0:
                    spend = (time.time() - start) / 60
                    statement = "Epoch: {}, it: {} (max: {}), ".format(
                        epoch, it, len(train_batch_loader))
                    statement += "loss: {:.3f}, acc: {:.3f}, ".format(
                        loss / args.print_every, acc / n_example)
                    statement += "time: {:.1f}(m)".format(spend)
                    logging.info(statement)
                    loss = acc = n_example = 0
                    start = time.time()
                # save model
                if it % args.eval_every == 0 or \
                        it % len(train_batch_loader) == 0:
                    valid_loss, valid_acc = model.validate(sess, valid_batch_loader)
                    if valid_acc >= best_acc:
                        best_acc = valid_acc  # was missing: track the new best before logging
                        logging.info("Best valid acc: {}".format(best_acc))
                        model.save(sess, saver, args.save_dir)
                    start = time.time()
        # test model
        logging.info("Final test ...")
        model.validate(sess, test_batch_loader)
def main(save_path, params):
    nhidden = params['nhidden']
    dropout = params['dropout']
    word2vec = params['word2vec']
    dataset = params['dataset']
    nlayers = params['nlayers']
    train_emb = params['train_emb']
    char_dim = params['char_dim']
    use_feat = params['use_feat']
    gating_fn = params['gating_fn']
    out = 'out'
    # save settings
    shutil.copyfile('config.py', '%s/config.py' % save_path)

    use_chars = char_dim > 0
    dp = DataPreprocessor.DataPreprocessor()
    data = dp.preprocess(dataset, no_training_set=False, use_chars=use_chars)
    word_dictionary = data.dictionary[0]
    the_index = word_dictionary['the']
    idx_to_word = dict([(v, k) for (k, v) in word_dictionary.items()])  # was iteritems (Python 2)
    words = [idx_to_word[i] for i in sorted(idx_to_word.keys())]

    print("building minibatch loaders ...")
    batch_loader_train = MiniBatchLoader.MiniBatchLoader(data.training, BATCH_SIZE, sample=1.0)
    batch_loader_val = MiniBatchLoader.MiniBatchLoader(data.validation, BATCH_SIZE)

    print("building network ...")
    W_init, embed_dim = Helpers.load_word2vec_embeddings(data.dictionary[0], word2vec)

    print("running GAReader ...")
    m = GAReader.Model(nlayers, data.vocab_size, data.num_chars, W_init,
                       nhidden, embed_dim, dropout, train_emb, char_dim,
                       use_feat, gating_fn, words).build_network()
    m.compile(optimizer=tf.keras.optimizers.Adam(lr=LEARNING_RATE, clipnorm=GRAD_CLIP),
              loss=tf.keras.losses.categorical_crossentropy,
              metrics=[tf.keras.metrics.categorical_accuracy])

    with tf.Graph().as_default():
        with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            K.set_session(sess)
            tensorboard = TensorBoardCustom(log_dir="logs", words=words)
            modelcheckpoint = tf.keras.callbacks.ModelCheckpoint(
                'output/weights.{epoch:02d}-{val_loss:.2f}.hdf5')
            writer = tf.summary.FileWriter("logs")

            def schedule(epoch, lr):
                # halve the learning rate from the fourth epoch onwards
                if epoch >= 3:
                    return lr * 0.5
                return lr

            lrate = LearningRateScheduler(schedule, verbose=1)

            for epoch in range(NUM_EPOCHS):  # was xrange (Python 2)
                for (inputs, a) in batch_loader_train:
                    [dw, qw, m_dw, m_qw, c, m_c, cl] = inputs
                    # NOTE: the original rebuilt and recompiled the model here on
                    # every batch, resetting its weights each step; the model
                    # built once above is reused instead.
                    train_summary = m.train_on_batch(
                        inputs, to_categorical(a, batch_loader_train.max_num_cand))
                    print(m.get_weights()[0])
                    print('epoch: {}, train loss: {}, train acc: {}'.format(
                        epoch, train_summary[0], train_summary[1]))

            lr = tf.summary.scalar('learning_rate', LEARNING_RATE)
            summary = tf.summary.merge_all()
            s = sess.run(summary)
            writer.add_summary(s)
            writer.close()
def main(load_path, params, mode='test'):
    nhidden = params['nhidden']
    dropout = params['dropout']
    word2vec = params['word2vec']
    dataset = params['dataset']
    nlayers = params['nlayers']
    train_emb = params['train_emb']
    char_dim = params['char_dim']
    use_feat = params['use_feat']
    gating_fn = params['gating_fn']
    ent_setup = params['ent_setup']
    data_path = params['data_path']
    # save settings
    shutil.copyfile('config.py', '%s/config_test.py' % load_path)

    use_chars = char_dim > 0
    if dataset == "clicr":
        dp = DataPreprocessor.DataPreprocessorClicr()
        data = dp.preprocess(data_path, ent_setup=ent_setup, no_training_set=True)
    elif dataset == "clicr_novice":
        dp = DataPreprocessor.DataPreprocessorNovice()
        data = dp.preprocess(data_path, ent_setup=ent_setup, no_training_set=True)
    else:
        dp = DataPreprocessor.DataPreprocessor()
        data = dp.preprocess(data_path, no_training_set=True)
    inv_vocab = data.inv_dictionary

    assert os.path.exists(params["test_file"] if mode == "test" else params["validation_file"])

    print("building minibatch loaders ...")
    if mode == 'test':
        batch_loader_test = MiniBatchLoader.MiniBatchLoader(data.test, BATCH_SIZE)
    else:
        batch_loader_test = MiniBatchLoader.MiniBatchLoader(data.validation, BATCH_SIZE)
    f_to_cand = {i[-1]: i[3] for i in batch_loader_test.questions}

    print("building network ...")
    W_init, embed_dim = Helpers.load_word2vec_embeddings(data.dictionary[0], word2vec)
    m = GAReader.Model(nlayers, data.vocab_size, data.num_chars, W_init,
                       nhidden, embed_dim, dropout, train_emb, char_dim,
                       use_feat, gating_fn, save_attn=False)

    print("model load path")
    print('%s/best_model.p' % load_path)
    m.load_model('%s/best_model.p' % load_path)

    print("testing ...")
    pr = np.zeros((len(batch_loader_test.questions),
                   batch_loader_test.max_num_cand)).astype('float32')
    fids, attns = [], []
    pred_ans = {}
    total_loss, total_acc, n = 0., 0., 0

    for dw, dt, qw, qt, a, m_dw, m_qw, tt, tm, c, m_c, cl, fnames in batch_loader_test:
        outs = m.validate(dw, dt, qw, qt, c, a, m_dw, m_qw, tt, tm, m_c, cl)
        loss, acc, probs = outs[:3]
        attns += [[fnames[0], probs[0, :]] + [o[0, :, :] for o in outs[3:]]]  # store one attention
        for f in range(len(fnames)):
            pred_cand = probs[f].argmax()
            pred_a_ids = f_to_cand[fnames[f]][pred_cand]
            pred_a = " ".join([inv_vocab[i] for i in pred_a_ids])
            if ent_setup == "ent-anonym" and (dataset == "clicr" or dataset == "clicr_novice"):
                relabeling_dicts = data.test_relabeling_dicts if mode == 'test' \
                    else data.val_relabeling_dicts
                pred_a = relabeling_dicts[fnames[f]][pred_a]
            pred_ans[fnames[f]] = pred_a
        bsize = dw.shape[0]
        total_loss += bsize * loss
        total_acc += bsize * acc
        pr[n:n + bsize, :] = probs
        fids += fnames
        n += bsize

    if (params["dataset"] == "clicr" or params["dataset"] == "clicr_plain"
            or params["dataset"] == "clicr_novice") \
            and (mode == 'test' or mode == 'validation'):
        print("writing predictions")
        preds_data = utils.to_output_preds(pred_ans)
        preds_filepath = load_path + '/{}.preds'.format(mode)
        utils.write_preds(preds_data, file_name=preds_filepath)
        utils.external_eval(preds_filepath, preds_filepath + ".scores",
                            params["test_file"] if mode == "test" else params["validation_file"],
                            extended=True)

    logger = open(load_path + '/log.test', 'a')
    message = '%s Loss %.4e acc=%.4f' % (mode.upper(), total_loss / n, total_acc / n)
    print(message)
    logger.write(message + '\n')
    logger.close()

    np.save('%s/%s.probs' % (load_path, mode), np.asarray(pr))
    pickle.dump(attns, open('%s/%s.attns' % (load_path, mode), 'wb'))
    f = open('%s/%s.ids' % (load_path, mode), 'w')
    for item in fids:
        f.write(item + '\n')
    f.close()
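# A minimal sketch for loading the artifacts written above (file names taken
# from the code; the shape of each attns entry follows the "store one
# attention" list built in the evaluation loop).
import pickle
import numpy as np

def load_eval_artifacts(load_path, mode='test'):
    probs = np.load('%s/%s.probs.npy' % (load_path, mode))  # np.save appends .npy
    with open('%s/%s.attns' % (load_path, mode), 'rb') as fh:
        attns = pickle.load(fh)
    with open('%s/%s.ids' % (load_path, mode)) as fh:
        fids = [line.strip() for line in fh]
    return probs, attns, fids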
def train(args):
    use_chars = args.char_dim > 0
    # load data
    dp = data_preprocessor()
    data = dp.preprocess(question_dir=args.data_dir,
                         no_training_set=False,
                         max_example=args.max_example,
                         use_chars=use_chars)
    idx_to_word = dict([(v, k) for (k, v) in data.dictionary[0].items()])
    # build minibatch loaders
    train_batch_loader = minibatch_loader(data.training, args.batch_size, sample=1.0)
    valid_batch_loader = minibatch_loader(data.validation, args.batch_size, shuffle=False)
    test_batch_loader = minibatch_loader(data.test, args.batch_size, shuffle=False)

    with tf.device('/device:GPU:0'):
        if not args.resume:
            logging.info("loading word2vec file ...")
            embed_init, embed_dim = load_word2vec_embeddings(data.dictionary[0], args.embed_file)
            logging.info("embedding dim: {}".format(embed_dim))
            logging.info("initialize model ...")
            model = GAReader(args.n_layers, data.vocab_size, data.n_chars,
                             args.gru_size, embed_dim, args.train_emb,
                             args.char_dim, args.use_feat, args.gating_fn, True)
            model.build_graph(args.grad_clip, embed_init)
            init = tf.global_variables_initializer()
            saver = tf.train.Saver(tf.global_variables())
        else:
            model = GAReader(args.n_layers, data.vocab_size, data.n_chars,
                             args.gru_size, 100, args.train_emb,
                             args.char_dim, args.use_feat, args.gating_fn, True)

    with tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                          allow_soft_placement=True)) as sess:
        # training phase
        if not args.resume:
            step = 0
            sess.run(init)
        else:
            # recover the global step from the checkpoint name
            step = int(re.search('step_([0-9]+?)-(.*?)', args.ckpt).group(1))
            model.restore(sess, args.save_dir, args.ckpt)
            saver = tf.train.Saver(tf.global_variables())
        if args.init_test:
            logging.info('-' * 50)
            logging.info("Initial test ...")
            best_loss, best_acc = model.validate(sess, valid_batch_loader)
        else:
            best_acc = 0.
        logging.info('-' * 50)
        logging.info("Start training ...")
        train_writer = tf.summary.FileWriter('logs/train', sess.graph)
        while step < args.n_epoch * len(train_batch_loader):
            epoch = int(math.floor(step / len(train_batch_loader)))
            start = time.time()
            it = loss = acc = n_example = 0
            lr = args.init_learning_rate
            if epoch >= 2:
                lr = args.init_learning_rate / 2**(epoch - 1)
            for dw, dt, qw, qt, a, m_dw, m_qw, tt, \
                    tm, c, m_c, cl, fnames in train_batch_loader:
                step += 1
                # NOTE: tf.summary.text/scalar calls inside the loop add new ops
                # to the graph on every iteration, which slowly bloats it; in
                # graph-mode TF1 these ops should be created once, outside the loop.
                tf.summary.text('doc', tf.constant(get_text(idx_to_word, dw[0], m_dw[0])))
                if step % 1000 == 0:
                    logging.info('running train step with summary..')
                    loss_, acc_, summary = model.train(sess, dw, dt, qw, qt, a,
                                                       m_dw, m_qw, tt, tm, c, m_c, cl,
                                                       fnames, args.drop_out, lr, True)
                    train_writer.add_summary(summary, step)
                else:
                    loss_, acc_ = model.train(sess, dw, dt, qw, qt, a,
                                              m_dw, m_qw, tt, tm, c, m_c, cl,
                                              fnames, args.drop_out, lr)
                loss += loss_
                acc += acc_
                it += 1
                n_example += dw.shape[0]
                tf.summary.scalar('train_loss', tf.constant(loss_))
                tf.summary.scalar('train_accuracy', tf.constant(acc_))
                if step % args.print_every == 0 or \
                        it % len(train_batch_loader) == 0:
                    spend = (time.time() - start) / 60
                    statement = "Epoch: {}, it: {} (max: {}), ".format(
                        epoch, it, len(train_batch_loader))
                    statement += "loss: {:.3f}, acc: {:.3f}, ".format(
                        loss / args.print_every, acc / n_example)
                    statement += "time: {:.1f}(m)".format(spend)
                    logging.info(statement)
                    loss = acc = n_example = 0
                    start = time.time()
                # save model
                if step % args.eval_every == 0 or \
                        it % len(train_batch_loader) == 0:
                    valid_loss, valid_acc = model.validate(sess, valid_batch_loader)
                    tf.summary.scalar('val_loss', tf.constant(valid_loss))
                    tf.summary.scalar('val_accuracy', tf.constant(valid_acc))
                    if valid_acc >= best_acc:
                        best_loss = valid_loss
                        best_acc = valid_acc
                        logging.info("Best valid acc: {}".format(best_acc))
                        model.save(sess, saver, args.save_dir, step, valid_acc, valid_loss)
                    start = time.time()
        train_writer.close()
        # test model
        logging.info("Final test ...")
        model.validate(sess, test_batch_loader, write_results=True)
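# A sketch of the standard TF1 idiom for the in-loop summaries noted above:
# create placeholder-backed summary ops once, then feed values each step
# instead of adding new graph ops per iteration (names here are illustrative,
# not from the source).
loss_ph = tf.placeholder(tf.float32, shape=(), name='loss_ph')
acc_ph = tf.placeholder(tf.float32, shape=(), name='acc_ph')
train_summaries = tf.summary.merge([
    tf.summary.scalar('train_loss', loss_ph),
    tf.summary.scalar('train_accuracy', acc_ph),
])
# inside the training loop:
#   s = sess.run(train_summaries, feed_dict={loss_ph: loss_, acc_ph: acc_})
#   train_writer.add_summary(s, step)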
# NOTE: make sure vocab.txt is already there!
data = dp.preprocess(DATASET, no_training_set=True)
inv_vocab = data.inv_dictionary

print("building minibatch loaders ...")
if 'CANDIDATE_SUBSET' not in locals():
    CANDIDATE_SUBSET = False
if dataset == 'validation':
    batch_loader_test = MiniBatchLoader.MiniBatchLoader(
        data.validation, 128, shuffle=False, candidate_subset=CANDIDATE_SUBSET)
elif dataset == 'test':
    batch_loader_test = MiniBatchLoader.MiniBatchLoader(
        data.test, 128, shuffle=False, candidate_subset=CANDIDATE_SUBSET)

print("building network ...")
m = GAReader.Model(K, data.vocab_size)

print("loading model from file...")
m.load_model(model_path)

print("predicting ...")
fid = open(output_path, 'w')  # was open(..., 'w', 0); unbuffered text mode is Python 2 only
pr = []
gt = []
for d, q, a, m_d, m_q, c, m_c, fnames in batch_loader_test:
    loss, acc, probs = m.validate(d, q, a, m_d, m_q, m_c)
    # indices of the top_K highest-scoring candidates per example
    probs_sorted = np.argpartition(-probs, top_K - 1)[:, :top_K]
    predicted = map(lambda x: ' '.join(map(lambda i: inv_vocab[i], x)),
                    probs_sorted)  # the original was truncated here; probs_sorted is the natural argument
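# A small illustration of the argpartition call above: it returns the indices
# of the top_K largest probabilities per row, but in no guaranteed order
# within those K (sort afterwards if a ranked list is needed).
import numpy as np
probs = np.array([[0.1, 0.5, 0.3, 0.2]])
idx = np.argpartition(-probs, 1)[:, :2]  # -> indices {1, 2}, order not guaranteed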
def main(load_path, params, mode='test'):
    nhidden = params['nhidden']
    dropout = params['dropout']
    word2vec = params['word2vec']
    dataset = params['dataset']
    nlayers = params['nlayers']
    train_emb = params['train_emb']
    char_dim = params['char_dim']
    use_feat = params['use_feat']
    gating_fn = params['gating_fn']

    dp = DataPreprocessor.DataPreprocessor()
    data = dp.preprocess(dataset, no_training_set=True)
    inv_vocab = data.inv_dictionary

    print("building minibatch loaders ...")
    if mode == 'test':
        batch_loader_test = MiniBatchLoader.MiniBatchLoader(data.test, BATCH_SIZE)
    else:
        batch_loader_test = MiniBatchLoader.MiniBatchLoader(data.validation, BATCH_SIZE)

    print("building network ...")
    W_init, embed_dim = Helpers.load_word2vec_embeddings(data.dictionary[0], word2vec)
    m = GAReader.Model(nlayers, data.vocab_size, data.num_chars, W_init,
                       nhidden, embed_dim, dropout, train_emb, char_dim,
                       use_feat, gating_fn, save_attn=True)
    m.load_model('%s/best_model.p' % load_path)

    print("testing ...")
    pr = np.zeros((len(batch_loader_test.questions),
                   batch_loader_test.max_num_cand)).astype('float32')
    fids, attns = [], []
    total_loss, total_acc, n = 0., 0., 0
    for dw, dt, qw, qt, a, m_dw, m_qw, tt, tm, c, m_c, cl, fnames in batch_loader_test:
        outs = m.validate(dw, dt, qw, qt, c, a, m_dw, m_qw, tt, tm, m_c, cl)
        loss, acc, probs = outs[:3]
        attns += [[fnames[0], probs[0, :]] + [o[0, :, :] for o in outs[3:]]]  # store one attention
        bsize = dw.shape[0]
        total_loss += bsize * loss
        total_acc += bsize * acc
        pr[n:n + bsize, :] = probs
        fids += fnames
        n += bsize

    logger = open(load_path + '/log', 'a')  # was open(..., 'a', 0); unbuffered text mode is Python 2 only
    message = '%s Loss %.4e acc=%.4f' % (mode.upper(), total_loss / n, total_acc / n)
    print(message)  # was Python 2 `print message`
    logger.write(message + '\n')
    logger.close()

    np.save('%s/%s.probs' % (load_path, mode), np.asarray(pr))
    pkl.dump(attns, open('%s/%s.attns' % (load_path, mode), 'wb'))  # binary mode for pickle
    f = open('%s/%s.ids' % (load_path, mode), 'w')
    for item in fids:
        f.write(item + '\n')
    f.close()