def main():
    vocab = data.Vocabulary()
    data.build_vocab(vocab, config.vector_file)  # build vocabulary
    # classifier = models.Attentionclassifier(vocab_size=vocab.n_words,
    #                                         emb_dim=config.DIM,
    #                                         hidden_size=config.HIDDEN_SIZE,
    #                                         num_layer=config.NUM_LAYER,
    #                                         dropout=config.drop_out,
    #                                         bidirectional=config.bidirectional,
    #                                         label_size=config.label_class,
    #                                         use_pretrain=True,
    #                                         embed_matrix=vocab.vector,
    #                                         embed_freeze=False).to(config.device)
    classifier = models.FinetuneModel1(vocab_size=vocab.n_words,
                                       emb_dim=config.DIM,
                                       hidden_size=config.HIDDEN_SIZE,
                                       num_layer=config.NUM_LAYER,
                                       dropout=config.drop_out,
                                       bidirectional=config.bidirectional,
                                       label_size=config.label_class,
                                       hidden_size1=128,
                                       use_pretrain=True,
                                       embed_matrix=vocab.vector,
                                       embed_freeze=False).to(config.device)

    model_dict = classifier.state_dict()
    pretrained_model = torch.load(config.model_path)
    # keep only the pretrained weights whose keys exist in the current model
    pretrained_dict = dict()
    for k, v in pretrained_model.items():
        if k == 'state_dict':
            for kk, vv in v.items():
                if kk in model_dict:
                    pretrained_dict[kk] = vv
    # update the existing model_dict
    model_dict.update(pretrained_dict)
    # load the merged state dict into the classifier
    classifier.load_state_dict(model_dict)
    # classifier.eval()

    test_data = data.Sentiment(config.predict_file, vocab)
    test_dataloader = DataLoader(test_data,
                                 batch_size=config.TRAIN_BATCH_SIZE,
                                 shuffle=True,
                                 collate_fn=data.collate_fn)
    predict(classifier, test_dataloader, config.silent)
def main(unused_argv): if len(unused_argv) != 1: # prints a message if you've entered flags incorrectly raise Exception('Problem with flags: %s' % unused_argv) # choose what level of logging you want tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) # user_rating has the elements in the following order # user_id, item_id, rating, time, num_words, review user_rating, user_id_to_idx, item_id_to_idx = read_file(FLAGS.data_path) num_users = len(user_id_to_idx) num_items = len(item_id_to_idx) num_reviews = len(user_rating) print('Number of total users / items / reviews: %d / %d / %d' % (num_users, num_items, num_reviews)) users_ratings = [ur for ur in user_rating] train_ratings, test_ratings, valid_ratings = split_data(users_ratings) # build vocabulary id_to_word, word_to_id = build_vocab(users_ratings, FLAGS.vocab_size) train_item_doc = token_to_id(train_ratings, word_to_id) valid_item_doc = token_to_id(valid_ratings, word_to_id) current_datetime = datetime.now() subfolder_timestamp = datetime.strftime(current_datetime, '%Y%m%d-%H%M%S') subfolder_dataname = os.path.basename(FLAGS.data_path) log_folder = os.path.join(FLAGS.log_root, subfolder_dataname + '-' + subfolder_timestamp) # save vocab to output folder pathlib.Path(log_folder).mkdir(parents=True, exist_ok=True) with open(os.path.join(log_folder, 'vocab.csv'), 'w') as f: for idx, token in id_to_word.items(): f.write('%s,%s\n' % (idx, token)) # Try offset model offset_model = offsetModel(train_ratings, valid_ratings, test_ratings) offset_model.train() # Make a namedtuple hps, containing the values of the hyperparameters that the model needs hparam_list = ['init_stddev', 'emb_dim', 'min_kappa', 'max_kappa', 'vocab_size', 'mu', 'max_iter_steps', 'num_iter_steps', 'threshold'] hps_dict = {} for key,val in FLAGS.flag_values_dict().items(): # for each flag if key in hparam_list: # if it's in the list hps_dict[key] = val # add it to the dict hps = namedtuple('HParams', hps_dict.keys())(**hps_dict) hft_model = HFTModel(hps, train_ratings, valid_ratings, test_ratings, train_item_doc, valid_item_doc, num_users, num_items, num_reviews, log_folder) hft_model.build_graph() hft_model.train()
def main(args): print "main" """ SEED """ np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) """ DATA """ train, val, test = get_nli_hypoth(args.train_lbls_file, args.train_src_file, args.val_lbls_file, \ args.val_src_file, args.test_lbls_file, args.test_src_file, \ args.max_train_sents, args.max_val_sents, args.max_test_sents) word_vecs = build_vocab( train['hypoths'] + val['hypoths'] + test['hypoths'], args.embdfile, args.lorelei_embds) args.word_emb_dim = len(word_vecs[word_vecs.keys()[0]]) lbls_file = args.train_lbls_file global IDX2LBL if "mpe" in lbls_file or "snli" in lbls_file or "multinli" in lbls_file or "sick" in lbls_file or "joci" in lbls_file: IDX2LBL = {0: 'entailment', 1: 'neutral', 2: 'contradiction'} elif "spr" in lbls_file or "dpr" in lbls_file or "fnplus" in lbls_file or "add_one" in lbls_file: IDX2LBL = {0: 'entailed', 1: 'not-entailed'} elif "scitail" in lbls_file: IDX2LBL = {0: 'entailment', 1: 'neutral'} nli_net = torch.load(args.model) print(nli_net) # loss weight = torch.FloatTensor(args.n_classes).fill_(1) loss_fn = nn.CrossEntropyLoss(weight=weight) loss_fn.size_average = False if args.gpu_id > -1: nli_net.cuda() loss_fn.cuda() """ Train model on Natural Language Inference task """ epoch = 1 for pair in [(train, 'train'), (val, 'val'), (test, 'test')]: #args.batch_size = len(pair[0]['lbls']) eval_acc = evaluate( 0, pair[0], args, word_vecs, nli_net, pair[1], "%s/%s_%s" % (args.outputdir, pair[1], args.pred_file))
def main(args): print "main" """ SEED """ np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) """ DATA """ train, valid, test = get_nli(args.nlipath) word_vec = build_vocab( train['s1'] + train['s2'] + valid['s1'] + valid['s2'] + test['s1'] + test['s2'], GLOVE_PATH) for split in ['s1', 's2']: for data_type in ['train', 'valid', 'test']: eval(data_type)[split] = np.array( [['<s>'] + [word for word in sent.split() if word in word_vec] + ['</s>'] for sent in eval(data_type)[split]]) args.word_emb_dim = 300 nli_net = torch.load(args.model) print(nli_net) # loss weight = torch.FloatTensor(args.n_classes).fill_(1) loss_fn = nn.CrossEntropyLoss(weight=weight) loss_fn.size_average = False if args.gpu_id > -1: nli_net.cuda() loss_fn.cuda() """ Train model on Natural Language Inference task """ epoch = 1 for pair in [(train, 'train'), (valid, 'dev'), (test, 'test')]: #args.batch_size = len(pair[0]['lbls']) eval_acc = evaluate_preds( 0, pair[0], args, word_vec, nli_net, pair[1], "%s/%s_%s" % (args.outputdir, pair[1], args.pred_file)) print "Accuracy on " + pair[1] + ": " + str(eval_acc)
def get_vocab(args): # build a vocabulary from all train,dev,test set of the actual snli plus the test set of the # all the transfer tasks. train, valid, test = {}, {}, {} for split in ['test', 'valid', 'train']: for s in ['s1', 's2']: eval(split)[s] = [] for datapath, n_classes in [ (args.test_path, args.data_to_n_classes[args.test_data]), (args.train_path, args.data_to_n_classes[args.train_data]) ]: transfer_train, transfer_valid, transfer_test = get_nli( datapath, n_classes) for split in ['test', 'valid', 'train']: for s in ['s1', 's2']: eval(split)[s].extend(eval("transfer_" + split)[s]) word_vec = build_vocab( train['s1'] + train['s2'] + valid['s1'] + valid['s2'] + test['s1'] + test['s2'], args.embdfile) return word_vec
def main(args): """ SEED """ np.random.seed(args.seed) torch.manual_seed(args.seed) if args.gpu_id > -1: torch.cuda.manual_seed(args.seed) """ DATA """ train, valid, test = get_nli(args.nlipath, args.n_classes) word_vecs = build_vocab(train['s1'] + train['s2'] + valid['s1'] + valid['s2'] + test['s1'] + test['s2'], args.embdfile) for split in ['s1', 's2']: for data_type in ['train', 'valid', 'test']: eval(data_type)[split] = np.array([['<s>'] + [word for word in sent.split() if word in word_vecs] + ['</s>'] for sent in eval(data_type)[split]]) args.word_emb_dim = len(word_vecs[list(word_vecs.keys())[0]]) nli_model_configs = get_model_configs(args, len(word_vecs)) nli_model_configs["n_classes"] = args.n_classes # define premise and hypoth encoders premise_encoder = eval(nli_model_configs['encoder_type'])(nli_model_configs) hypoth_encoder = eval(nli_model_configs['encoder_type'])(nli_model_configs) shared_nli_net = SharedNLINet(nli_model_configs, premise_encoder, hypoth_encoder) shared_hypoth_net = SharedHypothNet(nli_model_configs, hypoth_encoder) print(shared_nli_net) print(shared_hypoth_net) if args.pre_trained_model: print( "Pre_trained_model: " + args.pre_trained_model) pre_trained_model = torch.load(args.pre_trained_model) shared_nli_net_params = shared_nli_net.state_dict() pre_trained_params = pre_trained_model.state_dict() assert shared_nli_net_params.keys() == pre_trained_params.keys(), "load model has different parameter state names that NLI_HYPOTHS_NET" for key, parameters in shared_nli_net_params.items(): if parameters.size() == pre_trained_params[key].size(): shared_nli_net_params[key] = pre_trained_params[key] shared_nli_net.load_state_dict(shared_nli_net_params) print(shared_nli_net) if args.pre_trained_adv_model: print( "Pre_trained_adv_model: " + args.pre_trained_adv_model) pre_trained_model = torch.load(args.pre_trained_adv_model) shared_hypoth_net_params = shared_hypoth_net.state_dict() pre_trained_params = pre_trained_model.state_dict() assert shared_hypoth_net_params.keys() == pre_trained_params.keys(), "load model has different parameter state names that NLI_HYPOTHS_NET" for key, parameters in nli_hypoth_params.items(): if parameters.size() == pre_trained_params[key].size(): shared_hypoth_net_params[key] = pre_trained_params[key] shared_hypoth_net.load_state_dict(shared_hypoth_net_params) print(shared_hypoth_net) # nli loss weight = torch.FloatTensor(args.n_classes).fill_(1) loss_fn_nli = nn.CrossEntropyLoss(weight=weight) loss_fn_nli.size_average = False # hypoth (adversarial) loss weight = torch.FloatTensor(args.n_classes).fill_(1) loss_fn_hypoth = nn.CrossEntropyLoss(weight=weight) loss_fn_hypoth.size_average = False # optimizer optim_fn, optim_params = get_optimizer(args.optimizer) optimizer_nli = optim_fn(shared_nli_net.parameters(), **optim_params) #optimizer_hypoth = optim_fn(shared_hypoth_net.parameters(), **optim_params) # only pass hypoth classifier params to avoid updating shared encoder params twice optimizer_hypoth = optim_fn(shared_hypoth_net.classifier.parameters(), **optim_params) if args.gpu_id > -1: shared_nli_net.cuda() shared_hypoth_net.cuda() loss_fn_nli.cuda() loss_fn_hypoth.cuda() """ TRAIN """ global val_acc_best, lr, stop_training, adam_stop val_acc_best = -1e10 adam_stop = False stop_training = False lr = optim_params['lr'] if 'sgd' in args.optimizer else None """ Train model on Natural Language Inference task """ epoch = 1 while not stop_training and epoch <= args.n_epochs: train_acc_nli, train_acc_hypoth, shared_nli_net, shared_hypoth_net = 
trainepoch(epoch, train, optimizer_nli, optimizer_hypoth, args, word_vecs, shared_nli_net, shared_hypoth_net, loss_fn_nli, loss_fn_hypoth, args.adv_lambda, args.adv_hyp_encoder_lambda) eval_acc_nli, eval_acc_hypoth = evaluate(epoch, valid, optimizer_nli, optimizer_hypoth, args, word_vecs, shared_nli_net, shared_hypoth_net, 'valid', adv_lambda=args.adv_lambda) epoch += 1
metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories)) print("Confusion Matrix...") cm = metrics.confusion_matrix(y_test_cls, y_pred_cls) print(cm) time_dif = get_time_dif(start_time) print("Time Usage: ", time_dif) if __name__ == '__main__': if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']: raise ValueError("""Usage: python run_lstmPooling.py [train/test]""") print('Configuring self Attention Model...') config = Config() if not os.path.exists(vocab_dir): build_vocab(train_dir, vocab_dir, config.vocab_size) categories, cat2id = read_category() words, word2id = read_vocab(vocab_dir) config.vocab_size = len(words) model = SelfAttentionModel(config) if sys.argv[1] == 'train': train() else: test()
print(params) """ SEED """ np.random.seed(params.seed) torch.manual_seed(params.seed) torch.cuda.manual_seed(params.seed) """ DATA """ train, valid, test = get_nli(params.nlipath) word_vec = build_vocab(train['s1'] + train['s2'] + valid['s1'] + valid['s2'] + test['s1'] + test['s2'], params.word_emb_path) for split in ['s1', 's2']: for data_type in ['train', 'valid', 'test']: eval(data_type)[split] = np.array([['<s>'] + [word for word in sent.split() if word in word_vec] + ['</s>'] for sent in eval(data_type)[split]]) """ MODEL """ # model config config_nli_model = { 'n_words' : len(word_vec) ,
def main(args): print "main" """ SEED """ np.random.seed(args.seed) torch.manual_seed(args.seed) if args.gpu_id > -1: torch.cuda.manual_seed(args.seed) """ DATA """ train, val, test = get_nli_hypoth(args.train_lbls_file, args.train_src_file, args.val_lbls_file, \ args.val_src_file, args.test_lbls_file, args.test_src_file, \ args.max_train_sents, args.max_val_sents, args.max_test_sents, args.remove_dup) word_vecs = build_vocab( train['hypoths'] + val['hypoths'] + test['hypoths'], args.embdfile, args.lorelei_embds) args.word_emb_dim = len(word_vecs[word_vecs.keys()[0]]) nli_model_configs = get_model_configs(args, len(word_vecs)) lbls_file = args.train_lbls_file if "mpe" in lbls_file or "snli" in lbls_file or "multinli" in lbls_file or "sick" in lbls_file or "joci" in lbls_file: nli_model_configs["n_classes"] = 3 elif "spr" in lbls_file or "dpr" in lbls_file or "fnplus" in lbls_file or "add_one" in lbls_file or "scitail" in lbls_file: nli_model_configs["n_classes"] = 2 nli_net = NLI_HYPOTHS_Net(nli_model_configs) print(nli_net) # loss weight = torch.FloatTensor(args.n_classes).fill_(1) loss_fn = nn.CrossEntropyLoss(weight=weight) loss_fn.size_average = False # optimizer optim_fn, optim_params = get_optimizer(args.optimizer) optimizer = optim_fn(nli_net.parameters(), **optim_params) if args.gpu_id > -1: nli_net.cuda() loss_fn.cuda() """ TRAIN """ global val_acc_best, lr, stop_training, adam_stop val_acc_best = -1e10 adam_stop = False stop_training = False lr = optim_params['lr'] if 'sgd' in args.optimizer else None """ Train model on Natural Language Inference task """ epoch = 1 while not stop_training and epoch <= args.n_epochs: train_acc, nli_net = trainepoch(epoch, train, optimizer, args, word_vecs, nli_net, loss_fn) eval_acc = evaluate(epoch, val, optimizer, args, word_vecs, nli_net, 'valid') epoch += 1
def pretrain(): # Parse command line arguments argparser = argparse.ArgumentParser() # train argparser.add_argument('--mode', '-m', choices=('pretrain', 'adversarial', 'inference'), type=str, required=True) argparser.add_argument('--batch_size', '-b', type=int, default=168) argparser.add_argument('--num_epoch', '-e', type=int, default=10) argparser.add_argument('--print_every', type=int, default=100) argparser.add_argument('--use_cuda', default=True) argparser.add_argument('--g_learning_rate', '-glr', type=float, default=0.001) argparser.add_argument('--d_learning_rate', '-dlr', type=float, default=0.001) # resume argparser.add_argument('--resume', action='store_true', dest='resume') argparser.add_argument('--resume_dir', type=str) argparser.add_argument('--resume_epoch', type=int) # save argparser.add_argument('--exp_dir', type=str, required=True) # model argparser.add_argument('--emb_dim', type=int, default=128) argparser.add_argument('--hidden_dim', type=int, default=256) argparser.add_argument('--dropout_rate', '-drop', type=float, default=0.5) argparser.add_argument('--n_layers', type=int, default=1) argparser.add_argument('--response_max_len', type=int, default=15) # data argparser.add_argument('--train_query_file', '-tqf', type=str, required=True) argparser.add_argument('--train_response_file', '-trf', type=str, required=True) argparser.add_argument('--valid_query_file', '-vqf', type=str, required=True) argparser.add_argument('--valid_response_file', '-vrf', type=str, required=True) argparser.add_argument('--vocab_file', '-vf', type=str, default='') argparser.add_argument('--max_vocab_size', '-mv', type=int, default=100000) args = argparser.parse_args() # set up the output directory exp_dirname = os.path.join(args.exp_dir, args.mode, time.strftime("%Y-%m-%d-%H-%M-%S")) os.makedirs(exp_dirname) # set up the logger tqdm_logging.config(logger, os.path.join(exp_dirname, 'train.log'), mode='w', silent=False, debug=True) if not args.vocab_file: logger.info("no vocabulary file") build_vocab(args.train_query_file, args.train_response_file, seperated=True) sys.exit() else: vocab, rev_vocab = load_vocab(args.vocab_file, max_vocab=args.max_vocab_size) vocab_size = len(vocab) word_embeddings = nn.Embedding(vocab_size, args.emb_dim, padding_idx=SYM_PAD) E = EncoderRNN(vocab_size, args.emb_dim, args.hidden_dim, args.n_layers, args.dropout_rate, bidirectional=True, variable_lengths=True) G = Generator(vocab_size, args.response_max_len, args.emb_dim, 2 * args.hidden_dim, args.n_layers, dropout_p=args.dropout_rate) if args.use_cuda: word_embeddings.cuda() E.cuda() G.cuda() loss_func = nn.NLLLoss(size_average=False) params = list(word_embeddings.parameters()) + list(E.parameters()) + list( G.parameters()) opt = torch.optim.Adam(params, lr=args.g_learning_rate) logger.info('----------------------------------') logger.info('Pre-train a neural conversation model') logger.info('----------------------------------') logger.info('Args:') logger.info(str(args)) logger.info('Vocabulary from ' + args.vocab_file) logger.info('vocabulary size: %d' % vocab_size) logger.info('Loading text data from ' + args.train_query_file + ' and ' + args.train_response_file) # resume training from other experiment if args.resume: assert args.resume_epoch >= 0, 'If resume training, please assign resume_epoch' reload_model(args.resume_dir, args.resume_epoch, word_embeddings, E, G) start_epoch = args.resume_epoch + 1 else: start_epoch = 0 # dump args with open(os.path.join(exp_dirname, 'args.pkl'), 'wb') as f: pickle.dump(args, f) 
for e in range(start_epoch, args.num_epoch):
    logger.info('---------------------training--------------------------')

    train_data_generator = batcher(args.batch_size, args.train_query_file,
                                   args.train_response_file)
    logger.info("Epoch: %d/%d" % (e, args.num_epoch))
    step = 0
    total_loss = 0.0
    total_valid_char = []
    cur_time = time.time()
    while True:
        try:
            post_sentences, response_sentences = next(train_data_generator)
        except StopIteration:
            # save model
            save_model(exp_dirname, e, word_embeddings, E, G)
            # evaluation
            eval(args.valid_query_file, args.valid_response_file, args.batch_size,
                 word_embeddings, E, G, loss_func, args.use_cuda, vocab,
                 args.response_max_len)
            break

        post_ids = [sentence2id(sent, vocab) for sent in post_sentences]
        response_ids = [sentence2id(sent, vocab) for sent in response_sentences]
        posts_var, posts_length = padding_inputs(post_ids, None)
        responses_var, responses_length = padding_inputs(response_ids, args.response_max_len)

        # sort by post length
        posts_length, perms_idx = posts_length.sort(0, descending=True)
        posts_var = posts_var[perms_idx]
        responses_var = responses_var[perms_idx]
        responses_length = responses_length[perms_idx]

        # append EOS to the end of each sentence
        references_var = torch.cat([
            responses_var,
            Variable(torch.zeros(responses_var.size(0), 1).long(), requires_grad=False)
        ], dim=1)
        for idx, length in enumerate(responses_length):
            references_var[idx, length] = SYM_EOS

        # show case
        #for p, r, ref in zip(posts_var.data.numpy()[:10], responses_var.data.numpy()[:10], references_var.data.numpy()[:10]):
        #    print ''.join(id2sentence(p, rev_vocab))
        #    print ''.join(id2sentence(r, rev_vocab))
        #    print ''.join(id2sentence(ref, rev_vocab))
        #    print

        if args.use_cuda:
            posts_var = posts_var.cuda()
            responses_var = responses_var.cuda()
            references_var = references_var.cuda()

        embedded_post = word_embeddings(posts_var)
        embedded_response = word_embeddings(responses_var)
        _, dec_init_state = E(embedded_post, input_lengths=posts_length.numpy())
        log_softmax_outputs = G.supervise(
            embedded_response, dec_init_state, word_embeddings)  # [B, T, vocab_size]

        outputs = log_softmax_outputs.view(-1, vocab_size)
        mask_pos = mask(references_var).view(-1).unsqueeze(-1)
        masked_output = outputs * (mask_pos.expand_as(outputs))
        loss = loss_func(masked_output, references_var.view(-1)) / (posts_var.size(0))

        opt.zero_grad()
        loss.backward()
        opt.step()

        total_loss += loss * (posts_var.size(0))
        total_valid_char.append(mask_pos)

        if step % args.print_every == 0:
            total_loss_val = total_loss.cpu().data.numpy()[0]
            total_valid_char_val = torch.sum(
                torch.cat(total_valid_char, dim=1)).cpu().data.numpy()[0]
            logger.info(
                'Step %5d: (per word) training perplexity %.2f (%.1f iters/sec)' %
                (step, math.exp(total_loss_val / total_valid_char_val),
                 args.print_every / (time.time() - cur_time)))
            total_loss = 0.0
            total_valid_char = []
            total_case_num = 0
            cur_time = time.time()

        step = step + 1
""" np.random.seed(params.seed) torch.manual_seed(params.seed) torch.cuda.manual_seed(params.seed) random.seed(params.seed) """ DATA """ #train, valid, test = get_nli(params.nlipath) train, valid, test = get_MSRP_data() #print(len(valid['s1'][100].split()), len(valid['a1'][100])) word_vec = build_vocab( train['s1'] + train['s2'] + valid['s1'] + valid['s2'] + test['s1'] + test['s2'], params.word_emb_path) ''' for split in ['s1', 's2']: for data_type in ['train', 'valid', 'test']: eval(data_type)[split] = np.array([['<s>'] + [word for word in sent.split() if word in word_vec] + ['</s>'] for sent in eval(data_type)[split]]) ''' for data_type in ['train', 'valid', 'test']: for k in (('s1', 'a1'), ('s2', 'a2')): struct_sent = eval(data_type)[k[0]] struct_action = eval(data_type)[k[1]] for i in range(len(struct_sent)): sent = struct_sent[i].split() action = struct_action[i]
'Share vocab between source and destination')
flags.DEFINE_boolean('showex', True, 'Show generated examples every few epochs')
flags.DEFINE_boolean('sample', False, 'If showing examples, sample?')
flags.DEFINE_boolean('attn', False, 'Use attention')

f2i = {}
v1 = [0]
v2 = [1]
if FLAGS.sharedv is True:
    v1.append(1)
    v2.append(0)

vocab1 = build_vocab(v1, [FLAGS.train, FLAGS.test])
vocab2 = build_vocab(v2, [FLAGS.train, FLAGS.test])

embed1 = Word2VecModel(FLAGS.embed1, vocab1, FLAGS.unif)
print('Loaded word embeddings: ' + FLAGS.embed1)

if FLAGS.embed2 is None:
    print('No embed2 found, using embed1 for both')
    FLAGS.embed2 = FLAGS.embed1
embed2 = Word2VecModel(FLAGS.embed2, vocab2, FLAGS.unif)
print('Loaded word embeddings: ' + FLAGS.embed2)

ts = load_sentences(FLAGS.train, embed1.vocab, embed2.vocab, FLAGS.mxlen)
es = load_sentences(FLAGS.test, embed1.vocab, embed2.vocab, FLAGS.mxlen)
def finetune():
    vocab = data.Vocabulary()
    data.build_vocab(vocab, config.vector_file)  # build vocabulary

    train_data = data.Sentiment(config.finetune_train_file, vocab)
    train_dataloader = DataLoader(train_data,
                                  batch_size=config.TRAIN_BATCH_SIZE,
                                  shuffle=True,
                                  collate_fn=data.collate_fn)
    valid_data = data.Sentiment(config.finetune_valid_file, vocab)
    valid_dataloader = DataLoader(valid_data,
                                  batch_size=config.TRAIN_BATCH_SIZE,
                                  shuffle=True,
                                  collate_fn=data.collate_fn)
    test_data = data.Sentiment(config.finetune_test_file, vocab)
    test_dataloader = DataLoader(test_data,
                                 batch_size=config.TRAIN_BATCH_SIZE,
                                 shuffle=True,
                                 collate_fn=data.collate_fn)

    classifier = models.FinetuneModel1(vocab_size=vocab.n_words,
                                       emb_dim=config.DIM,
                                       hidden_size=config.HIDDEN_SIZE,
                                       num_layer=config.NUM_LAYER,
                                       dropout=config.drop_out,
                                       bidirectional=config.bidirectional,
                                       label_size=config.label_class,
                                       hidden_size1=128,
                                       use_pretrain=True,
                                       embed_matrix=vocab.vector,
                                       embed_freeze=False).to(config.device)

    model_dict = classifier.state_dict()
    pretrained_model = torch.load(config.model_path)
    # drop keys from the pretrained checkpoint that are not in model_dict
    pretrained_dict = dict()
    for k, v in pretrained_model.items():
        if k == 'state_dict':
            for kk, vv in v.items():
                if kk in model_dict:
                    pretrained_dict[kk] = vv
    # update the existing model_dict
    model_dict.update(pretrained_dict)
    # load the merged state dict into the classifier
    classifier.load_state_dict(model_dict)

    # freeze the network parameters so they are not updated
    for param in classifier.parameters():
        param.requires_grad = False
    # allow only the parameters of the last (final) layer to be updated
    for param in classifier.final.parameters():
        param.requires_grad = True

    # new_model = models.FinetuneModel(classifier, hidden_size1=128, class_size=2)
    # print(new_model)
    criterion = nn.NLLLoss()
    # optimizer = torch.optim.Adam(classifier.parameters())
    # optimizer = torch.optim.RMSprop(classifier.parameters(), lr=0.001, alpha=0.9, momentum=0.2)
    optimizer = torch.optim.Adadelta(filter(lambda p: p.requires_grad, classifier.parameters()),
                                     lr=0.01, rho=0.9, eps=1e-06, weight_decay=0)
    # optimizer = torch.optim.RMSprop(classifier.parameters())

    best_f1 = 0
    for epoch in range(config.finetune_epochs):
        # lr update
        # adjust_learning_rate(optimizer, epoch)
        # check whether the learning rate adapts for the different optimizers
        for param_group in optimizer.param_groups:
            print("here lr :{}".format(param_group['lr']))
        logging.info("epoch {0:04d}".format(epoch))
        main.train(train_dataloader, classifier, criterion, optimizer, epoch,
                   config.finetune_batch_size, config.silent)
        test_f1, val_loss = main.test(valid_dataloader, classifier, criterion, epoch,
                                      config.finetune_batch_size, config.silent)
        is_best = test_f1 > best_f1  # True or False
        best_f1 = max(test_f1, best_f1)
        logging.info("best f1 is {}".format(best_f1))
        main.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': classifier.state_dict(),
                'acc': test_f1,
                'best_acc': best_f1,
                'optimizer': optimizer.state_dict(),
            },
            is_best,
            checkpoint='../output/',
            save_file='finetune_model_best.pth.tar')

    predict.predict(classifier, test_dataloader, config.silent)
def main(args): print "main" """ SEED """ np.random.seed(args.seed) torch.manual_seed(args.seed) if args.gpu_id > -1: torch.cuda.manual_seed(args.seed) """ DATA """ train, val, test = get_nli_text(args.train_lbls_file, args.train_src_file, args.val_lbls_file, \ args.val_src_file, args.test_lbls_file, args.test_src_file, \ args.max_train_sents, args.max_val_sents, args.max_test_sents, args.remove_dup) word_vecs = build_vocab( train['hypoths'] + val['hypoths'] + test['hypoths'] + train['premises'] + val['premises'] + test['premises'], args.embdfile, args.lorelei_embds) args.word_emb_dim = len(word_vecs[word_vecs.keys()[0]]) nli_model_configs = get_model_configs(args, len(word_vecs)) lbls_file = args.train_lbls_file if "mpe" in lbls_file or "snli" in lbls_file or "multinli" in lbls_file or "sick" in lbls_file or "joci" in lbls_file or "glue" in lbls_file: nli_model_configs["n_classes"] = 3 elif "spr" in lbls_file or "dpr" in lbls_file or "fnplus" in lbls_file or "add_one" in lbls_file or "scitail" in lbls_file: nli_model_configs["n_classes"] = 2 # define premise and hypoth encoders premise_encoder = eval( nli_model_configs['encoder_type'])(nli_model_configs) hypoth_encoder = eval(nli_model_configs['encoder_type'])(nli_model_configs) shared_nli_net = SharedNLINet(nli_model_configs, premise_encoder, hypoth_encoder) shared_hypoth_net = SharedHypothNet(nli_model_configs, hypoth_encoder) print(shared_nli_net) print(shared_hypoth_net) if args.pre_trained_model: print "Pre_trained_model: " + args.pre_trained_model pre_trained_model = torch.load(args.pre_trained_model) shared_nli_net_params = shared_nli_net.state_dict() pre_trained_params = pre_trained_model.state_dict() assert shared_nli_net_params.keys() == pre_trained_params.keys( ), "load model has different parameter state names that NLI_HYPOTHS_NET" for key, parameters in shared_nli_net_params.items(): if parameters.size() == pre_trained_params[key].size(): shared_nli_net_params[key] = pre_trained_params[key] shared_nli_net.load_state_dict(shared_nli_net_params) print(shared_nli_net) if args.pre_trained_adv_model: print "Pre_trained_adv_model: " + args.pre_trained_adv_model pre_trained_model = torch.load(args.pre_trained_adv_model) shared_hypoth_net_params = shared_hypoth_net.state_dict() pre_trained_params = pre_trained_model.state_dict() assert shared_hypoth_net_params.keys() == pre_trained_params.keys( ), "load model has different parameter state names that NLI_HYPOTHS_NET" for key, parameters in nli_hypoth_params.items(): if parameters.size() == pre_trained_params[key].size(): shared_hypoth_net_params[key] = pre_trained_params[key] shared_hypoth_net.load_state_dict(shared_hypoth_net_params) print(shared_hypoth_net) # nli loss weight = torch.FloatTensor(args.n_classes).fill_(1) loss_fn_nli = nn.CrossEntropyLoss(weight=weight) loss_fn_nli.size_average = False # hypoth (adversarial) loss weight = torch.FloatTensor(args.n_classes).fill_(1) loss_fn_hypoth = nn.CrossEntropyLoss(weight=weight) loss_fn_hypoth.size_average = False # optimizer optim_fn, optim_params = get_optimizer(args.optimizer) optimizer_nli = optim_fn(shared_nli_net.parameters(), **optim_params) #optimizer_hypoth = optim_fn(shared_hypoth_net.parameters(), **optim_params) # only pass hypoth classifier params to avoid updating shared encoder params twice optimizer_hypoth = optim_fn(shared_hypoth_net.classifier.parameters(), **optim_params) if args.gpu_id > -1: shared_nli_net.cuda() shared_hypoth_net.cuda() loss_fn_nli.cuda() loss_fn_hypoth.cuda() """ TRAIN """ global val_acc_best, 
lr, stop_training, adam_stop val_acc_best = -1e10 adam_stop = False stop_training = False lr = optim_params['lr'] if 'sgd' in args.optimizer else None """ Train model on Natural Language Inference task """ epoch = 1 while not stop_training and epoch <= args.n_epochs: train_acc_nli, train_acc_hypoth, shared_nli_net, shared_hypoth_net = trainepoch( epoch, train, optimizer_nli, optimizer_hypoth, args, word_vecs, shared_nli_net, shared_hypoth_net, loss_fn_nli, loss_fn_hypoth, args.adv_lambda, args.adv_hyp_encoder_lambda) eval_acc_nli, eval_acc_hypoth = evaluate(epoch, val, optimizer_nli, optimizer_hypoth, args, word_vecs, shared_nli_net, shared_hypoth_net, 'valid', adv_lambda=args.adv_lambda) epoch += 1
print(params) """ SEED """ np.random.seed(params.seed) torch.manual_seed(params.seed) torch.cuda.manual_seed(params.seed) """ DATA """ train, valid, test = get_nli(params.nlipath) word_vec = build_vocab(train['s1'] + train['s2'] + valid['s1'] + valid['s2'] + test['s1'] + test['s2'], W2V_PATH) for split in ['s1', 's2']: for data_type in ['train', 'valid', 'test']: eval(data_type)[split] = np.array([['<s>'] + [word for word in sent.split() if word in word_vec] + ['</s>'] for sent in eval(data_type)[split]]) params.word_emb_dim = 300 """ MODEL """ # model config
print('Guess: %s' % sent)
print('------------------------------------------------------------------------')

f2i = {}
v1 = [0]
v2 = [1]
if FLAGS.sharedv is True:
    v1.append(1)
    v2.append(0)

vocab1 = build_vocab(v1, {FLAGS.train, FLAGS.test})
vocab2 = build_vocab(v2, {FLAGS.train, FLAGS.test})

embed1 = Word2VecModel(FLAGS.embed1, vocab1, FLAGS.unif)
print('Loaded word embeddings: ' + FLAGS.embed1)

if FLAGS.embed2 is None:
    print('No embed2 found, using embed1 for both')
    FLAGS.embed2 = FLAGS.embed1
embed2 = Word2VecModel(FLAGS.embed2, vocab2, FLAGS.unif)
print('Loaded word embeddings: ' + FLAGS.embed2)

ts = load_sentences(FLAGS.train, embed1.vocab, embed2.vocab, FLAGS.mxlen, FLAGS.batchsz)
language = args.language
model_name = args.model
embed = args.embedding
using_word = args.use_word
# assemble the configuration
config = Config(datasets_path, language, model_name, embed, using_word)

# build the datasets
start_time = time.time()
print("Loading data...")
# load the datasets
train_data, dev_data, test_data = build_dataset(config)
print(len(train_data[1]))
# build the vocabulary
vocab_class = build_vocab(config, train_data)
config.class_num = vocab_class.label_num
# build the pretrained word embeddings
config.embedding_pretrained = construct_embedding(config, vocab_class)
# build the iterators
train_iter, dev_iter, test_iter = build_iterator(config, vocab_class, train_data, dev_data, test_data)
end_time = time.time()
print("Time usage:", end_time - start_time)

# load the model
model = load_model(config)
init_network(model)
print(f'The model has {count_parameters(model):,} trainable parameters')

# train the model
best_valid_loss = float('inf')
for epoch in range(config.epoch_num):
params.outputdir + "/" + params.outputmodelname + "/" + 'commandline_args.txt', 'w') as f: args = parser.parse_args() json.dump(args.__dict__, f, indent=2) """ SEED """ np.random.seed(params.seed) torch.manual_seed(params.seed) torch.cuda.manual_seed(params.seed) """ DATA """ train, valid, test = get_nli(params.dataset_path) word_vec = build_vocab( train['s1'] + train['s2'] + valid['s1'] + valid['s2'] + test['s1'] + test['s2'], params.vector_rep) for split in ['s1', 's2']: for data_type in ['train', 'valid', 'test']: eval(data_type)[split] = np.array( [['<s>'] + [word for word in sent.split() if word in word_vec] + ['</s>'] for sent in eval(data_type)[split]]) params.word_emb_dim = 300 """ MODEL """ # model encoder_types = [
dtype = torch.FloatTensor

# Print Flags
for key, value in vars(FLAGS).items():
    print(key + ' : ' + str(value))

main()

nli_path = nli_DEFAULT
glove_path = glove_DEFAULT
train, dev, test = get_nli(nli_path)
vocab, embeddings = build_vocab(
    train['s1'] + train['s2'] + test['s1'] + test['s2'] + dev['s1'] + dev['s2'],
    glove_path)

config = {
    'n_words': len(embeddings),
    'emb_dim': FLAGS.emb_dim,
    'lstm_dim': FLAGS.lstm_dim,
    'dpout': FLAGS.dpout,
    'fc_dim': FLAGS.fc_dim,
    'b_size': FLAGS.bsize,
    'n_classes': FLAGS.n_classes,
    'model_name': FLAGS.model_name,
}

#append every sentence with <s> in the start and </s> in the end. Also, ignore the words in sentences for which no embedding
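# A minimal sketch of the wrapping step the comment above describes, as done in the
# other get_nli/build_vocab scripts in this collection: bracket each sentence with
# <s> ... </s> and drop words with no embedding. This assumes the `embeddings` dict
# returned by build_vocab above maps word -> vector (an assumption, not confirmed here).
for split in ['s1', 's2']:
    for data_type in ['train', 'dev', 'test']:
        eval(data_type)[split] = np.array(
            [['<s>'] + [word for word in sent.split() if word in embeddings] + ['</s>']
             for sent in eval(data_type)[split]])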
from model import w2v from data import build_vocab from embedding import gen_word2vec as word2vec sess = tf.InteractiveSession() # config config = { 'batch_size' : 16, 'embed_size' : 200, 'neg_sample_size' : 100, } # data data, unique_neg_data, idx2word, word2idx, vocab = build_vocab() name2idx = dict([(name, idx) for idx, name in enumerate(data.keys())]) idx2name = dict([(idx, name) for idx, name in enumerate(data.keys())]) vocab_size = len(idx2word) character_size = len(data) # model (words, counts, words_per_epoch, epoch, words, pos_x, pos_y) = word2vec.skipgram( filename = 'text8', batch_size = 16, window_size = 2, min_count = 5, subsample = 1e-3 )
import torch.optim as optim import pandas as pd from torch.utils.data import DataLoader import numpy as np from torch.autograd import Variable import torch.nn.functional as F if __name__ == "__main__": print("starting...") # prepare data csv_dataset = pd.read_csv(config.file_name, header=None) # csv_file format: dataframe print("data loaded") vocab = data.Vocabulary() data.build_vocab(vocab) # build vocabulary print("build vocab success") train_data = data.sentimentDataset(vocab, csv_dataset, train_size=config.TRAIN_RATIO, test_size=config.TEST_RATIO, train=True) test_data = data.sentimentDataset(vocab, csv_dataset, train=False) train_dataloader = DataLoader(train_data, batch_size=config.TRAIN_BATCH_SIZE, shuffle=True, collate_fn=data.collate_fn) test_dataloader = DataLoader(test_data, batch_size=config.TEST_BATCH_SIZE,
def main(): global args args = parser.parse_args() if args.save is '': args.save = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') save_path = os.path.join(args.results_dir, args.save) if not os.path.exists(save_path): os.makedirs(save_path) setup_logging(os.path.join(save_path, 'log.txt')) checkpoint_file = os.path.join(save_path, 'checkpoint_epoch_%s.pth.tar') logging.debug("run arguments: %s", args) logging.info("using pretrained cnn %s", args.cnn) cnn = resnet.__dict__[args.cnn](pretrained=True) vocab = build_vocab() model = CaptionModel(cnn, vocab, embedding_size=args.embedding_size, rnn_size=args.rnn_size, num_layers=args.num_layers, share_embedding_weights=args.share_weights) train_data = get_iterator(get_coco_data(vocab, train=True), batch_size=args.batch_size, max_length=args.max_length, shuffle=True, num_workers=args.workers) val_data = get_iterator(get_coco_data(vocab, train=False), batch_size=args.eval_batch_size, max_length=args.max_length, shuffle=False, num_workers=args.workers) if 'cuda' in args.type: cudnn.benchmark = True model.cuda() optimizer = select_optimizer( args.optimizer, params=model.parameters(), lr=args.lr) regime = lambda e: {'lr': args.lr * (args.lr_decay ** e), 'momentum': args.momentum, 'weight_decay': args.weight_decay} model.finetune_cnn(False) def forward(model, data, training=True, optimizer=None): use_cuda = 'cuda' in args.type loss = nn.CrossEntropyLoss() perplexity = AverageMeter() batch_time = AverageMeter() data_time = AverageMeter() if training: model.train() else: model.eval() end = time.time() for i, (imgs, (captions, lengths)) in enumerate(data): data_time.update(time.time() - end) if use_cuda: imgs = imgs.cuda() captions = captions.cuda(async=True) imgs = Variable(imgs, volatile=not training) captions = Variable(captions, volatile=not training) input_captions = captions[:-1] target_captions = pack_padded_sequence(captions, lengths)[0] pred, _ = model(imgs, input_captions, lengths) err = loss(pred, target_captions) perplexity.update(math.exp(err.data[0])) if training: optimizer.zero_grad() err.backward() clip_grad_norm(model.rnn.parameters(), args.grad_clip) optimizer.step() # measure elapsed time batch_time.update(time.time() - end) end = time.time() if i % args.print_freq == 0: logging.info('{phase} - Epoch: [{0}][{1}/{2}]\t' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 'Perplexity {perp.val:.4f} ({perp.avg:.4f})'.format( epoch, i, len(data), phase='TRAINING' if training else 'EVALUATING', batch_time=batch_time, data_time=data_time, perp=perplexity)) return perplexity.avg for epoch in range(args.start_epoch, args.epochs): if epoch >= args.finetune_epoch: model.finetune_cnn(True) optimizer = adjust_optimizer( optimizer, epoch, regime) # Train train_perp = forward( model, train_data, training=True, optimizer=optimizer) # Evaluate val_perp = forward(model, val_data, training=False) logging.info('\n Epoch: {0}\t' 'Training Perplexity {train_perp:.4f} \t' 'Validation Perplexity {val_perp:.4f} \n' .format(epoch + 1, train_perp=train_perp, val_perp=val_perp)) model.save_checkpoint(checkpoint_file % (epoch + 1))
args = parser.parse_args() gpu = not args.nogpu if path.exists(args.outdir) is False: print('Creating path: %s' % (args.outdir)) makedirs(args.outdir) f2i = {} v1 = [0] v2 = [1] if args.sharedv is True: v1.append(1) v2.append(0) vocab1 = build_vocab(v1, {args.train, args.test}) vocab2 = build_vocab(v2, {args.train, args.test}) embed1 = Word2VecModel(args.embed1, vocab1, args.unif) print('Loaded word embeddings: ' + args.embed1) if args.embed2 is None: print('No embed2 found, using embed1 for both') args.embed2 = args.embed1 embed2 = Word2VecModel(args.embed2, vocab2, args.unif) print('Loaded word embeddings: ' + args.embed2) ts = load_sentences(args.train, embed1.vocab, embed2.vocab, args.mxlen, args.batchsz, long_0_tensor_alloc)
embed[j, i, :] = word_vec[batch[i][j]] return torch.from_numpy(embed).float(), lengths ''' GLOVE_PATH = '<glove>/<path>' wenda_infersent = torch.load('./glove_modeldir/GloVe.pickle') wenda_infersent.encoder.enc_lstm.flatten_parameters() train, valid, test = get_nli('./<corpus>/<path>') train['s1'] = list(set(train['s1'])) train['s2'] = list(set(train['s2'])) print(len(train['s1'])) word_vec = build_vocab(train['s1'], GLOVE_PATH) for split in ['s1', 's2']: for data_type in ['train']: eval(data_type)[split] = np.array( [[word for word in list(sent) if word in word_vec] for sent in eval(data_type)[split]]) permutation = np.random.permutation(len(train['s1'])) s1 = train['s1'][permutation] #word_vec = build_vocab(s1, GLOVE_PATH) print([''.join(sent) for sent in s1[:50]]) wenda_cod = wenda_infersent.encoder
parser.add_argument('--init_embedding', action='store_true', help='whether init embedding') parser.add_argument('--embedding_source', type=str, default='./', help='pretrained embedding path') args = parser.parse_args() if __name__ == '__main__': train = data.load_data('train.json', args.word_base) test = data.load_data('test.json', args.word_base) # train = data.load_data('train_squad.json', args.word_base) # test = data.load_data('dev_squad.json', args.word_base) vocabulary, pad_lens = data.build_vocab(train, test, args.vocab_size) print('Vocab size: %d | Max context: %d | Max question: %d' % (len(vocabulary), pad_lens[0], pad_lens[1])) train, valid = data.split_exp(train, args.valid_ratio) print('Train: %d | Valid: %d | Test: %d' % (len(train), len(valid), len(test))) train_engine = DataLoader(data.DataEngine(train, vocabulary, pad_lens), batch_size=args.batch_size, shuffle=True, num_workers=8, pin_memory=use_cuda) valid_engine = DataLoader(data.DataEngine(valid, vocabulary, pad_lens), batch_size=args.batch_size, shuffle=True, num_workers=8, pin_memory=use_cuda)
def main():
    best_f1 = 0
    print(config.device)
    vocab = data.Vocabulary()
    data.build_vocab(vocab, config.vector_file)  # build vocabulary

    train_data = data.Sentiment(config.train_file, vocab)
    train_dataloader = DataLoader(train_data,
                                  batch_size=config.TRAIN_BATCH_SIZE,
                                  shuffle=True,
                                  collate_fn=data.collate_fn)
    test_data = data.Sentiment(config.test_file, vocab)
    test_dataloader = DataLoader(test_data,
                                 batch_size=config.TRAIN_BATCH_SIZE,
                                 shuffle=True,
                                 collate_fn=data.collate_fn)

    # classifier = models.RNNClassifier(nembedding=config.DIM,
    #                                   hidden_size=config.HIDDEN_SIZE,
    #                                   num_layer=config.NUM_LAYER,
    #                                   dropout=config.drop_out,
    #                                   vocab_size=vocab.n_words,
    #                                   use_pretrain=True,
    #                                   embed_matrix=vocab.vector,
    #                                   embed_freeze=False,
    #                                   label_size=config.label_class).to(config.device)
    classifier = models.Attentionclassifier(vocab_size=vocab.n_words,
                                            emb_dim=config.DIM,
                                            hidden_size=config.HIDDEN_SIZE,
                                            num_layer=config.NUM_LAYER,
                                            dropout=config.drop_out,
                                            bidirectional=config.bidirectional,
                                            label_size=config.label_class,
                                            use_pretrain=True,
                                            embed_matrix=vocab.vector,
                                            embed_freeze=False).to(config.device)

    criterion = nn.NLLLoss()
    # optimizer = torch.optim.Adam(classifier.parameters())
    optimizer = torch.optim.RMSprop(classifier.parameters(), lr=config.LR, alpha=0.9, momentum=0.2)
    # optimizer = torch.optim.RMSprop(classifier.parameters())
    # optimizer, scheduler = adam_optimizers(classifier.parameters())
    # optimizer = torch.optim.Adadelta(classifier.parameters(), lr=config.LR, rho=0.9, eps=1e-06, weight_decay=0)

    for epoch in range(config.epochs):
        # lr update
        adjust_learning_rate(optimizer, epoch)
        # check whether the learning rate adapts for the different optimizers
        # for param_group in optimizer.param_groups:
        #     print("here lr :{}".format(param_group['lr']))
        logging.info("epoch {0:04d}".format(epoch))
        train(train_dataloader, classifier, criterion, optimizer, epoch,
              config.TRAIN_BATCH_SIZE, config.silent)
        test_f1, val_loss = test(test_dataloader, classifier, criterion, epoch,
                                 config.TRAIN_BATCH_SIZE, config.silent)
        # scheduler.step(val_loss)
        is_best = test_f1 > best_f1  # True or False
        best_f1 = max(test_f1, best_f1)
        logging.info("best f1 is {}".format(best_f1))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': classifier.state_dict(),
                'acc': test_f1,
                'best_acc': best_f1,
                'optimizer': optimizer.state_dict(),
            },
            is_best,
            checkpoint='../output/')
# get lengths for each dialogue train_fr_lens = get_lengths(os.path.join(file_path, 'dataset/data/Friends'), 'friends_train') train_ep_lens = get_lengths( os.path.join(file_path, 'dataset/data/EmotionPush'), 'emotionpush_train') train_lens = np.append(train_fr_lens, train_ep_lens) dev_fr_lens = get_lengths(os.path.join(file_path, 'dataset/data/Friends'), 'friends_dev') dev_ep_lens = get_lengths(os.path.join(file_path, 'dataset/data/EmotionPush'), 'emotionpush_dev') all_data = [ line.rstrip() for line in open(os.path.join(DATA_PATH, 'data-all.en'), 'r') ] word_vec = build_vocab(all_data, GLOVE_PATH) # add <s> and </s> to each of the sentences for data_type in ['train', 'dev_fr', 'dev_ep']: eval(data_type)['sent'] = np.array( [['<s>'] + [word for word in sent.split() if word in word_vec] + ['</s>'] for sent in eval(data_type)['sent']]) """ MODEL """ model = BLSTMAttnNet(embed_size=params.embed_size, lstm_dim=params.lstm_dim, fc_dim=params.fc_dim, num_classes=params.num_classes, max_sent_len=params.max_sent_len, attn_dropout=params.attn_dropout,
with open(params.hypes, 'rb') as f: json_config = json.load(f) data_dir = json_config['data_dir'] prefix = json_config[params.corpus] glove_path = json_config['glove_path'] if params.char and params.corpus == "gw_cn_5": prefix = prefix.replace('discourse', 'discourse_char') """ DATA """ train, valid, test = get_dis(data_dir, prefix, params.corpus) word_vec = build_vocab(train['s1'] + train['s2'] + valid['s1'] + valid['s2'] + test['s1'] + test['s2'], glove_path) # unknown words instead of map to <unk>, this directly takes them out for split in ['s1', 's2']: for data_type in ['train', 'valid', 'test']: eval(data_type)[split] = np.array([['<s>'] + [word for word in sent.split() if word in word_vec] + ['</s>'] for sent in eval(data_type)[split]]) params.word_emb_dim = 300 dis_labels = get_labels(params.corpus) label_size = len(dis_labels) """
def main(args): GLOVE_PATH = "dataset/GloVe/glove.840B.300d.txt" parser = argparse.ArgumentParser(description='NLI training') # paths parser.add_argument("--nlipath", type=str, default='dataset/SNLI/', help="NLI data path (SNLI or MultiNLI)") parser.add_argument("--outputdir", type=str, default='savedir/', help="Output directory") parser.add_argument("--outputmodelname", type=str, default='model.pickle') # dataset, dimensions, transfer learning parser.add_argument("--dataset", type=str, required=True, help="Semantic similarity dataset") parser.add_argument('--dimension', nargs='+', required=True, help='Dimension(s) on the dataset') parser.add_argument('--transfer', default='DNT', help='Transfer learning approach') parser.add_argument('--save', default='no', help='Save trained model') parser.add_argument( '--load_model', default='no', help='If load model, do not perform training, just evalute') # training parser.add_argument("--n_epochs", type=int, default=10) parser.add_argument("--batch_size", type=int, default=16) parser.add_argument("--dpout_model", type=float, default=0., help="encoder dropout") parser.add_argument("--dpout_fc", type=float, default=0., help="classifier dropout") parser.add_argument("--nonlinear_fc", type=float, default=0, help="use nonlinearity in fc") parser.add_argument("--optimizer", type=str, default="sgd,lr=5", help="adam or sgd,lr=0.1") parser.add_argument("--lrshrink", type=float, default=5, help="shrink factor for sgd") parser.add_argument("--decay", type=float, default=1., help="lr decay") parser.add_argument("--minlr", type=float, default=1e-5, help="minimum lr") parser.add_argument("--max_norm", type=float, default=5., help="max norm (grad clipping)") # model parser.add_argument("--encoder_type", type=str, default='BLSTMEncoder', help="see list of encoders") parser.add_argument("--enc_lstm_dim", type=int, default=2048, help="encoder nhid dimension") parser.add_argument("--n_enc_layers", type=int, default=1, help="encoder num layers") parser.add_argument("--fc_dim", type=int, default=512, help="nhid of fc layers") parser.add_argument("--n_classes", type=int, default=3, help="entailment/neutral/contradiction") parser.add_argument("--pool_type", type=str, default='max', help="max or mean") # gpu parser.add_argument("--gpu_id", type=int, default=0, help="GPU ID") parser.add_argument("--seed", type=int, default=1236, help="seed") params, _ = parser.parse_known_args(args) # set gpu device torch.cuda.set_device(params.gpu_id) # print parameters passed, and all parameters #print('\ntogrep : {0}\n'.format(sys.argv[1:])) #print(params) def trainepoch(epoch): print('TRAINING : Epoch ' + str(epoch)) nli_net.train() logs = [] last_time = time.time() #correct = 0. 
# shuffle the data permutation = np.random.permutation(len(train['s1'])) s1 = train['s1'][permutation] s2 = train['s2'][permutation] targets = [x[permutation] for x in train['labels']] optimizer.param_groups[0]['lr'] = optimizer.param_groups[0]['lr'] * params.decay if epoch>1\ and 'sgd' in params.optimizer else optimizer.param_groups[0]['lr'] #print('Learning rate : {0}'.format(optimizer.param_groups[0]['lr'])) for stidx in range(0, len(s1), params.batch_size): tgt_batches = [] # prepare batch s1_batch, s1_len = get_batch(s1[stidx:stidx + params.batch_size], word_vec) s2_batch, s2_len = get_batch(s2[stidx:stidx + params.batch_size], word_vec) s1_batch, s2_batch = Variable(s1_batch.cuda()), Variable( s2_batch.cuda()) for i, _ in enumerate(MTL_index): tgt_batches.append( Variable( torch.FloatTensor( targets[i][stidx:stidx + params.batch_size])).cuda()) #for dim in [1,2,3,4]: # model forward outputs = nli_net((s1_batch, s1_len), (s2_batch, s2_len)) # loss if params.transfer == 'DNT': #print(outputs[0]) #print((tgt_batches[0] - 1)/(params.n_classes-1)) losses = [ nli_net.loss_fn(outputs[i], (tgt_batches[i] - 1) / (params.n_classes - 1)) for i, _ in enumerate(MTL_index) ] elif params.transfer == 'NT': losses = [ nli_net.loss_fn(outputs[i], tgt_batches[i]) for i, _ in enumerate(MTL_index) ] #if 'kl' in MTL_index: # output1 = torch.log(output1) loss = np.sum(losses) #loss = loss1 + loss2 + loss3 + loss4# + loss5 + loss6 + loss7 + loss8 #ADDED #optimizer.zero_grad() #loss1.backward(retain_graph=True) #loss2.backward(retain_graph=True) #loss3.backward(retain_graph=True) #loss4.backward(retain_graph=True) #optimizer.step() #END ADDED """ if dim == 1: loss = nli_net.loss_fn(output1, tgt_batch1) elif dim == 2: loss = nli_net.loss_fn(output2, tgt_batch2) elif dim == 3: loss = nli_net.loss_fn(output3, tgt_batch3) elif dim == 4: loss = nli_net.loss_fn(output4, tgt_batch4) """ # backward optimizer.zero_grad() loss.backward() # optimizer step optimizer.step() def evaluate(epoch, eval_type='valid', flag='', correlation=spearmanr, transfer='NT'): nli_net.eval() #correct = 0. preds = [] r = np.arange(1, 1 + nli_net.n_classes) global val_acc_best, lr, stop_training, adam_stop if eval_type == 'valid': print('VALIDATION : Epoch {0}'.format(epoch)) s1 = valid['s1'] s2 = valid['s2'] targets = valid['scores'] elif eval_type == 'test': print('TEST : Epoch {0}'.format(epoch)) s1 = test['s1'] s2 = test['s2'] targets = test['scores'] elif eval_type == 'train': print('EVAL ON TRAIN : Epoch {0}'.format(epoch)) s1 = train['s1'] s2 = train['s2'] targets = train['scores'] else: raise ValueError('Wrong eval_type.') probas = [[] for _ in MTL_index] correct = 0. 
for i in range(0, len(s1), params.batch_size): # prepare batch s1_batch, s1_len = get_batch(s1[i:i + params.batch_size], word_vec) s2_batch, s2_len = get_batch(s2[i:i + params.batch_size], word_vec) s1_batch, s2_batch = Variable(s1_batch.cuda()), Variable( s2_batch.cuda()) # model forward outputs = nli_net((s1_batch, s1_len), (s2_batch, s2_len)) for i, _ in enumerate(MTL_index): if len(probas[i]) == 0: probas[i] = outputs[i].data.cpu().numpy() else: probas[i] = np.concatenate( (probas[i], outputs[i].data.cpu().numpy()), axis=0) """ if 2 in MTL_index: if 'e' in MTL_index: tgt_batch2 = Variable(torch.LongTensor(target2[i:i + params.batch_size])).cuda() pred2 = output2.data.max(1)[1] correct += pred2.long().eq(tgt_batch2.data.long()).cpu().sum() else: if len(probas2) == 0: probas2 = output2.data.cpu().numpy() else: probas2 = np.concatenate((probas2, output2.data.cpu().numpy()), axis=0) """ if transfer == 'NT': ret = [ correlation(np.dot(x, r), y)[0] for x, y in zip(probas, targets) ] elif transfer == 'DNT': ret = [correlation(x, y)[0] for x, y in zip(probas, targets)] else: raise ValueError('Wrong transfer.') """ if 2 in MTL_index: if 'e' in MTL_index: ret.append(round(100 * correct/len(s1), 2)) else: yhat2 = np.dot(probas2, r) p2 = spearmanr(yhat2, target2)[0] ret.append(p2) else: ret.append(0) """ return ret """ SEED """ np.random.seed(params.seed) torch.manual_seed(params.seed) torch.cuda.manual_seed(params.seed) """ DATA """ #for i in range(1,9): # print(i) # print('----------') dataset_path = { 'stsbenchmark': '../stsbenchmark/', 'sts12': '../SemEval12/', 'sick': '../SICK/', 'activities': '../human_activity_phrase_data/', 'sag': '../ShortAnswerGrading_v2.0/data/processed/', 'typed': '../SemEval13/typed/' } #MTL_index = [1,2,3,4, 'mse'] #'e' MTL_index = [int(x) for x in params.dimension] train, valid, test = get_sts(dataset_path[params.dataset], MTL_index, params.transfer, params.n_classes) word_vec = build_vocab( train['s1'] + train['s2'] + valid['s1'] + valid['s2'] + test['s1'] + test['s2'], GLOVE_PATH) for split in ['s1', 's2']: for data_type in ['train', 'valid', 'test']: eval(data_type)[split] = np.array( [[word for word in sent.split() if word in word_vec] for sent in eval(data_type)[split]]) #eval(data_type)[split] = np.array([['<s>'] + # [word for word in sent.split() if word in word_vec or word[:2] == 'dc'] + # ['</s>'] for sent in eval(data_type)[split]]) params.word_emb_dim = 300 """ MODEL """ # model config config_nli_model = { 'n_words': len(word_vec), 'word_emb_dim': params.word_emb_dim, 'enc_lstm_dim': params.enc_lstm_dim, 'n_enc_layers': params.n_enc_layers, 'dpout_model': params.dpout_model, 'dpout_fc': params.dpout_fc, 'fc_dim': params.fc_dim, 'bsize': params.batch_size, 'n_classes': params.n_classes, 'pool_type': params.pool_type, 'nonlinear_fc': params.nonlinear_fc, 'encoder_type': params.encoder_type, 'use_cuda': True, 'MTL_index': MTL_index, 'transfer': params.transfer } # model encoder_types = [ 'BLSTMEncoder', 'BLSTMprojEncoder', 'BGRUlastEncoder', 'InnerAttentionMILAEncoder', 'InnerAttentionYANGEncoder', 'InnerAttentionNAACLEncoder', 'ConvNetEncoder', 'LSTMEncoder' ] assert params.encoder_type in encoder_types, "encoder_type must be in " + \ str(encoder_types) perfs_all = [] for rd in range(1): print("Round", rd) if params.load_model == 'no': nli_net = NLINet(config_nli_model) nli_net.encoder = torch.load('encoder/infersent.allnli.pickle', map_location={ 'cuda:1': 'cuda:0', 'cuda:2': 'cuda:0' }) else: nli_net = torch.load(params.load_model) print(nli_net) # 
optimizer optim_fn, optim_params = get_optimizer(params.optimizer) optimizer = optim_fn(nli_net.parameters(), **optim_params) # cuda by default nli_net.cuda() """ TRAIN """ val_acc_best = -1e10 adam_stop = False stop_training = False lr = optim_params['lr'] if 'sgd' in params.optimizer else None last_result = 0 last_test_result = 0 drop_count = 0 """ Train model on Natural Language Inference task """ correlation = spearmanr if params.dataset == 'activities' else pearsonr epoch = 0 perfs_valid = evaluate(epoch, 'valid', 'begin', correlation, params.transfer) perfs_test = evaluate(epoch, 'test', 'begin', correlation, params.transfer) print(perfs_valid, perfs_test) epoch += 1 if params.load_model == 'no': while not stop_training and epoch <= params.n_epochs: trainepoch(epoch) perfs_valid = evaluate(epoch, 'valid', '', correlation, params.transfer) perfs_test = evaluate(epoch, 'test', '', correlation, params.transfer) print(perfs_valid, perfs_test) epoch += 1 #perfs_all.append(perfs) if params.save != 'no': torch.save(nli_net, params.save)
return train_losses, dev_losses, model_path if __name__ == '__main__': data_dir = '../data/snli_1.0/' files = [ data_dir + s for s in ['snli_1.0_train.jsonl', 'snli_1.0_dev.jsonl', 'snli_1.0_test.jsonl'] ] vocab_file = data_dir + 'vocab.pkl' # including place for padding and UNK vocab_size = 30000 vocab = [] if not os.path.exists(vocab_file): build_vocab(files[0], vocab_file) vocab, vocab_idx = load_vocab(vocab_file) snli_train_file = data_dir + 'snli_train.pkl' if not os.path.exists(snli_train_file): preprocess_snli_jsonl(files[0], vocab_idx, snli_train_file, vocab_size) data = {} for f in ['train', 'dev', 'test']: data[f] = load_snli(data_dir + 'snli_%s.pkl' % f) batch_size = 256 embedding_size = 300 state_size = 512 inverse_drop_rate = 0.8 learning_rate = 3e-3
# print parameters passed, and all parameters print('\ntogrep : {0}\n'.format(sys.argv[1:])) print(params) """ SEED """ np.random.seed(params.seed) torch.manual_seed(params.seed) torch.cuda.manual_seed(params.seed) """ DATA """ train, valid, test = get_nli(params.nlipath) word_vec = build_vocab( train['s1'] + train['s2'] + valid['s1'] + valid['s2'] + test['s1'] + test['s2'], GLOVE_PATH) for split in ['s1', 's2']: for data_type in ['train', 'valid', 'test']: eval(data_type)[split] = np.array([['<s>'] + [word for word in sent.split() if word in word_vec] +\ ['</s>'] for sent in eval(data_type)[split]]) params.word_emb_dim = 300 #params.word_emb_dim = 512 """ MODEL """ # model config config_nli_model = { 'n_words': len(word_vec),
def fill_tre_with_vectors():
    train_tree, valid_tree, test_tree = get_SICK_tree_data()
    filename = "transformer_SICk"
    print(filename)

    parser = argparse.ArgumentParser(description='NLI training')
    # paths
    parser.add_argument("--nlipath", type=str, default='dataset/SNLI/', help="NLI data path (SNLI or MultiNLI)")
    parser.add_argument("--outputdir", type=str, default='savedir/', help="Output directory")
    parser.add_argument("--outputmodelname", type=str, default='model.pickle')
    parser.add_argument("--word_emb_path", type=str, default="glove.840B.300d.txt", help="word embedding file path")
    # training
    parser.add_argument("--n_epochs", type=int, default=500)
    parser.add_argument("--batch_size", type=int, default=16)
    parser.add_argument("--dpout_model", type=float, default=0.1, help="encoder dropout")
    parser.add_argument("--dpout_fc", type=float, default=0.1, help="classifier dropout")
    parser.add_argument("--nonlinear_fc", type=float, default=5, help="use nonlinearity in fc")
    parser.add_argument("--optimizer", type=str, default="sgd,lr=0.1", help="adam or sgd,lr=0.1")
    parser.add_argument("--lrshrink", type=float, default=1, help="shrink factor for sgd")
    parser.add_argument("--decay", type=float, default=0.99, help="lr decay")
    parser.add_argument("--minlr", type=float, default=1e-5, help="minimum lr")
    parser.add_argument("--max_norm", type=float, default=5., help="max norm (grad clipping)")
    # model
    parser.add_argument("--encoder_type", type=str, default='LSTMEncoder', help="see list of encoders")
    parser.add_argument("--enc_lstm_dim", type=int, default=600, help="encoder nhid dimension")
    parser.add_argument("--n_enc_layers", type=int, default=1, help="encoder num layers")
    parser.add_argument("--fc_dim", type=int, default=150, help="nhid of fc layers")
    parser.add_argument("--n_classes", type=int, default=2, help="entailment/neutral/contradiction")
    parser.add_argument("--pool_type", type=str, default='max', help="max or mean")
    # gpu
    parser.add_argument("--gpu_id", type=int, default=0, help="GPU ID")
    parser.add_argument("--seed", type=int, default=1234, help="seed")
    # data
    parser.add_argument("--word_emb_dim", type=int, default=300, help="word embedding dimension")
    params, _ = parser.parse_known_args()

    # set gpu device
    torch.cuda.set_device(params.gpu_id)

    # print parameters passed, and all parameters
    print('\ntogrep : {0}\n'.format(sys.argv[1:]))
    print(params)

    """ SEED """
    np.random.seed(params.seed)
    torch.manual_seed(params.seed)
    torch.cuda.manual_seed(params.seed)

    """ DATA """
    #train, valid, test = get_nli(params.nlipath)
    train_tree, valid_tree, test_tree = get_SICK_tree_data()
    train, valid, test = get_SICK_data()
    word_vec = build_vocab(
        train['s1'] + train['s2'] + valid['s1'] + valid['s2'] +
        test['s1'] + test['s2'], params.word_emb_path)

    # attach word vectors to both sentence trees (s1 and s2) of every split
    for i in range(len(train_tree['s1'])):
        x = deepcopy(assign_vectors(train_tree['s1'][i], word_vec))
        train_tree['s1'][i] = deepcopy(x)
        x = deepcopy(assign_vectors(train_tree['s2'][i], word_vec))
        train_tree['s2'][i] = deepcopy(x)
    for i in range(len(test_tree['s1'])):
        x = deepcopy(assign_vectors(test_tree['s1'][i], word_vec))
        test_tree['s1'][i] = deepcopy(x)
        x = deepcopy(assign_vectors(test_tree['s2'][i], word_vec))
        test_tree['s2'][i] = deepcopy(x)
    for i in range(len(valid_tree['s1'])):
        x = deepcopy(assign_vectors(valid_tree['s1'][i], word_vec))
        valid_tree['s1'][i] = deepcopy(x)
        x = deepcopy(assign_vectors(valid_tree['s2'][i], word_vec))
        valid_tree['s2'][i] = deepcopy(x)

    with open("sick_tree_data_tensor.pkl", "wb") as f:
        pickle.dump([train_tree, valid_tree, test_tree], f)
return train_tree, valid_tree, test_tree