Example 1
def main():

    vocab = data.Vocabulary()
    data.build_vocab(vocab, config.vector_file)  # build vocabulary

    # classifier = models.Attentionclassifier(vocab_size=vocab.n_words,
    #                                         emb_dim=config.DIM,
    #                                         hidden_size=config.HIDDEN_SIZE,
    #                                         num_layer=config.NUM_LAYER,
    #                                         dropout=config.drop_out,
    #                                         bidirectional=config.bidirectional,
    #                                         label_size=config.label_class,
    #                                         use_pretrain=True,
    #                                         embed_matrix=vocab.vector,
    #                                         embed_freeze=False).to(config.device)

    classifier = models.FinetuneModel1(vocab_size=vocab.n_words,
                                       emb_dim=config.DIM,
                                       hidden_size=config.HIDDEN_SIZE,
                                       num_layer=config.NUM_LAYER,
                                       dropout=config.drop_out,
                                       bidirectional=config.bidirectional,
                                       label_size=config.label_class,
                                       hidden_size1=128,
                                       use_pretrain=True,
                                       embed_matrix=vocab.vector,
                                       embed_freeze=False).to(config.device)

    model_dict = classifier.state_dict()

    pretrained_model = torch.load(config.model_path)

    pretrained_dict = dict()

    for k, v in pretrained_model.items():
        if k == 'state_dict':
            for kk, vv in v.items():
                if kk in model_dict:
                    pretrained_dict[kk] = vv

    # update the existing model_dict
    model_dict.update(pretrained_dict)

    # load the model_dict that is actually needed
    classifier.load_state_dict(model_dict)
    # classifier.eval()
    test_data = data.Sentiment(config.predict_file, vocab)
    test_dataloader = DataLoader(test_data,
                                 batch_size=config.TRAIN_BATCH_SIZE,
                                 shuffle=True,
                                 collate_fn=data.collate_fn)
    predict(classifier, test_dataloader, config.silent)
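Examples 1 and 13 restore only those checkpoint entries whose names also exist in the freshly built classifier before calling load_state_dict. A minimal, self-contained sketch of that filtering pattern (SmallNet is a hypothetical stand-in for models.FinetuneModel1, and the checkpoint layout follows the {'state_dict': ...} shape used above):

import torch.nn as nn

class SmallNet(nn.Module):
    # hypothetical stand-in for the project's classifier
    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(8, 4)
        self.final = nn.Linear(4, 2)

net = SmallNet()
model_dict = net.state_dict()

# pretend checkpoint laid out as {'state_dict': ..., 'epoch': ...}, as in Example 1
checkpoint = {'state_dict': SmallNet().state_dict(), 'epoch': 3}

# keep only the entries whose names exist in the current model
pretrained_dict = {k: v for k, v in checkpoint['state_dict'].items()
                   if k in model_dict}
model_dict.update(pretrained_dict)   # merge into the fresh state dict
net.load_state_dict(model_dict)      # load the merged weights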
Example 2
def main(unused_argv):

  if len(unused_argv) != 1: # raise an error if the flags were entered incorrectly
    raise Exception('Problem with flags: %s' % unused_argv)

  # choose what level of logging you want
  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) 

  # user_rating has the elements in the following order
  # user_id, item_id, rating, time, num_words, review
  user_rating, user_id_to_idx, item_id_to_idx = read_file(FLAGS.data_path)
  num_users = len(user_id_to_idx)
  num_items = len(item_id_to_idx)
  num_reviews = len(user_rating)
  print('Number of total users / items / reviews: %d / %d / %d' % 
          (num_users, num_items, num_reviews))
  users_ratings = [ur for ur in user_rating]
  train_ratings, test_ratings, valid_ratings = split_data(users_ratings)

  # build vocabulary
  id_to_word, word_to_id = build_vocab(users_ratings, FLAGS.vocab_size)
  train_item_doc = token_to_id(train_ratings, word_to_id)
  valid_item_doc = token_to_id(valid_ratings, word_to_id)

  current_datetime = datetime.now()
  subfolder_timestamp = datetime.strftime(current_datetime, '%Y%m%d-%H%M%S')
  subfolder_dataname = os.path.basename(FLAGS.data_path)
  log_folder = os.path.join(FLAGS.log_root, subfolder_dataname + '-' + subfolder_timestamp)
  # save vocab to output folder
  pathlib.Path(log_folder).mkdir(parents=True, exist_ok=True) 
  with open(os.path.join(log_folder, 'vocab.csv'), 'w') as f:    
    for idx, token in id_to_word.items():
      f.write('%s,%s\n' % (idx, token))
  
  # Try offset model
  offset_model = offsetModel(train_ratings, valid_ratings, test_ratings)
  offset_model.train()

  # Make a namedtuple hps, containing the values of the hyperparameters that the model needs
  hparam_list = ['init_stddev', 'emb_dim', 'min_kappa', 'max_kappa', 
                 'vocab_size', 'mu', 'max_iter_steps', 'num_iter_steps',
                 'threshold']
  hps_dict = {}
  for key,val in FLAGS.flag_values_dict().items(): # for each flag
    if key in hparam_list: # if it's in the list
      hps_dict[key] = val # add it to the dict
  hps = namedtuple('HParams', hps_dict.keys())(**hps_dict)

  hft_model = HFTModel(hps, train_ratings, valid_ratings, test_ratings,
                       train_item_doc, valid_item_doc,
                       num_users, num_items, num_reviews, log_folder)
  hft_model.build_graph()  
  hft_model.train()
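The hyperparameter handling above copies only the flags named in hparam_list into an immutable HParams namedtuple. A small sketch of the same idea, with made-up flag values standing in for FLAGS:

from collections import namedtuple

# made-up flag values standing in for FLAGS.flag_values_dict()
flag_values = {'emb_dim': 64, 'mu': 0.1, 'data_path': '/tmp/reviews.csv'}
hparam_list = ['emb_dim', 'mu']          # only these become hyperparameters

hps_dict = {k: v for k, v in flag_values.items() if k in hparam_list}
hps = namedtuple('HParams', hps_dict.keys())(**hps_dict)
print(hps.emb_dim, hps.mu)               # 64 0.1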
Example 3
def main(args):
    print "main"
    """
  SEED
  """
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    """
  DATA
  """
    train, val, test = get_nli_hypoth(args.train_lbls_file, args.train_src_file, args.val_lbls_file, \
                                      args.val_src_file, args.test_lbls_file, args.test_src_file, \
                                      args.max_train_sents, args.max_val_sents, args.max_test_sents)

    word_vecs = build_vocab(
        train['hypoths'] + val['hypoths'] + test['hypoths'], args.embdfile,
        args.lorelei_embds)
    args.word_emb_dim = len(word_vecs[word_vecs.keys()[0]])

    lbls_file = args.train_lbls_file
    global IDX2LBL
    if "mpe" in lbls_file or "snli" in lbls_file or "multinli" in lbls_file or "sick" in lbls_file or "joci" in lbls_file:
        IDX2LBL = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}
    elif "spr" in lbls_file or "dpr" in lbls_file or "fnplus" in lbls_file or "add_one" in lbls_file:
        IDX2LBL = {0: 'entailed', 1: 'not-entailed'}
    elif "scitail" in lbls_file:
        IDX2LBL = {0: 'entailment', 1: 'neutral'}

    nli_net = torch.load(args.model)
    print(nli_net)

    # loss
    weight = torch.FloatTensor(args.n_classes).fill_(1)
    loss_fn = nn.CrossEntropyLoss(weight=weight)
    loss_fn.size_average = False

    if args.gpu_id > -1:
        nli_net.cuda()
        loss_fn.cuda()
    """
  Train model on Natural Language Inference task
  """
    epoch = 1

    for pair in [(train, 'train'), (val, 'val'), (test, 'test')]:
        #args.batch_size = len(pair[0]['lbls'])
        eval_acc = evaluate(
            0, pair[0], args, word_vecs, nli_net, pair[1],
            "%s/%s_%s" % (args.outputdir, pair[1], args.pred_file))
Example 4
def main(args):
    print "main"
    """
  SEED
  """
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    """
  DATA
  """

    train, valid, test = get_nli(args.nlipath)
    word_vec = build_vocab(
        train['s1'] + train['s2'] + valid['s1'] + valid['s2'] + test['s1'] +
        test['s2'], GLOVE_PATH)

    for split in ['s1', 's2']:
        for data_type in ['train', 'valid', 'test']:
            eval(data_type)[split] = np.array(
                [['<s>'] + [word
                            for word in sent.split() if word in word_vec] +
                 ['</s>'] for sent in eval(data_type)[split]])

    args.word_emb_dim = 300

    nli_net = torch.load(args.model)
    print(nli_net)

    # loss
    weight = torch.FloatTensor(args.n_classes).fill_(1)
    loss_fn = nn.CrossEntropyLoss(weight=weight)
    loss_fn.size_average = False

    if args.gpu_id > -1:
        nli_net.cuda()
        loss_fn.cuda()
    """
  Train model on Natural Language Inference task
  """
    epoch = 1

    for pair in [(train, 'train'), (valid, 'dev'), (test, 'test')]:
        #args.batch_size = len(pair[0]['lbls'])
        eval_acc = evaluate_preds(
            0, pair[0], args, word_vec, nli_net, pair[1],
            "%s/%s_%s" % (args.outputdir, pair[1], args.pred_file))
        print "Accuracy on " + pair[1] + ": " + str(eval_acc)
Example 5
def get_vocab(args):
    # build a vocabulary from the train, dev, and test sets of the actual SNLI plus the
    # test sets of all the transfer tasks.
    train, valid, test = {}, {}, {}
    for split in ['test', 'valid', 'train']:
        for s in ['s1', 's2']:
            eval(split)[s] = []
    for datapath, n_classes in [
        (args.test_path, args.data_to_n_classes[args.test_data]),
        (args.train_path, args.data_to_n_classes[args.train_data])
    ]:
        transfer_train, transfer_valid, transfer_test = get_nli(
            datapath, n_classes)
        for split in ['test', 'valid', 'train']:
            for s in ['s1', 's2']:
                eval(split)[s].extend(eval("transfer_" + split)[s])

    word_vec = build_vocab(
        train['s1'] + train['s2'] + valid['s1'] + valid['s2'] + test['s1'] +
        test['s2'], args.embdfile)
    return word_vec
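Examples 4, 5, and 6 (and several later ones) index their split dictionaries with eval(...). Assuming each split is simply a dict of sentence lists, the same <s>/</s> wrapping and vocabulary filtering can be written without eval; a toy sketch with made-up sentences and a made-up word_vec:

import numpy as np

# toy splits and vocabulary; the real ones come from get_nli and build_vocab
splits = {'train': {'s1': ['a cat sits', 'dogs run fast']},
          'valid': {'s1': ['a cat runs']}}
word_vec = {'a': 0, 'cat': 1, 'sits': 2, 'dogs': 3, 'run': 4, 'runs': 5}

for name in splits:
    for s in ['s1']:
        splits[name][s] = np.array(
            [['<s>'] + [w for w in sent.split() if w in word_vec] + ['</s>']
             for sent in splits[name][s]],
            dtype=object)  # ragged token lists, one per sentence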
Example 6
def main(args):

  """
  SEED
  """
  np.random.seed(args.seed)
  torch.manual_seed(args.seed)
  if args.gpu_id > -1:
    torch.cuda.manual_seed(args.seed)

  """
  DATA
  """
  train, valid, test = get_nli(args.nlipath, args.n_classes)
  word_vecs = build_vocab(train['s1'] + train['s2'] +
                       valid['s1'] + valid['s2'] +
                       test['s1'] + test['s2'], args.embdfile)

  for split in ['s1', 's2']:
    for data_type in ['train', 'valid', 'test']:
        eval(data_type)[split] = np.array([['<s>'] +
            [word for word in sent.split() if word in word_vecs] +
            ['</s>'] for sent in eval(data_type)[split]])


  args.word_emb_dim = len(word_vecs[list(word_vecs.keys())[0]])

  nli_model_configs = get_model_configs(args, len(word_vecs))


  nli_model_configs["n_classes"] = args.n_classes

  # define premise and hypoth encoders
  premise_encoder = eval(nli_model_configs['encoder_type'])(nli_model_configs)
  hypoth_encoder = eval(nli_model_configs['encoder_type'])(nli_model_configs)
  shared_nli_net = SharedNLINet(nli_model_configs, premise_encoder, hypoth_encoder)
  shared_hypoth_net = SharedHypothNet(nli_model_configs, hypoth_encoder)
  print(shared_nli_net)
  print(shared_hypoth_net)

  if args.pre_trained_model:
    print( "Pre_trained_model: " + args.pre_trained_model)
    pre_trained_model = torch.load(args.pre_trained_model)
  
    shared_nli_net_params = shared_nli_net.state_dict()
    pre_trained_params = pre_trained_model.state_dict()
    assert shared_nli_net_params.keys() == pre_trained_params.keys(), "loaded model has different parameter state names than NLI_HYPOTHS_NET"
    for key, parameters in shared_nli_net_params.items():
      if parameters.size() == pre_trained_params[key].size():
        shared_nli_net_params[key] = pre_trained_params[key]
    shared_nli_net.load_state_dict(shared_nli_net_params)

  print(shared_nli_net)

  if args.pre_trained_adv_model:
    print( "Pre_trained_adv_model: " + args.pre_trained_adv_model)
    pre_trained_model = torch.load(args.pre_trained_adv_model)
  
    shared_hypoth_net_params = shared_hypoth_net.state_dict()
    pre_trained_params = pre_trained_model.state_dict()
    assert shared_hypoth_net_params.keys() == pre_trained_params.keys(), "loaded model has different parameter state names than NLI_HYPOTHS_NET"
    for key, parameters in shared_hypoth_net_params.items():
      if parameters.size() == pre_trained_params[key].size():
        shared_hypoth_net_params[key] = pre_trained_params[key]
    shared_hypoth_net.load_state_dict(shared_hypoth_net_params)

  print(shared_hypoth_net)


  # nli loss
  weight = torch.FloatTensor(args.n_classes).fill_(1)
  loss_fn_nli = nn.CrossEntropyLoss(weight=weight)
  loss_fn_nli.size_average = False

  # hypoth (adversarial) loss
  weight = torch.FloatTensor(args.n_classes).fill_(1)
  loss_fn_hypoth = nn.CrossEntropyLoss(weight=weight)
  loss_fn_hypoth.size_average = False

  # optimizer
  optim_fn, optim_params = get_optimizer(args.optimizer)
  optimizer_nli = optim_fn(shared_nli_net.parameters(), **optim_params)
  #optimizer_hypoth = optim_fn(shared_hypoth_net.parameters(), **optim_params)
  # only pass hypoth classifier params to avoid updating shared encoder params twice 
  optimizer_hypoth = optim_fn(shared_hypoth_net.classifier.parameters(), **optim_params)

  if args.gpu_id > -1:
    shared_nli_net.cuda()
    shared_hypoth_net.cuda()
    loss_fn_nli.cuda()
    loss_fn_hypoth.cuda()

  """
  TRAIN
  """
  global val_acc_best, lr, stop_training, adam_stop
  val_acc_best = -1e10
  adam_stop = False
  stop_training = False
  lr = optim_params['lr'] if 'sgd' in args.optimizer else None

  """
  Train model on Natural Language Inference task
  """
  epoch = 1

  while not stop_training and epoch <= args.n_epochs:
    train_acc_nli, train_acc_hypoth, shared_nli_net, shared_hypoth_net = trainepoch(epoch, train, optimizer_nli, optimizer_hypoth, args, word_vecs, shared_nli_net, shared_hypoth_net, loss_fn_nli, loss_fn_hypoth, args.adv_lambda, args.adv_hyp_encoder_lambda)
    eval_acc_nli, eval_acc_hypoth = evaluate(epoch, valid, optimizer_nli, optimizer_hypoth, args, word_vecs, shared_nli_net, shared_hypoth_net, 'valid', adv_lambda=args.adv_lambda)
    epoch += 1
Example 7
        metrics.classification_report(y_test_cls,
                                      y_pred_cls,
                                      target_names=categories))

    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time Usage: ", time_dif)


if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""Usage: python run_lstmPooling.py [train/test]""")

    print('Configuring self Attention Model...')
    config = Config()
    if not os.path.exists(vocab_dir):
        build_vocab(train_dir, vocab_dir, config.vocab_size)

    categories, cat2id = read_category()
    words, word2id = read_vocab(vocab_dir)
    config.vocab_size = len(words)
    model = SelfAttentionModel(config)

    if sys.argv[1] == 'train':
        train()
    else:
        test()
Example 8
print(params)


"""
SEED
"""
np.random.seed(params.seed)
torch.manual_seed(params.seed)
torch.cuda.manual_seed(params.seed)

"""
DATA
"""
train, valid, test = get_nli(params.nlipath)
word_vec = build_vocab(train['s1'] + train['s2'] +
                       valid['s1'] + valid['s2'] +
                       test['s1'] + test['s2'], params.word_emb_path)

for split in ['s1', 's2']:
    for data_type in ['train', 'valid', 'test']:
        eval(data_type)[split] = np.array([['<s>'] +
            [word for word in sent.split() if word in word_vec] +
            ['</s>'] for sent in eval(data_type)[split]])


"""
MODEL
"""
# model config
config_nli_model = {
    'n_words'        :  len(word_vec)          ,
Example 9
def main(args):
    print "main"
    """
  SEED
  """
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.gpu_id > -1:
        torch.cuda.manual_seed(args.seed)
    """
  DATA
  """
    train, val, test = get_nli_hypoth(args.train_lbls_file, args.train_src_file, args.val_lbls_file, \
                                      args.val_src_file, args.test_lbls_file, args.test_src_file, \
                                      args.max_train_sents, args.max_val_sents, args.max_test_sents, args.remove_dup)

    word_vecs = build_vocab(
        train['hypoths'] + val['hypoths'] + test['hypoths'], args.embdfile,
        args.lorelei_embds)
    args.word_emb_dim = len(word_vecs[word_vecs.keys()[0]])

    nli_model_configs = get_model_configs(args, len(word_vecs))

    lbls_file = args.train_lbls_file
    if "mpe" in lbls_file or "snli" in lbls_file or "multinli" in lbls_file or "sick" in lbls_file or "joci" in lbls_file:
        nli_model_configs["n_classes"] = 3
    elif "spr" in lbls_file or "dpr" in lbls_file or "fnplus" in lbls_file or "add_one" in lbls_file or "scitail" in lbls_file:
        nli_model_configs["n_classes"] = 2

    nli_net = NLI_HYPOTHS_Net(nli_model_configs)
    print(nli_net)

    # loss
    weight = torch.FloatTensor(args.n_classes).fill_(1)
    loss_fn = nn.CrossEntropyLoss(weight=weight)
    loss_fn.size_average = False

    # optimizer
    optim_fn, optim_params = get_optimizer(args.optimizer)
    optimizer = optim_fn(nli_net.parameters(), **optim_params)

    if args.gpu_id > -1:
        nli_net.cuda()
        loss_fn.cuda()
    """
  TRAIN
  """
    global val_acc_best, lr, stop_training, adam_stop
    val_acc_best = -1e10
    adam_stop = False
    stop_training = False
    lr = optim_params['lr'] if 'sgd' in args.optimizer else None
    """
  Train model on Natural Language Inference task
  """
    epoch = 1

    while not stop_training and epoch <= args.n_epochs:
        train_acc, nli_net = trainepoch(epoch, train, optimizer, args,
                                        word_vecs, nli_net, loss_fn)
        eval_acc = evaluate(epoch, val, optimizer, args, word_vecs, nli_net,
                            'valid')
        epoch += 1
Example 10
def pretrain():
    # Parse command line arguments
    argparser = argparse.ArgumentParser()

    # train
    argparser.add_argument('--mode',
                           '-m',
                           choices=('pretrain', 'adversarial', 'inference'),
                           type=str,
                           required=True)
    argparser.add_argument('--batch_size', '-b', type=int, default=168)
    argparser.add_argument('--num_epoch', '-e', type=int, default=10)
    argparser.add_argument('--print_every', type=int, default=100)
    argparser.add_argument('--use_cuda', default=True)
    argparser.add_argument('--g_learning_rate',
                           '-glr',
                           type=float,
                           default=0.001)
    argparser.add_argument('--d_learning_rate',
                           '-dlr',
                           type=float,
                           default=0.001)

    # resume
    argparser.add_argument('--resume', action='store_true', dest='resume')
    argparser.add_argument('--resume_dir', type=str)
    argparser.add_argument('--resume_epoch', type=int)

    # save
    argparser.add_argument('--exp_dir', type=str, required=True)

    # model
    argparser.add_argument('--emb_dim', type=int, default=128)
    argparser.add_argument('--hidden_dim', type=int, default=256)
    argparser.add_argument('--dropout_rate', '-drop', type=float, default=0.5)
    argparser.add_argument('--n_layers', type=int, default=1)
    argparser.add_argument('--response_max_len', type=int, default=15)

    # data
    argparser.add_argument('--train_query_file',
                           '-tqf',
                           type=str,
                           required=True)
    argparser.add_argument('--train_response_file',
                           '-trf',
                           type=str,
                           required=True)
    argparser.add_argument('--valid_query_file',
                           '-vqf',
                           type=str,
                           required=True)
    argparser.add_argument('--valid_response_file',
                           '-vrf',
                           type=str,
                           required=True)
    argparser.add_argument('--vocab_file', '-vf', type=str, default='')
    argparser.add_argument('--max_vocab_size', '-mv', type=int, default=100000)

    args = argparser.parse_args()

    # set up the output directory
    exp_dirname = os.path.join(args.exp_dir, args.mode,
                               time.strftime("%Y-%m-%d-%H-%M-%S"))
    os.makedirs(exp_dirname)

    # set up the logger
    tqdm_logging.config(logger,
                        os.path.join(exp_dirname, 'train.log'),
                        mode='w',
                        silent=False,
                        debug=True)

    if not args.vocab_file:
        logger.info("no vocabulary file")
        build_vocab(args.train_query_file,
                    args.train_response_file,
                    seperated=True)
        sys.exit()
    else:
        vocab, rev_vocab = load_vocab(args.vocab_file,
                                      max_vocab=args.max_vocab_size)

    vocab_size = len(vocab)

    word_embeddings = nn.Embedding(vocab_size,
                                   args.emb_dim,
                                   padding_idx=SYM_PAD)
    E = EncoderRNN(vocab_size,
                   args.emb_dim,
                   args.hidden_dim,
                   args.n_layers,
                   args.dropout_rate,
                   bidirectional=True,
                   variable_lengths=True)
    G = Generator(vocab_size,
                  args.response_max_len,
                  args.emb_dim,
                  2 * args.hidden_dim,
                  args.n_layers,
                  dropout_p=args.dropout_rate)

    if args.use_cuda:
        word_embeddings.cuda()
        E.cuda()
        G.cuda()

    loss_func = nn.NLLLoss(size_average=False)
    params = list(word_embeddings.parameters()) + list(E.parameters()) + list(
        G.parameters())
    opt = torch.optim.Adam(params, lr=args.g_learning_rate)

    logger.info('----------------------------------')
    logger.info('Pre-train a neural conversation model')
    logger.info('----------------------------------')

    logger.info('Args:')
    logger.info(str(args))

    logger.info('Vocabulary from ' + args.vocab_file)
    logger.info('vocabulary size: %d' % vocab_size)
    logger.info('Loading text data from ' + args.train_query_file + ' and ' +
                args.train_response_file)

    # resume training from other experiment
    if args.resume:
        assert args.resume_epoch >= 0, 'If resume training, please assign resume_epoch'
        reload_model(args.resume_dir, args.resume_epoch, word_embeddings, E, G)
        start_epoch = args.resume_epoch + 1
    else:
        start_epoch = 0

    # dump args
    with open(os.path.join(exp_dirname, 'args.pkl'), 'wb') as f:
        pickle.dump(args, f)

    for e in range(start_epoch, args.num_epoch):
        logger.info('---------------------training--------------------------')
        train_data_generator = batcher(args.batch_size, args.train_query_file,
                                       args.train_response_file)
        logger.info("Epoch: %d/%d" % (e, args.num_epoch))
        step = 0
        total_loss = 0.0
        total_valid_char = []
        cur_time = time.time()
        while True:
            try:
                post_sentences, response_sentences = train_data_generator.next(
                )
            except StopIteration:
                # save model
                save_model(exp_dirname, e, word_embeddings, E, G)
                # evaluation
                eval(args.valid_query_file, args.valid_response_file,
                     args.batch_size, word_embeddings, E, G, loss_func,
                     args.use_cuda, vocab, args.response_max_len)
                break

            post_ids = [sentence2id(sent, vocab) for sent in post_sentences]
            response_ids = [
                sentence2id(sent, vocab) for sent in response_sentences
            ]
            posts_var, posts_length = padding_inputs(post_ids, None)
            responses_var, responses_length = padding_inputs(
                response_ids, args.response_max_len)
            # sort by post length
            posts_length, perms_idx = posts_length.sort(0, descending=True)
            posts_var = posts_var[perms_idx]
            responses_var = responses_var[perms_idx]
            responses_length = responses_length[perms_idx]

            # append EOS to the end of each sentence
            references_var = torch.cat([
                responses_var,
                Variable(torch.zeros(responses_var.size(0), 1).long(),
                         requires_grad=False)
            ],
                                       dim=1)
            for idx, length in enumerate(responses_length):
                references_var[idx, length] = SYM_EOS

            # show case
            #for p, r, ref in zip(posts_var.data.numpy()[:10], responses_var.data.numpy()[:10], references_var.data.numpy()[:10]):
            #    print ''.join(id2sentence(p, rev_vocab))
            #    print ''.join(id2sentence(r, rev_vocab))
            #    print ''.join(id2sentence(ref, rev_vocab))
            #    print

            if args.use_cuda:
                posts_var = posts_var.cuda()
                responses_var = responses_var.cuda()
                references_var = references_var.cuda()

            embedded_post = word_embeddings(posts_var)
            embedded_response = word_embeddings(responses_var)

            _, dec_init_state = E(embedded_post,
                                  input_lengths=posts_length.numpy())
            log_softmax_outputs = G.supervise(
                embedded_response, dec_init_state,
                word_embeddings)  # [B, T, vocab_size]

            outputs = log_softmax_outputs.view(-1, vocab_size)
            mask_pos = mask(references_var).view(-1).unsqueeze(-1)
            masked_output = outputs * (mask_pos.expand_as(outputs))
            loss = loss_func(masked_output,
                             references_var.view(-1)) / (posts_var.size(0))

            opt.zero_grad()
            loss.backward()
            opt.step()

            total_loss += loss * (posts_var.size(0))
            total_valid_char.append(mask_pos)

            if step % args.print_every == 0:
                total_loss_val = total_loss.cpu().data.numpy()[0]
                total_valid_char_val = torch.sum(
                    torch.cat(total_valid_char, dim=1)).cpu().data.numpy()[0]
                logger.info(
                    'Step %5d: (per word) training perplexity %.2f (%.1f iters/sec)'
                    % (step, math.exp(total_loss_val / total_valid_char_val),
                       args.print_every / (time.time() - cur_time)))
                total_loss = 0.0
                total_valid_char = []
                total_case_num = 0
                cur_time = time.time()
            step = step + 1
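The logging block above reports per-word perplexity as exp(summed NLL / number of valid, non-padding target tokens). With made-up numbers for one logging window:

import math

total_loss_val = 240.0        # summed negative log-likelihood over the window
total_valid_char_val = 50     # count of non-padding target tokens
perplexity = math.exp(total_loss_val / total_valid_char_val)
print(round(perplexity, 1))   # exp(4.8) ≈ 121.5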
Example 11
"""
np.random.seed(params.seed)
torch.manual_seed(params.seed)
torch.cuda.manual_seed(params.seed)
random.seed(params.seed)
"""
DATA
"""
#train, valid, test = get_nli(params.nlipath)

train, valid, test = get_MSRP_data()

#print(len(valid['s1'][100].split()), len(valid['a1'][100]))

word_vec = build_vocab(
    train['s1'] + train['s2'] + valid['s1'] + valid['s2'] + test['s1'] +
    test['s2'], params.word_emb_path)
'''
for split in ['s1', 's2']:
    for data_type in ['train', 'valid', 'test']:
        eval(data_type)[split] = np.array([['<s>'] +
            [word for word in sent.split() if word in word_vec] +
            ['</s>'] for sent in eval(data_type)[split]])
'''
for data_type in ['train', 'valid', 'test']:
    for k in (('s1', 'a1'), ('s2', 'a2')):
        struct_sent = eval(data_type)[k[0]]
        struct_action = eval(data_type)[k[1]]
        for i in range(len(struct_sent)):
            sent = struct_sent[i].split()
            action = struct_action[i]
Example 12
                     'Share vocab between source and destination')
flags.DEFINE_boolean('showex', True,
                     'Show generated examples every few epochs')
flags.DEFINE_boolean('sample', False, 'If showing examples, sample?')
flags.DEFINE_boolean('attn', False, 'Use attention')

f2i = {}

v1 = [0]
v2 = [1]

if FLAGS.sharedv is True:
    v1.append(1)
    v2.append(0)

vocab1 = build_vocab(v1, [FLAGS.train, FLAGS.test])
vocab2 = build_vocab(v2, [FLAGS.train, FLAGS.test])

embed1 = Word2VecModel(FLAGS.embed1, vocab1, FLAGS.unif)

print('Loaded word embeddings: ' + FLAGS.embed1)

if FLAGS.embed2 is None:
    print('No embed2 found, using embed1 for both')
    FLAGS.embed2 = FLAGS.embed1

embed2 = Word2VecModel(FLAGS.embed2, vocab2, FLAGS.unif)
print('Loaded word embeddings: ' + FLAGS.embed2)

ts = load_sentences(FLAGS.train, embed1.vocab, embed2.vocab, FLAGS.mxlen)
es = load_sentences(FLAGS.test, embed1.vocab, embed2.vocab, FLAGS.mxlen)
Example 13
def finetune():
    vocab = data.Vocabulary()
    data.build_vocab(vocab, config.vector_file)  # build vocabulary

    train_data = data.Sentiment(config.finetune_train_file, vocab)

    train_dataloader = DataLoader(train_data,
                                  batch_size=config.TRAIN_BATCH_SIZE,
                                  shuffle=True,
                                  collate_fn=data.collate_fn)

    valid_data = data.Sentiment(config.finetune_valid_file, vocab)

    valid_dataloader = DataLoader(valid_data,
                                  batch_size=config.TRAIN_BATCH_SIZE,
                                  shuffle=True,
                                  collate_fn=data.collate_fn)

    test_data = data.Sentiment(config.finetune_test_file, vocab)

    test_dataloader = DataLoader(test_data,
                                 batch_size=config.TRAIN_BATCH_SIZE,
                                 shuffle=True,
                                 collate_fn=data.collate_fn)

    classifier = models.FinetuneModel1(vocab_size=vocab.n_words,
                                       emb_dim=config.DIM,
                                       hidden_size=config.HIDDEN_SIZE,
                                       num_layer=config.NUM_LAYER,
                                       dropout=config.drop_out,
                                       bidirectional=config.bidirectional,
                                       label_size=config.label_class,
                                       hidden_size1=128,
                                       use_pretrain=True,
                                       embed_matrix=vocab.vector,
                                       embed_freeze=False).to(config.device)

    model_dict = classifier.state_dict()

    pretrained_model = torch.load(config.model_path)

    # drop keys from the pretrained dict that are not in model_dict

    pretrained_dict = dict()

    for k, v in pretrained_model.items():
        if k == 'state_dict':
            for kk, vv in v.items():
                if kk in model_dict:
                    pretrained_dict[kk] = vv

    # update the existing model_dict
    model_dict.update(pretrained_dict)

    # load the model_dict that is actually needed
    classifier.load_state_dict(model_dict)

    # freeze the network parameters so they are not updated
    for param in classifier.parameters():
        param.requires_grad = False

    # make the parameters of the final layer trainable
    for param in classifier.final.parameters():
        param.requires_grad = True

    # new_model = models.FinetuneModel(classifier, hidden_size1=128, class_size=2)
    # print(new_model)

    criterion = nn.NLLLoss()
    # optimizer = torch.optim.Adam(classifier.parameters())
    # optimizer = torch.optim.RMSprop(classifier.parameters(), lr=0.001, alpha=0.9, momentum=0.2)
    optimizer = torch.optim.Adadelta(filter(lambda p: p.requires_grad,
                                            classifier.parameters()),
                                     lr=0.01,
                                     rho=0.9,
                                     eps=1e-06,
                                     weight_decay=0)
    # optimizer = torch.optim.RMSprop(classifier.parameters())

    best_f1 = 0

    for epoch in range(config.finetune_epochs):

        # lr update
        # adjust_learning_rate(optimizer, epoch)
        # check whether different optimizers adapt the learning rate on their own
        for param_group in optimizer.param_groups:
            print("here lr :{}".format(param_group['lr']))

        logging.info("epoch {0:04d}".format(epoch))
        main.train(train_dataloader, classifier, criterion, optimizer, epoch,
                   config.finetune_batch_size, config.silent)
        test_f1, val_loss = main.test(valid_dataloader, classifier, criterion,
                                      epoch, config.finetune_batch_size,
                                      config.silent)

        is_best = test_f1 > best_f1  # True or False
        best_f1 = max(test_f1, best_f1)

        logging.info("best f1 is {}".format(best_f1))
        main.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': classifier.state_dict(),
                'acc': test_f1,
                'best_acc': best_f1,
                'optimizer': optimizer.state_dict(),
            },
            is_best,
            checkpoint='../output/',
            save_file='finetune_model_best.pth.tar')

    predict.predict(classifier, test_dataloader, config.silent)
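Example 13 freezes every parameter, re-enables only the final layer, and hands just the trainable parameters to the optimizer. A compact sketch of that pattern, with a hypothetical TinyClassifier in place of models.FinetuneModel1:

import torch
import torch.nn as nn

class TinyClassifier(nn.Module):
    # hypothetical stand-in for the project's classifier
    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(16, 8)
        self.final = nn.Linear(8, 2)

    def forward(self, x):
        return self.final(self.encoder(x))

model = TinyClassifier()

for p in model.parameters():           # freeze everything
    p.requires_grad = False
for p in model.final.parameters():     # unfreeze only the final layer
    p.requires_grad = True

# the optimizer only sees parameters that will actually be updated
optimizer = torch.optim.Adadelta(
    filter(lambda p: p.requires_grad, model.parameters()), lr=0.01)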
Example 14
def main(args):
    print "main"
    """
  SEED
  """
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.gpu_id > -1:
        torch.cuda.manual_seed(args.seed)
    """
  DATA
  """
    train, val, test = get_nli_text(args.train_lbls_file, args.train_src_file, args.val_lbls_file, \
                                      args.val_src_file, args.test_lbls_file, args.test_src_file, \
                                      args.max_train_sents, args.max_val_sents, args.max_test_sents, args.remove_dup)

    word_vecs = build_vocab(
        train['hypoths'] + val['hypoths'] + test['hypoths'] +
        train['premises'] + val['premises'] + test['premises'], args.embdfile,
        args.lorelei_embds)
    args.word_emb_dim = len(word_vecs[word_vecs.keys()[0]])

    nli_model_configs = get_model_configs(args, len(word_vecs))

    lbls_file = args.train_lbls_file
    if "mpe" in lbls_file or "snli" in lbls_file or "multinli" in lbls_file or "sick" in lbls_file or "joci" in lbls_file or "glue" in lbls_file:
        nli_model_configs["n_classes"] = 3
    elif "spr" in lbls_file or "dpr" in lbls_file or "fnplus" in lbls_file or "add_one" in lbls_file or "scitail" in lbls_file:
        nli_model_configs["n_classes"] = 2

    # define premise and hypoth encoders
    premise_encoder = eval(
        nli_model_configs['encoder_type'])(nli_model_configs)
    hypoth_encoder = eval(nli_model_configs['encoder_type'])(nli_model_configs)
    shared_nli_net = SharedNLINet(nli_model_configs, premise_encoder,
                                  hypoth_encoder)
    shared_hypoth_net = SharedHypothNet(nli_model_configs, hypoth_encoder)
    print(shared_nli_net)
    print(shared_hypoth_net)

    if args.pre_trained_model:
        print "Pre_trained_model: " + args.pre_trained_model
        pre_trained_model = torch.load(args.pre_trained_model)

        shared_nli_net_params = shared_nli_net.state_dict()
        pre_trained_params = pre_trained_model.state_dict()
        assert shared_nli_net_params.keys() == pre_trained_params.keys(
        ), "loaded model has different parameter state names than NLI_HYPOTHS_NET"
        for key, parameters in shared_nli_net_params.items():
            if parameters.size() == pre_trained_params[key].size():
                shared_nli_net_params[key] = pre_trained_params[key]
        shared_nli_net.load_state_dict(shared_nli_net_params)

    print(shared_nli_net)

    if args.pre_trained_adv_model:
        print "Pre_trained_adv_model: " + args.pre_trained_adv_model
        pre_trained_model = torch.load(args.pre_trained_adv_model)

        shared_hypoth_net_params = shared_hypoth_net.state_dict()
        pre_trained_params = pre_trained_model.state_dict()
        assert shared_hypoth_net_params.keys() == pre_trained_params.keys(
        ), "loaded model has different parameter state names than NLI_HYPOTHS_NET"
        for key, parameters in shared_hypoth_net_params.items():
            if parameters.size() == pre_trained_params[key].size():
                shared_hypoth_net_params[key] = pre_trained_params[key]
        shared_hypoth_net.load_state_dict(shared_hypoth_net_params)

    print(shared_hypoth_net)

    # nli loss
    weight = torch.FloatTensor(args.n_classes).fill_(1)
    loss_fn_nli = nn.CrossEntropyLoss(weight=weight)
    loss_fn_nli.size_average = False

    # hypoth (adversarial) loss
    weight = torch.FloatTensor(args.n_classes).fill_(1)
    loss_fn_hypoth = nn.CrossEntropyLoss(weight=weight)
    loss_fn_hypoth.size_average = False

    # optimizer
    optim_fn, optim_params = get_optimizer(args.optimizer)
    optimizer_nli = optim_fn(shared_nli_net.parameters(), **optim_params)
    #optimizer_hypoth = optim_fn(shared_hypoth_net.parameters(), **optim_params)
    # only pass hypoth classifier params to avoid updating shared encoder params twice
    optimizer_hypoth = optim_fn(shared_hypoth_net.classifier.parameters(),
                                **optim_params)

    if args.gpu_id > -1:
        shared_nli_net.cuda()
        shared_hypoth_net.cuda()
        loss_fn_nli.cuda()
        loss_fn_hypoth.cuda()
    """
  TRAIN
  """
    global val_acc_best, lr, stop_training, adam_stop
    val_acc_best = -1e10
    adam_stop = False
    stop_training = False
    lr = optim_params['lr'] if 'sgd' in args.optimizer else None
    """
  Train model on Natural Language Inference task
  """
    epoch = 1

    while not stop_training and epoch <= args.n_epochs:
        train_acc_nli, train_acc_hypoth, shared_nli_net, shared_hypoth_net = trainepoch(
            epoch, train, optimizer_nli, optimizer_hypoth, args, word_vecs,
            shared_nli_net, shared_hypoth_net, loss_fn_nli, loss_fn_hypoth,
            args.adv_lambda, args.adv_hyp_encoder_lambda)
        eval_acc_nli, eval_acc_hypoth = evaluate(epoch,
                                                 val,
                                                 optimizer_nli,
                                                 optimizer_hypoth,
                                                 args,
                                                 word_vecs,
                                                 shared_nli_net,
                                                 shared_hypoth_net,
                                                 'valid',
                                                 adv_lambda=args.adv_lambda)
        epoch += 1
Example 15
print(params)


"""
SEED
"""
np.random.seed(params.seed)
torch.manual_seed(params.seed)
torch.cuda.manual_seed(params.seed)

"""
DATA
"""
train, valid, test = get_nli(params.nlipath)
word_vec = build_vocab(train['s1'] + train['s2'] +
                       valid['s1'] + valid['s2'] +
                       test['s1'] + test['s2'], W2V_PATH)

for split in ['s1', 's2']:
    for data_type in ['train', 'valid', 'test']:
        eval(data_type)[split] = np.array([['<s>'] +
            [word for word in sent.split() if word in word_vec] +
            ['</s>'] for sent in eval(data_type)[split]])

params.word_emb_dim = 300


"""
MODEL
"""
# model config
Example 16
        print('Guess: %s' % sent)
        print(
            '------------------------------------------------------------------------'
        )


f2i = {}

v1 = [0]
v2 = [1]

if FLAGS.sharedv is True:
    v1.append(1)
    v2.append(0)

vocab1 = build_vocab(v1, {FLAGS.train, FLAGS.test})
vocab2 = build_vocab(v2, {FLAGS.train, FLAGS.test})

embed1 = Word2VecModel(FLAGS.embed1, vocab1, FLAGS.unif)

print('Loaded word embeddings: ' + FLAGS.embed1)

if FLAGS.embed2 is None:
    print('No embed2 found, using embed1 for both')
    FLAGS.embed2 = FLAGS.embed1

embed2 = Word2VecModel(FLAGS.embed2, vocab2, FLAGS.unif)
print('Loaded word embeddings: ' + FLAGS.embed2)

ts = load_sentences(FLAGS.train, embed1.vocab, embed2.vocab, FLAGS.mxlen,
                    FLAGS.batchsz)
Example 17
    language = args.language
    model_name = args.model
    embed = args.embedding
    using_word = args.use_word

    # assemble the configuration parameters
    config = Config(datasets_path,language,model_name,embed,using_word)

    # build the dataset
    start_time = time.time()
    print("Loading data...")
    # load the dataset
    train_data, dev_data, test_data = build_dataset(config)                                             
    print (len(train_data[1]))
    # build the vocabulary
    vocab_class = build_vocab(config,train_data)                                                        
    config.class_num = vocab_class.label_num                         
    # build the pretrained word embeddings
    config.embedding_pretrained = construct_embedding(config,vocab_class)                               
    # build the data iterators
    train_iter, dev_iter, test_iter = build_iterator(config,vocab_class,train_data,dev_data,test_data)  
    end_time = time.time()
    print("Time usage:", end_time - start_time)

    # load the model
    model = load_model(config)      
    init_network(model)
    print(f'The model has {count_parameters(model):,} trainable parameters')                                                                    
    # train the model
    best_valid_loss = float('inf')
    for epoch in range(config.epoch_num):
Example 18
            params.outputdir + "/" + params.outputmodelname + "/" +
            'commandline_args.txt', 'w') as f:
        args = parser.parse_args()
        json.dump(args.__dict__, f, indent=2)
"""
SEED
"""
np.random.seed(params.seed)
torch.manual_seed(params.seed)
torch.cuda.manual_seed(params.seed)
"""
DATA
"""
train, valid, test = get_nli(params.dataset_path)
word_vec = build_vocab(
    train['s1'] + train['s2'] + valid['s1'] + valid['s2'] + test['s1'] +
    test['s2'], params.vector_rep)

for split in ['s1', 's2']:
    for data_type in ['train', 'valid', 'test']:
        eval(data_type)[split] = np.array(
            [['<s>'] + [word for word in sent.split() if word in word_vec] +
             ['</s>'] for sent in eval(data_type)[split]])

params.word_emb_dim = 300
"""
MODEL
"""

# model
encoder_types = [
Example 19
    dtype = torch.FloatTensor

    #Print Flags
    for key, value in vars(FLAGS).items():
        print(key + ' : ' + str(value))


main()

nli_path = nli_DEFAULT
glove_path = glove_DEFAULT

train, dev, test = get_nli(nli_path)
vocab, embeddings = build_vocab(
    train['s1'] + train['s2'] + test['s1'] + test['s2'] + dev['s1'] +
    dev['s2'], glove_path)

config = {
    'n_words': len(embeddings),
    'emb_dim': FLAGS.emb_dim,
    'lstm_dim': FLAGS.lstm_dim,
    'dpout': FLAGS.dpout,
    'fc_dim': FLAGS.fc_dim,
    'b_size': FLAGS.bsize,
    'n_classes': FLAGS.n_classes,
    'model_name': FLAGS.model_name,
}

# prepend <s> and append </s> to every sentence; also drop words for which no embedding exists
Example 20
from model import w2v
from data import build_vocab
from embedding import gen_word2vec as word2vec

sess = tf.InteractiveSession()

# config
config = {
    'batch_size'      : 16,
    'embed_size'      : 200,
    'neg_sample_size' : 100,
}

# data
data, unique_neg_data, idx2word, word2idx, vocab = build_vocab()

name2idx = dict([(name, idx) for idx, name in enumerate(data.keys())])
idx2name = dict([(idx, name) for idx, name in enumerate(data.keys())])

vocab_size = len(idx2word)
character_size = len(data)

# model
(words, counts, words_per_epoch, epoch, words, pos_x, pos_y) = word2vec.skipgram(
    filename = 'text8',
    batch_size = 16,
    window_size = 2,
    min_count = 5,
    subsample = 1e-3
)
Example 21
import torch.optim as optim
import pandas as pd
from torch.utils.data import DataLoader
import numpy as np
from torch.autograd import Variable
import torch.nn.functional as F

if __name__ == "__main__":

    print("starting...")
    # prepare data
    csv_dataset = pd.read_csv(config.file_name,
                              header=None)  # csv_file format: dataframe
    print("data loaded")
    vocab = data.Vocabulary()
    data.build_vocab(vocab)  # build vocabulary

    print("build vocab success")
    train_data = data.sentimentDataset(vocab,
                                       csv_dataset,
                                       train_size=config.TRAIN_RATIO,
                                       test_size=config.TEST_RATIO,
                                       train=True)
    test_data = data.sentimentDataset(vocab, csv_dataset, train=False)

    train_dataloader = DataLoader(train_data,
                                  batch_size=config.TRAIN_BATCH_SIZE,
                                  shuffle=True,
                                  collate_fn=data.collate_fn)
    test_dataloader = DataLoader(test_data,
                                 batch_size=config.TEST_BATCH_SIZE,
Example 22
def main():
    global args
    args = parser.parse_args()
    if args.save == '':
        args.save = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    save_path = os.path.join(args.results_dir, args.save)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    setup_logging(os.path.join(save_path, 'log.txt'))
    checkpoint_file = os.path.join(save_path, 'checkpoint_epoch_%s.pth.tar')

    logging.debug("run arguments: %s", args)
    logging.info("using pretrained cnn %s", args.cnn)
    cnn = resnet.__dict__[args.cnn](pretrained=True)

    vocab = build_vocab()
    model = CaptionModel(cnn, vocab,
                         embedding_size=args.embedding_size,
                         rnn_size=args.rnn_size,
                         num_layers=args.num_layers,
                         share_embedding_weights=args.share_weights)

    train_data = get_iterator(get_coco_data(vocab, train=True),
                              batch_size=args.batch_size,
                              max_length=args.max_length,
                              shuffle=True,
                              num_workers=args.workers)
    val_data = get_iterator(get_coco_data(vocab, train=False),
                            batch_size=args.eval_batch_size,
                            max_length=args.max_length,
                            shuffle=False,
                            num_workers=args.workers)

    if 'cuda' in args.type:
        cudnn.benchmark = True
        model.cuda()

    optimizer = select_optimizer(
        args.optimizer, params=model.parameters(), lr=args.lr)
    regime = lambda e: {'lr': args.lr * (args.lr_decay ** e),
                        'momentum': args.momentum,
                        'weight_decay': args.weight_decay}
    model.finetune_cnn(False)

    def forward(model, data, training=True, optimizer=None):
        use_cuda = 'cuda' in args.type
        loss = nn.CrossEntropyLoss()
        perplexity = AverageMeter()
        batch_time = AverageMeter()
        data_time = AverageMeter()

        if training:
            model.train()
        else:
            model.eval()

        end = time.time()
        for i, (imgs, (captions, lengths)) in enumerate(data):
            data_time.update(time.time() - end)
            if use_cuda:
                imgs = imgs.cuda()
                captions = captions.cuda(async=True)
            imgs = Variable(imgs, volatile=not training)
            captions = Variable(captions, volatile=not training)
            input_captions = captions[:-1]
            target_captions = pack_padded_sequence(captions, lengths)[0]

            pred, _ = model(imgs, input_captions, lengths)
            err = loss(pred, target_captions)
            perplexity.update(math.exp(err.data[0]))

            if training:
                optimizer.zero_grad()
                err.backward()
                clip_grad_norm(model.rnn.parameters(), args.grad_clip)
                optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if i % args.print_freq == 0:
                logging.info('{phase} - Epoch: [{0}][{1}/{2}]\t'
                             'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                             'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                             'Perplexity {perp.val:.4f} ({perp.avg:.4f})'.format(
                                 epoch, i, len(data),
                                 phase='TRAINING' if training else 'EVALUATING',
                                 batch_time=batch_time,
                                 data_time=data_time, perp=perplexity))

        return perplexity.avg

    for epoch in range(args.start_epoch, args.epochs):
        if epoch >= args.finetune_epoch:
            model.finetune_cnn(True)
        optimizer = adjust_optimizer(
            optimizer, epoch, regime)
        # Train
        train_perp = forward(
            model, train_data, training=True, optimizer=optimizer)
        # Evaluate
        val_perp = forward(model, val_data, training=False)

        logging.info('\n Epoch: {0}\t'
                     'Training Perplexity {train_perp:.4f} \t'
                     'Validation Perplexity {val_perp:.4f} \n'
                     .format(epoch + 1, train_perp=train_perp, val_perp=val_perp))
        model.save_checkpoint(checkpoint_file % (epoch + 1))
Example 23
args = parser.parse_args()
gpu = not args.nogpu

if path.exists(args.outdir) is False:
    print('Creating path: %s' % (args.outdir))
    makedirs(args.outdir)

f2i = {}
v1 = [0]
v2 = [1]

if args.sharedv is True:
    v1.append(1)
    v2.append(0)

vocab1 = build_vocab(v1, {args.train, args.test})
vocab2 = build_vocab(v2, {args.train, args.test})

embed1 = Word2VecModel(args.embed1, vocab1, args.unif)

print('Loaded word embeddings: ' + args.embed1)

if args.embed2 is None:
    print('No embed2 found, using embed1 for both')
    args.embed2 = args.embed1

embed2 = Word2VecModel(args.embed2, vocab2, args.unif)
print('Loaded word embeddings: ' + args.embed2)

ts = load_sentences(args.train, embed1.vocab, embed2.vocab, args.mxlen,
                    args.batchsz, long_0_tensor_alloc)
Example 24
            embed[j, i, :] = word_vec[batch[i][j]]
    return torch.from_numpy(embed).float(), lengths
'''

GLOVE_PATH = '<glove>/<path>'

wenda_infersent = torch.load('./glove_modeldir/GloVe.pickle')
wenda_infersent.encoder.enc_lstm.flatten_parameters()

train, valid, test = get_nli('./<corpus>/<path>')

train['s1'] = list(set(train['s1']))
train['s2'] = list(set(train['s2']))
print(len(train['s1']))

word_vec = build_vocab(train['s1'], GLOVE_PATH)

for split in ['s1', 's2']:
    for data_type in ['train']:
        eval(data_type)[split] = np.array(
            [[word for word in list(sent) if word in word_vec]
             for sent in eval(data_type)[split]])

permutation = np.random.permutation(len(train['s1']))

s1 = train['s1'][permutation]
#word_vec = build_vocab(s1, GLOVE_PATH)

print([''.join(sent) for sent in s1[:50]])

wenda_cod = wenda_infersent.encoder
Example 25
parser.add_argument('--init_embedding',
                    action='store_true',
                    help='whether init embedding')
parser.add_argument('--embedding_source',
                    type=str,
                    default='./',
                    help='pretrained embedding path')

args = parser.parse_args()

if __name__ == '__main__':
    train = data.load_data('train.json', args.word_base)
    test = data.load_data('test.json', args.word_base)
    # train = data.load_data('train_squad.json', args.word_base)
    # test = data.load_data('dev_squad.json', args.word_base)
    vocabulary, pad_lens = data.build_vocab(train, test, args.vocab_size)
    print('Vocab size: %d | Max context: %d | Max question: %d' %
          (len(vocabulary), pad_lens[0], pad_lens[1]))
    train, valid = data.split_exp(train, args.valid_ratio)
    print('Train: %d | Valid: %d | Test: %d' %
          (len(train), len(valid), len(test)))
    train_engine = DataLoader(data.DataEngine(train, vocabulary, pad_lens),
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=8,
                              pin_memory=use_cuda)
    valid_engine = DataLoader(data.DataEngine(valid, vocabulary, pad_lens),
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=8,
                              pin_memory=use_cuda)
Example 26
def main():

    best_f1 = 0
    print(config.device)

    vocab = data.Vocabulary()
    data.build_vocab(vocab, config.vector_file)  # build vocabulary

    train_data = data.Sentiment(config.train_file, vocab)

    train_dataloader = DataLoader(train_data,
                                  batch_size=config.TRAIN_BATCH_SIZE,
                                  shuffle=True,
                                  collate_fn=data.collate_fn)

    test_data = data.Sentiment(config.test_file, vocab)

    test_dataloader = DataLoader(test_data,
                                 batch_size=config.TRAIN_BATCH_SIZE,
                                 shuffle=True,
                                 collate_fn=data.collate_fn)

    # classifier = models.RNNClassifier(nembedding=config.DIM,
    #                                   hidden_size=config.HIDDEN_SIZE,
    #                                   num_layer=config.NUM_LAYER,
    #                                   dropout=config.drop_out,
    #                                   vocab_size=vocab.n_words,
    #                                   use_pretrain=True,
    #                                   embed_matrix=vocab.vector,
    #                                   embed_freeze=False,
    #                                   label_size=config.label_class).to(config.device)

    classifier = models.Attentionclassifier(vocab_size=vocab.n_words,
                                            emb_dim=config.DIM,
                                            hidden_size=config.HIDDEN_SIZE,
                                            num_layer=config.NUM_LAYER,
                                            dropout=config.drop_out,
                                            bidirectional=config.bidirectional,
                                            label_size=config.label_class,
                                            use_pretrain=True,
                                            embed_matrix=vocab.vector,
                                            embed_freeze=False).to(
                                                config.device)

    criterion = nn.NLLLoss()
    # optimizer = torch.optim.Adam(classifier.parameters())
    optimizer = torch.optim.RMSprop(classifier.parameters(),
                                    lr=config.LR,
                                    alpha=0.9,
                                    momentum=0.2)
    # optimizer = torch.optim.RMSprop(classifier.parameters())

    # optimizer, scheduler = adam_optimizers(classifier.parameters())

    # optimizer = torch.optim.Adadelta(classifier.parameters(), lr=config.LR, rho=0.9, eps=1e-06, weight_decay=0)

    for epoch in range(config.epochs):

        # lr update
        adjust_learning_rate(optimizer, epoch)
        # check whether different optimizers adapt the learning rate on their own
        # for param_group in optimizer.param_groups:
        #     print("here lr :{}".format(param_group['lr']))

        logging.info("epoch {0:04d}".format(epoch))
        train(train_dataloader, classifier, criterion, optimizer, epoch,
              config.TRAIN_BATCH_SIZE, config.silent)
        test_f1, val_loss = test(test_dataloader, classifier, criterion, epoch,
                                 config.TRAIN_BATCH_SIZE, config.silent)

        # scheduler.step(val_loss)

        is_best = test_f1 > best_f1  # True or False
        best_f1 = max(test_f1, best_f1)

        logging.info("best f1 is {}".format(best_f1))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': classifier.state_dict(),
                'acc': test_f1,
                'best_acc': best_f1,
                'optimizer': optimizer.state_dict(),
            },
            is_best,
            checkpoint='../output/')
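Examples 13 and 26 call a save_checkpoint helper with a state dict and an is_best flag, but the helper itself is not shown in this listing. A plausible minimal version, assuming it simply writes the latest checkpoint and copies it when the score improves (not the project's actual implementation):

import os
import shutil
import torch

def save_checkpoint(state, is_best, checkpoint='../output/',
                    save_file='model_best.pth.tar'):
    # hypothetical helper mirroring how it is called above
    os.makedirs(checkpoint, exist_ok=True)
    last_path = os.path.join(checkpoint, 'checkpoint.pth.tar')
    torch.save(state, last_path)     # always keep the latest state
    if is_best:                      # keep a copy of the best model so far
        shutil.copyfile(last_path, os.path.join(checkpoint, save_file))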
Example 27
# get lengths for each dialogue
train_fr_lens = get_lengths(os.path.join(file_path, 'dataset/data/Friends'),
                            'friends_train')
train_ep_lens = get_lengths(
    os.path.join(file_path, 'dataset/data/EmotionPush'), 'emotionpush_train')
train_lens = np.append(train_fr_lens, train_ep_lens)

dev_fr_lens = get_lengths(os.path.join(file_path, 'dataset/data/Friends'),
                          'friends_dev')
dev_ep_lens = get_lengths(os.path.join(file_path, 'dataset/data/EmotionPush'),
                          'emotionpush_dev')

all_data = [
    line.rstrip() for line in open(os.path.join(DATA_PATH, 'data-all.en'), 'r')
]
word_vec = build_vocab(all_data, GLOVE_PATH)
# add <s> and </s> to each of the sentences
for data_type in ['train', 'dev_fr', 'dev_ep']:
    eval(data_type)['sent'] = np.array(
        [['<s>'] + [word
                    for word in sent.split() if word in word_vec] + ['</s>']
         for sent in eval(data_type)['sent']])
"""
MODEL
"""
model = BLSTMAttnNet(embed_size=params.embed_size,
                     lstm_dim=params.lstm_dim,
                     fc_dim=params.fc_dim,
                     num_classes=params.num_classes,
                     max_sent_len=params.max_sent_len,
                     attn_dropout=params.attn_dropout,
Example n. 28
0
with open(params.hypes, 'rb') as f:
    json_config = json.load(f)

data_dir = json_config['data_dir']
prefix = json_config[params.corpus]
glove_path = json_config['glove_path']

if params.char and params.corpus == "gw_cn_5":
    prefix = prefix.replace('discourse', 'discourse_char')

"""
DATA
"""
train, valid, test = get_dis(data_dir, prefix, params.corpus)
word_vec = build_vocab(train['s1'] + train['s2'] +
                       valid['s1'] + valid['s2'] +
                       test['s1'] + test['s2'], glove_path)

# instead of mapping unknown words to <unk>, this directly drops them
for split in ['s1', 's2']:
    for data_type in ['train', 'valid', 'test']:
        eval(data_type)[split] = np.array([['<s>'] +
                                           [word for word in sent.split() if word in word_vec] +
                                           ['</s>'] for sent in eval(data_type)[split]])

params.word_emb_dim = 300

dis_labels = get_labels(params.corpus)
label_size = len(dis_labels)

"""
def main(args):

    GLOVE_PATH = "dataset/GloVe/glove.840B.300d.txt"

    parser = argparse.ArgumentParser(description='NLI training')
    # paths
    parser.add_argument("--nlipath",
                        type=str,
                        default='dataset/SNLI/',
                        help="NLI data path (SNLI or MultiNLI)")
    parser.add_argument("--outputdir",
                        type=str,
                        default='savedir/',
                        help="Output directory")
    parser.add_argument("--outputmodelname", type=str, default='model.pickle')

    # dataset, dimensions, transfer learning
    parser.add_argument("--dataset",
                        type=str,
                        required=True,
                        help="Semantic similarity dataset")
    parser.add_argument('--dimension',
                        nargs='+',
                        required=True,
                        help='Dimension(s) on the dataset')
    parser.add_argument('--transfer',
                        default='DNT',
                        help='Transfer learning approach')
    parser.add_argument('--save', default='no', help='Save trained model')
    parser.add_argument(
        '--load_model',
        default='no',
        help='If a model is loaded, skip training and just evaluate')

    # training
    parser.add_argument("--n_epochs", type=int, default=10)
    parser.add_argument("--batch_size", type=int, default=16)
    parser.add_argument("--dpout_model",
                        type=float,
                        default=0.,
                        help="encoder dropout")
    parser.add_argument("--dpout_fc",
                        type=float,
                        default=0.,
                        help="classifier dropout")
    parser.add_argument("--nonlinear_fc",
                        type=float,
                        default=0,
                        help="use nonlinearity in fc")
    parser.add_argument("--optimizer",
                        type=str,
                        default="sgd,lr=5",
                        help="adam or sgd,lr=0.1")
    parser.add_argument("--lrshrink",
                        type=float,
                        default=5,
                        help="shrink factor for sgd")
    parser.add_argument("--decay", type=float, default=1., help="lr decay")
    parser.add_argument("--minlr", type=float, default=1e-5, help="minimum lr")
    parser.add_argument("--max_norm",
                        type=float,
                        default=5.,
                        help="max norm (grad clipping)")

    # model
    parser.add_argument("--encoder_type",
                        type=str,
                        default='BLSTMEncoder',
                        help="see list of encoders")
    parser.add_argument("--enc_lstm_dim",
                        type=int,
                        default=2048,
                        help="encoder nhid dimension")
    parser.add_argument("--n_enc_layers",
                        type=int,
                        default=1,
                        help="encoder num layers")
    parser.add_argument("--fc_dim",
                        type=int,
                        default=512,
                        help="nhid of fc layers")
    parser.add_argument("--n_classes",
                        type=int,
                        default=3,
                        help="entailment/neutral/contradiction")
    parser.add_argument("--pool_type",
                        type=str,
                        default='max',
                        help="max or mean")

    # gpu
    parser.add_argument("--gpu_id", type=int, default=0, help="GPU ID")
    parser.add_argument("--seed", type=int, default=1236, help="seed")

    params, _ = parser.parse_known_args(args)

    # set gpu device
    torch.cuda.set_device(params.gpu_id)

    # print parameters passed, and all parameters
    #print('\ntogrep : {0}\n'.format(sys.argv[1:]))
    #print(params)

    def trainepoch(epoch):
        print('TRAINING : Epoch ' + str(epoch))
        nli_net.train()
        logs = []

        last_time = time.time()
        #correct = 0.
        # shuffle the data
        permutation = np.random.permutation(len(train['s1']))

        s1 = train['s1'][permutation]
        s2 = train['s2'][permutation]

        targets = [x[permutation] for x in train['labels']]

        # decay the learning rate (SGD only) after the first epoch
        if epoch > 1 and 'sgd' in params.optimizer:
            optimizer.param_groups[0]['lr'] *= params.decay
        #print('Learning rate : {0}'.format(optimizer.param_groups[0]['lr']))

        for stidx in range(0, len(s1), params.batch_size):
            tgt_batches = []
            # prepare batch
            s1_batch, s1_len = get_batch(s1[stidx:stidx + params.batch_size],
                                         word_vec)
            s2_batch, s2_len = get_batch(s2[stidx:stidx + params.batch_size],
                                         word_vec)
            s1_batch, s2_batch = Variable(s1_batch.cuda()), Variable(
                s2_batch.cuda())
            for i, _ in enumerate(MTL_index):
                tgt_batches.append(
                    Variable(
                        torch.FloatTensor(
                            targets[i][stidx:stidx +
                                       params.batch_size])).cuda())

            #for dim in [1,2,3,4]:
            # model forward
            outputs = nli_net((s1_batch, s1_len), (s2_batch, s2_len))

            # loss
            if params.transfer == 'DNT':
                #print(outputs[0])
                #print((tgt_batches[0] - 1)/(params.n_classes-1))
                losses = [
                    nli_net.loss_fn(outputs[i], (tgt_batches[i] - 1) /
                                    (params.n_classes - 1))
                    for i, _ in enumerate(MTL_index)
                ]
            elif params.transfer == 'NT':
                losses = [
                    nli_net.loss_fn(outputs[i], tgt_batches[i])
                    for i, _ in enumerate(MTL_index)
                ]
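            # In the DNT branch, (target - 1) / (n_classes - 1) rescales the
            # gold ratings from the [1, n_classes] range down to [0, 1] before
            # the loss is computed against the model output.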
            #if 'kl' in MTL_index:
            #    output1 = torch.log(output1)

            # built-in sum keeps the per-dimension losses in the autograd graph
            loss = sum(losses)

            #loss = loss1 + loss2 + loss3 + loss4# + loss5 + loss6 + loss7 + loss8
            #ADDED
            #optimizer.zero_grad()
            #loss1.backward(retain_graph=True)
            #loss2.backward(retain_graph=True)
            #loss3.backward(retain_graph=True)
            #loss4.backward(retain_graph=True)
            #optimizer.step()
            #END ADDED
            """
            if dim == 1:
                loss = nli_net.loss_fn(output1, tgt_batch1)
            elif dim == 2:
                loss = nli_net.loss_fn(output2, tgt_batch2)
            elif dim == 3:
                loss = nli_net.loss_fn(output3, tgt_batch3)
            elif dim == 4:
                loss = nli_net.loss_fn(output4, tgt_batch4)
            """
            # backward
            optimizer.zero_grad()
            loss.backward()

            # optimizer step
            optimizer.step()

    def evaluate(epoch,
                 eval_type='valid',
                 flag='',
                 correlation=spearmanr,
                 transfer='NT'):
        nli_net.eval()
        #correct = 0.
        preds = []
        r = np.arange(1, 1 + nli_net.n_classes)
        global val_acc_best, lr, stop_training, adam_stop

        if eval_type == 'valid':
            print('VALIDATION : Epoch {0}'.format(epoch))
            s1 = valid['s1']
            s2 = valid['s2']
            targets = valid['scores']
        elif eval_type == 'test':
            print('TEST : Epoch {0}'.format(epoch))
            s1 = test['s1']
            s2 = test['s2']
            targets = test['scores']
        elif eval_type == 'train':
            print('EVAL ON TRAIN : Epoch {0}'.format(epoch))
            s1 = train['s1']
            s2 = train['s2']
            targets = train['scores']
        else:
            raise ValueError('Wrong eval_type.')

        probas = [[] for _ in MTL_index]
        correct = 0.

        for i in range(0, len(s1), params.batch_size):
            # prepare batch
            s1_batch, s1_len = get_batch(s1[i:i + params.batch_size], word_vec)
            s2_batch, s2_len = get_batch(s2[i:i + params.batch_size], word_vec)
            s1_batch, s2_batch = Variable(s1_batch.cuda()), Variable(
                s2_batch.cuda())

            # model forward
            outputs = nli_net((s1_batch, s1_len), (s2_batch, s2_len))

            for i, _ in enumerate(MTL_index):
                if len(probas[i]) == 0:
                    probas[i] = outputs[i].data.cpu().numpy()
                else:
                    probas[i] = np.concatenate(
                        (probas[i], outputs[i].data.cpu().numpy()), axis=0)
            """
            if 2 in MTL_index:
                if 'e' in MTL_index:
                    tgt_batch2 = Variable(torch.LongTensor(target2[i:i + params.batch_size])).cuda()
                    pred2 = output2.data.max(1)[1]
                    correct += pred2.long().eq(tgt_batch2.data.long()).cpu().sum()
                else:
                    if len(probas2) == 0:
                        probas2 = output2.data.cpu().numpy()
                    else:
                        probas2 = np.concatenate((probas2, output2.data.cpu().numpy()), axis=0)
           """

        if transfer == 'NT':
            ret = [
                correlation(np.dot(x, r), y)[0]
                for x, y in zip(probas, targets)
            ]
        elif transfer == 'DNT':
            ret = [correlation(x, y)[0] for x, y in zip(probas, targets)]
        else:
            raise ValueError('Wrong transfer.')
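        # For 'NT', each x holds per-class probabilities and np.dot(x, r)
        # with r = [1, ..., n_classes] turns them into an expected rating:
        # probabilities [0.2, 0.5, 0.3] over 3 classes give
        # 0.2*1 + 0.5*2 + 0.3*3 = 2.1, which is then correlated with the
        # gold scores; 'DNT' correlates the raw model output directly.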
        """
        if 2 in MTL_index:      
            if 'e' in MTL_index:
                ret.append(round(100 * correct/len(s1), 2))
            else:
                yhat2 = np.dot(probas2, r)
                p2 = spearmanr(yhat2, target2)[0]
                ret.append(p2)
        else:
            ret.append(0)
        """

        return ret

    """
    SEED
    """
    np.random.seed(params.seed)
    torch.manual_seed(params.seed)
    torch.cuda.manual_seed(params.seed)
    """
    DATA
    """
    #for i in range(1,9):
    #    print(i)
    #    print('----------')
    dataset_path = {
        'stsbenchmark': '../stsbenchmark/',
        'sts12': '../SemEval12/',
        'sick': '../SICK/',
        'activities': '../human_activity_phrase_data/',
        'sag': '../ShortAnswerGrading_v2.0/data/processed/',
        'typed': '../SemEval13/typed/'
    }
    #MTL_index = [1,2,3,4, 'mse'] #'e'
    MTL_index = [int(x) for x in params.dimension]
    train, valid, test = get_sts(dataset_path[params.dataset], MTL_index,
                                 params.transfer, params.n_classes)

    word_vec = build_vocab(
        train['s1'] + train['s2'] + valid['s1'] + valid['s2'] + test['s1'] +
        test['s2'], GLOVE_PATH)

    for split in ['s1', 's2']:
        for data_type in ['train', 'valid', 'test']:
            eval(data_type)[split] = np.array(
                [[word for word in sent.split() if word in word_vec]
                 for sent in eval(data_type)[split]])
            #eval(data_type)[split] = np.array([['<s>'] +
            #    [word for word in sent.split() if word in word_vec or word[:2] == 'dc'] +
            #    ['</s>'] for sent in eval(data_type)[split]])

    params.word_emb_dim = 300
    """
    MODEL
    """
    # model config
    config_nli_model = {
        'n_words': len(word_vec),
        'word_emb_dim': params.word_emb_dim,
        'enc_lstm_dim': params.enc_lstm_dim,
        'n_enc_layers': params.n_enc_layers,
        'dpout_model': params.dpout_model,
        'dpout_fc': params.dpout_fc,
        'fc_dim': params.fc_dim,
        'bsize': params.batch_size,
        'n_classes': params.n_classes,
        'pool_type': params.pool_type,
        'nonlinear_fc': params.nonlinear_fc,
        'encoder_type': params.encoder_type,
        'use_cuda': True,
        'MTL_index': MTL_index,
        'transfer': params.transfer
    }

    # model
    encoder_types = [
        'BLSTMEncoder', 'BLSTMprojEncoder', 'BGRUlastEncoder',
        'InnerAttentionMILAEncoder', 'InnerAttentionYANGEncoder',
        'InnerAttentionNAACLEncoder', 'ConvNetEncoder', 'LSTMEncoder'
    ]
    assert params.encoder_type in encoder_types, "encoder_type must be in " + \
                                                    str(encoder_types)
    perfs_all = []
    for rd in range(1):
        print("Round", rd)
        if params.load_model == 'no':
            nli_net = NLINet(config_nli_model)
            nli_net.encoder = torch.load('encoder/infersent.allnli.pickle',
                                         map_location={
                                             'cuda:1': 'cuda:0',
                                             'cuda:2': 'cuda:0'
                                         })
        else:
            nli_net = torch.load(params.load_model)
        print(nli_net)

        # optimizer
        optim_fn, optim_params = get_optimizer(params.optimizer)
        optimizer = optim_fn(nli_net.parameters(), **optim_params)

        # cuda by default
        nli_net.cuda()
        """
        TRAIN
        """
        val_acc_best = -1e10
        adam_stop = False
        stop_training = False
        lr = optim_params['lr'] if 'sgd' in params.optimizer else None

        last_result = 0
        last_test_result = 0
        drop_count = 0
        """
        Train model on Natural Language Inference task
        """
        correlation = spearmanr if params.dataset == 'activities' else pearsonr
        epoch = 0
        perfs_valid = evaluate(epoch, 'valid', 'begin', correlation,
                               params.transfer)
        perfs_test = evaluate(epoch, 'test', 'begin', correlation,
                              params.transfer)
        print(perfs_valid, perfs_test)
        epoch += 1

        if params.load_model == 'no':
            while not stop_training and epoch <= params.n_epochs:
                trainepoch(epoch)
                perfs_valid = evaluate(epoch, 'valid', '', correlation,
                                       params.transfer)
                perfs_test = evaluate(epoch, 'test', '', correlation,
                                      params.transfer)
                print(perfs_valid, perfs_test)

                epoch += 1
            #perfs_all.append(perfs)
        if params.save != 'no':
            torch.save(nli_net, params.save)
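
The snippet above builds its optimizer through get_optimizer(params.optimizer), which turns strings such as 'sgd,lr=5' or 'adam' into an optimizer class plus keyword arguments. A minimal sketch of such a parser follows; the real helper is not shown, and restricting it to SGD and Adam is an assumption.

import torch.optim as optim


# Hypothetical minimal get_optimizer: parse e.g. 'sgd,lr=5' into
# (optim.SGD, {'lr': 5.0}); callers then do optim_fn(net.parameters(), **optim_params).
def get_optimizer(spec):
    parts = spec.split(',')
    name, kwargs = parts[0], {}
    for part in parts[1:]:
        key, value = part.split('=')
        kwargs[key] = float(value)
    optim_fns = {'sgd': optim.SGD, 'adam': optim.Adam}
    return optim_fns[name], kwargs
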
Example n. 30
0
    return train_losses, dev_losses, model_path


if __name__ == '__main__':
    data_dir = '../data/snli_1.0/'
    files = [
        data_dir + s for s in
        ['snli_1.0_train.jsonl', 'snli_1.0_dev.jsonl', 'snli_1.0_test.jsonl']
    ]

    vocab_file = data_dir + 'vocab.pkl'
    # includes slots for padding and UNK
    vocab_size = 30000
    vocab = []
    if not os.path.exists(vocab_file):
        build_vocab(files[0], vocab_file)
    vocab, vocab_idx = load_vocab(vocab_file)

    snli_train_file = data_dir + 'snli_train.pkl'
    if not os.path.exists(snli_train_file):
        preprocess_snli_jsonl(files[0], vocab_idx, snli_train_file, vocab_size)

    data = {}
    for f in ['train', 'dev', 'test']:
        data[f] = load_snli(data_dir + 'snli_%s.pkl' % f)

    batch_size = 256
    embedding_size = 300
    state_size = 512
    inverse_drop_rate = 0.8
    learning_rate = 3e-3
Example n. 31
0
# print parameters passed, and all parameters
print('\ntogrep : {0}\n'.format(sys.argv[1:]))
print(params)
"""
SEED
"""
np.random.seed(params.seed)
torch.manual_seed(params.seed)
torch.cuda.manual_seed(params.seed)
"""
DATA
"""
train, valid, test = get_nli(params.nlipath)
word_vec = build_vocab(
    train['s1'] + train['s2'] + valid['s1'] + valid['s2'] + test['s1'] +
    test['s2'], GLOVE_PATH)

for split in ['s1', 's2']:
    for data_type in ['train', 'valid', 'test']:
        eval(data_type)[split] = np.array([['<s>'] + [word for word in sent.split() if word in word_vec] +\
                                          ['</s>'] for sent in eval(data_type)[split]])

params.word_emb_dim = 300
#params.word_emb_dim = 512
"""
MODEL
"""
# model config
config_nli_model = {
    'n_words': len(word_vec),
Example n. 32
0
def fill_tre_with_vectors():
    train_tree, valid_tree, test_tree = get_SICK_tree_data()
    filename = "transformer_SICk"
    print(filename)
    parser = argparse.ArgumentParser(description='NLI training')
    # paths
    parser.add_argument("--nlipath",
                        type=str,
                        default='dataset/SNLI/',
                        help="NLI data path (SNLI or MultiNLI)")
    parser.add_argument("--outputdir",
                        type=str,
                        default='savedir/',
                        help="Output directory")
    parser.add_argument("--outputmodelname", type=str, default='model.pickle')
    parser.add_argument("--word_emb_path",
                        type=str,
                        default="glove.840B.300d.txt",
                        help="word embedding file path")

    # training
    parser.add_argument("--n_epochs", type=int, default=500)
    parser.add_argument("--batch_size", type=int, default=16)
    parser.add_argument("--dpout_model",
                        type=float,
                        default=0.1,
                        help="encoder dropout")
    parser.add_argument("--dpout_fc",
                        type=float,
                        default=0.1,
                        help="classifier dropout")
    parser.add_argument("--nonlinear_fc",
                        type=float,
                        default=5,
                        help="use nonlinearity in fc")
    parser.add_argument("--optimizer",
                        type=str,
                        default="sgd,lr=0.1",
                        help="adam or sgd,lr=0.1")
    parser.add_argument("--lrshrink",
                        type=float,
                        default=1,
                        help="shrink factor for sgd")
    parser.add_argument("--decay", type=float, default=0.99, help="lr decay")
    parser.add_argument("--minlr", type=float, default=1e-5, help="minimum lr")
    parser.add_argument("--max_norm",
                        type=float,
                        default=5.,
                        help="max norm (grad clipping)")

    # model
    parser.add_argument("--encoder_type",
                        type=str,
                        default='LSTMEncoder',
                        help="see list of encoders")
    parser.add_argument("--enc_lstm_dim",
                        type=int,
                        default=600,
                        help="encoder nhid dimension")
    parser.add_argument("--n_enc_layers",
                        type=int,
                        default=1,
                        help="encoder num layers")
    parser.add_argument("--fc_dim",
                        type=int,
                        default=150,
                        help="nhid of fc layers")
    parser.add_argument("--n_classes",
                        type=int,
                        default=2,
                        help="entailment/neutral/contradiction")
    parser.add_argument("--pool_type",
                        type=str,
                        default='max',
                        help="max or mean")

    # gpu
    parser.add_argument("--gpu_id", type=int, default=0, help="GPU ID")
    parser.add_argument("--seed", type=int, default=1234, help="seed")

    # data
    parser.add_argument("--word_emb_dim",
                        type=int,
                        default=300,
                        help="word embedding dimension")

    params, _ = parser.parse_known_args()

    # set gpu device
    torch.cuda.set_device(params.gpu_id)

    # print parameters passed, and all parameters
    print('\ntogrep : {0}\n'.format(sys.argv[1:]))
    print(params)
    """
    SEED
    """
    np.random.seed(params.seed)
    torch.manual_seed(params.seed)
    torch.cuda.manual_seed(params.seed)
    """
    DATA
    """
    #train, valid, test = get_nli(params.nlipath)

    train_tree, valid_tree, test_tree = get_SICK_tree_data()
    train, valid, test = get_SICK_data()

    word_vec = build_vocab(
        train['s1'] + train['s2'] + valid['s1'] + valid['s2'] + test['s1'] +
        test['s2'], params.word_emb_path)

    for i in range(len(train_tree['s1'])):
        x = deepcopy(assign_vectors(train_tree['s1'][i], word_vec))
        train_tree['s1'][i] = deepcopy(x)
        x = deepcopy(assign_vectors(train_tree['s2'][i], word_vec))
        train_tree['s2'][i] = deepcopy(x)

    for i in range(len(test_tree['s1'])):
        x = deepcopy(assign_vectors(test_tree['s1'][i], word_vec))
        test_tree['s1'][i] = deepcopy(x)
        x = deepcopy(assign_vectors(test_tree['s2'][i], word_vec))
        test_tree['s2'][i] = deepcopy(x)

    for i in range(len(valid_tree['s1'])):
        x = deepcopy(assign_vectors(valid_tree['s1'][i], word_vec))
        valid_tree['s1'][i] = deepcopy(x)
        x = deepcopy(assign_vectors(valid_tree['s2'][i], word_vec))
        valid_tree['s2'][i] = deepcopy(x)

    with open("sick_tree_data_tensor.pkl", "wb") as f:
        pickle.dump([train_tree, valid_tree, test_tree], f)

    return train_tree, valid_tree, test_tree
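
Most of the examples above rely on an InferSent-style build_vocab(sentences, glove_path) that returns a dict mapping each in-vocabulary word to its pretrained GloVe vector. A minimal sketch under that assumption follows; the individual repositories may implement it differently.

import numpy as np


# Hypothetical sketch of an InferSent-style build_vocab: collect the words
# that occur in the corpus, then keep only those that have a GloVe vector.
def build_vocab(sentences, glove_path):
    words = set(w for sent in sentences for w in sent.split())
    words.update(['<s>', '</s>'])
    word_vec = {}
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            word, vec = line.split(' ', 1)
            if word in words:
                word_vec[word] = np.array(vec.split(), dtype=np.float32)
    print('Found {0}/{1} words with glove vectors'.format(
        len(word_vec), len(words)))
    return word_vec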