Example #1
 def step(self, norm_type=2):
     """
     Run an update step, clipping the gradients first if a max norm is set.
     """
     if self.max_norm is not None:
         clip_grad_norm(self.params, self.max_norm, norm_type=norm_type)
     self.optim.step()
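Note: `clip_grad_norm` is the pre-0.4 spelling; in current PyTorch the same utility is the in-place `torch.nn.utils.clip_grad_norm_`, which rescales the gradients so their total norm is at most `max_norm` and returns the total norm measured before clipping. A minimal runnable sketch of the same step with that spelling (the tiny model and optimizer here are stand-ins, not part of the example above):

import torch
from torch.nn.utils import clip_grad_norm_

# Stand-in setup: a small linear model and an SGD optimizer.
model = torch.nn.Linear(10, 1)
optim = torch.optim.SGD(model.parameters(), lr=0.1)
max_norm = 1.0

loss = model(torch.randn(4, 10)).sum()
loss.backward()
if max_norm is not None:
    # Rescales gradients in place so that their combined 2-norm is <= max_norm.
    clip_grad_norm_(model.parameters(), max_norm, norm_type=2)
optim.step()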
Example #2
def train(train_loader, model, criterion, optimizer, epoch):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    if args.no_partialbn:
        model.module.partialBN(False)
    else:
        model.module.partialBN(True)

    # switch to train mode
    model.train()

    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        target = target.cuda(async=True)
        input_var = torch.autograd.Variable(input)
        target_var = torch.autograd.Variable(target)

        # compute output
        output = model(input_var)
        loss = criterion(output, target_var)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1,5))
        losses.update(loss.data[0], input.size(0))
        top1.update(prec1[0], input.size(0))
        top5.update(prec5[0], input.size(0))


        # compute gradient and do SGD step
        optimizer.zero_grad()

        loss.backward()

        if args.clip_gradient is not None:
            total_norm = clip_grad_norm(model.parameters(), args.clip_gradient)
            if total_norm > args.clip_gradient:
                print("clipping gradient: {} with coef {}".format(total_norm, args.clip_gradient / total_norm))

        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            print(('Epoch: [{0}][{1}/{2}], lr: {lr:.5f}\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                   epoch, i, len(train_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses, top1=top1, top5=top5, lr=optimizer.param_groups[-1]['lr'])))
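For reference, a hedged sketch of the same per-batch update written against PyTorch >= 1.0, where `Variable` wrappers are unnecessary, `async=True` is spelled `non_blocking=True`, scalar losses are read with `.item()` instead of `.data[0]`, and the clipping call is `clip_grad_norm_`; `model`, `criterion`, `optimizer`, and the `clip_gradient` threshold are carried over from the example above:

from torch.nn.utils import clip_grad_norm_

def train_step(input, target, model, criterion, optimizer, clip_gradient=None):
    # Modernized version of the loop body above (assumed equivalence, not the original code).
    input = input.cuda(non_blocking=True)
    target = target.cuda(non_blocking=True)

    output = model(input)
    loss = criterion(output, target)

    optimizer.zero_grad()
    loss.backward()
    if clip_gradient is not None:
        total_norm = clip_grad_norm_(model.parameters(), clip_gradient)
        if total_norm > clip_gradient:
            print("clipping gradient: {} with coef {}".format(
                total_norm, clip_gradient / total_norm))
    optimizer.step()
    return loss.item()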
Example #3
    def step(self):
        """Update the model parameters based on current gradients.

        Optionally, will employ gradient modification or update learning
        rate.
        """
        self._step += 1

        # Decay method used in tensor2tensor.
        if self.decay_method == "noam":
            self._set_rate(
                self.original_lr *
                (self.model_size ** (-0.5) *
                 min(self._step ** (-0.5),
                     self._step * self.warmup_steps**(-1.5))))

        if self.max_grad_norm:
            clip_grad_norm(self.params, self.max_grad_norm)
        self.optimizer.step()
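The "noam" branch above is the warmup-then-decay schedule popularized by tensor2tensor: the rate grows roughly linearly for `warmup_steps` steps and then decays as the inverse square root of the step count. A tiny sketch of just the rate computation, with the surrounding optimizer wrapper assumed:

def noam_rate(step, original_lr, model_size, warmup_steps):
    # lr = original_lr * model_size^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
    return original_lr * (model_size ** -0.5) * min(step ** -0.5,
                                                    step * warmup_steps ** -1.5)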
Example #4
def train(e, model, optimizer, train_iter, vocab_size, grad_clip, DE, EN):
    model.train()
    total_loss = 0
    pad = EN.vocab.stoi['<pad>']
    for b, batch in enumerate(train_iter):
        src, len_src = batch.src
        trg, len_trg = batch.trg
        src, trg = src.cuda(), trg.cuda()
        optimizer.zero_grad()
        output = model(src, trg)
        loss = F.cross_entropy(output[1:].view(-1, vocab_size),
                               trg[1:].contiguous().view(-1),
                               ignore_index=pad)
        loss.backward()
        clip_grad_norm(model.parameters(), grad_clip)
        optimizer.step()
        total_loss += loss.data[0]

        if b % 100 == 0 and b != 0:
            total_loss = total_loss / 100
            print("[%d][loss:%5.2f][pp:%5.2f]" %
                  (b, total_loss, math.exp(total_loss)))
            total_loss = 0
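The loss above skips the first decoder position (the start-of-sequence token) and uses `ignore_index` so that padded target positions contribute neither to the loss nor to the gradients. A self-contained sketch of that padding behaviour (the tensors here are made up for illustration):

import torch
import torch.nn.functional as F

pad = 0
logits = torch.randn(5, 7)                   # (positions, vocab_size)
targets = torch.tensor([3, 1, pad, 4, pad])  # padded positions carry the pad index
# Positions whose target equals `pad` are skipped when averaging the loss.
loss = F.cross_entropy(logits, targets, ignore_index=pad)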
Example #5
            log_prob_actions_v = batch_scale_v * log_prob_v[range(BATCH_SIZE),
                                                            batch_actions_t]
            loss_policy_v = -log_prob_actions_v.mean()

            loss_policy_v.backward(retain_graph=True)
            grads = np.concatenate([
                p.grad.data.cpu().numpy().flatten() for p in net.parameters()
                if p.grad is not None
            ])

            prob_v = F.softmax(logits_v)
            entropy_v = -(prob_v * log_prob_v).sum(dim=1).mean()
            entropy_loss_v = -ENTROPY_BETA * entropy_v
            loss_v = loss_policy_v + entropy_loss_v
            loss_v.backward()
            nn_utils.clip_grad_norm(net.parameters(), GRAD_L2_CLIP)
            optimizer.step()
            loss_v += loss_policy_v

            # calc KL-div
            new_logits_v = net(states_v)
            new_prob_v = F.softmax(new_logits_v)
            kl_div_v = -(
                (new_prob_v / prob_v).log() * prob_v).sum(dim=1).mean()
            writer.add_scalar("kl", kl_div_v.data.cpu().numpy()[0], step_idx)

            writer.add_scalar("baseline", baseline, step_idx)
            writer.add_scalar("entropy",
                              entropy_v.data.cpu().numpy()[0], step_idx)
            writer.add_scalar("batch_scales", np.mean(batch_scales), step_idx)
            writer.add_scalar("batch_scales_std", scale_std, step_idx)
Example #6
def train(lr, net, epoch, train_loader, valid_loader, transform,
          hyperparameters, batch_size):

    # register hypercurve
    agent = Agent(port=5001)
    hyperparameters['criteria'] = 'train loss'
    train_loss = agent.register(hyperparameters, 'loss')

    hyperparameters['criteria'] = 'valid loss'
    valid_loss = agent.register(hyperparameters, 'loss')

    hyperparameters['criteria'] = 'valid bleu'
    valid_bleu = agent.register(hyperparameters, 'bleu')

    hyperparameters['criteria'] = 'train bleu'
    train_bleu = agent.register(hyperparameters, 'bleu')

    hyperparameters['criteria'] = 'scheduled sampling probability'
    hyper_ssprob = agent.register(hyperparameters, 'probability')

    if torch.cuda.is_available():
        net.cuda()
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                        net.parameters()),
                                 lr=lr)
    net.train()

    best_score = -1
    global_steps = 0
    best_valid_loss = 10000
    for iepoch in range(epoch):

        new_epoch = False
        batchid = 0
        for (_, data) in enumerate(train_loader, 0):

            entext = data['entext']
            enlen = data['enlen']
            zhlabel = data['zhlabel']
            zhgtruth = data['zhgtruth']
            zhlen = data['zhlen']

            ssprob = max(math.exp(-(global_steps - 100000) / 500000), 0.8)
            print('scheduled sampling prob: ', ssprob)
            logits, predic = net(entext, zhgtruth, enlen, ssprob, True)
            loss = net.get_loss(logits, zhlabel)
            optimizer.zero_grad()
            loss.backward()
            utils.clip_grad_norm(net.parameters(), 5)
            optimizer.step()

            batchid += 1
            global_steps += 1

            print(global_steps, iepoch, batchid, sum(loss.data.cpu().numpy()))
            agent.append(train_loss, global_steps,
                         sum(loss.data.cpu().numpy()))
            agent.append(hyper_ssprob, global_steps, ssprob)

            if batchid % 50 == 0:
                net.eval()
                logits, predic = net(entext, zhgtruth, enlen, ssprob, True)

                tmppre = [0 for i in range(len(entext))]
                tmplabel = [0 for i in range(len(entext))]
                for i in range(len(entext)):
                    tmppre[i] = transform.clip(predic[i], language='zh')
                    tmplabel[i] = zhlabel[i][:zhlen[i]]

                tmpscore = bleuscore.score(tmppre, tmplabel)
                for i in range(25):
                    ans_ = transform.i2t(tmplabel[i], language='zh')
                    pre_ = transform.i2t(tmppre[i], language='zh')
                    print(ans_)
                    print(pre_)
                    print('-------------------\n')
                agent.append(train_bleu, global_steps, tmpscore)
                del logits, predic

            if batchid % 400 == 0:
                print('\n------------------------\n')
                net.eval()
                all_pre = []
                all_lable = []
                all_len = []
                all_loss = 0
                bats = 0
                for (_, data) in enumerate(valid_loader, 0):

                    entext = data['entext']
                    enlen = data['enlen']
                    zhlabel = data['zhlabel']
                    zhgtruth = data['zhgtruth']
                    zhlen = data['zhlen']

                    logits, predic = net(entext, zhgtruth, enlen, 0, False)
                    loss = net.get_loss(logits, zhlabel)

                    all_pre.extend(predic)
                    all_lable.extend(zhlabel)
                    all_len.extend(zhlen)
                    all_loss += sum(loss.data.cpu().numpy())

                    del loss, logits, predic
                    bats += 1

                for i in range(len(all_pre)):
                    all_pre[i] = transform.clip(all_pre[i], language='zh')
                    all_lable[i] = all_lable[i][:all_len[i]]

                score = bleuscore.score(all_pre, all_lable)

                for i in range(0, 600, 6):

                    ans_ = transform.i2t(all_lable[i], language='zh')
                    pre_ = transform.i2t(all_pre[i], language='zh')
                    print(ans_)
                    print(pre_)
                    print('-------------------\n')

                all_loss /= bats
                print(global_steps, iepoch, batchid, all_loss, score,
                      '\n********************\n')
                agent.append(valid_loss, global_steps, all_loss)
                agent.append(valid_bleu, global_steps, score)

                if best_valid_loss > all_loss or best_score < score:

                    best_valid_loss = all_loss
                    best_score = score
                    torch.save(
                        net.state_dict(), model_dir +
                        "ssprob-{:3f}-loss-{:3f}-steps-{:d}-model.pkl".format(
                            ssprob, all_loss, global_steps))
                del all_lable, all_len, all_loss, all_pre
            net.train()
Example #7
def train(hp_dict, args, data_dir, save_path):
    use_chars = hp_dict['char_dim'] > 0
    # load data
    dp = preprocessing()
    data = dp.preprocess(data_dir, no_training_set=False, use_chars=use_chars)

    # build minibatch loader
    train_batch_loader = mini_batch_loader(data.training,
                                           BATCH_SIZE,
                                           sample_rate=1.0,
                                           len_bin=hp_dict['use_bin'])
    valid_batch_loader = mini_batch_loader(data.validation,
                                           BATCH_SIZE,
                                           shuffle=False,
                                           len_bin=hp_dict['use_bin'])
    test_batch_loader = mini_batch_loader(data.test,
                                          BATCH_SIZE,
                                          shuffle=False,
                                          len_bin=hp_dict['use_bin'])

    logging.info("loading word2vec file ...")
    embed_init, embed_dim = \
        load_word2vec_embeddings(data.dictionary[0], hp_dict['embed_file'],EMBED_SIZE)
    logging.info("embedding dim: {}".format(embed_dim))
    logging.info("initialize model ...")

    model = GA_reader(hp_dict['nhidden'], data.vocab_size, embed_dim,
                      embed_init, hp_dict['train_emb'], use_chars,
                      hp_dict['char_nhidden'], data.n_chars,
                      hp_dict['char_dim'], hp_dict['nlayers'],
                      hp_dict['gating_fn'], hp_dict['use_feat'],
                      hp_dict['dropout'])

    if USE_CUDA:
        model.cuda()
    logging.info("Running on cuda: {}".format(USE_CUDA))
    # training phase
    opt = torch.optim.Adam(params=filter(lambda p: p.requires_grad,
                                         model.parameters()),
                           lr=LEARNING_RATE)

    shutil.copyfile('config.py', os.path.join(save_path, 'config.py'))
    #
    # load existing best model
    if os.path.isfile(os.path.join(save_path, 'best_model.pkl')):
        print('loading previously best model')
        model.load_state_dict(
            torch.load(os.path.join(save_path, 'best_model.pkl')))
    # load existing train_model
    elif os.path.isfile(os.path.join(save_path, 'init_model.pkl')):
        print('loading init model')
        model.load_state_dict(
            torch.load(os.path.join(save_path, 'init_model.pkl')))

    logging.info('-' * 50)
    logging.info("Start training ...")
    best_valid_acc = best_test_acc = 0
    for epoch in range(NUM_EPOCHS):
        new_max = False
        if epoch >= 2:
            for param_group in opt.param_groups:
                param_group['lr'] /= 2
        model.train()
        acc = loss = n_examples = it = 0
        start = time.time()

        for dw, dw_m, qw, qw_m, dt, qt, tt, tm, \
                answear, candidate, candi_m, cloze_pos, fnames in train_batch_loader:
            n_examples += dw.shape[0]
            feat = feat_fuc(dw, qw)
            #-------train-------#
            dw, dw_m, qw, qw_m, dt, qt, tt, tm, answear, candidate, candi_m, cloze_pos, feat = to_vars(
                [dw, dw_m, qw, qw_m, dt, qt, tt, tm, answear, candidate, candi_m, cloze_pos, feat],
                use_cuda=USE_CUDA)

            loss_, acc_ = model(dw, dw_m, qw, qw_m, dt, qt, tt, tm, answear,
                                candidate, candi_m, cloze_pos,
                                feat)  # tensor.float size 1
            #print(acc_.cpu().data.numpy())
            loss += loss_.cpu().data.numpy()[0]  # numpy [1]
            acc += acc_.cpu().data.numpy()[0]
            it += 1
            opt.zero_grad()
            loss_.backward()
            clip_grad_norm(parameters=filter(lambda p: p.requires_grad,
                                             model.parameters()),
                           max_norm=GRAD_CLIP)
            opt.step()
            if it % print_every == 0 \
                    or it % len(train_batch_loader) == 0:
                spend = (time.time() - start) / 60
                statement = "Epoch: {}, it: {} (max: {}), "\
                    .format(epoch, it, len(train_batch_loader))
                statement += "loss: {:.3f}, acc: {:.3f}, time: {:.1f}(m)"\
                    .format(loss / print_every, acc / n_examples, spend)
                logging.info(statement)
                del acc, loss, n_examples
                acc = loss = n_examples = 0
                start = time.time()
                # save every print
                torch.save(model.state_dict(),
                           os.path.join(save_path, 'init_model.pkl'))
                # torch.save(model,os.path.join(save_path,'init_model.pkl'))
#-------valid-------#
            if it % eval_every == 0:
                start = time.time()
                model.eval()
                test_loss, test_acc = evaluate(model, valid_batch_loader,
                                               USE_CUDA)
                spend = (time.time() - start) / 60
                statement = "Valid loss: {:.3f}, acc: {:.3f}, time: {:.1f}(m)"\
                    .format(test_loss, test_acc, spend)
                logging.info(statement)
                if best_valid_acc < test_acc:
                    best_valid_acc = test_acc
                    new_max = True
                    # store best valid model
                    torch.save(model.state_dict(),
                               os.path.join(save_path, 'best_model.pkl'))
                    #torch.save(model,os.path.join(save_path,'best_model.pkl'))
                logging.info("Best valid acc: {:.3f}".format(best_valid_acc))
                model.train()
                start = time.time()
#-------test-------#
        start = time.time()
        model.eval()
        test_loss, test_acc = evaluate(model, test_batch_loader, USE_CUDA)
        spend = (time.time() - start) / 60
        logging.info("Test loss: {:.3f}, acc: {:.3f}, time: {:.1f}(m)"\
                     .format(test_loss, test_acc, spend))
        if best_test_acc < test_acc:
            best_test_acc = test_acc
        logging.info("Best test acc: {:.3f}".format(best_test_acc))
Example #8
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch PennTreeBank RNN/LSTM Language Model')
    parser.add_argument('--data',
                        type=str,
                        default='../data/',
                        help='location of the data corpus')
    parser.add_argument('--presaved',
                        action='store_true',
                        help='use presaved data')
    parser.add_argument('--glovedata',
                        type=str,
                        default='../data/',
                        help='location of the pretrained glove embeddings')
    parser.add_argument('--din', type=int, default=30, help='length of LSTM')
    parser.add_argument('--demb',
                        type=int,
                        default=100,
                        help='size of word embeddings')
    parser.add_argument('--dhid',
                        type=int,
                        default=100,
                        help='number of hidden units per layer')
    parser.add_argument('--dout',
                        type=int,
                        default=2,
                        help='number of output classes')
    parser.add_argument('--nlayers',
                        type=int,
                        default=1,
                        help='number of layers')
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        help='initial learning rate')
    parser.add_argument('--clip',
                        type=float,
                        default=0.25,
                        help='gradient clipping')
    parser.add_argument('--embinit',
                        type=str,
                        default='random',
                        help='embedding weight initialization type')
    parser.add_argument('--decinit',
                        type=str,
                        default='random',
                        help='decoder weight initialization type')
    parser.add_argument('--hidinit',
                        type=str,
                        default='random',
                        help='recurrent hidden weight initialization type')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.0,
                        help='dropout applied to layers (0 = no dropout)')
    parser.add_argument('--reweight',
                        action='store_true',
                        help='reweight loss function')
    parser.add_argument('--epochs',
                        type=int,
                        default=50,
                        help='upper epoch limit')

    parser.add_argument('--clean', action='store_true', help='clean text')
    parser.add_argument('--rm_stops',
                        action='store_true',
                        help='remove stop words')

    parser.add_argument('--batchsize',
                        type=int,
                        default=2000,
                        metavar='N',
                        help='batch size')
    parser.add_argument('--seed', type=int, default=3, help='random seed')
    parser.add_argument('--vocabsize',
                        type=int,
                        default=200000,
                        help='vocabulary size')
    parser.add_argument('--optimizer',
                        action='store_true',
                        help='use ADAM optimizer')
    parser.add_argument('--cuda', action='store_true', help='use CUDA')
    parser.add_argument('--loginterval',
                        type=int,
                        default=20,
                        metavar='N',
                        help='report interval')
    parser.add_argument('--save',
                        type=str,
                        default='',
                        help='path to save the final model')
    args = parser.parse_args()

    pipe = None
    corpus = TacoText(args.vocabsize, lower=True, vocab_pipe=pipe)
    train_data = pd.read_csv('../data/train_data_shuffle.csv')
    valid_data = pd.read_csv('../data/val_data_shuffle.csv')
    train_data = train_data.fillna(' ')
    valid_data = valid_data.fillna(' ')

    if args.reweight:
        print('Downsampling')
        #downsample
        pos_valid = valid_data[valid_data['is_duplicate'] == 1]
        neg_valid = valid_data[valid_data['is_duplicate'] == 0]
        p = 0.19
        pl = len(pos_valid)
        tl = len(pos_valid) + len(neg_valid)
        val = int(pl - (pl - p * tl) / ((1 - p)))
        pos_valid = pos_valid.iloc[:int(val)]
        valid_data = pd.concat([pos_valid, neg_valid])

    print('Splitting Train')
    q1 = list(train_data['question1'].map(str))
    q2 = list(train_data['question2'].map(str))
    y = list(train_data['is_duplicate'])

    print('Splitting Valid')
    q1_val = list(valid_data['question1'].map(str))
    q2_val = list(valid_data['question2'].map(str))
    y_val = list(valid_data['is_duplicate'])

    train_feat = pd.read_csv('../data/train_features_all_norm.csv')
    val_feat = train_feat.iloc[valid_data['id']].values
    train_feat = train_feat.iloc[train_data['id']].values

    print('Splitting Data')
    if args.clean:
        print('Cleaning Data')
        stops = None
        if args.rm_stops:
            stops = set(stopwords.words("english"))
        q1 = [split_text(x, stops) for x in q1]
        q2 = [split_text(x, stops) for x in q2]
        q1_val = [split_text(x, stops) for x in q1_val]
        q2_val = [split_text(x, stops) for x in q2_val]
    else:
        q1 = [x.lower().split() for x in q1]
        q2 = [x.lower().split() for x in q2]
        q1_val = [x.lower().split() for x in q1_val]
        q2_val = [x.lower().split() for x in q2_val]

    print('Downsample Weight: ', np.mean(y_val))

    corpus.gen_vocab(q1 + q2 + q2_val + q1_val)

    n_feat = train_feat.shape[1]
    d_in = args.din
    feat_max = int(np.max([n_feat, d_in]))

    X = torch.Tensor(len(train_data), 1, 3, feat_max)
    X[:, 0, 0, :] = torch.from_numpy(corpus.pad_numericalize(q1,
                                                             feat_max)).long()
    X[:, 0, 1, :] = torch.from_numpy(corpus.pad_numericalize(q2,
                                                             feat_max)).long()
    X[:, 0, 2, :n_feat] = torch.from_numpy(np.array(train_feat))
    y = torch.from_numpy(np.array(y)).long()

    X_val = torch.Tensor(len(valid_data), 1, 3, feat_max)
    X_val[:, 0,
          0, :] = torch.from_numpy(corpus.pad_numericalize(q1_val,
                                                           feat_max)).long()
    X_val[:, 0,
          1, :] = torch.from_numpy(corpus.pad_numericalize(q2_val,
                                                           feat_max)).long()
    X_val[:, 0, 2, :n_feat] = torch.from_numpy(np.array(val_feat))
    y_val = torch.from_numpy(np.array(y_val)).long()

    if args.cuda:
        X, y = X.cuda(), y.cuda()
        X_val, y_val = X_val.cuda(), y_val.cuda()

    print('Generating Data Loaders')
    #X.size len(train_data),1,2,fix_length
    train_dataset = TensorDataset(X, y)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batchsize,
                              shuffle=True)
    valid_loader = DataLoader(TensorDataset(X_val, y_val),
                              batch_size=args.batchsize,
                              shuffle=False)

    num_train = len(X)

    del X, y, X_val, y_val, train_feat, val_feat, q1, q2, q1_val, q2_val

    ntokens = len(corpus)
    glove_embeddings = None
    if args.embinit == 'glove':
        assert args.demb in (50, 100, 200, 300)
        glove_embeddings = get_glove_embeddings(args.glovedata,
                                                corpus.dictionary.word2idx,
                                                ntokens, args.demb)

    model = LSTMModelMLPFeat(args.din, args.dhid, args.nlayers, args.dout,
                             args.demb, n_feat, args.vocabsize, args.dropout,
                             args.embinit, args.hidinit, args.decinit,
                             glove_embeddings, args.cuda)

    if args.cuda:
        model.cuda()

    if args.reweight:
        w_tensor = torch.Tensor([1.309028344, 0.472001959])
        if args.cuda:
            w_tensor = w_tensor.cuda()
        criterion = nn.NLLLoss(weight=w_tensor)
    else:
        criterion = nn.NLLLoss()

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    model_config = '\t'.join([
        str(x) for x in (torch.__version__, args.clip, args.nlayers, args.din,
                         args.demb, args.dhid, args.embinit, args.decinit,
                         args.hidinit, args.dropout, args.optimizer,
                         args.reweight, args.lr, args.vocabsize,
                         args.batchsize, args.clean, args.rm_stops)
    ])

    print(
        'Pytorch | Clip | #Layers | InSize | EmbDim | HiddenDim | EncoderInit | DecoderInit | WeightInit | Dropout | Optimizer | Reweight | LR | VocabSize | batchsize | Clean | Stops'
    )
    print(model_config)

    # best_val_acc = 0.78
    best_ll = 0.3
    for epoch in range(args.epochs):
        model.train()
        total_cost = 0
        start_time = time.time()
        cur_loss = 0
        for ind, (qs, duplicate) in enumerate(train_loader):
            model.zero_grad()
            pred = model(qs[:, 0, 0, :d_in].long(), qs[:, 0, 1, :d_in].long(),
                         qs[:, 0, 2, :n_feat])
            if args.cuda:
                pred = pred.cuda()
                duplicate = duplicate.cuda()
            duplicate = Variable(duplicate)
            loss = criterion(pred, duplicate)
            loss.backward()
            clip_grad_norm(model.parameters(), args.clip)

            if optimizer:
                optimizer.step()
            else:
                for p in model.parameters():
                    p.data.add_(-args.lr, p.grad.data)

            total_cost += loss.data[0]
            cur_loss += loss.data[0]

            if ind % args.loginterval == 0 and ind > 0:
                cur_loss = cur_loss / args.loginterval
                elapsed = time.time() - start_time
                print(
                    '| Epoch {:3d} | {:5d}/{:5d} Batches | ms/batch {:5.2f} | '
                    'Loss {:.6f}'.format(epoch, ind,
                                         num_train // args.batchsize,
                                         elapsed * 1000.0 / args.loginterval,
                                         cur_loss))
                start_time = time.time()
                cur_loss = 0

        model.eval()

        train_acc, train_ll = evaluate(model, train_loader, args.cuda, d_in,
                                       n_feat)
        val_acc, val_ll = evaluate(model, valid_loader, args.cuda, d_in,
                                   n_feat)
        # if args.save and (val_acc > best_val_acc):
        if args.save and (val_ll < best_ll):
            with open(args.save + '_corpus.pkl', 'wb') as corp_f:
                pkl.dump(corpus, corp_f, protocol=pkl.HIGHEST_PROTOCOL)
            torch.save(model.cpu(), args.save)
            torch.save(model.cpu().state_dict(), args.save + ".state_dict")
            with open(args.save + ".state_dict.config", "w") as f:
                f.write(model_config)
            best_ll = val_ll
            if args.cuda:
                model.cuda()

        print(
            'Epoch: {} | Train Loss: {:.4f} | Train Accuracy: {:.4f} | Val Accuracy: {:.4f} | Train LL: {:.4f} | Val LL: {:.4f}'
            .format(epoch, total_cost, train_acc, val_acc, train_ll, val_ll))
        print('-' * 89)

    del train_loader

    print('Reloading Best Model')
    model = torch.load(args.save)
    model.cuda()
    model.eval()

    print('RELOADING VALID')

    valid_data = pd.read_csv('../data/val_data_shuffle.csv')
    valid_data = valid_data.fillna(' ')

    q1_val = list(valid_data['question1'].map(str))
    q2_val = list(valid_data['question2'].map(str))
    y_val = list(valid_data['is_duplicate'])

    train_feat = pd.read_csv('../data/train_features_all_norm.csv')
    val_feat = train_feat.iloc[valid_data['id']].values

    if args.clean:
        print('Cleaning Data')
        stops = None
        if args.rm_stops:
            stops = set(stopwords.words("english"))
        q1_val = [split_text(x, stops) for x in q1_val]
        q2_val = [split_text(x, stops) for x in q2_val]
    else:
        q1_val = [x.lower().split() for x in q1_val]
        q2_val = [x.lower().split() for x in q2_val]

    X_val = torch.Tensor(len(valid_data), 1, 3, feat_max)
    X_val[:, 0,
          0, :] = torch.from_numpy(corpus.pad_numericalize(q1_val,
                                                           feat_max)).long()
    X_val[:, 0,
          1, :] = torch.from_numpy(corpus.pad_numericalize(q2_val,
                                                           feat_max)).long()
    X_val[:, 0, 2, :n_feat] = torch.from_numpy(np.array(val_feat))
    y_val = torch.from_numpy(np.array(y_val)).long()

    if args.cuda:
        X_val, y_val = X_val.cuda(), y_val.cuda()

    valid_loader = DataLoader(TensorDataset(X_val, y_val),
                              batch_size=args.batchsize,
                              shuffle=False)

    del X_val, y_val, train_feat, val_feat, q1_val, q2_val, valid_data

    print('PREDICTING VALID')
    pred_list = []
    for ind, (qs, _) in enumerate(valid_loader):
        out = model(qs[:, 0, 0, :d_in].long(), qs[:, 0, 1, :d_in].long(),
                    qs[:, 0, 2, :n_feat])
        pred_list += list(out.exp()[:, 1].data.cpu().numpy())

    with open('../predictions/' + args.save + '_val.pkl', 'wb') as f:
        pkl.dump(pred_list, f, protocol=pkl.HIGHEST_PROTOCOL)

    if args.reweight:
        print('LOADING TEST DATA')
        test_data = pd.read_csv('../data/test.csv')
        test_data = test_data.fillna(' ')
        q1 = list(test_data['question1'].map(str))
        q2 = list(test_data['question2'].map(str))
        q1 = [x.lower().split() for x in q1]
        q2 = [x.lower().split() for x in q2]

        print('LOADING TEST FEATURES')
        test_feat = pd.read_csv('../data/test_features_all_norm.csv').values

        n_feat = test_feat.shape[1]
        d_in = args.din
        feat_max = int(np.max([n_feat, d_in]))

        X = torch.Tensor(len(test_data), 1, 3, feat_max)
        X[:, 0,
          0, :] = torch.from_numpy(corpus.pad_numericalize(q1,
                                                           feat_max)).long()
        X[:, 0,
          1, :] = torch.from_numpy(corpus.pad_numericalize(q2,
                                                           feat_max)).long()
        X[:, 0, 2, :n_feat] = torch.from_numpy(np.array(test_feat))
        y = torch.LongTensor(len(test_data)).zero_()

        if args.cuda:
            X = X.cuda()
            y = y.cuda()

        test_loader = DataLoader(TensorDataset(X, y),
                                 batch_size=500,
                                 shuffle=False)

        print('PREDICTING')
        pred_list = []
        for ind, (qs, _) in enumerate(test_loader):
            out = model(qs[:, 0, 0, :d_in].long(), qs[:, 0, 1, :d_in].long(),
                        qs[:, 0, 2, :n_feat])
            pred_list += list(out.exp()[:, 1].data.cpu().numpy())

        with open('../predictions/' + args.save + '.pkl', 'wb') as f:
            pkl.dump(pred_list, f, protocol=pkl.HIGHEST_PROTOCOL)
Example #9
def main():
    cmd = argparse.ArgumentParser('Key-Value by H&Q')
    cmd.add_argument('--data_path', help='', default='../data/')
    cmd.add_argument('--hidden_size', help='', type=int, default=200)
    cmd.add_argument('--embed_size', help='', type=int, default=200)
    cmd.add_argument('--batch_size', help='', type=int, default=32)
    cmd.add_argument('--lr', help='', type=float, default=0.001)
    cmd.add_argument('--lr_decay', help='', type=float, default=1.0)
    cmd.add_argument('--max_epoch', help='', type=int, default=200)
    cmd.add_argument('--seed', help='', type=int, default=1234)
    cmd.add_argument('--dropout', help='', type=float, default=0.8)
    cmd.add_argument('--bleu_path', help='', default='../bleu/')
    cmd.add_argument('--grad_clip', help='', type=float, default=10)
    cmd.add_argument('--parallel_suffix', help='', type=str, default='123')
    cmd.add_argument('--model_save_path',
                     help='',
                     type=str,
                     default='../model')
    cmd.add_argument('--l2', help='', type=float, default=0.000005)
    cmd.add_argument('--key_flag', help='', type=str, default='True')

    args = cmd.parse_args(sys.argv[2:])
    print(args)
    # save the argument configuration
    json.dump(
        vars(args),
        codecs.open(os.path.join(args.model_save_path, 'config.json'),
                    'w',
                    encoding='utf-8'))

    random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Data preprocessing: separate punctuation from words, and append a '.' to utterances that lack punctuation.
    train_dialogs, valid_dialogs, test_dialogs = data_preprocess(
        args.data_path)
    # Extract keys, triples, and entities
    keys, triples, entities, value_to_abstract_keys = key_extraction(
        train_dialogs, args.data_path)
    # Build the vocabulary: first add the keys in underscored form, then add the dialogues
    lang = Lang()
    lang, underlined_keys = generate_dict(keys, train_dialogs, lang,
                                          value_to_abstract_keys)
    logging.info('dict generated! dict size:{0}'.format(lang.word_size))

    # Save the vocabulary
    with codecs.open(os.path.join(args.model_save_path, 'dict'),
                     'w',
                     encoding='utf-8') as fs:
        res_dict = []
        for word, idx in lang.word2idx.items():
            temp = word + '\t' + str(idx)
            res_dict.append(temp)
        res_dict = '\n'.join(res_dict)
        fs.write(res_dict)

    # Generate training data instances
    train_instances = generate_instances(keys, train_dialogs, triples,
                                         value_to_abstract_keys)
    valid_instances = generate_instances(keys, valid_dialogs, triples,
                                         value_to_abstract_keys)
    test_instances = generate_instances(keys, test_dialogs, triples,
                                        value_to_abstract_keys)
    # valid_instances = test_instances = train_instances
    #logging.info('instances sample: {0}'.format(train_instances))

    # Word2idx
    train_instances_idx = sentence_to_idx(lang,
                                          train_instances)  # [([],[]),()]
    valid_instances_idx = sentence_to_idx(lang, valid_instances)
    test_instances_idx = sentence_to_idx(lang, test_instances)
    # valid_instances_idx = test_instances_idx = train_instances_idx
    # keys2idx
    keys_idx = key_to_idx(lang, underlined_keys)

    train_instances_size = len(train_instances_idx)
    valid_instances_size = len(valid_instances_idx)
    test_instances_size = len(test_instances_idx)
    logging.info('training size:{0} valid size:{1} test size:{2}'.format(train_instances_size, valid_instances_size, \
                                                                          test_instances_size))

    encoder = Encoder(args.embed_size, args.hidden_size, args.dropout, lang)
    decoder = AttnDecoder(args.embed_size, args.hidden_size, args.dropout,
                          lang, args.key_flag)
    encoderdecoder = EncoderDecoder(args.embed_size, args.hidden_size,
                                    args.dropout, lang)
    encoder = encoder.cuda() if use_cuda else encoder
    decoder = decoder.cuda() if use_cuda else decoder
    encoderdecoder = encoderdecoder.cuda() if use_cuda else encoderdecoder
    encoder_optimizer = optim.Adam(encoder.parameters(),
                                   lr=args.lr,
                                   weight_decay=args.l2)
    decoder_optimizer = optim.Adam(decoder.parameters(),
                                   lr=args.lr,
                                   weight_decay=args.l2)
    encoderdecoder_optimizer = optim.Adam(decoder.parameters(),
                                          lr=args.lr,
                                          weight_decay=args.l2)

    # train
    best_valid_bleu_score, best_test_bleu_score = 0, 0
    best_valid_f, best_test_f = 0, 0
    order = list(range(len(train_instances_idx)))
    for i in range(args.max_epoch):
        logging.info(
            '--------------------Round {0}---------------------'.format(i))
        #random.shuffle(order)
        start_id = 0
        count = 0
        total_loss = 0
        for start_id in range(0, train_instances_size, args.batch_size):
            end_id = start_id + args.batch_size if start_id + args.batch_size < train_instances_size else train_instances_size
            batch_size = end_id - start_id
            batch_to_be_generated = [
                train_instances_idx[ids] for ids in order[start_id:end_id]
            ]
            batch_gold = [
                train_instances[ids] for ids in order[start_id:end_id]
            ]  # not used during training
            batch_input, batch_output, _, sentence_lens = generate_batch(
                batch_to_be_generated, batch_gold, batch_size,
                lang.word2idx['pad'])

            # train
            encoder.train()
            decoder.train()
            encoderdecoder.train()
            encoder.zero_grad()
            decoder.zero_grad()
            encoderdecoder.zero_grad()
            loss = encoderdecoder.forward(batch_input, batch_output, sentence_lens, keys_idx, \
                                          encoder, decoder, lang.word2idx['pad'], args.embed_size)
            loss.backward()
            clip_grad_norm(encoder.parameters(), args.grad_clip)
            clip_grad_norm(decoder.parameters(), args.grad_clip)
            clip_grad_norm(encoderdecoder.parameters(), args.grad_clip)
            encoder_optimizer.step()
            decoder_optimizer.step()
            encoderdecoder_optimizer.step()

            total_loss += loss.data
            count += 1

            # if (count % 100 == 0):
            #     logging.info('average loss: {0}'.format(total_loss*1.0/count))

        valid_bleu_score, valid_f = evaluate(keys_idx, encoder, decoder, encoderdecoder, valid_instances_idx, valid_instances, lang, \
                              args.batch_size, args.embed_size, args.hidden_size, args.bleu_path, args.parallel_suffix)
        # if (valid_bleu_score > best_valid_bleu_score):
        #     test_bleu_score, test_f = evaluate(keys_idx, encoder, decoder, encoderdecoder, test_instances_idx, test_instances, lang, \
        #               args.batch_size, args.embed_size, args.hidden_size, args.bleu_path, args.parallel_suffix)
        #     best_test_bleu_score = max(best_test_bleu_score, test_bleu_score)
        #
        #     logging.info('New Record! test bleu score now: {0} best test bleu score ever: {1}'.format(\
        #         test_bleu_score, best_test_bleu_score))

        if (valid_f > best_valid_f):
            torch.save(
                encoder.state_dict(),
                os.path.join(args.model_save_path,
                             'encoder' + args.parallel_suffix))
            torch.save(
                decoder.state_dict(),
                os.path.join(args.model_save_path,
                             'decoder' + args.parallel_suffix))
            torch.save(
                encoderdecoder.state_dict(),
                os.path.join(args.model_save_path,
                             'encoderdecoder' + args.parallel_suffix))
            test_bleu_score, test_f = evaluate(keys_idx, encoder, decoder, encoderdecoder, test_instances_idx, test_instances, lang, \
                      args.batch_size, args.embed_size, args.hidden_size, args.bleu_path, args.parallel_suffix)
            best_test_f = max(best_test_f, test_f)
            best_test_bleu_score = max(best_test_bleu_score, test_bleu_score)
            logging.info('New Record! test F now: {0} best test F ever: {1} test bleu score now: {2} best test bleu score ever: {3}'.format(\
                test_f, best_test_f, test_bleu_score, best_test_bleu_score))
        best_valid_f = max(best_valid_f, valid_f)
        best_valid_bleu_score = max(best_valid_bleu_score, valid_bleu_score)
        logging.info('valid F: {0} best valid F ever: {1}'.format(
            valid_f, best_valid_f))

        logging.info(
            'valid bleu score: {0} best valid bleu score ever: {1}'.format(
                valid_bleu_score, best_valid_bleu_score))
    logging.info('Training complete! best valid bleu score: {0} best test bleu score: {1} best valid F: {2} best test F: {3}'\
                 .format(best_valid_bleu_score, best_test_bleu_score, best_valid_f, best_test_f))
    logging.info('suffix is {0}'.format(args.parallel_suffix))
    print(args)
Example #10
    def train(self):

        info = {}
        if self.T % self.args.nec_update != 0:
            return info

        # print("Training")

        for _ in range(self.args.iters):

            # TODO: Use a named tuple for experience replay
            batch = self.replay.Sample(self.args.batch_size)
            columns = list(zip(*batch))

            states = Variable(torch.from_numpy(np.array(columns[0])).float().transpose_(1, 3))
            # print(states)
            actions = columns[1]
            # print(actions)
            targets = Variable(torch.FloatTensor(columns[2]))
            # print(targets)
            keys = self.embedding(states).cpu()
            # print(keys)
            # print("Keys", keys.requires_grad)
            # for action in actions:
                # print(action)
            # for action, key in zip(actions, keys):
                # print(action, key)
                # kk = key.unsqueeze(0)
                # print("kk", kk.requires_grad)
                # k = self.dnds[action].lookup(key.unsqueeze(0))
                # print("key", key.requires_grad, key.volatile)
            model_predictions = torch.cat([self.dnds[action].lookup(key.unsqueeze(0)) for action, key in zip(actions, keys)])
            # print(model_predictions)
            # print(targets)

            td_error = model_predictions - targets
            # print(td_error)
            info["TD_Error"] = td_error.mean().data[0]

            l2_loss = (td_error).pow(2).mean()
            info["Loss"] = l2_loss.data[0]

            # Update
            self.optimizer.zero_grad()

            l2_loss.backward()

            # Taken from pytorch clip_grad_norm
            # Remove once the pip version it up to date with source
            gradient_norm = clip_grad_norm(self.embedding.parameters(), self.args.clip_value)
            if gradient_norm is not None:
                info["Norm"] = gradient_norm

            self.optimizer.step()

            if "States" in info:
                states_trained = info["States"]
                info["States"] = states_trained + columns[0]
            else:
                info["States"] = columns[0]

        return info
Example #11
def forward(data_loader,
            model,
            criterion,
            epoch=0,
            training=True,
            optimizer=None):
    if args.gpus and len(args.gpus) > 1:
        model = torch.nn.DataParallel(model, args.gpus)

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    end = time.time()

    if training:
        optimizer.zero_grad()

    for i, (inputs, target) in enumerate(data_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        if args.gpus is not None:
            target = target.cuda(async=True)
        input_var = Variable(inputs.type(args.type), volatile=not training)
        target_var = Variable(target)

        # compute output
        if not training:
            output = model(input_var)
            loss = criterion(output, target_var)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(output.data, target_var.data, topk=(1, 5))
            losses.update(loss.data[0], input_var.size(0))
            top1.update(prec1[0], input_var.size(0))
            top5.update(prec5[0], input_var.size(0))

        else:
            is_updating = ((i + 1) % args.batch_multiplier
                           == 0) or (i + 1 == len(data_loader))
            mini_inputs = input_var.chunk(args.batch_size //
                                          args.mini_batch_size)
            mini_targets = target_var.chunk(args.batch_size //
                                            args.mini_batch_size)

            # get the coefficent to scale noise
            eq_batch_num = (len(data_loader) + args.batch_multiplier -
                            1) // args.batch_multiplier
            if args.smoothing_type == 'constant':
                noise_coef = 1.
            elif args.smoothing_type == 'anneal':
                noise_coef = 1.0 / (
                    (1 + epoch * eq_batch_num + i // args.batch_multiplier)**
                    args.anneal_index)
                noise_coef = noise_coef**0.5
            elif args.smoothing_type == 'tanh':
                noise_coef = np.tanh(
                    args.tanh_scale *
                    ((float)(epoch * eq_batch_num + i // args.batch_multiplier)
                     / (float)(args.epochs * eq_batch_num) - .5))
                noise_coef = (noise_coef + 1.) / 2.0
            else:
                raise ValueError('Unknown smoothing-type')
            if i % args.print_freq == 0:
                logging.info(
                    '{phase} - Epoch: [{0}][{1}/{2}]\t'
                    'Noise Coefficient: {noise_coef:.4f}\t'.format(
                        epoch,
                        i,
                        len(data_loader),
                        phase='TRAINING' if training else 'EVALUATING',
                        noise_coef=noise_coef))

            for k, mini_input_var in enumerate(mini_inputs):

                noises = {}
                # randomly change current model @ each mini-mini-batch
                if args.sharpness_smoothing:
                    for key, p in model.named_parameters():
                        if hasattr(model, 'quiet_parameters') and (
                                key in model.quiet_parameters):
                            continue

                        if args.adapt_type == 'weight':
                            noise = (
                                torch.cuda.FloatTensor(p.size()).uniform_() *
                                2. -
                                1.) * args.sharpness_smoothing * torch.abs(
                                    p.data) * noise_coef

                        elif args.adapt_type == 'filter':
                            noise = (
                                torch.cuda.FloatTensor(p.size()).uniform_() *
                                2. - 1.)
                            noise_shape = noise.shape
                            noise_norms = noise.view(
                                [noise_shape[0], -1]).norm(p=2, dim=1) + 1.0e-6
                            p_norms = p.view([noise_shape[0], -1]).norm(p=2,
                                                                        dim=1)
                            for shape_idx in range(1, len(noise_shape)):
                                noise_norms = noise_norms.unsqueeze(-1)
                                p_norms = p_norms.unsqueeze(-1)
                            noise = noise / noise_norms * p_norms.data
                            #for idx in range(0, noise.shape[0]):
                            #  if 1 == len(noise.shape):
                            #    if np.abs(np.linalg.norm(noise[idx]))>1.0e-6:
                            #      noise[idx] = noise[idx] / np.linalg.norm(noise[idx]) * np.linalg.norm(p.data[idx])
                            #  else:
                            #    if np.abs(noise[idx].norm())>1.0e-6:
                            #      noise[idx] = noise[idx] / noise[idx].norm() * p.data[idx].norm()

                            noise = noise * args.sharpness_smoothing * noise_coef

                        elif args.adapt_type == 'none':
                            noise = (
                                torch.cuda.FloatTensor(p.size()).uniform_() *
                                2. -
                                1.) * args.sharpness_smoothing * noise_coef

                        else:
                            raise ValueError('Unkown --adapt-type')
                        noises[key] = noise
                        p.data.add_(noise)

                mini_target_var = mini_targets[k]
                output = model(mini_input_var)
                loss = criterion(output, mini_target_var)

                prec1, prec5 = accuracy(output.data,
                                        mini_target_var.data,
                                        topk=(1, 5))
                losses.update(loss.data[0], mini_input_var.size(0))
                top1.update(prec1[0], mini_input_var.size(0))
                top5.update(prec5[0], mini_input_var.size(0))

                # compute gradient and do SGD step
                loss.backward()

                # denoise @ each mini-mini-batch.
                if args.sharpness_smoothing:
                    for key, p in model.named_parameters():
                        if key in noises:
                            p.data.sub_(noises[key])

            if is_updating:
                n_batches = args.batch_multiplier
                if (i + 1) == len(data_loader):
                    n_batches = (i % args.batch_multiplier) + 1
                for p in model.parameters():
                    p.grad.data.div_(len(mini_inputs) * n_batches)
                clip_grad_norm(model.parameters(), 5.)
                optimizer.step()
                optimizer.zero_grad()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            logging.info('{phase} - Epoch: [{0}][{1}/{2}]\t'
                         'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                         'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                         'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                         'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                         'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                             epoch,
                             i,
                             len(data_loader),
                             phase='TRAINING' if training else 'EVALUATING',
                             batch_time=batch_time,
                             data_time=data_time,
                             loss=losses,
                             top1=top1,
                             top5=top5))

    return {'loss': losses.avg, 'prec1': top1.avg, 'prec5': top5.avg}
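The training branch above accumulates gradients over `batch_multiplier` outer batches (each split into mini-chunks), divides the accumulated gradients by the number of contributing batches, clips, and only then steps. A stripped-down sketch of that accumulate / rescale / clip / step pattern, with the perturbation ("sharpness smoothing") logic omitted and the names `accum_steps` and `max_norm` chosen here for illustration:

from torch.nn.utils import clip_grad_norm_

def accumulate_and_step(model, criterion, optimizer, data_loader,
                        accum_steps, max_norm=5.0):
    optimizer.zero_grad()
    for i, (inputs, targets) in enumerate(data_loader):
        loss = criterion(model(inputs), targets)
        loss.backward()  # gradients keep accumulating across iterations
        if (i + 1) % accum_steps == 0 or (i + 1) == len(data_loader):
            for p in model.parameters():
                if p.grad is not None:
                    p.grad.div_(accum_steps)  # average the accumulated gradients
            clip_grad_norm_(model.parameters(), max_norm)
            optimizer.step()
            optimizer.zero_grad()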
Example #12
def fit_model(model, loss_op, optim_op, train_gen, val_gen, epochs,
              checkpoint_path, patience):
    """ Analog to Keras fit_generator function.

    # Arguments:
        model: Model to be finetuned.
        loss_op: loss operation (BCEWithLogitsLoss or CrossEntropy for e.g.)
        optim_op: optimization operation (Adam e.g.)
        train_gen: Training data iterator (DataLoader)
        val_gen: Validation data iterator (DataLoader)
        epochs: Number of epochs.
        checkpoint_path: Filepath where weights will be checkpointed to
            during training. This file will be rewritten by the function.
        patience: Patience for callback methods.

    # Returns:
        Nothing; the best weights (by validation loss) are checkpointed to checkpoint_path.
    """
    # Save original checkpoint
    torch.save(model.state_dict(), checkpoint_path)

    model.eval()
    best_loss = np.mean([loss_op(model(Variable(xv)).squeeze(), Variable(yv.float()).squeeze()).data.cpu().numpy()[0] for xv, yv in val_gen])
    print("original val loss", best_loss)

    epoch_without_impr = 0
    for epoch in range(epochs):
        for i, data in enumerate(train_gen):
            X_train, y_train = data
            X_train = Variable(X_train, requires_grad=False)
            y_train = Variable(y_train, requires_grad=False)
            model.train()
            optim_op.zero_grad()
            output = model(X_train)
            loss = loss_op(output, y_train.float())
            loss.backward()
            clip_grad_norm(model.parameters(), 1)
            optim_op.step()

            acc = evaluate_using_acc(model, [(X_train.data, y_train.data)])
            print("== Epoch", epoch, "step", i, "train loss", loss.data.cpu().numpy()[0], "train acc", acc)

        model.eval()
        acc = evaluate_using_acc(model, val_gen)
        print("val acc", acc)

        val_loss = np.mean([loss_op(model(Variable(xv)).squeeze(), Variable(yv.float()).squeeze()).data.cpu().numpy()[0] for xv, yv in val_gen])
        print("val loss", val_loss)
        if best_loss is not None and val_loss >= best_loss:
            epoch_without_impr += 1
            print('No improvement over previous best loss: ', best_loss)

        # Save checkpoint
        if best_loss is None or val_loss < best_loss:
            best_loss = val_loss
            torch.save(model.state_dict(), checkpoint_path)
            print('Saving model at', checkpoint_path)

        # Early stopping
        if epoch_without_impr >= patience:
            break
Example #13
def train(train_iter, dev_iter, test_iter, model, args):
    if args.cuda:
        model.cuda()

    # optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-8)
    # optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.init_weight_decay)
    # optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,momentum=)
    # optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)

    if args.Adam is True:
        print("Adam Training......")
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.init_weight_decay)
    elif args.SGD is True:
        print("SGD Training.......")
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.init_weight_decay,
                                    momentum=args.momentum_value)
    elif args.Adadelta is True:
        print("Adadelta Training.......")
        optimizer = torch.optim.Adadelta(model.parameters(), lr=args.lr, weight_decay=args.init_weight_decay)

    # lambda1 = lambda epoch: epoch // 30
    # lambda2 = lambda epoch: 0.99 ** epoch
    # print("lambda1 {} lambda2 {} ".format(lambda1, lambda2))
    # scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=[lambda2])

    # scheduler = lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.9)

    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min')


    steps = 0
    epoch_step = 0
    model_count = 0
    model.train()
    for epoch in range(1, args.epochs+1):
        print("\n## 第{} 轮迭代,共计迭代 {} 次 !##\n".format(epoch, args.epochs))
        # scheduler.step()
        # print("now lr is {} \n".format(scheduler.get_lr()))
        print("now lr is {} \n".format(optimizer.param_groups[0].get("lr")))
        for batch in train_iter:
            feature, target = batch.text, batch.label
            feature.data.t_(), target.data.sub_(1)  # batch first, index align
            if args.cuda:
                feature, target = feature.cuda(), target.cuda()
            optimizer.zero_grad()
            logit = model(feature)
            loss = F.cross_entropy(logit, target)
            loss.backward()
            if args.init_clip_max_norm is not None:
                utils.clip_grad_norm(model.parameters(), max_norm=args.init_clip_max_norm)
            optimizer.step()
            steps += 1
            if steps % args.log_interval == 0:
                train_size = len(train_iter.dataset)
                corrects = (torch.max(logit, 1)[1].view(target.size()).data == target.data).sum()
                accuracy = float(corrects)/batch.batch_size * 100.0
                sys.stdout.write(
                    '\rBatch[{}/{}] - loss: {:.6f}  acc: {:.4f}%({}/{})'.format(steps,
                                                                                train_size, loss.data[0], accuracy,
                                                                                corrects, batch.batch_size))
        # eval and test after every epoch
        eval(dev_iter, model, args, scheduler)
        if not os.path.isdir(args.save_dir):
            os.makedirs(args.save_dir)
        save_prefix = os.path.join(args.save_dir, 'snapshot')
        save_path = '{}_steps{}.pt'.format(save_prefix, steps)
        torch.save(model, save_path)
        print("\n", save_path, end=" ")
        test_model = torch.load(save_path)
        model_count += 1
        test_eval(test_iter, test_model, save_path, args, model_count)
    return model_count
Ejemplo n.º 14
0
def train():
    
    global_steps = 0
    best_em = -1
    best_f1 = -1
    for iepoch in range(epochs):

        batch = 0
        for tdata in train_loader:

            passage_tokens = tdata['passage_tokens']
            passage_len = tdata['passage_len']
            char_start_end = tdata['char_start_end'] 
            question_tokens = tdata['question_tokens'] 
            question_len = tdata['question_len']
            ground_truth = tdata['ground_truth']
            answer_tokens = tdata['answer_tokens']
            answer_len = tdata['answer_len']
            boundary = tdata['boundary']
            passage_str = tdata['passage_str']
            question_str = tdata['question_str']
            answer_str = tdata['answer_str']
            key = tdata['key']

            classification_labels, answer_index = prepare_classify(question_tokens, answer_tokens)
            
            fw_res = net(passage=passage_tokens, 
                         question=question_tokens,
                         answer=answer_tokens,
                         answer_index=answer_index,
                         decoder_inputs=ground_truth, 
                         is_classification = True,
                         is_generation = True,
                         is_teacher_forcing = True)
            
            match_logits = fw_res['match_logits']
            match_predictions = fw_res['match_predictions']
            
            generation_logits = fw_res['generation_logits']
            generation_predictions = fw_res['generation_predictions']
            
            classification_logits = fw_res['classification_logits']
            classification_predictions = fw_res['classification_predictions']
            
            print (classification_predictions)
            print (classification_labels.numpy())
            print (sum(classification_labels.numpy() == classification_predictions) / len(classification_predictions))
            
            loss_return = net.get_loss(match_logits=match_logits, 
                                       match_labels=boundary, 
                                       generation_logits=generation_logits, 
                                       generation_labels=question_tokens, 
                                       classification_logits=classification_logits, 
                                       classification_labels= classification_labels,
                                       is_match = False,
                                       is_generation = False,
                                       is_classification = True,
                                       lambda_m = lambda_m,
                                       lambda_g = lambda_g,
                                       lambda_c = lambda_c)
            
            match_loss = loss_return['match_loss']
            loss = loss_return['loss']
            generation_loss = loss_return['generation_loss']
            classification_loss = loss_return['classification_loss']
            
            optimizer.zero_grad()
            loss.backward()
            utils.clip_grad_norm(net.parameters(), 5)
            optimizer.step()

            print (global_steps, iepoch, batch, 'match loss: ', match_loss, 'generation loss: ', generation_loss,'classification loss: ', classification_loss ) 
            agent.append(train_match_loss, global_steps, match_loss)
            agent.append(train_generation_loss, global_steps, generation_loss)
            agent.append(train_classification_loss, global_steps, classification_loss)
            agent.append(train_loss, global_steps, sum(loss.cpu().data.numpy()))
            
            batch += 1
            global_steps += 1
            del fw_res, match_logits, match_predictions, loss, match_loss, generation_loss, loss_return

            '''
            if global_steps % 10 == 0:
                match_loss, loss = check(net, tdata)
                net.train()

            if global_steps % 20 == 0:
                dev_loss, em, f1 = valid()
                agent.append(valid_match_loss, global_steps, dev_loss)
                agent.append(valid_match_em, global_steps, em)
                agent.append(valid_match_f1, global_steps, f1)
                print (global_steps, iepoch, batch, dev_loss, em, f1)

                if em > best_em and f1 > best_f1:
                    save_model(net, dev_loss, em, f1, global_steps)
            '''
Ejemplo n.º 15
0
    def train(self):

        if self.T - self.target_sync_T > self.args.target:
            self.sync_target_network()
            self.target_sync_T = self.T

        info = {}

        for _ in range(self.args.iters):
            self.dqn.eval()

            # TODO: Use a named tuple for experience replay
            n_step_sample = 1
            if np.random.random() < self.args.n_step_mixing:
                n_step_sample = self.args.n_step
            batch, indices, is_weights = self.replay.Sample_N(self.args.batch_size, n_step_sample, self.args.gamma)
            columns = list(zip(*batch))

            states = Variable(torch.from_numpy(np.array(columns[0])).float().transpose_(1, 3))
            actions = Variable(torch.LongTensor(columns[1]))
            terminal_states = Variable(torch.FloatTensor(columns[5]))
            rewards = Variable(torch.FloatTensor(columns[2]))
            # Have to clip rewards for DQN
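            # (clamping to [-1, 1], as in the standard DQN setup, keeps TD-error
            # magnitudes on a comparable scale across environments)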
            rewards = torch.clamp(rewards, -1, 1)
            steps = Variable(torch.FloatTensor(columns[4]))
            new_states = Variable(torch.from_numpy(np.array(columns[3])).float().transpose_(1, 3))

            target_dqn_qvals = self.target_dqn(new_states).cpu()
            # Make a new variable with those values so that these are treated as constants
            target_dqn_qvals_data = Variable(target_dqn_qvals.data)
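            # Build the n-step target y = r + (1 - done) * gamma^n * Q_target(s', a*),
            # where a* is argmax_a Q_online(s', a) under double Q-learning and
            # argmax_a Q_target(s', a) otherwise.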

            q_value_targets = (Variable(torch.ones(terminal_states.size()[0])) - terminal_states)
            inter = Variable(torch.ones(terminal_states.size()[0]) * self.args.gamma)
            # print(steps)
            q_value_targets = q_value_targets * torch.pow(inter, steps)
            if self.args.double:
                # Double Q Learning
                new_states_qvals = self.dqn(new_states).cpu()
                new_states_qvals_data = Variable(new_states_qvals.data)
                q_value_targets = q_value_targets * target_dqn_qvals_data.gather(1, new_states_qvals_data.max(1)[1])
            else:
                q_value_targets = q_value_targets * target_dqn_qvals_data.max(1)[0]
            q_value_targets = q_value_targets + rewards

            self.dqn.train()
            if self.args.gpu:
                actions = actions.cuda()
                q_value_targets = q_value_targets.cuda()
            model_predictions = self.dqn(states).gather(1, actions.view(-1, 1))

            # info = {}

            td_error = model_predictions - q_value_targets
            info["TD_Error"] = td_error.mean().data[0]

            # Update the priorities
            if not self.args.density_priority:
                self.replay.Update_Indices(indices, td_error.cpu().data.numpy(), no_pseudo_in_priority=self.args.count_td_priority)

            # If using prioritised we need to weight the td_error
            if self.args.prioritized and self.args.prioritized_is:
                # print(td_error)
                weights_tensor = torch.from_numpy(is_weights).float()
                weights_tensor = Variable(weights_tensor)
                if self.args.gpu:
                    weights_tensor = weights_tensor.cuda()
                # print(weights_tensor)
                td_error = td_error * weights_tensor
            l2_loss = (td_error).pow(2).mean()
            info["Loss"] = l2_loss.data[0]

            # Update
            self.optimizer.zero_grad()
            l2_loss.backward()

            # Taken from pytorch clip_grad_norm
            # Remove once the pip version is up to date with the source
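            # clip_grad_norm rescales all gradients so their total norm does not exceed
            # args.clip_value and returns the pre-clipping norm, which is logged below.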
            gradient_norm = clip_grad_norm(self.dqn.parameters(), self.args.clip_value)
            if gradient_norm is not None:
                info["Norm"] = gradient_norm

            self.optimizer.step()

            if "States" in info:
                states_trained = info["States"]
                info["States"] = states_trained + columns[0]
            else:
                info["States"] = columns[0]

        # Pad out the states to be of size batch_size
        if len(info["States"]) < self.args.batch_size:
            old_states = info["States"]
            new_states = old_states[0] * (self.args.batch_size - len(old_states))
            info["States"] = new_states

        return info
Ejemplo n.º 16
0
def train(train_loader, model, act_criterion, comp_criterion, regression_criterion, optimizer, epoch):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    act_losses = AverageMeter()
    comp_losses = AverageMeter()
    reg_losses = AverageMeter()
    act_accuracies = AverageMeter()
    fg_accuracies = AverageMeter()
    bg_accuracies = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    optimizer.zero_grad()

    ohem_num = train_loader.dataset.fg_per_video
    comp_group_size = train_loader.dataset.fg_per_video + train_loader.dataset.incomplete_per_video
    for i, (out_frames, out_prop_len, out_prop_scaling, out_prop_type, out_prop_labels,
            out_prop_reg_targets, out_stage_split) \
            in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        input_var = torch.autograd.Variable(out_frames)
        scaling_var = torch.autograd.Variable(out_prop_scaling)
        target_var = torch.autograd.Variable(out_prop_labels)
        reg_target_var = torch.autograd.Variable(out_prop_reg_targets)
        prop_type_var = torch.autograd.Variable(out_prop_type)

        # compute output

        activity_out, activity_target, \
        completeness_out, completeness_target, \
        regression_out, regression_labels, regression_target = model(input_var, scaling_var, target_var,
                                                                     reg_target_var, prop_type_var)

        act_loss = act_criterion(activity_out, activity_target)
        comp_loss = comp_criterion(completeness_out, completeness_target, ohem_num, comp_group_size)
        reg_loss = regression_criterion(regression_out, regression_labels, regression_target)

        loss = act_loss + comp_loss * args.comp_loss_weight + reg_loss * args.reg_loss_weight

        reg_losses.update(reg_loss.data[0], out_frames.size(0))

        # measure mAP and record loss
        losses.update(loss.data[0], out_frames.size(0))
        act_losses.update(act_loss.data[0], out_frames.size(0))
        comp_losses.update(comp_loss.data[0], out_frames.size(0))

        act_acc = accuracy(activity_out, activity_target)
        act_accuracies.update(act_acc[0].data[0], activity_out.size(0))

        fg_acc = accuracy(activity_out.view(-1, 2, activity_out.size(1))[:, 0, :].contiguous(),
                          activity_target.view(-1, 2)[:, 0].contiguous())

        bg_acc = accuracy(activity_out.view(-1, 2, activity_out.size(1))[:, 1, :].contiguous(),
                          activity_target.view(-1, 2)[:, 1].contiguous())

        fg_accuracies.update(fg_acc[0].data[0], activity_out.size(0) // 2)
        bg_accuracies.update(bg_acc[0].data[0], activity_out.size(0) // 2)

        # compute gradient and do SGD step
        loss.backward()

        if i % args.iter_size == 0:
            # scale down gradients when iter size is functioning
            if args.iter_size != 1:
                for g in optimizer.param_groups:
                    for p in g['params']:
                        p.grad /= args.iter_size

            if args.clip_gradient is not None:
                total_norm = clip_grad_norm(model.parameters(), args.clip_gradient)
                if total_norm > args.clip_gradient:
                    print("clipping gradient: {} with coef {}".format(total_norm, args.clip_gradient / total_norm))
            else:
                total_norm = 0

            optimizer.step()
            optimizer.zero_grad()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            print('Epoch: [{0}][{1}/{2}], lr: {lr:.5f}\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Act. Loss {act_losses.val:.3f} ({act_losses.avg: .3f}) \t'
                  'Comp. Loss {comp_losses.val:.3f} ({comp_losses.avg: .3f}) '
                  .format(
                   epoch, i, len(train_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses, act_losses=act_losses,
                    comp_losses=comp_losses, lr=optimizer.param_groups[0]['lr'], ) +
                  '\tReg. Loss {reg_loss.val:.3f} ({reg_loss.avg:.3f})'.format(
                      reg_loss=reg_losses)
                  + '\n Act. FG {fg_acc.val:.02f} ({fg_acc.avg:.02f}) Act. BG {bg_acc.val:.02f} ({bg_acc.avg:.02f})'
                  .format(act_acc=act_accuracies,
                    fg_acc=fg_accuracies, bg_acc=bg_accuracies)
                  )
Ejemplo n.º 17
0
def train(lr, net, epoch, train_loader, valid_loader, transform, hyperparameters, batch_size):

    # register hypercurve
    agent = Agent(port=5005)
    hyperparameters['criteria'] = 'train loss'
    train_loss = agent.register(hyperparameters, 'loss')
    
    hyperparameters['criteria'] = 'valid loss'
    valid_loss = agent.register(hyperparameters, 'loss')
    
    hyperparameters['criteria'] = 'valid bleu'
    valid_bleu = agent.register(hyperparameters, 'bleu')
    
    hyperparameters['criteria'] = 'train bleu'
    train_bleu = agent.register(hyperparameters, 'bleu')
    
    hyperparameters['criteria'] = 'teacher_forcing_ratio'
    hyper_tfr = agent.register(hyperparameters, 'ratio')
    
    hyperparameters['criteria'] = 'teacher_forcing_loss'
    valid_tf_loss = agent.register(hyperparameters, 'loss')

    if torch.cuda.is_available():
        net.cuda()
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr = lr)
    net.train()
    
    best_score = -1
    global_steps = 578800
    best_valid_loss  = 10000
    for iepoch in range(epoch):
        
        batchid = 0
        for (_, tdata) in enumerate(train_loader, 0):

            entext = tdata['entext']
            enlen = tdata['enlen']
            zhlabel = tdata['zhlabel']
            zhgtruth = tdata['zhgtruth']
            zhlen = tdata['zhlen']
            enstr = tdata['enstr']
            zhstr = tdata['zhstr']
            
            teacher_forcing_ratio = 1
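            # A ratio of 1 means full teacher forcing: the decoder is always fed the
            # ground-truth previous token (zhgtruth) rather than its own prediction.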
            print ('teacher_forcing_ratio: ', teacher_forcing_ratio)
            
            decoder_outputs, ret_dict = net(entext, zhgtruth,True, teacher_forcing_ratio)
            
            
            loss = net.get_loss(decoder_outputs, zhlabel)
            optimizer.zero_grad()
            loss.backward()
            utils.clip_grad_norm(net.parameters(), 5)
            optimizer.step()
            
            batchid += 1
            global_steps += 1
            
            print (global_steps, iepoch, batchid, max(enlen), sum(loss.data.cpu().numpy())) 
            agent.append(train_loss, global_steps, sum(loss.data.cpu().numpy()))
            agent.append(hyper_tfr, global_steps, teacher_forcing_ratio)
            
            if global_steps % 50 == 0:
                net.eval()
                decoder_outputs, ret_dict = net(entext, zhgtruth, True, teacher_forcing_ratio)
                
                length = ret_dict['length']
                prediction = [0 for i in range(len(length))]
                tmppre = [_.squeeze().cpu().data.tolist() for _ in ret_dict['sequence']]
                tmppre = np.array(tmppre).transpose(1, 0)
                
                for i in range(len(tmppre)):
                    
                    prediction[i] = tmppre[i][:length[i]]
                    prediction[i] = transform.i2t(prediction[i], language = 'zh')
                    prediction[i] = re.sub(r'nuk#', '', prediction[i])
                    prediction[i] = re.sub(r'eos#', '', prediction[i])

                tmpscore = bleuscore.score(prediction, zhstr)   
                
                for i in range(5):
                    print (prediction[i])
                    print (zhstr[i])
                    print ('-------------------\n')

                del decoder_outputs, ret_dict
                agent.append(train_bleu, global_steps, tmpscore)
                net.train()
	    
                
            if global_steps % 200 == 0:
                print ('\n------------------------\n')
                net.eval()
                all_pre = []
                all_label = []
                all_loss = 0
                all_en = []
                bats = 0
                teacher_forcing_loss = 0
                for (_, vdata) in enumerate(valid_loader, 0):
                    
                    entext = vdata['entext']
                    enlen = vdata['enlen']
                    zhlabel = vdata['zhlabel']
                    zhgtruth = vdata['zhgtruth']
                    zhlen = vdata['zhlen']
                    enstr = vdata['enstr']
                    zhstr = vdata['zhstr']
                    
                    decoder_outputs, ret_dict = net(entext, None, True, 0)
                    length = ret_dict['length']
                    prediction = [0 for i in range(len(length))]
                    tmppre = [_.squeeze().cpu().data.tolist() for _ in ret_dict['sequence']]
                    tmppre = np.array(tmppre).transpose(1, 0)
                    
                    for i in range(len(tmppre)):
                        prediction[i] = tmppre[i][:length[i]]
                        prediction[i] = transform.i2t(prediction[i], language = 'zh')
                        prediction[i] = re.sub(r'nuk#', '', prediction[i])
                        prediction[i] = re.sub(r'eos#', '', prediction[i])
                    
                    loss = net.get_loss(decoder_outputs, zhlabel)
                    
                    all_pre.extend(prediction)
                    all_label.extend(zhstr)
                    all_en.extend(enstr)
                    all_loss += sum(loss.data.cpu().numpy())
                    
                    del loss, decoder_outputs, ret_dict

                    # teacher forcing loss, to judge if overfit
                    decoder_outputs, _ = net(entext, zhgtruth, True, 1)
                    loss = net.get_loss(decoder_outputs, zhlabel)
                    teacher_forcing_loss += sum(loss.data.cpu().numpy()) 
                    bats += 1
                score = bleuscore.score(all_pre, all_label)
            
                for i in range(0, 400):
                    print (all_en[i])
                    print (all_pre[i])
                    print (all_label[i])
                    print ('-------------------\n')
        
                all_loss /= bats
                teacher_forcing_loss /= bats
                print (global_steps, iepoch, batchid, all_loss, teacher_forcing_loss, score, '\n********************\n')
                agent.append(valid_loss, global_steps, all_loss)
                agent.append(valid_bleu, global_steps, score)
                agent.append(valid_tf_loss, global_steps, teacher_forcing_loss)
                if best_valid_loss > all_loss:
                    
                    best_valid_loss = all_loss
                    #bestscore = score
                    _ = model_dir + "ratio-{:3f}-loss-{:3f}-score-{:3f}-steps-{:d}-model.pkl".format(teacher_forcing_ratio, all_loss, score, global_steps)
                    torch.save(net.state_dict(), _)
                
                elif global_steps % 400 == 0:
                    _ = model_dir + "ratio-{:3f}-loss-{:3f}-score-{:3f}-steps-{:d}-model.pkl".format(teacher_forcing_ratio, all_loss, score, global_steps)
                    torch.save(net.state_dict(), _)

                del all_label, all_loss, all_pre
                net.train()
Ejemplo n.º 18
0
def train(args, word_idx, train_data, valid_data, dev_data, writer=None):

    # build minibatch loader
    train_batch_loader = minibatch_loader(train_data,
                                          args.batch_size,
                                          sample=1.0,
                                          punc=args.punc)

    valid_batch_loader = minibatch_loader(valid_data,
                                          args.batch_size,
                                          shuffle=False,
                                          punc=args.punc)

    dev_batch_loader = minibatch_loader(dev_data,
                                        args.batch_size,
                                        shuffle=False,
                                        punc=args.punc)

    # training phase
    if args.restore_model != None:
        logging.info("restore from previous training...")

        _, embed_dim = load_word2vec_embeddings(word_idx, args.embed_file,
                                                args.embed_dim, False)

        model = AttSum(args.n_layers, args.vocab_size, args.drop_out,
                       args.gru_size, None, embed_dim, args.train_emb)

        checkpoint = torch.load(args.restore_model + '.chkpt')

        opt = torch.optim.Adam(params=filter(lambda p: p.requires_grad,
                                             model.parameters()),
                               lr=args.init_learning_rate)

        model.load_state_dict(checkpoint)
        '''
        model.load_state_dict(checkpoint['state_dict'])
        opt.load_state_dict(checkpoint['optimizer'])
        '''

    else:
        embed_init, embed_dim = load_word2vec_embeddings(
            word_idx, args.embed_file, args.embed_dim, True)

        logging.info("embedding dim: {}".format(embed_dim))
        logging.info("initialize model ...")
        model = AttSum(args.n_layers, args.vocab_size, args.drop_out,
                       args.gru_size, embed_init, embed_dim, args.train_emb)
        opt = torch.optim.Adam(params=filter(lambda p: p.requires_grad,
                                             model.parameters()),
                               lr=args.init_learning_rate)

    if USE_CUDA:
        model.cuda()
    logging.info("Running on cuda: {}".format(USE_CUDA))

    logging.info('-' * 50)
    logging.info("Start training ...")

    best_valid_acc = best_dev_acc = 0

    for epoch in range(args.epoch):
        '''
        if epoch >= 2:
            for param_group in opt.param_groups:
                param_group['lr'] /= 2
        '''

        model.train()
        train_acc = acc = train_loss = loss = n_examples = train_examples = it = 0
        start = time.time()



        for docs, anss, docs_mask, \
            cand_position in train_batch_loader:

            train_examples += docs.shape[0]
            n_examples += docs.shape[0]

            docs, anss, docs_mask, \
                cand_position = to_vars([docs, anss, docs_mask, \
                    cand_position], use_cuda=USE_CUDA)

            opt.zero_grad()

            loss_, acc_ = model(docs, anss, docs_mask, cand_position)

            train_loss += loss_.cpu().data.numpy()[0]
            loss += loss_.cpu().data.numpy()[0]
            train_acc += acc_.cpu().data.numpy()[0]
            acc += acc_.cpu().data.numpy()[0]
            it += 1

            loss_.backward()
            clip_grad_norm(parameters=filter(lambda p: p.requires_grad,
                                             model.parameters()),
                           max_norm=args.grad_clip)
            opt.step()

            if (it % args.print_every == 0):
                # on training
                spend = (time.time() - start) / 60
                statement = "it: {} (max: {}), "\
                    .format(it, len(train_batch_loader))
                statement += "train loss: {:.3f}, acc: {:.3f}, time: {:.1f}(m)"\
                    .format(loss / float(args.print_every), acc / n_examples, spend)
                logging.info(statement)

                # on valid
                model.eval()
                start = time.time()
                valid_loss, valid_acc = evaluate(model, valid_batch_loader,
                                                 USE_CUDA, False)
                spend = (time.time() - start) / 60

                logging.info(
                    "Valid loss: {:.3f}, acc: {:.3f}, time: {:.1f}(m)".format(
                        valid_loss, valid_acc, spend))
                if best_valid_acc < valid_acc:
                    best_valid_acc = valid_acc
                    logging.info(
                        "Best valid acc: {:.3f}".format(best_valid_acc))

                # on lambada dev
                start = time.time()
                dev_loss, dev_acc = evaluate(model, dev_batch_loader, USE_CUDA,
                                             True)
                spend = (time.time() - start) / 60

                logging.info(
                    "dev loss: {:.3f}, acc: {:.3f}, time: {:.1f}(m)".format(
                        dev_loss, dev_acc, spend))
                if best_dev_acc < dev_acc:
                    best_dev_acc = dev_acc
                    logging.info("Best dev acc: {:.3f}".format(best_dev_acc))
                    if args.save_mode == 'best':
                        model_name = args.save_model + '.chkpt'
                        torch.save(model.state_dict(), model_name)
                        logging.info(
                            '    - [Info] The checkpoint file has been updated [best].'
                        )

                if writer != None:
                    it_w = it / args.print_every
                    writer.add_scalar('data/train_loss',
                                      loss / float(args.print_every), it_w)
                    writer.add_scalar('data/train_acc', acc / n_examples, it_w)
                    writer.add_scalar('data/valid_loss', valid_loss, it_w)
                    writer.add_scalar('data/valid_acc', valid_acc, it_w)
                    writer.add_scalar('data/dev_loss', dev_loss, it_w)
                    writer.add_scalar('data/dev_acc', dev_acc, it_w)

                model.train()
                start = time.time()
                acc = loss = n_examples = 0

        logging.info(
            "End: train loss: {:.3f}, acc: {:.3f}, time: {:.1f}(m)".format(
                train_loss / len(train_batch_loader),
                train_acc / train_examples, spend))

        # on valid
        start = time.time()
        model.eval()
        valid_loss, valid_acc = evaluate(model, valid_batch_loader, USE_CUDA,
                                         False)
        spend = (time.time() - start) / 60

        logging.info(
            "End: Valid loss: {:.3f}, acc: {:.3f}, time: {:.1f}(m)".format(
                valid_loss, valid_acc, spend))
        if best_valid_acc < valid_acc:
            best_valid_acc = valid_acc
            logging.info("Best valid acc: {:.3f}".format(best_valid_acc))

        # on lambada dev
        start = time.time()
        dev_loss, dev_acc = evaluate(model, dev_batch_loader, USE_CUDA, True)
        spend = (time.time() - start) / 60

        logging.info(
            "End: dev loss: {:.3f}, acc: {:.3f}, time: {:.1f}(m)".format(
                dev_loss, dev_acc, spend))
        if best_dev_acc < dev_acc:
            best_dev_acc = dev_acc
            logging.info("Best dev acc: {:.3f}".format(best_dev_acc))

    #save checkpoint
    checkpoint = {
        'state_dict': model.state_dict(),
        'optimizer': opt.state_dict()
    }
    if args.save_model:
        if args.save_mode == 'series':
            model_name = args.save_model + '.chkpt'
            torch.save(model.state_dict(), model_name)
            #torch.save(checkpoint, model_name)
            logging.info(
                '    - [Info] The checkpoint file has been updated [series].')
        elif args.save_mode == 'all':
            model_name = args.save_model + '_accu_{accu:3.3f}.chkpt'.format(
                accu=100 * valid_acc)
            torch.save(checkpoint, model_name)
            logging.info(
                '    - [Info] The checkpoint file has been updated [all].')
Ejemplo n.º 19
0
def forward(data_loader, model, criterion, epoch=0, training=True, optimizer=None, U=None, V=None):
    if args.gpus and len(args.gpus) > 1:
        model = torch.nn.DataParallel(model, args.gpus)

    batch_time = AverageMeter()
    pruning_time = AverageMeter()
    select_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    end = time.time()

    masks = [torch.zeros(w.size()).cuda() for w in list(model.parameters())]


    for i, (inputs, target) in enumerate(data_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        if args.gpus is not None:
            target = target.cuda(async=True)
        input_var = Variable(inputs.type(args.type), volatile=not training)
        target_var = Variable(target)

        # compute output
        if not training:
            output = model(input_var)
            loss = criterion(output, target_var)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(output.data, target_var.data, topk=(1, 5))
            losses.update(loss.data[0], input_var.size(0))
            top1.update(prec1[0], input_var.size(0))
            top5.update(prec5[0], input_var.size(0))

        else:

            mini_inputs = input_var.chunk(args.batch_size // args.mini_batch_size)
            mini_targets = target_var.chunk(args.batch_size // args.mini_batch_size)

            #TODO: for debugging, should be deleted
            if(0 == i):
                print('number of ghost batch is ', len(mini_inputs))

            optimizer.zero_grad()

            # fjr simulate distributed senario
            acc_grad = []
            if args.use_residue_acc:
                if torch.cuda.is_available():
                    acc_grad = [torch.zeros(w.size()).cuda() for w in list(model.parameters())]
                else:
                    print("gpu is not avaiable for acc_grad allocation")

            for k, mini_input_var in enumerate(mini_inputs):
                mini_target_var = mini_targets[k]
                output = model(mini_input_var)
                loss = criterion(output, mini_target_var)

                prec1, prec5 = accuracy(output.data, mini_target_var.data, topk=(1, 5))
                losses.update(loss.data[0], mini_input_var.size(0))
                top1.update(prec1[0], mini_input_var.size(0))
                top5.update(prec5[0], mini_input_var.size(0))

                # compute gradient and do SGD step
                # fjr
                if args.use_residue_acc:
                    # clear grad before accumulating
                    optimizer.zero_grad()

                loss.backward()

                if args.use_residue_acc:
                    if args.use_pruning:
                        clip_grad_norm(model.parameters(), 5. * (len(mini_inputs) ** -0.5))

                    idx = 0
                    for u, v, p in zip(U[k], V[k], model.parameters()):
                        prune_begin = time.time()
                        if args.use_pruning:
                            # TODO how to set rho (momentum)
                            g = p.grad.data / len(mini_inputs)
                            g += p.data * args.weight_decay / len(mini_inputs)
                            if args.use_nesterov:
                                u = args.momentum * (u + g)
                                v = v + u + g
                            else:
                                u = args.momentum * u + g
                                v = v + u
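                            # Sparsify the update: select_top_k keeps only the largest entries of the
                            # accumulated velocity v (the fraction kept apparently shrinking from 25% to
                            # 0.1% over the warm-up epochs); the masked-out residue stays in u and v.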

                            select_begin = time.time()
                            if args.use_sync and i % args.sync_interval == 0:
                                masks[idx] = 1
                            else:
                                if args.use_warmup:
                                    # print("iter", i, "node ", k, " pruning layer ", idx)
                                    if (epoch == 0):
                                        masks[idx] = select_top_k(v, 1 - 0.75, masks[idx])
                                    elif (epoch == 1):
                                        masks[idx] = select_top_k(v, 1 - 0.9375, masks[idx])
                                    elif (epoch == 2):
                                        masks[idx] = select_top_k(v, 1 - 0.984375, masks[idx])
                                    elif (epoch == 3):
                                        masks[idx] = select_top_k(v, 1 - 0.996, masks[idx])
                                    else:
                                        masks[idx] = select_top_k(v, 1 - 0.999, masks[idx])
                                else:
                                    masks[idx] = select_top_k(v, 1 - 0.999, masks[idx])
                            select_time.update(time.time() - select_begin)


                            p.grad.data = v * masks[idx]
                            v = v * (1 - masks[idx])
                            u = u * (1 - masks[idx])

                            acc_grad[idx] += p.grad.data
                            U[k][idx] = u #new_residue
                            V[k][idx] = v
                        else:
                            acc_grad[idx] += p.grad.data / len(mini_inputs)

                        pruning_time.update(time.time() - prune_begin)

                        idx = idx + 1

            if args.use_residue_acc:
                # Master
                idx = 0
                for g, p in zip(acc_grad, model.parameters()):
                    # print("accumulated sparsity is", check_sparsity(g))
                    if args.use_pruning:
                    # TODO 1. use pytorch sgd optimizer to calculate mom and weight_decay, set mom and wd
                    # used with pruning
                        p.grad.data = g
                    else:
                    # TODO 2. implement weight_decay and momentum by myself, set mom=0 and wd = 0
                    # used with baseline
                        g += p.data * args.weight_decay
                        V[k][idx] = args.momentum * V[k][idx] + g
                        p.grad.data = V[k][idx]
                        # clip_grad_norm(model.parameters(), 5.)

                    idx = idx+1

            else:
                for p in model.parameters():
                    p.grad.data.div_(len(mini_inputs))
                #print("original grad norm before clip", p.grad.data.norm())
                clip_grad_norm(model.parameters(), 5.)
                #print("original grad norm after clip", p.grad.data.norm())

            optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            logging.info('{phase} - Epoch: [{0}][{1}/{2}]\t'
                         'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                         'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                         'Prune {pruning_time.val:.9f} ({pruning_time.avg:.3f})\t'
                         'Select {select_time.val:.9f} ({select_time.avg:.3f})\t'
                         'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                         'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                         'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                             epoch, i, len(data_loader),
                             phase='TRAINING' if training else 'EVALUATING',
                             batch_time=batch_time,
                             data_time=data_time,
                             pruning_time = pruning_time,
                             select_time = select_time,
                             loss=losses, top1=top1, top5=top5))

    return {'loss': losses.avg,
            'prec1': top1.avg,
            'prec5': top5.avg,
            'U' : U,
            'V' : V}
Ejemplo n.º 20
0
def train(
    train_loader,
    model,
    act_criterion,
    comp_criterion,
    regression_criterion,
    optimizer,
    epoch,
):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    act_losses = AverageMeter()
    comp_losses = AverageMeter()
    reg_losses = AverageMeter()
    act_accuracies = AverageMeter()
    fg_accuracies = AverageMeter()
    bg_accuracies = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    optimizer.zero_grad()

    ohem_num = train_loader.dataset.fg_per_video
    comp_group_size = (
        train_loader.dataset.fg_per_video + train_loader.dataset.incomplete_per_video
    )
    for (
        i,
        (
            out_frames,
            out_prop_len,
            out_prop_scaling,
            out_prop_type,
            out_prop_labels,
            out_prop_reg_targets,
            out_stage_split,
        ),
    ) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        input_var = torch.autograd.Variable(out_frames)
        scaling_var = torch.autograd.Variable(out_prop_scaling)
        target_var = torch.autograd.Variable(out_prop_labels)
        reg_target_var = torch.autograd.Variable(out_prop_reg_targets)
        prop_type_var = torch.autograd.Variable(out_prop_type)

        # compute output

        activity_out, activity_target, completeness_out, completeness_target, regression_out, regression_labels, regression_target = model(
            input_var, scaling_var, target_var, reg_target_var, prop_type_var
        )

        act_loss = act_criterion(activity_out, activity_target)
        comp_loss = comp_criterion(
            completeness_out, completeness_target, ohem_num, comp_group_size
        )
        reg_loss = regression_criterion(
            regression_out, regression_labels, regression_target
        )

        loss = (
            act_loss
            + comp_loss * args.comp_loss_weight
            + reg_loss * args.reg_loss_weight
        )

        reg_losses.update(reg_loss.data[0], out_frames.size(0))

        # measure mAP and record loss
        losses.update(loss.data[0], out_frames.size(0))
        act_losses.update(act_loss.data[0], out_frames.size(0))
        comp_losses.update(comp_loss.data[0], out_frames.size(0))

        act_acc = accuracy(activity_out, activity_target)
        act_accuracies.update(act_acc[0].data[0], activity_out.size(0))

        fg_acc = accuracy(
            activity_out.view(-1, 2, activity_out.size(1))[:, 0, :].contiguous(),
            activity_target.view(-1, 2)[:, 0].contiguous(),
        )

        bg_acc = accuracy(
            activity_out.view(-1, 2, activity_out.size(1))[:, 1, :].contiguous(),
            activity_target.view(-1, 2)[:, 1].contiguous(),
        )

        fg_accuracies.update(fg_acc[0].data[0], activity_out.size(0) // 2)
        bg_accuracies.update(bg_acc[0].data[0], activity_out.size(0) // 2)

        # compute gradient and do SGD step
        loss.backward()

        if i % args.iter_size == 0:
            # scale down gradients when iter size is functioning
            if args.iter_size != 1:
                for g in optimizer.param_groups:
                    for p in g["params"]:
                        p.grad /= args.iter_size

            if args.clip_gradient is not None:
                total_norm = clip_grad_norm(model.parameters(), args.clip_gradient)
                if total_norm > args.clip_gradient:
                    print(
                        (
                            "clipping gradient: {} with coef {}".format(
                                total_norm, args.clip_gradient / total_norm
                            )
                        )
                    )
            else:
                total_norm = 0

            optimizer.step()
            optimizer.zero_grad()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            print(
                (
                    "Epoch: [{0}][{1}/{2}], lr: {lr:.5f}\t"
                    "Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t"
                    "Data {data_time.val:.3f} ({data_time.avg:.3f})\t"
                    "Loss {loss.val:.4f} ({loss.avg:.4f})\t"
                    "Act. Loss {act_losses.val:.3f} ({act_losses.avg: .3f}) \t"
                    "Comp. Loss {comp_losses.val:.3f} ({comp_losses.avg: .3f}) ".format(
                        epoch,
                        i,
                        len(train_loader),
                        batch_time=batch_time,
                        data_time=data_time,
                        loss=losses,
                        act_losses=act_losses,
                        comp_losses=comp_losses,
                        lr=optimizer.param_groups[0]["lr"],
                    )
                    + "\tReg. Loss {reg_loss.val:.3f} ({reg_loss.avg:.3f})".format(
                        reg_loss=reg_losses
                    )
                    + "\n Act. FG {fg_acc.val:.02f} ({fg_acc.avg:.02f}) Act. BG {bg_acc.avg:.02f} ({bg_acc.avg:.02f})".format(
                        act_acc=act_accuracies,
                        fg_acc=fg_accuracies,
                        bg_acc=bg_accuracies,
                    )
                )
            )
Ejemplo n.º 21
0
def train(args: Dict[str, str]):
    #   LJ: source corpus and target corpus
    train_data_src = read_corpus(args['--train-src'], source='src')
    train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')

    #   LJ: the validation set (source and target)
    dev_data_src = read_corpus(args['--dev-src'], source='src')
    dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')

    #   LJ: the training and validation sentences pairs
    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    #   LJ: the configurations
    train_batch_size = int(args['--batch-size'])
    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    #   LJ: read the vocabulary
    vocab = pickle.load(open(args['--vocab'], 'rb'))

    #   LJ: set up the loss function (ignore to <pad>)
    nll_loss = nn.NLLLoss(ignore_index=0)

    #   LJ: build the model
    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                local_att=bool(args['--local']),
                conv=bool(args['--conv']),
                vocab=vocab,
                loss=nll_loss)
    bound = float(args['--uniform-init'])
    for p in model.parameters():
        torch.nn.init.uniform_(p.data, a=-bound, b=bound)

    src_embed_fn = args['--src_ebed_fn']
    tgt_embed_fn = args['--tgt_ebed_fn']

    #print(src_embed_fn)
    #print(tgt_embed_fn)

    if not src_embed_fn == "None":
        src_vectors = np.load(src_embed_fn)['embedding']
        model.src_embed.weight.data = torch.from_numpy(
            src_vectors).float().cuda()

    if not tgt_embed_fn == "None":
        tgt_vectors = np.load(tgt_embed_fn)['embedding']
        model.tgt_embed.weight.data = torch.from_numpy(
            tgt_vectors).float().cuda()

    #   LJ: the learning rate
    lr = float(args['--lr'])

    #   LJ: setting some initial losses, etc.
    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cumulative_tgt_words = report_tgt_words = 0
    cumulative_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    #   LJ: setup the optimizer
    # optimizer = optim.Adam(list(model.encoder.parameters())+list(model.decoder.parameters()), lr=lr)
    optimizer = optim.Adam(filter(lambda x: x.requires_grad,
                                  model.parameters()),
                           lr=lr)

    while True:

        #   start the epoch
        epoch += 1

        #   LJ: ok, we yield the sentences in a shuffle manner.
        for src_sents, tgt_sents in batch_iter(train_data,
                                               batch_size=train_batch_size,
                                               shuffle=True):

            model.set_model_to_train()

            train_iter += 1

            #   LJ: current batch size
            batch_size = len(src_sents)

            # (batch_size)
            # LJ: train on the mini-batch and get the loss, backpropagation

            # loss = -model(src_sents, tgt_sents)
            optimizer.zero_grad()
            loss, num_words = model(src_sents, tgt_sents)
            loss.backward()
            clip_grad_norm(
                list(model.encoder.parameters()) +
                list(model.decoder.parameters()), clip_grad)
            optimizer.step()

            #   add the loss to cumlinative loss
            report_loss += loss.detach().cpu().numpy() * num_words
            cum_loss += loss.detach().cpu().numpy() * num_words

            #   LJ: how many targets words are there in all target sentences in current batch
            tgt_words_num_to_predict = sum(
                len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`

            #   LJ: all cumulative words
            report_tgt_words += tgt_words_num_to_predict
            cumulative_tgt_words += tgt_words_num_to_predict

            #   LJ: all number of instances handled
            report_examples += batch_size
            cumulative_examples += batch_size

            #   LJ: print out the training loss
            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
                                                                                         report_loss / report_examples,
                                                                                         math.exp(
                                                                                             report_loss / report_tgt_words),
                                                                                         cumulative_examples,
                                                                                         report_tgt_words / (
                                                                                                     time.time() - train_time),
                                                                                         time.time() - begin_time),
                      file=sys.stderr)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # the following code performs validation on dev set, and controls the learning schedule
            # if the dev score is better than the last check point, then the current model is saved.
            # otherwise, we allow for that performance degeneration for up to `--patience` times;
            # if the dev score does not increase after `--patience` iterations, we reload the previously
            # saved best model (and the state of the optimizer), halve the learning rate and continue
            # training. This repeats for up to `--max-num-trial` times.
            if train_iter % valid_niter == 0:
                model.set_model_to_eval()
                print(
                    'epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d'
                    % (epoch, train_iter, cum_loss / cumulative_examples,
                       np.exp(cum_loss / cumulative_tgt_words),
                       cumulative_examples),
                    file=sys.stderr)

                cum_loss = cumulative_examples = cumulative_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl and bleu
                #   LJ: the validation is implemented in a seperate function
                cum_loss, dev_ppl = model.evaluate_ppl(
                    dev_data,
                    batch_size=128)  # dev batch size can be a bit larger
                # valid_metric = -dev_ppl
                valid_metric = -cum_loss

                print('validation: iter %d, dev. ppl %f, val cum loss: %f' %
                      (train_iter, dev_ppl, cum_loss),
                      file=sys.stderr)

                #   LJ: a new better model is found.
                is_better = len(hist_valid_scores
                                ) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print('save currently the best model to [%s]' %
                          model_save_path,
                          file=sys.stderr)
                    model.save(model_save_path)

                    # You may also save the optimizer's state, adjust the training weight, since we found there are too
                    #   much iterations without improvements.
                elif patience < int(args['--patience']):
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == int(args['--patience']):
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == int(args['--max-num-trial']):
                            print('early stop!', file=sys.stderr)
                            exit(0)

                        # decay learning rate, and restore from previously best checkpoint
                        lr = lr * float(args['--lr-decay'])
                        optimizer = optim.Adam(filter(
                            lambda x: x.requires_grad, model.parameters()),
                                               lr=lr)
                        print(
                            'load previously best model and decay learning rate to %f'
                            % lr,
                            file=sys.stderr)

                        # load model
                        # model.load(model_save_path)
                        model = utils.load_model_by_state_dict(
                            model, model_save_path)

                        print('restore parameters of the optimizers',
                              file=sys.stderr)
                        # You may also need to load the state of the optimizer saved before

                        # reset patience
                        patience = 0

            if epoch == int(args['--max-epoch']):
                print('reached maximum number of epochs!', file=sys.stderr)
                exit(0)
Ejemplo n.º 22
0
def train(train_loader, model, criterion, optimizer, epoch, log):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    if args.no_partialbn:
        model.module.partialBN(False)
    else:
        model.module.partialBN(True)

    # switch to train mode
    model.train()

    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        target = target.cuda(async=True)
        input_var = torch.autograd.Variable(input)
        target_var = torch.autograd.Variable(target)

        # compute output
        output = model(input_var)
        loss = criterion(output, target_var)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
        losses.update(loss.data[0], input.size(0))
        top1.update(prec1[0], input.size(0))
        top5.update(prec5[0], input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()

        loss.backward()

        if args.clip_gradient is not None:
            total_norm = clip_grad_norm(model.parameters(), args.clip_gradient)
            # if total_norm > args.clip_gradient:
            # print("clipping gradient: {} with coef {}".format(total_norm, args.clip_gradient / total_norm))

        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            output = ('Epoch: [{0}][{1}/{2}], lr: {lr:.5f}\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                      'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                          epoch,
                          i,
                          len(train_loader),
                          batch_time=batch_time,
                          data_time=data_time,
                          loss=losses,
                          top1=top1,
                          top5=top5,
                          lr=optimizer.param_groups[-1]['lr']))
            print(output)
            log.write(output + '\n')
            log.flush()
Ejemplo n.º 23
0
      objective_loss = F.cross_entropy(input=logits_squeezed, target=Variable(torch_labels))
      if objective_loss.data[0] > 5 and epoch > 10:
        #interested in phrases that have a large loss (i.e. incorrectly classified)
        print(' '.join(tree.get_words()))

      loss_history.append(objective_loss.data[0])
      if step % 20 == 0 and step > 0:
        print("step %3d, last loss %0.3f, mean loss (%d steps) %0.3f" % (step, objective_loss.data[0], average_over, np.mean(loss_history[-average_over:])))
      optimizer.zero_grad()

      if np.isnan(objective_loss.data[0]):
        print("object_loss was not a number")
        sys.exit(1)
      else:
        objective_loss.backward()
        clip_grad_norm(model.parameters(), 5, norm_type=2.)
        #temp_grad += model.fcl._parameters['weight'].grad.data
        # # Update weights using gradient descent; w1.data and w2.data are Tensors,
        # # w1.grad and w2.grad are Variables and w1.grad.data and w2.grad.data are
        # # Tensors.
        # loss.backward()
        # w1.data -= learning_rate * w1.grad.data
        # w2.data -= learning_rate * w2.grad.data
        optimizer.step()
    print("total root predicted correctly = ", total_root_prediction/ float(train_size))
    print("total node (including root) predicted correctly = ", total_summed_accuracy / float(train_size))

    total_dev_loss = 0.
    dev_correct_at_root = 0.
    dev_correct_all = 0.
    for step, dev_example in enumerate(dev_data):
Ejemplo n.º 24
0
def train():
    logging.info('Loading vocab,train and val dataset.Wait a second,please')

    embed = torch.Tensor(np.load(args.embedding)['embedding'])
    with open(args.word2id) as f:
        word2id = json.load(f)
    vocab = utils.Vocab(embed, word2id)

    with open(args.train_dir) as f:
        examples = [json.loads(line) for line in f]
    train_dataset = utils.Dataset(examples)

    with open(args.val_dir) as f:
        examples = [json.loads(line) for line in f]
    val_dataset = utils.Dataset(examples)

    # update args
    args.embed_num = embed.size(0)
    args.embed_dim = embed.size(1)
    args.kernel_sizes = [int(ks) for ks in args.kernel_sizes.split(',')]
    # build model
    net = getattr(models, args.model)(args, embed)
    if use_gpu:
        net.cuda()
    # load dataset
    train_iter = DataLoader(dataset=train_dataset,
                            batch_size=args.batch_size,
                            shuffle=True)
    val_iter = DataLoader(dataset=val_dataset,
                          batch_size=args.batch_size,
                          shuffle=False)
    # loss function
    criterion = nn.BCELoss()
    # model info
    print(net)
    params = sum(p.numel() for p in list(net.parameters())) / 1e6
    print('#Params: %.1fM' % (params))

    min_loss = float('inf')
    optimizer = torch.optim.Adam(net.parameters(), lr=args.lr)
    net.train()

    t1 = time()
    for epoch in range(1, args.epochs + 1):
        for i, batch in enumerate(train_iter):
            features, targets, _, doc_lens = vocab.make_features(batch)
            features, targets = Variable(features), Variable(targets.float())
            if use_gpu:
                features = features.cuda()
                targets = targets.cuda()
            probs = net(features, doc_lens)
            loss = criterion(probs, targets)
            optimizer.zero_grad()
            loss.backward()
            clip_grad_norm(net.parameters(), args.max_norm)
            optimizer.step()
            if args.debug:
                print('Batch ID:%d Loss:%f' % (i, loss.data[0]))
                continue
            if i % args.report_every == 0:
                cur_loss = eval(net, vocab, val_iter, criterion)
                if cur_loss < min_loss:
                    min_loss = cur_loss
                    best_path = net.save()
                logging.info('Epoch: %2d Min_Val_Loss: %f Cur_Val_Loss: %f' %
                             (epoch, min_loss, cur_loss))
    t2 = time()
    logging.info('Total Cost:%f h' % ((t2 - t1) / 3600))
Ejemplo n.º 25
0
    def train(self):

        if self.T - self.target_sync_T > self.args.target:
            self.sync_target_network()
            self.target_sync_T = self.T

        info = {}

        for _ in range(self.args.iters):
            self.dqn.eval()

            batch, indices, is_weights = self.replay.Sample_N(self.args.batch_size, self.args.n_step, self.args.gamma)
            columns = list(zip(*batch))

            states = Variable(torch.from_numpy(np.array(columns[0])).float().transpose_(1, 3))
            actions = Variable(torch.LongTensor(columns[1]))
            terminal_states = Variable(torch.FloatTensor(columns[5]))
            rewards = Variable(torch.FloatTensor(columns[2]))
            # Have to clip rewards for DQN
            rewards = torch.clamp(rewards, -1, 1)
            steps = Variable(torch.FloatTensor(columns[4]))
            new_states = Variable(torch.from_numpy(np.array(columns[3])).float().transpose_(1, 3))

            target_dqn_qvals = self.target_dqn(new_states).cpu()
            # Make a new variable with those values so that these are treated as constants
            target_dqn_qvals_data = Variable(target_dqn_qvals.data)

            q_value_gammas = (Variable(torch.ones(terminal_states.size()[0])) - terminal_states)
            inter = Variable(torch.ones(terminal_states.size()[0]) * self.args.gamma)
            # print(steps)
            q_value_gammas = q_value_gammas * torch.pow(inter, steps)

            values = torch.linspace(self.args.v_min, self.args.v_max, steps=self.args.atoms)
            values = Variable(values)
            values = values.view(1, 1, self.args.atoms)
            values = values.expand(self.args.batch_size, self.args.actions, self.args.atoms)
            # print(values)

            q_value_gammas = q_value_gammas.view(self.args.batch_size, 1, 1)
            q_value_gammas = q_value_gammas.expand(self.args.batch_size, self.args.actions, self.args.atoms)
            # print(q_value_gammas)
            gamma_values = q_value_gammas * values
            # print(gamma_values)
            rewards = rewards.view(self.args.batch_size, 1, 1)
            rewards = rewards.expand(self.args.batch_size, self.args.actions, self.args.atoms)
            # print(rewards)
            operator_q_values = rewards + gamma_values
            # print(operator_q_values)

            clipped_operator_q_values = torch.clamp(operator_q_values, self.args.v_min, self.args.v_max)

            delta_z = (self.args.v_max - self.args.v_min) / (self.args.atoms - 1)
            # Using the notation from the categorical paper
            b_j = (clipped_operator_q_values - self.args.v_min) / delta_z
            # print(b_j)
            lower_bounds = torch.floor(b_j)
            upper_bounds = torch.ceil(b_j)

            # Work out the max action
            atom_values = Variable(torch.linspace(self.args.v_min, self.args.v_max, steps=self.args.atoms))
            atom_values = atom_values.view(1, 1, self.args.atoms)
            atom_values = atom_values.expand(self.args.batch_size, self.args.actions, self.args.atoms)

            # Sum over the atoms dimension
            target_expected_qvalues = torch.sum(target_dqn_qvals_data * atom_values, dim=2)
            # Get the maximum actions index across the batch size
            max_actions = target_expected_qvalues.max(dim=1)[1].view(-1)

            # Project back onto the original support for the max actions
            q_value_distribution_targets = torch.zeros(self.args.batch_size, self.args.atoms)

            # Distributions for the max actions
            # print(target_dqn_qvals_data, max_actions)
            q_value_max_actions_distribs = target_dqn_qvals_data.index_select(dim=1, index=max_actions)[:,0,:]
            # print(q_value_max_actions_distribs)

            # Lower_bounds_actions
            lower_bounds_actions = lower_bounds.index_select(dim=1, index=max_actions)[:,0,:]
            upper_bounds_actions = upper_bounds.index_select(dim=1, index=max_actions)[:,0,:]
            b_j_actions = b_j.index_select(dim=1, index=max_actions)[:,0,:]

            lower_bound_values_to_add = q_value_max_actions_distribs * (upper_bounds_actions - b_j_actions)
            upper_bound_values_to_add = q_value_max_actions_distribs * (b_j_actions - lower_bounds_actions)
            # print(lower_bounds_actions)
            # print(lower_bound_values_to_add)
            # Naive looping
            for b in range(self.args.batch_size):
                for l, pj in zip(lower_bounds_actions.data.type(torch.LongTensor)[b], lower_bound_values_to_add[b].data):
                    q_value_distribution_targets[b][l] += pj
                for u, pj in zip(upper_bounds_actions.data.type(torch.LongTensor)[b], upper_bound_values_to_add[b].data):
                    q_value_distribution_targets[b][u] += pj

            self.dqn.train()
            if self.args.gpu:
                actions = actions.cuda()
                # q_value_targets = q_value_targets.cuda()
                q_value_distribution_targets = q_value_distribution_targets.cuda()
            model_predictions = self.dqn(states).index_select(1, actions.view(-1))[:,0,:]
            q_value_distribution_targets = Variable(q_value_distribution_targets)
            # print(q_value_distribution_targets)
            # print(model_predictions) 

            # Cross entropy loss
            ce_loss = -torch.sum(q_value_distribution_targets * torch.log(model_predictions), dim=1)
            ce_batch_loss = ce_loss.mean()

            info = {}

            self.log("DQN/X_Entropy_Loss", ce_batch_loss.data[0], step=self.T)

            # Update
            self.optimizer.zero_grad()
            ce_batch_loss.backward()

            # Taken from pytorch clip_grad_norm
            # Remove once the pip version is up to date with source
            gradient_norm = clip_grad_norm(self.dqn.parameters(), self.args.clip_value)
            if gradient_norm is not None:
                info["Norm"] = gradient_norm

            self.optimizer.step()

            if "States" in info:
                states_trained = info["States"]
                info["States"] = states_trained + columns[0]
            else:
                info["States"] = columns[0]

        # Pad out the states to be of size batch_size
        if len(info["States"]) < self.args.batch_size:
            old_states = info["States"]
            new_states = old_states[0] * (self.args.batch_size - len(old_states))
            info["States"] = new_states

        return info
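Since clip_grad_norm returns the total gradient norm computed before clipping, that value is often logged the way the example above stores it in info["Norm"]. Below is a small sketch of that pattern using the in-place variant; the helper name and the `history` list are illustrative, not part of the original code.

from torch.nn.utils import clip_grad_norm_


def clip_and_record(parameters, max_norm, history):
    # Only parameters that received gradients contribute to the norm.
    params = [p for p in parameters if p.grad is not None]
    # Rescale gradients in place; the returned value is the pre-clip total norm.
    total_norm = clip_grad_norm_(params, max_norm)
    history.append(float(total_norm))
    return total_norm

# usage sketch: total = clip_and_record(net.parameters(), clip_value, grad_norm_history)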
Ejemplo n.º 26
0
def train(model_str, embeddings, train_iter, val_iter=None, context_size=None, early_stopping=False, save=False, save_path=None,
          model_params={}, opt_params={}, train_params={}, cuda=CUDA_DEFAULT, reshuffle_train=False, TEXT=None):
    # Initialize model and other variables
    train_iter_, val_iter_, model, criterion, optimizer, scheduler = _train_initialize_variables(model_str, embeddings, model_params, train_iter, val_iter, opt_params, cuda)

    # First validation round before any training
    if val_iter_ is not None:
        model.eval()
        print("Model initialized")
        val_loss = predict(model, val_iter_, context_size=context_size,
                           save_loss=False, expt_name="dummy_expt", cuda=cuda)
        model.train()

    if scheduler is not None:
        scheduler.step(val_loss)

    print("All set. Actual Training begins")
    for epoch in range(train_params.get('n_ep', 30)):
        # Monitoring loss
        total_loss = 0
        count = 0

        # if using NNLM, reshuffle sentences
        if model_str == 'NNLM' and reshuffle_train:
            train_iter_, _, _ = rebuild_iterators(TEXT, batch_size=int(model_params['batch_size']))

        # Initialize hidden layer and memory(for LSTM). Converting to variable later.
        if model_str in recur_models:
            model.hidden = model.init_hidden()

        # Actual training loop.     
        for x_train, y_train in data_generator(train_iter_, model_str, context_size=context_size, cuda=cuda):

            optimizer.zero_grad()

            if model_str in recur_models:
                output, model_hidden = model(x_train)
                if model.model_str == 'LSTM':
                    model.hidden = model.hidden[0].detach(), model.hidden[1].detach()  # to break the computational graph explicitly (backprop through `bptt_steps` steps only)
                else:
                    model.hidden = model.hidden.detach()  # to break the computational graph explicitly (backprop through `bptt_steps` steps only)
            else:
                output = model(x_train)

            # Dimension matching to cut it right for loss function.
            if model_str in recur_models:
                batch_size, sent_length = y_train.size(0), y_train.size(1)
                loss = criterion(output.view(batch_size, -1, sent_length), y_train)
            else:
                loss = criterion(output, y_train)

            # backprop
            loss.backward()

            # Clip gradients to prevent exploding gradients in RNN/LSTM/GRU
            if model_str in recur_models:
                clip_grad_norm(model.parameters(), model_params.get("clip_grad_norm", 5))
            optimizer.step()

            # monitoring
            count += x_train.size(0) if model.model_str == 'NNLM2' else x_train.size(0) * x_train.size(1)  # in that case there are batch_size x bptt_length classifications per batch
            total_loss += t.sum(loss.data)  # .data to break so that you dont keep references

        # monitoring
        avg_loss = total_loss / count
        print("Average loss after %d epochs is %.4f" % (epoch, avg_loss))
        if val_iter_ is not None:
            model.eval()
            former_val_loss = val_loss * 1.
            val_loss = predict(model, val_iter_, context_size=context_size,
                               save_loss=False, expt_name="dummy_expt", cuda=cuda)
            if scheduler is not None:
                scheduler.step(val_loss)
            if val_loss > former_val_loss:
                if early_stopping:
                    break
            else:
                if save:
                    assert save_path is not None
                    # weights
                    save_model(model, save_path + '.pytorch')
                    # params
                    with open(save_path + '.params.json', 'w') as fp:
                        json.dump(model.params, fp)
                    # loss
                    with open(save_path + '.losses.txt', 'w') as fp:
                        fp.write('val: ' + str(val_loss))
                        fp.write('train: ' + str(avg_loss))
            model.train()

    return model
Ejemplo n.º 27
0
 def step(self):
     """Compute gradients norm."""
     if self.max_grad_norm:
         clip_grad_norm(self.params, self.max_grad_norm)
     self.optimizer.step()
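For comparison, a self-contained wrapper with the same clip-then-step shape could look like the sketch below; the class name, the SGD choice and the attribute names are assumptions for illustration, not the original library's API.

import torch
from torch.nn.utils import clip_grad_norm_


class ClippingOptim(object):
    """Toy optimizer wrapper: clip the gradient norm, then take an SGD step."""

    def __init__(self, params, lr=0.1, max_grad_norm=5.0):
        self.params = list(params)
        self.max_grad_norm = max_grad_norm
        self.optimizer = torch.optim.SGD(self.params, lr=lr)

    def zero_grad(self):
        self.optimizer.zero_grad()

    def step(self):
        if self.max_grad_norm:
            clip_grad_norm_(self.params, self.max_grad_norm)
        self.optimizer.step()

# usage sketch: opt = ClippingOptim(model.parameters()); opt.zero_grad(); loss.backward(); opt.step()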
Ejemplo n.º 28
0
def main():
    args_parser = argparse.ArgumentParser(
        description='Tuning with stack pointer parser')
    args_parser.add_argument('--mode',
                             choices=['RNN', 'LSTM', 'GRU', 'FastLSTM'],
                             help='architecture of rnn',
                             required=True)
    args_parser.add_argument('--num_epochs',
                             type=int,
                             default=200,
                             help='Number of training epochs')
    args_parser.add_argument('--batch_size',
                             type=int,
                             default=64,
                             help='Number of sentences in each batch')
    args_parser.add_argument('--hidden_size',
                             type=int,
                             default=256,
                             help='Number of hidden units in RNN')
    args_parser.add_argument('--arc_space',
                             type=int,
                             default=128,
                             help='Dimension of arc space')
    args_parser.add_argument('--type_space',
                             type=int,
                             default=128,
                             help='Dimension of tag space')
    args_parser.add_argument('--num_layers',
                             type=int,
                             default=1,
                             help='Number of layers of RNN')
    args_parser.add_argument('--num_filters',
                             type=int,
                             default=50,
                             help='Number of filters in CNN')
    args_parser.add_argument('--pos_dim',
                             type=int,
                             default=50,
                             help='Dimension of POS embeddings')
    args_parser.add_argument('--char_dim',
                             type=int,
                             default=50,
                             help='Dimension of Character embeddings')
    args_parser.add_argument('--opt',
                             choices=['adam', 'sgd', 'adadelta'],
                             help='optimization algorithm')
    args_parser.add_argument('--learning_rate',
                             type=float,
                             default=0.001,
                             help='Learning rate')
    args_parser.add_argument('--decay_rate',
                             type=float,
                             default=0.5,
                             help='Decay rate of learning rate')
    args_parser.add_argument('--clip',
                             type=float,
                             default=5.0,
                             help='gradient clipping')
    args_parser.add_argument('--gamma',
                             type=float,
                             default=0.0,
                             help='weight for regularization')
    args_parser.add_argument('--coverage',
                             type=float,
                             default=0.0,
                             help='weight for coverage loss')
    args_parser.add_argument('--p_rnn',
                             nargs=2,
                             type=float,
                             required=True,
                             help='dropout rate for RNN')
    args_parser.add_argument('--p_in',
                             type=float,
                             default=0.33,
                             help='dropout rate for input embeddings')
    args_parser.add_argument('--p_out',
                             type=float,
                             default=0.33,
                             help='dropout rate for output layer')
    args_parser.add_argument(
        '--prior_order',
        choices=['inside_out', 'left2right', 'deep_first', 'shallow_first'],
        help='prior order of children.',
        required=True)
    args_parser.add_argument('--schedule',
                             type=int,
                             help='schedule for learning rate decay')
    args_parser.add_argument(
        '--unk_replace',
        type=float,
        default=0.,
        help='The rate to replace a singleton word with UNK')
    args_parser.add_argument('--punctuation',
                             nargs='+',
                             type=str,
                             help='List of punctuations')
    args_parser.add_argument('--beam',
                             type=int,
                             default=1,
                             help='Beam size for decoding')
    args_parser.add_argument('--word_embedding',
                             choices=['glove', 'senna', 'sskip', 'polyglot'],
                             help='Embedding for words',
                             required=True)
    args_parser.add_argument('--word_path',
                             help='path for word embedding dict')
    args_parser.add_argument('--char_embedding',
                             choices=['random', 'polyglot'],
                             help='Embedding for characters',
                             required=True)
    args_parser.add_argument('--char_path',
                             help='path for character embedding dict')
    args_parser.add_argument(
        '--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    args_parser.add_argument(
        '--dev')  # "data/POS-penn/wsj/split1/wsj1.dev.original"
    args_parser.add_argument(
        '--test')  # "data/POS-penn/wsj/split1/wsj1.test.original"
    args_parser.add_argument('--model_path',
                             help='path for saving model file.',
                             required=True)
    args_parser.add_argument('--model_name',
                             help='name for saving model file.',
                             required=True)

    args = args_parser.parse_args()

    logger = get_logger("PtrParser")

    mode = args.mode
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    model_path = args.model_path
    model_name = args.model_name
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    arc_space = args.arc_space
    type_space = args.type_space
    num_layers = args.num_layers
    num_filters = args.num_filters
    learning_rate = args.learning_rate
    opt = args.opt
    momentum = 0.9
    betas = (0.9, 0.9)
    rho = 0.9
    eps = 1e-6
    decay_rate = args.decay_rate
    clip = args.clip
    gamma = args.gamma
    cov = args.coverage
    schedule = args.schedule
    p_rnn = tuple(args.p_rnn)
    p_in = args.p_in
    p_out = args.p_out
    unk_replace = args.unk_replace
    prior_order = args.prior_order
    beam = args.beam
    punctuation = args.punctuation

    word_embedding = args.word_embedding
    word_path = args.word_path
    char_embedding = args.char_embedding
    char_path = args.char_path

    pos_dim = args.pos_dim
    word_dict, word_dim = utils.load_embedding_dict(word_embedding, word_path)
    char_dict = None
    char_dim = args.char_dim
    if char_embedding != 'random':
        char_dict, char_dim = utils.load_embedding_dict(
            char_embedding, char_path)
    logger.info("Creating Alphabets")

    alphabet_path = os.path.join(model_path, 'alphabets/')
    model_name = os.path.join(model_path, model_name)
    word_alphabet, char_alphabet, pos_alphabet, type_alphabet = conllx_stacked_data.create_alphabets(
        alphabet_path,
        train_path,
        data_paths=[dev_path, test_path],
        max_vocabulary_size=50000,
        embedd_dict=word_dict)

    num_words = word_alphabet.size()
    num_chars = char_alphabet.size()
    num_pos = pos_alphabet.size()
    num_types = type_alphabet.size()

    logger.info("Word Alphabet Size: %d" % num_words)
    logger.info("Character Alphabet Size: %d" % num_chars)
    logger.info("POS Alphabet Size: %d" % num_pos)
    logger.info("Type Alphabet Size: %d" % num_types)

    logger.info("Reading Data")
    use_gpu = torch.cuda.is_available()

    data_train = conllx_stacked_data.read_stacked_data_to_variable(
        train_path,
        word_alphabet,
        char_alphabet,
        pos_alphabet,
        type_alphabet,
        use_gpu=use_gpu,
        prior_order=prior_order)
    num_data = sum(data_train[1])

    data_dev = conllx_stacked_data.read_stacked_data_to_variable(
        dev_path,
        word_alphabet,
        char_alphabet,
        pos_alphabet,
        type_alphabet,
        use_gpu=use_gpu,
        volatile=True,
        prior_order=prior_order)
    data_test = conllx_stacked_data.read_stacked_data_to_variable(
        test_path,
        word_alphabet,
        char_alphabet,
        pos_alphabet,
        type_alphabet,
        use_gpu=use_gpu,
        volatile=True,
        prior_order=prior_order)

    punct_set = None
    if punctuation is not None:
        punct_set = set(punctuation)
        logger.info("punctuations(%d): %s" %
                    (len(punct_set), ' '.join(punct_set)))

    def construct_word_embedding_table():
        scale = np.sqrt(3.0 / word_dim)
        table = np.empty([word_alphabet.size(), word_dim], dtype=np.float32)
        table[conllx_stacked_data.UNK_ID, :] = np.random.uniform(
            -scale, scale, [1, word_dim]).astype(np.float32)
        oov = 0
        for word, index in word_alphabet.items():
            if word in word_dict:
                embedding = word_dict[word]
            elif word.lower() in word_dict:
                embedding = word_dict[word.lower()]
            else:
                embedding = np.random.uniform(-scale, scale,
                                              [1, word_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('word OOV: %d' % oov)
        return torch.from_numpy(table)

    def construct_char_embedding_table():
        if char_dict is None:
            return None

        scale = np.sqrt(3.0 / char_dim)
        table = np.empty([num_chars, char_dim], dtype=np.float32)
        table[conllx_stacked_data.UNK_ID, :] = np.random.uniform(
            -scale, scale, [1, char_dim]).astype(np.float32)
        oov = 0
        for char, index in char_alphabet.items():
            if char in char_dict:
                embedding = char_dict[char]
            else:
                embedding = np.random.uniform(-scale, scale,
                                              [1, char_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('character OOV: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()
    char_table = construct_char_embedding_table()

    window = 3
    network = StackPtrNet(word_dim,
                          num_words,
                          char_dim,
                          num_chars,
                          pos_dim,
                          num_pos,
                          num_filters,
                          window,
                          mode,
                          hidden_size,
                          num_layers,
                          num_types,
                          arc_space,
                          type_space,
                          embedd_word=word_table,
                          embedd_char=char_table,
                          p_in=p_in,
                          p_out=p_out,
                          p_rnn=p_rnn,
                          biaffine=True,
                          prior_order=prior_order)

    if use_gpu:
        network.cuda()

    pred_writer = CoNLLXWriter(word_alphabet, char_alphabet, pos_alphabet,
                               type_alphabet)
    gold_writer = CoNLLXWriter(word_alphabet, char_alphabet, pos_alphabet,
                               type_alphabet)

    def generate_optimizer(opt, lr, params):
        if opt == 'adam':
            return Adam(params,
                        lr=lr,
                        betas=betas,
                        weight_decay=gamma,
                        eps=eps)
        elif opt == 'sgd':
            return SGD(params,
                       lr=lr,
                       momentum=momentum,
                       weight_decay=gamma,
                       nesterov=True)
        elif opt == 'adadelta':
            return Adadelta(params,
                            lr=lr,
                            rho=rho,
                            weight_decay=gamma,
                            eps=eps)
        else:
            raise ValueError('Unknown optimization algorithm: %s' % opt)

    lr = learning_rate
    optim = generate_optimizer(opt, lr, network.parameters())
    opt_info = 'opt: %s, ' % opt
    if opt == 'adam':
        opt_info += 'betas=%s, eps=%.1e' % (betas, eps)
    elif opt == 'sgd':
        opt_info += 'momentum=%.2f' % momentum
    elif opt == 'adadelta':
        opt_info += 'rho=%.2f, eps=%.1e' % (rho, eps)

    logger.info("Embedding dim: word=%d, char=%d, pos=%d" %
                (word_dim, char_dim, pos_dim))
    logger.info(
        "Network: %s, num_layer=%d, hidden=%d, filter=%d, arc_space=%d, type_space=%d"
        % (mode, num_layers, hidden_size, num_filters, arc_space, type_space))
    logger.info(
        "train: cov: %.1f, (#data: %d, batch: %d, clip: %.2f, dropout(in, out, rnn): (%.2f, %.2f, %s), unk_repl: %.2f)"
        % (cov, num_data, batch_size, clip, p_in, p_out, p_rnn, unk_replace))
    logger.info('prior order: %s, beam: %d' % (prior_order, beam))
    logger.info(opt_info)

    num_batches = num_data // batch_size + 1
    dev_ucorrect = 0.0
    dev_lcorrect = 0.0
    dev_ucomlpete_match = 0.0
    dev_lcomplete_match = 0.0

    dev_ucorrect_nopunc = 0.0
    dev_lcorrect_nopunc = 0.0
    dev_ucomlpete_match_nopunc = 0.0
    dev_lcomplete_match_nopunc = 0.0
    dev_root_correct = 0.0

    best_epoch = 0

    test_ucorrect = 0.0
    test_lcorrect = 0.0
    test_ucomlpete_match = 0.0
    test_lcomplete_match = 0.0

    test_ucorrect_nopunc = 0.0
    test_lcorrect_nopunc = 0.0
    test_ucomlpete_match_nopunc = 0.0
    test_lcomplete_match_nopunc = 0.0
    test_root_correct = 0.0
    test_total = 0
    test_total_nopunc = 0
    test_total_inst = 0
    test_total_root = 0

    patient = 0
    for epoch in range(1, num_epochs + 1):
        print(
            'Epoch %d (%s, optim: %s, learning rate=%.6f, decay rate=%.2f (schedule=%d, patient=%d)): '
            % (epoch, mode, opt, lr, decay_rate, schedule, patient))
        train_err_arc_leaf = 0.
        train_err_arc_non_leaf = 0.
        train_err_type_leaf = 0.
        train_err_type_non_leaf = 0.
        train_err_cov = 0.
        train_total_leaf = 0.
        train_total_non_leaf = 0.
        start_time = time.time()
        num_back = 0
        network.train()
        for batch in range(1, num_batches + 1):
            input_encoder, input_decoder = conllx_stacked_data.get_batch_stacked_variable(
                data_train, batch_size, unk_replace=unk_replace)
            word, char, pos, heads, types, masks_e, lengths_e = input_encoder
            stacked_heads, children, stacked_types, masks_d, lengths_d = input_decoder
            optim.zero_grad()
            loss_arc_leaf, loss_arc_non_leaf, \
            loss_type_leaf, loss_type_non_leaf, \
            loss_cov, num_leaf, num_non_leaf = network.loss(word, char, pos, stacked_heads, children, stacked_types,
                                                            mask_e=masks_e, length_e=lengths_e, mask_d=masks_d, length_d=lengths_d)
            loss_arc = loss_arc_leaf + loss_arc_non_leaf
            loss_type = loss_type_leaf + loss_type_non_leaf
            loss = loss_arc + loss_type + cov * loss_cov
            loss.backward()
            clip_grad_norm(network.parameters(), clip)
            optim.step()

            num_leaf = num_leaf.data[0]
            num_non_leaf = num_non_leaf.data[0]

            train_err_arc_leaf += loss_arc_leaf.data[0] * num_leaf
            train_err_arc_non_leaf += loss_arc_non_leaf.data[0] * num_non_leaf

            train_err_type_leaf += loss_type_leaf.data[0] * num_leaf
            train_err_type_non_leaf += loss_type_non_leaf.data[0] * num_non_leaf

            train_err_cov += loss_cov.data[0] * (num_leaf + num_non_leaf)

            train_total_leaf += num_leaf
            train_total_non_leaf += num_non_leaf

            time_ave = (time.time() - start_time) / batch
            time_left = (num_batches - batch) * time_ave

            # update log
            if batch % 10 == 0:
                sys.stdout.write("\b" * num_back)
                sys.stdout.write(" " * num_back)
                sys.stdout.write("\b" * num_back)
                err_arc_leaf = train_err_arc_leaf / train_total_leaf
                err_arc_non_leaf = train_err_arc_non_leaf / train_total_non_leaf
                err_arc = err_arc_leaf + err_arc_non_leaf

                err_type_leaf = train_err_type_leaf / train_total_leaf
                err_type_non_leaf = train_err_type_non_leaf / train_total_non_leaf
                err_type = err_type_leaf + err_type_non_leaf

                err_cov = train_err_cov / (train_total_leaf +
                                           train_total_non_leaf)

                err = err_arc + err_type + cov * err_cov
                log_info = 'train: %d/%d loss (leaf, non_leaf): %.4f, arc: %.4f (%.4f, %.4f), type: %.4f (%.4f, %.4f), coverage: %.4f, time left (estimated): %.2fs' % (
                    batch, num_batches, err, err_arc, err_arc_leaf,
                    err_arc_non_leaf, err_type, err_type_leaf,
                    err_type_non_leaf, err_cov, time_left)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)

        sys.stdout.write("\b" * num_back)
        sys.stdout.write(" " * num_back)
        sys.stdout.write("\b" * num_back)
        err_arc_leaf = train_err_arc_leaf / train_total_leaf
        err_arc_non_leaf = train_err_arc_non_leaf / train_total_non_leaf
        err_arc = err_arc_leaf + err_arc_non_leaf

        err_type_leaf = train_err_type_leaf / train_total_leaf
        err_type_non_leaf = train_err_type_non_leaf / train_total_non_leaf
        err_type = err_type_leaf + err_type_non_leaf

        err_cov = train_err_cov / (train_total_leaf + train_total_non_leaf)

        err = err_arc + err_type + cov * err_cov
        print(
            'train: %d loss (leaf, non_leaf): %.4f, arc: %.4f (%.4f, %.4f), type: %.4f (%.4f, %.4f), coverage: %.4f, time: %.2fs'
            % (num_batches, err, err_arc, err_arc_leaf, err_arc_non_leaf,
               err_type, err_type_leaf, err_type_non_leaf, err_cov,
               time.time() - start_time))

        # evaluate performance on dev data
        network.eval()
        pred_filename = 'tmp/%spred_dev%d' % (str(uid), epoch)
        pred_writer.start(pred_filename)
        gold_filename = 'tmp/%sgold_dev%d' % (str(uid), epoch)
        gold_writer.start(gold_filename)

        dev_ucorr = 0.0
        dev_lcorr = 0.0
        dev_total = 0
        dev_ucomlpete = 0.0
        dev_lcomplete = 0.0
        dev_ucorr_nopunc = 0.0
        dev_lcorr_nopunc = 0.0
        dev_total_nopunc = 0
        dev_ucomlpete_nopunc = 0.0
        dev_lcomplete_nopunc = 0.0
        dev_root_corr = 0.0
        dev_total_root = 0.0
        dev_total_inst = 0.0
        for batch in conllx_stacked_data.iterate_batch_stacked_variable(
                data_dev, batch_size):
            input_encoder, _ = batch
            word, char, pos, heads, types, masks, lengths = input_encoder
            heads_pred, types_pred, _, _ = network.decode(
                word,
                char,
                pos,
                mask=masks,
                length=lengths,
                beam=beam,
                leading_symbolic=conllx_stacked_data.NUM_SYMBOLIC_TAGS)

            word = word.data.cpu().numpy()
            pos = pos.data.cpu().numpy()
            lengths = lengths.cpu().numpy()
            heads = heads.data.cpu().numpy()
            types = types.data.cpu().numpy()

            pred_writer.write(word,
                              pos,
                              heads_pred,
                              types_pred,
                              lengths,
                              symbolic_root=True)
            gold_writer.write(word,
                              pos,
                              heads,
                              types,
                              lengths,
                              symbolic_root=True)

            stats, stats_nopunc, stats_root, num_inst = parser.eval(
                word,
                pos,
                heads_pred,
                types_pred,
                heads,
                types,
                word_alphabet,
                pos_alphabet,
                lengths,
                punct_set=punct_set,
                symbolic_root=True)
            ucorr, lcorr, total, ucm, lcm = stats
            ucorr_nopunc, lcorr_nopunc, total_nopunc, ucm_nopunc, lcm_nopunc = stats_nopunc
            corr_root, total_root = stats_root

            dev_ucorr += ucorr
            dev_lcorr += lcorr
            dev_total += total
            dev_ucomlpete += ucm
            dev_lcomplete += lcm

            dev_ucorr_nopunc += ucorr_nopunc
            dev_lcorr_nopunc += lcorr_nopunc
            dev_total_nopunc += total_nopunc
            dev_ucomlpete_nopunc += ucm_nopunc
            dev_lcomplete_nopunc += lcm_nopunc

            dev_root_corr += corr_root
            dev_total_root += total_root

            dev_total_inst += num_inst

        pred_writer.close()
        gold_writer.close()
        print(
            'W. Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%%'
            % (dev_ucorr, dev_lcorr, dev_total, dev_ucorr * 100 / dev_total,
               dev_lcorr * 100 / dev_total, dev_ucomlpete * 100 /
               dev_total_inst, dev_lcomplete * 100 / dev_total_inst))
        print(
            'Wo Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%%'
            % (dev_ucorr_nopunc, dev_lcorr_nopunc, dev_total_nopunc,
               dev_ucorr_nopunc * 100 / dev_total_nopunc, dev_lcorr_nopunc *
               100 / dev_total_nopunc, dev_ucomlpete_nopunc * 100 /
               dev_total_inst, dev_lcomplete_nopunc * 100 / dev_total_inst))
        print('Root: corr: %d, total: %d, acc: %.2f%%' %
              (dev_root_corr, dev_total_root,
               dev_root_corr * 100 / dev_total_root))

        if dev_ucorrect_nopunc <= dev_ucorr_nopunc:
            dev_ucorrect_nopunc = dev_ucorr_nopunc
            dev_lcorrect_nopunc = dev_lcorr_nopunc
            dev_ucomlpete_match_nopunc = dev_ucomlpete_nopunc
            dev_lcomplete_match_nopunc = dev_lcomplete_nopunc

            dev_ucorrect = dev_ucorr
            dev_lcorrect = dev_lcorr
            dev_ucomlpete_match = dev_ucomlpete
            dev_lcomplete_match = dev_lcomplete

            dev_root_correct = dev_root_corr

            best_epoch = epoch
            patient = 0
            torch.save(network, model_name)

            pred_filename = 'tmp/%spred_test%d' % (str(uid), epoch)
            pred_writer.start(pred_filename)
            gold_filename = 'tmp/%sgold_test%d' % (str(uid), epoch)
            gold_writer.start(gold_filename)

            test_ucorrect = 0.0
            test_lcorrect = 0.0
            test_ucomlpete_match = 0.0
            test_lcomplete_match = 0.0
            test_total = 0

            test_ucorrect_nopunc = 0.0
            test_lcorrect_nopunc = 0.0
            test_ucomlpete_match_nopunc = 0.0
            test_lcomplete_match_nopunc = 0.0
            test_total_nopunc = 0
            test_total_inst = 0

            test_root_correct = 0.0
            test_total_root = 0
            for batch in conllx_stacked_data.iterate_batch_stacked_variable(
                    data_test, batch_size):
                input_encoder, _ = batch
                word, char, pos, heads, types, masks, lengths = input_encoder
                heads_pred, types_pred, _, _ = network.decode(
                    word,
                    char,
                    pos,
                    mask=masks,
                    length=lengths,
                    beam=beam,
                    leading_symbolic=conllx_stacked_data.NUM_SYMBOLIC_TAGS)

                word = word.data.cpu().numpy()
                pos = pos.data.cpu().numpy()
                lengths = lengths.cpu().numpy()
                heads = heads.data.cpu().numpy()
                types = types.data.cpu().numpy()

                pred_writer.write(word,
                                  pos,
                                  heads_pred,
                                  types_pred,
                                  lengths,
                                  symbolic_root=True)
                gold_writer.write(word,
                                  pos,
                                  heads,
                                  types,
                                  lengths,
                                  symbolic_root=True)

                stats, stats_nopunc, stats_root, num_inst = parser.eval(
                    word,
                    pos,
                    heads_pred,
                    types_pred,
                    heads,
                    types,
                    word_alphabet,
                    pos_alphabet,
                    lengths,
                    punct_set=punct_set,
                    symbolic_root=True)
                ucorr, lcorr, total, ucm, lcm = stats
                ucorr_nopunc, lcorr_nopunc, total_nopunc, ucm_nopunc, lcm_nopunc = stats_nopunc
                corr_root, total_root = stats_root

                test_ucorrect += ucorr
                test_lcorrect += lcorr
                test_total += total
                test_ucomlpete_match += ucm
                test_lcomplete_match += lcm

                test_ucorrect_nopunc += ucorr_nopunc
                test_lcorrect_nopunc += lcorr_nopunc
                test_total_nopunc += total_nopunc
                test_ucomlpete_match_nopunc += ucm_nopunc
                test_lcomplete_match_nopunc += lcm_nopunc

                test_root_correct += corr_root
                test_total_root += total_root

                test_total_inst += num_inst

            pred_writer.close()
            gold_writer.close()
        else:
            if patient < schedule:
                patient += 1
            else:
                network = torch.load(model_name)
                lr = lr * decay_rate
                optim = generate_optimizer(opt, lr, network.parameters())
                patient = 0

        print(
            '----------------------------------------------------------------------------------------------------------------------------'
        )
        print(
            'best dev  W. Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)'
            % (dev_ucorrect, dev_lcorrect, dev_total,
               dev_ucorrect * 100 / dev_total, dev_lcorrect * 100 / dev_total,
               dev_ucomlpete_match * 100 / dev_total_inst,
               dev_lcomplete_match * 100 / dev_total_inst, best_epoch))
        print(
            'best dev  Wo Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)'
            % (dev_ucorrect_nopunc, dev_lcorrect_nopunc, dev_total_nopunc,
               dev_ucorrect_nopunc * 100 / dev_total_nopunc,
               dev_lcorrect_nopunc * 100 / dev_total_nopunc,
               dev_ucomlpete_match_nopunc * 100 / dev_total_inst,
               dev_lcomplete_match_nopunc * 100 / dev_total_inst, best_epoch))
        print('best dev  Root: corr: %d, total: %d, acc: %.2f%% (epoch: %d)' %
              (dev_root_correct, dev_total_root,
               dev_root_correct * 100 / dev_total_root, best_epoch))
        print(
            '----------------------------------------------------------------------------------------------------------------------------'
        )
        print(
            'best test W. Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)'
            % (test_ucorrect, test_lcorrect, test_total, test_ucorrect * 100 /
               test_total, test_lcorrect * 100 / test_total,
               test_ucomlpete_match * 100 / test_total_inst,
               test_lcomplete_match * 100 / test_total_inst, best_epoch))
        print(
            'best test Wo Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)'
            %
            (test_ucorrect_nopunc, test_lcorrect_nopunc, test_total_nopunc,
             test_ucorrect_nopunc * 100 / test_total_nopunc,
             test_lcorrect_nopunc * 100 / test_total_nopunc,
             test_ucomlpete_match_nopunc * 100 / test_total_inst,
             test_lcomplete_match_nopunc * 100 / test_total_inst, best_epoch))
        print('best test Root: corr: %d, total: %d, acc: %.2f%% (epoch: %d)' %
              (test_root_correct, test_total_root,
               test_root_correct * 100 / test_total_root, best_epoch))
        print(
            '============================================================================================================================'
        )
Ejemplo n.º 29
0
    def train(self):

        if self.T - self.target_sync_T > self.args.target:
            self.sync_target_network()
            self.target_sync_T = self.T

        info = {}

        for _ in range(self.args.iters):
            self.dqn.eval()

            # TODO: Use a named tuple for experience replay
            n_step_sample = self.args.n_step
            batch, indices, is_weights = self.replay.Sample_N(self.args.batch_size, n_step_sample, self.args.gamma)
            columns = list(zip(*batch))

            states = Variable(torch.from_numpy(np.array(columns[0])).float().transpose_(1, 3))
            actions = Variable(torch.LongTensor(columns[1]))
            terminal_states = Variable(torch.FloatTensor(columns[5]))
            rewards = Variable(torch.FloatTensor(columns[2]))
            # Have to clip rewards for DQN
            rewards = torch.clamp(rewards, -1, 1)
            steps = Variable(torch.FloatTensor(columns[4]))
            new_states = Variable(torch.from_numpy(np.array(columns[3])).float().transpose_(1, 3))

            target_dqn_qvals = self.target_dqn(new_states).cpu()
            # Make a new variable with those values so that these are treated as constants
            target_dqn_qvals_data = Variable(target_dqn_qvals.data)

            q_value_targets = (Variable(torch.ones(terminal_states.size()[0])) - terminal_states)
            inter = Variable(torch.ones(terminal_states.size()[0]) * self.args.gamma)
            # print(steps)
            q_value_targets = q_value_targets * torch.pow(inter, steps)
            if self.args.double:
                # Double Q Learning
                new_states_qvals = self.dqn(new_states).cpu()
                new_states_qvals_data = Variable(new_states_qvals.data)
                q_value_targets = q_value_targets * target_dqn_qvals_data.gather(1, new_states_qvals_data.max(1)[1])
            else:
                q_value_targets = q_value_targets * target_dqn_qvals_data.max(1)[0]
            q_value_targets = q_value_targets + rewards

            self.dqn.train()

            one_hot_actions = torch.zeros(self.args.batch_size, self.args.actions)

            for i in range(self.args.batch_size):
                one_hot_actions[i][actions[i].data] = 1

            if self.args.gpu:
                actions = actions.cuda()
                one_hot_actions = one_hot_actions.cuda()
                q_value_targets = q_value_targets.cuda()
                new_states = new_states.cuda()
            model_predictions_q_vals, model_predictions_state = self.dqn(states, Variable(one_hot_actions))
            model_predictions = model_predictions_q_vals.gather(1, actions.view(-1, 1))

            # info = {}

            td_error = model_predictions - q_value_targets
            info["TD_Error"] = td_error.mean().data[0]

            # Update the priorities
            if not self.args.density_priority:
                self.replay.Update_Indices(indices, td_error.cpu().data.numpy(), no_pseudo_in_priority=self.args.count_td_priority)

            # If using prioritised we need to weight the td_error
            if self.args.prioritized and self.args.prioritized_is:
                # print(td_error)
                weights_tensor = torch.from_numpy(is_weights).float()
                weights_tensor = Variable(weights_tensor)
                if self.args.gpu:
                    weights_tensor = weights_tensor.cuda()
                # print(weights_tensor)
                td_error = td_error * weights_tensor

            # Model 1 step state transition error

            # Save them every x steps
            if self.T % self.args.model_save_image == 0:
                os.makedirs("{}/transition_model/{}".format(self.args.log_path, self.T))
                for ii, image, action, next_state, current_state in zip(range(self.args.batch_size), model_predictions_state.cpu().data, actions.data, new_states.cpu().data, states.cpu().data):
                    image = image.numpy()[0]
                    image = np.clip(image, 0, 1)
                    # print(next_state)
                    next_state = next_state.numpy()[0]
                    current_state = current_state.numpy()[0]

                    black_bars = np.zeros_like(next_state[:1, :])
                    # print(black_bars.shape)

                    joined_image = np.concatenate((current_state, black_bars, image, black_bars, next_state), axis=0)
                    joined_image = np.transpose(joined_image)
                    self.log_image("{}/transition_model/{}/{}_____Action_{}".format(self.args.log_path, self.T, ii + 1, action), joined_image * 255)

                    # self.log_image("{}/transition_model/{}/{}_____Action_{}".format(self.args.log_path, self.T, ii + 1, action), image * 255)
                    # self.log_image("{}/transition_model/{}/{}_____Correct".format(self.args.log_path, self.T, ii + 1), next_state * 255)

            # print(model_predictions_state)

            # Cross Entropy Loss
            # TODO

            # Regression loss
            state_error = model_predictions_state - new_states
            # state_error_val = state_error.mean().data[0]

            info["State_Error"] = state_error.mean().data[0]
            self.log("DQN/State_Loss", state_error.mean().data[0], step=self.T)
            self.log("DQN/State_Loss_Squared", state_error.pow(2).mean().data[0], step=self.T)
            self.log("DQN/State_Loss_Max", state_error.abs().max().data[0], step=self.T)
            # self.log("DQN/Action_Matrix_Norm", self.dqn.action_matrix.weight.norm().cpu().data[0], step=self.T)

            combined_loss = (1 - self.args.model_loss) * td_error.pow(2).mean() + (self.args.model_loss) * state_error.pow(2).mean()
            l2_loss = combined_loss
            # l2_loss = (combined_loss).pow(2).mean()
            info["Loss"] = l2_loss.data[0]

            # Update
            self.optimizer.zero_grad()
            l2_loss.backward()

            # Taken from pytorch clip_grad_norm
            # Remove once the pip version is up to date with source
            gradient_norm = clip_grad_norm(self.dqn.parameters(), self.args.clip_value)
            if gradient_norm is not None:
                info["Norm"] = gradient_norm

            self.optimizer.step()

            if "States" in info:
                states_trained = info["States"]
                info["States"] = states_trained + columns[0]
            else:
                info["States"] = columns[0]

        # Pad out the states to be of size batch_size
        if len(info["States"]) < self.args.batch_size:
            old_states = info["States"]
            new_states = old_states[0] * (self.args.batch_size - len(old_states))
            info["States"] = new_states

        return info
Ejemplo n.º 30
0
 def step(self, closure=None):
     """Gradient clipping aware step()."""
     if self.gclip is not None and self.gclip > 0:
         # print("aaaa")
         clip_grad_norm(self.params, self.gclip)
     self.optim.step(closure)
Ejemplo n.º 31
0
def train(train_loader, model, criterion, optimizer, epoch, batch_logger):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    acc = AverageMeter()

    if args.no_partialbn:
        model.module.partialBN(False)
    else:
        model.module.partialBN(True)

    # switch to train mode
    model.train()

    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        target = target.cuda(async=True)
        input_var = torch.autograd.Variable(input)
        target_var = torch.autograd.Variable(target)

        # compute output
        output = model(input_var)
        loss = criterion(output, target_var)

        # measure accuracy and record loss
        prec = calculate_accuracy(output.data, target)
        losses.update(loss.item(), input.size(0))
        acc.update(prec, input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()

        if args.clip_gradient is not None:
            total_norm = clip_grad_norm(model.parameters(), args.clip_gradient)
            if total_norm > args.clip_gradient:
                print("clipping gradient: {} with coef {}".format(
                    total_norm, args.clip_gradient / total_norm))

        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        batch_logger.log({
            'epoch': epoch,
            'batch': i + 1,
            'loss': losses.val,
            'acc': acc.val,
            'lr': optimizer.param_groups[-1]['lr']
        })

        if i % args.print_freq == 0:
            print(('Epoch: [{0}][{1}/{2}], lr: {lr:.5f}\t'
                   'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                   'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                   'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                   'Prec {acc.val:.3f} ({acc.avg:.3f})'.format(
                       epoch,
                       i,
                       len(train_loader),
                       batch_time=batch_time,
                       data_time=data_time,
                       loss=losses,
                       acc=acc,
                       lr=optimizer.param_groups[-1]['lr'])))
Ejemplo n.º 32
0
    def model_train(self, epoch_offset=0):
        create_dir(MODEL_SAVE_PATH)
        loss_for_regression = MSELoss()
        img_coors_json = read_json_file(BBOX_XYWH_JSON_PATH)

        optimizer = RMSprop(self.parameters(),
                            lr=LEARNING_RATE,
                            momentum=MOMENTUM)
        # optimizer = Adam(self.parameters(), lr=LEARNING_RATE)
        #         optimizer = SGD(self.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)

        scheduler = StepLR(optimizer,
                           step_size=SCHEDULER_STEP,
                           gamma=SCHEDULER_GAMMA)

        for epoch in range(EPOCHS):
            epoch_loss = 0.0
            scheduler.step(epoch)
            LOGGER.debug('Epoch: %s, Current Learning Rate: %s',
                         str(epoch + epoch_offset), str(scheduler.get_lr()))
            for image, coors in img_coors_json.items():
                path_of_image = NORMALISED_IMAGES_PATH + image
                path_of_image = path_of_image.replace('%', '_')
                img = cv2.imread(path_of_image)
                img = torch.tensor(img).float().permute(2, 0, 1).unsqueeze(0)
                img = img.to(self.device)
                predicted_width, predicted_height, predicted_midpoint = self.forward(
                    img)

                #all are scaled
                mp_x = coors[0][0]
                mp_y = coors[0][1]
                mp = torch.cat((torch.tensor([[mp_x]]).to(
                    self.device), torch.tensor([[mp_y]]).to(self.device)),
                               dim=1).float()

                w = coors[0][2]
                h = coors[0][3]
                loss1 = loss_for_regression(
                    predicted_height,
                    torch.tensor([[h]]).float().to(self.device))
                loss2 = loss_for_regression(
                    predicted_width,
                    torch.tensor([[w]]).float().to(self.device))
                loss3 = loss_for_regression(predicted_midpoint,
                                            mp.to(self.device))
                loss = loss1 + loss2 + loss3 / 2
                optimizer.zero_grad()
                loss.backward()
                clip_grad_norm(self.parameters(), 0.5)
                optimizer.step()
                epoch_loss = epoch_loss + loss.item()

            if epoch % 5 == 0:
                print('epoch: ' + str(epoch) + ' ' + 'loss: ' +
                      str(epoch_loss))
            if epoch % EPOCH_SAVE_INTERVAL == 0:
                print('saving')
                torch.save(
                    self.state_dict(), MODEL_SAVE_PATH + 'model_epc_' +
                    str(epoch + epoch_offset) + '.pt')
        torch.save(
            self.state_dict(),
            MODEL_SAVE_PATH + 'model_epc_' + str(epoch + epoch_offset) + '.pt')
Ejemplo n.º 33
0
            logits_v = net(states_v)
            log_prob_v = F.log_softmax(logits_v)
            log_prob_actions_v = batch_scale_v * log_prob_v[range(BATCH_SIZE), batch_actions_t]
            loss_policy_v = -log_prob_actions_v.mean()

            loss_policy_v.backward(retain_graph=True)
            grads = np.concatenate([p.grad.data.cpu().numpy().flatten()
                                    for p in net.parameters()
                                    if p.grad is not None])

            prob_v = F.softmax(logits_v)
            entropy_v = -(prob_v * log_prob_v).sum(dim=1).mean()
            entropy_loss_v = -ENTROPY_BETA * entropy_v
            loss_v = loss_policy_v + entropy_loss_v
            loss_v.backward()
            nn_utils.clip_grad_norm(net.parameters(), GRAD_L2_CLIP)
            optimizer.step()
            loss_v += loss_policy_v

            # calc KL-div
            new_logits_v = net(states_v)
            new_prob_v = F.softmax(new_logits_v)
            kl_div_v = -((new_prob_v / prob_v).log() * prob_v).sum(dim=1).mean()
            writer.add_scalar("kl", kl_div_v.data.cpu().numpy()[0], step_idx)

            writer.add_scalar("baseline", baseline, step_idx)
            writer.add_scalar("entropy", entropy_v.data.cpu().numpy()[0], step_idx)
            writer.add_scalar("batch_scales", np.mean(batch_scales), step_idx)
            writer.add_scalar("batch_scales_std", scale_std, step_idx)
            writer.add_scalar("loss_entropy", entropy_loss_v.data.cpu().numpy()[0], step_idx)
            writer.add_scalar("loss_policy", loss_policy_v.data.cpu().numpy()[0], step_idx)
Ejemplo n.º 34
0
def train(train_data_loader, net, criterion, optimizer, scheduler, epoch):
    net.train()
    # loss counters
    batch_time = AverageMeter()
    losses = AverageMeter()
    loc_losses = AverageMeter()
    cls_losses = AverageMeter()

    #### shuffle ####
    # NOTE: `arr`, `keys`, `my_dict` and `xxx` are not defined in this snippet;
    # they are presumably module-level globals describing index ranges per key.
    xxxx = copy.deepcopy(train_data_loader.dataset.ids)
    np.random.shuffle(arr)
    iii = 0
    for arr_i in arr:
        key = keys[arr_i]
        rang = my_dict[key]
        xxxx[iii:(iii + rang[1] - rang[0])] = xxx[rang[0]:rang[1]]
        iii += rang[1] - rang[0]
    train_data_loader.dataset.ids = copy.deepcopy(xxxx)

    # create batch iterator
    batch_iterator = None
    iter_count = 0
    t0 = time.perf_counter()

    for iteration in range(len(train_data_loader)):
        if not batch_iterator:
            batch_iterator = iter(train_data_loader)
        # load train data
        images, targets, img_indexs = next(batch_iterator)
        if args.cuda:
            images = Variable(images.cuda())
            targets = [
                Variable(anno.cuda(), volatile=True) for anno in targets
            ]
        else:
            images = Variable(images)
            targets = [Variable(anno, volatile=True) for anno in targets]
        # forward
        out = net(images, img_indexs)
        # backprop
        optimizer.zero_grad()

        loss_l, loss_c = criterion(out, targets)
        loss = loss_l + loss_c
        loss.backward()

        if args.clip_gradient is not None:
            total_norm = clip_grad_norm(net.parameters(), args.clip_gradient)
            if total_norm > args.clip_gradient:
                print("clipping gradient: {} with coef {}".format(
                    total_norm, args.clip_gradient / total_norm))

        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        loc_loss = loss_l.data[0]
        conf_loss = loss_c.data[0]
        # print('Loss data type ',type(loc_loss))
        loc_losses.update(loc_loss)
        cls_losses.update(conf_loss)
        losses.update((loc_loss + conf_loss) / 2.0)

        if iteration % args.print_step == 0 and iteration > 0:

            torch.cuda.synchronize()
            t1 = time.perf_counter()
            batch_time.update(t1 - t0)

            print_line = 'Epoch {:02d}/{:02d} Iteration {:06d}/{:06d} loc-loss {:.3f}({:.3f}) cls-loss {:.3f}({:.3f}) ' \
                         'average-loss {:.3f}({:.3f}) Timer {:0.3f}({:0.3f}) lr {:0.5f}'.format(
                epoch, args.epochs, iteration, len(train_data_loader), loc_losses.val, loc_losses.avg, cls_losses.val,
                cls_losses.avg, losses.val, losses.avg, batch_time.val, batch_time.avg, args.lr)

            torch.cuda.synchronize()
            t0 = time.perf_counter()
            log_file.write(print_line + '\n')
            print(print_line)
            iter_count += 1
            if iter_count % args.loss_reset_step == 0 and iter_count > 0:
                loc_losses.reset()
                cls_losses.reset()
                losses.reset()
                batch_time.reset()
                print('Reset accumulators of ', args.snapshot_pref, ' at',
                      iter_count * args.print_step)
                iter_count = 0
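Several of these examples call clip_grad_norm unconditionally and then print the scaling coefficient whenever the returned norm exceeds the threshold. A standalone sketch of that clip-and-report pattern with the current clip_grad_norm_ name (the toy model and threshold are assumptions):

import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_

model = nn.Linear(16, 4)            # toy stand-in
clip_value = 1.0                    # assumed threshold

loss = model(torch.randn(32, 16)).pow(2).mean()
loss.backward()

# The returned value is the norm measured *before* clipping, so comparing it
# to the threshold tells us whether the gradients were actually rescaled.
total_norm = clip_grad_norm_(model.parameters(), clip_value)
if total_norm > clip_value:
    print("clipping gradient: {} with coef {}".format(
        total_norm, clip_value / total_norm))
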
def train(model_str,
          train_iter,
          val_iter=None,
          source_embedding=None,
          target_embedding=None,
          early_stopping=False,
          save=False,
          save_path=None,
          model_params={},
          opt_params={},
          train_params={},
          cuda=CUDA_DEFAULT):
    # Initialize model and other variables
    model, criterion, optimizer, scheduler = _train_initialize_variables(model_str, model_params, opt_params, cuda, source_embedding, target_embedding)

    val_loss = 1e6
    best_val_loss = 1e6
    if scheduler is not None:
        assert val_iter is not None
        scheduler.step(val_loss)

    print("All set. Actual Training begins")
    for epoch in range(train_params.get('n_ep', 30)):
        # Monitoring loss
        total_loss = 0
        count = 0

        # Actual training loop.     
        for batch in train_iter:

            # Get data
            source = batch.src.transpose(0, 1)  # batch first
            target = batch.trg.transpose(0, 1)
            if cuda:
                source = source.cuda()
                target = target.cuda()

            # Initialize hidden layer and memory
            if model.model_str == 'LSTM':  # for LSTMA it is done in the forward because the init of the dec needs the last h of the enc
                model.hidden_enc = model.init_hidden('enc', source.size(0))
                model.hidden_dec = model.init_hidden('dec', source.size(0))

            # zero gradients
            optimizer.zero_grad()
            model.zero_grad()

            # predict
            output = model(source, target)

            # Dimension matching to cut it right for loss function.
            batch_size, sent_length = target.size(0), target.size(1)-1
            loss = criterion(output.view(batch_size, -1, sent_length), target[:, 1:])  # remove the first element of target (it is the SOS token)

            # Compute gradients, clip, and backprop
            loss.backward()
            clip_grad_norm(model.parameters(), model_params.get("clip_gradients", 5.))
            optimizer.step()

            # monitoring
            count += t.sum((target.data[:, 1:] != PAD_TOKEN).long())  # in that case there are batch_size x bbp_length classifications per batch, minus the pad tokens
            total_loss += t.sum(loss.data)  # .data so that you dont keep references

        # monitoring
        avg_loss = total_loss / count
        print("Average loss after %d epochs is %.4f" % (epoch, avg_loss))
        if val_iter is not None:
            model.eval()
            former_val_loss = val_loss * 1.
            val_loss = predict(model, val_iter, cuda=cuda)
            if scheduler is not None:
                scheduler.step(val_loss)
            if val_loss > former_val_loss:
                if early_stopping:
                    break
            else:
                if save and best_val_loss > val_loss:  # save only the best
                    best_val_loss = val_loss * 1.
                    assert save_path is not None
                    # weights
                    save_model(model, save_path + '.pytorch')
                    # params
                    with open(save_path + '.params.json', 'w') as fp:
                        json.dump(model.params, fp)
                    # loss
                    with open(save_path + '.losses.txt', 'w') as fp:
                        fp.write('val: ' + str(val_loss))
                        fp.write('train: ' + str(avg_loss))
            model.train()

    return model
Ejemplo n.º 36
0
 def step(self):
     # Clip the gradient norm if a maximum is set, then update the parameters.
     if self.max_grad_norm:
         clip_grad_norm(self.params, self.max_grad_norm)
     self.optimizer.step()
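Ejemplo n.º 36 clips inside an optimizer wrapper's step() method. A self-contained sketch of that wrapper pattern, using the current clip_grad_norm_ name (the class name ClippedOptim and the SGD defaults are assumptions, not the original implementation):

import torch
from torch.nn.utils import clip_grad_norm_


class ClippedOptim:
    """Wrap a torch optimizer and clip the gradient norm on every step."""

    def __init__(self, params, lr=0.1, max_grad_norm=1.0):
        self.params = list(params)
        self.max_grad_norm = max_grad_norm
        self.optimizer = torch.optim.SGD(self.params, lr=lr)

    def zero_grad(self):
        self.optimizer.zero_grad()

    def step(self):
        if self.max_grad_norm:
            clip_grad_norm_(self.params, self.max_grad_norm)
        self.optimizer.step()


# usage on a toy model
model = torch.nn.Linear(3, 1)
optim = ClippedOptim(model.parameters(), lr=0.05, max_grad_norm=0.5)
loss = model(torch.randn(4, 3)).sum()
optim.zero_grad()
loss.backward()
optim.step()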
Ejemplo n.º 37
0
def train(train_loader, model, act_criterion, comp_criterion,
          regression_criterion, optimizer, epoch):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    act_losses = AverageMeter()
    comp_losses = AverageMeter()
    reg_losses = AverageMeter()
    act_accuracies = AverageMeter()
    fg_accuracies = AverageMeter()
    bg_accuracies = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    optimizer.zero_grad()

    ohem_num = train_loader.dataset.fg_per_video
    comp_group_size = train_loader.dataset.fg_per_video + train_loader.dataset.incomplete_per_video
    for i, (prop_fts, prop_type, prop_labels,
            prop_reg_targets) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        batch_size = prop_fts[0].size(0)

        activity_out, activity_target, activity_prop_type, \
        completeness_out, completeness_target, \
        regression_out, regression_labels, regression_target = model((prop_fts[0], prop_fts[1]), prop_labels,
                                                                     prop_reg_targets, prop_type)

        act_loss = act_criterion(activity_out, activity_target)
        comp_loss = comp_criterion(completeness_out, completeness_target,
                                   ohem_num, comp_group_size)
        reg_loss = regression_criterion(regression_out, regression_labels,
                                        regression_target)

        loss = act_loss + comp_loss * args.comp_loss_weight + reg_loss * args.reg_loss_weight

        losses.update(loss.item(), batch_size)
        act_losses.update(act_loss.item(), batch_size)
        comp_losses.update(comp_loss.item(), batch_size)
        reg_losses.update(reg_loss.item(), batch_size)

        act_acc = accuracy(activity_out, activity_target)
        act_accuracies.update(act_acc[0].item(), activity_out.size(0))

        fg_indexer = (activity_prop_type == 0).nonzero().squeeze()
        bg_indexer = (activity_prop_type == 2).nonzero().squeeze()

        fg_acc = accuracy(activity_out[fg_indexer, :],
                          activity_target[fg_indexer])
        fg_accuracies.update(fg_acc[0].item(), len(fg_indexer))

        if len(bg_indexer) > 0:
            bg_acc = accuracy(activity_out[bg_indexer, :],
                              activity_target[bg_indexer])
            bg_accuracies.update(bg_acc[0].item(), len(bg_indexer))

        loss.backward()

        if i % args.iter_size == 0:
            # scale down gradients when iter size is functioning
            if args.iter_size != 1:
                for g in optimizer.param_groups:
                    for p in g['params']:
                        p.grad /= args.iter_size

            if args.clip_gradient is not None:
                total_norm = clip_grad_norm(model.parameters(),
                                            args.clip_gradient)
                if total_norm > args.clip_gradient:
                    logger.info("clipping gradient: {} with coef {}".format(
                        total_norm, args.clip_gradient / total_norm))
            else:
                total_norm = 0

            optimizer.step()
            optimizer.zero_grad()

        batch_time.update(time.time() - end)
        end = time.time()

        writer.add_scalar('data/loss', losses.val,
                          epoch * len(train_loader) + i + 1)
        writer.add_scalar('data/Reg_loss', reg_losses.val,
                          epoch * len(train_loader) + i + 1)
        writer.add_scalar('data/Act_loss', act_losses.val,
                          epoch * len(train_loader) + i + 1)
        writer.add_scalar('data/comp_loss', comp_losses.val,
                          epoch * len(train_loader) + i + 1)

        if i % args.print_freq == 0:
            logger.info(
                'Epoch: [{0}][{1}/{2}], lr: {lr:.5f}\t'
                'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                'Act. Loss {act_losses.val:.3f} ({act_losses.avg: .3f}) \t'
                'Comp. Loss {comp_losses.val:.3f} ({comp_losses.avg: .3f}) '.
                format(
                    epoch,
                    i,
                    len(train_loader),
                    batch_time=batch_time,
                    data_time=data_time,
                    loss=losses,
                    act_losses=act_losses,
                    comp_losses=comp_losses,
                    lr=optimizer.param_groups[0]['lr'],
                ) +
                '\tReg. Loss {reg_loss.val:.3f} ({reg_loss.avg:.3f})'.format(
                    reg_loss=reg_losses) +
                '\n Act. FG {fg_acc.val:.02f} ({fg_acc.avg:.02f}) Act. BG {bg_acc.val:.02f} ({bg_acc.avg:.02f})'
                .format(act_acc=act_accuracies,
                        fg_acc=fg_accuracies,
                        bg_acc=bg_accuracies))
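Ejemplo n.º 37 accumulates gradients over args.iter_size mini-batches, rescales them, clips, and only then takes an optimizer step. A minimal standalone sketch of that accumulation pattern (the toy model, iter_size and clip threshold are assumptions):

import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_

model = nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
iter_size, clip = 4, 5.0

optimizer.zero_grad()
for i in range(1, 13):                      # 12 toy mini-batches
    x = torch.randn(16, 8)
    y = torch.randint(0, 2, (16,))
    loss = nn.functional.cross_entropy(model(x), y)
    loss.backward()                         # gradients accumulate across batches

    if i % iter_size == 0:
        # average the accumulated gradients, then clip and step once
        for group in optimizer.param_groups:
            for p in group['params']:
                if p.grad is not None:
                    p.grad /= iter_size
        clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        optimizer.zero_grad()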
Ejemplo n.º 38
0
def train(train_loader, model, criterion, optimizer, epoch):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    cls_losses = AverageMeter()
    res_losses = AverageMeter()
    losses = AverageMeter()

    model.train()

    end = time.time()
    optimizer.zero_grad()

    for i, (feats, labels, start_offsets,
            end_offsets) in enumerate(train_loader):
        data_time.update(time.time() - end)

        input_feats = torch.autograd.Variable(feats).cuda()
        input_labels = torch.autograd.Variable(labels).cuda()
        start_offsets = torch.autograd.Variable(start_offsets).cuda().float()
        end_offsets = torch.autograd.Variable(end_offsets).cuda().float()
        pred_labels = model(input_feats)

        cls_loss = criterion[0](pred_labels[:, :2], input_labels)
        res_loss = criterion[1](pred_labels[:, 2:], input_labels.float(),
                                start_offsets, end_offsets)
        cls_losses.update(cls_loss.cpu().item(), feats.size(0))
        res_losses.update(res_loss.cpu().item(), torch.sum(labels))
        loss = cls_loss + args.lambda_reg * res_loss
        losses.update(loss.cpu().item(), feats.size(0))

        # compute gradient and do SGD step
        loss.backward()

        if args.clip_gradient is not None:
            total_norm = clip_grad_norm(model.parameters(), args.clip_gradient)
            if total_norm > args.clip_gradient:
                print('Clipping gradient: {} with coef {}'.format(
                    total_norm, args.clip_gradient / total_norm))

        optimizer.step()
        optimizer.zero_grad()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            print(
                'Epoch: [{0}][{1}/{2}], lr: {lr:.5f}\t'
                'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                'Data {data_time.val:.3f} ({data_time.avg:.3f})\n'
                'Classification Loss {cls_loss.val:.4f} ({cls_loss.avg:.4f})\t'
                'Regression Loss {res_loss.val:.4f} ({res_loss.avg:.4f})\t'
                'Loss {loss.val:.4f} ({loss.avg:.4f})\n'.format(
                    epoch,
                    i,
                    len(train_loader),
                    batch_time=batch_time,
                    data_time=data_time,
                    loss=losses,
                    cls_loss=cls_losses,
                    res_loss=res_losses,
                    lr=optimizer.param_groups[0]['lr']))
Ejemplo n.º 39
0
                # Load batch
                progress, notFinished, batch_input, batch_metadata, batch_labels = get_batch_data(args.batch_size,
                                                                                                  pick_data)
                if not notFinished:
                    break

                # zero the parameter gradients
                optimizer.zero_grad()

                # Run neural net + Calculate Loss
                outputs = net(Variable(batch_input), Variable(batch_metadata)).cuda()
                loss = criterion(outputs, Variable(batch_labels))

                # Backprop
                loss.backward()
                nnutils.clip_grad_norm(net.parameters(), 1.0)
                optimizer.step()

                # Update progress bar
                pb.animate(progress)
                batch_counter += 1
                sum_counter += 1
                sum += loss.data[0]

                if sum_counter == 80:
                    log_file.write(
                        '\n' + str(batch_counter) + ',' + str(sum / sum_counter))
                    log_file.flush()
                    sum = 0
                    sum_counter = 0
def train(train_iter, dev_iter, test_iter, model, args):
    if args.cuda:
        model.cuda()
        torch.cuda.manual_seed(hyperparams.seed_num)

    # optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-8)
    # optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.init_weight_decay)
    # optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,momentum=)
    # optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)

    if args.Adam is True:
        print("Adam Training......")
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.lr,
                                     weight_decay=args.init_weight_decay)
    elif args.SGD is True:
        print("SGD Training.......")
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=args.lr,
                                    weight_decay=args.init_weight_decay,
                                    momentum=args.momentum_value)
    elif args.Adadelta is True:
        print("Adadelta Training.......")
        optimizer = torch.optim.Adadelta(model.parameters(),
                                         lr=args.lr,
                                         weight_decay=args.init_weight_decay)

    # lambda1 = lambda epoch: epoch // 30
    # lambda2 = lambda epoch: 0.99 ** epoch
    # print("lambda1 {} lambda2 {} ".format(lambda1, lambda2))
    # scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=[lambda2])

    # scheduler = lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.9)

    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min')

    steps = 0
    epoch_step = 0
    model_count = 0
    model.train()
    for epoch in range(1, args.epochs + 1):
        print("\n## 第{} 轮迭代,共计迭代 {} 次 !##\n".format(epoch, args.epochs))
        # scheduler.step()
        # print("now lr is {} \n".format(scheduler.get_lr()))
        print("now lr is {} \n".format(optimizer.param_groups[0].get("lr")))
        for batch in train_iter:
            feature, target = batch.text, batch.label
            feature.data.t_(), target.data.sub_(1)  # batch first, index align
            if args.cuda:
                feature, target = feature.cuda(), target.cuda()

            optimizer.zero_grad()
            model.zero_grad()

            logit = model(feature)
            loss = F.cross_entropy(logit, target)
            loss.backward()
            if args.init_clip_max_norm is not None:
                utils.clip_grad_norm(model.parameters(),
                                     max_norm=args.init_clip_max_norm)
            optimizer.step()

            steps += 1
            if steps % args.log_interval == 0:
                train_size = len(train_iter.dataset)
                corrects = (torch.max(logit, 1)[1].view(
                    target.size()).data == target.data).sum()
                accuracy = float(corrects) / batch.batch_size * 100.0
                sys.stdout.write(
                    '\rBatch[{}/{}] - loss: {:.6f}  acc: {:.4f}%({}/{})'.
                    format(steps, train_size, loss.data[0], accuracy, corrects,
                           batch.batch_size))
            if steps % args.test_interval == 0:
                eval(dev_iter, model, args)
            if steps % args.save_interval == 0:
                if not os.path.isdir(args.save_dir):
                    os.makedirs(args.save_dir)
                save_prefix = os.path.join(args.save_dir, 'snapshot')
                save_path = '{}_steps{}.pt'.format(save_prefix, steps)
                torch.save(model, save_path)
                print("\n", save_path, end=" ")
                test_model = torch.load(save_path)
                model_count += 1
                test_eval(test_iter, test_model, save_path, args, model_count)
    return model_count
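The loop above builds a ReduceLROnPlateau scheduler but leaves the scheduler.step() call commented out; unlike step-based schedulers, ReduceLROnPlateau has to be fed the monitored metric. A minimal sketch of wiring it up next to gradient clipping (the toy model, data and patience value are assumptions):

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_
from torch.optim import lr_scheduler

model = nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2)

for epoch in range(5):
    x, y = torch.randn(32, 8), torch.randint(0, 2, (32,))
    loss = F.cross_entropy(model(x), y)
    optimizer.zero_grad()
    loss.backward()
    clip_grad_norm_(model.parameters(), max_norm=5.0)
    optimizer.step()

    val_loss = loss.item()        # stand-in for a real validation loss
    scheduler.step(val_loss)      # ReduceLROnPlateau needs the metric, not a bare step()
    print(epoch, optimizer.param_groups[0]['lr'])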
Ejemplo n.º 41
0
def main():
    parser = argparse.ArgumentParser(
        description='Tuning with Multitask bi-directional RNN-CNN-CRF')
    parser.add_argument('--config',
                        help='Config file (Python file format)',
                        default="config_multitask.py")
    parser.add_argument('--grid', help='Grid Search Options', default="{}")
    args = parser.parse_args()
    logger = get_logger("Multi-Task")
    use_gpu = torch.cuda.is_available()

    # Config Tensorboard Writer
    log_writer = SummaryWriter()

    # Load from config file
    spec = importlib.util.spec_from_file_location("config", args.config)
    config_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(config_module)
    config = config_module.entries

    # Load options from grid search
    options = eval(args.grid)
    for k, v in options.items():
        if isinstance(v, six.string_types):
            cmd = "%s = \"%s\"" % (k, v)
        else:
            cmd = "%s = %s" % (k, v)
            log_writer.add_scalar(k, v, 1)
        exec(cmd)

    # Load embedding dict
    embedding = config.embedding.embedding_type
    embedding_path = config.embedding.embedding_dict
    embedd_dict, embedd_dim = utils.load_embedding_dict(
        embedding, embedding_path)

    # Collect data path
    data_dir = config.data.data_dir
    data_names = config.data.data_names
    train_paths = [
        os.path.join(data_dir, data_name, "train.tsv")
        for data_name in data_names
    ]
    dev_paths = [
        os.path.join(data_dir, data_name, "devel.tsv")
        for data_name in data_names
    ]
    test_paths = [
        os.path.join(data_dir, data_name, "test.tsv")
        for data_name in data_names
    ]

    # Create alphabets
    logger.info("Creating Alphabets")
    if not os.path.exists('tmp'):
        os.mkdir('tmp')
    word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet, ner_alphabet_task, label_reflect  = \
            bionlp_data.create_alphabets(os.path.join(Path(data_dir).abspath(), "alphabets", "_".join(data_names)), train_paths,
                    data_paths=dev_paths + test_paths, use_cache=True,
                    embedd_dict=embedd_dict, max_vocabulary_size=50000)

    logger.info("Word Alphabet Size: %d" % word_alphabet.size())
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
    logger.info("Chunk Alphabet Size: %d" % chunk_alphabet.size())
    logger.info("NER Alphabet Size: %d" % ner_alphabet.size())
    logger.info(
        "NER Alphabet Size per Task: %s",
        str([task_alphabet.size() for task_alphabet in ner_alphabet_task]))

    #task_reflects = torch.LongTensor(reverse_reflect(label_reflect, ner_alphabet.size()))
    #if use_gpu:
    #    task_reflects = task_reflects.cuda()

    if embedding == 'elmo':
        logger.info("Loading ELMo Embedder")
        ee = ElmoEmbedder(options_file=config.embedding.elmo_option,
                          weight_file=config.embedding.elmo_weight,
                          cuda_device=config.embedding.elmo_cuda)
    else:
        ee = None

    logger.info("Reading Data")

    # Prepare dataset
    data_trains = [
        bionlp_data.read_data_to_variable(train_path,
                                          word_alphabet,
                                          char_alphabet,
                                          pos_alphabet,
                                          chunk_alphabet,
                                          ner_alphabet_task[task_id],
                                          use_gpu=use_gpu,
                                          elmo_ee=ee)
        for task_id, train_path in enumerate(train_paths)
    ]
    num_data = [sum(data_train[1]) for data_train in data_trains]
    num_labels = ner_alphabet.size()
    num_labels_task = [task_item.size() for task_item in ner_alphabet_task]

    data_devs = [
        bionlp_data.read_data_to_variable(dev_path,
                                          word_alphabet,
                                          char_alphabet,
                                          pos_alphabet,
                                          chunk_alphabet,
                                          ner_alphabet_task[task_id],
                                          use_gpu=use_gpu,
                                          volatile=True,
                                          elmo_ee=ee)
        for task_id, dev_path in enumerate(dev_paths)
    ]

    data_tests = [
        bionlp_data.read_data_to_variable(test_path,
                                          word_alphabet,
                                          char_alphabet,
                                          pos_alphabet,
                                          chunk_alphabet,
                                          ner_alphabet_task[task_id],
                                          use_gpu=use_gpu,
                                          volatile=True,
                                          elmo_ee=ee)
        for task_id, test_path in enumerate(test_paths)
    ]

    writer = BioNLPWriter(word_alphabet, char_alphabet, pos_alphabet,
                          chunk_alphabet, ner_alphabet)

    def construct_word_embedding_table():
        scale = np.sqrt(3.0 / embedd_dim)
        table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32)
        table[bionlp_data.UNK_ID, :] = np.random.uniform(
            -scale, scale, [1, embedd_dim]).astype(np.float32)
        oov = 0
        for word, index in word_alphabet.items():
            if embedd_dict is not None and word in embedd_dict:
                embedding = embedd_dict[word]
            elif embedd_dict is not None and word.lower() in embedd_dict:
                embedding = embedd_dict[word.lower()]
            else:
                embedding = np.random.uniform(
                    -scale, scale, [1, embedd_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('oov: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()
    logger.info("constructing network...")

    # Construct network
    window = 3
    num_layers = 1
    mode = config.rnn.mode
    hidden_size = config.rnn.hidden_size
    char_dim = config.rnn.char_dim
    num_filters = config.rnn.num_filters
    tag_space = config.rnn.tag_space
    bigram = config.rnn.bigram
    attention_mode = config.rnn.attention
    if config.rnn.dropout == 'std':
        network = MultiTaskBiRecurrentCRF(
            len(data_trains),
            embedd_dim,
            word_alphabet.size(),
            char_dim,
            char_alphabet.size(),
            num_filters,
            window,
            mode,
            hidden_size,
            num_layers,
            num_labels,
            num_labels_task=num_labels_task,
            tag_space=tag_space,
            embedd_word=word_table,
            p_in=config.rnn.p,
            p_rnn=config.rnn.p,
            bigram=bigram,
            elmo=(embedding == 'elmo'),
            attention_mode=attention_mode,
            adv_loss_coef=config.multitask.adv_loss_coef,
            diff_loss_coef=config.multitask.diff_loss_coef,
            char_level_rnn=config.rnn.char_level_rnn)
    else:
        raise NotImplementedError

    if use_gpu:
        network.cuda()

    # Prepare training
    unk_replace = config.embedding.unk_replace
    num_epochs = config.training.num_epochs
    batch_size = config.training.batch_size
    lr = config.training.learning_rate
    momentum = config.training.momentum
    alpha = config.training.alpha
    lr_decay = config.training.lr_decay
    schedule = config.training.schedule
    gamma = config.training.gamma

    # optim = SGD(network.parameters(), lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True)
    optim = RMSprop(network.parameters(),
                    lr=lr,
                    alpha=alpha,
                    momentum=momentum,
                    weight_decay=gamma)
    logger.info(
        "Network: %s, num_layer=%d, hidden=%d, filter=%d, tag_space=%d, crf=%s"
        % (mode, num_layers, hidden_size, num_filters, tag_space,
           'bigram' if bigram else 'unigram'))
    logger.info(
        "training: l2: %f, (#training data: %s, batch: %d, dropout: %.2f, unk replace: %.2f)"
        % (gamma, num_data, batch_size, config.rnn.p, unk_replace))

    num_batches = [x // batch_size + 1 for x in num_data]
    dev_f1 = [0.0 for x in num_data]
    dev_acc = [0.0 for x in num_data]
    dev_precision = [0.0 for x in num_data]
    dev_recall = [0.0 for x in num_data]
    test_f1 = [0.0 for x in num_data]
    test_acc = [0.0 for x in num_data]
    test_precision = [0.0 for x in num_data]
    test_recall = [0.0 for x in num_data]
    best_epoch = [0 for x in num_data]

    # Training procedure
    for epoch in range(1, num_epochs + 1):
        print(
            'Epoch %d (%s(%s), learning rate=%.4f, decay rate=%.4f (schedule=%d)): '
            % (epoch, mode, config.rnn.dropout, lr, lr_decay, schedule))
        train_err = 0.
        train_total = 0.

        # Gradient decent on training data
        start_time = time.time()
        num_back = 0
        network.train()
        batch_count = 0
        for batch in range(1, 2 * num_batches[0] + 1):
            r = random.random()
            task_id = 0 if r <= 0.5 else random.randint(1, len(num_data) - 1)
            #if batch > num_batches[task_id]:
            #    batch = batch % num_batches[task_id] + 1
            batch_count += 1
            word, char, _, _, labels, masks, lengths, elmo_embedding = bionlp_data.get_batch_variable(
                data_trains[task_id], batch_size, unk_replace=unk_replace)

            optim.zero_grad()
            loss, task_loss, adv_loss, diff_loss = network.loss(
                task_id,
                word,
                char,
                labels,
                mask=masks,
                elmo_word=elmo_embedding)
            #log_writer.add_scalars(
            #        'train_loss_task' + str(task_id),
            #        {'all_loss': loss, 'task_loss': task_loss, 'adv_loss': adv_loss, 'diff_loss': diff_loss},
            #        (epoch - 1) * (num_batches[task_id] + 1) + batch
            #)
            #log_writer.add_scalars(
            #        'train_loss_overview',
            #        {'all_loss': loss, 'task_loss': task_loss, 'adv_loss': adv_loss, 'diff_loss': diff_loss},
            #        (epoch - 1) * (sum(num_batches) + 1) + batch_count
            #)
            loss.backward()
            clip_grad_norm(network.parameters(), 5.0)
            optim.step()

            num_inst = word.size(0)
            train_err += loss.data[0] * num_inst
            train_total += num_inst

            time_ave = (time.time() - start_time) / batch
            time_left = (2 * num_batches[0] - batch) * time_ave

            # update log
            if batch % 100 == 0:
                sys.stdout.write("\b" * num_back)
                sys.stdout.write(" " * num_back)
                sys.stdout.write("\b" * num_back)
                log_info = 'train: %d/%d loss: %.4f, time left (estimated): %.2fs' % (
                    batch, 2 * num_batches[0], train_err / train_total,
                    time_left)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)

        sys.stdout.write("\b" * num_back)
        sys.stdout.write(" " * num_back)
        sys.stdout.write("\b" * num_back)
        print('train: %d loss: %.4f, time: %.2fs' %
              (2 * num_batches[0], train_err / train_total,
               time.time() - start_time))

        # Evaluate performance on dev data
        network.eval()
        for task_id in range(len(num_batches)):
            tmp_filename = 'tmp/%s_dev%d%d' % (str(uid), epoch, task_id)
            writer.start(tmp_filename)

            for batch in bionlp_data.iterate_batch_variable(
                    data_devs[task_id], batch_size):
                word, char, pos, chunk, labels, masks, lengths, elmo_embedding = batch
                preds, _ = network.decode(
                    task_id,
                    word,
                    char,
                    target=labels,
                    mask=masks,
                    leading_symbolic=bionlp_data.NUM_SYMBOLIC_TAGS,
                    elmo_word=elmo_embedding)
                writer.write(word.data.cpu().numpy(),
                             pos.data.cpu().numpy(),
                             chunk.data.cpu().numpy(),
                             preds.cpu().numpy(),
                             labels.data.cpu().numpy(),
                             lengths.cpu().numpy())
            writer.close()
            acc, precision, recall, f1 = evaluate(tmp_filename)
            log_writer.add_scalars(
                'dev_task' + str(task_id), {
                    'accuracy': acc,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1
                }, epoch)
            print(
                'dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
                % (acc, precision, recall, f1))

            if dev_f1[task_id] < f1:
                dev_f1[task_id] = f1
                dev_acc[task_id] = acc
                dev_precision[task_id] = precision
                dev_recall[task_id] = recall
                best_epoch[task_id] = epoch

                # Evaluate on test data when better performance detected
                tmp_filename = 'tmp/%s_test%d%d' % (str(uid), epoch, task_id)
                writer.start(tmp_filename)

                for batch in bionlp_data.iterate_batch_variable(
                        data_tests[task_id], batch_size):
                    word, char, pos, chunk, labels, masks, lengths, elmo_embedding = batch
                    preds, _ = network.decode(
                        task_id,
                        word,
                        char,
                        target=labels,
                        mask=masks,
                        leading_symbolic=bionlp_data.NUM_SYMBOLIC_TAGS,
                        elmo_word=elmo_embedding)
                    writer.write(word.data.cpu().numpy(),
                                 pos.data.cpu().numpy(),
                                 chunk.data.cpu().numpy(),
                                 preds.cpu().numpy(),
                                 labels.data.cpu().numpy(),
                                 lengths.cpu().numpy())
                writer.close()
                test_acc[task_id], test_precision[task_id], test_recall[
                    task_id], test_f1[task_id] = evaluate(tmp_filename)
                log_writer.add_scalars(
                    'test_task' + str(task_id), {
                        'accuracy': test_acc[task_id],
                        'precision': test_precision[task_id],
                        'recall': test_recall[task_id],
                        'f1': test_f1[task_id]
                    }, epoch)

            print(
                "================================================================================"
            )
            print("dataset: %s" % data_names[task_id])
            print(
                "best dev  acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
                % (dev_acc[task_id], dev_precision[task_id],
                   dev_recall[task_id], dev_f1[task_id], best_epoch[task_id]))
            print(
                "best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
                %
                (test_acc[task_id], test_precision[task_id],
                 test_recall[task_id], test_f1[task_id], best_epoch[task_id]))
            print(
                "================================================================================\n"
            )

            if epoch % schedule == 0:
                # lr = learning_rate / (1.0 + epoch * lr_decay)
                # optim = SGD(network.parameters(), lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True)
                lr = lr * lr_decay
                optim.param_groups[0]['lr'] = lr

    # log_writer.export_scalars_to_json("./all_scalars.json")
    log_writer.close()
Ejemplo n.º 42
0
def fit_model(model, loss_op, optim_op, train_gen, val_gen, epochs,
              checkpoint_path, patience):
    """ Analog to Keras fit_generator function.

    # Arguments:
        model: Model to be finetuned.
        loss_op: loss operation (BCEWithLogitsLoss or CrossEntropy for e.g.)
        optim_op: optimization operation (Adam e.g.)
        train_gen: Training data iterator (DataLoader)
        val_gen: Validation data iterator (DataLoader)
        epochs: Number of epochs.
        checkpoint_path: Filepath where weights will be checkpointed to
            during training. This file will be rewritten by the function.
        patience: Number of epochs without validation-loss improvement
            allowed before training stops early.

    # Returns:
        None. The best weights (lowest validation loss) are checkpointed to
        `checkpoint_path` during training.
    """
    # Save original checkpoint
    torch.save(model.state_dict(), checkpoint_path)

    model.eval()
    best_loss = np.mean([calc_loss(loss_op, model(Variable(xv)), Variable(yv)).data.cpu().numpy()[0] for xv, yv in val_gen])
    print("original val loss", best_loss)

    epoch_without_impr = 0
    for epoch in range(epochs):
        for i, data in enumerate(train_gen):
            X_train, y_train = data
            X_train = Variable(X_train, requires_grad=False)
            y_train = Variable(y_train, requires_grad=False)
            model.train()
            optim_op.zero_grad()
            output = model(X_train)
            loss = calc_loss(loss_op, output, y_train)
            loss.backward()
            clip_grad_norm(model.parameters(), 1)
            optim_op.step()

            acc = evaluate_using_acc(model, [(X_train.data, y_train.data)])
            print("== Epoch", epoch, "step", i, "train loss", loss.data.cpu().numpy()[0], "train acc", acc)

        model.eval()
        acc = evaluate_using_acc(model, val_gen)
        print("val acc", acc)

        val_loss = np.mean([calc_loss(loss_op, model(Variable(xv)), Variable(yv)).data.cpu().numpy()[0] for xv, yv in val_gen])
        print("val loss", val_loss)
        if best_loss is not None and val_loss >= best_loss:
            epoch_without_impr += 1
            print('No improvement over previous best loss: ', best_loss)

        # Save checkpoint
        if best_loss is None or val_loss < best_loss:
            best_loss = val_loss
            torch.save(model.state_dict(), checkpoint_path)
            print('Saving model at', checkpoint_path)

        # Early stopping
        if epoch_without_impr >= patience:
            break
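A compressed, self-contained sketch of the checkpoint-and-patience logic that fit_model implements above (the toy model, data, filename and patience value are assumptions):

import torch
import torch.nn as nn

model = nn.Linear(4, 1)
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.MSELoss()
x, y = torch.randn(64, 4), torch.randn(64, 1)
x_val, y_val = torch.randn(16, 4), torch.randn(16, 1)

best_loss, patience, epochs_without_improvement = float('inf'), 3, 0
for epoch in range(20):
    model.train()
    optimizer.zero_grad()
    loss_fn(model(x), y).backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()

    model.eval()
    with torch.no_grad():
        val_loss = loss_fn(model(x_val), y_val).item()

    if val_loss < best_loss:                  # improvement: checkpoint the weights
        best_loss, epochs_without_improvement = val_loss, 0
        torch.save(model.state_dict(), 'checkpoint.pt')
    else:                                     # no improvement: count toward patience
        epochs_without_improvement += 1
        if epochs_without_improvement >= patience:
            break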
Ejemplo n.º 43
0
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch PennTreeBank RNN/LSTM Language Model')
    parser.add_argument('--data',
                        type=str,
                        default='../data/',
                        help='location of the data corpus')
    parser.add_argument('--presaved',
                        action='store_true',
                        help='use presaved data')
    parser.add_argument('--glovedata',
                        type=str,
                        default='../data/glove.6B',
                        help='location of the pretrained glove embeddings')
    parser.add_argument('--din', type=int, default=30, help='length of LSTM')
    parser.add_argument('--demb',
                        type=int,
                        default=100,
                        help='size of word embeddings')
    parser.add_argument('--dhid',
                        type=int,
                        default=100,
                        help='number of hidden units per layer')
    parser.add_argument('--dout',
                        type=int,
                        default=2,
                        help='number of output classes')
    parser.add_argument('--nlayers',
                        type=int,
                        default=1,
                        help='number of layers')
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        help='initial learning rate')
    parser.add_argument('--clip',
                        type=float,
                        default=0.25,
                        help='gradient clipping')
    parser.add_argument('--embinit',
                        type=str,
                        default='random',
                        help='embedding weight initialization type')
    parser.add_argument('--decinit',
                        type=str,
                        default='random',
                        help='decoder weight initialization type')
    parser.add_argument('--hidinit',
                        type=str,
                        default='random',
                        help='recurrent hidden weight initialization type')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.0,
                        help='dropout applied to layers (0 = no dropout)')
    parser.add_argument('--reweight',
                        action='store_true',
                        help='reweight loss function')
    parser.add_argument('--epochs',
                        type=int,
                        default=40,
                        help='upper epoch limit')
    parser.add_argument('--batchsize',
                        type=int,
                        default=20,
                        metavar='N',
                        help='batch size')
    parser.add_argument('--seed', type=int, default=3, help='random seed')
    parser.add_argument('--vocabsize',
                        type=int,
                        default=200000,
                        help='vocabulary size')
    parser.add_argument('--optimizer',
                        action='store_true',
                        help='use ADAM optimizer')
    parser.add_argument('--pipeline',
                        action='store_true',
                        help='use pipeline file')
    parser.add_argument('--psw', type=int, default=1, help='remove stop words')
    parser.add_argument('--ppunc',
                        action='store_true',
                        help='remove punctuation')
    parser.add_argument('--pntok',
                        action='store_true',
                        help='use number tokens')
    parser.add_argument('--pkq',
                        action='store_true',
                        help='keep question words')
    parser.add_argument('--stem', action='store_true', help='use stemmer')
    parser.add_argument('--lemma', action='store_true', help='use lemmatizer')
    parser.add_argument('--freezeemb',
                        action='store_false',
                        help='freezes embeddings')
    parser.add_argument('--cuda', action='store_true', help='use CUDA')
    parser.add_argument('--loginterval',
                        type=int,
                        default=100,
                        metavar='N',
                        help='report interval')
    parser.add_argument('--save',
                        type=str,
                        default='',
                        help='path to save the final model')
    args = parser.parse_args()

    pipe = None
    stemmer, lemmatizer = None, None
    if args.pipeline:
        if args.stem:
            stemmer = SnowballStemmer('english')
        elif args.lemma:
            lemmatizer = WordNetLemmatizer()

    if not args.presaved:
        pipe = functools.partial(pipeline,
                                 rm_stop_words=args.psw,
                                 rm_punc=args.ppunc,
                                 number_token=args.pntok,
                                 keep_questions=args.pkq,
                                 stemmer=stemmer,
                                 lemmatizer=lemmatizer)
        corpus = TacoText(args.vocabsize, lower=True, vocab_pipe=pipe)
        X, y, X_val, y_val = load_data(args.data,
                                       corpus,
                                       args.din,
                                       train_split=0.9)

    else:
        print('Loading Presaved Data')
        X = torch.load(args.data + 'train_x.t')
        y = torch.load(args.data + 'train_y.t')
        X_val = torch.load(args.data + 'val_x.t')
        y_val = torch.load(args.data + 'val_y.t')
        with open(args.data + 'corpus.pkl', 'rb') as f:
            corpus = pkl.load(f)

    if args.cuda:
        X, y = X.cuda(), y.cuda()
        X_val, y_val = X_val.cuda(), y_val.cuda()

    print('Generating Data Loaders')
    #X.size len(train_data),1,2,fix_length
    train_dataset = TensorDataset(X, y)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batchsize,
                              shuffle=True)
    valid_loader = DataLoader(TensorDataset(X_val, y_val),
                              batch_size=args.batchsize,
                              shuffle=False)

    ntokens = len(corpus)
    glove_embeddings = None
    if args.embinit == 'glove':
        assert args.demb in (50, 100, 200, 300)
        glove_embeddings = get_glove_embeddings(args.glovedata,
                                                corpus.dictionary.word2idx,
                                                ntokens, args.demb)

    model = LSTMModelMLP(args.din, args.dhid, args.nlayers, args.dout,
                         args.demb, args.vocabsize, args.dropout, args.embinit,
                         args.hidinit, args.decinit, glove_embeddings,
                         args.cuda)

    if args.cuda:
        model.cuda()

    if args.reweight:
        w_tensor = torch.Tensor([1.309028344, 0.472001959])
        if args.cuda:
            w_tensor = w_tensor.cuda()
        criterion = nn.NLLLoss(weight=w_tensor)
    else:
        criterion = nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    model_config = '\t'.join([
        str(x)
        for x in (torch.__version__, args.clip, args.nlayers, args.din,
                  args.demb, args.dhid, args.embinit, args.decinit,
                  args.hidinit, args.dropout, args.optimizer, args.reweight,
                  args.lr, args.vocabsize, args.pipeline, args.psw, args.ppunc,
                  args.pntok, args.pkq, args.stem, args.lemma)
    ])

    print(
        'Pytorch | Clip | #Layers | InSize | EmbDim | HiddenDim | EncoderInit | DecoderInit | WeightInit | Dropout | Optimizer | Reweight | LR | VocabSize | pipeline | stop | punc | ntoken | keep_ques | stem | lemma'
    )
    print(model_config)

    # best_val_acc = 0.78
    best_ll = 0.48
    for epoch in range(args.epochs):
        model.train()
        total_cost = 0
        start_time = time.time()
        cur_loss = 0
        for ind, (qs, duplicate) in enumerate(train_loader):
            model.zero_grad()
            pred = model(qs[:, 0, 0, :], qs[:, 0, 1, :])
            if args.cuda:
                pred = pred.cuda()
                duplicate = duplicate.cuda()
            duplicate = Variable(duplicate)
            loss = criterion(pred, duplicate)
            loss.backward()
            clip_grad_norm(model.parameters(), args.clip)

            if args.optimizer:
                optimizer.step()
            else:
                for p in model.parameters():
                    p.data.add_(-args.lr, p.grad.data)

            total_cost += loss.data[0]
            cur_loss += loss.data[0]

            if ind % args.loginterval == 0 and ind > 0:
                cur_loss = cur_loss / args.loginterval
                elapsed = time.time() - start_time
                print(
                    '| Epoch {:3d} | {:5d}/{:5d} Batches | ms/batch {:5.2f} | '
                    'Loss {:.6f}'.format(epoch, ind,
                                         len(X) // args.batchsize,
                                         elapsed * 1000.0 / args.loginterval,
                                         cur_loss))
                start_time = time.time()
                cur_loss = 0

        model.eval()
        train_acc, train_ll = evaluate(model, train_loader, args.cuda)
        val_acc, val_ll = evaluate(model, valid_loader, args.cuda)
        # if args.save and (val_acc > best_val_acc):
        if args.save and (val_ll < best_ll):
            with open(args.save + '_corpus.pkl', 'wb') as corp_f:
                pkl.dump(corpus, corp_f, protocol=pkl.HIGHEST_PROTOCOL)
            torch.save(model.cpu(), args.save)
            torch.save(model.cpu().state_dict(), args.save + ".state_dict")
            with open(args.save + ".state_dict.config", "w") as f:
                f.write(model_config)
            best_ll = val_ll
            if args.cuda:
                model.cuda()

        print(
            'Epoch: {} | Train Loss: {:.4f} | Train Accuracy: {:.4f} | Val Accuracy: {:.4f} | Train LL: {:.4f} | Val LL: {:.4f}'
            .format(epoch, total_cost, train_acc, val_acc, train_ll, val_ll))
        print('-' * 89)
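Ejemplo n.º 43 falls back to a plain SGD-style in-place update when the --optimizer flag is not given. A minimal sketch of that manual update after clipping, written with the current keyword form of Tensor.add_ (the toy model, learning rate and clip value are assumptions):

import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_

model = nn.Linear(6, 3)
lr, clip = 0.05, 0.25

loss = model(torch.randn(10, 6)).pow(2).mean()
loss.backward()
clip_grad_norm_(model.parameters(), clip)

with torch.no_grad():
    for p in model.parameters():
        # equivalent to the older p.data.add_(-lr, p.grad.data) call
        p.add_(p.grad, alpha=-lr)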
Ejemplo n.º 44
0
            # add summary to logger
            logger.scalar_summary('square loss', square_loss.data[0], step)

            # Next train Generator on Criterion from Discriminator
            real_labels = Variable(label.fill_(1))
            g_loss = 0 if not use_adv_model else label_loss(model_adv.forward(outputs), real_labels)
            loss = square_loss + g_loss
            losses.append(loss.data[0])
            if use_adv_model:
                logger.scalar_summary('Generator Loss', g_loss.data[0], step)

            # Clip gradient norms
            optimizer.zero_grad()
            loss.backward()
            logger.scalar_summary('Composite Loss', loss.data[0], step)
            clip_grad_norm(model.parameters(), 1.0)
            optimizer.step()

            step += args.batch

        print('Composite loss: ', np.array(losses).mean())
        if epoch % args.snapshot == 0:
            # snapshot model and optimizer
            snapshot = {
                'epoch': epoch + 1,
                'model': model.state_dict(),
                'adv_model': None if not use_adv_model else model_adv.state_dict(),
                'optimizer': optimizer.state_dict(),
                'optimizer_adv': None if not use_adv_model else optimizer_adv.state_dict()
            }
            torch.save(snapshot, os.path.join(exp_path, 'best.pth'))
Ejemplo n.º 45
0
def train(train_loader, model, criterion, optimizer, optimizer_cent, epoch):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    if args.no_partialbn:
        model.module.partialBN(False)
    else:
        model.module.partialBN(True)

    # switch to train mode
    model.train()
    end = time.time()
    #centers = model.centers
    #print(center)
    for i, (input, target) in enumerate(train_loader):

        # print('##### i:', i)
        # measure data loading time
        data_time.update(time.time() - end)

        target = target.cuda(async=True)
        # print('input size ====>', input.size())
        # print('input size', input.size())
        # input = input.view(-1,3,224,224)
        # print('input size ====>', input.size())
        input_var = torch.autograd.Variable(input)
        target_var = torch.autograd.Variable(target)

        # compute output
        output = model(input_var)
        #print(output.shape)
        #feature = output(0)
        #center_loss = compute_center_loss(feature, model.centers, target_var)

        #print("tar  shape {}".format(target_var.shape))
        #print("ourput  shape {}".format(output.shape))
        #print("feature shape {}".format(feature.shape))
        loss = criterion(output, target_var)
        #print(loss)
        #print(output.shape)
        #sys.exit()
        #loss_cent = criterion_cent(feature, output[target_var)
        #print(loss_cent)
        #loss_cent = 0.03 * center_loss
        #loss = loss_cent +  loss
        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
        losses.update(loss.data.item(), input.size(0))
        top1.update(prec1.item(), input.size(0))
        top5.update(prec5.item(), input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        #optimizer_cent.zero_grad()

        loss.backward(retain_graph=True)

        if args.clip_gradient is not None:
            total_norm = clip_grad_norm(model.parameters(), args.clip_gradient)
            #print("total_norm: {}".format(total_norm))
            if total_norm > args.clip_gradient:
                print("clipping gradient: {} with coef {}".format(
                    total_norm, args.clip_gradient / total_norm))

        optimizer.step()
        #for param in criterion_cent.parameters():
        #    param.grad.data *= (1./0.1)

        #center_deltas =get_center_delta(feature, centers, target, alpha=0.5)
        #model.centers = centers - center_deltas

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            print(('Epoch: [{0}][{1}/{2}], lr: {lr:.5f}\t'
                   'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                   'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                   'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                   'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                   'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                       epoch,
                       i,
                       len(train_loader),
                       batch_time=batch_time,
                       data_time=data_time,
                       loss=losses,
                       top1=top1,
                       top5=top5,
                       lr=optimizer.param_groups[-1]['lr'])))
Ejemplo n.º 46
0
              torch.zeros(num_layers, batch_size, hidden_size).to(device))
    
    for i in range(0, ids.size(1) - seq_length, seq_length):
        # Get mini-batch inputs and targets
        inputs = ids[:, i:i+seq_length].to(device)
        targets = ids[:, (i+1):(i+1)+seq_length].to(device)
        
        # Forward pass
        states = detach(states)
        outputs, states = model(inputs, states)
        loss = criterion(outputs, targets.reshape(-1))
        
        # Backward and optimize
        model.zero_grad()
        loss.backward()
        clip_grad_norm(model.parameters(), 0.5)
        optimizer.step()

        step = (i+1) // seq_length
        if step % 100 == 0:
            print ('Epoch [{}/{}], Step[{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'
                   .format(epoch+1, num_epochs, step, num_batches, loss.item(), np.exp(loss.item())))

# Test the model
with torch.no_grad():
    with open('sample.txt', 'w') as f:
        # Set initial hidden and cell states
        state = (torch.zeros(num_layers, 1, hidden_size).to(device),
                 torch.zeros(num_layers, 1, hidden_size).to(device))

        # Select one word id randomly
def main(args):
    CUDA = False
    folder_name = 'RL_' + args.name + '_' + args.task + '_' + args.architecture
    folder_path = os.path.join('./', folder_name)
    create_folder(folder_name)
    datasets = [IntegersLargerThanAverage(10000, i, 10) for i in range(4, 5)]
    critic = MovingAverageBaseline(0.9)
    if args.architecture == 'set':
        policy = BernoulliPolicy(IntegerSubsetNet())
    elif args.architecture == 'null':
        policy = BernoulliPolicy(IntegerSubsetNet(null_model=True))
    else:
        raise ValueError('Unknown architecture. Must be set or null!')

    optimizer = torch.optim.Adam(policy.parameters(), lr=1e-3, eps=1e-2)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 5000, gamma=0.99)
    if torch.cuda.is_available() and args.gpu != '':
        policy.cuda()
        CUDA = True
        print('Using GPU')

    environment = RLWrapper(datasets, 64, use_cuda=CUDA)
    data = environment.reset()
    rewards_list = []

    for n in range(args.n_episodes):  # run for epochs

        actions, log_prob_actions = policy(data)
        #policy_p = F.sigmoid(policy.fn_approximator(data))
        log_prob_actions = log_prob_actions.sum(1)
        baseline = critic(data).view(-1, 1)

        if n % 100 == 0:
            y_target = torch.FloatTensor(
                environment.current_dataset.supervised_objective(
                    data.data.int()))

        data, reward, _, info = environment.step(actions)

        advantage = reward - baseline

        critic.update_baseline(None, reward)
        loss = gradients.calculate_policy_gradient_terms(
            log_prob_actions, advantage)
        loss = loss.mean()  # mean is fine since there is only really "one action"?

        optimizer.zero_grad()

        loss.backward()
        clip_grad_norm(policy.fn_approximator.parameters(), 40)
        optimizer.step()
        scheduler.step()
        rewards_list.append(reward.mean())
        if n % 100 == 0:
            set_acc, elem_acc = set_accuracy(y_target, actions.data)
            print('{}: loss {:3g}, episode_reward {:3g}, set acc: {},'
                  ' elem_acc: {}, set_size {}, entropy {}'.format(
                      n,
                      loss.cpu().data[0], reward.mean(), set_acc, elem_acc,
                      environment.current_dataset.set_size,
                      (-log_prob_actions *
                       log_prob_actions.exp()).sum().data[0]))
            print('reward distribution: {}'.format(
                Counter(reward.numpy().ravel().tolist())))

    # now put this into "supervised" mode
    datasets = [
        (i,
         torch.utils.data.DataLoader(IntegerSubsetsSupervised(256,
                                                              i,
                                                              10,
                                                              target='mean',
                                                              seed=5),
                                     batch_size=256)) for i in range(4, 10)
    ]

    set_sizes = []
    mse = []
    set_accs = []
    elem_accs = []
    torch.save(policy, os.path.join(folder_path, 'model-gpu.pyt'))
    criterion = torch.nn.BCELoss()
    for set_size, dataset in datasets:
        for i, (x, y) in enumerate(dataset):
            # prepare the data
            if CUDA:
                x = x.cuda()
                y = y.cuda()
            x, y = Variable(x, volatile=True), Variable(y,
                                                        volatile=True).float()

            # run it through the network
            y_hat, _ = policy(x)
            y_hat = y_hat.view_as(y)
            # calculate the loss
            loss = criterion(y_hat, y)
            if CUDA:
                loss = loss.cpu()
            set_sizes.append(set_size)
            mse.append(loss.data[0])
            set_acc, elem_acc = set_accuracy(y.squeeze(), y_hat.squeeze())
            set_accs.append(set_acc.data[0])
            elem_accs.append(elem_acc.data[0])

    print(set_sizes)
    print(mse)
    print(set_accs)
    print(torch.FloatTensor(set_accs).mean())
    policy.cpu()
    torch.save(
        {
            'set_sizes': set_sizes,
            'rewards_list': rewards_list,
            'mse': mse,
            'set_acc': set_accs,
            'elem_accs': elem_accs,
            'mean_acc': torch.FloatTensor(set_accs).mean()
        }, os.path.join(folder_path, 'results.json'))
    torch.save(policy, os.path.join(folder_path, 'model.pyt'))
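The evaluation loop above still relies on Variable(x, volatile=True), which has been deprecated since PyTorch 0.4. A sketch of the equivalent modern pattern using torch.no_grad() (the toy policy network and data are assumptions):

import torch
import torch.nn as nn

policy = nn.Sequential(nn.Linear(10, 16), nn.ReLU(), nn.Linear(16, 1), nn.Sigmoid())
criterion = nn.BCELoss()

x = torch.rand(256, 10)
y = (torch.rand(256, 1) > 0.5).float()

policy.eval()
with torch.no_grad():                 # replaces volatile=True: no autograd graph is built
    y_hat = policy(x)
    loss = criterion(y_hat, y)
print(loss.item())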