Example no. 1
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    #hidden = model.init_hidden(args.batch_size)
    count = 0
    for i, batch in enumerate(train_data):
        data, targets, seq = batch['inp'], batch['tar'], batch['lens']
        hidden = model.init_hidden(args.batch_size)
        model.zero_grad()
        output, hidden = model(data, seq, hidden)
        loss = criterion(output.view(-1, ntokens), targets.view(-1))
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.data

        if i % args.log_interval == 0 and i > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, i,
                    len(train_data) // args.bptt, lr,
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
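
Many of these examples apply the parameter update by hand with the two-argument form `p.data.add_(-lr, p.grad.data)`, an overload that newer PyTorch releases no longer accept. A minimal sketch of the same plain-SGD step written against the current API (`model` and `lr` are assumed from the surrounding loop):

import torch

def sgd_step(model: torch.nn.Module, lr: float) -> None:
    # Plain SGD: p <- p - lr * grad, performed outside autograd tracking.
    with torch.no_grad():
        for p in model.parameters():
            if p.grad is not None:
                p.add_(p.grad, alpha=-lr)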
Example no. 2
def train(global_step):
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    if args.model != 'Transformer':
        hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        batch_start = time.time()
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        model.zero_grad()
        if args.model == 'Transformer':
            output = model(data)
        else:
            hidden = repackage_hidden(hidden)
            output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.item()
        elapsed_secs = time.time() - batch_start
        global_step += 1
        if hvd.rank() == 0 and global_step >= 1000:
            print(" %d %.6f " % (global_step, elapsed_secs))
        if global_step >= 1600:
            break
    return global_step
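
Most of the recurrent examples call `repackage_hidden` to cut the autograd graph between BPTT windows, as the in-code comment explains, but the helper itself is not shown. A minimal sketch matching the helper from the PyTorch word-language-model example, handling both a plain tensor and an LSTM's `(h, c)` tuple:

import torch

def repackage_hidden(h):
    # Detach hidden states from their history so backprop stops at the window boundary.
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden(v) for v in h)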
Example no. 3
def train():
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(train_batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        model.zero_grad()
        hidden = repackage_hidden(hidden)
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()

        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, lr,
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
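
The loops above slice the corpus with `get_batch(train_data, i)`, which is also not shown. A sketch consistent with how these examples use it, assuming `train_data` is a `(num_rows, batch_size)` tensor of token ids and `args.bptt` is the window length:

def get_batch(source, i):
    # Take a bptt-long window starting at row i; the target is the same window
    # shifted by one token and flattened for the cross-entropy loss.
    seq_len = min(args.bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len].view(-1)
    return data, target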
Example no. 4
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    optimizer = torch.optim.Adam(model.parameters(), args.lr, amsgrad=True)
    if args.model != 'Transformer':
        hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        model.zero_grad()
        output = model(data)
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()
        for p in model.parameters():
            p.data.add_(p.grad, alpha=-lr)

        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch,
                    len(train_data) // args.bptt, lr,
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        if args.dry_run:
            break
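
Example no. 4 calls `optimizer.step()` and then also applies a manual SGD update to every parameter, so each batch effectively receives two updates; whether that is intended is not clear from the snippet. A minimal sketch of the more common pattern, letting the Adam optimizer own the update (`model`, `criterion`, `args`, and the `batches` iterable are assumed):

import torch

optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)
for data, targets in batches:
    optimizer.zero_grad()                                      # clear gradients from the previous step
    loss = criterion(model(data), targets)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
    optimizer.step()                                           # single parameter update per batch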
Example no. 5
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()

    for batch, i in enumerate(range(0, len(train_data))):
        data, lens, targets = get_batch(train_data, train_lens, train_tgt, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        model.zero_grad()
        output = model(data, lens)
        acc, recall, prec = metric(output, targets)
        loss = criterion(output, targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        # for p in model.parameters():
        #    p.data.add_(-lr, p.grad.data)
        optim.step()
        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print (lr, prec)
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | acc {:5.2f} | recall {:5.2f} | prec {:5.2f}'.format(
                    epoch, batch, len(train_data), lr, elapsed * 1000 / args.log_interval, 
                    cur_loss, acc, recall, prec))
            total_loss = 0
            start_time = time.time()
Example no. 6
def train():
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        clipped_lr = lr * clip_gradient(model, args.clip)
        for p in model.parameters():
            p.data.add_(-clipped_lr, p.grad.data)

        total_loss += loss.data

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch,
                    len(train_data) // args.bptt, lr,
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
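
Example no. 6 scales the learning rate by the value returned from a custom `clip_gradient` helper instead of clipping gradients in place. A sketch of how such a helper is typically written, returning a coefficient in (0, 1] based on the global gradient norm (this is an assumption; the project's own helper is not shown):

import math

def clip_gradient(model, clip):
    # Coefficient that rescales the step when the global gradient norm exceeds `clip`.
    total_norm = 0.0
    for p in model.parameters():
        if p.grad is not None:
            total_norm += p.grad.data.norm().item() ** 2
    total_norm = math.sqrt(total_norm)
    return min(1.0, clip / (total_norm + 1e-6))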
Example no. 7
def get_onehot_grad(model, batch):
    extracted_grads = {}

    def get_grad_hook(name):
        def hook(grad):
            extracted_grads[name] = grad

        return hook

    model.eval()
    v, q, a, idx, q_len = batch
    batch_size, length = q.shape
    v = Variable(v).cuda()
    q = Variable(q).cuda()
    q_len = Variable(q_len).cuda()

    out = model(v, q, q_len, embed_grad_hook=get_grad_hook('embed'))
    embed = model.module.text.embedding(q)
    pred = torch.max(out, 1)[1]
    loss = F.nll_loss(F.log_softmax(out, 1), pred)
    model.zero_grad()
    loss.backward()
    embed_grad = extracted_grads['embed']
    onehot_grad = embed.view(-1) * embed_grad.view(-1)
    onehot_grad = onehot_grad.view(batch_size, length, -1).sum(-1)
    return onehot_grad
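
Example no. 7 passes `get_grad_hook('embed')` into the model so the gradient flowing back through the embedding output is captured; the wiring inside the model is not shown. A small self-contained sketch of the usual mechanism, registering the hook on the embedding output with `Tensor.register_hook`:

import torch

extracted = {}

def save_grad(name):
    def hook(grad):
        extracted[name] = grad
    return hook

emb = torch.nn.Embedding(10, 4)
out = emb(torch.tensor([[1, 2, 3]]))
out.register_hook(save_grad('embed'))   # receives d(loss)/d(out) during backward
out.sum().backward()
print(extracted['embed'].shape)         # torch.Size([1, 3, 4])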
Example no. 8
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    # hidden = model.reset_hidden()

    s = 0
    total = sum(tic_marks)
    for i in range(len(tic_marks)):
        data, targets = get_batch(train_data, i)
        s += data.size(0)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.data
        print('%d/%d' % (s, total))
Example no. 9
def train():
    total_loss = 0
    start_time = time.time()
    ntokens = corpus.vocabulary.num_words
    hidden = model.init_hidden(BATCH_SIZE)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, SEQUENCE_LEN)):
        data, targets = get_batch(train_data, i)
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        optimizer.step()
        # clipped_lr = lr * clip_gradient(model, GRADIENT_CLIP)
        # for p in model.parameters():
        #     p.data.add_(-clipped_lr, p.grad.data)

        total_loss += loss.data

        if batch % LOG_INTERVAL == 0 and batch > 0:
            cur_loss = total_loss.item() / LOG_INTERVAL
            elapsed = time.time() - start_time
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch,
                    len(train_data) // SEQUENCE_LEN,
                    lr, elapsed * 1000 / LOG_INTERVAL, cur_loss,
                    math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
Example no. 10
def train():
    # Turn on training mode which enables dropout and bn
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to the start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs/ LSTMs
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)
        total_loss += loss.data

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            print(
                '|epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} ms/batch {:5.2f} | loss {:5.2f} | ppl {:8.2f}'
                .format(epoch, batch,
                        len(train_data) // args.bptt, lr,
                        elapsed * 1000 / args.log_interval, cur_loss,
                        math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
Example no. 11
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.data

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, lr,
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
Example no. 12
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)  # calls the model's forward method
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.data

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:04.4f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch,
                    len(train_data) // args.bptt, lr,
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
Example no. 13
def train():
    global lr
    global best_val_loss
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    eval_start_time = time.time()
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch,
                    len(train_data) // args.bptt, lr,
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        # Evaluate every args.eval_interval batches
        if batch % args.eval_interval == 0 and batch > 0:
            val_t_loss = evaluate(val_t_data)
            val_f_loss = evaluate(val_f_data)
            logging('-' * 89)
            logging(
                '| eval {:3d} in epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} {:5.2f} | '
                'valid ppl {:8.2f} {:8.2f}'.format(
                    batch // args.eval_interval, epoch,
                    (time.time() - eval_start_time), val_t_loss, val_f_loss,
                    math.exp(val_t_loss), math.exp(val_f_loss)))
            logging('-' * 89)
            eval_start_time = time.time()
            # Save the model if the validation loss is the best we've seen so far.
            if best_val_loss is None or val_t_loss < best_val_loss:
                torch.save(model, os.path.join(args.save, 'model.pt'))
                best_val_loss = val_t_loss
            else:
                # Anneal the learning rate if no improvement has been seen in the validation dataset.
                lr /= 4.0
            model.train()
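
Example no. 13 anneals `lr` by hand whenever the validation loss stops improving. When an optimizer object drives the update instead of the manual loop, the same schedule can be expressed with `torch.optim.lr_scheduler.ReduceLROnPlateau`; a sketch (the optimizer construction and starting learning rate are assumptions):

import torch

optimizer = torch.optim.SGD(model.parameters(), lr=20.0)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.25, patience=0)   # divide lr by 4 when validation loss stalls
# after each evaluation:
# scheduler.step(val_t_loss)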
Example no. 14
def train(model, train_vars, train_labels, val_vars, test_labels):
    iterations = 400

    model.training = True

    loss_fn = torch.nn.BCELoss()
    learning_rate = 1e-3
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=0.00012)

    for t in range(iterations):
        prediction = model(train_vars)
        loss = loss_fn(prediction, train_labels)

        model.zero_grad()  # Zero out the previous gradient computation
        loss.backward()  # Compute the gradient
        optimizer.step()  # Use the gradient information to
        # make a step
        if t % 10 == 0:
            print("iter: {} --------".format(t))
            print("  loss: {:.4f}".format(loss.data[0]))
            print("  acc: {:.4f}".format(testNN(model, val_vars, test_labels)))

    return model
Example no. 15
def train():
    total_loss = 0.
    start_time = time.time()
    for i in range(0, train_data.size(1) - args.seq_size - 1):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        model.zero_grad()
        output = model(data)
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if i % args.log_interval == 0 and i > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:.2e} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, i,
                    train_data.size(1) - args.seq_size, args.lr,
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        if args.dry_run:
            break
Example no. 16
def train():
    model.train()  # Turn on training mode, which enables dropout.
    total_loss = 0.
    start_time = time.time()
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data_seq, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data_seq, hidden)
        loss = criterion(output.view(-1, len(corpus.dictionary)), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print(
                f'| epoch {epoch:3d} | {batch:5d}/{len(train_data) // args.bptt:5d} batches '
                f'| lr {lr:02.2f} | ms/batch {elapsed * 1000 / args.log_interval:5.2f} '
                f'| loss {cur_loss:5.2f} | ppl {math.exp(cur_loss):8.2f}')
            total_loss = 0
            start_time = time.time()
Example no. 17
    def train(self, model, train_data):
        # Turn on training mode which enables dropout.
        model.train()
        total_loss = 0.
        start_time = time.time()
        ntokens = len(self.corpus.dictionary)
        hidden = model.init_hidden(self.batch_size)

        for batch, i in enumerate(range(0, train_data.size(0) - 1, self.bptt)):
            data, targets = self.get_batch(train_data, i)
            # Starting each batch, we detach the hidden state from how it was previously produced.
            # If we didn't, the model would try backpropagating all the way to start of the dataset.
            model.zero_grad()
            hidden = self.repackage_hidden(hidden)
            output, hidden = model(data, hidden)
            loss = self.criterion(output, targets)
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            torch.nn.utils.clip_grad_norm_(model.parameters(), self.clip)
            for p in model.parameters():
                p.data.add_(p.grad, alpha=-self.lr)

            total_loss += loss.item()

            if batch % self.log_interval == 0 and batch > 0:
                cur_loss = total_loss / self.log_interval
                elapsed = time.time() - start_time
                print('| {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                    batch, len(train_data) // self.bptt, self.lr,
                    elapsed * 1000 / self.log_interval, cur_loss, math.exp(cur_loss)))
                total_loss = 0
                start_time = time.time()
Example no. 18
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    # 33278
    ntokens = len(corpus.dictionary)
    # Initialize the hidden state with batch_size
    # The following is for the RNN case; for an LSTM, hidden is a tuple
    # type(hidden) ==> Tensor, hidden.size() ==> [2, 20, 200]
    # args.batch_size is 20
    hidden = model.init_hidden(args.batch_size)
    # args.bptt: sequence length
    # train_data.size(0) is the total number of batches (not the same as `batch` below);
    # train_data.size(1) is the batch size; bptt is the sequence length, i.e. how many batches are taken at a time
    count = 0
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        # type(data)==>Tensor, data.size()===> 35*20, not zero
        # type(targets)==>Tensor, targets.size()===> 700, not zero
        # args.bptt is 35
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        # The hidden argument has size (nlayers, bsz, nhid)
        # The hidden obtained here is essentially a detached copy of the previous hidden
        hidden = repackage_hidden(hidden)
        # Sets gradients of all model parameters to zero.
        model.zero_grad()
        # Shapes: input (35*20), hidden (2*20*200); output=(35,20,33278), hidden=(2,20,200)
        # 35 is the sequence length, 20 the batch_size, 33278 is num_directions*hidden_size
        # The statement above may not be entirely accurate
        output, hidden = model(data, hidden)
        # output.view(-1, ntokens) == > 700*33278
        loss = criterion(output.view(-1, ntokens), targets)

        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch,
                    len(train_data) // args.bptt, lr,
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        print("Done!")
        count += 1
        if count == 10:
            break
Example no. 19
def train_batch(model, source, target, optimizer, criterion, attn=False):
    loss = 0
    model.zero_grad()
    if attn:
        output, hidden, attention = model(source, target, use_target=True)
    else:
        output, hidden = model(source, target, use_target=True)

    # Take output minus the last character
    output = output[:-1, :, :]

    # Take target without the first character
    target = target[1:, :]

    output_flat = output.view(
        -1, model.output_size)  # [(tg_len x batch) x en_vocab_sz]
    # not sure whether to use ground truth target or network's prediction
    loss = criterion(output_flat, target.view(-1))
    loss.backward()

    # figure out how to do this
    # if L2 Norm of Gradient / 128 > 5, then g = 5g/s
    nn.utils.clip_grad_norm_(model.parameters(), CLIP)
    optimizer.step()

    # Subtract one so that the padding count becomes zero. Count number of nonpadding in output
    non_padding = (target.view(-1) - 1.0).nonzero().size(0)
    return loss.data[0], non_padding
Example no. 20
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('[epoch %d, batch %d/%d], lr %.2f, avg_batch_cost: %.5f s, '
                  'loss %.2f, ppl %.2f' %
                  (epoch, batch, len(train_data) // args.bptt, lr,
                   elapsed / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
Example no. 21
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, labels = get_batch(train_data, train_labels, corpus.train_seq_lens, i)
        hidden = model.init_hidden(args.batch_size)
        model.zero_grad()
        output, _ = model(data, hidden)
        mask = (data >= 0).float()
        loss, _ = model.loss(output, labels, mask)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, lr,
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
Example no. 22
def train(num_epochs, model, device, train_loader, val_loader, images, texts, lengths, converter, optimizer, lr_scheduler, prediction_dir, print_iter, opt) :

#     criterion = CTCLoss()
#     criterion.to(device)
    criterion = torch.nn.CrossEntropyLoss(ignore_index = 0).to(device)
    images = images.to(device)
    model.to(device)
    for epoch in range(num_epochs) :
        count = 0
        model.train()
        for i, datas in enumerate(train_loader) :
            datas, targets = datas
            batch_size = datas.size(0)
            count += batch_size
            dataloader.loadData(images, datas)
            t, l = converter.encode(targets, opt.batch_max_length)
            dataloader.loadData(texts, t)
            dataloader.loadData(lengths, l)
            preds = model(images, t[:, :-1])
#             preds_size = Variable(torch.IntTensor([preds.size(0)] * batch_size))

            cost = criterion(preds.view(-1, preds.shape[-1]), t[:, 1:].contiguous().view(-1))
            model.zero_grad()
            cost.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip) 
            optimizer.step()
            if count % print_iter < train_loader.batch_size :
                print('epoch {} [{}/{}]loss : {}'.format(epoch, count, len(train_loader.dataset), cost))
    
        if epoch % 3 == 0 and epoch != 0:
            res = validation(model, device, val_loader, images, texts, lengths, converter, prediction_dir, opt)
            save_model(opt.save_dir,  f'{epoch}_{round(float(res),3)}', model, optimizer, lr_scheduler, opt)
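
Example no. 22 feeds the encoded target shifted by one position: `t[:, :-1]` is the decoder input and `t[:, 1:]` is what the loss is computed against (teacher forcing). A minimal illustration of that alignment, assuming `t` holds `[BOS, y1, ..., yn, EOS]` token ids:

import torch

t = torch.tensor([[1, 7, 8, 9, 2]])   # hypothetical ids: 1 = BOS, 2 = EOS
decoder_input = t[:, :-1]             # [BOS, y1, y2, y3]
target = t[:, 1:]                     # [y1, y2, y3, EOS]
# criterion(preds.view(-1, vocab_size), target.contiguous().view(-1)) scores each next token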
Example no. 23
def train():
    # Turn on training mode which enables dropout.
    model.train()
    random.shuffle(corpus.textind['train'])
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    for batch, i in enumerate(
            range(0, len(corpus.textind['train']), args.batch_size)):
        data, lengths = corpus.get_batch(args.batch_size, False, i, 'train')
        data = data.to(device)
        lengths = lengths.to(device)
        hidden = model.init_hidden(data.shape[1])
        loss = 0
        masksum = 0
        model.zero_grad()
        for seqind, j in enumerate(range(0, data.shape[0] - 1, args.bptt)):
            # data.shape[0] - 1 to not let EOU pass as input
            # Starting each batch, we detach the hidden state from how it was previously produced.
            # If we didn't, the model would try backpropagating all the way to start of the dataset.
            ei = min(j + args.bptt, data.shape[0] - 1)
            hidden = repackage_hidden(hidden)
            partoutput, hidden = model(data[j:ei], hidden)
            lossmat = criterion(partoutput.transpose(1, 2), data[j + 1:ei + 1])
            if (lengths >= ei).sum() == lengths.shape[0]:
                temploss = lossmat.sum()
                tempmasksum = lossmat.shape[0] * lossmat.shape[1]
            else:
                mask = (torch.arange(ei - j).to(device).expand(
                    len(lengths), ei - j) <
                        (lengths - j).unsqueeze(1)).t().float()
                temploss = (lossmat * mask).sum()
                tempmasksum = mask.sum()
            loss += temploss.data
            masksum += tempmasksum.data
            (temploss / tempmasksum).backward()
        loss /= masksum
        # loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss

        if batch % args.log_interval == 0 and batch != 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                'loss {:5.3f} | ppl {:8.3f}'.format(
                    epoch, batch,
                    len(corpus.textind['train']) // args.batch_size, lr,
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss)))
            sys.stdout.flush()
            total_loss = 0
            start_time = time.time()
Example no. 24
def train(model, train_data, lr):
    # Turn on training mode which enables dropout.
    model.train()
    model.set_mode('train')
    total_loss = 0.
    start_time = time.time()
    ntokens = len(dictionary)
    hidden = model.init_hidden(args.batch_size)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=lr,
                                weight_decay=args.wdecay)
    # optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        # gs534 add sentence resetting
        eosidx = dictionary.get_eos()
        if args.loss == 'nce':
            output, hidden = model(data, hidden, eosidx, targets)
            loss = criterion(output)
            loss.backward()
        else:
            output, hidden = model(data,
                                   hidden,
                                   separate=args.reset,
                                   eosidx=eosidx)
            loss = criterion(output.view(-1, ntokens), targets)
            loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        #for p in model.parameters():
        #    p.data.add_(-lr, p.grad.data)
        optimizer.step()

        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            if args.loss == 'nce':
                print(
                    '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'nce_loss {:5.2f}'.format(
                        epoch, batch,
                        len(train_data) // args.bptt, lr,
                        elapsed * 1000 / args.log_interval, cur_loss))
            else:
                print(
                    '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                        epoch, batch,
                        len(train_data) // args.bptt, lr,
                        elapsed * 1000 / args.log_interval, cur_loss,
                        math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
Example no. 25
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    if (not args.single) and (torch.cuda.device_count() > 1):
        # "module" is necessary when using DataParallel
        hidden = model.module.init_hidden(args.batch_size)
    else:
        hidden = model.init_hidden(args.batch_size)
    # UNCOMMENT FOR DEBUGGING
    #random.seed(10)
    if train_ccg_data is None:
        order = list(range(0, train_lm_data.size(0) - 1, args.bptt))
    else:
        order = list(
            range(0,
                  train_lm_data.size(0) + train_ccg_data.size(0) - 1,
                  args.bptt))
    random.shuffle(order)
    for batch, i in enumerate(
            order
    ):  #enumerate(range(0, train_lm_data.size(0) + train_ccg_data.size(0) - 1, args.bptt)):
        # if batch >= 1000: break
        # TAG
        if i > train_lm_data.size(0):
            data, targets = get_batch(train_ccg_data,
                                      i - train_lm_data.size(0))
        # LM
        else:
            data, targets = get_batch(train_lm_data, i)

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.item()  #data

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(order), lr,
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
Example no. 26
def train():
    batch_cnt = 0
    train_loss = 0

    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    if args.model != 'Transformer':
        hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        model.zero_grad()
        if args.model == 'Transformer':
            output = model(data)
            output = output.view(-1, ntokens)
        else:
            hidden = repackage_hidden(hidden)
            output, hidden = model(data, hidden)
        loss = criterion(output, targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(p.grad, alpha=-lr)

        total_loss += loss.item()

        batch_cnt += 1
        train_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch,
                    len(train_data) // args.bptt, lr,
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss)))

            train_loss_per_x_batch.add_data_point({
                'epoch_idx': epoch,
                'batch_idx': batch,
                'training_loss': cur_loss
            })

            total_loss = 0
            start_time = time.time()
        if args.dry_run:
            break

    train_loss /= batch_cnt
    return train_loss
Example no. 27
def train(adam_lr):
    # Turn on training mode which enables dropout.
    optimizer = optim.Adam(model.parameters(), lr=adam_lr)
    model.train()
    total_loss = 0.
    total_epoch_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    shuffle_x, shuffle_y = shuffle_train(train_data)
    #if args.model != 'Transformer':
    #    hidden = model.init_hidden(args.batch_size)
    #for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
    #for batch, i in enumerate(range(0, train_data.size(0)- args.bptt + 1)): for sliding window of step 1
    for idx in range(shuffle_x.size(0)):
        #data, targets = get_batch(train_data, i)
        #data, targets = get_batch_ffn(train_data, i)
        batch = idx
        data = shuffle_x[idx]
        targets = shuffle_y[idx]
        #print (data[:1])
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        model.zero_grad()
        #optimizer.zero_grad()
        if args.model == 'Transformer':
            output = model(data)
        else:
            #hidden = repackage_hidden(hidden) #what is hidden?
            #pdb.set_trace()
            #output, hidden = model(data, hidden)
            output = model(data)
        #loss = criterion(output.view(-1, ntokens), targets)
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        #torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        #for p in model.parameters():
        #    p.data.add_(-lr, p.grad.data)

        total_loss += loss.item()
        total_epoch_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            #print('cur_loss: ', cur_loss)
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.4f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, shuffle_x.size(0), lr,
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss)))
            #pdb.set_trace()
            total_loss = 0
            start_time = time.time()

    return total_epoch_loss / (batch + 1)
Example no. 28
File: train.py Project: rbshi/elstm
def train(optimizer=None, h_sp=[0., 0.], h_th=[0., 0.], block=-1, lr=args.lr):
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        if not optimizer:
            model.zero_grad()
        else:
            optimizer.zero_grad()

        output, hidden = model(data,
                               hidden,
                               sparse=True,
                               h_sp=h_sp,
                               h_th=h_th,
                               block=block)
        loss = criterion(output.view(-1, ntokens), targets)

        # Add l1 regularization
        if args.l1:
            l1_regularization = torch.tensor(0, dtype=torch.float32).to(device)
            lambda_l1 = torch.tensor(args.l1_lambda).to(device)
            for param in model.parameters():
                l1_regularization += torch.norm(param, 1).to(device)
            loss = loss + lambda_l1 * l1_regularization

        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        if not optimizer:
            for p in model.parameters():
                p.data.add_(-lr, p.grad.data)
        else:
            optimizer.step()
            lr = get_lr(optimizer)

        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            logger.info(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:5.2e} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch,
                    len(train_data) // args.bptt, lr,
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
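
Example no. 28 builds the L1 penalty with an explicit loop over `torch.norm(param, 1)`. A compact equivalent, shown only as a sketch (`model` and `lambda_l1` are assumed from the example):

l1_penalty = sum(p.abs().sum() for p in model.parameters())
loss = loss + lambda_l1 * l1_penalty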
Example no. 29
def train(device, dataset, dataloader, model):
    print("in train")
    model = model.to(device)
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

    # Training loop
    images_per_batch = {}
    batch_count, images_per_batch['train'], images_per_batch[
        'test'] = 0, [], []
    with tqdm(dataloader, total=config.num_batches) as pbar:
        for batch_idx, batch in enumerate(pbar):
            model.zero_grad()

            train_inputs, train_targets = batch['train']
            train_inputs = train_inputs.to(device=device)
            train_targets = train_targets.to(device=device)
            train_embeddings = model(train_inputs)

            test_inputs, test_targets = batch['test']
            test_inputs = test_inputs.to(device=device)
            test_targets = test_targets.to(device=device)
            test_embeddings = model(test_inputs)

            prototypes = get_prototypes(train_embeddings, train_targets,
                                        dataset.num_classes_per_task)
            loss = prototypical_loss(prototypes, test_embeddings, test_targets)

            loss.backward()
            optimizer.step()

            #Just keeping the count here
            batch_count += 1
            images_per_batch['train'].append(train_inputs.shape[1])
            images_per_batch['test'].append(test_inputs.shape[1])

            with torch.no_grad():
                accuracy = get_accuracy(prototypes, test_embeddings,
                                        test_targets)
                pbar.set_postfix(accuracy='{0:.4f}'.format(accuracy.item()))

            if batch_idx >= config.num_batches:
                break

    print("Number of batches in the dataloader: ", batch_count)

    # Save model
    if check_dir() is not None:
        filename = os.path.join(
            'saved_models',
            'protonet_cifar_fs_{0}shot_{1}way.pt'.format(config.k, config.n))
        with open(filename, 'wb') as f:
            state_dict = model.state_dict()
            torch.save(state_dict, f)
            print("Model saved")

    return batch_count, images_per_batch
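
Example no. 29 relies on `get_prototypes` and `prototypical_loss` (as in the torchmeta helpers) to turn support-set embeddings into class prototypes. A minimal sketch of the prototype computation itself, the per-class mean of support embeddings, assuming `embeddings` is `(batch, num_examples, dim)` and `targets` is `(batch, num_examples)` with labels in `[0, num_classes)`:

import torch

def get_prototypes(embeddings, targets, num_classes):
    # Mean embedding per class: accumulate each class's embeddings, then divide by its count.
    one_hot = torch.nn.functional.one_hot(targets, num_classes).to(embeddings.dtype)  # (batch, n, C)
    counts = one_hot.sum(dim=1).clamp(min=1).unsqueeze(-1)                             # (batch, C, 1)
    return torch.bmm(one_hot.transpose(1, 2), embeddings) / counts                     # (batch, C, dim)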
Example no. 30
def train():
    global lr, best_val_loss
    # Turn on training mode which enables dropout.
    model.train()
    total_loss, nbatches = 0, 0
    start_time = time.time()
    hidden = model.init_hidden(args.batch_size)

    if not args.lm1b:
        data_gen = corpus.iter('train', args.batch_size, args.bptt, use_cuda=args.cuda)
    else:
        data_gen = train_corpus.batch_generator(seq_length=args.bptt, batch_size=args.batch_size)

    for b, batch in enumerate(data_gen):
        model.train()
        if args.lm1b:
            source, target, word_cnt, batch_len = get_batch(batch)
        else:
            source, target = batch
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()  # optimizer.zero_grad()
        model.softmax.set_target(target.data.view(-1))
        output, hidden = model(source, hidden)
        loss = criterion(output, target.view(-1))
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()
        # for p in model.parameters():
        #     if p.grad is not None:
        #         p.data.add_(-lr, p.grad.data)

        total_loss += loss.data.cpu()
        # logging.info(total_loss)

        if b % args.log_interval == 0 and b > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            if not args.valid_per_epoch:
                val_loss = evaluate('valid')
                logging.info('| epoch {:3d} | batch {:5d} | lr {:02.5f} | ms/batch {:5.2f} | '
                        'loss {:5.2f} | ppl {:8.2f} | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
                    epoch, b, lr,
                    elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss),
                    val_loss, math.exp(val_loss)))
            else:
                logging.info('| epoch {:3d} | batch {:5d} | lr {:02.5f} | ms/batch {:5.2f} | '
                      'loss {:5.2f} | ppl {:8.2f} '.format(
                    epoch, b, lr,
                    elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))

            total_loss = 0
            start_time = time.time()
def generate_targeted(model, lr, label, n_try, target_image, target_label,
                      idx):

    if n_try == 0:  # try to generate an example n number of times
        return

    lr_orig = lr
    lambda_img = 5.0
    criterion = nn.MSELoss().to(device)

    # having a well centered gaussian stops the output gaussian from saturating early

    data = torch.randn((1, 1, size_input, size_input))
    data = (data - torch.min(data)) / (torch.max(data) - torch.min(data))

    # data = torch.zeros((1,1,size_input,size_input))

    target = torch.zeros((1, size_output))
    target[0][label] = 1

    data, target = data.to(device), target.to(device)

    for i in range(epochs):

        data = Variable(data.data, requires_grad=True)
        output = torch.sigmoid(model(data))
        pred = output.max(1, keepdim=True)[1]

        # if i % report_every == 0 or i == epochs-1:
        # 	print(output.data.cpu().numpy()[0])

        if (i + 1) % (epochs / (num_sched_lr + 1)) == 0 and label == pred:
            lr /= 10.0
            print("LR sched lap")

        loss = criterion(output,
                         target) + lambda_img * criterion(data, target_image)
        model.zero_grad()
        loss.backward()

        data = data - lr * data.grad.data
        data = (data - torch.min(data)) / (torch.max(data) - torch.min(data))

    if model(data).max(1, keepdim=True)[1] == label:
        print("Success. " + str(label))
    else:
        print("Fail. Retry. " + str(label))
        return generate_targeted(model, lr_orig, label, n_try - 1,
                                 target_image, target_label, idx)

    data = data.view(1, size_input, size_input).detach().cpu()
    trans = torchvision.transforms.ToPILImage()
    data = trans(data)

    data.save("adversarial_outputs_conv/targeted/" + str(idx) +
              str(target_label) + "_to_" + str(label) + ".jpg")