Example #1
    def test_backward(self):
        dy = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
        x = np.array([[1, 2, 3], [4, 5, 6]])
        w = np.array([[1, 0, 0, 0], [-1, 0, 1, 0], [0, 2, -3, 1]])

        dx, dw, db = utils.backward(dy, x, w)
        self.assertTrue(np.allclose(dx, [[1, 2, -1], [5, 2, -1]]))

        self.assertTrue(
            np.allclose(
                dw, [[21, 26, 31, 36], [27, 34, 41, 48], [33, 42, 51, 60]]))

        self.assertTrue(np.allclose(db, [[6, 8, 10, 12]]))
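A note on the expected values: they fully determine what utils.backward must compute for an affine layer y = x @ w + b. A minimal sketch consistent with the assertions above (the real utils module is not shown on this page):

import numpy as np

def backward(dy, x, w):
    """Affine-layer backward inferred from the test above: y = x @ w + b."""
    dx = dy @ w.T                       # (N, d_in)     -> [[1, 2, -1], [5, 2, -1]]
    dw = x.T @ dy                       # (d_in, d_out) -> [[21, 26, 31, 36], ...]
    db = dy.sum(axis=0, keepdims=True)  # (1, d_out)    -> [[6, 8, 10, 12]]
    return dx, dw, db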
Example #2
    def backward(self, dy):
        # Accumulators over all time steps (previously used with `+=`
        # before being initialized; start at 0 and let numpy broadcast).
        dV, dW, dU, dby, dbh = 0, 0, 0, 0, 0

        for t in reversed(range(self.l)):
            # Per-time-step accumulators.
            dWt = np.zeros_like(self.W)
            dUt = np.zeros_like(self.U)

            dyt = dy[t]
            # Output layer: y_t = s_t @ V + by.
            dst, dVt, dbyt = utils.backward(dyt, self.s[t], self.V)
            # Backprop through tanh: d/dx tanh(x) = 1 - tanh(x)^2.
            dht = dst * (1 - self.h[t]**2)
            dbht = dht.copy()  # copy: dht may be modified in place below
            for i in reversed(range(t)):
                _, dUi, _ = utils.backward(dht, self.x[i], self.U)
                if i > 0:
                    _, dWi, _ = utils.backward(dht, self.s[i - 1], self.W)
                    dht = np.clip((dht @ self.W.T) * (1 - self.h[i - 1]**2),
                                  -self.clip, self.clip)
                else:
                    _, dWi, _ = utils.backward(dht, self.s0, self.W)

                # Accumulate (previously dUi and dWi were computed but discarded).
                dUt += dUi
                dWt += dWi

                if np.mean(dht) < 0.001:
                    dht *= 10  # ad-hoc rescaling against vanishing gradients

            dV += dVt
            dW += dWt
            dU += dUt
            dby += dbyt
            dbh += dbht

        dV = np.clip(dV, -self.clip, self.clip)
        dW = np.clip(dW, -self.clip, self.clip)
        dU = np.clip(dU, -self.clip, self.clip)
        dby = np.clip(dby, -self.clip, self.clip)
        dbh = np.clip(dbh, -self.clip, self.clip)

        return (dV, dW, dU, dby, dbh)
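The expression dst * (1 - self.h[t]**2) backpropagates through tanh using the stored activation, since d/dx tanh(x) = 1 - tanh(x)^2. A quick finite-difference check of that identity:

import numpy as np

x = np.linspace(-2.0, 2.0, 5)
h = np.tanh(x)
analytic = 1 - h**2  # tanh derivative expressed via the stored activation
eps = 1e-6
numeric = (np.tanh(x + eps) - np.tanh(x - eps)) / (2 * eps)
assert np.allclose(analytic, numeric, atol=1e-8)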
Example #3
    def backward(self, dy):
        N = dy.shape[0]
        dx = np.zeros([N, self.dim_in])
        dw = np.zeros([self.dim_k, self.dout])
        db = np.zeros([1, self.dout])

        for i in range(N):
            # Reshape one flattened sample of dy to (positions, dout),
            # the 2-D layout that utils.backward expects.
            dyi = dy[i, :].reshape(self.dout, -1).T

            dfxi, dwi, dbi = utils.backward(dyi, self.fx[i], self.w)

            # Scatter the flattened (im2col-style) gradient back to the
            # original input shape.
            dx[i] = utils.unflatten(dfxi, self.shape_in, self.shape_k,
                                    self.pad, self.stride,
                                    self.indice).ravel()
            dw += dwi
            db += dbi

        self.dw_cache = dw
        self.db_cache = db
        return dx
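The reshape of dy[i, :] turns one flattened sample back into a (positions, dout) matrix, the two-dimensional layout utils.backward expects (compare Example #1). A shape-only illustration with hypothetical sizes:

import numpy as np

dout, positions = 4, 6                # hypothetical channel count and output positions
dy_row = np.arange(dout * positions)  # one flattened sample of dy
dyi = dy_row.reshape(dout, -1).T      # -> (positions, dout): one row per position
assert dyi.shape == (positions, dout)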
Example #4
        if not args.use_data:
            _, batch = next(pretrain_dataloader_iter)
        else:
            batch = data_batches[batch_idx]
            data_batches[batch_idx] = None

        inst_pass += list(batch.values())[0].size(0)
        summary = {}

        for percent, inputs in utils.partition_inputs(batch, accumu_steps,
                                                      True):
            outputs = model(**inputs)
            for key in outputs:
                outputs[key] = outputs[key].mean() * percent
            utils.backward(outputs["loss"], amp_scaler)
            utils.add_output_to_summary(outputs, summary)

        utils.optimizer_step(optimizer, lr_scheduler, amp_scaler)
        del batch

        t1 = time.time()

        summary["idx"] = epoch * pretraining_config[
            "batches_per_epoch"] + batch_idx
        summary["batch_idx"] = batch_idx
        summary["epoch"] = epoch
        summary["time"] = round(t1 - t0, 4)
        summary["inst_pass"] = inst_pass
        summary["learning_rate"] = round(optimizer.param_groups[0]["lr"], 8)
        summary["time_since_start"] = round(time.time() - init_t, 4)
Example #5
def main():
    args = get_args()
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)

    cur_timestamp = str(datetime.now())[:-3]  # keep milliseconds to reduce the chance of a name collision
    model_width = {'linear': '', 'cnn': args.n_filters_cnn, 'lenet': '', 'resnet18': ''}[args.model]
    model_str = '{}{}'.format(args.model, model_width)
    model_name = '{} dataset={} model={} eps={} attack={} m={} attack_init={} fgsm_alpha={} epochs={} pgd={}-{} grad_align_cos_lambda={} lr_max={} seed={}'.format(
        cur_timestamp, args.dataset, model_str, args.eps, args.attack, args.minibatch_replay, args.attack_init, args.fgsm_alpha, args.epochs,
        args.pgd_alpha_train, args.pgd_train_n_iters, args.grad_align_cos_lambda, args.lr_max, args.seed)
    if not os.path.exists('models'):
        os.makedirs('models')
    logger = utils.configure_logger(model_name, args.debug)
    logger.info(args)
    half_prec = args.half_prec
    n_cls = 2 if 'binary' in args.dataset else 10

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    double_bp = args.grad_align_cos_lambda > 0
    n_eval_every_k_iter = args.n_eval_every_k_iter
    args.pgd_alpha = args.eps / 4

    eps, pgd_alpha, pgd_alpha_train = args.eps / 255, args.pgd_alpha / 255, args.pgd_alpha_train / 255
    train_data_augm = args.dataset not in ['mnist']
    train_batches = data.get_loaders(args.dataset, -1, args.batch_size, train_set=True, shuffle=True, data_augm=train_data_augm)
    train_batches_fast = data.get_loaders(args.dataset, n_eval_every_k_iter, args.batch_size, train_set=True, shuffle=False, data_augm=False)
    test_batches = data.get_loaders(args.dataset, args.n_final_eval, args.batch_size_eval, train_set=False, shuffle=False, data_augm=False)
    test_batches_fast = data.get_loaders(args.dataset, n_eval_every_k_iter, args.batch_size_eval, train_set=False, shuffle=False, data_augm=False)

    model = models.get_model(args.model, n_cls, half_prec, data.shapes_dict[args.dataset], args.n_filters_cnn).cuda()
    model.apply(utils.initialize_weights)
    model.train()

    if args.model == 'resnet18':
        opt = torch.optim.SGD(model.parameters(), lr=args.lr_max, momentum=0.9, weight_decay=args.weight_decay)
    elif args.model == 'cnn':
        opt = torch.optim.Adam(model.parameters(), lr=args.lr_max, weight_decay=args.weight_decay)
    elif args.model == 'lenet':
        opt = torch.optim.Adam(model.parameters(), lr=args.lr_max, weight_decay=args.weight_decay)
    else:
        raise ValueError('decide about the right optimizer for the new model')

    if half_prec:
        if double_bp:
            amp.register_float_function(torch, 'batch_norm')
        model, opt = amp.initialize(model, opt, opt_level="O1")

    if args.attack == 'fgsm':  # needed here only for Free-AT
        delta = torch.zeros(args.batch_size, *data.shapes_dict[args.dataset][1:]).cuda()
        delta.requires_grad = True

    lr_schedule = utils.get_lr_schedule(args.lr_schedule, args.epochs, args.lr_max)
    loss_function = nn.CrossEntropyLoss()

    train_acc_pgd_best, best_state_dict = 0.0, copy.deepcopy(model.state_dict())
    start_time = time.time()
    time_train, iteration, best_iteration = 0, 0, 0
    for epoch in range(args.epochs + 1):
        train_loss, train_reg, train_acc, train_n, grad_norm_x, avg_delta_l2 = 0, 0, 0, 0, 0, 0
        for i, (X, y) in enumerate(train_batches):
            if i % args.minibatch_replay != 0 and i > 0:  # take new inputs only each `minibatch_replay` iterations
                X, y = X_prev, y_prev
            time_start_iter = time.time()
            # epoch=0 runs only for one iteration (to check the training stats at init)
            if epoch == 0 and i > 0:
                break
            X, y = X.cuda(), y.cuda()
            lr = lr_schedule(epoch - 1 + (i + 1) / len(train_batches))  # epoch - 1 since the 0th epoch is skipped
            opt.param_groups[0].update(lr=lr)

            if args.attack in ['pgd', 'pgd_corner']:
                pgd_rs = args.attack_init == 'random'
                n_eps_warmup_epochs = 5
                n_iterations_max_eps = n_eps_warmup_epochs * data.shapes_dict[args.dataset][0] // args.batch_size
                eps_pgd_train = min(iteration / n_iterations_max_eps * eps, eps) if args.dataset == 'svhn' else eps
                delta = utils.attack_pgd_training(
                    model, X, y, eps_pgd_train, pgd_alpha_train, opt, half_prec, args.pgd_train_n_iters, rs=pgd_rs)
                if args.attack == 'pgd_corner':
                    delta = eps * utils.sign(delta)  # project to the corners
                    delta = clamp(X + delta, 0, 1) - X

            elif args.attack == 'fgsm':
                if args.minibatch_replay == 1:
                    if args.attack_init == 'zero':
                        delta = torch.zeros_like(X, requires_grad=True)
                    elif args.attack_init == 'random':
                        delta = utils.get_uniform_delta(X.shape, eps, requires_grad=True)
                    else:
                        raise ValueError('wrong args.attack_init')
                else:  # if Free-AT, we just reuse the existing delta from the previous iteration
                    delta.requires_grad = True

                X_adv = clamp(X + delta, 0, 1)
                output = model(X_adv)
                loss = F.cross_entropy(output, y)
                if half_prec:
                    with amp.scale_loss(loss, opt) as scaled_loss:
                        grad = torch.autograd.grad(scaled_loss, delta, create_graph=double_bp)[0]
                        grad /= scaled_loss / loss  # reverse back the scaling
                else:
                    grad = torch.autograd.grad(loss, delta, create_graph=double_bp)[0]

                grad = grad.detach()

                argmax_delta = eps * utils.sign(grad)

                n_alpha_warmup_epochs = 5
                n_iterations_max_alpha = n_alpha_warmup_epochs * data.shapes_dict[args.dataset][0] // args.batch_size
                fgsm_alpha = min(iteration / n_iterations_max_alpha * args.fgsm_alpha, args.fgsm_alpha) if args.dataset == 'svhn' else args.fgsm_alpha
                delta.data = clamp(delta.data + fgsm_alpha * argmax_delta, -eps, eps)
                delta.data = clamp(X + delta.data, 0, 1) - X

            elif args.attack == 'random_corner':
                delta = utils.get_uniform_delta(X.shape, eps, requires_grad=False)
                delta = eps * utils.sign(delta)

            elif args.attack == 'none':
                delta = torch.zeros_like(X, requires_grad=False)
            else:
                raise ValueError('wrong args.attack')

            # extra FP+BP to calculate the gradient to monitor it
            if args.attack in ['none', 'random_corner', 'pgd', 'pgd_corner']:
                grad = get_input_grad(model, X, y, opt, eps, half_prec, delta_init='none',
                                      backprop=args.grad_align_cos_lambda != 0.0)

            delta = delta.detach()

            output = model(X + delta)
            loss = loss_function(output, y)

            reg = torch.zeros(1).cuda()[0]  # for .item() to run correctly
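            # GradAlign regularizer: penalize misalignment between the input
            # gradient at X and the input gradient at a random perturbation
            # of X, i.e. reg = lambda * (1 - E[cos(grad1, grad2)]).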
            if args.grad_align_cos_lambda != 0.0:
                grad2 = get_input_grad(model, X, y, opt, eps, half_prec, delta_init='random_uniform', backprop=True)
                grads_nnz_idx = ((grad**2).sum([1, 2, 3])**0.5 != 0) * ((grad2**2).sum([1, 2, 3])**0.5 != 0)
                grad1, grad2 = grad[grads_nnz_idx], grad2[grads_nnz_idx]
                grad1_norms, grad2_norms = l2_norm_batch(grad1), l2_norm_batch(grad2)
                grad1_normalized = grad1 / grad1_norms[:, None, None, None]
                grad2_normalized = grad2 / grad2_norms[:, None, None, None]
                cos = torch.sum(grad1_normalized * grad2_normalized, (1, 2, 3))
                reg += args.grad_align_cos_lambda * (1.0 - cos.mean())

            loss += reg

            if epoch != 0:
                opt.zero_grad()
                utils.backward(loss, opt, half_prec)
                opt.step()

            time_train += time.time() - time_start_iter
            train_loss += loss.item() * y.size(0)
            train_reg += reg.item() * y.size(0)
            train_acc += (output.max(1)[1] == y).sum().item()
            train_n += y.size(0)

            with torch.no_grad():  # no grad for the stats
                grad_norm_x += l2_norm_batch(grad).sum().item()
                delta_final = clamp(X + delta, 0, 1) - X  # we should measure delta after the projection onto [0, 1]^d
                avg_delta_l2 += ((delta_final ** 2).sum([1, 2, 3]) ** 0.5).sum().item()

            if iteration % args.eval_iter_freq == 0:
                train_loss, train_reg = train_loss / train_n, train_reg / train_n
                train_acc, avg_delta_l2 = train_acc / train_n, avg_delta_l2 / train_n

                # it'd be incorrect to recalculate the BN stats on the test sets and for clean / adversarial points
                utils.model_eval(model, half_prec)

                test_acc_clean, _, _ = rob_acc(test_batches_fast, model, eps, pgd_alpha, opt, half_prec, 0, 1)
                test_acc_fgsm, test_loss_fgsm, fgsm_deltas = rob_acc(test_batches_fast, model, eps, eps, opt, half_prec, 1, 1, rs=False)
                test_acc_pgd, test_loss_pgd, pgd_deltas = rob_acc(test_batches_fast, model, eps, pgd_alpha, opt, half_prec, args.attack_iters, 1)
                cos_fgsm_pgd = utils.avg_cos_np(fgsm_deltas, pgd_deltas)
                train_acc_pgd, _, _ = rob_acc(train_batches_fast, model, eps, pgd_alpha, opt, half_prec, args.attack_iters, 1)  # needed for early stopping

                grad_x = utils.get_grad_np(model, test_batches_fast, eps, opt, half_prec, rs=False)
                grad_eta = utils.get_grad_np(model, test_batches_fast, eps, opt, half_prec, rs=True)
                cos_x_eta = utils.avg_cos_np(grad_x, grad_eta)

                time_elapsed = time.time() - start_time
                train_str = '[train] loss {:.3f}, reg {:.3f}, acc {:.2%} acc_pgd {:.2%}'.format(train_loss, train_reg, train_acc, train_acc_pgd)
                test_str = '[test] acc_clean {:.2%}, acc_fgsm {:.2%}, acc_pgd {:.2%}, cos_x_eta {:.3}, cos_fgsm_pgd {:.3}'.format(
                    test_acc_clean, test_acc_fgsm, test_acc_pgd, cos_x_eta, cos_fgsm_pgd)
                logger.info('{}-{}: {}  {} ({:.2f}m, {:.2f}m)'.format(epoch, iteration, train_str, test_str,
                                                                      time_train/60, time_elapsed/60))

                if train_acc_pgd > train_acc_pgd_best:  # catastrophic overfitting can be detected on the training set
                    best_state_dict = copy.deepcopy(model.state_dict())
                    train_acc_pgd_best, best_iteration = train_acc_pgd, iteration

                utils.model_train(model, half_prec)
                train_loss, train_reg, train_acc, train_n, grad_norm_x, avg_delta_l2 = 0, 0, 0, 0, 0, 0

            iteration += 1
            X_prev, y_prev = X.clone(), y.clone()  # needed for Free-AT

        if epoch == args.epochs:
            torch.save({'last': model.state_dict(), 'best': best_state_dict}, 'models/{} epoch={}.pth'.format(model_name, epoch))
            # disable global conversion to fp16 from amp.initialize() (https://github.com/NVIDIA/apex/issues/567)
            context_manager = amp.disable_casts() if half_prec else utils.nullcontext()
            with context_manager:
                last_state_dict = copy.deepcopy(model.state_dict())
                half_prec = False  # final eval is always in fp32
                model.load_state_dict(last_state_dict)
                utils.model_eval(model, half_prec)
                opt = torch.optim.SGD(model.parameters(), lr=0)

                attack_iters, n_restarts = (50, 10) if not args.debug else (10, 3)
                test_acc_clean, _, _ = rob_acc(test_batches, model, eps, pgd_alpha, opt, half_prec, 0, 1)
                test_acc_pgd_rr, _, deltas_pgd_rr = rob_acc(test_batches, model, eps, pgd_alpha, opt, half_prec, attack_iters, n_restarts)
                logger.info('[last: test on 10k points] acc_clean {:.2%}, pgd_rr {:.2%}'.format(test_acc_clean, test_acc_pgd_rr))

                if args.eval_early_stopped_model:
                    model.load_state_dict(best_state_dict)
                    utils.model_eval(model, half_prec)
                    test_acc_clean, _, _ = rob_acc(test_batches, model, eps, pgd_alpha, opt, half_prec, 0, 1)
                    test_acc_pgd_rr, _, deltas_pgd_rr = rob_acc(test_batches, model, eps, pgd_alpha, opt, half_prec, attack_iters, n_restarts)
                    logger.info('[best: test on 10k points][iter={}] acc_clean {:.2%}, pgd_rr {:.2%}'.format(
                        best_iteration, test_acc_clean, test_acc_pgd_rr))

        utils.model_train(model, half_prec)

    logger.info('Done in {:.2f}m'.format((time.time() - start_time) / 60))
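This script also assumes two small helpers, clamp and l2_norm_batch, that are not shown; plausible definitions consistent with their call sites above:

import torch

def clamp(x, lower, upper):
    # Elementwise projection onto [lower, upper], e.g. clamp(X + delta, 0, 1).
    return torch.clamp(x, lower, upper)

def l2_norm_batch(v):
    # Per-sample L2 norm over the non-batch dimensions of an NCHW tensor.
    return (v ** 2).sum(dim=[1, 2, 3]) ** 0.5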
Example #6
    def train_epoch(self,
                    img_data_iter: List[data_utils.DataLoader],
                    step: int,
                    saving_path: str = None,
                    img_dev_data_iter: List[data_utils.DataLoader] = None,
                    max_step: int = 300000,
                    lex_dict=None,
                    **kwargs):
        "Standard Training and Logging Function"
        start = time.time()
        total_tokens, total_loss, tokens, cur_loss = 0, 0, 0, 0
        batch_zip, shortest = self.get_batch_zip(img_data_iter, None, None)

        model = (self.model.module
                 if hasattr(self.model, "module") else self.model)
        for i, batches in enumerate(batch_zip):
            for batch in batches:
                try:
                    self.optimizer.zero_grad()
                    captions = [b["captions"] for b in batch]
                    caption_pad_mask = [b["caption_mask"] for b in batch]
                    langs = [b["langs"] for b in batch]

                    with torch.no_grad():
                        image_encoding = self.caption_model(batch=batch,
                                                            encode_only=True)
                        image_encoding = image_encoding.view(
                            image_encoding.size(0), -1)

                    predictions = self.model(src_inputs=captions,
                                             src_mask=caption_pad_mask,
                                             src_langs=langs)
                    l2_loss = torch.dist(predictions, image_encoding,
                                         2) / predictions.size(0)
                    backward(l2_loss, self.optimizer, self.fp16)

                    loss = float(l2_loss.data)
                    tokens += int(predictions.size(0))
                    total_tokens += int(predictions.size(0))
                    total_loss += loss
                    cur_loss += loss

                    # We accumulate the gradients for both tasks!
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   self.clip)
                    self.optimizer.step()
                    step += 1

                    if step % 50 == 0 and tokens > 0:
                        elapsed = time.time() - start
                        print(
                            datetime.datetime.now(),
                            "Epoch Step: %d Loss: %f Image per Sec: %f " %
                            (step, cur_loss / tokens, tokens / elapsed))

                        if step % 500 == 0:
                            if img_dev_data_iter is not None and step % 5000 == 0:
                                loss = self.eval(img_dev_data_iter)
                                print("Dev Loss:", loss)

                            model.save(saving_path + ".latest")
                            with open(
                                    os.path.join(saving_path + ".latest",
                                                 "optim"), "wb") as fp:
                                pickle.dump(self.optimizer, fp)

                        start, tokens, cur_loss = time.time(), 0, 0

                    if step >= max_step:
                        break
                    if i == shortest - 1:
                        break
                except RuntimeError as err:
                    print(repr(err))
                    torch.cuda.empty_cache()

        try:
            print("Total loss in this epoch: %f" % (total_loss / total_tokens))
            model.save(saving_path + ".latest")

            if img_dev_data_iter is not None:
                loss = self.eval(img_dev_data_iter)
                print("Dev Loss:", loss)
        except RuntimeError as err:
            print(repr(err))

        return step
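Examples #6, #9, and #11 call a backward(loss, optimizer, fp16) helper that is not shown; a plausible sketch following the apex amp.scale_loss pattern used in Example #5:

def backward(loss, optimizer, fp16):
    # Hypothetical helper: scale the loss when apex mixed precision is enabled.
    if fp16:
        from apex import amp
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()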
Example #7
    def backward(self, dy):
        dx, dw, db = utils.backward(dy, self.x, self.w)
        self.dw = dw
        self.db = db
        return dx
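For context, the forward pass this backward pairs with would cache its input; a minimal sketch consistent with the gradients in Examples #1 and #7 (the class name and initialization are assumptions):

import numpy as np

class Linear:
    def __init__(self, dim_in, dim_out):
        self.w = np.random.randn(dim_in, dim_out) * 0.01
        self.b = np.zeros((1, dim_out))

    def forward(self, x):
        self.x = x  # cached for backward (Example #7 reads self.x)
        return x @ self.w + self.b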
Example #8
import pygame
import pickle
import numpy as np
from game import init, iterate
from ann import NeuralNetwork
import utils

# Architecture (specify the architecture here.)
network = NeuralNetwork(layers=[7, 14, 14, 7, 1],
                        activations=['sigmoid', 'sigmoid', 'sigmoid', 'tanh'])
lr = 0.1
losses = []

screen, font = init()
# Game Loop / Train Loop
frame_count, score, _, _, x = iterate.iterate(screen, font, 0, 0)
game = True
run = True
prediction = 0
while run:
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            run = False
    prediction = utils.forward(x, network)
    frame_count, score, game, run, x = iterate.iterate(screen, font,
                                                       frame_count, score,
                                                       game, run, prediction)
    loss = utils.backward(prediction, x, lr, network)
    losses.append(loss)
pygame.quit()
Example #9
    def train_epoch(self,
                    step: int,
                    saving_path: str = None,
                    mt_dev_iter: List[data_utils.DataLoader] = None,
                    mt_train_iter: List[data_utils.DataLoader] = None,
                    max_step: int = 300000,
                    src_neg_iter: data_utils.DataLoader = None,
                    dst_neg_iter: data_utils.DataLoader = None,
                    **kwargs):
        "Standard Training and Logging Function"
        start = time.time()
        total_tokens, total_loss, tokens, cur_loss = 0, 0, 0, 0

        batch_zip, shortest = self.get_batch_zip(None, None, mt_train_iter)

        model = (self.model.module
                 if hasattr(self.model, "module") else self.model)

        for i, batches in enumerate(batch_zip):
            for batch in batches:
                self.optimizer.zero_grad()
                try:
                    src_inputs = batch["src_texts"].squeeze(0)
                    src_mask = batch["src_pad_mask"].squeeze(0)
                    tgt_inputs = batch["dst_texts"].squeeze(0)
                    tgt_mask = batch["dst_pad_mask"].squeeze(0)
                    src_langs = batch["src_langs"].squeeze(0)
                    dst_langs = batch["dst_langs"].squeeze(0)
                    src_neg_batch = next(iter(src_neg_iter))
                    src_neg_inputs = src_neg_batch["src_texts"].squeeze(0)
                    src_neg_mask = src_neg_batch["src_pad_mask"].squeeze(0)
                    src_neg_langs = src_neg_batch["langs"].squeeze(0)

                    dst_neg_batch = next(iter(dst_neg_iter))
                    tgt_neg_inputs = dst_neg_batch["src_texts"].squeeze(0)
                    tgt_neg_mask = dst_neg_batch["src_pad_mask"].squeeze(0)
                    dst_neg_langs = dst_neg_batch["langs"].squeeze(0)

                    if src_inputs.size(0) < self.num_gpu:
                        continue
                    loss = self.model(src_inputs=src_inputs,
                                      tgt_inputs=tgt_inputs,
                                      src_mask=src_mask,
                                      tgt_mask=tgt_mask,
                                      src_langs=src_langs,
                                      tgt_langs=dst_langs,
                                      src_neg_inputs=src_neg_inputs,
                                      tgt_neg_inputs=tgt_neg_inputs,
                                      src_neg_mask=src_neg_mask,
                                      tgt_neg_mask=tgt_neg_mask,
                                      src_neg_langs=src_neg_langs,
                                      tgt_neg_langs=dst_neg_langs,
                                      normalize=True)
                    nSens = src_inputs.size(0)

                    backward(loss, self.optimizer, self.fp16)

                    loss = float(loss.data) * nSens
                    tokens += nSens
                    total_tokens += nSens
                    total_loss += loss
                    cur_loss += loss

                    # We accumulate the gradients for both tasks!
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   self.clip)
                    self.optimizer.step()
                    step += 1

                    if step % 50 == 0 and tokens > 0:
                        elapsed = time.time() - start
                        print(
                            datetime.datetime.now(),
                            "Epoch Step: %d Loss: %f Tokens per Sec: %f " %
                            (step, cur_loss / tokens, tokens / elapsed))

                        if step % 500 == 0:
                            if mt_dev_iter is not None and step % 5000 == 0:
                                dev_loss = self.eval(mt_dev_iter, saving_path)
                                print("Dev Loss:", dev_loss)

                            model.save(saving_path + ".latest")
                            with open(
                                    os.path.join(saving_path + ".latest",
                                                 "optim"), "wb") as fp:
                                pickle.dump(self.optimizer, fp)

                        start, tokens, cur_loss = time.time(), 0, 0

                except RuntimeError as err:
                    print(repr(err))
                    torch.cuda.empty_cache()

            if i == shortest - 1:
                break
            if step >= max_step:
                break

        try:
            print("Total loss in this epoch: %f" % (total_loss / total_tokens))
            model.save(saving_path + ".latest")

            if mt_dev_iter is not None:
                dev_loss = self.eval(mt_dev_iter, saving_path)
                print("Dev Loss:", dev_loss)
        except RuntimeError as err:
            print(repr(err))

        return step
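get_batch_zip is likewise not shown; from its call sites in Examples #6, #9, and #11 it appears to zip whichever dataloader arguments are not None and report the shortest length. A hypothetical reconstruction:

def get_batch_zip(self, img_data_iter, txt_data_iter, mt_train_iter):
    # Hypothetical reconstruction from the call sites; argument roles assumed.
    groups = [g for g in (img_data_iter, txt_data_iter, mt_train_iter)
              if g is not None]
    # Each argument may be a list of DataLoaders; flatten to a single list.
    loaders = [l for g in groups for l in (g if isinstance(g, list) else [g])]
    shortest = min(len(l) for l in loaders)
    return zip(*loaders), shortest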
Example #10
        if isData():
            c = sys.stdin.read(1)
            print(c)

        if c == 's':
            stop_state = True
        elif c == 'h':
            happy_state = not happy_state
            c = ''
        else:  # c == '' or anything else
            if c == 'w':
                forward(pwm, pwm)
                stop_state = False
            elif c == 'x':
                backward(pwm, pwm)
                stop_state = False
            elif c == 'a':
                spin_left(pwm, pwm)
                stop_state = False
            elif c == 'd':
                spin_right(pwm, pwm)
                stop_state = False

            elif c == 'q':
                pwm = pwm + pwm_increment if pwm <= max_speed - pwm_increment else pwm
                print("pwm: ", pwm)
            elif c == 'e':
                pwm = pwm - pwm_increment if pwm >= min_speed + pwm_increment else pwm
                print("pwm: ", pwm)
Example #11
    def train_epoch(self,
                    img_data_iter: List[data_utils.DataLoader],
                    step: int,
                    saving_path: str = None,
                    img_dev_data_iter: List[data_utils.DataLoader] = None,
                    max_step: int = 300000,
                    lex_dict=None,
                    accum=1,
                    mt_train_iter: List[data_utils.DataLoader] = None,
                    mt_dev_iter: List[data_utils.DataLoader] = None,
                    mtl_weight=0.1,
                    **kwargs):
        "Standard Training and Logging Function"
        start = time.time()
        total_tokens, total_loss, tokens, cur_loss = 0, 0, 0, 0
        batch_zip, shortest = self.get_batch_zip(img_data_iter, None,
                                                 mt_train_iter)

        model = (self.model.module
                 if hasattr(self.model, "module") else self.model)
        for i, batches in enumerate(batch_zip):
            for batch in batches:
                try:
                    is_img_batch = isinstance(batch,
                                              list) and "captions" in batch[0]
                    if is_img_batch:  # Captioning training data.
                        captions = [b["captions"] for b in batch]
                        caption_pad_mask = [b["caption_mask"] for b in batch]
                        proposals = ([b["proposal"] for b in batch]
                                     if lex_dict is not None else None)
                        langs = [b["langs"] for b in batch]
                        if len(batch) < self.num_gpu:
                            continue

                        predictions = self.model(
                            tgt_inputs=captions,
                            tgt_mask=caption_pad_mask,
                            pad_idx=model.text_processor.pad_token_id(),
                            tgt_langs=langs,
                            batch=batch,
                            proposals=proposals,
                            log_softmax=True)
                        targets = torch.cat(
                            list(
                                map(lambda c: c[:, 1:].contiguous().view(-1),
                                    captions)))
                        tgt_mask_flat = torch.cat(
                            list(
                                map(lambda c: c[:, 1:].contiguous().view(-1),
                                    caption_pad_mask)))
                        targets = targets[tgt_mask_flat]
                    else:  # MT data!
                        src_inputs = batch["src_texts"].squeeze(0)
                        src_mask = batch["src_pad_mask"].squeeze(0)
                        tgt_inputs = batch["dst_texts"].squeeze(0)
                        tgt_mask = batch["dst_pad_mask"].squeeze(0)
                        src_langs = batch["src_langs"].squeeze(0)
                        dst_langs = batch["dst_langs"].squeeze(0)
                        proposals = batch["proposal"].squeeze(
                            0) if lex_dict is not None else None
                        if src_inputs.size(0) < self.num_gpu:
                            continue
                        predictions = self.model(
                            src_inputs=src_inputs,
                            tgt_inputs=tgt_inputs,
                            src_pads=src_mask,
                            tgt_mask=tgt_mask,
                            src_langs=src_langs,
                            tgt_langs=dst_langs,
                            proposals=proposals,
                            pad_idx=model.text_processor.pad_token_id(),
                            log_softmax=True)
                        targets = tgt_inputs[:, 1:].contiguous().view(-1)
                        tgt_mask_flat = tgt_mask[:, 1:].contiguous().view(-1)
                        targets = targets[tgt_mask_flat]
                    ntokens = targets.size(0)

                    if ntokens > 0:
                        if self.num_gpu == 1:
                            targets = targets.to(predictions.device)

                        loss = self.criterion(predictions, targets).mean()
                        weight = 1 if is_img_batch else mtl_weight
                        backward(loss * weight, self.optimizer, self.fp16)

                        loss = float(loss.data) * ntokens
                        tokens += ntokens
                        total_tokens += ntokens
                        total_loss += loss
                        cur_loss += loss

                        # We accumulate the gradients for both tasks!
                        torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                       self.clip)
                        step += 1

                        if step % accum == 0:
                            self.optimizer.step()
                            self.optimizer.zero_grad()

                        if step % 50 == 0 and tokens > 0:
                            elapsed = time.time() - start
                            print(
                                datetime.datetime.now(),
                                "Epoch Step: %d Loss: %f Tokens per Sec: %f " %
                                (step, cur_loss / tokens, tokens / elapsed))

                            if step % 500 == 0:
                                if img_dev_data_iter is not None and step % 5000 == 0:
                                    bleu = self.eval_bleu(
                                        img_dev_data_iter, saving_path)
                                    print("Captioning BLEU:", bleu)
                                if mt_dev_iter is not None and step % 5000 == 0:
                                    bleu = super().eval_bleu(
                                        mt_dev_iter, saving_path)
                                    print("MT BLEU:", bleu)

                                model.save(saving_path + ".latest")
                                with open(
                                        os.path.join(saving_path + ".latest",
                                                     "optim"), "wb") as fp:
                                    pickle.dump(self.optimizer, fp)

                            start, tokens, cur_loss = time.time(), 0, 0

                        if step >= max_step:
                            break
                        if i == shortest - 1:
                            break
                except RuntimeError as err:
                    print(repr(err))
                    torch.cuda.empty_cache()

        try:
            if img_dev_data_iter is not None:
                bleu = self.eval_bleu(img_dev_data_iter, saving_path)
                print("Captioning BLEU:", bleu)
            if mt_dev_iter is not None:
                bleu = super().eval_bleu(mt_dev_iter, saving_path)
                print("MT BLEU:", bleu)

            print("Total loss in this epoch: %f" % (total_loss / total_tokens))
            model.save(saving_path + ".latest")
        except RuntimeError as err:
            print(repr(err))

        return step
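Note that in this variant gradients are clipped after every micro-batch while the optimizer steps only every accum micro-batches; a self-contained sketch of that accumulation schedule:

import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
accum, clip = 2, 1.0

for step in range(1, 5):
    loss = model(torch.randn(3, 4)).pow(2).mean()
    loss.backward()  # gradients accumulate across micro-batches
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    if step % accum == 0:
        optimizer.step()       # apply the accumulated update
        optimizer.zero_grad()  # reset for the next accumulation window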