def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.n = 1000
        self.butterfly = utils.get_img(path.join('example', 'butterfly.png'))
        # Batching
        self.butterfly = self.butterfly.repeat(16, 1, 1, 1)
        self.m = torch.Tensor([
            [3.2, 0.016, -68],
            [1.23, 1.7, -54],
            [0.008, 0.0001, 1],
        ])
        if cuda.is_available():
            self.butterfly = self.butterfly.cuda()
            self.m = self.m.cuda()

            with utils.Timer('Warm-up: {}'):
                for _ in range(100):
                    _ = core_warp.warp(
                        self.butterfly,
                        self.m,
                        sizes='auto',
                        kernel='bicubic',
                        fill_value=0,
                    )

                cuda.synchronize()
Example #2
    def all_reduce_thread(self, input):
        input_device = input.get_device()
        if input_device == 0:
            data_list = [input]
            for i in range(self.allreduce_num - 1):
                data_list.append(self.queue[i].get())

            cuda.synchronize()
            # total_sum = Synchronize.data_list[0].cpu().clone()
            # for i in range(1, Synchronize.device_num):
            #     total_sum = total_sum + Synchronize.data_list[i].cpu()

            # for i in range(0, Synchronize.device_num):
            #     with torch.cuda.device_of(Synchronize.data_list[i]):
            #         Synchronize.result_list[i] = total_sum.clone().cuda()

            cuda.nccl.all_reduce(data_list)
            cuda.synchronize()

            for i in range(self.allreduce_num - 1):
                self.queue[i].task_done()
        else:
            self.queue[input_device - 1].put(input)
            self.queue[input_device - 1].join()

        return input
Example #3
 def __init__(self, seed=0):
     manual_seed(seed)
     cuda0 = device('cuda:0')
     self.W = normal(zeros((10, 784)), ones((10, 784))).to(device=cuda0)
     self.w0 = normal(zeros((10, 1)), ones((10, 1))).to(device=cuda0)
     cuda.synchronize()
     return
Example #4
 def run_node(self, n: Node) -> Any:
     """ Timing wrapper around executing an FX Node """
     start = time.perf_counter()
     result = super().run_node(n)
     synchronize()
     sec = time.perf_counter() - start
     for prof in self.profile_stats:
         prof.record(n, sec)
     return result
    def test_warp_bicubic(self) -> None:
        with utils.Timer('Bicubic warping: {}'):
            for _ in range(self.n):
                _ = core_warp.warp(
                    self.butterfly,
                    self.m,
                    sizes='auto',
                    kernel='bicubic',
                    fill_value=0,
                )

            cuda.synchronize()
Example #6
 def _train(data, opt=True):
     total = 0
     for y, x in data:
         y, x = y.to(device), x.to(device)
         pred_y = model(x)
         l = loss(pred_y, y)
         total += l.item()
         if opt:
             optimizer.zero_grad()
             l.backward()
             optimizer.step()
     cuda.synchronize()
     return total
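A minimal sketch of the setup this `_train` closure relies on; the `model`, `loss`, `optimizer`, `device`, and `(y, x)` data pairs below are hypothetical and assume a CUDA device is available:

import torch
import torch.nn as nn
from torch import cuda

device = torch.device('cuda')
model = nn.Linear(20, 1).to(device)
loss = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# (target, input) pairs, matching the `for y, x in data` loop above
data = [(torch.randn(8, 1), torch.randn(8, 20)) for _ in range(10)]

train_total = _train(data, opt=True)   # one optimization pass over the data
eval_total = _train(data, opt=False)   # forward-only pass (no optimizer step)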
Example #7
 def gradient_default(self, X, Y):
     N = X.size()[1]
     W_ext = unsqueeze(self.forward_model.W, 0).expand(N, -1, -1)
     w0_ext = unsqueeze(self.forward_model.w0, 0).expand(N, -1, -1)
     X_ext = transpose(unsqueeze(X, 0), 0, 2)
     Y_ext = transpose(unsqueeze(Y, 0), 0, 2)
     cuda.synchronize()
     return (
         torch_sum(bmm(
             bmm(W_ext, X_ext) + w0_ext - Y_ext, transpose(X_ext, 1, 2)),
                   dim=0) * 2 / N,  # W gradient
         unsqueeze(torch_sum(self.forward_model(X) - Y, dim=1) * 2 / N,
                   1)  # w0 gradient
     )
Example #8
def profile(device, name, model, example_inputs, args):
    model = torch.fx.symbolic_trace(model)
    prof = FXProfiler(model)

    for _ in range(args.warmup):
        model(*example_inputs)

    for _ in range(args.repeat):
        synchronize()
        prof.run(*example_inputs)

    for aggregate, stats in zip(PROFILES, prof.profile_stats):
        print(f"{device:4} {name:20} {aggregate.name:13} {stats.summary()}")
        aggregate.update(stats, name=name)
    return model
def cuda_time(fname, f, *args, **kwargs):
    start = cuda.Event(enable_timing=True)
    end = cuda.Event(enable_timing=True)

    cuda.synchronize()

    start.record()
    ret = f(*args, **kwargs)
    end.record()

    cuda.synchronize()

    t = start.elapsed_time(end) / 1000

    fmt = "'{}' ran in {:.2e} seconds"
    print(fmt.format(fname, t), flush=True)

    return ret
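A minimal usage sketch for `cuda_time` (the `matmul` helper and tensors below are hypothetical), assuming a CUDA device is available:

import torch
from torch import cuda

def matmul(a, b):
    # simple GPU matrix multiply to time
    return a @ b

x = torch.randn(1024, 1024, device='cuda')
y = cuda_time('matmul', matmul, x, x)  # prints "'matmul' ran in ..." and returns the product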
Example #10
    def _store_side_effects(self):
        """
        Sub routine for proc_side_effects
        """
        # Code ran so we need to store the side-effects
        forced = NamespaceStack.get_forced()
        forced_objects = [each[2] for each in forced]
        # Looks like everything in forced will be forced to disk
        # There may be some redundancies between forced and self.args, that's what REDUNDANT is for
        # On redundancies, we skip from self.args, not from namespace_stack

        materialize_additionals = False

        # First, write everything new in self.args to disk
        for arg in self.args:
            if NamespaceStack.is_comparable(arg) and arg in forced_objects:
                # arg will be written from forced
                Writer.store(REDUNDANT, self.static_key, self.global_key)
                # If optimizer was modified, you'll also want to materialize the network
                materialize_additionals = True
            else:
                # write this arg to disk, it's not in forced
                if hasattr(arg, 'state_dict'):
                    Writer.store(deepcopy_cpu(arg.state_dict()),
                                 self.static_key, self.global_key)
                else:
                    # Not state_dict()
                    if hasattr(arg, 'cpu'):
                        Writer.store(arg.cpu(), self.static_key,
                                     self.global_key)
                    else:
                        Writer.store(copy.deepcopy(arg), self.static_key,
                                     self.global_key)
        # Enter a separator
        Writer.store(SEPARATOR, self.static_key, self.global_key)
        # If I should materialize a node in a group, materialize the entire group (forced)
        if materialize_additionals:
            for l, k, v in forced:
                Writer.store(str(l), self.static_key, self.global_key)
                Writer.store(k, self.static_key, self.global_key)
                Writer.store(deepcopy_cpu(v.state_dict()), self.static_key,
                             self.global_key)
        cuda.synchronize()
Example #11
 def forked_write():
     cuda.synchronize()
     pid = os.fork()
     if not pid:
         path = flags.LOG_PATH.absolute.split('.')
         path.insert(-1, str(Writer.lsn))
         path = '.'.join(path)
         fd = open(path, 'w')
         os.nice(1)  # child process gets lower priority and starts flushing
         for each in Writer.write_buffer:
             # the dict can have 'value' or 'state'
             if 'value' in each and not isinstance(each['value'], str):
                 each['value'] = Writer.serialize(each['value'])
             fd.write(json.dumps(each) + '\n')
         fd.close()
         os._exit(0)
     else:
         Writer.write_buffer = []  # parent process resets buffer
def test_net(net, writer, te, out_maps, noise, den_var, epoch, conv_field, GPU,
             cuda):
    if GPU == 1:
        cuda.synchronize()

    time_te = time.time()
    err_te = []
    net.eval()  #train(False)
    # we're just computing the test set error, so we won't be updating the gradients or weights
    with torch.no_grad():
        for i, input in enumerate(te):
            if GPU == 1:
                input = input.cuda()

            # switch from training to output space
            target = input.data * (out_maps - 1)
            channels = target.shape[-3]

            if noise != 0:
                input = scramble_images(input, noise, den_var, GPU)  # NOT SET UP FOR MULTI-CHANNEL

            output = net(input.float())
            # reshape output from flat filters to channels * filters per channel
            output = torch.reshape(output,
                                   (output.shape[0], out_maps, channels,
                                    output.shape[-2], output.shape[-1]))

            # compute the loss between the network output and our target
            loss = F.cross_entropy(output, target.long())
            err_te.append(loss.data)

            if i % 10 == 0:  # log loss to tensorboard
                writer.add_scalar('test_loss', loss.data, epoch * len(te))
                # writer.add_histogram('conv1_weight', net[0].weight[0], epoch)  # to watch the evolution of the filters
                # writer.add_histogram('conv1_grad', net[0].weight.grad[0], epoch)

    if GPU == 1:
        cuda.synchronize()
    time_te = time.time() - time_te

    return err_te, time_te
def train_net(net, optimizer, writer, tr, epoch, out_maps, noise, den_var,
              conv_field, GPU, cuda):
    if GPU == 1:
        cuda.synchronize()  # synchronize for timing purposes
    time_tr = time.time()

    err_tr = []
    net.train(True)
    for i, input in enumerate(tr):
        if GPU == 1:
            input = input.cuda(non_blocking=True)

        # switch from training to output space
        target = input.data * (out_maps - 1)
        channels = target.shape[-3]

        if noise != 0:
            # introduce uniform noise to training samples (second term controls magnitude); not set up for multi-channel
            input = scramble_images(input, noise, den_var, GPU)

        output = net(input.float())
        # reshape output from flat filters to channels * filters per channel
        output = torch.reshape(output, (output.shape[0], out_maps, channels,
                                        output.shape[-2], output.shape[-1]))

        # compute the loss between the network output and our target
        loss = F.cross_entropy(output, target.long())
        err_tr.append(loss.data)  # record loss
        optimizer.zero_grad()  # reset gradients from previous passes
        loss.backward()  # back-propagation
        optimizer.step()  # update parameters

        if i % 10 == 0:  # log loss to tensorboard
            writer.add_scalar('training_loss', loss.data, epoch * len(tr) + i)

    if GPU == 1:
        cuda.synchronize()
    time_tr = time.time() - time_tr

    return err_tr, time_tr
Example #14
def _measure_performance(g, mem):
    tm = TicToc()
    tt = 0
    f = 1
    if g == -1:
        dev = torch.device('cpu')
    else:
        dev = torch.device('cuda:%s' % g)
    dtt = torch.double

    a = torch.eye(1024, 1024, dtype=dtt, device=dev)
    a.addmm_(a, a)
    if g >= 0:
        tcd.synchronize(device=dev)

    while tt < 1.0 and mem > 8.0 * (f * 2048.0) ** 2:
        tm.tic()
        a = torch.eye(f * 2048, f * 2048, dtype=dtt, device=dev)
        a.addmm_(a, a)
        if g >= 0:
            tcd.synchronize(device=dev)
        tt = tm.toc_val()
        f *= 2

    print('%s:%s - speed: %s' % (dev.type, dev.index, (float(f) ** 3) / tt))

    del a
    if g >= 0:
        tcd.synchronize(device=dev)
    tcd.empty_cache()

    return (float(f) ** 3) / tt
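A minimal usage sketch (assuming the snippet's surrounding imports, presumably `torch`, `torch.cuda as tcd`, and the `TicToc` timer, are in scope); `mem` is the available memory budget in bytes:

cpu_speed = _measure_performance(-1, 16e9)  # g == -1 benchmarks the CPU
gpu_speed = _measure_performance(0, 16e9)   # g >= 0 benchmarks cuda:<g>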
Example #15
def run_inf(model, size, model_name, start_bs=64, logging=True):
    bs = start_bs
    finish = False

    if logging:
        print('Dataset Size:', size)

    while not finish:
        try:
            start = time.perf_counter()
            total_lat = 0
            num_iter = 0

            data_loader = DataLoader(dataset=RandomDataset(size),
                                     batch_size=bs)
            with torch.no_grad():
                for _, (img, lb) in enumerate(data_loader):
                    iter_start = time.perf_counter()

                    img = img.cuda()
                    out = model(img)
                    cuda.synchronize()

                    total_lat += time.perf_counter() - iter_start
                    num_iter += 1

            finish = True

            if logging:
                print('Batch Size:', bs)
                print('Latency(s): {:.2f}'.format(total_lat / num_iter))
                print('FPS: {:.2f}'.format(LEN /
                                           (time.perf_counter() - start)))

        except Exception as e:
            bs -= 2
def _network_execution_time(network, batch):
    # forward pass
    start = time()

    cuda.synchronize()
    out, _ = network(batch)
    cuda.synchronize()

    t_forward = time() - start

    # backward pass
    start = time()

    cuda.synchronize()
    out.backward(out)
    cuda.synchronize()

    t_backward = time() - start

    return t_forward, t_backward
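A minimal usage sketch (the `TinyNet` module below is hypothetical); it assumes a CUDA device, that `time` is imported as `from time import time`, and respects the snippet's expectation that the network returns a tuple:

import torch
import torch.nn as nn

class TinyNet(nn.Module):
    # hypothetical network returning an (output, extra) tuple, as expected above
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(128, 10)

    def forward(self, x):
        return self.fc(x), None

net = TinyNet().cuda()
batch = torch.randn(32, 128, device='cuda')
t_fwd, t_bwd = _network_execution_time(net, batch)
print('forward: {:.4f}s, backward: {:.4f}s'.format(t_fwd, t_bwd))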
Example #17
def _sync_cuda():
    from torch import cuda

    cuda.synchronize()
def train(
    model_id,
    sequences_per_img=5,
    batch_size=10,
    resnet_conv_feature_size=2048,
    start_from=None,
    input_json_file_name=None,
    input_label_h5_file_name=None,
    label_smoothing=0,
    structure_loss_weight=1,
    train_sample_method="sample",
    train_beam_size=1,
    struc_use_logsoftmax=True,
    train_sample_n=5,
    structure_loss_type="seqnll",
    optimizer_type=NOAM,
    noamopt_factor=1,
    noamopt_warmup=20000,
    core_optimizer="sgd",
    learning_rate=0.0005,
    optimizer_alpha=0.9,
    optimizer_beta=0.999,
    optimizer_epsilon=1e-8,
    weight_decay=0,
    load_best_score=True,
    max_epochs=50,
    scheduled_sampling_start=-1,
    scheduled_sampling_increase_every=5,
    scheduled_sampling_increase_prob=0.05,
    scheduled_sampling_max_prob=0.25,
    self_critical_after=-1,
    structure_after=-1,
    cached_tokens="coco-train-idxs",
    grad_clip_value=0.1,
    grad_clip_mode=CLIP_VALUE,
    log_loss_iterations=25,
    save_every_epoch=True,
    save_checkpoint_iterations=3000,
    save_history_ckpt=True,
    eval_language_model=True,
):

    #
    # File names
    info_file_name = (
        join(start_from, "infos_" + model_id + ".pkl") if start_from is not None else ""
    )
    history_file_name = (
        join(start_from, "histories_" + model_id + ".pkl")
        if start_from is not None
        else ""
    )
    model_file_name = join(start_from, "model.pth") if start_from is not None else ""
    optimizer_file_name = (
        join(start_from, "optimizer.pth") if start_from is not None else ""
    )

    #
    # Load data
    loader = DataLoader(
        sequences_per_img,
        batch_size=batch_size,
        use_fc=True,
        use_att=True,
        use_box=0,
        norm_att_feat=0,
        norm_box_feat=0,
        input_json_file_name=input_json_file_name,
        input_label_h5_file_name=input_label_h5_file_name,
    )
    vocab_size = loader.vocab_size
    seq_length = loader.seq_length

    #
    # Initialize training info
    infos = {
        "iter": 0,
        "epoch": 0,
        "loader_state_dict": None,
        "vocab": loader.get_vocab(),
    }

    #
    # Load existing state training information, if there is any
    if start_from is not None and isfile(info_file_name):
        #
        with open(info_file_name, "rb") as f:
            infos.update(pickle_load(f))

    #
    # Create data logger
    histories = defaultdict(dict)
    if start_from is not None and isfile(history_file_name):
        with open(history_file_name, "rb") as f:
            histories.update(pickle_load(f))

    # tensorboard logger
    tb_summary_writer = SummaryWriter(checkpoint_path)

    #
    # Create our model
    vocab = loader.get_vocab()
    model = Transformer(
        vocab_size, resnet_conv_feature_size=resnet_conv_feature_size
    ).cuda()

    #
    # Load pretrained weights:
    if start_from is not None and isfile(model_file_name):
        model.load_state_dict(torch_load(model_file_name))

    #
    # Wrap the generation model with the loss function (used for training).
    # This allows the loss to be computed separately on each machine.
    lw_model = LossWrapper(
        model,
        label_smoothing=label_smoothing,
        structure_loss_weight=structure_loss_weight,
        train_sample_method=train_sample_method,
        train_beam_size=train_beam_size,
        struc_use_logsoftmax=struc_use_logsoftmax,
        train_sample_n=train_sample_n,
        structure_loss_type=structure_loss_type,
    )

    #
    # Wrap with dataparallel
    dp_model = DataParallel(model)
    dp_lw_model = DataParallel(lw_model)

    #
    #  Build optimizer
    if optimizer_type == NOAM:
        optimizer = get_std_opt(model, factor=noamopt_factor, warmup=noamopt_warmup)
    elif optimizer_type == REDUCE_LR:
        optimizer = build_optimizer(
            model.parameters(),
            core_optimizer=core_optimizer,
            learning_rate=learning_rate,
            optimizer_alpha=optimizer_alpha,
            optimizer_beta=optimizer_beta,
            optimizer_epsilon=optimizer_epsilon,
            weight_decay=weight_decay,
        )
        optimizer = ReduceLROnPlateau(optimizer, factor=0.5, patience=3)
    else:
        raise Exception(
            "Only supports NoamOpt and ReduceLROnPlateau optimization types"
        )

    #
    # # Load the optimizer
    if start_from is not None and isfile(optimizer_file_name):
        optimizer.load_state_dict(torch_load(optimizer_file_name))

    #
    # Prepare for training
    iteration = infos["iter"]
    epoch = infos["epoch"]
    #
    # For backward compatibility
    if "iterators" in infos:
        infos["loader_state_dict"] = {
            split: {
                "index_list": infos["split_ix"][split],
                "iter_counter": infos["iterators"][split],
            }
            for split in ["train", "val", "test"]
        }
    loader.load_state_dict(infos["loader_state_dict"])
    if load_best_score == 1:
        best_val_score = infos.get("best_val_score", None)
    if optimizer_type == NOAM:
        optimizer._step = iteration
    #
    # Assure in training mode
    dp_lw_model.train()
    epoch_done = True

    #
    # Start training
    try:
        while True:
            #
            # Check max epochs
            if epoch >= max_epochs and max_epochs != -1:
                break

            #
            # Update end of epoch data
            if epoch_done:
                #
                # Assign the scheduled sampling prob
                if epoch > scheduled_sampling_start and scheduled_sampling_start >= 0:
                    frac = (
                        epoch - scheduled_sampling_start
                    ) // scheduled_sampling_increase_every
                    ss_prob = min(
                        scheduled_sampling_increase_prob * frac,
                        scheduled_sampling_max_prob,
                    )
                    model.ss_prob = ss_prob

                #
                # If start self critical training
                if self_critical_after != -1 and epoch >= self_critical_after:
                    sc_flag = True
                    init_scorer(cached_tokens)
                else:
                    sc_flag = False

                #
                # If start structure loss training
                if structure_after != -1 and epoch >= structure_after:
                    struc_flag = True
                    init_scorer(cached_tokens)
                else:
                    struc_flag = False

                #
                # End epoch update
                epoch_done = False
            #
            # Compute time to load data
            start = time.time()
            data = loader.get_batch("train")
            load_data_time = time.time() - start
            print(f"Time to load data: {load_data_time} seconds")

            ########################
            # SYNC
            ########################
            synchronize()

            #
            # Compute time to complete epoch
            start = time.time()

            #
            # Make sure data is in GPU memory
            tmp = [
                data["fc_feats"],
                data["att_feats"],
                data["labels"],
                data["masks"],
                data["att_masks"],
            ]
            tmp = [_ if _ is None else _.cuda() for _ in tmp]
            fc_feats, att_feats, labels, masks, att_masks = tmp

            #
            # Reset gradient
            optimizer.zero_grad()

            #
            print("MADE IT TO THE MODEL EVALUATION")
            #
            # Evaluate model
            model_out = dp_lw_model(
                fc_feats,
                att_feats,
                labels,
                masks,
                att_masks,
                data["gts"],
                torch_arange(0, len(data["gts"])),
                sc_flag,
                struc_flag,
            )

            #
            # Average loss over training batch
            loss = model_out["loss"].mean()

            #
            # Compute gradient
            loss.backward()

            #
            # Clip gradient
            if grad_clip_value != 0:
                gradient_clipping_functions[grad_clip_mode](
                    model.parameters(), grad_clip_value
                )
            #
            # Update
            optimizer.step()
            train_loss = loss.item()
            end = time.time()

            ########################
            # SYNC
            ########################
            synchronize()

            #
            # Output status
            if struc_flag:
                print(
                    "iter {} (epoch {}), train_loss = {:.3f}, lm_loss = {:.3f}, struc_loss = {:.3f}, time/batch = {:.3f}".format(
                        iteration,
                        epoch,
                        train_loss,
                        model_out["lm_loss"].mean().item(),
                        model_out["struc_loss"].mean().item(),
                        end - start,
                    )
                )
            elif not sc_flag:
                print(
                    "iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}".format(
                        iteration, epoch, train_loss, end - start
                    )
                )
            else:
                print(
                    "iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}".format(
                        iteration, epoch, model_out["reward"].mean(), end - start
                    )
                )

            #
            # Update the iteration and epoch
            iteration += 1
            if data["bounds"]["wrapped"]:
                epoch += 1
                epoch_done = True

            #
            # Write the training loss summary
            if iteration % log_loss_iterations == 0:

                tb_summary_writer.add_scalar("train_loss", train_loss, iteration)

                if optimizer_type == NOAM:
                    current_lr = optimizer.rate()
                elif optimizer_type == REDUCE_LR:
                    current_lr = optimizer.current_lr

                tb_summary_writer.add_scalar("learning_rate", current_lr, iteration)
                tb_summary_writer.add_scalar(
                    "scheduled_sampling_prob", model.ss_prob, iteration
                )

                if sc_flag:
                    tb_summary_writer.add_scalar(
                        "avg_reward", model_out["reward"].mean(), iteration
                    )
                elif struc_flag:
                    tb_summary_writer.add_scalar(
                        "lm_loss", model_out["lm_loss"].mean().item(), iteration
                    )
                    tb_summary_writer.add_scalar(
                        "struc_loss", model_out["struc_loss"].mean().item(), iteration
                    )
                    tb_summary_writer.add_scalar(
                        "reward", model_out["reward"].mean().item(), iteration
                    )
                    tb_summary_writer.add_scalar(
                        "reward_var", model_out["reward"].var(1).mean(), iteration
                    )

                histories["loss_history"][iteration] = (
                    train_loss if not sc_flag else model_out["reward"].mean()
                )
                histories["lr_history"][iteration] = current_lr
                histories["ss_prob_history"][iteration] = model.ss_prob

            #
            # Update infos
            infos["iter"] = iteration
            infos["epoch"] = epoch
            infos["loader_state_dict"] = loader.state_dict()

            #
            # Make evaluation on validation set, and save model
            if (
                iteration % save_checkpoint_iterations == 0 and not save_every_epoch
            ) or (epoch_done and save_every_epoch):
                #
                # Evaluate model on Validation set of COCO
                eval_kwargs = {"split": "val", "dataset": input_json_file_name}
                val_loss, predictions, lang_stats = eval_split(
                    dp_model,
                    lw_model.crit,
                    loader,
                    verbose=True,
                    verbose_beam=False,
                    verbose_loss=True,
                    num_images=-1,
                    split="val",
                    lang_eval=False,
                    dataset="coco",
                    beam_size=1,
                    sample_n=1,
                    remove_bad_endings=False,
                    dump_path=False,
                    dump_images=False,
                    job_id="FUN_TIME",
                )

                #
                # Reduces learning rate if no improvement in objective
                if optimizer_type == REDUCE_LR:
                    if "CIDEr" in lang_stats:
                        optimizer.scheduler_step(-lang_stats["CIDEr"])
                    else:
                        optimizer.scheduler_step(val_loss)

                #
                # Write validation result into summary
                tb_summary_writer.add_scalar("validation loss", val_loss, iteration)
                if lang_stats is not None:
                    for k, v in lang_stats.items():
                        tb_summary_writer.add_scalar(k, v, iteration)

                histories["val_result_history"][iteration] = {
                    "loss": val_loss,
                    "lang_stats": lang_stats,
                    "predictions": predictions,
                }

                #
                # Save model if is improving on validation result
                if eval_language_model:
                    current_score = lang_stats["CIDEr"]
                else:
                    current_score = -val_loss

                best_flag = False

                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True

                #
                # Dump miscellaneous information
                infos["best_val_score"] = best_val_score

                #
                # Save checkpoints. It seems only the most recent one keeps histories,
                # and it is overwritten each time.
                save_checkpoint(
                    model,
                    infos,
                    optimizer,
                    checkpoint_dir=checkpoint_path,
                    histories=histories,
                    append="RECENT",
                )
                if save_history_ckpt:
                    save_checkpoint(
                        model,
                        infos,
                        optimizer,
                        checkpoint_dir=checkpoint_path,
                        append=str(epoch) if save_every_epoch else str(iteration),
                    )
                if best_flag:
                    save_checkpoint(
                        model,
                        infos,
                        optimizer,
                        checkpoint_dir=checkpoint_path,
                        append="BEST",
                    )

    except (RuntimeError, KeyboardInterrupt):
        print(f'{BAR("=", 20)}Save checkpoint on exception...')
        save_checkpoint(
            model, infos, optimizer, checkpoint_dir=checkpoint_path, append="EXCEPTION"
        )
        print(f'...checkpoint saved.{BAR("=", 20)}')
        stack_trace = format_exc()
        print(stack_trace)
Example #19
 def update(self, W_grad, w0_grad, step_size):
     self.W -= W_grad * step_size
     self.w0 -= w0_grad * step_size
     cuda.synchronize()
     return
Example #20
 def weight_norm(self):
     self.norm = sqrt(norm(self.W)**2 + norm(self.w0)**2)
     cuda.synchronize()
     return self.norm
Example #21
    def train(self, train_loader, loss_fn, optimizer,train_metrics,test_loader=None,test_metrics=None, num_epochs=10, lr_schedule=None,
              save_models="all", model_dir=os.getcwd(),notebook_mode=False,batch_log=True,save_logs=None,display_metrics=True,save_metrics=True):


        if save_models not in ["all", "best"]:
            raise ValueError("save models must be 'all' or 'best' , {} is invalid".format(save_models))
        if save_models == "best" and test_loader is None:
            raise ValueError("save models can only be best when testloader is provided")

        if test_loader is not None:
            if test_metrics is None:
                raise ValueError("You must provide a metric for your test data")
            elif len(test_metrics) == 0:
                raise ValueError("test metrics cannot be an empty list")

        if not os.path.exists(model_dir):
            os.mkdir(model_dir)


        models_all = os.path.join(model_dir, "all_models")
        models_best = os.path.join(model_dir, "best_models")


        if not os.path.exists(models_all):
            os.mkdir(models_all)

        if not os.path.exists(models_best) and test_loader is not None:
            os.mkdir(models_best)


        from tqdm import tqdm_notebook
        from tqdm import tqdm

        best_metric = 0.0
        train_start_time = time()
        for e in tqdm(range(num_epochs)):
            print("Epoch {} of {}".format(e,num_epochs))

            for metric in train_metrics:
                metric.reset()

            self.model.train()
            self.on_epoch_start(e)

            running_loss = torch.Tensor([0.0])
            train_loss = 0.0
            data_len = 0


            if notebook_mode and batch_log:
                progress_ = tqdm_notebook(enumerate(train_loader))
            elif batch_log:
                progress_ = tqdm(enumerate(train_loader))
            else:
                progress_ = enumerate(train_loader)

            main_batch_size = 0

            init_time = time()

            for i, data in progress_:
                self.on_batch_start(e, i)

                if isinstance(data, list) or isinstance(data, tuple):
                    inputs = data[0]
                else:
                    inputs = data
                batch_size = inputs.size(0)

                if main_batch_size < batch_size:
                    main_batch_size = batch_size
                if len(self.__input_hooks) > 0:

                    for hook in self.__input_hooks:
                        inputs = hook(inputs)

                if isinstance(data, list):
                    data[0] = inputs
                elif isinstance(data, tuple):
                    data = (inputs,data[1])
                else:
                    data = inputs

                self.__train_func__(data,optimizer,loss_fn,train_metrics,running_loss,e,i)

                data_len += batch_size
                train_loss = running_loss.item()/data_len

                if batch_log:
                    progress_message = ""
                    for metric in train_metrics:
                        progress_message += "Train {} : {}".format(metric.name, metric.getValue())
                    progress_.set_description("{}/{} batches ".format(int(ceil(data_len / main_batch_size)),
                                                              int(ceil(len(train_loader.dataset) / main_batch_size))))
                    progress_dict = {"Train Loss": train_loss}
                    for metric in train_metrics:
                        progress_dict["Train " + metric.name] = metric.getValue()

                    progress_.set_postfix(progress_dict)

                self.on_batch_end(e, i, train_metrics, train_loss)
            if self.cuda:
                cuda.synchronize()

            self.loss_history.append(train_loss)
            duration = time() - init_time

            if lr_schedule is not None:
                lr = lr_schedule(e)
                adjust_learning_rate(lr,optimizer)

            model_file = os.path.join(models_all, "model_{}.pth".format(e))
            self.save_model(model_file)

            logfile = None
            if save_logs is not None:
                logfile = open(save_logs,"a")


            print(os.linesep+"Epoch: {}, Duration: {} , Train Loss: {}".format(e, duration, train_loss))
            if logfile is not None:
                logfile.write(os.linesep+"Epoch: {}, Duration: {} , Train Loss: {}".format(e, duration, train_loss))

            if test_loader is not None:
                message = "Accuracy did not improve"
                current_best = best_metric
                self.evaluate(test_loader,test_metrics)
                result = test_metrics[0].getValue()
                if result > current_best:
                    best_metric = result
                    message = "{} improved from {} to {}".format(test_metrics[0].name,current_best, result)
                    model_file = os.path.join(models_best,"model_{}.pth".format(e))
                    self.save_model(model_file)

                    print(os.linesep+"{} New Best Model saved in {}".format(message,model_file))
                    if logfile is not None:
                        logfile.write(os.linesep+"{} New Best Model saved in {}".format(message,model_file))

                else:
                    print(os.linesep+message)
                    if logfile is not None:
                        logfile.write(os.linesep+message)

                for metric in test_metrics:
                    print("Test {} : {}".format(metric.name,metric.getValue()))
                    if logfile is not None:
                        logfile.write(os.linesep+"Test {} : {}".format(metric.name,metric.getValue()))


            for metric in train_metrics:
                print("Train {} : {}".format(metric.name, metric.getValue()))
                if logfile is not None:
                    logfile.write(os.linesep + "Train {} : {}".format(metric.name, metric.getValue()))

            if logfile is not None:
                logfile.close()

            for metric in train_metrics:
                metric.add_history()


            epoch_arr = [x for x in range(e+1)]

            if display_metrics or save_metrics:

                save_path = None

                if save_metrics:
                    save_path = os.path.join(model_dir, "epoch_{}_loss.png".format(e))
                visualize(epoch_arr, [PlotInput(value=self.loss_history, name="Train Loss", color="red")],display=display_metrics,
                          save_path=save_path)

            if test_loader is not None and (display_metrics or save_metrics):
                    for metric in test_metrics:

                        save_path = None

                        if save_metrics:
                            save_path = os.path.join(model_dir, "test_{}_epoch_{}.png".format(metric.name, e))
                        visualize(epoch_arr, [PlotInput(value=metric.history, name="Test "+metric.name, color="blue")],display=display_metrics,
                                      save_path=save_path)
            for metric in train_metrics:
                if save_metrics:
                    save_path = os.path.join(model_dir, "train_{}_epoch_{}.png".format(metric.name, e))
                visualize(epoch_arr, [PlotInput(value=metric.history, name="Train " + metric.name, color="blue")],display=display_metrics,
                          save_path=save_path)

            self.on_epoch_end(e, train_metrics, test_metrics, train_loss, duration)
        train_end_time = time() - train_start_time

        self.on_training_completed(train_metrics,test_metrics,train_end_time)
Example #22
    def train(self,
              target,
              source,
              gen_optimizer,
              disc_optimizer,
              num_epochs=10,
              disc_steps=1,
              gen_lr_schedule=None,
              disc_lr_schedule=None,
              model_dir=os.getcwd(),
              save_interval=100,
              notebook_mode=False,
              batch_log=True,
              save_logs=None,
              display_metrics=True,
              save_metrics=True):
        assert (len(target.dataset) == len(source.dataset))
        assert (disc_steps < len(target.dataset))

        if not os.path.exists(model_dir):
            os.mkdir(model_dir)

        self.model_dir = model_dir
        models_gen = os.path.join(model_dir, "gen_models")
        models_disc = os.path.join(model_dir, "disc_models")

        if not os.path.exists(models_gen):
            os.mkdir(models_gen)

        if not os.path.exists(models_disc):
            os.mkdir(models_disc)

        iterations = 0

        from tqdm import tqdm_notebook
        from tqdm import tqdm

        train_start_time = time()

        for e in tqdm(range(num_epochs)):

            self.gen_model.train()
            self.disc_model.train()
            self.on_epoch_start(e)

            running_gen_loss = torch.Tensor([0.0])
            running_disc_loss = torch.Tensor([0.0])
            gen_loss = 0.0
            disc_loss = 0.0
            gen_data_len = 0
            disc_data_len = 0

            if notebook_mode and batch_log:
                progress_ = tqdm_notebook(enumerate(zip(target, source)))
            elif batch_log:
                progress_ = tqdm(enumerate(zip(target, source)))
            else:
                progress_ = enumerate(zip(target, source))

            init_time = time()

            for i, (t, s) in progress_:

                if isinstance(t, list) or isinstance(t, tuple):
                    inputs = t[0]
                else:
                    inputs = t
                batch_size = inputs.size(0)
                disc_data_len += batch_size

                if len(self.__input_hooks) > 0:

                    for hook in self.__input_hooks:
                        inputs = hook(inputs)

                if isinstance(t, list):
                    t[0] = inputs
                elif isinstance(t, tuple):
                    t = (inputs, t[1])
                else:
                    t = inputs

                self.__disc_train_func__(t, s, disc_optimizer,
                                         running_disc_loss, e, i)

                disc_loss = (running_disc_loss.data[0] / disc_data_len).item()

                if (i + 1) % disc_steps == 0:
                    self.__gen_train_func__(t, s, gen_optimizer,
                                            running_gen_loss, e, i)
                    gen_data_len += batch_size

                    gen_loss = (running_gen_loss.data[0] / gen_data_len).item()

                if batch_log:
                    progress_dict = {
                        "Gen Loss": gen_loss,
                        "Disc Loss": disc_loss
                    }
                    progress_.set_postfix(progress_dict)

                iterations += 1

                if iterations % save_interval == 0:
                    self.save(s, iterations)
                    self.show(s, iterations)

                self.on_batch_end(e, i, gen_loss, disc_loss)
            if self.cuda:
                cuda.synchronize()
            duration = time() - init_time

            self.disc_loss_history.append(disc_loss)
            self.gen_loss_history.append(gen_loss)

            if gen_lr_schedule is not None:
                lr = gen_lr_schedule(e)
                adjust_learning_rate(lr, gen_optimizer)

            if disc_lr_schedule is not None:
                lr = disc_lr_schedule(e)
                adjust_learning_rate(lr, disc_optimizer)

            model_file = os.path.join(models_gen, "gen_model_{}.pth".format(e))
            self.save_generator(model_file)

            model_file = os.path.join(models_disc,
                                      "disc_model_{}.pth".format(e))
            self.save_discriminator(model_file)

            print(
                "Epoch: {}, Duration: {} , Gen Loss: {} Disc Loss: {}".format(
                    e, duration, gen_loss, disc_loss))

            if save_logs is not None:
                logfile = open(save_logs, "a")
                logfile.write(
                    "Epoch: {}, Duration: {} , Gen Loss: {} Disc Loss: {}".
                    format(e, duration, gen_loss, disc_loss))
                logfile.close()

            epoch_arr = [x for x in range(e + 1)]

            if display_metrics or save_metrics:

                save_path = None

                if save_metrics:
                    save_path = os.path.join(model_dir,
                                             "epoch_{}_loss.png".format(e))

                visualize(epoch_arr, [
                    PlotInput(value=self.gen_loss_history,
                              name="Generator Loss",
                              color="red"),
                    PlotInput(value=self.disc_loss_history,
                              name="Discriminator Loss",
                              color="red")
                ],
                          display=display_metrics,
                          save_path=save_path)

            self.on_epoch_end(e, gen_loss, disc_loss, duration)
        train_end_time = time() - train_start_time
        self.on_training_completed(train_end_time)
Example #23
def main():
    # For reproducibility
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    np.random.seed(args.seed)

    train_loader, val_loader = getDataLoader(args, logger)

    # Network
    aanet = nets.AANet(
        args.max_disp,
        num_downsample=args.num_downsample,
        feature_type=args.feature_type,
        no_feature_mdconv=args.no_feature_mdconv,
        feature_pyramid=args.feature_pyramid,
        feature_pyramid_network=args.feature_pyramid_network,
        feature_similarity=args.feature_similarity,
        aggregation_type=args.aggregation_type,
        useFeatureAtt=args.useFeatureAtt,
        num_scales=args.num_scales,
        num_fusions=args.num_fusions,
        num_stage_blocks=args.num_stage_blocks,
        num_deform_blocks=args.num_deform_blocks,
        no_intermediate_supervision=args.no_intermediate_supervision,
        refinement_type=args.refinement_type,
        mdconv_dilation=args.mdconv_dilation,
        deformable_groups=args.deformable_groups).to(device)

    # logger.info('%s' % aanet) if local_master else None
    if local_master:
        structure_of_net = os.path.join(args.checkpoint_dir,
                                        'structure_of_net.txt')
        with open(structure_of_net, 'w') as f:
            f.write('%s' % aanet)

    if args.pretrained_aanet is not None:
        logger.info('=> Loading pretrained AANet: %s' % args.pretrained_aanet)
        # Enable training from a partially pretrained model
        utils.load_pretrained_net(aanet,
                                  args.pretrained_aanet,
                                  no_strict=(not args.strict))

    aanet.to(device)
    logger.info('=> Use %d GPUs' %
                torch.cuda.device_count()) if local_master else None
    # if torch.cuda.device_count() > 1:
    if args.distributed:
        # aanet = torch.nn.DataParallel(aanet)
        # try distributed training
        aanet = torch.nn.SyncBatchNorm.convert_sync_batchnorm(aanet)
        aanet = torch.nn.parallel.DistributedDataParallel(
            aanet, device_ids=[local_rank], output_device=local_rank)
        synchronize()

    # Save parameters
    num_params = utils.count_parameters(aanet)
    logger.info('=> Number of trainable parameters: %d' % num_params)
    save_name = '%d_parameters' % num_params
    # this is an empty file; its name simply records how many trainable parameters the model has
    open(os.path.join(args.checkpoint_dir, save_name), 'a').close() if local_master else None

    # Optimizer
    # Learning rate for offset learning is set 0.1 times those of existing layers
    specific_params = list(
        filter(utils.filter_specific_params, aanet.named_parameters()))
    base_params = list(
        filter(utils.filter_base_params, aanet.named_parameters()))

    specific_params = [kv[1]
                       for kv in specific_params]  # kv is a tuple (key, value)
    base_params = [kv[1] for kv in base_params]

    specific_lr = args.learning_rate * 0.1
    params_group = [
        {
            'params': base_params,
            'lr': args.learning_rate
        },
        {
            'params': specific_params,
            'lr': specific_lr
        },
    ]

    optimizer = torch.optim.Adam(params_group, weight_decay=args.weight_decay)

    # Resume training
    if args.resume:
        # 1. resume AANet
        start_epoch, start_iter, best_epe, best_epoch = utils.resume_latest_ckpt(
            args.checkpoint_dir, aanet, 'aanet')
        # 2. resume Optimizer
        utils.resume_latest_ckpt(args.checkpoint_dir, optimizer, 'optimizer')
    else:
        start_epoch = 0
        start_iter = 0
        best_epe = None
        best_epoch = None

    # LR scheduler
    if args.lr_scheduler_type is not None:
        last_epoch = start_epoch if args.resume else start_epoch - 1
        if args.lr_scheduler_type == 'MultiStepLR':
            milestones = [int(step) for step in args.milestones.split(',')]
            lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
                optimizer,
                milestones=milestones,
                gamma=args.lr_decay_gamma,
                last_epoch=last_epoch
            )  # this last_epoch argument matters: when resuming, the learning rate is automatically adjusted to match last_epoch
        else:
            raise NotImplementedError
    # model.Model(object) further wraps AANet.
    train_model = model.Model(args,
                              logger,
                              optimizer,
                              aanet,
                              device,
                              start_iter,
                              start_epoch,
                              best_epe=best_epe,
                              best_epoch=best_epoch)

    logger.info('=> Start training...')

    trainLoss_dict, trainLossKey, valLoss_dict, valLossKey = getLossRecord(
        netName="AANet")

    if args.evaluate_only:
        assert args.val_batch_size == 1
        train_model.validate(
            val_loader, local_master, valLoss_dict,
            valLossKey)  # test mode: --evaluate_only should be set and --mode should be "test"
        # save the losses for analysis
        save_loss_for_matlab(trainLoss_dict, valLoss_dict)
    else:
        for epoch in range(start_epoch, args.max_epoch):  # main training loop (epochs)
            if not args.evaluate_only:
                # ensure distribute worker sample different data,
                # set different random seed by passing epoch to sampler
                if args.distributed:
                    train_loader.sampler.set_epoch(epoch)
                    logger.info(
                        'train_loader.sampler.set_epoch({})'.format(epoch))
                train_model.train(train_loader, local_master, trainLoss_dict,
                                  trainLossKey)
            if not args.no_validate:
                train_model.validate(val_loader, local_master, valLoss_dict,
                                     valLossKey)  # in training mode: validate while training
            if args.lr_scheduler_type is not None:
                lr_scheduler.step()  # adjust the learning rate

            # save the losses for analysis; saved (and overwritten) after every epoch so we don't have to wait for training to finish
            save_loss_for_matlab(trainLoss_dict, valLoss_dict)

        logger.info('=> End training\n\n')
Example #24
def main():
    # For reproducibility
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    np.random.seed(args.seed)

    train_loader, val_loader = getDataLoader(args, logger)

    net = selectModel(args.model)

    # logger.info('%s' % net) if local_master else None

    # if args.pretrained_net is not None:
    #     logger.info('=> Loading pretrained Net: %s' % args.pretrained_net)
    #     # Enable training from a partially pretrained model
    #     utils.load_pretrained_net(net, args.pretrained_net, strict=args.strict, logger=logger)

    net.to(device)
    # if torch.cuda.device_count() > 1:
    if args.distributed:
        # aanet = torch.nn.DataParallel(aanet)
        # try distributed training
        net = torch.nn.SyncBatchNorm.convert_sync_batchnorm(net)
        net = torch.nn.parallel.DistributedDataParallel(
            net, device_ids=[local_rank], output_device=local_rank)
        synchronize()

    # Save parameters
    num_params = utils.count_parameters(net)
    logger.info('=> Number of trainable parameters: %d' % num_params)

    # special parts of the network get a special learning rate: specific_lr = args.learning_rate * 0.1
    params_group = setInitLR(net, args)

    # Optimizer
    optimizer = torch.optim.Adam(params_group, weight_decay=args.weight_decay)

    # Resume training
    if args.resume:
        # 1. resume Net
        start_epoch, start_iter, best_epe, best_epoch = utils.resume_latest_ckpt(
            args.checkpoint_dir, net, 'net_latest', False, logger)
        # 2. resume Optimizer
        utils.resume_latest_ckpt(args.checkpoint_dir, optimizer,
                                 'optimizer_latest', True, logger)
    else:
        start_epoch = 0
        start_iter = 0
        best_epe = None
        best_epoch = None

    # LR scheduler
    if args.lr_scheduler_type is not None:
        last_epoch = start_epoch if args.resume else start_epoch - 1
        if args.lr_scheduler_type == 'MultiStepLR':
            milestones = [int(step) for step in args.milestones.split(',')]
            lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
                optimizer,
                milestones=milestones,
                gamma=args.lr_decay_gamma,
                last_epoch=last_epoch
            )  # this last_epoch argument matters: when resuming, the learning rate is automatically adjusted to match last_epoch
        else:
            raise NotImplementedError
    # model.Model(net) further wraps net.
    train_model = model.Model(args,
                              logger,
                              optimizer,
                              net,
                              device,
                              start_iter,
                              start_epoch,
                              best_epe=best_epe,
                              best_epoch=best_epoch)
    logger.info('=> Start training...')

    for epoch in range(start_epoch, args.max_epoch):  # main training loop (epochs)
        # ensure distribute worker sample different data,
        # set different random seed by passing epoch to sampler
        if args.distributed:
            train_loader.sampler.set_epoch(epoch)
            logger.info('train_loader.sampler.set_epoch({})'.format(epoch))

        train_model.train(train_loader, local_master)

        if args.do_validate:
            train_model.validate(val_loader, local_master)  # in training mode: validate while training

        if args.lr_scheduler_type is not None:
            lr_scheduler.step()  # adjust the learning rate

    logger.info('=> End training\n\n')
def generate_samples(n_samples, sample_batch_size, sample_x_dim, sample_y_dim,
                     conv_field, generator, bound_type, GPU, cuda,
                     training_data, out_maps, boundary_layers, noise, den_var,
                     channels, temperature, dataset_size):
    if GPU == 1:
        cuda.synchronize()
    time_ge = time.time()

    sample_x_padded = sample_x_dim + 2 * conv_field * boundary_layers
    sample_y_padded = sample_y_dim + conv_field * boundary_layers  # don't need to pad the bottom

    sample_batch_size, changed = get_sample_batch_size(
        sample_batch_size, generator, sample_x_padded, sample_y_padded,
        conv_field, channels, GPU
    )  # add extra padding by conv_field in both x-directions, and in the + y direction, which we will remove later
    if changed:
        print('Sample batch size changed to {}'.format(sample_batch_size))

    if n_samples < sample_batch_size:
        n_samples = sample_batch_size

    batches = int(np.ceil(n_samples / sample_batch_size))
    n_samples = sample_batch_size * batches
    sample = torch.ByteTensor(n_samples, channels, sample_y_dim,
                              sample_x_dim)  # sample placeholder
    print('Generating {} Samples'.format(n_samples))

    for batch in range(batches):  # can't do these all at once so we do it in batches
        print('Batch {} of {} batches'.format(batch + 1, batches))
        # needs to be explicitly padded by the convolutional field
        sample_batch = torch.FloatTensor(
            sample_batch_size, channels, sample_y_padded + 2 * conv_field,
            sample_x_padded + 2 * conv_field)
        sample_batch.fill_(0)  # initialize with minimum value

        if bound_type > 0:
            sample_batch = build_boundary(sample_batch, sample_batch_size,
                                          training_data, conv_field, generator,
                                          bound_type, out_maps, noise, den_var,
                                          dataset_size, GPU)

        if GPU == 1:
            sample_batch = sample_batch.cuda()

        #generator.train(False)
        generator.eval()
        with torch.no_grad():  # we will not be updating weights
            for i in tqdm.tqdm(range(conv_field, sample_y_padded + conv_field)):  # for each pixel
                for j in range(conv_field, sample_x_padded + conv_field):
                    for k in range(channels):
                        # query the network about only the area within the receptive field
                        out = generator(
                            sample_batch[:, :,
                                         i - conv_field:i + conv_field + 1,
                                         j - conv_field:j + conv_field + 1].float())
                        # reshape to select channels
                        out = torch.reshape(
                            out, (out.shape[0], out_maps, channels,
                                  out.shape[-2], out.shape[-1]))
                        # normalize temperature, graded against the boundary
                        normed_temp = torch.mean(
                            torch.abs(out[:, 1:, k, 0, 0])) * temperature  # + np.exp(-i / conv_field / 2)
                        # remove the lowest element (boundary)
                        probs = F.softmax(out[:, 1:, k, 0, 0] / normed_temp, dim=1).data
                        # convert output back to training space
                        sample_batch[:, k, i, j] = (torch.multinomial(probs, 1).float()
                                                    + 1).squeeze(1) / (out_maps - 1)
                        del out, probs

        for k in range(channels):
            # convert back to input space
            sample[batch * sample_batch_size:(batch + 1) * sample_batch_size, k, :, :] = \
                sample_batch[:, k,
                             (boundary_layers + 1) * conv_field:-conv_field,
                             (boundary_layers + 1) * conv_field:-((boundary_layers + 1) * conv_field)] * (out_maps - 1) - 1

    if GPU == 1:
        cuda.synchronize()
    time_ge = time.time() - time_ge

    return sample, time_ge, sample_batch_size, n_samples
Example #26
                                     batch_size=batch_size,
                                     shuffle=True,
                                     num_workers=4)

model = draw(seq_len)
model.cuda()
# setup optimizer
optimizer = optim.Adam(model.parameters(), lr=0.0002, betas=(0.5, 0.999))

# train
for epoch in range(25):
    for i, (data, _) in enumerate(loader, 0):
        input = Variable(data).cuda()
        recon_batch, mu_t, logvar_t = model(input, seq_len)
        # loss = Variable(torch.FloatTensor(1).fill_(0).cuda())
        cuda.synchronize()  # otherwise a synchronize error occurs
        loss = loss_function(recon_batch, input, mu_t, logvar_t, seq_len,
                             input.size(0), img_size)
        model.zero_grad()
        loss.backward()
        optimizer.step()
        if i % 10 == 0:
            ##########################
            # Visualization
            ##########################
            images = make_grid(recon_batch.data[:8])
            writer.add_image('output', images, i)
            images = make_grid(data[:8])
            writer.add_image('images', images, i)
        writer.add_scalar('error', loss.data[0], i)
Example #27
with open(fname, "w", newline="") as f:
    csv_file = csv.writer(f, delimiter=',')
    #
    # Iterate to convergence
    eval_counter = 0
    train_loss = L(MNIST_Data.X, MNIST_Data.Y)
    train_loss_delta = train_loss
    while eval_counter <= max_iter:
        print(eval_counter)
        #
        # Iterate over batches
        for (x, y) in MNIST_Data:
            #
            # Compute gradient and gradient norm
            grad_W, grad_w0 = L.gradient(x, y)
            cuda.synchronize()
            #
            # Update weights
            cuda.synchronize()
            F.update(grad_W, grad_w0, step_size)
        #
        # Update time step
        # step_size = dynamic_stepper.next()
        #
        # Update convergence criterion check
        train_loss = L(MNIST_Data.X, MNIST_Data.Y)
        test_loss = L(MNIST_Data_test.X, MNIST_Data_test.Y)
        eval_counter += 1
        #
        # Write progress
        csv_file.writerow(
Example #28
    def train(self, target,source,gen_optimizer,disc_optimizer,num_epochs=10, disc_steps=1, gen_lr_schedule=None,disc_lr_schedule=None, model_dir=os.getcwd(), save_interval=100,notebook_mode=False,batch_log=True,save_logs=None,display_metrics=True,save_metrics=True):
        assert(len(target.dataset) == len(source.dataset))
        assert(disc_steps < len(target.dataset))

        if not os.path.exists(model_dir):
            os.mkdir(model_dir)

        self.model_dir = model_dir
        models_gen = os.path.join(model_dir, "gen_models")
        models_disc = os.path.join(model_dir, "disc_models")

        if not os.path.exists(models_gen):
            os.mkdir(models_gen)

        if not os.path.exists(models_disc):
            os.mkdir(models_disc)

        iterations = 0

        from tqdm import tqdm_notebook
        from tqdm import tqdm

        train_start_time = time()

        for e in tqdm(range(num_epochs)):

            self.gen_model.train()
            self.disc_model.train()
            self.on_epoch_start(e)

            running_gen_loss = torch.Tensor([0.0])
            running_disc_loss = torch.Tensor([0.0])
            gen_loss = 0.0
            disc_loss = 0.0
            gen_data_len = 0
            disc_data_len = 0

            if notebook_mode and batch_log:
                progress_ = tqdm_notebook(enumerate(zip(target,source)))
            elif batch_log:
                progress_ = tqdm(enumerate(zip(target,source)))
            else:
                progress_ = enumerate(zip(target,source))

            init_time = time()

            for i,(t,s) in progress_:

                if isinstance(t, list) or isinstance(t, tuple):
                    inputs = t[0]
                else:
                    inputs = t
                batch_size = inputs.size(0)
                disc_data_len += batch_size

                if len(self.__input_hooks) > 0:

                    for hook in self.__input_hooks:
                        inputs = hook(inputs)

                if isinstance(t, list):
                    t[0] = inputs
                elif isinstance(t, tuple):
                    t = (inputs,t[1])
                else:
                    t = inputs

                self.__disc_train_func__(t, s, disc_optimizer, running_disc_loss, e, i)

                disc_loss = running_disc_loss.data[0] / disc_data_len

                if (i+1) % disc_steps == 0:
                    self.__gen_train_func__(t, s, gen_optimizer, running_gen_loss, e, i)
                    gen_data_len += batch_size

                    gen_loss = running_gen_loss.data[0] / gen_data_len

                if batch_log:
                    progress_dict = {"Gen Loss": gen_loss, "Disc Loss": disc_loss}
                    progress_.set_postfix(progress_dict)

                iterations += 1

                if iterations % save_interval == 0:
                    self.save(s,iterations)
                    self.show(s,iterations)

                self.on_batch_end(e, i, gen_loss, disc_loss)
            if self.cuda:
                cuda.synchronize()
            duration = time() - init_time

            self.disc_loss_history.append(disc_loss)
            self.gen_loss_history.append(gen_loss)

            if gen_lr_schedule is not None:
                lr = gen_lr_schedule(e)
                adjust_learning_rate(lr,gen_optimizer)

            if disc_lr_schedule is not None:
                lr = disc_lr_schedule(e)
                adjust_learning_rate(lr, disc_optimizer)

            model_file = os.path.join(models_gen, "gen_model_{}.pth".format(e))
            self.save_generator(model_file)

            model_file = os.path.join(models_disc, "disc_model_{}.pth".format(e))
            self.save_discriminator(model_file)

            print("Epoch: {}, Duration: {} , Gen Loss: {} Disc Loss: {}".format(e, duration, gen_loss,disc_loss))

            if save_logs is not None:
                logfile = open(save_logs, "a")
                logfile.write("Epoch: {}, Duration: {} , Gen Loss: {} Disc Loss: {}".format(e, duration, gen_loss,disc_loss))
                logfile.close()

            epoch_arr = [x for x in range(e + 1)]

            if display_metrics or save_metrics:

                save_path = None

                if save_metrics:
                    save_path = os.path.join(model_dir, "epoch_{}_loss.png".format(e))

                visualize(epoch_arr, [PlotInput(value=self.gen_loss_history, name="Generator Loss", color="red"),
                                      PlotInput(value=self.disc_loss_history, name="Discriminator Loss", color="red")],display=display_metrics,
                          save_path=save_path)

            self.on_epoch_end(e,gen_loss, disc_loss, duration)
        train_end_time = time() - train_start_time
        self.on_training_completed(train_end_time)