Example #1
    def __init__(self, config, logger=None, verbose=False):
        super(AbstractNetwork, self).__init__()  # Must call super __init__()

        # update configuration
        config = self.model_specific_config_update(config)

        self.optimizer, self.sample_data, self.models_to_update = None, None, None
        self.training_mode = True
        self.evaluate_after = config["evaluation"].get("evaluate_after", 1)

        self.it = 0  # it: iteration
        self.tm = timer.Timer()  # tm: timer
        self.grad_clip = config["optimize"].get("gradient_clip", 10)
        self.update_every = config["optimize"].get("update_every", 1)
        self.use_gpu = config["model"].get(
            "use_gpu", True if torch.cuda.is_available else False)
        self.device = torch.device("cuda" if self.use_gpu else "cpu")
        if len(config["misc"]["tensorboard_dir"]) > 0:
            self.create_tensorboard_summary(config["misc"]["tensorboard_dir"])

        # save configuration for later network reproduction
        resume = config["model"].get("resume", False)
        if not resume:
            save_config_path = os.path.join(config["misc"]["result_dir"],
                                            "config.yml")
            io_utils.write_yaml(save_config_path, config)
        self.config = config
        # prepare logging
        if logger is not None:
            self.log = logger.info
        else:
            self.log = print
        self.log(json.dumps(config, indent=2))
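
For reference, a minimal config dict covering only the keys this constructor reads; the values are illustrative placeholders, and a real experiment config would define many more entries:

config = {
    "evaluation": {"evaluate_after": 1},
    "optimize": {"gradient_clip": 10, "update_every": 1},
    "model": {"use_gpu": True, "resume": False},
    "misc": {"tensorboard_dir": "", "result_dir": "./results"},
}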
Example #2
def evaluate(config,
             loader,
             net,
             epoch,
             logger_name="epoch",
             mode="Train",
             verbose_every=None):

    if verbose_every is None:
        verbose_every = config["evaluation"]["print_every"]
    # load logger
    if logger_name == "epoch":
        logger = io_utils.get_logger("Train")
    elif logger_name == "eval":
        logger = io_utils.get_logger("Evaluate")
    else:
        raise NotImplementedError()

    net.eval_mode()  # set network to eval mode
    net.reset_status()  # reset status
    """ Run network validation """
    ii = 0
    tm = timer.Timer()
    for batch in loader:
        data_load_duration = tm.get_duration()
        # forward the network
        tm.reset()
        outputs = net.evaluate(batch)
        run_duration = tm.get_duration()

        # accumulate the number of correct answers
        net.compute_status(outputs[1], batch[0][-1])

        # print learning information
        if ((verbose_every > 0) and ((ii+1) % verbose_every == 0)) \
                or config["misc"]["debug"]:
            net.print_status(epoch + 1, ii + 1, mode="eval")
            txt = "[TEST] fetching for {:.3f}s, inference for {:.3f}s\n"
            logger.debug(txt.format(data_load_duration, run_duration))

        ii += 1
        tm.reset()

        if (config["misc"]["debug"]) and (ii > 2):
            break
        # end for batch in loader

    # top1 average accuracy; used later for hyperparameter tuning
    net.metric = net.counters["top1-avg"].get_average()
    net.print_counters_info(epoch + 1, logger_name=logger_name, mode=mode)
    net.save_results(None, "epoch_{:03d}".format(epoch + 1), mode="eval")
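
The timer.Timer helper used throughout these examples is not shown on this page; below is a minimal sketch consistent with the reset()/get_duration() calls above, with the implementation assumed from usage:

import time

class Timer(object):
    """Minimal stopwatch matching the reset()/get_duration() usage above."""

    def __init__(self):
        self.reset()

    def reset(self):
        # store the reference time for the next measurement
        self.start = time.time()

    def get_duration(self):
        # seconds elapsed since the last reset()
        return time.time() - self.start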
Example #3
    def __init__(self):
        super(VirtualNetwork, self).__init__()  # Must call super __init__()

        self.models_to_update = None
        self.sample_data = None
        self.optimizer = None
        self.training_mode = True
        self.is_main_net = True

        self.counters = None
        self.status = None
        self.use_tf_summary = False
        self.it = 0  # it: iteration
        self.update_every = 1
        self.debug_mode = False
        self.qsts = None

        self._create_counters()
        self._get_loggers()
        self.reset_status(init_reset=True)

        self.tm = timer.Timer()  # tm: timer
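
The counters built by _create_counters() are likewise external to this page; judging from counters["top1-avg"].get_average() in Example #2, each counter is a running average. A sketch of such an accumulator, with everything beyond get_average() assumed:

class Accumulator(object):
    """Hypothetical running-average counter; only get_average()
    appears on this page, the rest is inferred."""

    def __init__(self, name):
        self.name = name
        self.total = 0.0
        self.count = 0

    def add(self, value, num=1):
        # accumulate a (possibly batch-averaged) value
        self.total += value * num
        self.count += num

    def get_average(self):
        return self.total / max(self.count, 1)

    def reset(self):
        self.total, self.count = 0.0, 0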
Example #4
def train(config):
    """ Build data loader """
    dsets = {}
    dsets["train"] = dataset.DataSet(config["train_loader"])
    dsets["test"] = dataset.DataSet(config["test_loader"])
    L = {}
    L["train"] = data.DataLoader( \
            dsets["train"], batch_size=config["train_loader"]["batch_size"], \
            num_workers=config["misc"]["num_workers"], \
            shuffle=True, collate_fn=dataset.collate_fn)
    L["test"] = data.DataLoader( \
            dsets["test"], batch_size=config["test_loader"]["batch_size"], \
            num_workers=config["misc"]["num_workers"], \
            shuffle=True, collate_fn=dataset.collate_fn)
    config = M.override_config_from_loader(config, dsets["train"])
    """ Build network """
    net = M(config)
    net.bring_loader_info(dsets)
    logger["train"].info(str(net))
    apply_cc_after = utils.get_value_from_dict(
        config["model"], "apply_curriculum_learning_after", -1)
    # load checkpoint if exists
    if len(config["model"]["checkpoint_path"]) > 0:
        net.load_checkpoint(config["model"]["checkpoint_path"])
        start_epoch = int(
            utils.get_filename_from_path(
                config["model"]["checkpoint_path"]).split("_")[-1])
        # apply curriculum learning if the checkpoint is already past the switch point
        if (apply_cc_after > 0) and (start_epoch >= apply_cc_after):
            net.apply_curriculum_learning()
    else:
        start_epoch = 0

    # ship network to use gpu
    if config["model"]["use_gpu"]:
        net.gpu_mode()

    # Prepare tensorboard
    net.create_tensorboard_summary(config["misc"]["tensorboard_dir"])
    """ Run training network """
    ii = 0
    tm = timer.Timer()  # tm: timer
    iter_per_epoch = dsets["train"].get_iter_per_epoch()
    min_lr = config["optimize"].get("min_lr", 0.0002)
    for epoch in range(start_epoch, config["optimize"]["num_epoch"]):
        net.train_mode()  # set network as train mode
        net.reset_status()  # initialize status
        for batch in L["train"]:
            data_load_duration = tm.get_duration()

            # maintain sample data to observe learning status
            if ii == 0:
                sample_data = dsets["train"].get_samples(5)
                """ TODO: get samples from both training/test set
                test_sample_data = dsets["test"].get_samples(5))
                """

            # Forward and update the network
            # Note that the 1st and 2nd item of outputs from forward() should be
            # loss and logits. The others would change depending on the network
            tm.reset()
            lr = utils.adjust_lr(ii + 1, iter_per_epoch, config["optimize"],
                                 min_lr)
            outputs = net.forward_update(batch, lr)
            run_duration = tm.get_duration()

            # Compute status for current batch: loss, evaluation scores, etc
            net.compute_status(outputs[1], batch[0][-1])

            # print learning status
            if (ii + 1) % config["misc"]["print_every"] == 0:
                net.print_status(epoch + 1, ii + 1)
                txt = "fetching for {:.3f}s, optimizing for {:.3f}s, lr = {:.5f}"
                logger["train"].debug(
                    txt.format(data_load_duration, run_duration, lr))
                logger["train"].info("\n")

            # visualize results
            if (config["misc"]["vis_every"] > 0) \
                    and ((ii+1) % config["misc"]["vis_every"] == 0):
                if config["misc"]["model_type"] == "ensemble":
                    net.save_results(sample_data,
                                     "iteration_{}".format(ii + 1),
                                     mode="train")

            ii += 1
            tm.reset()

            if config["misc"]["debug"]:
                if ii % 100 == 0:
                    break
            # end of batch loop (one epoch)

        # save network every epoch
        net.save_checkpoint(epoch + 1)

        # visualize results
        net.save_results(sample_data,
                         "epoch_{:03d}".format(epoch + 1),
                         mode="train")

        # print status (metric) accumulated over each epoch
        net.print_counters_info(epoch + 1, logger_name="epoch", mode="Train")

        # validate network
        if (epoch + 1) % config["evaluation"]["every_eval"] == 0:
            cmf.evaluate(config,
                         L["test"],
                         net,
                         epoch,
                         logger_name="epoch",
                         mode="Valid")

        # curriculum learning
        if (apply_cc_after >= 0) and ((epoch + 1) == apply_cc_after):
            net.apply_curriculum_learning()

        # reset reference time to compute duration of loading data
        tm.reset()
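
utils.adjust_lr() is defined outside this page; below is a plausible sketch matching the call site adjust_lr(ii + 1, iter_per_epoch, config["optimize"], min_lr), assuming a simple step-decay schedule. The config keys used here are assumptions, not taken from this page:

def adjust_lr(it, iter_per_epoch, optimize_config, min_lr=0.0):
    # hypothetical step decay: shrink the learning rate every few epochs,
    # but never go below min_lr
    base_lr = optimize_config.get("init_lr", 0.01)           # assumed key
    decay_factor = optimize_config.get("decay_factor", 0.5)  # assumed key
    decay_every = optimize_config.get("decay_every", 10)     # epochs, assumed
    epoch = it // iter_per_epoch
    lr = base_lr * (decay_factor ** (epoch // decay_every))
    return max(lr, min_lr)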
Example #5
def train(config):
    # create loggers
    it_logger = cmf.create_logger(config, "ITER", "train.log")
    eval_logger = cmf.create_logger(config, "EPOCH", "scores.log")

    """ Prepare data loader and model"""
    dsets, L = cmf.get_loader(dataset, split=["train", "test"],
                              loader_configs=[config["train_loader"], config["test_loader"]],
                              num_workers=config["misc"]["num_workers"])
    net, init_step = cmf.factory_model(config, M, dsets["train"], it_logger)

    # Prepare tensorboard
    net.create_tensorboard_summary(config["misc"]["tensorboard_dir"])

    """ Run training network """
    # load config values
    eval_every = config["evaluation"].get("every_eval", 1)  # epoch
    eval_after = config["evaluation"].get("after_eval", 0)  # epoch
    print_every = config["misc"].get("print_every", 1)  # iteration
    num_step = config["optimize"].get("num_step", 30)  # epoch
    apply_cl_after = config["model"].get("curriculum_learning_at", -1)

    vis_every = config["misc"].get("vis_every", -1)  # epoch
    """
    if vis_every > 0:
        nsamples = config["misc"].get("vis_nsamples", 12)
        vis_data = dsets["train"].get_samples(int(nsamples/2))
        vis_data.extend(dsets["test"].get_samples(int(nsamples/2)))
        vis_data = dsets["train"].collate_fn(vis_data)
        vis_inp, vis_gt = net.prepare_batch(vis_data)
        net.visualize(vis_inp, vis_gt, "epoch{:03d}".format(0))
    """
    # evaluate the initialized model (currently disabled)
    #cmf.test(config, L["test"], net, 0, eval_logger, mode="Valid")
    ii = 1
    net.train_mode() # set network as train mode
    net.reset_status() # initialize status
    tm = timer.Timer() # tm: timer
    print("=====> # of iteration per one epoch: {}".format(len(L["train"])))
    for epoch in range(init_step, init_step+num_step):
        # curriculum learning
        if (apply_cl_after > 0) and (epoch == apply_cl_after):
            net.apply_curriculum_learning()
        # training loop
        for batch in L["train"]:
            # Forward and update the network
            data_load_duration = tm.get_duration()
            tm.reset()
            net_inps, gts = net.prepare_batch(batch)
            outputs = net.forward_update(net_inps, gts)
            run_duration = tm.get_duration()

            # Compute status for current batch: loss, evaluation scores, etc
            net.compute_status(outputs["net_output"], gts)

            # print learning status
            if (print_every > 0) and (ii % print_every == 0):
                net.print_status()
                lr = net.get_lr()
                txt = "fetching for {:.3f}s, optimizing for {:.3f}s, lr = {:.5f}"
                it_logger.info(txt.format(data_load_duration, run_duration, lr))

            # for debugging
            if config["misc"]["debug"] and (ii > 2):
                cmf.test(config, L["test"], net, 0, eval_logger, mode="Valid")
                break

            tm.reset()
            ii += 1
            # iteration done

        # visualize network learning status
        #if (vis_every > 0) and (epoch % vis_every == 0):
        #    net.visualize(vis_inp, vis_gt, "epoch{:03d}".format(epoch))

        # validate current model
        if (epoch > eval_after) and (epoch % eval_every == 0):
            # print training losses
            net.save_results("epoch{:03d}".format(epoch), mode="Train")
            net.print_counters_info(eval_logger, epoch, mode="Train")

            cmf.test(config, L["test"], net, epoch, eval_logger, mode="Valid")

            net.train_mode() # set network as train mode
            net.reset_status() # initialize status
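
cmf.get_loader() is defined elsewhere; it presumably wraps the DataLoader construction written out by hand in Example #4. A minimal sketch under that assumption:

from torch.utils import data

def get_loader(dataset_module, split, loader_configs, num_workers=4):
    # hypothetical helper mirroring the manual wiring in Example #4:
    # one DataSet and one DataLoader per split
    dsets, loaders = {}, {}
    for sp, cfg in zip(split, loader_configs):
        dsets[sp] = dataset_module.DataSet(cfg)
        loaders[sp] = data.DataLoader(
            dsets[sp], batch_size=cfg["batch_size"],
            num_workers=num_workers, shuffle=(sp == "train"),
            collate_fn=dataset_module.collate_fn)
    return dsets, loaders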
Example #6
def train(config):

    # create loggers
    it_logger = cmf.create_logger(config, "ITER", "train.log")
    epoch_logger = cmf.create_logger(config, "EPOCH", "scores.log")
    """ Prepare data loader and model"""
    dsets, L = cmf.get_loader(
        dataset,
        split=["train", "test"],
        loader_configs=[config["train_loader"], config["test_loader"]],
        num_workers=config["misc"]["num_workers"])
    sample_data = dsets["train"].get_samples(1)
    net, start_epoch = cmf.factory_model(config, M, dsets["train"], it_logger)

    # Prepare tensorboard
    net.create_tensorboard_summary("./tensorboard")
    """ Run training network """
    ii = 1
    tm, epoch_tm = timer.Timer(), timer.Timer()  # tm: timer
    eval_after = config["evaluation"].get("evaluate_after", 1)  # epoch
    eval_every = config["evaluation"].get("every_eval", 1)  # epoch
    vis_every = config["misc"].get("vis_every", 1000)  # iteration
    print_every = config["misc"].get("print_every", 1)  # iteration

    # evaluate the initialized model (currently disabled)
    #cmf.test(config, L["test"], net, 0, epoch_logger, mode="Valid")
    for epoch in range(start_epoch, config["optimize"]["num_epoch"] + 1):
        net.train_mode()  # set network as train mode
        net.reset_status()  # initialize status

        for batch in L["train"]:
            # Forward and update the network
            data_load_duration = tm.get_duration()
            tm.reset()
            net_inps, gts = net.prepare_batch(batch)
            outputs = net.forward_update(net_inps, gts)
            run_duration = tm.get_duration()

            # Compute status for current batch: loss, evaluation scores, etc
            net.compute_status(outputs["net_output"][0], gts)

            # print learning status
            if ii % print_every == 0:
                net.print_status(epoch)
                lr = net_utils.adjust_lr(net.it, net.it_per_epoch,
                                         net.config["optimize"])
                txt = "fetching for {:.3f}s, optimizing for {:.3f}s, lr = {:.5f}"
                it_logger.info(txt.format(data_load_duration, run_duration,
                                          lr))

            # check results for pre-selected samples over training
            if vis_every > 0 and (ii % vis_every == 0):
                net.save_results(sample_data,
                                 "iteration_{}".format(ii),
                                 mode="Train")

            ii += 1
            tm.reset()

            if config["misc"]["debug"] and (ii > 2):
                break
            # iteration done

        # print training time for 1 epoch
        txt = "[Epoch {}] total time of training 1 epoch: {:.3f}s"
        it_logger.info(txt.format(epoch, epoch_tm.get_duration()))

        # save network every epoch
        ckpt_path = os.path.join(config["misc"]["result_dir"], "checkpoints",
                                 "epoch_{:03d}.pkl".format(epoch))
        net.save_checkpoint(ckpt_path)

        # save results (predictions, visualizations)
        # Note: save_results() should be called before print_counters_info()
        net.save_results(sample_data,
                         "epoch_{:03d}".format(epoch),
                         mode="Train")

        # print status (metric) accumulated over each epoch
        net.print_counters_info(epoch, epoch_logger, mode="Train")

        # validate network
        if (epoch >= eval_after) and (epoch % eval_every == 0):
            cmf.test(config, L["test"], net, epoch, epoch_logger, mode="Valid")

        # check curriculum learning
        net.check_apply_curriculum(epoch)

        # reset reference time to compute duration of loading data
        tm.reset()
        epoch_tm.reset()
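
save_checkpoint() receives a full file path in this example (ckpt_path above). Below is a minimal sketch of the save/load pair using standard torch serialization; the method bodies are assumptions, as only the call signatures appear on this page:

import os
import torch

def save_checkpoint(self, ckpt_path):
    # assumed body: persist the model weights at the given path
    os.makedirs(os.path.dirname(ckpt_path), exist_ok=True)
    torch.save(self.state_dict(), ckpt_path)

def load_checkpoint(self, ckpt_path):
    # assumed body: restore weights, mapping tensors to the current device
    state = torch.load(ckpt_path, map_location=self.device)
    self.load_state_dict(state)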