def __init__(self, config, logger=None, verbose=False):
    super(AbstractNetwork, self).__init__()  # Must call super __init__()

    # update configuration
    config = self.model_specific_config_update(config)

    self.optimizer, self.sample_data, self.models_to_update = None, None, None
    self.training_mode = True
    self.evaluate_after = config["evaluation"].get("evaluate_after", 1)
    self.it = 0  # it: iteration
    self.tm = timer.Timer()  # tm: timer
    self.grad_clip = config["optimize"].get("gradient_clip", 10)
    self.update_every = config["optimize"].get("update_every", 1)
    self.use_gpu = config["model"].get("use_gpu", torch.cuda.is_available())
    self.device = torch.device("cuda" if self.use_gpu else "cpu")

    if len(config["misc"]["tensorboard_dir"]) > 0:
        self.create_tensorboard_summary(config["misc"]["tensorboard_dir"])

    # save configuration for later network reproduction
    resume = config["model"].get("resume", False)
    if not resume:
        save_config_path = os.path.join(
            config["misc"]["result_dir"], "config.yml")
        io_utils.write_yaml(save_config_path, config)
    self.config = config

    # prepare logging
    if logger is not None:
        self.log = logger.info
    else:
        self.log = print
    self.log(json.dumps(config, indent=2))
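# For reference, a minimal config that satisfies every key accessed in the
# __init__() above might look like the sketch below. This is an illustrative
# assumption reconstructed from those accesses, not the project's actual
# configuration file.
EXAMPLE_CONFIG = {
    "model": {"use_gpu": True, "resume": False, "checkpoint_path": ""},
    "optimize": {"gradient_clip": 10, "update_every": 1},
    "evaluation": {"evaluate_after": 1},
    "misc": {"tensorboard_dir": "", "result_dir": "./results"},
}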
def evaluate(config, loader, net, epoch, logger_name="epoch",
             mode="Train", verbose_every=None):
    if verbose_every is None:
        verbose_every = config["evaluation"]["print_every"]

    # load logger
    if logger_name == "epoch":
        logger = io_utils.get_logger("Train")
    elif logger_name == "eval":
        logger = io_utils.get_logger("Evaluate")
    else:
        raise NotImplementedError()

    net.eval_mode()     # set network to evaluation mode
    net.reset_status()  # reset status

    """ Run network on the validation set """
    ii = 0
    tm = timer.Timer()
    for batch in loader:
        data_load_duration = tm.get_duration()

        # forward the network
        tm.reset()
        outputs = net.evaluate(batch)
        run_duration = tm.get_duration()

        # accumulate the number of correct answers
        net.compute_status(outputs[1], batch[0][-1])

        # print learning information
        if ((verbose_every > 0) and ((ii + 1) % verbose_every == 0)) \
                or config["misc"]["debug"]:
            net.print_status(epoch + 1, ii + 1, mode="eval")
            txt = "[TEST] fetching for {:.3f}s, inference for {:.3f}s\n"
            logger.debug(txt.format(data_load_duration, run_duration))

        ii += 1
        tm.reset()
        if config["misc"]["debug"] and (ii > 2):
            break
    # end for batch in loader

    # top-1 average accuracy; used for hyper-parameter tuning
    net.metric = net.counters["top1-avg"].get_average()
    net.print_counters_info(epoch + 1, logger_name=logger_name, mode=mode)
    net.save_results(None, "epoch_{:03d}".format(epoch + 1), mode="eval")
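# evaluate() reads net.counters["top1-avg"].get_average(). The counter class
# itself is not shown in this excerpt; a minimal running-average accumulator
# consistent with that call — an illustrative assumption, not the repo's
# actual implementation — could look like:
class AverageCounter(object):
    def __init__(self):
        self.sum, self.count = 0.0, 0

    def add(self, value, n=1):
        # accumulate a value (e.g., per-batch top-1 accuracy)
        self.sum += value * n
        self.count += n

    def get_average(self):
        # mean over everything accumulated since construction
        return self.sum / max(self.count, 1)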
def __init__(self):
    super(VirtualNetwork, self).__init__()  # Must call super __init__()

    self.models_to_update = None
    self.sample_data = None
    self.optimizer = None
    self.training_mode = True
    self.is_main_net = True
    self.counters = None
    self.status = None
    self.use_tf_summary = False
    self.it = 0  # it: iteration
    self.update_every = 1
    self.debug_mode = False
    self.qsts = None

    self._create_counters()
    self._get_loggers()
    self.reset_status(init_reset=True)
    self.tm = timer.Timer()  # tm: timer
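# Every function in this file leans on timer.Timer() with a reset() /
# get_duration() pair. A minimal sketch of that interface — a plain wall-clock
# stopwatch, reconstructed from the call sites rather than taken from the
# repo — might be:
import time

class Timer(object):
    def __init__(self):
        self.reset()

    def reset(self):
        # mark the current time as the new reference point
        self._start = time.time()

    def get_duration(self):
        # seconds elapsed since the last reset()
        return time.time() - self._start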
def train(config):
    """ Build data loader """
    dsets = {}
    dsets["train"] = dataset.DataSet(config["train_loader"])
    dsets["test"] = dataset.DataSet(config["test_loader"])
    L = {}
    L["train"] = data.DataLoader(
        dsets["train"],
        batch_size=config["train_loader"]["batch_size"],
        num_workers=config["misc"]["num_workers"],
        shuffle=True, collate_fn=dataset.collate_fn)
    L["test"] = data.DataLoader(
        dsets["test"],
        batch_size=config["test_loader"]["batch_size"],
        num_workers=config["misc"]["num_workers"],
        shuffle=True, collate_fn=dataset.collate_fn)
    config = M.override_config_from_loader(config, dsets["train"])

    """ Build network """
    net = M(config)
    net.bring_loader_info(dsets)
    logger["train"].info(str(net))
    apply_cc_after = utils.get_value_from_dict(
        config["model"], "apply_curriculum_learning_after", -1)
    # load checkpoint if it exists
    if len(config["model"]["checkpoint_path"]) > 0:
        net.load_checkpoint(config["model"]["checkpoint_path"])
        start_epoch = int(utils.get_filename_from_path(
            config["model"]["checkpoint_path"]).split("_")[-1])
        # if the checkpoint was saved after curriculum learning started,
        # re-apply curriculum learning
        if (apply_cc_after > 0) and (start_epoch >= apply_cc_after):
            net.apply_curriculum_learning()
    else:
        start_epoch = 0

    # ship network to GPU if requested
    if config["model"]["use_gpu"]:
        net.gpu_mode()

    # Prepare tensorboard
    net.create_tensorboard_summary(config["misc"]["tensorboard_dir"])

    """ Run training network """
    ii = 0
    tm = timer.Timer()  # tm: timer
    iter_per_epoch = dsets["train"].get_iter_per_epoch()
    min_lr = config["optimize"].get("min_lr", 0.0002)
    for epoch in range(start_epoch, config["optimize"]["num_epoch"]):
        net.train_mode()    # set network to train mode
        net.reset_status()  # initialize status
        for batch in L["train"]:
            data_load_duration = tm.get_duration()

            # keep sample data to observe learning status
            if ii == 0:
                sample_data = dsets["train"].get_samples(5)
                """ TODO: get samples from both training/test set
                test_sample_data = dsets["test"].get_samples(5)
                """

            # Forward and update the network.
            # Note that the 1st and 2nd items of the outputs from forward()
            # should be loss and logits; the others may vary by network.
            tm.reset()
            lr = utils.adjust_lr(ii + 1, iter_per_epoch,
                                 config["optimize"], min_lr)
            outputs = net.forward_update(batch, lr)
            run_duration = tm.get_duration()

            # Compute status for current batch: loss, evaluation scores, etc.
            net.compute_status(outputs[1], batch[0][-1])

            # print learning status
            if (ii + 1) % config["misc"]["print_every"] == 0:
                net.print_status(epoch + 1, ii + 1)
                txt = "fetching for {:.3f}s, optimizing for {:.3f}s, lr = {:.5f}"
                logger["train"].debug(
                    txt.format(data_load_duration, run_duration, lr))
                logger["train"].info("\n")

            # visualize results
            if (config["misc"]["vis_every"] > 0) \
                    and ((ii + 1) % config["misc"]["vis_every"] == 0):
                if config["misc"]["model_type"] == "ensemble":
                    net.save_results(sample_data,
                                     "iteration_{}".format(ii + 1),
                                     mode="train")

            ii += 1
            tm.reset()
            if config["misc"]["debug"]:
                if ii % 100 == 0:
                    break
        # epoch done

        # save network every epoch
        net.save_checkpoint(epoch + 1)

        # visualize results
        net.save_results(sample_data,
                         "epoch_{:03d}".format(epoch + 1), mode="train")

        # print status (metric) accumulated over the epoch
        net.print_counters_info(epoch + 1, logger_name="epoch", mode="Train")

        # validate network
        if (epoch + 1) % config["evaluation"]["every_eval"] == 0:
            cmf.evaluate(config, L["test"], net, epoch,
                         logger_name="epoch", mode="Valid")

        # curriculum learning
        if (apply_cc_after >= 0) and ((epoch + 1) == apply_cc_after):
            net.apply_curriculum_learning()

        # reset reference time to compute duration of loading data
        tm.reset()
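# The schedule behind utils.adjust_lr() is not included in this excerpt. A
# minimal sketch consistent with the call site above — assuming exponential
# step decay with a floor at min_lr, and hypothetical config keys "init_lr",
# "decay_factor", and "decay_every" (in epochs) — could be:
def adjust_lr(it, iter_per_epoch, optimize_config, min_lr=0.0002):
    init_lr = optimize_config.get("init_lr", 0.01)
    decay_factor = optimize_config.get("decay_factor", 0.5)
    decay_every = optimize_config.get("decay_every", 10)  # in epochs
    # decay once every `decay_every` epochs, never dropping below min_lr
    num_decays = it // (decay_every * iter_per_epoch)
    return max(init_lr * (decay_factor ** num_decays), min_lr)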
def train(config):
    # create loggers
    it_logger = cmf.create_logger(config, "ITER", "train.log")
    eval_logger = cmf.create_logger(config, "EPOCH", "scores.log")

    """ Prepare data loader and model """
    dsets, L = cmf.get_loader(
        dataset, split=["train", "test"],
        loader_configs=[config["train_loader"], config["test_loader"]],
        num_workers=config["misc"]["num_workers"])
    net, init_step = cmf.factory_model(config, M, dsets["train"], it_logger)

    # Prepare tensorboard
    net.create_tensorboard_summary(config["misc"]["tensorboard_dir"])

    """ Run training network """
    # load config values
    eval_every = config["evaluation"].get("every_eval", 1)   # epoch
    eval_after = config["evaluation"].get("after_eval", 0)   # epoch
    print_every = config["misc"].get("print_every", 1)       # iteration
    num_step = config["optimize"].get("num_step", 30)        # epoch
    apply_cl_after = config["model"].get("curriculum_learning_at", -1)
    vis_every = config["misc"].get("vis_every", -1)          # epoch
    """
    if vis_every > 0:
        nsamples = config["misc"].get("vis_nsamples", 12)
        vis_data = dsets["train"].get_samples(int(nsamples / 2))
        vis_data.extend(dsets["test"].get_samples(int(nsamples / 2)))
        vis_data = dsets["train"].collate_fn(vis_data)
        vis_inp, vis_gt = net.prepare_batch(vis_data)
        net.visualize(vis_inp, vis_gt, "epoch{:03d}".format(0))
    """

    # We evaluate the initialized model
    # cmf.test(config, L["test"], net, 0, eval_logger, mode="Valid")
    ii = 1
    net.train_mode()    # set network to train mode
    net.reset_status()  # initialize status
    tm = timer.Timer()  # tm: timer
    print("=====> # of iterations per epoch: {}".format(len(L["train"])))
    for epoch in range(init_step, init_step + num_step):
        # curriculum learning
        if (apply_cl_after > 0) and (epoch == apply_cl_after):
            net.apply_curriculum_learning()

        # training loop
        for batch in L["train"]:
            # Forward and update the network
            data_load_duration = tm.get_duration()
            tm.reset()
            net_inps, gts = net.prepare_batch(batch)
            outputs = net.forward_update(net_inps, gts)
            run_duration = tm.get_duration()

            # Compute status for current batch: loss, evaluation scores, etc.
            net.compute_status(outputs["net_output"], gts)

            # print learning status
            if (print_every > 0) and (ii % print_every == 0):
                net.print_status()
                lr = net.get_lr()
                txt = "fetching for {:.3f}s, optimizing for {:.3f}s, lr = {:.5f}"
                it_logger.info(txt.format(data_load_duration, run_duration, lr))

            # for debugging
            if config["misc"]["debug"] and (ii > 2):
                cmf.test(config, L["test"], net, 0, eval_logger, mode="Valid")
                break

            tm.reset()
            ii += 1
        # iteration done

        # visualize network learning status
        # if (vis_every > 0) and (epoch % vis_every == 0):
        #     net.visualize(vis_inp, vis_gt, "epoch{:03d}".format(epoch))

        # validate current model
        if (epoch > eval_after) and (epoch % eval_every == 0):
            # print training losses
            net.save_results("epoch{:03d}".format(epoch), mode="Train")
            net.print_counters_info(eval_logger, epoch, mode="Train")
            cmf.test(config, L["test"], net, epoch, eval_logger, mode="Valid")
            net.train_mode()    # set network back to train mode
            net.reset_status()  # initialize status
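# cmf.get_loader() is a repo helper whose body is not shown here. Judging from
# its call site and from the DataLoader construction in the first train()
# above, a minimal sketch — with the shuffle policy treated as an assumption —
# might be:
from torch.utils import data

def get_loader(dataset_module, split, loader_configs, num_workers=4):
    dsets, loaders = {}, {}
    for name, loader_config in zip(split, loader_configs):
        dsets[name] = dataset_module.DataSet(loader_config)
        loaders[name] = data.DataLoader(
            dsets[name],
            batch_size=loader_config["batch_size"],
            num_workers=num_workers,
            shuffle=(name == "train"),  # assumed: shuffle training split only
            collate_fn=dataset_module.collate_fn)
    return dsets, loaders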
def train(config):
    # create loggers
    it_logger = cmf.create_logger(config, "ITER", "train.log")
    epoch_logger = cmf.create_logger(config, "EPOCH", "scores.log")

    """ Prepare data loader and model """
    dsets, L = cmf.get_loader(
        dataset, split=["train", "test"],
        loader_configs=[config["train_loader"], config["test_loader"]],
        num_workers=config["misc"]["num_workers"])
    sample_data = dsets["train"].get_samples(1)
    net, start_epoch = cmf.factory_model(config, M, dsets["train"], it_logger)

    # Prepare tensorboard
    net.create_tensorboard_summary("./tensorboard")

    """ Run training network """
    ii = 1
    tm, epoch_tm = timer.Timer(), timer.Timer()  # tm: timer
    eval_after = config["evaluation"].get("evaluate_after", 1)  # epoch
    eval_every = config["evaluation"].get("every_eval", 1)      # epoch
    vis_every = config["misc"].get("vis_every", 1000)           # iteration
    print_every = config["misc"].get("print_every", 1)          # iteration

    # We evaluate the initialized model
    # cmf.test(config, L["test"], net, 0, epoch_logger, mode="Valid")
    for epoch in range(start_epoch, config["optimize"]["num_epoch"] + 1):
        net.train_mode()    # set network to train mode
        net.reset_status()  # initialize status
        for batch in L["train"]:
            # Forward and update the network
            data_load_duration = tm.get_duration()
            tm.reset()
            net_inps, gts = net.prepare_batch(batch)
            outputs = net.forward_update(net_inps, gts)
            run_duration = tm.get_duration()

            # Compute status for current batch: loss, evaluation scores, etc.
            net.compute_status(outputs["net_output"][0], gts)

            # print learning status
            if ii % print_every == 0:
                net.print_status(epoch)
                lr = net_utils.adjust_lr(
                    net.it, net.it_per_epoch, net.config["optimize"])
                txt = "fetching for {:.3f}s, optimizing for {:.3f}s, lr = {:.5f}"
                it_logger.info(txt.format(data_load_duration, run_duration, lr))

            # check results for pre-selected samples over training
            if vis_every > 0 and (ii % vis_every == 0):
                net.save_results(sample_data,
                                 "iteration_{}".format(ii), mode="Train")

            ii += 1
            tm.reset()
            if config["misc"]["debug"] and (ii > 2):
                break
        # iteration done

        # print training time for 1 epoch
        txt = "[Epoch {}] total time of training 1 epoch: {:.3f}s"
        it_logger.info(txt.format(epoch, epoch_tm.get_duration()))

        # save network every epoch
        ckpt_path = os.path.join(config["misc"]["result_dir"],
                                 "checkpoints",
                                 "epoch_{:03d}.pkl".format(epoch))
        net.save_checkpoint(ckpt_path)

        # save results (predictions, visualizations)
        # Note: save_results() should be called before print_counters_info()
        net.save_results(sample_data, "epoch_{:03d}".format(epoch), mode="Train")

        # print status (metric) accumulated over each epoch
        net.print_counters_info(epoch, epoch_logger, mode="Train")

        # validate network
        if (epoch >= eval_after) and (epoch % eval_every == 0):
            cmf.test(config, L["test"], net, epoch, epoch_logger, mode="Valid")

        # check curriculum learning
        net.check_apply_curriculum(epoch)

        # reset reference time to compute duration of loading data
        tm.reset()
        epoch_tm.reset()
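# A hedged sketch of how one of these train() functions might be launched.
# The repo's actual entry point and CLI handling are not shown in this
# excerpt; the config filename mirrors the write_yaml("config.yml", ...) call
# in AbstractNetwork.__init__() and should be treated as an assumption.
import yaml

if __name__ == "__main__":
    with open("config.yml") as f:
        config = yaml.safe_load(f)
    train(config)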