class CometExperimentLogger(ExperimentLogger):
    """Experiment logger that forwards every logging call to Comet.

    The generic ``ExperimentLogger`` interface is mapped one-to-one onto the
    methods of a Comet experiment object stored in ``self.comet``.
    """

    def __init__(self, exp_name, online=True, **kwargs):
        """Create the base logger and the backing Comet experiment.

        Args:
            exp_name: Given to the base class and used as the Comet
                ``project_name``.
            online: When truthy an ``Experiment`` is created, otherwise an
                ``OfflineExperiment``.
            **kwargs: Forwarded both to the base class constructor and to the
                Comet experiment constructor.
        """
        super(CometExperimentLogger, self).__init__(exp_name, **kwargs)
        experiment_cls = Experiment if online else OfflineExperiment
        self.comet = experiment_cls(project_name=exp_name, **kwargs)

    def log_metric(self, tag, value, step, **kwargs):
        """Log the scalar ``value`` under the name ``tag`` at ``step``."""
        self.comet.log_metric(tag, value, step=step, **kwargs)

    def log_image(self, tag, img, step, **kwargs):
        """Log ``img`` with ``tag`` as its name at ``step``."""
        self.comet.log_image(img, name=tag, step=step, **kwargs)

    def log_plt(self, tag, plt, step, **kwargs):
        """Log the figure ``plt`` with ``tag`` as the figure name at ``step``."""
        self.comet.log_figure(figure=plt, figure_name=tag, step=step, **kwargs)

    def log_text(self, tag, text, **kwargs):
        """Log ``text``.

        NOTE(review): ``tag`` is accepted but not forwarded to Comet.
        """
        self.comet.log_text(text, **kwargs)

    def log_parameters(self, params, **kwargs):
        """Log the ``params`` mapping as experiment parameters."""
        self.comet.log_parameters(params, **kwargs)

    def start_epoch(self, **kwargs):
        """Delegate to the base class; ``kwargs`` are accepted but unused."""
        super(CometExperimentLogger, self).start_epoch()

    def end_epoch(self, **kwargs):
        """Run the base-class epoch end, then report ``self.epoch`` to Comet."""
        super(CometExperimentLogger, self).end_epoch()
        self.comet.log_epoch_end(self.epoch, **kwargs)

    def end_experiment(self):
        """End the Comet experiment."""
        self.comet.end()
verbose = 10, n_jobs = 2, n_points = 2, scoring = 'accuracy', ) checkpoint_callback = skopt.callbacks.CheckpointSaver(f'D:\\FINKI\\8_dps\\Project\\MODELS\\skopt_checkpoints\\{EXPERIMENT_ID}.pkl') hyperparameters_optimizer.fit(X_train, y_train, callback = [checkpoint_callback]) skopt.dump(hyperparameters_optimizer, f'saved_models\\{EXPERIMENT_ID}.pkl') y_pred = hyperparameters_optimizer.best_estimator_.predict(X_test) for i in range(len(hyperparameters_optimizer.cv_results_['params'])): exp = OfflineExperiment( api_key = 'A8Lg71j9LtIrsv0deBA0DVGcR', project_name = ALGORITHM, workspace = "8_dps", auto_output_logging = 'native', offline_directory = f'D:\\FINKI\\8_dps\\Project\\MODELS\\comet_ml_offline_experiments\\{EXPERIMENT_ID}' ) exp.set_name(f'{EXPERIMENT_ID}_{i + 1}') exp.add_tags([DS, SEGMENTS_LENGTH, ]) for k, v in hyperparameters_optimizer.cv_results_.items(): if k == "params": exp.log_parameters(dict(v[i])) else: exp.log_metric(k, v[i]) exp.end()
class Logger:
    """Track per-round throughput statistics and optionally send them to Comet.

    Local statistics (``sent_mb``, ``current_speed``, ``stations``) are always
    updated; calls on the Comet experiment are made only when ``send_logs`` is
    true.
    """

    def __init__(self, send_logs, tags, parameters, experiment=None):
        """Set up local state and, if logging is enabled, the experiment.

        Args:
            send_logs: When true, metrics are sent to the Comet experiment.
            tags: Optional tags added to the experiment (ignored if ``None``).
            parameters: Optional parameters logged once at start-up (ignored
                if ``None``).
            experiment: Existing experiment object to use. When ``None`` and
                ``send_logs`` is true, an ``OfflineExperiment`` is built from
                the first ``comet_token.json`` matched by the glob below.

        Raises:
            FileNotFoundError: If an experiment has to be created but no
                ``comet_token.json`` is found.
        """
        self.stations = 5
        self.send_logs = send_logs
        # Always define the attribute so a disabled logger never trips over a
        # missing ``self.experiment``.
        self.experiment = experiment

        if self.send_logs and self.experiment is None:
            token_files = glob.glob("./**/comet_token.json")
            if not token_files:
                raise FileNotFoundError(
                    "No comet_token.json found matching './**/comet_token.json'")
            with open(token_files[0], "r") as f:
                kwargs = json.load(f)
            self.experiment = OfflineExperiment(**kwargs)

        self.sent_mb = 0
        self.speed_window = deque(maxlen=100)
        self.step_time = None
        self.current_speed = 0
        self.last_speed = 0

        if self.send_logs:
            if tags is not None:
                self.experiment.add_tags(tags)
            if parameters is not None:
                self.experiment.log_parameters(parameters)

    def begin_logging(self, episode_count, steps_per_ep, sigma, theta, step_time):
        """Store ``step_time`` and log the run-level parameters if enabled."""
        self.step_time = step_time
        if self.send_logs:
            self.experiment.log_parameter("Episode count", episode_count)
            self.experiment.log_parameter("Steps per episode", steps_per_ep)
            self.experiment.log_parameter("theta", theta)
            self.experiment.log_parameter("sigma", sigma)

    def log_round(self, states, reward, cumulative_reward, info, loss, observations, step):
        """Update running statistics for one round and log them if enabled.

        Args:
            states: Logged as a 3D histogram named "Observations".
            reward: Round rewards; their mean is logged.
            cumulative_reward: Per-episode rewards; their mean is logged.
            info: Iterable of ``"|"``-separated numeric strings. After
                averaging over entries, fields 0-4 are read as round megabytes,
                legacy CW, 802.11ax CW, station count and fairness index.
            loss: Mapping of metrics passed to ``log_metrics``.
            observations: Iterable of values logged as ``Observation <i>``.
            step: Step index attached to every logged value.
        """
        if self.send_logs:
            self.experiment.log_histogram_3d(states, name="Observations", step=step)

        parsed = [entry.split("|") for entry in info]
        info = np.mean(np.array(parsed, dtype=np.float32), axis=0)

        try:
            round_mb = info[0]
        except Exception:
            # Dump the offending inputs before propagating the error.
            print(info)
            print(reward)
            raise

        self.speed_window.append(round_mb)
        self.current_speed = np.mean(np.asarray(self.speed_window) / self.step_time)
        self.sent_mb += round_mb

        CW = info[1]
        CW_ax = info[2]
        self.stations = info[3]
        fairness = info[4]

        if self.send_logs:
            self.experiment.log_metric("Round reward", np.mean(reward), step=step)
            self.experiment.log_metric("Per-ep reward", np.mean(cumulative_reward), step=step)
            self.experiment.log_metric("Megabytes sent", self.sent_mb, step=step)
            self.experiment.log_metric("Round megabytes sent", round_mb, step=step)
            self.experiment.log_metric("Chosen CW for legacy devices", CW, step=step)
            self.experiment.log_metric("Chosen CW for 802.11ax devices", CW_ax, step=step)
            self.experiment.log_metric("Station count", self.stations, step=step)
            self.experiment.log_metric("Current throughput", self.current_speed, step=step)
            self.experiment.log_metric("Fairness index", fairness, step=step)

            for i, obs in enumerate(observations):
                self.experiment.log_metric(f"Observation {i}", obs, step=step)

            self.experiment.log_metrics(loss, step=step)

    def log_episode(self, cumulative_reward, speed, step):
        """Log episode totals if enabled and reset the per-episode counters."""
        if self.send_logs:
            self.experiment.log_metric("Cumulative reward", cumulative_reward, step=step)
            self.experiment.log_metric("Speed", speed, step=step)

        self.sent_mb = 0
        self.last_speed = speed
        self.speed_window = deque(maxlen=100)
        self.current_speed = 0

    def end(self):
        """End the experiment when logging is enabled."""
        if self.send_logs:
            self.experiment.end()
) # HTML write_html( output_directory + "/index.html", iterations + 1, config["image_save_iter"], "images", comet_exp=comet_exp, ) if (iterations + 1) % config["image_display_iter"] == 0: with torch.no_grad(): image_outputs = trainer.sample(train_display_images_a, train_display_images_b) write_2images(image_outputs, display_size, image_directory, "train_current") # Save network weights if (iterations + 1) % config["snapshot_save_iter"] == 0: trainer.save(checkpoint_directory, iterations) iterations += 1 if iterations >= max_iter: sys.exit("Finish training") comet_exp.end() subprocess.check_output( "python -m comet_ml.scripts.upload {}".format( str(Path(opts.output_path) / (comet_exp.id + ".zip"))), shell=True, )
class CometWriter:
    """Thin writer around a Comet experiment (online or offline).

    NOTE(review): ``self.mode`` is used for two things. ``__init__`` stores
    the connection mode ("online"/"offline") in it, and ``set_step`` later
    overwrites it with the phase name (default ``'train'``) that ``add_scalar``
    appends to every metric name.
    """

    def __init__(self,
                 logger,
                 project_name: Optional[str] = None,
                 experiment_name: Optional[str] = None,
                 api_key: Optional[str] = None,
                 log_dir: Optional[str] = None,
                 offline: bool = False,
                 **kwargs):
        """Create the underlying Comet experiment.

        Args:
            logger: Object with a ``warning`` method, used to report a
                misconfiguration before raising.
            project_name: Comet project name.
            experiment_name: If truthy, set as the experiment name.
            api_key: Comet API key; enables online mode.
            log_dir: Directory for offline experiments; enables offline mode.
            offline: When both ``api_key`` and ``log_dir`` are given, selects
                offline mode if true and online mode otherwise.
            **kwargs: Forwarded to the Comet experiment constructor.

        Raises:
            ImportError: If ``comet_ml`` is not available.
            ValueError: If neither ``api_key`` nor ``log_dir`` is given.
        """
        if not _COMET_AVAILABLE:
            raise ImportError(
                "You want to use `comet_ml` logger which is not installed yet,"
                " install it with `pip install comet-ml`.")

        self.project_name = project_name
        self.experiment_name = experiment_name
        self.kwargs = kwargs
        self.timer = Timer()
        # Defined up front so add_scalar works before the first set_step call.
        self.step = None
        self.epoch = None

        if (api_key is not None) and (log_dir is not None):
            self.mode = "offline" if offline else "online"
            self.api_key = api_key
            self.log_dir = log_dir
        elif api_key is not None:
            self.mode = "online"
            self.api_key = api_key
            self.log_dir = None
        elif log_dir is not None:
            self.mode = "offline"
            self.api_key = None
            self.log_dir = log_dir
        else:
            message = ("CometWriter requires either api_key or log_dir "
                       "during initialization.")
            logger.warning(message)
            # Previously execution continued and failed on the unset
            # ``self.mode``; fail with an explicit error instead.
            raise ValueError(message)

        if self.mode == "online":
            self.experiment = CometExperiment(
                api_key=self.api_key,
                project_name=self.project_name,
                **self.kwargs,
            )
        else:
            self.experiment = CometOfflineExperiment(
                offline_directory=self.log_dir,
                project_name=self.project_name,
                **self.kwargs,
            )

        if self.experiment_name:
            self.experiment.set_name(self.experiment_name)

    def set_step(self, step, epoch=None, mode='train') -> None:
        """Record the current step/epoch/phase and log the step rate.

        At ``step == 0`` the timer is reset; at any other step the duration
        returned by the timer is converted to ``steps_per_sec`` and logged.
        """
        self.mode = mode
        self.step = step
        self.epoch = epoch
        if step == 0:
            self.timer.reset()
        else:
            duration = self.timer.check()
            # A zero duration would divide by zero; skip the rate in that case.
            if duration:
                self.add_scalar({'steps_per_sec': 1 / duration})

    def log_hyperparams(self, params: Dict[str, Any]) -> None:
        """Log ``params`` as experiment parameters."""
        self.experiment.log_parameters(params)

    def log_code(self, file_name=None, folder='models/') -> None:
        """Log source code from ``file_name`` and/or ``folder``."""
        self.experiment.log_code(file_name=file_name, folder=folder)

    def add_scalar(self,
                   metrics: Dict[str, Union[torch.Tensor, float]],
                   step: Optional[int] = None,
                   epoch: Optional[int] = None) -> None:
        """Log ``metrics`` with each key renamed to ``"<key>/<self.mode>"``.

        Args:
            metrics: Mapping of names to floats or tensors; tensors are moved
                to CPU and detached before logging.
            step: Step to log at. Defaults to the step stored by ``set_step``
                when ``epoch`` is not given.
            epoch: Epoch to log at. When omitted, the epoch stored by
                ``set_step`` is used.
        """
        metrics_renamed = {}
        for key, val in metrics.items():
            tag = '{}/{}'.format(key, self.mode)
            metrics_renamed[tag] = val.cpu().detach() if is_tensor(val) else val

        if epoch is None:
            effective_step = self.step if step is None else step
            self.experiment.log_metrics(metrics_renamed,
                                        step=effective_step,
                                        epoch=self.epoch)
        elif step is None:
            self.experiment.log_metrics(metrics_renamed, epoch=epoch)
        else:
            self.experiment.log_metrics(metrics_renamed, step=step, epoch=epoch)

    def add_plot(self, figure_name, figure):
        """Log ``figure`` under ``figure_name`` (primarily for gate plots)."""
        self.experiment.log_figure(figure_name=figure_name, figure=figure)

    def add_hist3d(self, hist, name):
        """Log ``hist`` as a 3D histogram called ``name``."""
        self.experiment.log_histogram_3d(hist, name=name)

    def reset_experiment(self):
        """Drop the reference to the experiment."""
        self.experiment = None

    def finalize(self) -> None:
        """End the experiment and drop the reference to it."""
        self.experiment.end()
        self.reset_experiment()
class CometLogger(Logger):
    """Logger that reports training statistics and snapshots to Comet."""

    def __init__(
        self,
        batch_size: int,
        snapshot_dir: Optional[str] = None,
        snapshot_mode: str = "last",
        snapshot_gap: int = 1,
        exp_set: Optional[str] = None,
        use_print_exp: bool = False,
        saved_exp: Optional[str] = None,
        **kwargs,
    ):
        """
        :param batch_size: forwarded to the base logger; ``log`` multiplies
            the step by it.
        :param snapshot_dir: base snapshot directory; the experiment key is
            appended as a sub-directory when given.
        :param snapshot_mode: forwarded to the base logger.
        :param snapshot_gap: forwarded to the base logger.
        :param exp_set: if truthy, logged as the ``exp_set`` parameter.
        :param use_print_exp: use a ``PrintExperiment`` instead of Comet.
        :param saved_exp: key of a previous experiment to resume.
        :param kwargs: passed to comet's Experiment at init.
        """
        if use_print_exp:
            self.experiment = PrintExperiment()
        else:
            from comet_ml import Experiment, ExistingExperiment, OfflineExperiment

            if saved_exp:
                self.experiment = ExistingExperiment(
                    previous_experiment=saved_exp, **kwargs
                )
            else:
                try:
                    self.experiment = Experiment(**kwargs)
                except ValueError:  # no API key
                    # Fall back to an offline experiment under ~/logs.
                    log_dir = Path.home() / "logs"
                    log_dir.mkdir(parents=True, exist_ok=True)
                    self.experiment = OfflineExperiment(offline_directory=str(log_dir))

        self.experiment.log_parameter("complete", False)
        if exp_set:
            self.experiment.log_parameter("exp_set", exp_set)
        if snapshot_dir:
            snapshot_dir = Path(snapshot_dir) / self.experiment.get_key()

        # log_traj_window (int): How many trajectories to hold in deque for
        # computing performance statistics.
        self.log_traj_window = 100
        # Counters accumulated over the whole run and logged as-is.
        self._cum_metrics = {
            "n_unsafe_actions": 0,
            "constraint_used": 0,
            "cum_completed_trajs": 0,
            "logging_time": 0,
        }
        self._new_completed_trajs = 0
        self._last_step = 0
        self._start_time = self._last_time = time()
        self._last_snapshot_upload = 0
        # Minimum number of seconds between two snapshot uploads.
        self._snaphot_upload_time = 30 * 60
        super().__init__(batch_size, snapshot_dir, snapshot_mode, snapshot_gap)

    def log_fast(
        self,
        step: int,
        traj_infos: Sequence[Dict[str, float]],
        opt_info: Optional[Tuple[Sequence[float], ...]] = None,
        test: bool = False,
    ) -> None:
        """Update the cumulative counters without sending anything to Comet.

        Does nothing when ``traj_infos`` is empty.
        """
        if not traj_infos:
            return
        start = time()

        self._new_completed_trajs += len(traj_infos)
        self._cum_metrics["cum_completed_trajs"] += len(traj_infos)

        # TODO: do we need to support sum(t[k]) if key in k?
        # without that, this doesn't include anything from extra eval samplers
        for key in self._cum_metrics:
            if key == "cum_completed_trajs":
                continue
            self._cum_metrics[key] += sum(t.get(key, 0) for t in traj_infos)
        self._cum_metrics["logging_time"] += time() - start

    def log(
        self,
        step: int,
        traj_infos: Sequence[Dict[str, float]],
        opt_info: Optional[Tuple[Sequence[float], ...]] = None,
        test: bool = False,
    ):
        """Update the counters and send the aggregated metrics to Comet.

        :param step: iteration index; multiplied by ``self.batch_size`` before
            logging.
        :param traj_infos: per-trajectory metric dicts; non-cumulative keys are
            averaged over trajectories.
        :param opt_info: object exposing ``_asdict()`` whose values are
            averaged and logged, except ``gradNorm``.
        :param test: log inside the experiment's ``test()`` context and skip
            the throughput metrics.
        """
        self.log_fast(step, traj_infos, opt_info, test)
        start = time()

        with (self.experiment.test() if test else nullcontext()):
            step *= self.batch_size

            if opt_info is not None:
                # grad norm is left on the GPU for some reason
                # https://github.com/astooke/rlpyt/issues/163
                self.experiment.log_metrics(
                    {
                        k: np.mean(v)
                        for k, v in opt_info._asdict().items()
                        if k != "gradNorm"
                    },
                    step=step,
                )

            if traj_infos:
                agg_vals = {}
                for key in traj_infos[0].keys():
                    if key in self._cum_metrics:
                        continue
                    agg_vals[key] = sum(t[key] for t in traj_infos) / len(traj_infos)
                self.experiment.log_metrics(agg_vals, step=step)

            if not test:
                now = time()
                elapsed = now - self._last_time
                # Two clock reads can coincide on coarse timers; report 0
                # rather than dividing by zero.
                steps_per_second = (
                    (step - self._last_step) / elapsed if elapsed > 0 else 0.0
                )
                self.experiment.log_metrics(
                    {
                        "new_completed_trajs": self._new_completed_trajs,
                        "steps_per_second": steps_per_second,
                    },
                    step=step,
                )
                self._last_time = now
                self._last_step = step
                self._new_completed_trajs = 0

            self.experiment.log_metrics(self._cum_metrics, step=step)
        self._cum_metrics["logging_time"] += time() - start

    def log_metric(self, name, val):
        """Log a single metric."""
        self.experiment.log_metric(name, val)

    def log_parameters(self, parameters):
        """Log a mapping of parameters."""
        self.experiment.log_parameters(parameters)

    def log_config(self, config):
        """Log ``config`` as one JSON-encoded ``config`` parameter."""
        self.experiment.log_parameter("config", json.dumps(convert_dict(config)))

    def upload_snapshot(self):
        """Upload the most recent snapshot file as an asset.

        Only runs when a snapshot directory is configured.
        NOTE(review): ``_previous_snapshot_fname`` is presumably set by the
        base class when a snapshot is saved -- confirm it cannot be unset here.
        """
        if self.snapshot_dir:
            self.experiment.log_asset(self._previous_snapshot_fname)

    def save_itr_params(
        self, step: int, params: Dict[str, Any], metric: Optional[float] = None
    ) -> None:
        """Save a snapshot via the base class and upload it at most once per
        ``_snaphot_upload_time`` seconds."""
        super().save_itr_params(step, params, metric)
        now = time()
        if now - self._last_snapshot_upload > self._snaphot_upload_time:
            self._last_snapshot_upload = now
            self.upload_snapshot()

    def shutdown(self, error: bool = False) -> None:
        """End the experiment; on a clean exit also upload the last snapshot
        and mark the run as complete."""
        if not error:
            self.upload_snapshot()
            self.experiment.log_parameter("complete", True)
        self.experiment.end()
def run_experiment_iter(i, experiment, train_iter, nExp, agent_list, env,
                        video, user_seed, experiment_name, log_params, debug,
                        project_name, sps, sps_es, **kwargs):
    """
    Train a single agent; used to parallelize the run_experiment
    calculations.

    Parameters
    ----------
    i : int
        Index of the agent being trained (position in ``agent_list``).
    experiment : object
        Experiment handle passed on to the training routine and ended
        afterwards when truthy.
    user_seed : int
        Seed to use; when falsy, a random 32-bit seed is drawn from the OS.
    experiment_name : str
        When truthy, Comet usage is requested and the function raises (see
        below).
    sps, sps_es : bool
        Select the training routine; ``sps_es`` takes precedence over ``sps``.

    Raises
    ------
    NotImplementedError
        In case Comet is used, raises this error to signal where user
        intervention is required (namely to set the api_key and the
        workspace).

    Returns
    -------
    rewards : array
        An array with the cumulative rewards, where each column corresponds
        to an agent (random seed), and each row to a training iteration.
    arms : array
        An array with the number of agent arms, where each column corresponds
        to an agent (random seed), and each row to a training iteration.
    agent : Agent
        The trained agent.
    """
    if debug:
        start = time.time()
        print(f"Experiment {i + 1} out of {nExp}...")

    seed = user_seed if user_seed else int.from_bytes(os.urandom(4), 'big')

    if experiment_name:
        raise NotImplementedError(
            "Before using Comet, you need to come here and set your API key")
        # Unreachable until the raise above is removed and the key is set.
        experiment = Experiment(api_key=None,
                                project_name=project_name,
                                workspace=None,
                                display_summary=False,
                                offline_directory="offline")
        experiment.add_tag(experiment_name)
        experiment.set_name(f"{experiment_name}_{i}")
        # Sometimes adding the tag fails
        log_params["experiment_tag"] = experiment_name
        experiment.log_parameters(log_params)

    agent = agent_list[i]

    # sps_es overrides sps.
    if sps_es:
        runner = run_sps_es_experiment
    elif sps:
        runner = run_sps_experiment
    else:
        runner = run_aql_experiment
    rewards, arms, agent = runner(agent, env, train_iter, seed=seed,
                                  video=video, experiment=experiment,
                                  **kwargs)
    agent_list[i] = agent

    if experiment:
        experiment.end()

    if debug:
        elapsed = time.time() - start
        units = "secs"
        for limit, label in ((3600, "hours"), (60, "mins")):
            if elapsed > limit:
                elapsed /= limit
                units = label
                break
        print(f"Time elapsed: {elapsed:.02f} {units}")

    return rewards, arms, agent
def main(args):
    """Train a classifier head on top of a frozen backbone on CIFAR-10/100.

    The backbone selected by ``args.model`` ("amdim", "ccc", "corrflow" or
    "resnet") is frozen and put in eval mode. Only the head (``LinearModel``
    or ``NonLinearModel`` depending on ``args.do_nonlinear``) is optimized.
    Train and validation accuracy/loss are logged to Comet each epoch, and
    both state dicts are saved whenever validation accuracy improves.

    Reads from ``args``: ``not_pretrain``, ``model``, ``do_nonlinear``,
    ``num_classes``, ``optimizer``, ``lr``, ``weight_decay``, ``momentum``,
    ``name``, ``batch_size``, ``num_workers``, ``epochs``, ``lr_interval``
    and ``time`` (wall-clock budget in hours).

    Raises:
        Exception: If ``args.model``, ``args.optimizer`` or
            ``args.num_classes`` has an unsupported value.
        SystemExit: With code -1 when the run gets within 300 seconds of the
            time budget.
    """
    print('Pretrain? ', not args.not_pretrain)
    print(args.model)

    start_time = time.time()

    # NOTE(security): the Comet API key is hard-coded here; it should be
    # rotated and read from configuration or the environment instead.
    comet_kwargs = dict(api_key="hIXq6lDzWzz24zgKv7RYz6blo",
                        project_name="selfcifar",
                        workspace="cinjon",
                        auto_metric_logging=True,
                        auto_output_logging=None,
                        auto_param_logging=False)
    if opt['local_comet_dir']:
        comet_exp = OfflineExperiment(
            offline_directory=opt['local_comet_dir'], **comet_kwargs)
    else:
        comet_exp = CometExperiment(**comet_kwargs)
    comet_exp.log_parameters(vars(args))
    comet_exp.set_name(args.name)

    # Build model
    linear_cls = NonLinearModel if args.do_nonlinear else LinearModel

    if args.model == "amdim":
        hparams = load_hparams_from_tags_csv(
            '/checkpoint/cinjon/amdim/meta_tags.csv')
        model = AMDIMModel(hparams)
        if not args.not_pretrain:
            _path = '/checkpoint/cinjon/amdim/_ckpt_epoch_434.ckpt'
            model.load_state_dict(torch.load(_path)["state_dict"])
        else:
            print("AMDIM not loading checkpoint")  # Debug
        linear_model = linear_cls(AMDIM_OUTPUT_DIM, args.num_classes)
    elif args.model == "ccc":
        model = CCCModel(None)
        if not args.not_pretrain:
            _path = '/checkpoint/cinjon/spaceofmotion/bsn/TimeCycleCkpt14.pth'
            checkpoint = torch.load(_path)
            # Drop the first dotted component of every parameter name.
            base_dict = {
                '.'.join(k.split('.')[1:]): v
                for k, v in list(checkpoint['state_dict'].items())
            }
            model.load_state_dict(base_dict)
        else:
            print("CCC not loading checkpoint")  # Debug
        # Was misspelled ``linaer_cls``, which raised NameError on this branch.
        linear_model = linear_cls(CCC_OUTPUT_DIM, args.num_classes)
    elif args.model == "corrflow":
        model = CORRFLOWModel(None)
        if not args.not_pretrain:
            _path = '/checkpoint/cinjon/spaceofmotion/supercons/corrflow.kineticsmodel.pth'
            checkpoint = torch.load(_path)
            # Drop the first dotted component of every parameter name.
            base_dict = {
                '.'.join(k.split('.')[1:]): v
                for k, v in list(checkpoint['state_dict'].items())
            }
            model.load_state_dict(base_dict)
        else:
            print("CorrFlow not loading checkpoing")  # Debug
        linear_model = linear_cls(CORRFLOW_OUTPUT_DIM, args.num_classes)
    elif args.model == "resnet":
        if not args.not_pretrain:
            resnet = torchvision.models.resnet50(pretrained=True)
        else:
            resnet = torchvision.models.resnet50(pretrained=False)
            print("ResNet not loading checkpoint")  # Debug
        # Keep everything except the last child module.
        modules = list(resnet.children())[:-1]
        model = nn.Sequential(*modules)
        linear_model = linear_cls(RESNET_OUTPUT_DIM, args.num_classes)
    else:
        raise Exception("model type has to be amdim, ccc, corrflow or resnet")

    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model).to(device)
        linear_model = nn.DataParallel(linear_model).to(device)
    else:
        model = model.to(device)
        linear_model = linear_model.to(device)

    # Freeze model
    for p in model.parameters():
        p.requires_grad = False
    model.eval()

    if args.optimizer == "Adam":
        optimizer = optim.Adam(linear_model.parameters(),
                               lr=args.lr,
                               weight_decay=args.weight_decay)
        print("Optimizer: Adam with weight decay: {}".format(
            args.weight_decay))
    elif args.optimizer == "SGD":
        optimizer = optim.SGD(linear_model.parameters(),
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay)
        print("Optimizer: SGD with weight decay: {} momentum: {}".format(
            args.weight_decay, args.momentum))
    else:
        raise Exception("optimizer should be Adam or SGD")
    optimizer.zero_grad()

    # Set up log dir
    log_dir = '/checkpoint/cinjon/spaceofmotion/bsn/cifar-%d-weights/%s/%s' % (
        args.num_classes, args.model, args.name)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    print("Saving to {}".format(log_dir))

    # Scale the batch with the number of GPUs; fall back to a factor of 1 on
    # CPU-only hosts, where device_count() is 0 and the batch size would
    # otherwise be 0.
    batch_size = args.batch_size * max(1, torch.cuda.device_count())

    if args.num_classes == 10:
        # CIFAR-10
        data_path = ("/private/home/cinjon/cifar-data/cifar-10-batches-py")
        _train_dataset = CIFAR_dataset(glob(os.path.join(data_path, "data*")),
                                       args.num_classes, args.model, True)
        train_dataloader = data.DataLoader(_train_dataset,
                                           shuffle=True,
                                           batch_size=batch_size,
                                           num_workers=args.num_workers)
        _val_dataset = CIFAR_dataset([os.path.join(data_path, "test_batch")],
                                     args.num_classes, args.model, False)
        val_dataloader = data.DataLoader(_val_dataset,
                                         shuffle=False,
                                         batch_size=batch_size,
                                         num_workers=args.num_workers)
    elif args.num_classes == 100:
        # CIFAR-100
        data_path = ("/private/home/cinjon/cifar-data/cifar-100-python")
        _train_dataset = CIFAR_dataset([os.path.join(data_path, "train")],
                                       args.num_classes, args.model, True)
        train_dataloader = data.DataLoader(_train_dataset,
                                           shuffle=True,
                                           batch_size=batch_size)
        _val_dataset = CIFAR_dataset([os.path.join(data_path, "test")],
                                     args.num_classes, args.model, False)
        val_dataloader = data.DataLoader(_val_dataset,
                                         shuffle=False,
                                         batch_size=batch_size)
    else:
        raise Exception("num_classes should be 10 or 100")

    # Stop 300 seconds before the wall-clock budget (args.time, in hours).
    time_budget = args.time * 3600 - 300

    best_acc = 0.0
    best_epoch = 0

    # Training
    for epoch in range(1, args.epochs + 1):
        # Halve the learning rate every lr_interval epochs, floored at 3e-4.
        current_lr = max(
            3e-4,
            args.lr * math.pow(0.5, math.floor(epoch / args.lr_interval)))
        linear_model.train()
        # NOTE(review): the optimizer is rebuilt every epoch to apply the new
        # learning rate, which also discards its internal state.
        if args.optimizer == "Adam":
            optimizer = optim.Adam(linear_model.parameters(),
                                   lr=current_lr,
                                   weight_decay=args.weight_decay)
        elif args.optimizer == "SGD":
            optimizer = optim.SGD(
                linear_model.parameters(),
                lr=current_lr,
                momentum=args.momentum,
                weight_decay=args.weight_decay,
            )

        ####################################################
        # Train
        train_acc = 0
        train_loss_sum = 0.0
        for batch_idx, batch in enumerate(train_dataloader):
            if time.time() - start_time > time_budget:
                comet_exp.end()
                sys.exit(-1)

            imgs = batch[0].to(device)
            if args.model != "resnet":
                imgs = imgs.unsqueeze(1)
            lbls = batch[1].flatten().to(device)

            output = linear_model(model(imgs))
            loss = F.cross_entropy(output, lbls)
            train_loss_sum += loss.item()
            train_acc += int((torch.argmax(output, dim=1) == lbls).sum())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if batch_idx % 1500 == 0:
                log_text = "train epoch {}/{}\titer {}/{} loss:{}"
                print(log_text.format(epoch, args.epochs, batch_idx + 1,
                                      len(train_dataloader), loss.data),
                      flush=False)

        train_acc /= len(_train_dataset)
        train_loss_sum /= len(train_dataloader)
        # NOTE(review): ``epoch`` is already 1-based, so ``epoch + 1`` logs
        # epochs 2..N+1 -- confirm whether that offset is intended.
        with comet_exp.train():
            comet_exp.log_metrics({
                'acc': train_acc,
                'loss': train_loss_sum
            },
                                  step=(epoch + 1) * len(train_dataloader),
                                  epoch=epoch + 1)

        print("train acc epoch {}/{} loss:{} train_acc:{}".format(
            epoch, args.epochs, train_loss_sum, train_acc),
              flush=True)

        ####################################################
        # Val
        linear_model.eval()
        val_acc = 0
        val_loss_sum = 0.0
        # No gradients are needed for evaluation.
        with torch.no_grad():
            for batch_idx, batch in enumerate(val_dataloader):
                if time.time() - start_time > time_budget:
                    comet_exp.end()
                    sys.exit(-1)

                imgs = batch[0].to(device)
                if args.model != "resnet":
                    imgs = imgs.unsqueeze(1)
                lbls = batch[1].flatten().to(device)

                output = linear_model(model(imgs))
                loss = F.cross_entropy(output, lbls)
                val_loss_sum += loss.item()
                val_acc += int((torch.argmax(output, dim=1) == lbls).sum())

                if batch_idx % 1500 == 0:
                    log_text = "val epoch {}/{} iter {}/{} loss:{}"
                    print(log_text.format(epoch, args.epochs, batch_idx + 1,
                                          len(val_dataloader), loss.data),
                          flush=False)

        val_acc /= len(_val_dataset)
        val_loss_sum /= len(val_dataloader)
        print("val epoch {}/{} loss:{} val_acc:{}".format(
            epoch, args.epochs, val_loss_sum, val_acc))
        with comet_exp.test():
            comet_exp.log_metrics({
                'acc': val_acc,
                'loss': val_loss_sum
            },
                                  step=(epoch + 1) * len(train_dataloader),
                                  epoch=epoch + 1)

        if val_acc > best_acc:
            best_acc = val_acc
            best_epoch = epoch
            linear_save_path = os.path.join(log_dir,
                                            "{}.linear.pth".format(epoch))
            model_save_path = os.path.join(log_dir,
                                           "{}.model.pth".format(epoch))
            torch.save(linear_model.state_dict(), linear_save_path)
            torch.save(model.state_dict(), model_save_path)

        # Check bias and variance
        print(
            "Epoch {} lr {} total: train_loss:{} train_acc:{} val_loss:{} val_acc:{}"
            .format(epoch, current_lr, train_loss_sum, train_acc,
                    val_loss_sum, val_acc),
            flush=True)

    print("The best epoch: {} acc: {}".format(best_epoch, best_acc))
    # Close the experiment on normal completion as well as on timeout.
    comet_exp.end()