def train(model, optimizer, loss_fun, train_loader, dev_loader, patience,
          output, max_epoch, use_progress_bar=True, start_from_scratch=False):
    """Run the training loop and report the best dev metric to Orion.

    If Orion is active and training fails with a CUDA out-of-memory error, a
    sentinel bad score (-999) is reported instead of crashing so the
    optimizer learns to avoid over-sized models.

    Args:
        model (obj): The neural network model object.
        optimizer (obj): Optimizer used during training.
        loss_fun (obj): Loss function that will be optimized.
        train_loader (obj): Dataloader for the training set.
        dev_loader (obj): Dataloader for the validation set.
        patience (int): max number of epochs without improving; after this
            point, the train ends.
        output (str): Output directory.
        max_epoch (int): Max number of epochs to train for.
        use_progress_bar (bool): Use tqdm progress bar.
        start_from_scratch (bool): Start training from scratch (ignore
            checkpoints).
    """
    try:
        best_dev_metric = pytorch_train_impl(
            dev_loader, loss_fun, max_epoch, model, optimizer, output,
            patience, train_loader, use_progress_bar, start_from_scratch)
    except RuntimeError as err:
        if orion.client.IS_ORION_ON and 'CUDA out of memory' in str(err):
            logger.error(err)
            # BUG FIX: the two adjacent literals were missing a separating
            # space and logged "...avoidtoo big model".
            logger.error(
                'model was out of memory - assigning a bad score to tell '
                'Orion to avoid too big model')
            best_dev_metric = -999
        else:
            raise  # bare raise preserves the original traceback
    report_results([
        dict(
            name='dev_metric',
            type='objective',
            # note the minus - cause orion is always trying to minimize
            # (cit. from the guide)
            value=-best_dev_metric)
    ])
def main(_):
    """Run the main training process and report the objective to Orion.

    Retrieves the metrics from the training run and computes the objective
    value to be optimized by ORION. In this case, we optimize for the
    F1-score on the validation dataset (negated, since Orion minimizes).
    """
    run_metrics = main_process()
    # Large sentinel objective used when the run produced no metrics, so
    # Orion treats the trial as very bad.
    default_value = 1000000.0
    orion_objective = default_value
    # FIX: `not (x is None)` replaced with the idiomatic `x is not None`.
    if run_metrics is not None:
        # val_acc = run_metrics.get('val_binary_accuracy', 0.0)
        val_precision = run_metrics.get('val_precision', 0.0)
        val_recall = run_metrics.get('val_recall', 0.0)
        denom = val_precision + val_recall
        if denom <= 0:
            # Avoid division by zero when precision and recall are both 0.
            denom = 1e-5
        val_f1 = 2 * (val_precision * val_recall) / denom
        orion_objective = -val_f1  # -val_acc
    tf.logging.info("FOUND OBJECTIVE: {}".format(orion_objective))
    report_results([
        dict(name='orion_objective', type='objective', value=orion_objective)
    ])
def train(model, optimizer, loss_fun, train_loader, dev_loader, patience, output,
          max_epoch, use_progress_bar=True, start_from_scratch=False):  # pragma: no cover
    """Training loop wrapper.

    Used to catch exception (and to handle them) if Orion is being used.

    Args:
        model (obj): The neural network model object.
        optimizer (obj): Optimizer used during training.
        loss_fun (obj): Loss function that will be optimized.
        train_loader (obj): Dataloader for the training set.
        dev_loader (obj): Dataloader for the validation set.
        patience (int): max number of epochs without improving on
            `best_eval_score`. After this point, the train ends.
        output (str): Output directory.
        max_epoch (int): Max number of epochs to train for.
        use_progress_bar (bool): Use tqdm progress bar (can be disabled when
            logging).
        start_from_scratch (bool): Start training from scratch (ignore
            checkpoints)
    """
    try:
        best_dev_metric = train_impl(
            model, optimizer, loss_fun, train_loader, dev_loader, patience,
            output, max_epoch, use_progress_bar, start_from_scratch)
    except RuntimeError as err:
        if orion.client.IS_ORION_ON and 'CUDA out of memory' in str(err):
            logger.error(err)
            # BUG FIX: the two adjacent literals were missing a separating
            # space and logged "...avoidtoo big model".
            logger.error('model was out of memory - assigning a bad score to '
                         'tell Orion to avoid too big model')
            best_dev_metric = -999
        else:
            raise  # bare raise preserves the original traceback
    report_results([dict(
        name='dev_metric',
        type='objective',
        # note the minus - cause orion is always trying to minimize
        # (cit. from the guide)
        value=-float(best_dev_metric))])
def execute():
    """Execute a simple pipeline as an example."""
    # Read the YAML configuration whose path is given on the command line.
    parser = argparse.ArgumentParser()
    parser.add_argument('--configuration', required=True)
    inputs = parser.parse_args()
    with open(inputs.configuration, 'r') as f:
        config = yaml.safe_load(f)
    # Evaluate the objective and its derivative at the configured point.
    y, dy = function(config['x'])
    # Hand both values back to Orion.
    report_results([
        dict(name='example_objective', type='objective', value=y),
        dict(name='example_gradient', type='gradient', value=[dy]),
    ])
def execute():
    """Execute a simple pipeline as an example."""
    # Parse the trial input plus the Orion bookkeeping flags.
    parser = argparse.ArgumentParser()
    parser.add_argument("-x", type=float, required=True)
    parser.add_argument("--test-env", action="store_true")
    for flag in ("--experiment-id", "--experiment-name",
                 "--experiment-version", "--trial-id", "--working-dir"):
        parser.add_argument(flag, type=str)
    inputs = parser.parse_args()
    if inputs.test_env:
        # Each CLI value must match the corresponding variable Orion exported.
        expected = {
            "ORION_EXPERIMENT_ID": inputs.experiment_id,
            "ORION_EXPERIMENT_NAME": inputs.experiment_name,
            "ORION_EXPERIMENT_VERSION": inputs.experiment_version,
            "ORION_TRIAL_ID": inputs.trial_id,
            "ORION_WORKING_DIR": inputs.working_dir,
        }
        for env_name, cli_value in expected.items():
            assert cli_value == os.environ[env_name]
    # Evaluate the function and report objective + gradient to Orion.
    y, dy = function(inputs.x)
    report_results([
        dict(name="example_objective", type="objective", value=y),
        dict(name="example_gradient", type="gradient", value=[dy]),
    ])
def execute():
    """Execute a simple pipeline as an example.

    Parses a vector-valued ``-x`` such as ``"(0.1, 0.2, 0.3)"`` and an
    optional scalar ``-y``, evaluates the Rosenbrock function, and reports
    the objective to Orion.
    """
    # 1. Receive inputs as you want
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-x",
        type=str,
        required=True,
        help="Representation of a list of floating numbers of "
        "length at least 2.",
    )
    parser.add_argument(
        "-y",
        type=float,
        default=0,
        help="An optional float to check multi-dimensional inputs.",
    )
    inputs = parser.parse_args()
    # 2. Perform computations
    # FIX: numpy.fromstring(text, sep=...) is deprecated/legacy; parse the
    # bracket-stripped payload explicitly instead.
    x = numpy.array([float(tok) for tok in inputs.x[1:-1].split(",")])
    f = rosenbrock_function(x, inputs.y)
    # 3. Gather and report results
    results = list()
    results.append(dict(name="rosenbrock", type="objective", value=f))
    report_results(results)
def main(argv=None):
    """Run repeated GCN trials on a TCGA clinical task and report the mean
    accuracy metric to Orion."""
    opt = parse_args(argv)
    tasks = TCGAMeta(download=True, preload=True)
    task = tasks[113]  # NOTE(review): hard-coded task index — confirm which clinical task this is
    # Setup the results dictionary
    filename = "experiments/results/clinical-tasks.pkl"
    try:
        results = pickle.load(open(filename, "rb"), encoding='latin1')
        print("Loaded Checkpointed Results")
    except Exception as e:
        # Any failure (missing file, unpickling error) falls back to a fresh frame.
        print(e)
        results = pd.DataFrame(columns=[
            'task', 'acc_metric', 'model', 'graph', 'trial', 'train_size',
            'time_elapsed'
        ])
        print("Created a New Results Dictionary")
    train_size = 50
    trials = 3
    cuda = True
    exp = []  # accuracy metric of each trial
    for trial in range(trials):
        # Fresh model per trial; the trial index doubles as the RNG seed.
        model = GCN(cuda=cuda,
                    dropout=opt.dropout,
                    num_layer=opt.num_layer,
                    channels=opt.channels,
                    embedding=opt.embedding,
                    aggregation=opt.aggregation,
                    lr=opt.lr,
                    agg_reduce=opt.agg_reduce,
                    seed=trial)
        # Center and scale the samples in place.
        # NOTE(review): this runs on every trial iteration, so the data is
        # re-centered/re-scaled repeatedly — confirm this is intended.
        task._samples = task._samples - task._samples.mean(axis=0)
        task._samples = task._samples / task._samples.var()
        X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
            task._samples,
            task._labels,
            stratify=task._labels,
            train_size=train_size,
            test_size=len(task._labels) - train_size)
        adj = sparse.csr_matrix(nx.to_numpy_matrix(GeneManiaGraph().nx_graph))
        model.fit(X_train, y_train, adj=adj)
        y_hat = []
        # Predict in chunks of 10 samples to bound memory use.
        for chunk in get_every_n(X_test, 10):
            y_hat.extend(np.argmax(model.predict(chunk), axis=1).numpy())
        exp.append(model.metric(y_test, y_hat))
    print(exp)
    # Orion objective: mean metric over all trials.
    report_results([{
        "name": "acc_metric",
        "type": "objective",
        "value": np.array(exp).mean()
    }])
def main(argv=None):
    """Single-gene inference experiment: for increasing neighborhood sizes
    around the gene "RPL4", train a GCN to predict whether the gene's
    expression is positive, and report the mean AUC to Orion."""
    opt = parse_args(argv)
    dataset = datasets.TCGADataset()
    dataset.df = dataset.df - dataset.df.mean(axis=0)  # center expression columns
    gene_graph = GeneManiaGraph()
    # Neighborhood sizes to sweep; 16300 means "use the whole graph".
    search_num_genes = [50, 100, 200, 300, 500, 1000, 2000, 4000, 8000, 16300]
    test_size = 300  # NOTE(review): unused below — opt.test_size is used instead
    cuda = torch.cuda.is_available()
    exp = []  # AUC per neighborhood size
    for num_genes in search_num_genes:
        start_time = time.time()  # NOTE(review): never read in this view
        gene = "RPL4"
        model = GCN(cuda=cuda,
                    dropout=opt.dropout,
                    num_layer=opt.num_layer,
                    channels=opt.channels,
                    embedding=opt.embedding,
                    aggregation=opt.aggregation,
                    lr=opt.lr,
                    agg_reduce=opt.agg_reduce)
        # Binary label: is the target gene's (centered) expression positive?
        dataset.labels = dataset.df[gene].where(
            dataset.df[gene] > 0).notnull().astype("int")
        dataset.labels = dataset.labels.values if type(
            dataset.labels) == pd.Series else dataset.labels
        X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
            dataset.df,
            dataset.labels,
            stratify=dataset.labels,
            train_size=opt.train_size,
            test_size=opt.test_size,
            random_state=opt.seed)
        # Restrict features to a BFS-sampled neighborhood of the target gene
        # (or the full graph for the largest sweep value).
        if num_genes == 16300:
            neighbors = gene_graph.nx_graph
        else:
            neighbors = gene_graph.bfs_sample_neighbors(gene, num_genes)
        X_train = X_train[list(neighbors.nodes)].copy()
        X_test = X_test[list(neighbors.nodes)].copy()
        # Overwrite the target gene's own column with a constant so the model
        # cannot read the label off its own feature.
        X_train[gene] = 1
        X_test[gene] = 1
        adj = sparse.csr_matrix(nx.to_numpy_matrix(neighbors))
        model.fit(X_train, y_train, adj=adj)
        y_hat = model.predict(X_test)
        y_hat = np.argmax(y_hat, axis=1)
        auc = sklearn.metrics.roc_auc_score(y_test,
                                            np.asarray(y_hat).flatten())
        del model  # free GPU memory before the next (larger) run
        exp.append(auc)
    # Orion objective: mean AUC over the whole sweep.
    report_results([{
        "name": "auc",
        "type": "objective",
        "value": np.array(exp).mean()
    }])
def execute():
    """Execute a simple pipeline as an example."""
    # Step 1: read the scalar input from the command line.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("-x", type=float, required=True)
    parsed = arg_parser.parse_args()
    # Step 2: evaluate the function and its gradient at x.
    objective, gradient = function(parsed.x)
    # Step 3: hand both values back to Orion.
    report_results([
        dict(name="example_objective", type="objective", value=objective),
        dict(name="example_gradient", type="gradient", value=[gradient]),
    ])
def run(args):
    """Example training loop: logs hyper-parameters and per-epoch metrics,
    early-stops on the dev metric, and reports the best dev metric to Orion
    (negated, since Orion minimizes)."""
    with open(args.config, 'r') as stream:
        hyper_params = load(stream)
    # BUG FIX: iterating a dict directly yields keys only, so
    # `for k, v in hyper_params` raised at unpack time. Use .items().
    for k, v in hyper_params.items():
        log_param(k, v)
        logger.info('hp "{}" => "{}"'.format(k, v))
    patience = 10
    not_improving_since = 0
    best_dev_metric = None
    for e in range(args.max_epoch):
        # change this part to do the real training / evaluation on dev.
        loss = do_training()
        dev_metric = eval_on_dev()
        log_metric("loss", loss, step=e)
        log_metric("dev_metric", dev_metric, step=e)
        if best_dev_metric is None or dev_metric > best_dev_metric:
            best_dev_metric = dev_metric
            not_improving_since = 0
            save_model()
        else:
            not_improving_since += 1
        logger.info('\ndone epoch {} => loss {} - dev metric {} (not improving'
                    ' since {} epoch)'.format(e, loss, dev_metric,
                                              not_improving_since))
        if not_improving_since >= patience:
            logger.info('done! best dev metric is {}'.format(best_dev_metric))
            break
    # useful so we can easily sort models w.r.t. the best dev evaluation
    log_metric("best_dev_metric", best_dev_metric)
    report_results([
        dict(
            name='dev_metric',
            type='objective',
            # note the minus - cause orion is always trying to minimize
            # (cit. from the guide)
            value=-best_dev_metric)
    ])
def on_train_end(self, logs=None):
    """operations to perform after training is complete

    Parameters
    ----------
    logs : dict, optional
        dictionary containing the model evaluation metrics, by default None
    """
    self.end_time = datetime.datetime.now()
    duration = self.end_time - self.start_time
    logging.info(f"Ending training at {self.end_time}")
    logging.info(f"Training duration: {duration}")
    # stopped_epoch > 0 means the early-stopping callback fired.
    if self.stopped_epoch > 0:
        logging.info(f"Early stopping at epoch {self.stopped_epoch}")
    log_metric("best_valid_acc", self.best_valid_acc)
    # Orion minimizes, hence the negated accuracy.
    objective = dict(name="valid_acc", type="objective",
                     value=-self.best_valid_acc)
    report_results([objective])
def execute():
    """Execute a simple pipeline as an example.

    The optional ``--fidelity`` (0-10) controls the noise added to the
    evaluation: fidelity 10 is (almost) noise-free, lower fidelities are
    noisier but presumably cheaper.
    """
    # 1. Receive inputs as you want
    parser = argparse.ArgumentParser()
    parser.add_argument('-x', type=float, required=True)
    parser.add_argument('--fidelity', type=int, default=10)
    inputs = parser.parse_args()
    # FIX: `assert` is stripped under `python -O`; validate input explicitly.
    if not 0 <= inputs.fidelity <= 10:
        raise ValueError(
            '--fidelity must be in [0, 10], got {}'.format(inputs.fidelity))
    # Small epsilon keeps the noise strictly positive even at max fidelity.
    noise = (1 - inputs.fidelity / 10) + 0.0001
    # 2. Perform computations
    y, dy = function(inputs.x, noise)
    # 3. Gather and report results
    results = list()
    results.append(dict(name='example_objective', type='objective', value=y))
    results.append(dict(name='example_gradient', type='gradient', value=[dy]))
    report_results(results)
def execute():
    """Execute a simple pipeline as an example."""
    parser = argparse.ArgumentParser()
    for opt_name, opt_type in (('-x', float), ('--dir', str), ('--name', str),
                               ('--other-name', str)):
        parser.add_argument(opt_name, type=opt_type, required=True)
    inputs = parser.parse_args()
    # That's what is expected to happen
    target = os.path.join(inputs.dir, inputs.other_name,
                          "my-exp-{}".format(inputs.name))
    os.makedirs(target, exist_ok=False)  # Raise OSError if it exists
    y, dy = function(inputs.x)
    report_results([
        dict(name='example_objective', type='objective', value=y),
        dict(name='example_gradient', type='gradient', value=[dy]),
    ])
def main(hparams: HyperParameters, train_config: TrainConfig):
    """Run one training experiment and, when a validation split exists,
    report the validation loss to Orion."""
    print("Experiment name:", train_config.experiment_name)
    print("Hyperparameters:", hparams)
    print("Train_config:", train_config)
    # Create the results file up front so it's directly possible to call the
    # "tail" program to stream the results as they come in.
    experiment_results_file = os.path.join(
        "logs", train_config.experiment_name + "-results.txt")
    # Opening in append mode and immediately closing just "touches" the file.
    with open(experiment_results_file, "a") as runs_results_file:
        pass
    train_data_dir = os.path.join(os.path.curdir, "debug_data") if DEBUG else "~/Train"
    # Create the required directories if not present.
    os.makedirs(train_config.log_dir, exist_ok=True)
    print("Training directory:", train_config.log_dir)
    # NOTE(review): assumed only the training call is captured by the log
    # redirection context — confirm whether log_results should also run inside.
    with utils.log_to_file(os.path.join(train_config.log_dir, "train_log.txt")):
        results = train(train_data_dir, hparams, train_config)
    print(f"Saved model weights are located at '{train_config.log_dir}'")
    log_results(results)
    using_validation_set = train_config.validation_data_fraction != 0.0
    if using_validation_set:
        from orion.client import report_results
        report_results([
            dict(
                name='validation_loss',
                type='objective',
                # BUG FIX: np.Inf was removed in NumPy 2.0; np.inf is the
                # canonical spelling and behaves identically.
                value=results.metrics_dict.get("loss", np.inf),
            )
        ])
    print("TRAINING COMPLETE")
def main():
    """CLI entry point: parse arguments, configure logging, build the model,
    then dispatch to train / validate / predict / embed / export-weights.

    When ``--train`` is given, the best early-stopping dev metric is reported
    to Orion (negated, since Orion minimizes).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config",
        help="config file with generic hyper-parameters, such as optimizer, "
        "batch_size, ... - in yaml format",
        required=True,
    )
    parser.add_argument(
        "--gpu",
        help="list of gpu ids to use. default is cpu. example: --gpu 0 1",
        type=int,
        nargs="+"
    )
    parser.add_argument(
        "--validation-interval",
        help="how often to run validation in one epoch - "
        "e.g., 0.5 means halfway - default 0.5",
        type=float,
        default=0.5,
    )
    parser.add_argument("--output", help="where to store models", required=True)
    parser.add_argument(
        "--no-model-restoring",
        help="will not restore any previous model weights ("
        "even if present)",
        action="store_true",
    )
    # FIX: help text was a copy-paste of --validate's ("will not train...").
    parser.add_argument(
        "--train",
        help="will train the model",
        action="store_true",
    )
    parser.add_argument(
        "--validate",
        help="will not train - will just evaluate on dev",
        action="store_true",
    )
    parser.add_argument(
        "--predict", help="will predict on the json file you provide as an arg"
    )
    parser.add_argument(
        "--predict-outliers",
        help="will use the sklearn model to predict outliers",
        action="store_true"
    )
    parser.add_argument(
        "--file-to-emb", help="will use this file as input to generate embeddings"
    )
    parser.add_argument(
        "--write-emb-to", help="will write question embeddings to this file"
    )
    parser.add_argument(
        "--save-weights-to",
        help="will save ONLY the model weights (not the pytorch lightning object)"
        " to this file",
    )
    parser.add_argument("--predict-to", help="(optional) write predictions here)")
    parser.add_argument(
        "--redirect-log",
        help="will intercept any stdout/err and log it",
        action="store_true",
    )
    # FIX: help said "default 2" but the actual default is 0.
    parser.add_argument(
        "--num-workers", help="number of workers - default 0", type=int, default=0
    )
    parser.add_argument(
        "--print-sentence-stats",
        help="will print stats on the data",
        action="store_true",
    )
    parser.add_argument(
        "--multiple-thresholds",
        help="will print results for various thresholds",
        action="store_true",
    )
    parser.add_argument('--log', help='log to this file (in addition to stdout/err)')
    parser.add_argument("--debug", help="will log more info", action="store_true")
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    # will log to a file if provided (useful for orion on cluster)
    if args.log is not None:
        handler = WatchedFileHandler(args.log)
        formatter = logging.Formatter(logging.BASIC_FORMAT)
        handler.setFormatter(formatter)
        root = logging.getLogger()
        root.setLevel(logging.INFO)
        root.addHandler(handler)
    if args.redirect_log or args.log:
        # Route stdout/stderr through the logger so nothing is lost on cluster.
        sys.stdout = LoggerWriter(logger.info)
        sys.stderr = LoggerWriter(logger.warning)
    with open(args.config, "r") as stream:
        hyper_params = load(stream, Loader=yaml.FullLoader)
    # Fall back to the GPU env var when no --gpu flag was given.
    if args.gpu is None and 'GPU' in os.environ:
        gpu_string = os.environ['GPU']
        gpu = [int(x) for x in gpu_string.strip().split()]
    else:
        gpu = args.gpu
    ckpt_to_resume, ret_trainee, trainer = init_model(
        hyper_params, args.num_workers, args.output, args.validation_interval,
        gpu, args.no_model_restoring, args.debug, args.print_sentence_stats
    )
    if args.train:
        trainer.fit(ret_trainee)
        best_dev_result = float(trainer.early_stop_callback.best.cpu().numpy())
        report_results([dict(
            name='dev_metric',
            type='objective',
            # note the minus - cause orion is always trying to minimize
            # (cit. from the guide)
            value=-float(best_dev_result))])
    elif args.validate:
        trainer.test(ret_trainee)
    elif args.predict:
        if not args.predict_to:
            raise ValueError('--predict also requires --predict-to')
        model_ckpt = torch.load(ckpt_to_resume, map_location=torch.device("cpu"))
        ret_trainee.load_state_dict(model_ckpt["state_dict"])
        if args.predict_outliers:
            with open(os.path.join(args.output, SKLEARN_MODEL_FILE_NAME), 'rb') as file:
                sklearn_model = pickle.load(file)
            predictor = PredictorWithOutlierDetector(ret_trainee, sklearn_model)
        else:
            predictor = Predictor(ret_trainee)
        predictor.generate_predictions(
            json_file=args.predict,
            predict_to=args.predict_to,
            multiple_thresholds=args.multiple_thresholds
        )
    elif args.file_to_emb:
        if args.write_emb_to is None:
            raise ValueError('please specify also --write-emb-to')
        model_ckpt = torch.load(ckpt_to_resume, map_location=torch.device("cpu"))
        ret_trainee.load_state_dict(model_ckpt["state_dict"])
        generate_embeddings(
            ret_trainee, input_file=args.file_to_emb, out_file=args.write_emb_to
        )
    elif args.save_weights_to is not None:
        torch.save(ret_trainee.retriever.state_dict(), args.save_weights_to)
    else:
        # FIX: the message referred to a nonexistent --test flag.
        logger.warning("please select one between --train / --validate / --predict")
# --- Embed the second held-out set ("final_test10") and evaluate the classifier ---
embed_set10 = "final_test10"
test_embed_dir10 = os.path.join(embed_dir, embed_set10)
df_test_embed10, _ = calculate_embedding(test_dl10,
                                         model_triplet,
                                         savedir=test_embed_dir10,
                                         concatenate="append")
test_embed10 = DataLoadDf(df_test_embed10,
                          encode_function_label,
                          transform=Compose(trans_embedding))
test_embed_loader10 = DataLoader(test_embed10,
                                 batch_size=batch_size_classif,
                                 shuffle=False,
                                 num_workers=num_workers,
                                 drop_last=False)
# Move the embedding model off the GPU before running the classifier there.
model_triplet = to_cpu(model_triplet)
classif_model = to_cuda_if_available(classif_model)
classif_model.eval()
# Classification performance on both held-out embedding sets.
mean_test_results1 = measure_classif(classif_model,
                                     test_embed_loader1,
                                     classes=classes,
                                     suffix_print="test1")
mean_test_results10 = measure_classif(classif_model,
                                      test_embed_loader10,
                                      classes=classes,
                                      suffix_print="test10")
print(f"Time of the program: {time.time() - t}")
from orion.client import report_results
# Orion minimizes, so report an error rate: 100 - macro measure (as a %).
# NOTE(review): despite the name "mean_test_results", the reported value
# comes from the *validation* macro measure in classif_state — confirm.
report_results(
    [dict(
        name="mean_test_results",
        type="objective",
        value=float(100 - classif_state["macro_measure_valid"] * 100)
    )]
)
def main():
    """Configure and train an SVLAE model on fluorescence data, report the
    best validation loss to Orion, and save summary figures.

    Command-line overrides (lr, KL/L2 schedule knobs, ...) are folded into the
    hyperparameter dict and into a human-readable run-name suffix.
    """
    args = parser.parse_args()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    hyperparams = load_parameters(args.hyperparameter_path)
    # Accumulates "key= value" lines describing the Orion-driven overrides;
    # later flattened into a run-name suffix.
    orion_hp_string = ''
    if args.lr or args.log10_lr:
        # Learning rate may be given directly or as log10(lr).
        if args.log10_lr:
            lr = 10**args.log10_lr
        else:
            lr = args.lr
        hyperparams['optimizer']['lr_init'] = lr
        hyperparams['scheduler']['lr_min'] = lr * 1e-3
        orion_hp_string += 'lr= %.4f\n' % lr
    if args.kl_obs_dur:
        hyperparams['objective']['kl_obs'][
            'schedule_dur'] = args.kl_obs_dur * args.kl_obs_dur_scale
        orion_hp_string += 'kl_obs_dur= %i\n' % (args.kl_obs_dur *
                                                 args.kl_obs_dur_scale)
    if args.kl_obs_max:
        hyperparams['objective']['kl_obs']['max'] = args.kl_obs_max
        orion_hp_string += 'kl_obs_max= %.3f\n' % (args.kl_obs_max)
    if args.kl_deep_max:
        hyperparams['objective']['kl_deep']['max'] = args.kl_deep_max
        orion_hp_string += 'kl_deep_max= %.3f\n' % (args.kl_deep_max)
    if args.deep_start_p:
        # The deep-network schedules start at a fraction of the kl_obs ramp.
        deep_start = int(args.deep_start_p * args.deep_start_p_scale *
                         hyperparams['objective']['kl_obs']['schedule_dur'])
        hyperparams['objective']['kl_deep']['schedule_start'] = deep_start
        hyperparams['objective']['l2']['schedule_start'] = deep_start
        hyperparams['model']['deep_unfreeze_step'] = deep_start
        orion_hp_string += 'deep_start= %i\n' % deep_start
    if args.l2_gen_scale or args.log10_l2_gen_scale:
        if args.log10_l2_gen_scale:
            l2_gen_scale = 10**args.log10_l2_gen_scale
        else:
            l2_gen_scale = args.l2_gen_scale
        hyperparams['objective']['l2_gen_scale'] = l2_gen_scale
        orion_hp_string += 'l2_gen_scale= %.3f\n' % l2_gen_scale
    if args.l2_con_scale or args.log10_l2_con_scale:
        if args.log10_l2_con_scale:
            l2_con_scale = 10**args.log10_l2_con_scale
        else:
            l2_con_scale = args.l2_con_scale
        hyperparams['objective']['l2_con_scale'] = l2_con_scale
        orion_hp_string += 'l2_con_scale= %.3f\n' % l2_con_scale
    data_name = args.data_path.split('/')[-1]
    model_name = hyperparams['model_name']
    # Compact run name from the model's "*size*" hyperparameters,
    # e.g. "obs_encoder_size" -> "oenc<val>"-style abbreviations.
    mhp_list = [
        key.replace('size', '').replace('deep', 'd').replace(
            'obs', 'o').replace('_', '')[:4] + str(val)
        for key, val in hyperparams['model'].items() if 'size' in key
    ]
    mhp_list.sort()
    hyperparams['run_name'] = '_'.join(mhp_list)
    orion_hp_string = orion_hp_string.replace('\n', '-').replace(' ',
                                                                 '').replace(
                                                                     '=', '')
    orion_hp_string = '_orion-' + orion_hp_string
    hyperparams['run_name'] += orion_hp_string
    save_loc = '%s/%s/%s/%s/' % (args.output_dir, data_name, model_name,
                                 hyperparams['run_name'])
    if not os.path.exists(save_loc):
        os.makedirs(save_loc)
    data_dict = read_data(args.data_path)
    train_data = torch.Tensor(data_dict['train_fluor']).to(device)
    valid_data = torch.Tensor(data_dict['valid_fluor']).to(device)
    # Assumes (trials, time steps, channels) layout — TODO confirm upstream.
    num_trials, num_steps, input_size = train_data.shape
    train_ds = torch.utils.data.TensorDataset(train_data)
    valid_ds = torch.utils.data.TensorDataset(valid_data)
    train_dl = torch.utils.data.DataLoader(train_ds,
                                           batch_size=args.batch_size,
                                           shuffle=True)
    # Validation uses a single full-size batch.
    valid_dl = torch.utils.data.DataLoader(valid_ds,
                                           batch_size=valid_data.shape[0])
    # transforms = trf.Compose([trf.Normalize(mean=(train_data.mean(),), std=(train_data.std(),))])
    transforms = trf.Compose([])
    loglikelihood_obs = LogLikelihoodGaussian()
    loglikelihood_deep = LogLikelihoodPoissonSimplePlusL1(
        dt=float(data_dict['dt']))
    objective = SVLAE_Loss(
        loglikelihood_obs=loglikelihood_obs,
        loglikelihood_deep=loglikelihood_deep,
        loss_weight_dict={
            'kl_deep': hyperparams['objective']['kl_deep'],
            'kl_obs': hyperparams['objective']['kl_obs'],
            'l2': hyperparams['objective']['l2'],
            'recon_deep': hyperparams['objective']['recon_deep']
        },
        l2_con_scale=hyperparams['objective']['l2_con_scale'],
        l2_gen_scale=hyperparams['objective']['l2_gen_scale']).to(device)
    # Convert the observation time constant from seconds into time-step units.
    hyperparams['model']['obs']['tau']['value'] /= data_dict['dt']
    model = SVLAE_Net(
        input_size=input_size,
        factor_size=hyperparams['model']['factor_size'],
        obs_encoder_size=hyperparams['model']['obs_encoder_size'],
        obs_latent_size=hyperparams['model']['obs_latent_size'],
        obs_controller_size=hyperparams['model']['obs_controller_size'],
        deep_g_encoder_size=hyperparams['model']['deep_g_encoder_size'],
        deep_c_encoder_size=hyperparams['model']['deep_c_encoder_size'],
        deep_g_latent_size=hyperparams['model']['deep_g_latent_size'],
        deep_u_latent_size=hyperparams['model']['deep_u_latent_size'],
        deep_controller_size=hyperparams['model']['deep_controller_size'],
        generator_size=hyperparams['model']['generator_size'],
        prior=hyperparams['model']['prior'],
        clip_val=hyperparams['model']['clip_val'],
        generator_burn=hyperparams['model']['generator_burn'],
        dropout=hyperparams['model']['dropout'],
        do_normalize_factors=hyperparams['model']['normalize_factors'],
        factor_bias=hyperparams['model']['factor_bias'],
        max_norm=hyperparams['model']['max_norm'],
        deep_unfreeze_step=hyperparams['model']['deep_unfreeze_step'],
        obs_early_stop_step=hyperparams['model']['obs_early_stop_step'],
        obs_continue_step=hyperparams['model']['obs_continue_step'],
        ar1_start_step=hyperparams['model']['ar1_start_step'],
        obs_params=hyperparams['model']['obs'],
        device=device).to(device)
    # Parameter inventory (printed per tensor, then totalled).
    total_params = 0
    for ix, (name, param) in enumerate(model.named_parameters()):
        print(ix, name, list(param.shape), param.numel(), param.requires_grad)
        total_params += param.numel()
    print('Total parameters: %i' % total_params)
    optimizer = opt.Adam([p for p in model.parameters() if p.requires_grad],
                         lr=hyperparams['optimizer']['lr_init'],
                         betas=hyperparams['optimizer']['betas'],
                         eps=hyperparams['optimizer']['eps'])
    scheduler = LFADS_Scheduler(
        optimizer=optimizer,
        mode='min',
        factor=hyperparams['scheduler']['scheduler_factor'],
        patience=hyperparams['scheduler']['scheduler_patience'],
        verbose=True,
        threshold=1e-4,
        threshold_mode='abs',
        cooldown=hyperparams['scheduler']['scheduler_cooldown'],
        min_lr=hyperparams['scheduler']['lr_min'])
    TIME = torch._np.arange(0, num_steps * data_dict['dt'], data_dict['dt'])
    plotter = {
        'train':
        Plotter(time=TIME,
                truth={
                    'rates': data_dict['train_rates'],
                    'spikes': data_dict['train_spikes'],
                    'latent': data_dict['train_latent']
                }),
        'valid':
        Plotter(time=TIME,
                truth={
                    'rates': data_dict['valid_rates'],
                    'spikes': data_dict['valid_spikes'],
                    'latent': data_dict['valid_latent']
                })
    }
    if args.use_tensorboard:
        import importlib
        # Only enable TensorBoard logging when the package is importable.
        if importlib.util.find_spec('torch.utils.tensorboard'):
            tb_folder = save_loc + 'tensorboard/'
            if not os.path.exists(tb_folder):
                os.mkdir(tb_folder)
            elif os.path.exists(tb_folder) and args.restart:
                os.system('rm -rf %s' % tb_folder)
                os.mkdir(tb_folder)
            from torch.utils.tensorboard import SummaryWriter
            writer = SummaryWriter(tb_folder)
            rm_plotter = plotter
        else:
            writer = None
            rm_plotter = None
    else:
        writer = None
        rm_plotter = None
    run_manager = RunManager(model=model,
                             objective=objective,
                             optimizer=optimizer,
                             scheduler=scheduler,
                             train_dl=train_dl,
                             valid_dl=valid_dl,
                             transforms=transforms,
                             writer=writer,
                             plotter=rm_plotter,
                             max_epochs=args.max_epochs,
                             save_loc=save_loc,
                             do_health_check=args.do_health_check)
    run_manager.run()
    # Guard against NaN/inf losses so Orion always receives a finite value.
    if not torch._np.isfinite(run_manager.best):
        run_manager.best = 1e8
    report_results(
        [dict(name='valid_loss', type='objective', value=run_manager.best)])
    fig_folder = save_loc + 'figs/'
    if os.path.exists(fig_folder):
        os.system('rm -rf %s' % fig_folder)
    os.mkdir(fig_folder)
    from matplotlib.figure import Figure
    import matplotlib
    matplotlib.use('Agg')
    fig_dict = plotter['valid'].plot_summary(model=run_manager.model,
                                             dl=run_manager.valid_dl)
    for k, v in fig_dict.items():
        # BUG FIX: the guard tested `type(k) == Figure`, but `k` is the name
        # (used in the filename) and `v` is the figure being saved — the
        # condition was always False and no figure was ever written.
        if isinstance(v, Figure):
            v.savefig(fig_folder + k + '.svg')
def learner(model, rollout_storage, train_params, ppo_params, ready_to_works,
            queue, sync_flag, rank=0, distributed=False, b=None):
    '''
    Learner process: trains the model with PPO on experience gathered by
    worker processes, coordinating epochs through events and a queue.

    :param model: actor-critic network shared with the workers
    :param rollout_storage: buffer of collected experience (provides .device)
    :param train_params: dict with "epochs", "num_workers", "steps", "world_size"
    :param ppo_params: kwargs for the PPO agent (incl. clip_param, max_kl)
    :param ready_to_works: per-worker events set to let workers start an epoch
    :param queue: receives (rewards, steps, worker_id) when a worker finishes
    :param sync_flag: shared value set to 1 to tell workers to exit
    :param rank: learner rank (rank 0 saves checkpoints)
    :param distributed: whether gradients/stats are averaged across learners
    :param b: optional barrier to sync learners before an update
    :return:
    '''
    print(f"learner with pid ({os.getpid()}) starts job")
    logger = TB_logger("ppo_ai2thor", rank)
    agent = PPO(actor_critic=model, **ppo_params)
    device = rollout_storage.device
    if distributed:
        world_size = dist.get_world_size()
    else:
        world_size = 1
    epochs = train_params["epochs"]
    # Floors for the linearly annealed clip range and KL limit.
    min_clip_param = 0.001
    min_kl = 0.001
    # start workers for next epoch
    _ = [e.set() for e in ready_to_works]
    # Training policy
    start_time = time.time()
    for epoch in range(epochs):
        # Linearly anneal clip_param and max_kl from their initial values
        # down to the floors over the course of training.
        agent.clip_param = (ppo_params['clip_param'] - min_clip_param) * (
            epochs - epoch) / epochs + min_clip_param
        agent.max_kl = (ppo_params['max_kl'] -
                        min_kl) * (epochs - epoch) / epochs + min_kl
        rollout_ret = []
        rollout_steps = []
        # wait until all workers finish a epoch
        for i in range(train_params["num_workers"]):
            rewards, steps, id = queue.get()
            print(
                f'Leaner rank:{rank} recieve worker:{id} done signal and reaches {i}th wokers'
            )
            rollout_ret.extend(rewards)
            rollout_steps.extend(steps)
        if b:
            # Optional barrier: keep distributed learners in lock-step.
            print(f'Learner rank:{rank} wait')
            b.wait()
        print("Start training")
        # normalize advantage
        # if distributed:
        #     mean = rollout_storage.adv_buf.mean()
        #     var = rollout_storage.adv_buf.var()
        #     mean = dist_mean(mean)
        #     var = dist_mean(var)
        #     rollout_storage.normalize_adv(mean_std=(mean, torch.sqrt(var)))
        # else:
        #     rollout_storage.normalize_adv()
        # train with batch
        model.train()
        print('updating...')
        pi_loss, v_loss, kl, entropy = agent.update(rollout_storage,
                                                    distributed)
        v_mean = rollout_storage.val_buf.mean()
        model.eval()
        print("Finishes training")
        # start workers for next epoch
        if epoch == train_params["epochs"] - 1:
            # set exit flag to 1, and notify workers to exit
            sync_flag.value = 1
        # NOTE(review): the events are set every epoch (also on the last one,
        # so workers wake up, observe sync_flag and exit) — confirm intended.
        _ = [e.set() for e in ready_to_works]
        # log statistics with TensorBoard
        ret_sum = np.sum(rollout_ret)
        steps_sum = np.sum(rollout_steps)
        episode_count = len(rollout_ret)
        #visdom
        # vis.line(X=[episode_count], Y=[ret_sum], win='training_Rewards'+str(rank), update='append')
        if distributed:
            # Average losses/stats and sum counters across all learners.
            pi_loss = dist_mean(pi_loss)
            v_loss = dist_mean(v_loss)
            kl = dist_mean(kl)
            entropy = dist_mean(entropy)
            v_mean = dist_mean(v_mean)
            ret_sum = dist_sum(torch.tensor(ret_sum).to(device))
            steps_sum = dist_sum(torch.tensor(steps_sum).to(device))
            episode_count = dist_sum(torch.tensor(episode_count).to(device))
        # Log info about epoch
        global_steps = (epoch +
                        1) * train_params["steps"] * train_params["world_size"]
        fps = global_steps / (time.time() - start_time)
        logger.log_info(f"Epoch [{epoch}] avg. FPS:[{fps:.2f}]")
        logger.add_scalar("KL", kl, global_steps)
        logger.add_scalar("Entropy", entropy, global_steps)
        logger.add_scalar("p_loss", pi_loss, global_steps)
        logger.add_scalar("v_loss", v_loss, global_steps)
        logger.add_scalar("v_mean", v_mean, global_steps)
        # print(agent.clip_param,agent.max_kl)
        logger.add_scalar("clip_ration", agent.clip_param, global_steps)
        logger.add_scalar("max_kl", agent.max_kl, global_steps)
        if episode_count > 0:
            # Average return normalized per 1000 environment steps.
            ret_per_1000 = (ret_sum / steps_sum) * 1000
            logger.add_scalar("Return1000", ret_per_1000, global_steps)
            logger.log_info(
                f"Epoch [{epoch}] Steps {global_steps}: "
                f"return:({ret_per_1000:.1f}), sum:{ret_sum}, step_sum:{steps_sum}"
            )
        else:
            logger.log_info(f"Epoch [{epoch}] Steps {global_steps}: "
                            f"Goal is not reached in this epoch")
        # Periodic checkpoint from rank 0 only.
        if (epoch + 1) % 20 == 0 and rank == 0:
            if distributed:
                torch.save(model.module.state_dict(),
                           f'model/ppo/model{epoch+1}.pt')
            else:
                torch.save(model.state_dict(), f'model/ppo/model{epoch+1}.pt')
    print("finish statistics")
    # Final evaluation; Orion minimizes, hence the negated SPL.
    spl = evaluate_with_spl(model, rollout_storage)
    print('>>>>>>>>>>>>>>>>>>>>> Reporting...')
    report_results(
        [dict(name='validation_return', type='objective', value=-spl)])
    print(f"learner with pid ({os.getpid()}) finished job")
def main():
    """Train an A2C/PPO/ACKTR agent and report the mean evaluation return to Orion.

    Relies on module-level globals: `args` (parsed CLI options) and
    `num_updates` (total number of policy updates) — both defined elsewhere
    in this file.
    """
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")
    torch.set_num_threads(1)
    # Fixed seeds used to make the final evaluation episodes reproducible.
    with open(args.eval_env_seeds_file, 'r') as f:
        eval_env_seeds = json.load(f)
    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None
    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep)
        for i in range(args.num_processes)
    ]
    eval_dir = os.path.join(args.log_dir, "eval/")
    if not os.path.exists(eval_dir):
        os.makedirs(eval_dir)
    # Single evaluation environment with early resets enabled.
    eval_env = [
        make_env(args.env_name,
                 args.seed,
                 0,
                 eval_dir,
                 args.add_timestep,
                 early_resets=True)
    ]
    eval_env = DummyVecEnv(eval_env)
    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)
    # 1-D observation spaces (state vectors) get running normalization.
    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs, gamma=args.gamma)
    if len(envs.observation_space.shape) == 1:
        # Don't touch rewards for evaluation
        eval_env = VecNormalize(eval_env, ret=False)
        # set running filter to be the same
        eval_env.ob_rms = envs.ob_rms
    obs_shape = envs.observation_space.shape
    # Frame stacking multiplies the leading (channel) dimension.
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy)
    # NOTE(review): action_shape is computed but never used in this function.
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]
    if args.cuda:
        actor_critic.cuda()
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        # Shift stacked frames left and append the newest observation.
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, states = actor_critic.act(
                    rollouts.observations[step], rollouts.states[step],
                    rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            # Zero observations for finished episodes (dim()==4 means image obs).
            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(current_obs, states, action, action_log_prob,
                            value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.observations[-1],
                                                rollouts.states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            # Bundle the observation-normalization stats with the model, if any.
            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy, value_loss, action_loss))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass

    # Evaluate the trained policy on the fixed seeds; report the mean return
    # (Orion minimizes, so a maximizing metric is reported as-is here —
    # NOTE(review): other examples in this file negate the metric; confirm sign).
    validation_returns = evaluate_with_seeds(eval_env, actor_critic, args.cuda,
                                             eval_env_seeds)
    report_results([
        dict(name='validation_return',
             type='objective',
             value=np.mean(validation_returns))
    ])
def train(self):
    """Run the training loop until a stopping condition fires.

    Stopping conditions: maximum number of epochs reached, maximum training
    time elapsed, or early stopping requested by validation. Afterwards the
    best model is (re)loaded, final valid/test evaluations are run, and
    -recall@50 on the test set is reported to Orion.

    Returns:
        tuple: (valid_report, test_report) from the final evaluation runs.
    """
    if is_distributed():
        warn_once(
            "Distributed training outputs average-per-worker metrics during "
            "training, and may be slightly distorted. Validation/test are "
            "unadulterated."
        )
    opt = self.opt
    world = self.world
    with world:
        while True:
            # do one example / batch of examples
            world.parley()
            self.parleys += 1

            # get the total training examples done, compute epochs
            self._total_epochs = (
                self._preempted_epochs +
                num_workers() * self.world.get_total_epochs()
            )
            exs_per_epoch = self.world.num_examples()
            self._total_exs = int(np.round(self._total_epochs * exs_per_epoch))

            # and use the primary worker's timings for everything
            train_time, log_time, validate_time = sync_object((
                self.train_time.time(),
                self.log_time.time(),
                self.validate_time.time()
            ))

            # check counters and timers
            if self._total_epochs >= self.max_num_epochs:
                self.log()
                print('[ num_epochs completed:{} time elapsed:{}s ]'.format(
                    self.max_num_epochs, train_time))
                break
            if train_time > self.max_train_time:
                print('[ max_train_time elapsed:{}s ]'.format(train_time))
                break
            if log_time > self.log_every_n_secs:
                self.log()
            # Validate on a time interval OR an epoch interval, whichever hits first.
            if (
                validate_time > self.val_every_n_secs or
                self._total_epochs - self.last_valid_epoch >= self.val_every_n_epochs
            ):
                stop_training = self.validate()
                self.last_valid_epoch = self._total_epochs
                if stop_training:
                    break
            # Periodic checkpointing is done by the primary worker only.
            if (
                self.save_time.time() > self.save_every_n_secs and
                opt.get('model_file') and
                is_primary_worker()
            ):
                print("[ saving model checkpoint: {}.checkpoint".format(
                    opt['model_file']
                ))
                self.save_model('.checkpoint')
                self.save_time.reset()

    if not self.saved and is_primary_worker():
        # save agent
        self.save_model()
    elif opt.get('model_file'):
        # reload best validation model
        self.agent = create_agent(opt)

    # Final evaluation on the valid and test splits.
    valid_world = _maybe_load_eval_world(self.agent, opt, 'valid')
    max_exs = opt['validation_max_exs'] if opt.get('short_final_eval') else -1
    v_report = run_eval(valid_world, opt, 'valid', max_exs, write_log=True)
    test_world = _maybe_load_eval_world(self.agent, opt, 'test')
    t_report = run_eval(test_world, opt, 'test', max_exs, write_log=True)

    # Negated because Orion minimizes the objective.
    from orion.client import report_results
    report_results([dict(
        name='-recall@50',
        type='objective',
        value=-t_report['recall@50']
    )])

    if valid_world:
        valid_world.shutdown()
    if test_world:
        test_world.shutdown()

    return v_report, t_report
def main():
    """Train a CNN on MNIST and report the final test error rate to Orion.

    Depends on `Net`, `train` and `test` defined elsewhere in this file.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size',
                        type=int,
                        default=64,
                        metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=10,
                        metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum',
                        type=float,
                        default=0.5,
                        metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model',
                        action='store_true',
                        default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(datasets.MNIST(
        '../data',
        train=True,
        download=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])),
        batch_size=args.batch_size,
        shuffle=True,
        **kwargs)
    test_loader = torch.utils.data.DataLoader(datasets.MNIST(
        '../data',
        train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])),
        batch_size=args.test_batch_size,
        shuffle=True,
        **kwargs)

    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum)

    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test_error_rate = test(args, model, device, test_loader)

    # Report the last epoch's test error; Orion minimizes this value.
    report_results([
        dict(name='test_error_rate', type='objective', value=test_error_rate)
    ])

    if (args.save_model):
        torch.save(model.state_dict(), "mnist_cnn.pt")
#!/usr/bin/env python import argparse from orion.client import report_results def sphere_func_2d(x, y): return x * x + y * y if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('-x', help='the x coordinate', type=float) parser.add_argument('-y', help='the y coordinate', type=float) args = parser.parse_args() loss = sphere_func_2d(args.x, args.y) report_results( [dict(name='test_error_rate', type='objective', value=loss)])
def main():
    """Train a single-session LFADS model and report the best validation loss to Orion.

    Uses a module-level `parser` for CLI arguments; reads the dataset from
    `args.data_path`, writes checkpoints/figures under `args.output_dir`.
    """
    args = parser.parse_args()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    hyperparams = load_parameters(args.hyperparameter_path)
    # Optional CLI overrides of the loaded hyperparameters.
    if args.lr:
        hyperparams['optimizer']['lr_init'] = args.lr
        # Minimum LR is kept three orders of magnitude below the initial LR.
        hyperparams['scheduler']['lr_min'] = args.lr * 1e-3
    if args.patience:
        hyperparams['scheduler']['scheduler_patience'] = args.patience
    if args.weight_schedule_dur:
        hyperparams['objective']['kl'][
            'weight_schedule_dur'] = args.weight_schedule_dur
        hyperparams['objective']['l2'][
            'weight_schedule_dur'] = args.weight_schedule_dur
    if args.kl_max:
        hyperparams['objective']['kl']['max'] = args.kl_max

    data_name = args.data_path.split('/')[-1]
    model_name = hyperparams['model_name']
    # Build a compact run name out of the model's *size* hyperparameters,
    # e.g. 'factor' -> 'fact<value>'.
    mhp_list = [
        key.replace('size', '').replace('_', '')[:4] + str(val)
        for key, val in hyperparams['model'].items() if 'size' in key
    ]
    mhp_list.sort()
    hyperparams['run_name'] = '_'.join(mhp_list) + '_retest'
    save_loc = '%s/%s/%s/%s/' % (args.output_dir, data_name, model_name,
                                 hyperparams['run_name'])
    if not os.path.exists(save_loc):
        os.makedirs(save_loc)

    # Load train/valid tensors; data_dict keys follow 'train_<suffix>' naming.
    data_dict = read_data(args.data_path)
    train_data = torch.Tensor(data_dict['train_%s' % args.data_suffix]).to(device)
    valid_data = torch.Tensor(data_dict['valid_%s' % args.data_suffix]).to(device)

    num_trials, num_steps, input_size = train_data.shape

    train_ds = torch.utils.data.TensorDataset(train_data)
    valid_ds = torch.utils.data.TensorDataset(valid_data)
    train_dl = torch.utils.data.DataLoader(train_ds,
                                           batch_size=args.batch_size,
                                           shuffle=True)
    # Validation is evaluated in a single full-size batch.
    valid_dl = torch.utils.data.DataLoader(valid_ds,
                                           batch_size=valid_data.shape[0])

    transforms = trf.Compose([])

    loglikelihood = LogLikelihoodPoisson(dt=float(data_dict['dt']))

    objective = LFADS_Loss(
        loglikelihood=loglikelihood,
        loss_weight_dict={
            'kl': hyperparams['objective']['kl'],
            'l2': hyperparams['objective']['l2']
        },
        l2_con_scale=hyperparams['objective']['l2_con_scale'],
        l2_gen_scale=hyperparams['objective']['l2_gen_scale']).to(device)

    model = LFADS_SingleSession_Net(
        input_size=input_size,
        factor_size=hyperparams['model']['factor_size'],
        g_encoder_size=hyperparams['model']['g_encoder_size'],
        c_encoder_size=hyperparams['model']['c_encoder_size'],
        g_latent_size=hyperparams['model']['g_latent_size'],
        u_latent_size=hyperparams['model']['u_latent_size'],
        controller_size=hyperparams['model']['controller_size'],
        generator_size=hyperparams['model']['generator_size'],
        prior=hyperparams['model']['prior'],
        clip_val=hyperparams['model']['clip_val'],
        dropout=hyperparams['model']['dropout'],
        do_normalize_factors=hyperparams['model']['normalize_factors'],
        max_norm=hyperparams['model']['max_norm'],
        device=device).to(device)

    # Print a per-parameter summary and the total parameter count.
    total_params = 0
    for ix, (name, param) in enumerate(model.named_parameters()):
        print(ix, name, list(param.shape), param.numel(), param.requires_grad)
        total_params += param.numel()
    print('Total parameters: %i' % total_params)

    optimizer = opt.Adam(model.parameters(),
                         lr=hyperparams['optimizer']['lr_init'],
                         betas=hyperparams['optimizer']['betas'],
                         eps=hyperparams['optimizer']['eps'])

    scheduler = LFADS_Scheduler(
        optimizer=optimizer,
        mode='min',
        factor=hyperparams['scheduler']['scheduler_factor'],
        patience=hyperparams['scheduler']['scheduler_patience'],
        verbose=True,
        threshold=1e-4,
        threshold_mode='abs',
        cooldown=hyperparams['scheduler']['scheduler_cooldown'],
        min_lr=hyperparams['scheduler']['lr_min'])

    # NOTE(review): torch._np is a private alias of numpy inside torch;
    # prefer `np.arange` (numpy) — confirm numpy is imported in this file.
    TIME = torch._np.arange(0, num_steps * data_dict['dt'], data_dict['dt'])

    # Ground-truth rates/latents are included in the plots when available.
    train_truth = {}
    if 'train_rates' in data_dict.keys():
        train_truth['rates'] = data_dict['train_rates']
    if 'train_latent' in data_dict.keys():
        train_truth['latent'] = data_dict['train_latent']

    valid_truth = {}
    if 'valid_rates' in data_dict.keys():
        valid_truth['rates'] = data_dict['valid_rates']
    if 'valid_latent' in data_dict.keys():
        valid_truth['latent'] = data_dict['valid_latent']

    plotter = {
        'train': Plotter(time=TIME, truth=train_truth),
        'valid': Plotter(time=TIME, truth=valid_truth)
    }

    if args.use_tensorboard:
        import importlib
        # NOTE(review): find_spec needs importlib.util to be importable;
        # consider `import importlib.util` explicitly.
        if importlib.util.find_spec('torch.utils.tensorboard'):
            tb_folder = save_loc + 'tensorboard/'
            if not os.path.exists(tb_folder):
                os.mkdir(tb_folder)
            # NOTE(review): the os.path.exists() check here is redundant —
            # this branch is only reached when the folder already exists.
            elif os.path.exists(tb_folder) and args.restart:
                os.system('rm -rf %s' % tb_folder)
                os.mkdir(tb_folder)
            from torch.utils.tensorboard import SummaryWriter
            writer = SummaryWriter(tb_folder)
            rm_plotter = plotter
        else:
            writer = None
            rm_plotter = None
    else:
        writer = None
        rm_plotter = None

    run_manager = RunManager(model=model,
                             objective=objective,
                             optimizer=optimizer,
                             scheduler=scheduler,
                             train_dl=train_dl,
                             valid_dl=valid_dl,
                             transforms=transforms,
                             writer=writer,
                             plotter=rm_plotter,
                             max_epochs=args.max_epochs,
                             save_loc=save_loc,
                             do_health_check=args.do_health_check)

    run_manager.run()

    # Report the best validation loss reached during the run to Orion.
    report_results(
        [dict(name='valid_loss', type='objective', value=run_manager.best)])

    # Regenerate summary figures from scratch for the validation set.
    fig_folder = save_loc + 'figs/'
    if os.path.exists(fig_folder):
        os.system('rm -rf %s' % fig_folder)
    os.mkdir(fig_folder)

    from matplotlib.figure import Figure
    import matplotlib
    matplotlib.use('Agg')
    fig_dict = plotter['valid'].plot_summary(model=run_manager.model,
                                             dl=run_manager.valid_dl)
    for k, v in fig_dict.items():
        if type(v) == Figure:
            v.savefig(fig_folder + k + '.svg')
# gin.config.register_finalize_hook( # lambda config: config[('', 'src.train.train')].update({'device': torch.device(config[('', 'src.train.train')].get('device','cpu'))})) gin.parse_config_files_and_bindings(args.config, args.gin_param) print(gin.operative_config_str()) errors = [] for random_seed in range(5): if args.savedir: os.makedirs(args.savedir, exist_ok=True) seed_savedir = f'{args.savedir}/{random_seed}' else: seed_savedir = None best_error = train(savedir=seed_savedir, random_seed=random_seed) errors.append(best_error) if args.aggregate_seeds == 'mean': objective = sum(errors) / len(errors) elif args.aggregate_seeds == 'min': objective = min(errors) print(f'{args.aggregate_seeds} error over seeds: {objective:2.2f}') report_results([ dict(name=f'{args.aggregate_seeds}_error_over_seeds', type='objective', value=objective) ])
def main():
    """Train a CIFAR-10 network and report the final test error rate to Orion.

    Depends on `Net`, `train`, `test` and `adjust_opt` defined elsewhere in
    this file. Writes per-epoch CSV logs and the latest checkpoint under
    ``args.save``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--batchSz', type=int, default=256)
    parser.add_argument('--nEpochs', type=int, default=100)
    parser.add_argument('--card', type=int, default=2)
    parser.add_argument('--no-cuda', action='store_true')
    parser.add_argument('--ica', type=float, default=1e-1)
    parser.add_argument('--ica-fc', type=float, default=0)
    parser.add_argument('--wd', type=float, default=1e-4)
    parser.add_argument('--save')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--opt', type=str, default='adam',
                        choices=('sgd', 'adam', 'rmsprop', 'sgdw'))
    args = parser.parse_args()

    # Pin the job to one GPU before torch initializes CUDA.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.card)
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    # NOTE(review): 'error.csv' looks like a file name but is used as a
    # directory below — confirm the intended default.
    args.save = args.save or 'error.csv'

    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    # Start each run from an empty output directory.
    if os.path.exists(args.save):
        shutil.rmtree(args.save)
    os.makedirs(args.save, exist_ok=True)

    # CIFAR-10 channel statistics for normalization.
    normMean = [0.49139968, 0.48215827, 0.44653124]
    normStd = [0.24703233, 0.24348505, 0.26158768]
    normTransform = transforms.Normalize(normMean, normStd)

    trainTransform = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normTransform
    ])
    testTransform = transforms.Compose([
        transforms.ToTensor(),
        normTransform
    ])

    kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {}
    trainLoader = DataLoader(
        dset.CIFAR10(root='./data', train=True, download=True,
                     transform=trainTransform),
        batch_size=args.batchSz, shuffle=True, **kwargs)
    testLoader = DataLoader(
        dset.CIFAR10(root='./data', train=False, download=True,
                     transform=testTransform),
        batch_size=args.batchSz, shuffle=False, **kwargs)

    # ICA penalties: one per conv block, plus two for the FC layers.
    net = Net([args.ica]*4 + [args.ica_fc]*2)

    print(' + Number of params: {}'.format(
        sum([p.data.nelement() for p in net.parameters()])))
    if args.cuda:
        net = net.cuda()
        #net = nn.DataParallel(net, device_ids=[0,1])

    if args.opt == 'sgd':
        optimizer = optim.SGD(net.parameters(), lr=1e-1,
                              momentum=0.9, weight_decay=1e-4)
    elif args.opt == 'adam':
        optimizer = optim.Adam(net.parameters(), weight_decay=args.wd)
    elif args.opt == 'rmsprop':
        optimizer = optim.RMSprop(net.parameters(), weight_decay=1e-4)
    else:
        # BUG FIX: '--opt sgdw' is an accepted argparse choice but had no
        # branch, leaving `optimizer` undefined and crashing later with a
        # NameError; fail fast with a clear message instead.
        raise ValueError(f'unsupported optimizer: {args.opt!r}')

    # Context managers guarantee the CSV logs are closed even on error.
    with open(os.path.join(args.save, 'train.csv'), 'w') as trainF, \
            open(os.path.join(args.save, 'test.csv'), 'w') as testF:
        for epoch in range(1, args.nEpochs + 1):
            adjust_opt(args.opt, optimizer, epoch)
            train(args, epoch, net, trainLoader, optimizer, trainF)
            test_error_rate = test(args, epoch, net, testLoader, optimizer, testF)
            torch.save(net, os.path.join(args.save, 'latest.pth'))

    # Report the last epoch's test error; Orion minimizes this value.
    report_results([dict(
        name='test_error_rate',
        type='objective',
        value=test_error_rate)])
assert NDCGs_1model != -1, "Orion's objective not evaluated" ######## ORION ######## # For Orion, print results (MongoDB,...) report_results([dict( name='NDCG with genres', type='objective', value=-NDCGs_1model), # dict( # name='valid_pred_error', # type='constraint', # value=pred_err), # dict( # name='valid_reconst_error', # type='constraint', # value=valid_err), # dict( # name='g', # type='constraint', # value=model.g.data.item()) ])
# will log to a file if provided if args.log is not None: handler = logging.handlers.WatchedFileHandler(args.log) formatter = logging.Formatter(logging.BASIC_FORMAT) handler.setFormatter(formatter) root = logging.getLogger() root.setLevel(logging.INFO) root.addHandler(handler) args.learning_rate = [float(lr) for lr in eval(args.learning_rate)] logger.info(args) val_loss = train(args.data_dir, args.csv_path, args.splits_path, args.output_dir, target=args.target, nb_epoch=args.epochs, learning_rate=args.learning_rate, batch_size=args.batch_size, dropout=args.dropout, optim=args.optim, min_patients_per_label=args.min_patients, seed=args.seed, model_type=args.model_type, architecture=args.arch, data_augmentation=args.data_augmentation, misc=args) report_results([dict(name='val_auc', type='objective', value=val_loss)])