def main():
    """Run a learning-curve experiment over several training-set sizes.

    For each size in [3000, 5000, 5600], performs 10 repetitions of
    process/fit/score, each under its own Comet.ml experiment, then writes
    the pooled metrics table to CSV and attaches it to the last experiment.
    """
    # BUG FIX: '%s' is a non-portable glibc extension (seconds since epoch);
    # '%S' is the intended, portable seconds-of-minute field.
    start_time = time.strftime('run_%Y_%m_%d_%H_%M_%S')
    all_metrics = []
    for ts_size in [3000, 5000, 5600]:
        for iteration in range(10):
            _, _, X_train, X_test, y_train, y_test, _ = process_data(
                size=ts_size)
            experiment = Experiment(api_key=os.environ['COMET_API_KEY'],
                                    project_name='color-ml')
            experiment.log_parameters(PARAMETERS_MEDIAN)
            with experiment.train():
                regressor_median = fit(X_train, y_train)
                metrics_dict = get_metrics_dict(regressor_median, X_test,
                                                y_test, experiment)
                metrics_dict['iteration'] = iteration
                metrics_dict['ts_size'] = ts_size
                all_metrics.append(metrics_dict)
    # NOTE(review): 'learningurve' is a misspelling of 'learningcurve', kept
    # as-is because downstream tooling may already expect this filename.
    outfile = 'learningurve_' + start_time + '.csv'
    pd.DataFrame(all_metrics).to_csv(outfile)
    # Attaches to the experiment created in the final loop iteration.
    experiment.log_asset(outfile)
def main(modelpath, xpath, ypath, outname):
    """Score a persisted model on saved arrays and log the result to Comet.ml.

    Args:
        modelpath: joblib file holding the trained model.
        xpath: .npy file with the feature matrix.
        ypath: .npy file with the targets.
        outname: destination CSV for the metrics table; also uploaded as an
            experiment asset.
    """
    experiment = Experiment(api_key=os.environ['COMET_API_KEY'],
                            project_name='color-ml')
    regressor = joblib.load(modelpath)
    features = np.load(xpath)
    targets = np.load(ypath)
    scores = get_metrics(regressor, features, targets, experiment)
    pd.DataFrame(scores).to_csv(outname, index=False)
    experiment.log_asset(outname)
def setup_comet_ml(args, rank):
    """Create (or resume) a Comet.ml experiment for this process.

    Returns a disabled dummy experiment when no API key is configured, so
    callers can use the return value unconditionally. Only rank 0 actually
    logs (other ranks get a disabled experiment).
    """
    # Dummy init of experiment so it can be used without error
    # even if comet is disabled.
    experiment = Experiment(api_key='dummy_key', disabled=True)
    if args.comet_api_key:
        # Kwargs shared by both the new and the resumed experiment.
        common_kwargs = dict(
            api_key=args.comet_api_key,
            workspace=args.comet_workspace,
            project_name=args.project_name,
            auto_output_logging="simple",
            auto_metric_logging=False,
            parse_args=False,
            disabled=args.disable_comet or rank != 0,
        )
        if args.existing_exp_key:
            if rank == 0:
                # BUG FIX: message typo "FROM AND EXISTING" -> "FROM AN EXISTING".
                print("STARTING FROM AN EXISTING EXPERIMENT")
            experiment = ExistingExperiment(
                previous_experiment=args.existing_exp_key, **common_kwargs)
        else:
            if rank == 0:
                print("STARTING A NEW EXPERIMENT")
            experiment = Experiment(**common_kwargs)
        experiment.log_asset('config.yaml')
        experiment.log_asset('config_prod.yaml')
        experiment.log_asset('config_prod_prime.yaml')
    return experiment
def log(self, experiment=None):
    '''
    Export all logs to the Comet.ml environment. See https://www.comet.ml/
    for more details.

    Args:
        experiment: an existing Comet Experiment to log into; when None a
            new one is created (project chosen by ``self.comet_optimize``).
    '''
    # Initialize Comet.ml experiment (naming, tags) for automatic logging.
    project_name = 'Optimization' if self.comet_optimize else 'Summary'
    experiment_name = '{} - {} '.format(self.model_name, str(self.batch_size)) \
        + ('ES+' if self.train_after_es else '')
    experiment_tags = [self.model_name, self.monitor_val] \
        + (['ES+'] if self.train_after_es else []) \
        + (['Pre-train'] if self.pretraining else [])
    # BUG FIX: identity-compare against None ('is None'), not '== None'.
    if experiment is None:
        # SECURITY NOTE(review): hardcoded API key committed to source; it
        # should be read from an environment variable or config instead.
        experiment = Experiment(api_key='cSZq9kuH2I87ezvm2dEWTx6op',
                                project_name=project_name,
                                log_code=False,
                                auto_param_logging=False,
                                auto_metric_logging=False)
    experiment.set_name(experiment_name)
    experiment.add_tags(experiment_tags)

    # Export hyperparameters.
    experiment.log_parameters(self.dataloader_params)
    experiment.log_parameters(self.training_params)

    # Export metrics values.
    experiment.log_metrics({
        'Average accuracy': np.mean(self.test_score['accuracy']),
        'Std accuracy': np.std(self.test_score['accuracy']),
    })

    # Export metrics graphs for each pilot (confusion matrix, then losses)
    # — plain loops instead of list comprehensions used for side effects.
    for pilot_idx in range(1, self.n_pilots + 1):
        experiment.log_figure(
            figure_name='Confusion matrix {}'.format(pilot_idx),
            figure=plot_cm(self.conf_matrices, pilot_idx))
    for pilot_idx in range(1, self.n_pilots + 1):
        experiment.log_figure(
            figure_name='Loss pilot {}'.format(pilot_idx),
            figure=plot_loss(self.histories[pilot_idx - 1], pilot_idx))

    fig, ax = plt.subplots(figsize=(10, 6))
    plot_full_barchart(self.test_score, n_pilots=self.n_pilots,
                       title=' {} ConvNet model'.format(self.model_name),
                       fig=fig)
    experiment.log_figure(figure_name='Accuracy barchart', figure=fig)

    if self.train_after_es:
        for pilot_idx in range(1, self.n_pilots + 1):
            experiment.log_figure(
                figure_name='Loss pilot {} (ES+)'.format(pilot_idx),
                figure=plot_loss(self.histories_es[pilot_idx - 1], pilot_idx))

    # Export model weights for each pilot.
    for pilot_idx in range(1, self.n_pilots + 1):
        experiment.log_asset('{}{}.h5'.format(self.weights_savename_prefix,
                                              pilot_idx))
    experiment.end()
# NOTE(review): fragment of a larger training script — `model`, `experiment`
# and `save_dir` are defined earlier, outside this view.
print("Train Ensemble")
model.ensemble(experiment=experiment)
# Final score: be absolutely sure you get all the data, feed slowly in batches of 1.
final_score = model.ensemble_model.evaluate(model.val_split.unbatch().batch(1))
# final_score[1] is presumably the accuracy entry of evaluate()'s
# [loss, metric, ...] output — TODO confirm the compiled metric order.
experiment.log_metric("Ensemble Accuracy", final_score[1])
# Save model (figure export currently disabled).
# tf.keras.utils.plot_model(model.ensemble_model, to_file="{}/Ensemble.png".format(save_dir))
# experiment.log_figure("{}/Ensemble.png".format(save_dir))
model.ensemble_model.save("{}/Ensemble.h5".format(save_dir))
# Save predictions as a shapefile and upload every sidecar file of the
# shapefile set (.shp/.dbf/.shx/.cpg) as Comet assets.
predicted_shp = model.predict(model = model.ensemble_model)
predicted_shp.to_file("{}/prediction.shp".format(save_dir))
experiment.log_asset("{}/prediction.shp".format(save_dir))
experiment.log_asset("{}/prediction.dbf".format(save_dir))
experiment.log_asset("{}/prediction.shx".format(save_dir))
experiment.log_asset("{}/prediction.cpg".format(save_dir))
# Per-species accuracy: fraction of rows whose predicted taxon matches truth.
predicted_shp["match"] = predicted_shp.apply(lambda x: x.true_taxonID == x.predicted_taxonID, 1)
per_species = predicted_shp.groupby("true_taxonID").apply(lambda x: x["match"].sum()/len(x))
per_species.to_csv("{}/perspecies.csv".format(save_dir))
experiment.log_asset("{}/perspecies.csv".format(save_dir))
# Per-site accuracy, same construction.
per_site = predicted_shp.groupby("siteID").apply(lambda x: x["match"].sum()/len(x))
per_site.to_csv("{}/persite.csv".format(save_dir))
experiment.log_asset("{}/persite.csv".format(save_dir))
# Plots - this function needs to be rewritten because the dataset is now nested: ids, (data, label). probably predict on batch.
# experiment.log_metric("train_loss", np.mean(train_loss), step=epoch) experiment.log_metric("train_perplexity", train_ppl, step=epoch) # RUN MODEL ON VALIDATION DATA val_ppl, val_loss = run_epoch(model, valid_data) # experiment.log_metric("val_loss", np.mean(val_loss), step=epoch) experiment.log_metric("val_perplexity", val_ppl, step=epoch) # SAVE MODEL IF IT'S THE BEST SO FAR if val_ppl < best_val_so_far: best_val_so_far = val_ppl if args.save_best: print("Saving model parameters to best_params.pt") best_model_path = os.path.join(args.save_dir, 'best_params.pt') torch.save(model.state_dict(), best_model_path) experiment.log_asset(best_model_path, overwrite=True) # NOTE ============================================== # You will need to load these parameters into the same model # for a couple Problems: so that you can compute the gradient # of the loss w.r.t. hidden state as required in Problem 5.2 # and to sample from the the model as required in Problem 5.3 # We are not asking you to run on the test data, but if you # want to look at test performance you would load the saved # model and run on the test data with batch_size=1 # LOC RESULTS train_ppls.append(train_ppl) val_ppls.append(val_ppl) train_losses.extend(train_loss) val_losses.extend(val_loss) times.append(time.time() - t0)
def train(opt):
    """Train a Deep Q-Network to play Flappy Compass.

    Runs ``opt.num_iters`` steps of epsilon-greedy play with experience
    replay, optionally logging metrics/checkpoints to Comet.ml
    (``opt.log_comet_ml``). Checkpoints are written every
    ``opt.iters_to_save`` iterations and once more at the end.
    """
    # Set random seed (CUDA seed when a GPU is available, CPU seed otherwise).
    if torch.cuda.is_available():
        torch.cuda.manual_seed(opt.random_seed)
    else:
        torch.manual_seed(opt.random_seed)
    # Instantiate the model: use the custom architecture only when every
    # architecture option was supplied, otherwise the defaults.
    if opt.conv_dim is not None and \
       opt.conv_kernel_sizes is not None and \
       opt.conv_strides is not None and \
       opt.fc_dim is not None:
        model = DeepQNetwork(opt.image_size, opt.image_size,
                             conv_dim=opt.conv_dim,
                             conv_kernel_sizes=opt.conv_kernel_sizes,
                             conv_strides=opt.conv_strides,
                             fc_dim=opt.fc_dim)
    else:
        model = DeepQNetwork(opt.image_size, opt.image_size)
    if opt.log_comet_ml:
        # Create a Comet.ml experiment.
        experiment = Experiment(api_key=opt.comet_ml_api_key,
                                project_name=opt.comet_ml_project_name,
                                workspace=opt.comet_ml_workspace)
        experiment.log_other("iters_to_save", opt.iters_to_save)
        # "completed" flips to True only after the loop finishes cleanly.
        experiment.log_other("completed", False)
        experiment.log_other("random_seed", opt.random_seed)
        # Report hyperparameters to Comet.ml.
        hyper_params = {
            "image_size": opt.image_size,
            "batch_size": opt.batch_size,
            "optimizer": opt.optimizer,
            "learning_rate": opt.lr,
            "gamma": opt.gamma,
            "initial_epsilon": opt.initial_epsilon,
            "final_epsilon": opt.final_epsilon,
            "num_iters": opt.num_iters,
            "replay_memory_size": opt.replay_memory_size,
            "random_seed": opt.random_seed,
            "conv_dim": opt.conv_dim,
            "conv_kernel_sizes": opt.conv_kernel_sizes,
            "conv_strides": opt.conv_strides,
            "fc_dim": opt.fc_dim
        }
        experiment.log_parameters(hyper_params)
    # NOTE(review): the optimizer uses a hard-coded lr=1e-6 while opt.lr is
    # what gets logged above — confirm this mismatch is intentional.
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=1e-6)  # Optimization algorithm
    criterion = nn.MSELoss()  # Loss function
    game_state = FlappyBird()  # Instantiate the Flappy Compass game
    image, reward, terminal = game_state.next_frame(
        0
    )  # Get the next image, along with its reward and an indication if it's a terminal state
    # Image preprocessing step (scaling, color removal and conversion to a
    # PyTorch tensor).
    image = pre_processing(
        image[:game_state.screen_width, :int(game_state.base_y)],
        opt.image_size, opt.image_size)
    image = torch.from_numpy(image)
    # Move the model and the current image data to the GPU, if available.
    if torch.cuda.is_available():
        model.cuda()
        image = image.cuda()
    # Prepare the state variable, which will host the last 4 frames.
    state = torch.cat(tuple(image for _ in range(4)))[None, :, :, :]
    # Initialize the replay memory, which saves sets of consecutive game
    # states, the reward and terminal state indicator so that the model can
    # learn from them (essentially constitutes the training data, which grows
    # with every new iteration).
    replay_memory = []
    # NOTE(review): `iter` shadows the builtin of the same name inside this
    # function.
    iter = 0  # Iteration counter
    # Main training loop performing the number of iterations specified by
    # num_iters.
    while iter < opt.num_iters:
        prediction = model(state)[0]  # Prediction from the current state
        # Linear decay of the probability of random actions (exploration).
        epsilon = opt.final_epsilon + (
            (opt.num_iters - iter) *
            (opt.initial_epsilon - opt.final_epsilon) / opt.num_iters)
        u = random()
        random_action = u <= epsilon
        if random_action:
            print("Perform a random action")
            action = randint(0, 1)
        else:
            # Use the model's prediction to decide the next action.
            action = torch.argmax(prediction).item()
        # Get a new frame and process it.
        next_image, reward, terminal = game_state.next_frame(action)
        next_image = pre_processing(
            next_image[:game_state.screen_width, :int(game_state.base_y)],
            opt.image_size, opt.image_size)
        next_image = torch.from_numpy(next_image)
        # Move the next image data to the GPU, if available.
        if torch.cuda.is_available():
            next_image = next_image.cuda()
        # Next state: drop the oldest of the 4 frames and append the new one.
        next_state = torch.cat(
            (state[0, 1:, :, :], next_image))[None, :, :, :]
        # Save the current state, action, next state and terminal state
        # indicator in the replay memory.
        replay_memory.append([state, action, reward, next_state, terminal])
        if len(replay_memory) > opt.replay_memory_size:
            # Delete the oldest replay from memory once full capacity is
            # reached.
            del replay_memory[0]
        # Retrieve past play sequences from the replay memory.
        batch = sample(replay_memory,
                       min(len(replay_memory), opt.batch_size))
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = zip(
            *batch)
        state_batch = torch.cat(tuple(
            state for state in state_batch))  # States of the current batch
        # One-hot encode the actions taken in the current batch.
        action_batch = torch.from_numpy(
            np.array([[1, 0] if action == 0 else [0, 1]
                      for action in action_batch],
                     dtype=np.float32))
        reward_batch = torch.from_numpy(
            np.array(reward_batch,
                     dtype=np.float32)[:, None])  # Rewards in the batch
        next_state_batch = torch.cat(tuple(
            state for state in next_state_batch))  # Next states of the batch
        # Move batch data to the GPU, if available.
        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()
        current_prediction_batch = model(
            state_batch)  # Predictions for the replays of the current batch
        next_prediction_batch = model(
            next_state_batch)  # Predictions for the following states
        # Ground truth for the rewards of the current batch: just the reward
        # for terminal states, else the discounted Bellman target.
        y_batch = torch.cat(
            tuple(reward if terminal else reward +
                  opt.gamma * torch.max(prediction)
                  for reward, terminal, prediction in zip(
                      reward_batch, terminal_batch, next_prediction_batch)))
        q_value = torch.sum(
            current_prediction_batch * action_batch,
            dim=1)  # Predicted Q values (estimated return for each action)
        optimizer.zero_grad()  # Reset gradients before a new step
        loss = criterion(q_value, y_batch)  # Calculate the loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Weights optimization step
        state = next_state  # Move to the next frame
        iter += 1
        # NOTE(review): `iter` was already incremented, so this prints one
        # more than the iteration just executed — confirm intended numbering.
        print(
            "Iteration: {}/{}, Action: {}, Loss: {}, Epsilon {}, Reward: {}, Q-value: {}"
            .format(iter + 1, opt.num_iters, action, loss, epsilon, reward,
                    torch.max(prediction)))
        if opt.log_comet_ml:
            # Log metrics to Comet.ml.
            experiment.log_metric("train_loss", loss, step=iter)
            experiment.log_metric("train_epsilon", epsilon, step=iter)
            experiment.log_metric("train_reward", reward, step=iter)
            experiment.log_metric("train_Q_value",
                                  torch.max(prediction),
                                  step=iter)
        if (iter + 1) % opt.iters_to_save == 0:
            # Get the current day and time to attach to the saved model's
            # name.
            current_datetime = datetime.now().strftime('%d_%m_%Y_%H_%M')
            # Set saved model name.
            model_filename = f'{opt.saved_path}/flappy_compass_{current_datetime}_{iter+1}.pth'
            # Save model every iters_to_save iterations.
            torch.save(model, model_filename)
            if opt.log_comet_ml and opt.comet_ml_save_model:
                # Upload model to Comet.ml.
                experiment.log_asset(file_path=model_filename, overwrite=True)
    # Get the current day and time to attach to the saved model's name.
    current_datetime = datetime.now().strftime('%d_%m_%Y_%H_%M')
    # Set saved model name.
    model_filename = f'{opt.saved_path}/flappy_compass_{current_datetime}_{iter+1}.pth'
    # Save the model after reaching the final iteration.
    torch.save(model, model_filename)
    if opt.log_comet_ml:
        # Only report that the experiment completed successfully if it
        # finished the training without errors.
        experiment.log_other("completed", True)
        if opt.comet_ml_save_model:
            # Upload model to Comet.ml.
            experiment.log_asset(file_path=model_filename, overwrite=True)
    # NOTE(review): fragment — the string/paren below close a print(...)
    # banner started before this view; `current_epoch`, `net`, `optimizer`,
    # `experiment`, the MODEL_SAVE_* constants and `val_loader_pacnn` are
    # defined elsewhere.
        "=================================================================")
    # Periodically checkpoint model + optimizer state and attach it to Comet.
    if current_epoch % MODEL_SAVE_INTERVAL == 0:
        current_save_model_name = save_checkpoint(
            {
                'model': net.state_dict(),
                'optimizer': optimizer.state_dict(),
                'e': current_epoch,
                'PACNN_PERSPECTIVE_AWARE_MODEL': PACNN_PERSPECTIVE_AWARE_MODEL
                # 'amp': amp.state_dict()
            }, False, MODEL_SAVE_NAME + "_" + str(current_epoch) + "_")
        experiment.log_asset(current_save_model_name)
        print("saved ", current_save_model_name)
    # end 1 epoch
    # After each epoch, evaluate: one MAE accumulator per density-map output
    # (d1/d2/d3) plus one for the final (presumably fused) estimate.
    mae_calculator_d1 = MAECalculator()
    mae_calculator_d2 = MAECalculator()
    mae_calculator_d3 = MAECalculator()
    mae_calculator_final = MAECalculator()
    with torch.no_grad():
        for val_img, label in val_loader_pacnn:
            net.eval()
            # load data: unpack the three ground-truth density maps.
            d1_label, d2_label, d3_label = label
    # NOTE(review): fragment of a CLI entry script — `parser` (argparse) and
    # `comet_exp` are created before this view; the trailing `else:` branch
    # continues past it.
    help="Path to the config file.",
)
parser.add_argument("--output_path", type=str, default=".", help="outputs path")
parser.add_argument("--resume", action="store_true")
parser.add_argument("--trainer", type=str, default="MUNIT", help="MUNIT|UNIT")
parser.add_argument("--git_hash", type=str, default="no-git-hash",
                    help="output of git log --pretty=format:'%h' -n 1")
opts = parser.parse_args()

# Record the run's config file and git revision on Comet, when logging is on.
if comet_exp is not None:
    comet_exp.log_asset(file_data=opts.config, file_name="config.yaml")
    comet_exp.log_parameter("git_hash", opts.git_hash)

cudnn.benchmark = True

# Load experiment setting.
config = get_config(opts.config)
max_iter = config["max_iter"]
display_size = config["display_size"]
config["vgg_model_path"] = opts.output_path

# Setup model and data loader.
if opts.trainer == "MUNIT":
    trainer = MUNIT_Trainer(config)
elif opts.trainer == "UNIT":
    trainer = UNIT_Trainer(config)
else:
def main(cfg: DictConfig):
    """Train a model for the Nishika second-hand apartment price competition.

    Loads (or unpickles) the data, preprocesses it, fits the configured model
    (LightGBM or CatBoost) under cross-validation and logs everything to a
    Comet.ml experiment.
    """
    print('Nishika Second-hand Apartment Price Training')
    cur_dir = hydra.utils.get_original_cwd()
    os.chdir(cur_dir)
    data_dir = './input'
    seed_everything(cfg.data.seed)

    experiment = Experiment(api_key=cfg.exp.api_key,
                            project_name=cfg.exp.project_name,
                            auto_output_logging='simple',
                            auto_metric_logging=False)
    experiment.log_parameters(dict(cfg.data))

    # Config ###################################################################
    del_tar_col = ['取引時点']
    id_col = 'ID'
    tar_col = '取引価格(総額)_log'
    g_col = 'year'
    criterion = MAE
    cv = KFold(n_splits=cfg.data.n_splits, shuffle=True,
               random_state=cfg.data.seed)
    # cv = GroupKFold(n_splits=5)

    # Load Data ################################################################
    if cfg.exp.use_pickle:
        # Load from pickle cache.
        df = unpickle('./input/data.pkl')
    else:
        df = load_data(data_dir, sampling=cfg.data.sampling,
                       seed=cfg.data.seed, id_col=id_col, target_col=tar_col)
        # Preprocessing
        print('Preprocessing')
        df = preprocessing(df, cfg)
        # Cache the preprocessed frame as a pickle.
        to_pickle('./input/data.pkl', df)
        try:
            experiment.log_asset(file_data='./input/data.pkl',
                                 file_name='data.pkl')
        except Exception:
            # Best-effort upload only — a failed asset upload must not abort
            # training. (Was a bare `except:`, which also swallowed
            # SystemExit/KeyboardInterrupt.)
            pass

    features = [c for c in df.columns if c not in del_tar_col]

    # Model ####################################################################
    model = None
    if cfg.exp.model == 'lgb':
        model = LGBMModel(dict(cfg.lgb))
    elif cfg.exp.model == 'cat':
        model = CatBoostModel(dict(cfg.cat))

    # Train & Predict ##########################################################
    trainer = Trainer(model, id_col, tar_col, g_col, features, cv, criterion,
                      experiment)
    trainer.fit(df)
    trainer.predict(df)
    trainer.get_feature_importance()
def main(cfg: DictConfig):
    """Melanoma-classification training entry point (Hydra-configured).

    Builds GroupKFold folds by patient, drops a fixed list of bad images,
    trains a PyTorch-Lightning system with TTA-style repeated test passes,
    and logs metrics/checkpoints/submissions to a Comet.ml experiment.
    """
    cur_dir = hydra.utils.get_original_cwd()
    os.chdir(cur_dir)
    seed_everything(cfg.train.seed)
    # Comet.ml — API_KEY / PROJECT_NAME are module-level constants defined
    # outside this view.
    experiment = Experiment(api_key=API_KEY, project_name=PROJECT_NAME)

    # Load Data ################################################################
    # Chris Dataset (pre-resized images; size picked by cfg.data.load_size).
    chris_image_size = cfg.data.load_size
    data_dir = f'./input/_Chris_Dataset_{chris_image_size}'
    train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    test = pd.read_csv(os.path.join(data_dir, 'test.csv'))
    img_paths = {
        'train': glob.glob(os.path.join(data_dir, 'train', '*.jpg')),
        'test': glob.glob(os.path.join(data_dir, 'test', '*.jpg'))
    }

    # Cross Validation #########################################################
    # GroupKFold by patient so one patient never spans train and validation.
    cv = GroupKFold(n_splits=5)
    train['fold'] = -1
    for i, (trn_idx, val_idx) in enumerate(
            cv.split(train, train['target'],
                     groups=train['patient_id'].tolist())):
        train.loc[val_idx, 'fold'] = i

    # Preprocessing ############################################################
    # Drop Image — hand-curated list of images excluded from training.
    drop_image_name = [
        'ISIC_4579531', 'ISIC_7918608', 'ISIC_0948240', 'ISIC_4904364',
        'ISIC_8780369', 'ISIC_8770180', 'ISIC_7148656', 'ISIC_7408392',
        'ISIC_9959813', 'ISIC_1894141', 'ISIC_6633174', 'ISIC_3001941',
        'ISIC_4259290', 'ISIC_6833905', 'ISIC_7452152', 'ISIC_2744859',
        'ISIC_5464206', 'ISIC_6596403', 'ISIC_0711790', 'ISIC_5644568',
        'ISIC_5843094', 'ISIC_8904326', 'ISIC_4963405', 'ISIC_9839042',
        'ISIC_1355907', 'ISIC_0694037', 'ISIC_9513918', 'ISIC_0787851',
        'ISIC_2932886', 'ISIC_2336763', 'ISIC_4064330', 'ISIC_7358293',
        'ISIC_5789052', 'ISIC_7828320', 'ISIC_8277969', 'ISIC_1080647',
        'ISIC_3238159', 'ISIC_8480913', 'ISIC_3790692', 'ISIC_0612624',
        'ISIC_1242543', 'ISIC_4036915', 'ISIC_8174647', 'ISIC_2956783',
        'ISIC_3302289', 'ISIC_6761105', 'ISIC_2152755', 'ISIC_9169000',
        'ISIC_6852275', 'ISIC_4432898', 'ISIC_5459207', 'ISIC_7418664',
        'ISIC_5136612', 'ISIC_9174738', 'ISIC_3160301', 'ISIC_7140636',
        'ISIC_7718384', 'ISIC_9336675', 'ISIC_4282719', 'ISIC_4330005',
        'ISIC_9828463', 'ISIC_6511141', 'ISIC_5335139', 'ISIC_5104921',
        'ISIC_0695575', 'ISIC_0610141', 'ISIC_5946998', 'ISIC_0464315',
        'ISIC_6556513', 'ISIC_3688407', 'ISIC_7730443', 'ISIC_4358550',
        'ISIC_6461484', 'ISIC_9690422', 'ISIC_5374076', 'ISIC_1793200',
        'ISIC_1389620', 'ISIC_8098274', 'ISIC_6425888', 'ISIC_6321076',
        'ISIC_4298309', 'ISIC_2981912', 'ISIC_3650938', 'ISIC_4288522',
        'ISIC_9459785', 'ISIC_1938535', 'ISIC_5576241', 'ISIC_6567889',
        'ISIC_2768800', 'ISIC_6023795', 'ISIC_9281339', 'ISIC_6712494',
        'ISIC_1811256', 'ISIC_5157055', 'ISIC_3943097', 'ISIC_7194471',
        'ISIC_0361529', 'ISIC_9797578', 'ISIC_3575926', 'ISIC_6166824',
        'ISIC_8828670', 'ISIC_6953126', 'ISIC_4430815', 'ISIC_8146054',
        'ISIC_9305209', 'ISIC_4263017', 'ISIC_9314144', 'ISIC_1330763',
        'ISIC_4792936', 'ISIC_1823608', 'ISIC_4910683', 'ISIC_9360142',
        'ISIC_2863809', 'ISIC_4748668', 'ISIC_5681315', 'ISIC_3202829',
        'ISIC_3450978', 'ISIC_9704624', 'ISIC_4350914', 'ISIC_3587744',
        'ISIC_8190321', 'ISIC_1766413', 'ISIC_2872769', 'ISIC_3186625',
        'ISIC_0170059', 'ISIC_4858099', 'ISIC_0314462', 'ISIC_2811886',
        'ISIC_2140099', 'ISIC_9514450', 'ISIC_1195354', 'ISIC_8325872',
        'ISIC_0227038', 'ISIC_6342641', 'ISIC_4162828', 'ISIC_7597293',
        'ISIC_5278307', 'ISIC_3774190', 'ISIC_2957196', 'ISIC_4443545',
        'ISIC_3455136', 'ISIC_0610499', 'ISIC_8483008', 'ISIC_0243683',
        'ISIC_9028131', 'ISIC_8507102', 'ISIC_7128535', 'ISIC_4085552',
        'ISIC_2940763', 'ISIC_1219894', 'ISIC_1043313', 'ISIC_6587979',
        'ISIC_7050773', 'ISIC_3230164', 'ISIC_5159557', 'ISIC_7854457',
        'ISIC_2582493', 'ISIC_5161114', 'ISIC_5238910', 'ISIC_6515221',
        'ISIC_7771339', 'ISIC_9274260', 'ISIC_8054626', 'ISIC_1178847',
        'ISIC_0236778', 'ISIC_6704518', 'ISIC_4214813', 'ISIC_0322818',
        'ISIC_0230209', 'ISIC_7682938', 'ISIC_1852500', 'ISIC_3699454',
        'ISIC_4693693', 'ISIC_9574591', 'ISIC_3465766', 'ISIC_1826803',
        'ISIC_6234881', 'ISIC_2417958', 'ISIC_8142203', 'ISIC_5019268',
        'ISIC_3251719', 'ISIC_4654808', 'ISIC_1027856', 'ISIC_3262153',
        'ISIC_4681838', 'ISIC_6594555', 'ISIC_8623291', 'ISIC_3167092',
        'ISIC_8791163', 'ISIC_1538510', 'ISIC_3962218', 'ISIC_2160145',
        'ISIC_7690654', 'ISIC_9464203', 'ISIC_4673844', 'ISIC_9481260',
        'ISIC_5407240', 'ISIC_5179742', 'ISIC_8851901', 'ISIC_7433711',
        'ISIC_5777548', 'ISIC_2164933', 'ISIC_7194695', 'ISIC_7115605',
        'ISIC_7560157', 'ISIC_1323909', 'ISIC_0307958', 'ISIC_8015259',
        'ISIC_3089729', 'ISIC_3048886', 'ISIC_0861066', 'ISIC_6110309',
        'ISIC_9103289', 'ISIC_2853454', 'ISIC_1436572', 'ISIC_9650546',
        'ISIC_8208962', 'ISIC_5218561', 'ISIC_3285862', 'ISIC_5361506',
        'ISIC_8196660', 'ISIC_0356238', 'ISIC_1156392', 'ISIC_2761440',
        'ISIC_0645462', 'ISIC_4908514', 'ISIC_1374795', 'ISIC_3481768',
        'ISIC_2102371', 'ISIC_4548990', 'ISIC_7200676', 'ISIC_8827725',
        'ISIC_0667149', 'ISIC_7028320', 'ISIC_5485142', 'ISIC_9698871',
        'ISIC_7764481', 'ISIC_8831706', 'ISIC_4478276', 'ISIC_0401250',
        'ISIC_6987824', 'ISIC_7789537', 'ISIC_1114860', 'ISIC_7586566',
        'ISIC_0343061', 'ISIC_1442157', 'ISIC_9161937', 'ISIC_5904214',
        'ISIC_8335489', 'ISIC_9994768', 'ISIC_4384331', 'ISIC_0639415',
        'ISIC_0982984', 'ISIC_2195070', 'ISIC_9022865', 'ISIC_0159060',
        'ISIC_4933735', 'ISIC_3571989', 'ISIC_8593130', 'ISIC_1585919',
        'ISIC_3907656', 'ISIC_9728805', 'ISIC_6029052', 'ISIC_3582787',
        'ISIC_2205007', 'ISIC_1447559'
    ]
    train = train[~train['image_name'].isin(drop_image_name)].reset_index(
        drop=True)
    # Preprocessing metadata
    # OneHotEncoder
    train, test = preprocessing_meta(train, test)
    # Count of tabular meta features fed to the network head.
    features_num = len([
        f for f in train.columns
        if f not in ['image_name', 'patient_id', 'target', 'fold']
    ])

    # Model ####################################################################
    net = ENet(model_name=cfg.train.model_name,
               meta_features_num=features_num)
    transform = ImageTransform(img_size=cfg.data.img_size,
                               input_res=chris_image_size)

    # Lightning Module #########################################################
    model = MelanomaSystem(net, cfg, img_paths, train, test, transform,
                           experiment)
    # Keep only the single best checkpoint by average validation loss.
    checkpoint_callback = ModelCheckpoint(filepath='./checkpoint',
                                          save_top_k=1,
                                          verbose=True,
                                          monitor='avg_val_loss',
                                          mode='min',
                                          prefix=cfg.exp.exp_name + '_')
    trainer = Trainer(max_epochs=cfg.train.epoch,
                      checkpoint_callback=checkpoint_callback,
                      gpus=[0])

    # Train & Test #############################################################
    # Train
    trainer.fit(model)
    experiment.log_metric('best_auc', model.best_auc)
    # Upload the best checkpoint produced above.
    checkpoint_path = glob.glob(f'./checkpoint/{cfg.exp.exp_name}_*.ckpt')[0]
    experiment.log_asset(file_data=checkpoint_path)
    # Test — repeated passes; `test_num` is defined outside this view
    # (presumably the number of TTA repetitions — TODO confirm).
    for i in range(test_num):
        trainer.test(model)
    # Submit — aggregate the per-pass submission CSVs.
    sub_list = glob.glob(f'submission_{cfg.exp.exp_name}*.csv')
    _ = summarize_submit(sub_list, experiment,
                         filename=f'submission_all_{cfg.exp.exp_name}.csv')

    # oof — re-run the test loop over the training images for out-of-fold
    # style predictions.
    valid_dataset = MelanomaDataset(train, img_paths['train'], transform,
                                    phase='test')
    valid_dataloader = DataLoader(valid_dataset,
                                  batch_size=cfg.train.batch_size,
                                  pin_memory=False,
                                  shuffle=False,
                                  drop_last=False)
    for i in range(10):
        trainer.test(model, test_dataloaders=valid_dataloader)
    # Submit
    sub_list = glob.glob('submission*.csv')
    _ = summarize_submit(sub_list, experiment,
                         filename=f'submission_oof_{cfg.exp.exp_name}.csv')

    # Reset — free everything before the next Hydra run.
    del model, trainer, net, experiment
# Train the model (or load a previously trained one).
# NOTE(review): fragment — `model`, `train_loader`, `optimizer`, `experiment`,
# `hyper_params`, train/test data and id_min_test/id_maj_test come from
# earlier in the script.
if hyper_params["LOAD_MODEL"]:
    # BUG FIX: the original f-string had no braces, so it loaded the literal
    # path 'hyper_params["LOAD_MODEL_NAME"].pt'; interpolate the config value.
    model = torch.load(f'{hyper_params["LOAD_MODEL_NAME"]}.pt')
else:
    train_mnist_vae(train_loader,
                    model,
                    # NOTE(review): the optimizer is passed as `criterion` —
                    # confirm against train_mnist_vae's signature.
                    criterion=optimizer,
                    n_epoch=hyper_params["EPOCH"],
                    experiment=experiment,
                    beta=hyper_params["BETA"],
                    loss_type="mse",
                    flatten=False)
    # BUG FIX: same missing-braces f-string as above.
    torch.save(model, f'{hyper_params["MODEL_NAME"]}.pt')
model.save_weights(f'./{hyper_params["MODEL_NAME"]}.h5')
experiment.log_asset(file_data=f'./{hyper_params["MODEL_NAME"]}.h5',
                     file_name='model.h5')

# Compute p-values
pval, _ = compute_empirical_pval(train_data.data, model, test_data.data)
pval_order = numpy.argsort(pval)

# Plot p-values
x_line = numpy.arange(0, test_data.data.shape[0], step=1)
y_line = numpy.linspace(0, 1, test_data.data.shape[0])
# BH-style adjusted threshold line scaled by the significance level ALPHA.
y_adj = numpy.arange(0, test_data.data.shape[0],
                     step=1) / test_data.data.shape[0] * hyper_params["ALPHA"]
zoom = int(0.2 * test_data.data.shape[0])  # nb of points to zoom
# Boolean mask: True for the minority-class test ids, False for majority.
index = numpy.concatenate([
    numpy.repeat(True, len(id_min_test)),
    numpy.repeat(False, len(id_maj_test))
])
def train_cifar10(batch_size: int,
                  learning_rate: float,
                  epochs: int,
                  experiment: Experiment,
                  model: Sequential = None,
                  initial_epoch: int = 0,
                  training_datagen: ImageDataGenerator = None,
                  scheduler: Callable[[int], float] = None,
                  early_stopping_th: Optional[int] = 250,
                  data_portion: float = 1.0,
                  find_lr: bool = False) -> None:
    """Train a Keras model on CIFAR-10 and log artifacts to Comet.ml.

    Args:
        batch_size/learning_rate/epochs: usual training hyperparameters.
        experiment: Comet experiment; its key names the output files.
        model: model to train; a fresh `get_model()` when None.
        initial_epoch: epoch to resume from.
        training_datagen: augmentation pipeline; fresh default when None.
        scheduler: LR schedule object (exposes `experiment_log` and is
            callable per epoch); disabled when None.
        early_stopping_th: patience for early stopping; None disables it.
        data_portion: fraction of the training data to use.
        find_lr: run an LR-range finder instead of normal callbacks.
    """
    # BUG FIX: the previous defaults `get_model()` / `ImageDataGenerator()`
    # were evaluated once at import time, so every default call shared (and
    # mutated) the same model/datagen. Build fresh instances per call.
    if model is None:
        model = get_model()
    if training_datagen is None:
        training_datagen = ImageDataGenerator()

    preprocessing_fnc = training_datagen.preprocessing_function
    name = experiment.get_key()
    log_path, model_path = get_output_paths(name)
    data = get_cifar10_data(data_portion=data_portion)

    training_datagen.fit(data.x_train)
    log_images(data.x_train, training_datagen, experiment)
    log_input_images(data.x_train, data.y_train, training_datagen, experiment)

    opt = Adam(lr=learning_rate)
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    log_model_plot(experiment, model)

    csv_cb = CSVLogger(log_path)
    keep_best_cb = KeepBest('val_acc')
    callbacks = [csv_cb, keep_best_cb]  # [csv_cb, early_stopping_cb, keep_best_cb]
    if early_stopping_th is not None:
        early_stopping_cb = EarlyStopping('val_acc',
                                          patience=early_stopping_th,
                                          restore_best_weights=True,
                                          verbose=2)
        callbacks.append(early_stopping_cb)
    if scheduler is not None:
        scheduler.experiment_log(experiment=experiment,
                                 epochs=list(range(epochs)))
        callbacks.append(LearningRateScheduler(scheduler))
    if find_lr:
        # LR-range-finder mode replaces all other callbacks on purpose.
        lrf = LearningRateFinder(model=model)
        lrf.lrMult = (10e-1 / learning_rate)**(
            1.0 / (epochs * len(data.x_train) / batch_size))
        callbacks = [
            LambdaCallback(
                on_batch_end=lambda batch, logs: lrf.on_batch_end(batch, logs))
        ]

    model.fit_generator(training_datagen.flow(data.x_train,
                                              data.y_train,
                                              batch_size=batch_size),
                        steps_per_epoch=len(data.x_train) / batch_size,
                        epochs=epochs,
                        validation_data=(preprocessing_fnc(data.x_dev),
                                         data.y_dev),
                        shuffle=True,
                        callbacks=callbacks,
                        verbose=2,
                        initial_epoch=initial_epoch)

    model.save(model_path)
    experiment.log_asset(model_path)
    experiment.log_asset(log_path)
    if find_lr:
        experiment.log_figure('lr vs acc', lrf.plot_loss())
    log_final_metrics(experiment, model, data, preprocessing_fnc)
class Trainer():
    """Training/evaluation driver for a MAC VQA model (CLEVR or GQA).

    Owns the datasets/dataloaders, the model plus its EMA copy, the
    optimizer, TensorBoard writer and an optional comet.ml experiment.
    Checkpoints and logs are written under ``log_dir``.
    """

    def __init__(self, log_dir, cfg):
        """Build datasets, model, optimizer and loggers from ``cfg``.

        Args:
            log_dir: root output directory (Model/, Log/, logfile.log).
            cfg: project config object; attributes accessed here include
                TRAIN.*, DATASET.*, GPU_ID, SAMPLE, EVAL, CUDA, logcomet, ...
        """
        self.path = log_dir
        self.cfg = cfg

        if cfg.TRAIN.FLAG:
            self.model_dir = os.path.join(self.path, 'Model')
            self.log_dir = os.path.join(self.path, 'Log')
            mkdir_p(self.model_dir)
            mkdir_p(self.log_dir)
            self.writer = SummaryWriter(log_dir=self.log_dir)
            self.logfile = os.path.join(self.path, "logfile.log")
            # tee stdout into logfile.log for the rest of the run
            sys.stdout = Logger(logfile=self.logfile)

        self.data_dir = cfg.DATASET.DATA_DIR
        self.max_epochs = cfg.TRAIN.MAX_EPOCHS
        self.snapshot_interval = cfg.TRAIN.SNAPSHOT_INTERVAL

        s_gpus = cfg.GPU_ID.split(',')
        self.gpus = [int(ix) for ix in s_gpus]
        self.num_gpus = len(self.gpus)

        self.batch_size = cfg.TRAIN.BATCH_SIZE
        self.lr = cfg.TRAIN.LEARNING_RATE

        # pin all work to the first listed GPU
        torch.cuda.set_device(self.gpus[0])
        cudnn.benchmark = True

        sample = cfg.SAMPLE
        self.dataset = []
        self.dataloader = []
        self.use_feats = cfg.model.use_feats
        eval_split = cfg.EVAL if cfg.EVAL else 'val'
        train_split = cfg.DATASET.train_split
        if cfg.DATASET.DATASET == 'clevr':
            clevr_collate_fn = collate_fn
            # COGENT is appended to the split name (e.g. 'trainA'); empty
            # string means the standard CLEVR splits
            cogent = cfg.DATASET.COGENT
            if cogent:
                print(f'Using CoGenT {cogent.upper()}')
            if cfg.TRAIN.FLAG:
                self.dataset = ClevrDataset(data_dir=self.data_dir,
                                            split=train_split + cogent,
                                            sample=sample,
                                            **cfg.DATASET.params)
                self.dataloader = DataLoader(dataset=self.dataset,
                                             batch_size=cfg.TRAIN.BATCH_SIZE,
                                             shuffle=True,
                                             num_workers=cfg.WORKERS,
                                             drop_last=True,
                                             collate_fn=clevr_collate_fn)
            self.dataset_val = ClevrDataset(data_dir=self.data_dir,
                                            split=eval_split + cogent,
                                            sample=sample,
                                            **cfg.DATASET.params)
            self.dataloader_val = DataLoader(dataset=self.dataset_val,
                                             batch_size=cfg.TEST_BATCH_SIZE,
                                             drop_last=False,
                                             shuffle=False,
                                             num_workers=cfg.WORKERS,
                                             collate_fn=clevr_collate_fn)
        elif cfg.DATASET.DATASET == 'gqa':
            # GQA ships either spatial CNN features or object features;
            # each needs its own collate function
            if self.use_feats == 'spatial':
                gqa_collate_fn = collate_fn_gqa
            elif self.use_feats == 'objects':
                gqa_collate_fn = collate_fn_gqa_objs
            if cfg.TRAIN.FLAG:
                self.dataset = GQADataset(data_dir=self.data_dir,
                                          split=train_split,
                                          sample=sample,
                                          use_feats=self.use_feats,
                                          **cfg.DATASET.params)
                self.dataloader = DataLoader(dataset=self.dataset,
                                             batch_size=cfg.TRAIN.BATCH_SIZE,
                                             shuffle=True,
                                             num_workers=cfg.WORKERS,
                                             drop_last=True,
                                             collate_fn=gqa_collate_fn)
            self.dataset_val = GQADataset(data_dir=self.data_dir,
                                          split=eval_split,
                                          sample=sample,
                                          use_feats=self.use_feats,
                                          **cfg.DATASET.params)
            self.dataloader_val = DataLoader(dataset=self.dataset_val,
                                             batch_size=cfg.TEST_BATCH_SIZE,
                                             shuffle=False,
                                             num_workers=cfg.WORKERS,
                                             drop_last=False,
                                             collate_fn=gqa_collate_fn)

        # load model
        self.vocab = load_vocab(cfg)
        self.model, self.model_ema = mac.load_MAC(cfg, self.vocab)
        # alpha=0 copies the live model's weights into the EMA model verbatim
        self.weight_moving_average(alpha=0)
        if cfg.TRAIN.RADAM:
            self.optimizer = RAdam(self.model.parameters(), lr=self.lr)
        else:
            self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        self.start_epoch = 0
        if cfg.resume_model:
            location = 'cuda' if cfg.CUDA else 'cpu'
            state = torch.load(cfg.resume_model, map_location=location)
            self.model.load_state_dict(state['model'])
            self.optimizer.load_state_dict(state['optim'])
            self.start_epoch = state['iter'] + 1
            state = torch.load(cfg.resume_model_ema, map_location=location)
            self.model_ema.load_state_dict(state['model'])
        if cfg.start_epoch is not None:
            self.start_epoch = cfg.start_epoch

        self.previous_best_acc = 0.0
        self.previous_best_epoch = 0
        self.previous_best_loss = 100
        self.previous_best_loss_epoch = 0

        self.total_epoch_loss = 0
        self.prior_epoch_loss = 10

        self.print_info()
        self.loss_fn = torch.nn.CrossEntropyLoss().cuda()

        # NOTE(review): `disabled=cfg.logcomet is False` only disables the
        # experiment when logcomet is exactly False; any other falsy value
        # (0, None) would leave Comet enabled — confirm config convention.
        self.comet_exp = Experiment(
            project_name=cfg.COMET_PROJECT_NAME,
            api_key=os.getenv('COMET_API_KEY'),
            workspace=os.getenv('COMET_WORKSPACE'),
            disabled=cfg.logcomet is False,
        )
        if cfg.logcomet:
            exp_name = cfg_to_exp_name(cfg)
            print(exp_name)
            self.comet_exp.set_name(exp_name)
            self.comet_exp.log_parameters(flatten_json_iterative_solution(cfg))
            self.comet_exp.log_asset(self.logfile)
            self.comet_exp.log_asset_data(json.dumps(cfg, indent=4),
                                          file_name='cfg.json')
            self.comet_exp.set_model_graph(str(self.model))
            if cfg.cfg_file:
                self.comet_exp.log_asset(cfg.cfg_file)

        # persist the resolved config next to the logs
        with open(os.path.join(self.path, 'cfg.json'), 'w') as f:
            json.dump(cfg, f, indent=4)

    def print_info(self):
        """Print the active config, dataset sizes and model architecture."""
        print('Using config:')
        pprint.pprint(self.cfg)
        print("\n")

        pprint.pprint("Size of train dataset: {}".format(len(self.dataset)))
        # print("\n")
        pprint.pprint("Size of val dataset: {}".format(len(self.dataset_val)))
        print("\n")

        print("Using MAC-Model:")
        pprint.pprint(self.model)
        print("\n")

    def weight_moving_average(self, alpha=0.999):
        """In-place EMA update: ema = alpha * ema + (1 - alpha) * model."""
        for param1, param2 in zip(self.model_ema.parameters(),
                                  self.model.parameters()):
            param1.data *= alpha
            param1.data += (1.0 - alpha) * param2.data

    def set_mode(self, mode="train"):
        """Put both the live and EMA models in train or eval mode.

        Any mode other than "train" (e.g. "validation") selects eval().
        """
        if mode == "train":
            self.model.train()
            self.model_ema.train()
        else:
            self.model.eval()
            self.model_ema.eval()

    def reduce_lr(self):
        """Halve the LR when the epoch-over-epoch loss improvement stalls.

        Thresholds tighten as the absolute loss gets smaller; each tier also
        has an LR floor so the rate is never reduced below ~5e-6.
        """
        epoch_loss = self.total_epoch_loss  # / float(len(self.dataset) // self.batch_size)
        lossDiff = self.prior_epoch_loss - epoch_loss
        if ((lossDiff < 0.015 and self.prior_epoch_loss < 0.5 and self.lr > 0.00002) or \
                (lossDiff < 0.008 and self.prior_epoch_loss < 0.15 and self.lr > 0.00001) or \
                (lossDiff < 0.003 and self.prior_epoch_loss < 0.10 and self.lr > 0.000005)):
            self.lr *= 0.5
            print("Reduced learning rate to {}".format(self.lr))
            for param_group in self.optimizer.param_groups:
                param_group['lr'] = self.lr
        self.prior_epoch_loss = epoch_loss
        self.total_epoch_loss = 0

    def save_models(self, iteration):
        """Checkpoint the live model (with optimizer) and the EMA model."""
        save_model(self.model,
                   self.optimizer,
                   iteration,
                   self.model_dir,
                   model_name="model")
        save_model(self.model_ema,
                   None,
                   iteration,
                   self.model_dir,
                   model_name="model_ema")

    def train_epoch(self, epoch):
        """Run one full pass over the training loader.

        Returns a dict with running-average "loss"/"accuracy" (duplicated
        under "avg_loss"/"avg_accuracy" for Comet).
        """
        cfg = self.cfg
        total_loss = 0.
        total_correct = 0
        total_samples = 0
        self.labeled_data = iter(self.dataloader)
        self.set_mode("train")

        dataset = tqdm(self.labeled_data, total=len(self.dataloader), ncols=20)

        for data in dataset:
            ######################################################
            # (1) Prepare training data
            ######################################################
            image, question, question_len, answer = data['image'], data[
                'question'], data['question_length'], data['answer']
            answer = answer.long()
            question = Variable(question)
            answer = Variable(answer)

            if cfg.CUDA:
                if self.use_feats == 'spatial':
                    image = image.cuda()
                elif self.use_feats == 'objects':
                    # object features arrive as a list of tensors
                    image = [e.cuda() for e in image]
                question = question.cuda()
                answer = answer.cuda().squeeze()
            else:
                question = question
                image = image
                answer = answer.squeeze()

            ############################
            # (2) Train Model
            ############################
            self.optimizer.zero_grad()

            scores = self.model(image, question, question_len)
            loss = self.loss_fn(scores, answer)
            loss.backward()

            if self.cfg.TRAIN.CLIP_GRADS:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                               self.cfg.TRAIN.CLIP)

            self.optimizer.step()
            # EMA update after every optimizer step
            self.weight_moving_average()

            ############################
            # (3) Log Progress
            ############################
            correct = scores.detach().argmax(1) == answer
            total_correct += correct.sum().cpu().item()
            total_loss += loss.item() * answer.size(0)
            total_samples += answer.size(0)

            avg_loss = total_loss / total_samples
            train_accuracy = total_correct / total_samples
            # accuracy = correct.sum().cpu().numpy() / answer.shape[0]
            # if avg_loss == 0:
            #     avg_loss = loss.item()
            #     train_accuracy = accuracy
            # else:
            #     avg_loss = 0.99 * avg_loss + 0.01 * loss.item()
            #     train_accuracy = 0.99 * train_accuracy + 0.01 * accuracy
            # self.total_epoch_loss += loss.item() * answer.size(0)

            dataset.set_description(
                'Epoch: {}; Avg Loss: {:.5f}; Avg Train Acc: {:.5f}'.format(
                    epoch + 1, avg_loss, train_accuracy))

        # NOTE(review): avg_loss/train_accuracy are loop-local — an empty
        # dataloader would raise NameError here.
        self.total_epoch_loss = avg_loss

        # NOTE(review): local name `dict` shadows the builtin for the rest
        # of this method.
        dict = {
            "loss": avg_loss,
            "accuracy": train_accuracy,
            "avg_loss": avg_loss,  # For commet
            "avg_accuracy": train_accuracy,  # For commet
        }
        return dict

    def train(self):
        """Main loop: train, validate, LR-decay and early-stop over epochs."""
        cfg = self.cfg
        print("Start Training")
        for epoch in range(self.start_epoch, self.max_epochs):
            with self.comet_exp.train():
                dict = self.train_epoch(epoch)
                self.reduce_lr()
                dict['epoch'] = epoch + 1
                dict['lr'] = self.lr
                self.comet_exp.log_metrics(
                    dict,
                    epoch=epoch + 1,
                )
            with self.comet_exp.validate():
                dict = self.log_results(epoch, dict)
                dict['epoch'] = epoch + 1
                dict['lr'] = self.lr
                self.comet_exp.log_metrics(
                    dict,
                    epoch=epoch + 1,
                )
            # NOTE(review): EALRY_STOPPING is (mis)spelled this way in the
            # config schema as well — do not "fix" one side only.
            if cfg.TRAIN.EALRY_STOPPING:
                # stop when no val-accuracy improvement for PATIENCE epochs
                if epoch - cfg.TRAIN.PATIENCE == self.previous_best_epoch:
                    # if epoch - cfg.TRAIN.PATIENCE == self.previous_best_loss_epoch:
                    print('Early stop')
                    break

        self.comet_exp.log_asset(self.logfile)
        self.save_models(self.max_epochs)
        self.writer.close()
        print("Finished Training")
        print(
            f"Highest validation accuracy: {self.previous_best_acc} at epoch {self.previous_best_epoch}"
        )

    def log_results(self, epoch, dict, max_eval_samples=None):
        """Log train metrics, run validation, track bests, snapshot models.

        Returns the validation metrics dict from calc_accuracy().
        """
        epoch += 1
        self.writer.add_scalar("avg_loss", dict["loss"], epoch)
        self.writer.add_scalar("train_accuracy", dict["accuracy"], epoch)

        metrics = self.calc_accuracy("validation",
                                     max_samples=max_eval_samples)
        self.writer.add_scalar("val_accuracy_ema", metrics['acc_ema'], epoch)
        self.writer.add_scalar("val_accuracy", metrics['acc'], epoch)
        self.writer.add_scalar("val_loss_ema", metrics['loss_ema'], epoch)
        self.writer.add_scalar("val_loss", metrics['loss'], epoch)
        print(
            "Epoch: {epoch}\tVal Acc: {acc},\tVal Acc EMA: {acc_ema},\tAvg Loss: {loss},\tAvg Loss EMA: {loss_ema},\tLR: {lr}"
            .format(epoch=epoch, lr=self.lr, **metrics))

        if metrics['acc'] > self.previous_best_acc:
            self.previous_best_acc = metrics['acc']
            self.previous_best_epoch = epoch
        if metrics['loss'] < self.previous_best_loss:
            self.previous_best_loss = metrics['loss']
            self.previous_best_loss_epoch = epoch

        if epoch % self.snapshot_interval == 0:
            self.save_models(epoch)

        return metrics

    def calc_accuracy(self, mode="train", max_samples=None):
        """Evaluate live and EMA models on the train or validation loader.

        Returns dict(acc=..., acc_ema=..., loss=..., loss_ema=...).
        NOTE(review): max_samples is accepted but never applied below —
        the full loader is always consumed; confirm whether truncation was
        intended.
        """
        self.set_mode("validation")

        if mode == "train":
            loader = self.dataloader
        # elif (mode == "validation") or (mode == 'test'):
        #     loader = self.dataloader_val
        else:
            loader = self.dataloader_val

        total_correct = 0
        total_correct_ema = 0
        total_samples = 0
        total_loss = 0.
        total_loss_ema = 0.
        pbar = tqdm(loader, total=len(loader), desc=mode.upper(), ncols=20)

        for data in pbar:
            image, question, question_len, answer = data['image'], data[
                'question'], data['question_length'], data['answer']
            answer = answer.long()
            question = Variable(question)
            answer = Variable(answer)
            if self.cfg.CUDA:
                if self.use_feats == 'spatial':
                    image = image.cuda()
                elif self.use_feats == 'objects':
                    image = [e.cuda() for e in image]
                question = question.cuda()
                answer = answer.cuda().squeeze()

            with torch.no_grad():
                scores = self.model(image, question, question_len)
                scores_ema = self.model_ema(image, question, question_len)

                loss = self.loss_fn(scores, answer)
                loss_ema = self.loss_fn(scores_ema, answer)

            correct = scores.detach().argmax(1) == answer
            correct_ema = scores_ema.detach().argmax(1) == answer

            total_correct += correct.sum().cpu().item()
            total_correct_ema += correct_ema.sum().cpu().item()

            total_loss += loss.item() * answer.size(0)
            total_loss_ema += loss_ema.item() * answer.size(0)
            total_samples += answer.size(0)

            avg_acc = total_correct / total_samples
            avg_acc_ema = total_correct_ema / total_samples
            avg_loss = total_loss / total_samples
            avg_loss_ema = total_loss_ema / total_samples

            pbar.set_postfix({
                'Acc': f'{avg_acc:.5f}',
                'Acc Ema': f'{avg_acc_ema:.5f}',
                'Loss': f'{avg_loss:.5f}',
                'Loss Ema': f'{avg_loss_ema:.5f}',
            })

        return dict(acc=avg_acc,
                    acc_ema=avg_acc_ema,
                    loss=avg_loss,
                    loss_ema=avg_loss_ema)
# Resume support: pull already-computed 2-D loss-surface points for this
# experiment name from Comet so a rerun skips finished grid indices.
# NOTE(review): xent/acc appear to be dict-like containers keyed by step
# (indexed with r['step'] and tested with `in`) — confirm in the setup code.
allexperiments = api.get('wronnyhuang/landscape2d')
for expt in allexperiments:
    # match experiments by their 'Name' "other" field
    if exptname != api.get_experiment_other(expt, 'Name')[0]: continue
    raw = api.get_experiment_metrics_raw(expt)
    for r in raw:
        if r['metricName'] == 'xent':
            xent[r['step']] = r['metricValue']
        elif r['metricName'] == 'acc':
            acc[r['step']] = r['metricValue']

# Evaluate loss/accuracy on a grid of weight perturbations spanned by the
# two direction vectors dw1/dw2.
for idx, (c1, c2) in enumerate(cfeed):
    # shard the grid across workers: this process handles indices congruent
    # to args.part modulo args.npart
    if np.mod(idx, args.npart) != args.part: continue
    if idx in xent and idx in acc:
        print('skipping idx ' + str(idx))
        continue
    # displace every weight tensor: w + c1*d1 + c2*d2
    perturbedWeights = [
        w + c1 * d1 + c2 * d2 for w, d1, d2 in zip(weights, dw1, dw2)
    ]
    evaluator.assign_weights(perturbedWeights)
    xent[idx], acc[idx], _ = evaluator.eval()
    experiment.log_metric('xent', xent[idx], step=idx)
    experiment.log_metric('acc', acc[idx], step=idx)
    print('point ', idx + 1, 'of', len(cfeed), '| time:', time())

# save plot data and log the figure, then remove the local copy
with open(exptname + '.pkl', 'wb') as f:
    pickle.dump((xent, acc), f)
experiment.log_asset(exptname + '.pkl')
os.remove(exptname + '.pkl')
results_dir + 'predictions/predictions_result.txt', 'evaluation loss:' + str(evaluation_loss) + ' evaluation accuracy:' + str(evaluation_accuracy) + ' evaluation dice coef:' + str(evaluation_dice_coef)) make_file_and_write(results_dir + 'description.txt', description) predicted_masks = model.predict(test_images, 1, verbose=1) converted_test_images = convert_one_class_images_to_pixel_images_and_save( results_dir + 'predictions/images/', test_images, shape=input_shape) converted_test_masks = convert_multiclass_matirx_masks_to_pixel_masks_and_save( results_dir + 'predictions/masks/', test_masks, mask_pixel_values_aka_classes) converted_predicted_masks = convert_multiclass_matirx_masks_to_pixel_masks_and_save( results_dir + 'predictions/results/', predicted_masks, mask_pixel_values_aka_classes) plot_model(model, to_file=results_dir + 'model_architecture.png', show_shapes=True, show_layer_names=True, rankdir='TB') experiment.log_image(results_dir + 'model_architecture.png', name='model_architecture.png') experiment.log_asset(results_dir + 'unet.hdf5', file_name='unet.hdf5') for index in range(len(test_images)): experiment.log_image(converted_test_images[index], name=str(index) + '_test_image') experiment.log_image(converted_test_masks[index], name=str(index) + '_test_mask') experiment.log_image(converted_predicted_masks[index], name=str(index) + '_predicted_mask')
# Fit the neighbor/ensemble model, write predictions as a shapefile and log
# artifacts + the learned ensemble weight to Comet.
label_names = list(labeldf.taxonID.values)
callback_list = callbacks.create(experiment=experiment,
                                 train_data=model.train_split,
                                 validation_data=model.val_split,
                                 train_shp=model.train_shp,
                                 log_dir=save_dir,
                                 label_names=label_names,
                                 submodel=False)
neighbor.fit(model.train_split,
             epochs=model.config["train"]["ensemble"]["epochs"],
             validation_data=model.val_split,
             callbacks=callback_list)

# save trained ensemble weights
neighbor.save("{}/neighbors.h5".format(save_dir))

predicted_shp = model.predict(model=neighbor)
predicted_shp.to_file("{}/prediction.shp".format(save_dir))
# a shapefile is several sibling files (.shp/.dbf/.shx/.cpg); log each piece
# so the Comet download is actually usable
experiment.log_asset("{}/prediction.shp".format(save_dir))
experiment.log_asset("{}/prediction.dbf".format(save_dir))
experiment.log_asset("{}/prediction.shx".format(save_dir))
experiment.log_asset("{}/prediction.cpg".format(save_dir))

# learned scalar weighting of the "ensemble_add_bias" layer
estimate_a = neighbor.get_layer("ensemble_add_bias").get_weights()
experiment.log_metric(name="target_versus_context_weight",
                      value=estimate_a[0][0])

#estimate_lambda = neighbor.get_layer("distance_decay").get_weights()
#experiment.log_metric(name="distance_decay_rate", value=estimate_lambda[0])
opt_file = "prev_experiments/11k_wgan_feature_pixelDA.yml" opts = load_opts(path=root / opt_file, default=root / "shared/defaults.yml") opts = set_mode("test", opts) opts.data.loaders.batch_size = 1 val_loader = get_loader(opts) dataset_size = len(val_loader) print("#testing images = %d" % dataset_size) comet_exp = Experiment(workspace=opts.comet.workspace, project_name=opts.comet.project_name) if comet_exp is not None: comet_exp.log_asset(file_data=str(root / opt_file), file_name=root / opt_file) comet_exp.log_parameters(opts) checkpoint_directory, image_directory = prepare_sub_folder( opts.train.output_dir) opts.comet.exp = comet_exp model = create_model(opts) model.setup() total_steps = 0 for i, data in enumerate(val_loader): # with Timer("Elapsed time in update " + str(i) + ": %f"):
# Incremental-processing evaluation: score the model on partial inputs and on
# GPT-2 "prophecy" continuations, persist both result objects, and (optionally)
# attach them to the Comet experiment.
if not args.only_training:
    print('Incremental processing evaluation started.')
    if not args.comet_track:
        # downstream code takes experiment=None to mean "no Comet logging"
        experiment = None

    # outputs using partial, incremental inputs
    partial_outputs = Results(test_loader, model, my_device, label_pad_id,
                              corpus, seq2seq, prophecies=None)
    partial_outputs.print_metrics(model_name, experiment)
    # BUGFIX: the original passed open(...) directly into pickle.dump/load,
    # leaking the file handles until GC; use context managers so they are
    # closed (and flushed) deterministically.
    with open('outputs/results_' + model_name, 'wb') as f:
        pickle.dump(partial_outputs, f)
    if args.comet_track:
        experiment.log_asset('outputs/results_' + model_name,
                             'results_partialInputs_' + model_name)

    # outputs using GPT2 prophecies
    with open('prophecies/gpt2Prophecies_' + args.task +
              '_testset-withOutliers', 'rb') as f:
        prophecies = pickle.load(f)
    prophecies_outputs = Results(test_loader, model, my_device, label_pad_id,
                                 corpus, seq2seq, prophecies)
    prophecies_outputs.print_metrics(model_name + '_gpt', experiment)
    with open('outputs/resultsGPT_' + model_name, 'wb') as f:
        pickle.dump(prophecies_outputs, f)
    if args.comet_track:
        experiment.log_asset('outputs/resultsGPT_' + model_name,
                             "results_prophecies_" + model_name)

print('Finished!')
os.makedirs('pickle', exist_ok=True); pickle.dump(dw1, open(join('pickle', args.ckpt), 'wb')) along = 'along_eigvec' else: dw1 = evaluator.get_random_dir() along = 'along_random_'+str(args.seed) # span cfeed = args.span/2 * np.linspace(-1, 1, 30) cfeed_enum = list(enumerate(cfeed)); random.shuffle(cfeed_enum) # shuffle order so we see plot shape sooner on comet # loop over all points along surface direction name = 'span_' + str(args.span) + '/' + basename(args.ckpt) + '/' + along # name of experiment xent = np.zeros(len(cfeed)) weights = evaluator.get_weights() for i, (idx, c) in enumerate(cfeed_enum): perturbedWeights = [w + c * d1 for w, d1 in zip(weights, dw1)] evaluator.assign_weights(perturbedWeights) xent[idx], acc, _ = evaluator.eval() experiment.log_metric(name, xent[idx], idx) print('progress:', i + 1, 'of', len(cfeed_enum), '| time:', time()) # save plot data and log the figure xent = np.reshape(np.array(xent), cfeed.shape) plt.plot(cfeed, xent) experiment.log_figure(name) unique = utils.timenow() pickle.dump((cfeed, xent), open(unique, 'wb')) experiment.log_asset(file_path=unique, file_name=name+'.pkl')
def upload_experiment():
    """Create a Comet experiment and attach the project's asset folders
    (datasets, models, knapsack) plus the final result file."""
    exp = Experiment(**COMET_ML_KEY)
    for folder in ('./datasets', './models', './knapsack'):
        exp.log_asset_folder(folder)
    exp.log_asset(RESULT_FILE)
verbose=1) y_pred = model.predict( oDataSet.attributes[oData.Testing_indexes]).argmax(axis=1) y_true = oDataSet.labels[oData.Testing_indexes] experiment.log_metric("test_accuracy", accuracy_score(y_true, y_pred)) experiment.log_metric("beta", best_b) experiment.log_metric("neurons", best_p) experiment.log_confusion_matrix(matrix=confusion_matrix(y_true, y_pred).tolist(), labels=oDataSet.labelsNames) # model.save('model.h5') # experiment.log_asset("model.h5") model.save_weights('model.weights') experiment.log_asset("model.weights") print(accuracy_score(y_true, y_pred)) print(confusion_matrix(y_true, y_pred)) oData.confusion_matrix = confusion_matrix(y_true, y_pred) oData.model = model oData.params = { "k_fold": K_FOLD, "GRID_RESULT": grid_result, "GRID_VALUES_NEURON": GRID_NEURON, "GRID_VALUES_BETA": GRID_B, "LEARNING RATE": LEARNING_RATE, "EPOCHS": epochs } experiment.log_other("params", oData.params) y_pred = model.predict(
def run(args, train, sparse_evidences, claims_dict):
    """Train and validate a CDSSM claim-verification model.

    Splits ``train`` 80/20 into train/validation, trains for args.epochs,
    and logs metrics to Comet and a local TensorBoard-style Logger, saving
    the best-loss checkpoint each epoch.

    Args:
        args: parsed CLI namespace; fields used here: batch_size,
            learning_rate, data_sampling, epochs, model, no_randomize,
            print, data, sparse_evidences.
        train: full training set (sliced 80/20 below).
        sparse_evidences: forwarded to WikiDataset.
        claims_dict: forwarded to WikiDataset.
    """
    BATCH_SIZE = args.batch_size
    LEARNING_RATE = args.learning_rate
    DATA_SAMPLING = args.data_sampling
    NUM_EPOCHS = args.epochs
    MODEL = args.model  # optional path to a pretrained checkpoint
    RANDOMIZE = args.no_randomize
    PRINT = args.print

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")

    # NOTE(review): time.localtime() puts a struct_time repr in the path —
    # probably time.strftime(...) was intended; confirm log-dir naming.
    logger = Logger('./logs/{}'.format(time.localtime()))

    if MODEL:
        print("Loading pretrained model...")
        # NOTE(review): the checkpoint is loaded twice and its own
        # state_dict re-applied — the second line is redundant.
        model = torch.load(MODEL)
        model.load_state_dict(torch.load(MODEL).state_dict())
    else:
        model = cdssm.CDSSM()
        # assumes CUDA is present despite the `device` fallback above
        model = model.cuda()
        model = model.to(device)
    # model = cdssm.CDSSM()
    # model = model.cuda()
    # model = model.to(device)

    if torch.cuda.device_count() > 0:
        print("Let's use", torch.cuda.device_count(), "GPU(s)!")
        model = nn.DataParallel(model)

    print("Created model with {:,} parameters.".format(
        putils.count_parameters(model)))

    # if MODEL:
    #     print("TEMPORARY change to loading!")
    #     model.load_state_dict(torch.load(MODEL).state_dict())

    print("Created dataset...")

    # use an 80/20 train/validate split!
    train_size = int(len(train) * 0.80)
    #test = int(len(train) * 0.5)
    train_dataset = pytorch_data_loader.WikiDataset(
        train[:train_size],
        claims_dict,
        data_sampling=DATA_SAMPLING,
        sparse_evidences=sparse_evidences,
        randomize=RANDOMIZE)
    val_dataset = pytorch_data_loader.WikiDataset(
        train[train_size:],
        claims_dict,
        data_sampling=DATA_SAMPLING,
        sparse_evidences=sparse_evidences,
        randomize=RANDOMIZE)

    train_dataloader = DataLoader(train_dataset,
                                  batch_size=BATCH_SIZE,
                                  num_workers=0,
                                  shuffle=True,
                                  collate_fn=pytorch_data_loader.PadCollate())
    val_dataloader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                num_workers=0,
                                shuffle=True,
                                collate_fn=pytorch_data_loader.PadCollate())

    # Loss and optimizer
    criterion = torch.nn.NLLLoss()
    # criterion = torch.nn.SoftMarginLoss()
    # if torch.cuda.device_count() > 0:
    #     print("Let's parallelize the backward pass...")
    #     criterion = DataParallelCriterion(criterion)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=LEARNING_RATE,
                                 weight_decay=1e-3)

    # log roughly every 2% of an epoch, but never more often than every 20 batches
    OUTPUT_FREQ = max(int((len(train_dataset) / BATCH_SIZE) * 0.02), 20)
    parameters = {
        "batch size": BATCH_SIZE,
        "epochs": NUM_EPOCHS,
        "learning rate": LEARNING_RATE,
        "optimizer": optimizer.__class__.__name__,
        "loss": criterion.__class__.__name__,
        "training size": train_size,
        "data sampling rate": DATA_SAMPLING,
        "data": args.data,
        "sparse_evidences": args.sparse_evidences,
        "randomize": RANDOMIZE,
        "model": MODEL
    }
    # SECURITY(review): the Comet API key is hard-coded here; it should be
    # read from the environment instead of being committed to the repo.
    experiment = Experiment(api_key="YLsW4AvRTYGxzdDqlWRGCOhee",
                            project_name="clsm",
                            workspace="moinnadeem")
    experiment.add_tag("train")
    experiment.log_asset("cdssm.py")
    experiment.log_dataset_info(name=args.data)
    experiment.log_parameters(parameters)

    # encode the hyperparameters into the checkpoint filename
    model_checkpoint_dir = "models/saved_model"
    for key, value in parameters.items():
        if type(value) == str:
            value = value.replace("/", "-")
        if key != "model":
            model_checkpoint_dir += "_{}-{}".format(key.replace(" ", "_"),
                                                    value)

    print("Training...")
    beginning_time = time.time()
    best_loss = torch.tensor(float("inf"),
                             dtype=torch.float)  # begin loss at infinity

    for epoch in range(NUM_EPOCHS):
        beginning_time = time.time()
        mean_train_acc = 0.0
        train_running_loss = 0.0
        train_running_accuracy = 0.0
        model.train()
        experiment.log_current_epoch(epoch)

        with experiment.train():
            for train_batch_num, inputs in enumerate(train_dataloader):
                claims_tensors, claims_text, evidences_tensors, evidences_text, labels = inputs

                claims_tensors = claims_tensors.cuda()
                evidences_tensors = evidences_tensors.cuda()
                labels = labels.cuda()
                #claims = claims.to(device).float()
                #evidences = evidences.to(device).float()
                #labels = labels.to(device)

                y_pred = model(claims_tensors, evidences_tensors)

                y = (labels)
                # y = y.unsqueeze(0)
                # y = y.unsqueeze(0)
                # y_pred = parallel.gather(y_pred, 0)
                y_pred = y_pred.squeeze()
                # y = y.squeeze()

                # NLLLoss over log-probs; targets are one-hot, so argmax
                # recovers the class index
                loss = criterion(y_pred, torch.max(y, 1)[1])
                # loss = criterion(y_pred, y)

                y = y.float()
                binary_y = torch.max(y, 1)[1]
                binary_pred = torch.max(y_pred, 1)[1]
                accuracy = (binary_y == binary_pred).to("cuda")
                accuracy = accuracy.float()
                accuracy = accuracy.mean()
                train_running_accuracy += accuracy.item()
                mean_train_acc += accuracy.item()
                train_running_loss += loss.item()

                if PRINT:
                    for idx in range(len(y)):
                        print(
                            "Claim: {}, Evidence: {}, Prediction: {}, Label: {}"
                            .format(claims_text[0], evidences_text[idx],
                                    torch.exp(y_pred[idx]), y[idx]))

                if (train_batch_num % OUTPUT_FREQ) == 0 and train_batch_num > 0:
                    elapsed_time = time.time() - beginning_time
                    binary_y = torch.max(y, 1)[1]
                    binary_pred = torch.max(y_pred, 1)[1]
                    print(
                        "[{}:{}:{:3f}s] training loss: {}, training accuracy: {}, training recall: {}"
                        .format(
                            epoch,
                            train_batch_num / (len(train_dataset) / BATCH_SIZE),
                            elapsed_time, train_running_loss / OUTPUT_FREQ,
                            train_running_accuracy / OUTPUT_FREQ,
                            recall_score(binary_y.cpu().detach().numpy(),
                                         binary_pred.cpu().detach().numpy())))

                    # 1. Log scalar values (scalar summary)
                    # NOTE(review): step=train_batch_num * (epoch + 1) is not
                    # monotonically increasing across epochs — confirm the
                    # intended global-step scheme.
                    info = {
                        'train_loss': train_running_loss / OUTPUT_FREQ,
                        'train_accuracy': train_running_accuracy / OUTPUT_FREQ
                    }

                    for tag, value in info.items():
                        experiment.log_metric(tag,
                                              value,
                                              step=train_batch_num *
                                              (epoch + 1))
                        logger.scalar_summary(tag, value, train_batch_num + 1)

                    ## 2. Log values and gradients of the parameters (histogram summary)
                    for tag, value in model.named_parameters():
                        tag = tag.replace('.', '/')
                        logger.histo_summary(tag,
                                             value.detach().cpu().numpy(),
                                             train_batch_num + 1)
                        logger.histo_summary(tag + '/grad',
                                             value.grad.detach().cpu().numpy(),
                                             train_batch_num + 1)

                    train_running_loss = 0.0
                    beginning_time = time.time()
                    train_running_accuracy = 0.0

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # del loss
                # del accuracy
                # del claims_tensors
                # del claims_text
                # del evidences_tensors
                # del evidences_text
                # del labels
                # del y
                # del y_pred
                # torch.cuda.empty_cache()

        print("Running validation...")
        model.eval()
        pred = []
        true = []
        avg_loss = 0.0
        val_running_accuracy = 0.0
        val_running_loss = 0.0
        beginning_time = time.time()
        with experiment.validate():
            for val_batch_num, val_inputs in enumerate(val_dataloader):
                claims_tensors, claims_text, evidences_tensors, evidences_text, labels = val_inputs

                claims_tensors = claims_tensors.cuda()
                evidences_tensors = evidences_tensors.cuda()
                labels = labels.cuda()

                y_pred = model(claims_tensors, evidences_tensors)

                y = (labels)
                # y_pred = parallel.gather(y_pred, 0)
                y_pred = y_pred.squeeze()

                loss = criterion(y_pred, torch.max(y, 1)[1])

                y = y.float()
                binary_y = torch.max(y, 1)[1]
                binary_pred = torch.max(y_pred, 1)[1]
                true.extend(binary_y.tolist())
                pred.extend(binary_pred.tolist())
                accuracy = (binary_y == binary_pred).to("cuda")
                accuracy = accuracy.float().mean()
                val_running_accuracy += accuracy.item()
                val_running_loss += loss.item()
                avg_loss += loss.item()

                if (val_batch_num % OUTPUT_FREQ) == 0 and val_batch_num > 0:
                    elapsed_time = time.time() - beginning_time
                    print(
                        "[{}:{}:{:3f}s] validation loss: {}, accuracy: {}, recall: {}"
                        .format(
                            epoch,
                            val_batch_num / (len(val_dataset) / BATCH_SIZE),
                            elapsed_time, val_running_loss / OUTPUT_FREQ,
                            val_running_accuracy / OUTPUT_FREQ,
                            recall_score(binary_y.cpu().detach().numpy(),
                                         binary_pred.cpu().detach().numpy())))

                    # 1. Log scalar values (scalar summary)
                    info = {'val_accuracy': val_running_accuracy / OUTPUT_FREQ}

                    for tag, value in info.items():
                        experiment.log_metric(tag,
                                              value,
                                              step=val_batch_num * (epoch + 1))
                        logger.scalar_summary(tag, value, val_batch_num + 1)

                    ## 2. Log values and gradients of the parameters (histogram summary)
                    for tag, value in model.named_parameters():
                        tag = tag.replace('.', '/')
                        logger.histo_summary(tag,
                                             value.detach().cpu().numpy(),
                                             val_batch_num + 1)
                        logger.histo_summary(tag + '/grad',
                                             value.grad.detach().cpu().numpy(),
                                             val_batch_num + 1)

                    val_running_accuracy = 0.0
                    val_running_loss = 0.0
                    beginning_time = time.time()

                # del loss
                # del accuracy
                # del claims_tensors
                # del claims_text
                # del evidences_tensors
                # del evidences_text
                # del labels
                # del y
                # del y_pred
                # torch.cuda.empty_cache()

        accuracy = accuracy_score(true, pred)
        print("[{}] mean accuracy: {}, mean loss: {}".format(
            epoch, accuracy, avg_loss / len(val_dataloader)))
        true = np.array(true).astype("int")
        pred = np.array(pred).astype("int")
        print(classification_report(true, pred))

        # track the best (lowest) mean validation loss and checkpoint
        best_loss = torch.tensor(
            min(avg_loss / len(val_dataloader),
                best_loss.cpu().numpy()))
        is_best = bool((avg_loss / len(val_dataloader)) <= best_loss)
        putils.save_checkpoint(
            {
                "epoch": epoch,
                "model": model,
                "best_loss": best_loss
            },
            is_best,
            filename="{}_loss_{}".format(model_checkpoint_dir,
                                         best_loss.cpu().numpy()))
class Logger:
    """
    Logs/plots results to comet.

    Args:
        exp_config (dict): experiment configuration hyperparameters
        model_config (dict): model configuration hyperparameters
        data_config (dict): data configuration hyperparameters
    """
    def __init__(self, exp_config, model_config, data_config):
        self.exp_config = exp_config
        self.experiment = Experiment(**exp_config['comet_config'])
        self.experiment.disable_mp()
        self._log_hyper_params(exp_config, model_config, data_config)
        # epoch counter; incremented in log() after each 'val' pass
        self._epoch = 0

    def _log_hyper_params(self, exp_config, model_config, data_config):
        """
        Log the hyper-parameters for the experiment.

        Args:
            exp_config (dict): experiment configuration hyperparameters
            model_config (dict): model configuration hyperparameters
            data_config (dict): data configuration hyperparameters
        """
        def flatten_arg_dict(arg_dict):
            # flatten nested dicts into {'outer_inner': value} pairs
            flat_dict = {}
            for k, v in arg_dict.items():
                # isinstance (rather than type(v) == dict) also flattens
                # dict subclasses such as config wrappers
                if isinstance(v, dict):
                    flat_v = flatten_arg_dict(v)
                    for kk, vv in flat_v.items():
                        flat_dict[k + '_' + kk] = vv
                else:
                    flat_dict[k] = v
            return flat_dict

        self.experiment.log_parameters(flatten_arg_dict(exp_config))
        self.experiment.log_parameters(flatten_arg_dict(model_config))
        self.experiment.log_parameters(flatten_arg_dict(data_config))

    def log(self, results, train_val):
        """
        Plot the results in comet.

        Args:
            results (dict): dictionary of metrics to plot
            train_val (str): either 'train' or 'val'
        """
        objectives, grads, params, images, metrics = results
        for metric_name, metric in objectives.items():
            self.experiment.log_metric(metric_name + '_' + train_val, metric,
                                       self._epoch)
            print(metric_name, ':', metric.item())
        if train_val == 'train':
            # gradient/parameter statistics only make sense for training
            for grad_metric_name, grad_metric in grads.items():
                self.experiment.log_metric('grads_' + grad_metric_name,
                                           grad_metric, self._epoch)
            for param_name, param in params.items():
                self.experiment.log_metric(param_name + '_' + train_val,
                                           param, self._epoch)
        for image_name, imgs in images.items():
            self.plot_images(imgs, image_name, train_val)
        for metric_name, metric in metrics.items():
            self.experiment.log_metric(metric_name + '_' + train_val, metric,
                                       self._epoch)
        if train_val == 'val':
            # the val pass closes out the epoch
            self._epoch += 1

    def plot_images(self, images, title, train_val):
        """
        Plot a tensor of images.

        Args:
            images (torch.Tensor): a tensor of shape [steps, b, c, h, w]
            title (str): title for the images, e.g. reconstructions
            train_val (str): either 'train' or 'val'
        """
        # add a channel dimension if necessary
        if len(images.shape) == 4:
            s, b, h, w = images.shape
            images = images.view(s, b, 1, h, w)
        s, b, c, h, w = images.shape
        if b > 10:
            # cap at 10 samples per grid to keep figures readable
            images = images[:, :10]
        # swap the steps and batch dimensions
        images = images.transpose(0, 1).contiguous()
        images = images.view(-1, c, h, w)
        # grid = make_grid(images.clamp(0, 1), nrow=s).numpy()
        grid = make_grid(images, nrow=s).numpy()
        if c == 1:
            grid = grid[0]
            cmap = 'gray'
        else:
            grid = np.transpose(grid, (1, 2, 0))
            cmap = None
        plt.imshow(grid, cmap=cmap)
        plt.axis('off')
        self.experiment.log_figure(figure=plt,
                                   figure_name=title + '_' + train_val)
        plt.close()

    def save(self, model):
        """
        Save the model weights in comet.

        Args:
            model (nn.Module): the model to be saved
        """
        if self._epoch % self.exp_config['checkpoint_interval'] == 0:
            print('Checkpointing the model...')
            state_dict = model.state_dict()
            # move weights to CPU so the checkpoint loads anywhere
            cpu_state_dict = {k: v.cpu() for k, v in state_dict.items()}
            # save the state dictionary locally, upload it, then clean up
            ckpt_path = os.path.join('./ckpt_epoch_' + str(self._epoch) +
                                     '.ckpt')
            torch.save(cpu_state_dict, ckpt_path)
            self.experiment.log_asset(ckpt_path)
            os.remove(ckpt_path)
            print('Done.')

    def load(self, model):
        """
        Load the model weights.
        """
        assert self.exp_config[
            'checkpoint_exp_key'] is not None, 'Checkpoint experiment key must be set.'
        print('Loading checkpoint from ' +
              self.exp_config['checkpoint_exp_key'] + '...')
        comet_api = comet_ml.papi.API(
            rest_api_key=self.exp_config['rest_api_key'])
        exp = comet_api.get_experiment(
            workspace=self.exp_config['comet_config']['workspace'],
            project_name=self.exp_config['comet_config']['project_name'],
            experiment=self.exp_config['checkpoint_exp_key'])
        # asset_list = comet_api.get_experiment_asset_list(self.exp_config['checkpoint_exp_key'])
        asset_list = exp.get_asset_list()
        # get most recent checkpoint
        ckpt_assets = [
            asset for asset in asset_list if 'ckpt' in asset['fileName']
        ]
        asset_times = [asset['createdAt'] for asset in ckpt_assets]
        # BUGFIX: index into ckpt_assets (the filtered list) — the original
        # indexed asset_list with a position computed over ckpt_assets, which
        # picks the wrong asset whenever non-checkpoint assets precede the
        # checkpoints in the experiment's asset list.
        asset = ckpt_assets[asset_times.index(max(asset_times))]
        print('Checkpoint Name:', asset['fileName'])
        ckpt = exp.get_asset(asset['assetId'])
        state_dict = torch.load(io.BytesIO(ckpt))
        model.load(state_dict)
        print('Done.')
history['val_accuracy'].append(np.mean(val_accuracy)) # Save checkpoint if checkpointer metric improves checkpointer.save_best(float(np.mean(val_loss)), global_step) # Check to stop training early if early_stopping and earlystopper.check_early_stop( float(np.mean(val_loss))): break # Save training history history_file = os.path.join(output_dir, experiment_name + "_history.npz") save_history(history_file, history) experiment.log_asset(history_file) end_time = time.time() print("Training took " + str(('%.3f' % (end_time - start_time))) + " seconds for " + str(num_epochs) + " epochs") print("------------------------------------") print("Saving model...") checkpointer.save(global_step) experiment.log_asset_folder(checkpoint_dir) if testing: # Test the model print("------------------------------------") print("Testing model...")
float(acc_10) / float(total_batches)) experiment.log_metric( 'Acc@20', float(acc_20) / float(total_batches)) experiment.log_metric( 'Acc@50', float(acc_50) / float(total_batches)) running_loss = 0.0 acc_1 = 0.0 acc_5 = 0.0 acc_10 = 0.0 acc_20 = 0.0 acc_50 = 0.0 total_batches = 0.0 print("Saving Epoch") torch.save({'model_state_dict': model.state_dict()}, "./models/" + arch_name.upper() + ".pt") experiment.log_asset("./models/" + arch_name.upper() + ".pt") if epoch % 5 == 0: val_loss_old = val_loss val_loss = val_model(model, val_loader) if val_loss - val_loss_old < 1e-3: scheduler_step.step() step_count += 1 print("End here")
"--git_hash", type=str, default="no-git-hash", help="output of git log --pretty=format:'%h' -n 1", ) opts = parser.parse_args() cudnn.benchmark = True # Load experiment setting config = get_config(opts.config) max_iter = config["max_iter"] display_size = config["display_size"] config["vgg_model_path"] = opts.output_path if comet_exp is not None: comet_exp.log_asset(file_data=opts.config, file_name=Path(opts.config)) comet_exp.log_parameter("git_hash", opts.git_hash) comet_exp.log_parameters(flatten_opts(config)) # Setup model and data loader if opts.trainer == "MUNIT": trainer = MUNIT_Trainer(config) elif opts.trainer == "UNIT": trainer = UNIT_Trainer(config) else: sys.exit("Only support MUNIT|UNIT") trainer.cuda() train_loader_a, train_loader_b, test_loader_a, test_loader_b = get_all_data_loaders( config) test_loader_a_w_mask = get_data_loader_mask_and_im(
def train_model(
    xpath,
    ypath,
    xvalidpath,
    yvalidpath,
    xtestpath,
    ytestpath,
    modelpath,
    models,
    scaler,
):
    """Calibrate a voting ensemble from pre-trained models and evaluate it.

    Builds an ``MLOxidationStates`` object from the train/validation data,
    loads the pre-trained base estimators from ``models`` (joblib files),
    calibrates a voting classifier on the validation split, and evaluates it
    on the test, train and validation splits via ``model_eval``.

    Args:
        xpath / ypath: paths to training features / labels (.npy).
        xvalidpath / yvalidpath: paths to validation features / labels (.npy).
        xtestpath / ytestpath: paths to test features / labels (.npy).
        modelpath: output directory for model artifacts; created if missing.
        models: iterable of joblib file paths of pre-trained estimators.
        scaler: scaler identifier forwarded to ``MLOxidationStates``.

    Side effects:
        Creates ``modelpath`` if absent, starts a Comet.ml experiment, logs all
        six data files as assets, and logs evaluation metrics via
        ``model_eval``. Scores are not returned; callers rely on the logging.
    """
    if not os.path.exists(os.path.abspath(modelpath)):
        os.mkdir(os.path.abspath(modelpath))

    experiment = Experiment(project_name="mof-oxidation-states")

    # Log every data file once as a Comet asset and once to the train log,
    # instead of twelve copy-pasted calls.
    labeled_paths = (
        ("Train X", xpath),
        ("Train y", ypath),
        ("Validation X", xvalidpath),
        ("Validation y", yvalidpath),
        ("Test X", xtestpath),
        ("Test y", ytestpath),
    )
    for _, path in labeled_paths:
        experiment.log_asset(path)
    for label, path in labeled_paths:
        trainlogger.info("{}: {}".format(label, path))

    train_stem = Path(xpath).stem

    ml_object = MLOxidationStates.from_x_y_paths(
        xpath=os.path.abspath(xpath),
        ypath=os.path.abspath(ypath),
        xvalidpath=os.path.abspath(xvalidpath),
        yvalidpath=os.path.abspath(yvalidpath),
        modelpath=os.path.abspath(modelpath),
        scaler=scaler,
        n=10,  # was int(10): 10 is already an int
        voting="soft",
        # NOTE(review): "istonic" looks like a typo for "isotonic", but the
        # string may be matched literally downstream -- confirm before fixing.
        calibrate="istonic",
        experiment=experiment,
    )

    # Held-out test set, scaled with the scaler fitted on the training data.
    X_test = ml_object.scaler.transform(np.load(xtestpath))
    y_test = np.load(ytestpath)

    # Load the pre-trained base estimators as (name, estimator) pairs.
    models_loaded = [(Path(m).stem, joblib.load(m)) for m in models]

    votingclassifier, _ = ml_object.calibrate_ensemble(
        models_loaded,
        ml_object.x_valid,
        ml_object.y_valid,
        ml_object.experiment,
        ml_object.voting,
        ml_object.calibrate,
    )
    votingclassifier_tuple = [("votingclassifier_" + train_stem, votingclassifier)]

    # model_eval logs metrics to the experiment as a side effect; the returned
    # scores are bound for readability (was: "cores_test" -- typo fixed).
    scores_test = ml_object.model_eval(
        votingclassifier_tuple, X_test, y_test, experiment, "test", modelpath
    )
    scores_train = ml_object.model_eval(
        votingclassifier_tuple, ml_object.x, ml_object.y, experiment, "train", modelpath
    )
    scores_valid = ml_object.model_eval(
        votingclassifier_tuple,
        ml_object.x_valid,
        ml_object.y_valid,
        experiment,
        "valid",
        modelpath,
    )
} def getBestModelfromTrials(trials): valid_trial_list = [ trial for trial in trials if STATUS_OK == trial['result']['status'] ] losses = [float(trial['result']['loss']) for trial in valid_trial_list] index_having_minumum_loss = np.argmin(losses) best_trial_obj = valid_trial_list[index_having_minumum_loss] return best_trial_obj['result']['mlp'] if __name__ == '__main__': experiment = Experiment(project_name='color-ml') with experiment.train(): trials = Trials() best = fmin(keras_fmin_fnct, get_space(), algo=tpe.suggest, max_evals=150, trials=trials) X_train, Y_train, X_test, Y_test = data() print('Evalutation of best performing model:') joblib.dump(best, 'best.joblib') model = getBestModelfromTrials(trials) joblib.dump(model, 'best_model.joblib') experiment.log_asset('best.joblib') experiment.log_asset('best_model.joblib')
options['clip_gradient_norm']) optimizer.step() N, C = output_fw.shape running_loss += total_loss.item() #torch.cuda.empty_cache() div = len(train_dl) experiment.log_metric('Epoch', epoch) experiment.log_metric('Running_loss', running_loss / float(div)) experiment.log_metric('Temporal_loss', temporal_loss / float(div)) experiment.log_metric('Caption_loss', caption_loss / float(div)) torch.save({ 'model_state_dict': model.state_dict(), }, "full_caption_C3D_16_" + str(epoch) + ".pt") experiment.log_asset("full_caption_C3D_16_" + str(epoch) + ".pt") if epoch % 5 == 0: val_loss_old = val_loss val_loss = val_model(model, val_loader) if val_loss - val_loss_old < 1e-4: step_count += 1