def main():
    """Entry point: parse args, build data/model/trainer, then train and predict.

    Reads the JSON config path from the command line, wires together the
    data loader, model, Comet experiment and trainer, then runs training
    followed by prediction on the test split.
    """
    # Capture the config path from the run arguments, then process the
    # JSON configuration file.
    try:
        args = get_args()
        config = process_config(args.config)
    except Exception:
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; narrow to Exception so Ctrl-C still works.
        print("missing or invalid arguments")
        exit(0)

    print('Create the data generator.')
    data_loader = RobertaDataLoader(config)

    print('Create the model.')
    model = RobertaModel(config)

    print('Creating the Experiment')
    experiment = Experiment(api_key=config.exp.comet_api_key,
                            project_name=config.exp.name,
                            auto_output_logging="simple")

    print('Create the trainer')
    trainer = RobertaTrainer(model.model, experiment, config,
                             data_loader.get_train_data())

    with experiment.train():
        print('Start training the model.')
        trainer.train()
        model.save()

    with experiment.test():
        print('Predicting the testing data')
        trainer.predict(data_loader.get_test_data(),
                        data_loader.get_tokenizer())
def train(X, y, outdir, max_feat=30):
    """Run sequential forward feature selection over a GP regressor.

    Fits a SequentialFeatureSelector (up to ``max_feat`` features) wrapping
    a RationalQuadratic + white-noise Gaussian process, logs the run to
    Comet, and pickles the fitted selector to ``<outdir>/sfs.joblib``.
    """
    comet_exp = Experiment(project_name='color-ml')
    with comet_exp.train():
        # Smooth RQ kernel plus a small white-noise term for observation noise.
        rq_kernel = RationalQuadratic(length_scale=0.1,
                                      length_scale_bounds=(1e-4, 0.5))
        noise_kernel = WhiteKernel(0.01, (1e-3, 0.5e-1))
        regressor = GaussianProcessRegressor(kernel=rq_kernel + noise_kernel,
                                             n_restarts_optimizer=15,
                                             normalize_y=True)
        selector = SFS(
            regressor,
            k_features=max_feat,
            forward=True,
            floating=False,
            scoring='neg_mean_squared_error',
            cv=5,
            verbose=2,
            n_jobs=-1,
        )
        selector = selector.fit(X, y)
        joblib.dump(selector, os.path.join(outdir, 'sfs.joblib'))
    return selector
def fit_validate(exp_params, k, data_path, write_path, others=None, custom_tag=''):
    """Fit model and compute metrics on train and validation set.

    Intended for hyperparameter search. Only logs final metrics and scatter
    plot of final embedding.

    Args:
        exp_params(dict): Parameter dict. Should at least have keys
            model_name, dataset_name & random_state. Other keys are assumed
            to be model parameters.
        k(int): Fold identifier.
        data_path(str): Data directory.
        write_path(str): Where to write temp files.
        others(dict): Other things to log to Comet experiment.
        custom_tag(str): Custom tag for comet experiment.
    """
    # Set up the Comet experiment and record run metadata.
    comet = Experiment(parse_args=False)
    comet.disable_mp()
    custom_tag += '_validate'
    comet.add_tag(custom_tag)
    comet.log_parameters(exp_params)
    if others is not None:
        comet.log_others(others)

    model_name, dataset_name, random_state, model_params = parse_params(exp_params)

    # Fetch the training split, then carve out a fold-specific validation set.
    full_train = getattr(grae.data, dataset_name)(split='train',
                                                  random_state=random_state,
                                                  data_path=data_path)
    data_train, data_val = full_train.validation_split(random_state=FOLD_SEEDS[k])

    # Instantiate the model with fold-seeded randomness.
    model = getattr(grae.models, model_name)(random_state=FOLD_SEEDS[k],
                                             **model_params)
    model.write_path = write_path
    model.data_val = data_val

    with comet.train():
        model.fit(data_train)

        # Attach the experiment so plotting logs the figure to Comet.
        model.comet_exp = comet
        model.plot(data_train, data_val, title=f'{model_name} : {dataset_name}')

        # Probe embedding quality on the training split.
        prober = EmbeddingProber()
        prober.fit(model=model, dataset=data_train, mse_only=True)
        train_z, train_metrics = prober.score(data_train, is_train=True)
        comet.log_metrics(train_metrics)

    with comet.validate():
        val_z, val_metrics = prober.score(data_val)
        comet.log_metrics(val_metrics)

    # Marker so downstream tooling can filter successful runs.
    comet.log_other('success', 1)
def train(self):
    """Train the model, logging to Comet when configured.

    Creates the run directory, then either trains inside a Comet experiment
    context (when use_comet and all credentials are set) or falls back to
    local TensorBoard/checkpoint callbacks. Afterwards, exports weights,
    info, metrics and tokenizer, and optionally zips the model.

    Raises:
        Exception: if use_comet is set but api_key/project_name/workspace
            are missing.
    """
    os.mkdir(self.paths['path'])
    if self.use_comet and self.api_key and self.project_name and self.workspace:
        experiment = Experiment(api_key=self.api_key,
                                project_name=self.project_name,
                                workspace=self.workspace)
        experiment.log_dataset_hash(self.train_dataset)
        experiment.add_tags([
            str(self.architecture), "text_generation",
            f"nb_labels_{self.number_labels}"
        ])
        with experiment.train():
            # NOTE(review): this branch calls self.fit_dataset(...) without
            # callbacks while the non-Comet branch calls
            # self.model.fit_dataset(...) with callbacks — confirm the
            # asymmetry is intentional.
            hist = self.fit_dataset(self.train_dataset, self.val_dataset,
                                    self.epochs)
        experiment.end()
    elif self.use_comet:
        raise Exception(
            "Please provide an api_key, project_name and workspace for comet_ml"
        )
    else:
        callbacks = self.callback_func(
            tensorboard_dir=self.paths['tensorboard_path'],
            checkpoint_path=self.paths['checkpoint_path'])
        hist = self.model.fit_dataset(self.train_dataset, self.val_dataset,
                                      self.epochs, callbacks)
    # Persist everything needed to reload/serve the trained model.
    self.metrics = get_metrics(hist, "sparse_categorical_accuracy")
    self.export_weights(self.model)
    self.export_info(self.model_info)
    self.export_metrics(self.metrics)
    self.export_tokenizer(self.tokenizer)
    if self.do_zip_model:
        self.zip_model()
def train(train_data, model, optimizer, experiment: Experiment):
    """Run one training epoch of the autoencoder and return the mean L1 loss.

    Logs the per-mini-batch loss (global TRAIN_MINI_BATCH counter) and the
    epoch-average loss (global EPOCH counter) to Comet.
    """
    global TRAIN_MINI_BATCH
    global EPOCH
    model.train()
    model.cuda()
    num_batches = len(train_data)
    running_total = 0.0
    criterion = nn.L1Loss()
    with experiment.train():
        for x, in tqdm(train_data):
            x = x.cuda()
            optimizer.zero_grad()
            reconstruction, _ = model(x)
            step_loss = criterion(reconstruction, x)
            experiment.log_metric("mini-batch loss", step_loss.item(),
                                  step=TRAIN_MINI_BATCH)
            TRAIN_MINI_BATCH += 1
            step_loss.backward()
            optimizer.step()
            running_total += step_loss.item()
    average_loss = running_total / num_batches
    experiment.log_metric("batch loss", average_loss, step=EPOCH)
    return average_loss
def main():
    """Learning-curve experiment: fit the median regressor at several
    training-set sizes (10 repeats each), logging each run to Comet and
    collecting the metrics into a single CSV that is attached as an asset.
    """
    run_stamp = time.strftime('run_%Y_%m_%d_%H_%M_%s')
    collected_metrics = []
    for ts_size in [3000, 5000, 5600]:
        for iteration in range(10):
            _, _, X_train, X_test, y_train, y_test, _ = process_data(
                size=ts_size)
            experiment = Experiment(api_key=os.environ['COMET_API_KEY'],
                                    project_name='color-ml')
            experiment.log_parameters(PARAMETERS_MEDIAN)
            with experiment.train():
                regressor_median = fit(X_train, y_train)
                metrics_dict = get_metrics_dict(regressor_median, X_test,
                                                y_test, experiment)
            # Tag each run with its position in the learning-curve grid.
            metrics_dict['iteration'] = iteration
            metrics_dict['ts_size'] = ts_size
            collected_metrics.append(metrics_dict)
    frame = pd.DataFrame(collected_metrics)
    frame.to_csv('learningurve_' + run_stamp + '.csv')
    experiment.log_asset('learningurve_' + run_stamp + '.csv')
def log_metrics(metrics: dict, comet_logger: Experiment, epoch: int,
                context_val: bool):
    """Log a metrics dict to Comet under the validate or train context.

    When context_val is True the metrics are recorded inside the validate()
    context (so Comet prefixes them as validation metrics); otherwise they
    are recorded inside the train() context.
    """
    context = comet_logger.validate if context_val else comet_logger.train
    with context():
        comet_logger.log_metrics(metrics, epoch=epoch)
def log_simclr_images(img1: Tensor, img2: Tensor, context_val: bool,
                      comet_logger: Experiment):
    """Plot the first SimCLR image pair of the batch to Comet.

    Uses the validate() context when context_val is True, otherwise the
    train() context, so the figure is filed under the right phase.
    """
    context = comet_logger.validate if context_val else comet_logger.train
    with context():
        plot_simclr_images(img1.data[0].cpu(), img2.data[0].cpu(),
                           comet_logger)
def _train_with_comet(self, train_dataset, val_dataset):
    """Fit the datasets inside a Comet experiment context.

    Creates the experiment from the instance credentials, hashes and tags
    the dataset, trains inside the train() context, ends the experiment
    and returns the fit history.
    """
    comet_exp = Experiment(api_key=self.api_key,
                           project_name=self.project_name,
                           workspace=self.workspace)
    comet_exp.log_dataset_hash(train_dataset)
    comet_exp.add_tags([
        str(self.architecture),
        self.name,
        f"nb_labels_{self.label_encoder_classes_number}",
    ])
    with comet_exp.train():
        history = self.fit_dataset(train_dataset, val_dataset)
    comet_exp.end()
    return history
def train(hyper_params): mnist = get_data() # Get graph definition, tensors and ops train_step, cross_entropy, accuracy, x, y, y_ = build_model_graph( hyper_params) experiment = Experiment(project_name="tf") experiment.log_parameters(hyper_params) experiment.log_dataset_hash(mnist) with tf.Session() as sess: with experiment.train(): sess.run(tf.global_variables_initializer()) experiment.set_model_graph(sess.graph) for i in range(hyper_params["steps"]): batch = mnist.train.next_batch(hyper_params["batch_size"]) experiment.set_step(i) # Compute train accuracy every 10 steps if i % 10 == 0: train_accuracy = accuracy.eval(feed_dict={ x: batch[0], y_: batch[1] }) print('step %d, training accuracy %g' % (i, train_accuracy)) experiment.log_metric("accuracy", train_accuracy, step=i) # Update weights (back propagation) _, loss_val = sess.run([train_step, cross_entropy], feed_dict={ x: batch[0], y_: batch[1] }) experiment.log_metric("loss", loss_val, step=i) ### Finished Training ### with experiment.test(): # Compute test accuracy acc = accuracy.eval(feed_dict={ x: mnist.test.images, y_: mnist.test.labels }) experiment.log_metric("accuracy", acc) print('test accuracy %g' % acc)
def log_hybrid2_images(
    img1: Tensor,
    img2: Tensor,
    params: Dict[str, Tensor],
    context_val: bool,
    comet_logger: Experiment,
):
    """Plot the first hybrid2 image pair plus per-key parameters to Comet.

    Uses the validate() context when context_val is True, else train().
    """
    # Move the first element of every parameter tensor to the CPU.
    first_params = {name: tensor.data[0].cpu()
                    for name, tensor in params.items()}
    context = comet_logger.validate if context_val else comet_logger.train
    with context():
        plot_hybrid2_images(img1.data[0].cpu(), img2.data[0].cpu(),
                            first_params, comet_logger)
def fit(self, experiment: Experiment, cuda: bool = True) -> dict:
    """Train the model, logging the per-epoch average loss to Comet.

    Args:
        experiment: Comet experiment used to log hyper-parameters and
            the "epoch_loss" metric.
        cuda: when True, move the model to the GPU before training.

    Returns:
        dict with keys "loss" (best average epoch loss), "epoch" (index of
        the best epoch, -1 if training never ran), "model" (kept for
        backward compatibility, always None) and "state_dict" (deep copy
        of the best weights, or None if no epoch completed).
    """
    experiment.log_parameters(self.hparams)  # log hyper parameters
    # noinspection PyUnresolvedReferences
    torch.backends.cudnn.benchmark = True  # enable cudnn benchmark mode for better performance
    self.is_cuda = cuda
    if self.is_cuda:
        self.cuda()  # move model to cuda
    train_set = self.prepare_dataset()
    self.criterion = self.prepare_criterion()
    optimizer = self.prepare_optimizers()
    # float("inf") instead of the old magic sentinel 999, so the first
    # epoch always registers as an improvement even when losses exceed 999.
    best_model = {"loss": float("inf"), "epoch": -1, "model": None,
                  "state_dict": None}
    self.train()  # set the model to train mode
    with experiment.train():
        for e in range(self.epoch):
            self.before_train_epoch()
            batch_loss = 0.0
            for batch_idx, batch in enumerate(train_set):
                optimizer.zero_grad()
                loss = self.train_step(batch)
                batch_loss += loss.item()
                loss.backward()
                # Clip gradients to stabilise training.
                _ = torch.nn.utils.clip_grad_norm_(self.parameters(),
                                                   self.gradients_norm)
                optimizer.step()
            avg_loss = batch_loss / len(train_set)
            experiment.log_metric("epoch_loss", avg_loss, step=e)
            if avg_loss < best_model["loss"]:
                best_model["loss"] = avg_loss
                best_model["epoch"] = e
                # deepcopy detaches the weights from further updates.
                best_model["state_dict"] = deepcopy(self.state_dict())
    return best_model
class Logger(object):
    """TensorBoard plus optional Comet logging for one (dataset, model) run."""

    def __init__(self, dataset_name, model_name):
        self.model_name = model_name
        self.project_name = "%s-%s" % (dataset_name, self.model_name)
        self.logdir = os.path.join(hp.logdir, self.project_name)
        self.writer = SummaryWriter(log_dir=self.logdir)
        # Comet experiment, only created when an API key is configured in hp.
        self.experiment = None
        if hp.comet_ml_api_key is not None:
            self.experiment = Experiment(api_key=hp.comet_ml_api_key,
                                         project_name=self.project_name,
                                         log_code=False)
            # Log every public attribute of hp as a hyper-parameter.
            # NOTE(review): log_multiple_params/log_multiple_metrics are
            # legacy comet_ml API names (newer releases use log_parameters /
            # log_metrics) — confirm the pinned comet_ml version.
            self.experiment.log_multiple_params(
                dict((name, getattr(hp, name)) for name in dir(hp)
                     if not name.startswith('__')))

    def log_step(self, phase, step, loss_dict, image_dict):
        """Log per-step scalars every 50 steps and images every 1000 steps
        (train phase only)."""
        if phase == 'train':
            if step % 50 == 0:
                if self.experiment is not None:
                    with self.experiment.train():
                        self.experiment.log_multiple_metrics(loss_dict,
                                                             step=step)
                for key in sorted(loss_dict):
                    self.writer.add_scalar('%s-step/%s' % (phase, key),
                                           loss_dict[key], step)
            if step % 1000 == 0:
                for key in sorted(image_dict):
                    self.writer.add_image('%s/%s' % (self.model_name, key),
                                          image_dict[key], step)

    def log_epoch(self, phase, step, loss_dict):
        """Log per-epoch scalars; mirror validation metrics to Comet."""
        for key in sorted(loss_dict):
            self.writer.add_scalar('%s/%s' % (phase, key), loss_dict[key],
                                   step)
        if phase == 'valid':
            if self.experiment is not None:
                with self.experiment.validate():
                    self.experiment.log_multiple_metrics(loss_dict, step=step)
def train(train_data, model, optimizer, experiment: Experiment):
    """Run one epoch of binary-classification training.

    Logs the per-mini-batch loss to Comet (global TRAIN_MINI_BATCH counter)
    and accumulates predictions/targets to compute epoch-level metrics.

    Returns:
        (average_loss, accuracy, f1, precision, recall) over the epoch.
    """
    global TRAIN_MINI_BATCH
    model.train()
    model.cuda()
    batches = len(train_data)
    total_loss = 0
    predictions = []
    ground_truth = []
    with experiment.train():
        for x, y in tqdm(train_data):
            x = x.cuda()
            # BUG FIX: the original read `y.cuda().float().view` — binding
            # the `view` METHOD to y instead of calling it, so BCE received
            # a method object. Flatten targets to 1-D instead.
            # TODO(review): confirm `prediction` is also 1-D; use
            # .view(-1, 1) if the model outputs shape (N, 1).
            y = y.cuda().float().view(-1)
            optimizer.zero_grad()
            prediction = model(x)
            loss = F.binary_cross_entropy(prediction, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            # Threshold probabilities into hard labels for the metrics.
            prediction = prediction >= 0.5
            predictions.append(prediction.detach().cpu().numpy())
            ground_truth.append(y.detach().cpu().numpy())
            experiment.log_metric("Mini batch loss", loss.item(),
                                  step=TRAIN_MINI_BATCH)
            TRAIN_MINI_BATCH += 1
    print(total_loss)
    average_loss = total_loss / batches
    predictions = np.concatenate(predictions)
    ground_truth = np.concatenate(ground_truth)
    accuracy = accuracy_score(ground_truth, predictions)
    f1score = f1_score(ground_truth, predictions)
    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    return average_loss, accuracy, f1score, precision, recall
def log_pairwise_images(
    img1: Tensor,
    img2: Tensor,
    gt_pred: Dict[str, Tensor],
    context_val: bool,
    comet_logger: Experiment,
):
    """Plot the first (ground-truth, prediction) pair per key to Comet.

    Uses the validate() context when context_val is True, else train().
    """
    # Take the first element of each pair and move it to host numpy arrays.
    first_pairs = {
        key: [pair[0].data[0].cpu().numpy(), pair[1].data[0].cpu().numpy()]
        for key, pair in gt_pred.items()
    }
    context = comet_logger.validate if context_val else comet_logger.train
    with context():
        plot_pairwise_images(img1.data[0].cpu(), img2.data[0].cpu(),
                             first_pairs, comet_logger)
def run(experiment: Experiment, params: argparse.Namespace):
    """Train an original-flavour DQN on the coloring environment.

    Seeds all RNGs, wraps the env in a Monitor (so rewards land in
    log_dir/monitor.csv), and trains inside the Comet train() context with
    a best-reward checkpoint callback.
    """
    sb3_utils.set_random_seed(params.seed, using_cuda=use_cuda)
    env = helper.make_env(params, 'env')
    # Logs will be saved in log_dir/monitor.csv
    env = Monitor(env)
    with experiment.train():
        best_reward_callback = SaveOnBestTrainingRewardCallback(
            experiment, check_freq=1000)
        # Deactivate all the DQN extensions to have the original version.
        # In practice, it is recommended to have them activated.
        agent = DQN(CnnPolicy,
                    env,
                    learning_rate=params.learning_rate,
                    gamma=params.gamma,
                    seed=params.seed,
                    max_grad_norm=params.max_grad_norm,
                    verbose=1,
                    device=device,
                    policy_kwargs={'features_extractor_class': ColoringCNN})
        agent.learn(total_timesteps=params.max_ts,
                    callback=best_reward_callback)
def log_image(
    prediction: Tensor,
    y: Tensor,
    x: Tensor,
    gpu: bool,
    context_val: bool,
    comet_logger: Experiment,
):
    """Plot predicted vs. true labels for the first sample of the batch.

    Moves the tensors to host numpy arrays (via .cpu() when they live on
    the GPU) and plots under the validate() or train() Comet context.
    """
    if gpu:
        # Tensors live on the GPU: copy to host memory first.
        pred_label = prediction.data[0].cpu().numpy()
        true_label = y.data[0].cpu().detach().numpy()
    else:
        pred_label = prediction[0].detach().numpy()
        true_label = y[0].detach().numpy()
    context = comet_logger.validate if context_val else comet_logger.train
    with context():
        plot_truth_vs_prediction(pred_label, true_label, x.data[0].cpu(),
                                 comet_logger)
def calibrate_ensemble(
    models: list,
    X_valid: np.array,
    y_valid: np.array,
    experiment: Experiment,
    voting: str = "soft",
    calibrate: str = "isotonic",
) -> Tuple[VotingClassifier, float]:
    """Collects base models into a voting classifier and calibrates the
    base estimators' probabilities on the validation set.

    Arguments:
        models {list} -- list of optimized base models
        X_valid {np.array} -- feature matrix
        y_valid {np.array} -- label vector
        experiment {Experiment} -- Comet experiment; calibration runs inside
            its train() context

    Keyword Arguments:
        voting {str} -- voting mechanism (hard or soft) (default: {"soft"})
        calibrate {str} -- probability calibration method (none, isotonic,
            sigmoid) (default: {"isotonic"})

    Returns:
        Tuple[VotingClassifier, float] -- the voting classifier with
            calibrated base estimators, and elapsed CPU time in seconds
    """
    trainlogger.debug("calibrating and building ensemble model")
    startime = time.process_time()
    assert len(X_valid) == len(y_valid)
    # calibrate the base estimators
    with experiment.train():
        vc = VotingClassifier(models, voting=voting)
        trainlogger.debug("now, calibrating the base base estimators")
        # Private sklearn-style API: calibrates each base estimator in place.
        vc._calibrate_base_estimators(calibrate, X_valid, y_valid)  # pylint:disable=protected-access
    endtime = time.process_time()
    elapsed_time = endtime - startime
    return vc, elapsed_time
def main(cmd=None, stdout=True):
    """Active-learning dialogue training loop.

    Optionally trains/loads a seed model, then repeatedly: observes the
    environment, solicits labels according to the chosen strategy, steps
    the environment and refits. Finishes with a final fit and logs all
    metrics to a Comet experiment.

    Args:
        cmd: optional argv list forwarded to get_args.
        stdout: forwarded to get_args (controls arg echoing).
    """
    args = get_args(cmd, stdout)
    # Unique run identifier encoding all the salient hyper-parameters.
    model_id = "seed_{}_strat_{}_noise_fn_{}_noise_fp_{}_num_passes_{}_seed_size_{}_model_{}_batch_size_{}_gamma_{}_label_budget_{}_epochs_{}".format(
        args.seed, args.strategy, args.noise_fn, args.noise_fp,
        args.num_passes, args.seed_size, args.model, args.batch_size,
        args.gamma, args.label_budget, args.epochs)
    logging.basicConfig(
        filename="{}/{}.txt".format(args.dout, model_id),
        format='%(asctime)s %(levelname)-8s %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        level=logging.INFO)
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

    # The Comet experiment doubles as the metrics logger for the run.
    logger = Experiment(comet_ml_key, project_name="ActiveDialogue")
    logger.log_parameters(vars(args))

    # NOTE(review): model_arch stays unbound if args.model is neither
    # "glad" nor "gce" — later use would raise NameError.
    if args.model == "glad":
        model_arch = GLAD
    elif args.model == "gce":
        model_arch = GCE

    env = PartialEnv(load_dataset, model_arch, args)

    if args.seed_size:
        with logger.train():
            if not env.load('seed'):
                logging.info("No loaded seed. Training now.")
                env.seed_fit(args.seed_epochs, prefix="seed")
                logging.info("Seed completed.")
            else:
                logging.info("Loaded seed.")
                if args.force_seed:
                    logging.info("Training seed regardless.")
                    env.seed_fit(args.seed_epochs, prefix="seed")
        env.load('seed')

    # Resolve the acquisition strategy; "aggressive"/"random"/"passive"
    # are handled inline inside the loop below.
    use_strategy = False
    if args.strategy == "entropy":
        use_strategy = True
        strategy = partial_entropy
    elif args.strategy == "bald":
        use_strategy = True
        strategy = partial_bald

    if use_strategy:
        # Wrap the raw scoring function in a thresholding policy.
        if args.threshold_strategy == "fixed":
            strategy = FixedThresholdStrategy(strategy, args, True)
        elif args.threshold_strategy == "variable":
            strategy = VariableThresholdStrategy(strategy, args, True)
        elif args.threshold_strategy == "randomvariable":
            strategy = StochasticVariableThresholdStrategy(
                strategy, args, True)

    ended = False
    i = 0
    initial_metrics = env.metrics(True)
    logger.log_current_epoch(i)
    logging.info("Initial metrics: {}".format(initial_metrics))
    for k, v in initial_metrics.items():
        logger.log_metric(k, v)

    with logger.train():
        while not ended:
            i += 1
            # Observe environment state
            logger.log_current_epoch(i)
            if env.can_label:
                # Obtain label request from strategy; BALD needs 20 forward
                # passes, the others just one.
                obs, preds = env.observe(20 if args.strategy == "bald" else 1)
                if args.strategy != "bald":
                    preds = preds[0]
                if args.strategy == "aggressive":
                    label_request = aggressive(preds)
                elif args.strategy == "random":
                    label_request = random(preds)
                elif args.strategy == "passive":
                    label_request = passive(preds)
                elif use_strategy:
                    label_request = strategy.observe(preds)
                else:
                    raise ValueError()

                # Label solicitation
                labeled = env.label(label_request)
                if use_strategy:
                    # Feed back (requested, total possible) label counts.
                    strategy.update(
                        sum([
                            np.sum(s.flatten())
                            for s in label_request.values()
                        ]),
                        sum([
                            np.sum(np.ones_like(s).flatten())
                            for s in label_request.values()
                        ]))
            else:
                # Label budget exhausted — stop the acquisition loop.
                break

            # Environment stepping
            ended = env.step()
            # Fit every al_batch of items
            best = env.fit(prefix=model_id, reset_model=True)
            for k, v in best.items():
                logger.log_metric(k, v)
            env.load(prefix=model_id)

    # Final fit
    final_metrics = env.fit(epochs=args.final_epochs,
                            prefix="final_fit_" + model_id,
                            reset_model=True)
    for k, v in final_metrics.items():
        logger.log_metric("Final " + k, v)
        logging.info("Final " + k + ": " + str(v))
    logging.info("Run finished.")
def main():
    """VQA training/evaluation entry point.

    Builds options (optionally merged from a YAML file), datasets, model,
    criterion and optimizer; resumes from a checkpoint or creates the log
    directory; then either evaluates (args.evaluate) or trains inside a
    Comet experiment, checkpointing and saving results per epoch.
    """
    global args, best_acc1
    args = parser.parse_args()

    #########################################################################
    # Create options
    #########################################################################
    # Pick the question-feature cache matching the BERT variant.
    if args.bert_model == "bert-base-uncased":
        question_features_path = BASE_EXTRACTED_QUES_FEATURES_PATH
    elif args.bert_model == "bert-base-multilingual-cased":
        question_features_path = CASED_EXTRACTED_QUES_FEATURES_PATH
    else:
        question_features_path = EXTRACTED_QUES_FEATURES_PATH

    options = {
        'vqa': {
            'trainsplit': args.vqa_trainsplit
        },
        'logs': {
            'dir_logs': args.dir_logs
        },
        'model': {
            'arch': args.arch,
            'seq2vec': {
                'type': args.st_type,
                'dropout': args.st_dropout,
                'fixed_emb': args.st_fixed_emb
            }
        },
        'optim': {
            'lr': args.learning_rate,
            'batch_size': args.batch_size,
            'epochs': args.epochs
        }
    }
    if args.path_opt is not None:
        # YAML options override the command-line defaults.
        with open(args.path_opt, 'r') as handle:
            options_yaml = yaml.load(handle, Loader=yaml.FullLoader)
        options = utils.update_values(options, options_yaml)
    print('## args')
    pprint(vars(args))
    print('## options')
    pprint(options)
    if args.help_opt:
        return
    # Set datasets options
    if 'vgenome' not in options:
        options['vgenome'] = None

    #########################################################################
    # Create needed datasets
    #########################################################################
    trainset = datasets.factory_VQA(options['vqa']['trainsplit'],
                                    options['vqa'], options['coco'],
                                    options['vgenome'])
    train_loader = trainset.data_loader(
        batch_size=options['optim']['batch_size'],
        num_workers=args.workers,
        shuffle=True)
    if options['vqa']['trainsplit'] == 'train':
        valset = datasets.factory_VQA('val', options['vqa'], options['coco'])
        val_loader = valset.data_loader(
            batch_size=options['optim']['batch_size'],
            num_workers=args.workers)
    if options['vqa']['trainsplit'] == 'trainval' or args.evaluate:
        testset = datasets.factory_VQA('test', options['vqa'],
                                       options['coco'])
        test_loader = testset.data_loader(
            batch_size=options['optim']['batch_size'],
            num_workers=args.workers)

    #########################################################################
    # Create model, criterion and optimizer
    #########################################################################
    model = models.factory(options['model'],
                           trainset.vocab_words(),
                           trainset.vocab_answers(),
                           cuda=True,
                           data_parallel=True)
    criterion = criterions.factory(options['vqa'], cuda=True)
    # Only optimize parameters that require gradients.
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, model.parameters()),
        options['optim']['lr'])

    #########################################################################
    # args.resume: resume from a checkpoint OR create logs directory
    #########################################################################
    exp_logger = None
    if args.resume:
        args.start_epoch, best_acc1, exp_logger = load_checkpoint(
            model.module, optimizer,
            os.path.join(options['logs']['dir_logs'], args.resume))
    else:
        # Or create logs directory
        if os.path.isdir(options['logs']['dir_logs']):
            # BUG FIX: default=False was previously passed to str.format
            # (where extra kwargs are silently ignored) instead of
            # click.confirm, so the prompt never defaulted to "no".
            if click.confirm('Logs directory already exists in {}. Erase?'
                             .format(options['logs']['dir_logs']),
                             default=False):
                os.system('rm -r ' + options['logs']['dir_logs'])
            else:
                return
        os.system('mkdir -p ' + options['logs']['dir_logs'])
        # Persist the effective options and args next to the logs.
        path_new_opt = os.path.join(options['logs']['dir_logs'],
                                    os.path.basename(args.path_opt))
        path_args = os.path.join(options['logs']['dir_logs'], 'args.yaml')
        with open(path_new_opt, 'w') as f:
            yaml.dump(options, f, default_flow_style=False)
        with open(path_args, 'w') as f:
            yaml.dump(vars(args), f, default_flow_style=False)
    if exp_logger is None:
        # Set loggers
        exp_name = os.path.basename(
            options['logs']['dir_logs'])  # add timestamp
        exp_logger = logger.Experiment(exp_name, options)
        exp_logger.add_meters('train', make_meters())
        exp_logger.add_meters('test', make_meters())
        if options['vqa']['trainsplit'] == 'train':
            exp_logger.add_meters('val', make_meters())
        exp_logger.info['model_params'] = utils.params_count(model)
        print('Model has {} parameters'.format(
            exp_logger.info['model_params']))

    #########################################################################
    # args.evaluate: on valset OR/AND on testset
    #########################################################################
    if args.evaluate:
        path_logger_json = os.path.join(options['logs']['dir_logs'],
                                        'logger.json')
        if options['vqa']['trainsplit'] == 'train':
            acc1, val_results = engine.validate(val_loader, model, criterion,
                                                exp_logger, args.start_epoch,
                                                args.print_freq)
            # save results and compute OpenEnd accuracy
            exp_logger.to_json(path_logger_json)
            save_results(val_results, args.start_epoch, valset.split_name(),
                         options['logs']['dir_logs'], options['vqa']['dir'])
        test_results, testdev_results = engine.test(test_loader, model,
                                                    exp_logger,
                                                    args.start_epoch,
                                                    args.print_freq)
        # save results and DOES NOT compute OpenEnd accuracy
        exp_logger.to_json(path_logger_json)
        save_results(test_results, args.start_epoch, testset.split_name(),
                     options['logs']['dir_logs'], options['vqa']['dir'])
        save_results(testdev_results, args.start_epoch,
                     testset.split_name(testdev=True),
                     options['logs']['dir_logs'], options['vqa']['dir'])
        return

    #########################################################################
    # Begin training on train/val or trainval/test
    #########################################################################
    # NOTE(review): hard-coded Comet API key — should come from config/env.
    experiment = Experiment(api_key="AgTGwIoRULRgnfVR5M8mZ5AfS",
                            project_name="vqa",
                            workspace="vuhoangminh")
    experiment.log_parameters(flatten(options))
    with experiment.train():
        for epoch in range(args.start_epoch + 1, options['optim']['epochs']):
            engine.train(train_loader, model, criterion, optimizer,
                         exp_logger, epoch, experiment, args.print_freq)
            if options['vqa']['trainsplit'] == 'train':
                # evaluate on validation set
                with experiment.validate():
                    acc1, val_results = engine.validate(
                        val_loader, model, criterion, exp_logger, epoch,
                        args.print_freq)
                    # this will be logged as validation accuracy based on
                    # the context.
                    experiment.log_metric("acc1", acc1)
                # remember best prec@1 and save checkpoint
                is_best = acc1 > best_acc1
                best_acc1 = max(acc1, best_acc1)
                save_checkpoint(
                    {
                        'epoch': epoch,
                        'arch': options['model']['arch'],
                        'best_acc1': best_acc1,
                        'exp_logger': exp_logger
                    }, model.module.state_dict(), optimizer.state_dict(),
                    options['logs']['dir_logs'], args.save_model,
                    args.save_all_from, is_best)
                # save results and compute OpenEnd accuracy
                save_results(val_results, epoch, valset.split_name(),
                             options['logs']['dir_logs'],
                             options['vqa']['dir'])
            else:
                test_results, testdev_results = engine.test(
                    test_loader, model, exp_logger, epoch, args.print_freq,
                    topk=3,
                    dict=io_utils.read_pickle(question_features_path),
                    bert_dim=options["model"]["dim_q"])
                # save checkpoint at every timestep
                save_checkpoint(
                    {
                        'epoch': epoch,
                        'arch': options['model']['arch'],
                        'best_acc1': best_acc1,
                        'exp_logger': exp_logger
                    }, model.module.state_dict(), optimizer.state_dict(),
                    options['logs']['dir_logs'], args.save_model,
                    args.save_all_from)
                # save results and DOES NOT compute OpenEnd accuracy
                save_results(test_results, epoch, testset.split_name(),
                             options['logs']['dir_logs'],
                             options['vqa']['dir'])
                save_results(testdev_results, epoch,
                             testset.split_name(testdev=True),
                             options['logs']['dir_logs'],
                             options['vqa']['dir'])
def train_and_evaluate(self, train_gen, val_gen, epochs):
    """Train the CNN on train_gen, then evaluate it on val_gen.

    Logs the run to Comet (train() context for fitting, test() context for
    validation), checkpoints per epoch and saves the final model under
    self.directory, then logs normalized and raw confusion matrices.

    Returns:
        the trained keras model.
    """
    # NOTE(review): hard-coded Comet API key in source — should come from
    # an environment variable or config.
    experiment = Experiment(
        api_key="VNQSdbR1pw33EkuHbUsGUSZWr",
        project_name="piratesofthecaribbean",
        workspace="florpi",
    )
    model = self.build()
    with experiment.train():
        # Checkpoint filename encodes the epoch and validation loss.
        model_path = os.path.join(self.directory,
                                  "cnn_{epoch:02d}-{val_loss:.2f}.hdf5")
        callbacks = [
            ModelCheckpoint(model_path, monitor="val_loss", mode="min"),
        ]
        model.fit(
            train_gen,
            epochs=epochs,
            validation_data=val_gen,
            callbacks=callbacks,
            class_weight=CLASS_WEIGHTS,
        )
        model.save(os.path.join(self.directory, "cnn_final.h5"))
    # Run validation
    with experiment.test():
        probabilities = []
        y_val_all = []
        # reset generator
        val_gen.reset()
        for idx, (X_val, y_val) in tqdm(enumerate(val_gen),
                                        desc="valset",
                                        total=val_gen._num_examples):
            y_val_all += y_val.tolist()
            probs = model.predict(X_val)
            probabilities += probs.tolist()
            # NOTE(review): guards against generators that cycle forever,
            # but compares a BATCH index to an EXAMPLE count — confirm
            # whether _num_examples is in batches here.
            if idx > val_gen._num_examples:
                break
        # Collapse one-hot/probability rows into class indices.
        y_true = np.argmax(y_val_all, axis=-1)
        y_pred = np.argmax(probabilities, axis=-1)
        visualize.plot_confusion_matrix(y_true, y_pred, classes=LABELS,
                                        normalize=True,
                                        experiment=experiment)
        visualize.plot_confusion_matrix(y_true, y_pred, classes=LABELS,
                                        normalize=False,
                                        experiment=experiment)
        experiment.log_confusion_matrix(y_true=y_true, y_predicted=y_pred,
                                        labels=LABELS)
    return model
class Experiment:
    """
    A helper class to facilitate the training and validation procedure of the GoTurnRemix model

    Parameters
    ----------
    learning_rate: float
        Learning rate to train the model. The optimizer is SGD and the loss is L1 Loss
    image_size: int
        The size of the input image. This has to be fixed before the data is created
    data_path: Path
        Path to the data folder. If the folder name includes "pickle", then the data saved as pickles are loaded
    augment: bool
        Perform augmentation on the images before training
    logs_path: Path
        Path to save the validation predictions at the end of each epoch
    models_path: Path
        Path to save the model state at the end of each epoch
    save_name: str
        Name of the folder in which the logs and models are saved. If not provided, the current datetime is used
    comet_api: str
        Optional Comet ML API key; when given, parameters and metrics are logged to Comet
    """

    def __init__(self,
                 learning_rate: float,
                 image_size: int,
                 data_path: Path,
                 augment: bool = True,
                 logs_path: Path = None,
                 models_path: Path = None,
                 save_name: str = None,
                 comet_api: str = None):
        self.image_size = image_size
        self.logs_path = logs_path
        self.models_path = models_path
        self.model = GoTurnRemix()
        self.model.cuda()
        self.criterion = torch.nn.L1Loss()
        # Only parameters with requires_grad=True are optimized.
        self.optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad,
                                                self.model.parameters()),
                                         lr=learning_rate)
        # Default run name: current datetime with filesystem-safe separators.
        self.model_name = str(datetime.datetime.now()).split('.')[0].replace(
            ':', '-').replace(' ', '-')
        self.model_name = save_name if save_name else self.model_name
        self.augment = augment
        self.data = Data(data_path,
                         target_size=self.image_size,
                         transforms=augment)
        # Optional Comet logging, enabled only when an API key is given.
        self.comet = None
        if comet_api:
            self.comet = Comet(api_key=comet_api)
            self.comet.log_parameter('learning_rate', learning_rate)
            self.comet.log_parameter('image_size', image_size)
            self.comet.log_parameter('augment', augment)

    def __train_step__(self, data):
        """
        Performs one step of the training procedure

        Parameters
        ----------
        data
            data obtained from @Data.__getitem__

        Returns
        -------
        Loss at the end of training step
        """
        # NOTE(review): comet.train() returns a context manager; calling it
        # bare (no `with`) relies on it also switching the logging context —
        # confirm against the pinned comet_ml version.
        if self.comet:
            self.comet.train()
        previous_cropped, current_cropped, bbox, scale, crop = data
        previous_cropped = torch.div(previous_cropped, 255).float().cuda()
        current_cropped = torch.div(current_cropped, 255).float().cuda()
        previous_cropped = torch.autograd.Variable(previous_cropped,
                                                   requires_grad=True)
        current_cropped = torch.autograd.Variable(current_cropped,
                                                  requires_grad=True)
        bbox = bbox.requires_grad_(True).float().cuda()
        self.optimizer.zero_grad()
        preds = self.model(previous_cropped, current_cropped)
        # Free the large input tensors before backprop to ease GPU pressure.
        del previous_cropped
        del current_cropped
        gc.collect()
        loss = self.criterion(preds, bbox)
        if self.comet:
            self.comet.log_metric('loss', loss)
        loss.backward()
        self.optimizer.step()
        return loss

    def __test__(self):
        """
        Test tracking of the model

        Returns
        -------
        Test loss and test predictions
        """
        # Set model to evaluation mode
        if self.comet:
            self.comet.test()
        self.model.eval()
        test_preds = []
        test_loss = []
        # Track through the last video in the dataset.
        video_frames = self.data.video_frames[-1]
        video_annotations = self.data.video_annotations[-1]
        p_a = video_annotations[0]
        p_f = video_frames[0]
        test_preds.append(p_a)
        for i in tqdm(range(1, len(video_annotations)), desc='Validating'):
            c_a = video_annotations[i]
            c_f = video_frames[i]
            p_c, c_c, bbox, scale, crop = self.data.make_crops(
                p_f, c_f, p_a, c_a)
            p_c = torch.div(torch.from_numpy(p_c),
                            255).unsqueeze(0).float().cuda()
            c_c = torch.div(torch.from_numpy(c_c),
                            255).unsqueeze(0).float().cuda()
            bbox = torch.tensor(bbox, requires_grad=False).float().cuda()
            preds = self.model(p_c, c_c)
            del p_c
            del c_c
            gc.collect()
            loss = torch.nn.functional.l1_loss(preds, bbox)
            if self.comet:
                self.comet.log_metric('val_loss', loss)
            test_loss.append(loss.item())
            # Map the network output back to image coordinates.
            preds = self.data.get_bbox(preds.cpu().detach().numpy()[0],
                                       self.image_size, scale, crop)
            test_preds.append(preds)
            # True tracking: the next prior annotation is the model's own
            # prediction (unlike __validate__, which uses ground truth).
            p_a = preds
            p_f = c_f
        return test_loss, test_preds

    def __validate__(self):
        """
        Performs validation on the model

        Returns
        -------
        Validation loss and validation predictions
        """
        # Set model to evaluation mode
        if self.comet:
            self.comet.validate()
        self.model.eval()
        validation_preds = []
        validation_loss = []
        video_frames = self.data.video_frames[-1]
        video_annotations = self.data.video_annotations[-1]
        p_a = video_annotations[0]
        p_f = video_frames[0]
        validation_preds.append(p_a)
        for i in tqdm(range(1, len(video_annotations)), desc='Validating'):
            c_a = video_annotations[i]
            c_f = video_frames[i]
            p_c, c_c, bbox, scale, crop = self.data.make_crops(
                p_f, c_f, p_a, c_a)
            p_c = torch.div(torch.from_numpy(p_c),
                            255).unsqueeze(0).float().cuda()
            c_c = torch.div(torch.from_numpy(c_c),
                            255).unsqueeze(0).float().cuda()
            bbox = torch.tensor(bbox, requires_grad=False).float().cuda()
            preds = self.model(p_c, c_c)
            del p_c
            del c_c
            gc.collect()
            loss = torch.nn.functional.l1_loss(preds, bbox)
            if self.comet:
                self.comet.log_metric('val_loss', loss)
            validation_loss.append(loss.item())
            preds = self.data.get_bbox(preds.cpu().detach().numpy()[0],
                                       self.image_size, scale, crop)
            validation_preds.append(preds)
            # Validation uses the ground-truth annotation as the next prior.
            p_a = c_a
            p_f = c_f
        return validation_loss, validation_preds

    def train(self,
              epochs: int,
              batch_size: int,
              validate: bool = True,
              test: bool = True):
        """
        Trains the model for @epochs number of epochs

        Parameters
        ----------
        epochs: int
            Number of epochs to train the model
        batch_size: int
            The size of each batch when training the model
        validate: bool, default=True
            If True, validation occurs at the end of each epoch
            The results are saved in @logs_path and models are saved in @models_path
        test: bool, default=True
            If True, the model is tested for tracking at the end of the training procedure
            The results are saved in @logs_path

        Returns
        -------
        list: List containing the training loss at the end of each epoch
        """
        if self.comet:
            self.comet.log_parameter('epochs', epochs)
            self.comet.log_parameter('batch_size', batch_size)
        loss_per_epoch = []
        preds_per_epoch = []
        # Set the model to training mode
        self.model.train()
        # Create a DataLoader to feed data to the model
        dataloader = torch.utils.data.DataLoader(dataset=self.data,
                                                 batch_size=batch_size,
                                                 shuffle=True)
        # Run for @epochs number of epochs
        for epoch in range(epochs):
            if self.comet:
                self.comet.log_metric('epoch', epoch)
            running_loss = []
            for step, data in enumerate(
                    tqdm(dataloader,
                         total=int(len(self.data) / batch_size),
                         desc='Epoch {}'.format(epoch))):
                loss = self.__train_step__(data)
                running_loss.append(loss.item())
            training_loss = sum(running_loss) / len(running_loss)
            if self.comet:
                self.comet.log_metric('mean_train_loss', training_loss)
            loss_per_epoch.append(sum(running_loss) / len(running_loss))
            if validate:
                validation_loss, validation_preds = self.__validate__()
                if self.comet:
                    # NOTE(review): validation_loss is a list here, not a
                    # scalar mean — confirm Comet accepts it as intended.
                    self.comet.log_metric('mean_validation_loss',
                                          validation_loss)
                preds_per_epoch.append(validation_preds)
                print('Validation loss: {}'.format(
                    sum(validation_loss) / len(validation_loss)))
            # Save the model at this stage
            if self.models_path:
                (self.models_path / self.model_name).mkdir(exist_ok=True)
                torch.save(self.model,
                           (self.models_path / self.model_name /
                            'epoch_{}'.format(epoch)).resolve())
            print('Training Loss: {}'.format(training_loss))
            # Save the validation frames, ground truths and predictions at this stage
            if self.logs_path:
                (self.logs_path / self.model_name).mkdir(exist_ok=True)
                save = {
                    'frames': self.data.video_frames[-1],
                    'truth': self.data.video_annotations[-1],
                    'preds': preds_per_epoch
                }
                np.save(
                    str((self.logs_path / self.model_name /
                         'preds_per_epoch.npy').resolve()), save)
        # Test the model and save the results
        if test:
            test_loss, test_preds = self.__test__()
            if self.logs_path:
                (self.logs_path / self.model_name).mkdir(exist_ok=True)
                save = {
                    'frames': self.data.video_frames[-1],
                    'truth': self.data.video_annotations[-1],
                    'preds': test_preds,
                    'loss': test_loss
                }
                np.save(
                    str((self.logs_path / self.model_name /
                         'test_preds.npy').resolve()), save)
        return loss_per_epoch
def run(args, train, sparse_evidences, claims_dict):
    """Train and validate the CDSSM claim-verification model.

    Splits ``train`` 80/20 into train/validation, trains for
    ``args.epochs`` epochs with NLL loss, logs to both Comet and a local
    TensorBoard-style ``Logger``, and checkpoints whenever the mean
    validation loss improves.

    Parameters
    ----------
    args : argparse.Namespace
        Run options (batch size, learning rate, sampling rate, epochs,
        optional pretrained model path, ...).
    train : list
        Full training set; split 80/20 below.
    sparse_evidences, claims_dict
        Pre-processed evidence/claim lookups handed to the dataset.
    """
    BATCH_SIZE = args.batch_size
    LEARNING_RATE = args.learning_rate
    DATA_SAMPLING = args.data_sampling
    NUM_EPOCHS = args.epochs
    MODEL = args.model  # optional path to a pretrained checkpoint
    RANDOMIZE = args.no_randomize
    PRINT = args.print

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")

    # NOTE(review): time.localtime() is a struct_time — the whole struct is
    # formatted into the log-directory name; confirm that is intended.
    logger = Logger('./logs/{}'.format(time.localtime()))

    if MODEL:
        print("Loading pretrained model...")
        model = torch.load(MODEL)
        model.load_state_dict(torch.load(MODEL).state_dict())
    else:
        model = cdssm.CDSSM()
        model = model.cuda()
        model = model.to(device)

    if torch.cuda.device_count() > 0:
        print("Let's use", torch.cuda.device_count(), "GPU(s)!")
        model = nn.DataParallel(model)
    print("Created model with {:,} parameters.".format(
        putils.count_parameters(model)))

    print("Created dataset...")
    # use an 80/20 train/validate split!
    train_size = int(len(train) * 0.80)
    train_dataset = pytorch_data_loader.WikiDataset(
        train[:train_size],
        claims_dict,
        data_sampling=DATA_SAMPLING,
        sparse_evidences=sparse_evidences,
        randomize=RANDOMIZE)
    val_dataset = pytorch_data_loader.WikiDataset(
        train[train_size:],
        claims_dict,
        data_sampling=DATA_SAMPLING,
        sparse_evidences=sparse_evidences,
        randomize=RANDOMIZE)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=BATCH_SIZE,
                                  num_workers=0,
                                  shuffle=True,
                                  collate_fn=pytorch_data_loader.PadCollate())
    val_dataloader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                num_workers=0,
                                shuffle=True,
                                collate_fn=pytorch_data_loader.PadCollate())

    # Loss and optimizer
    criterion = torch.nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=LEARNING_RATE,
                                 weight_decay=1e-3)

    # Print/log roughly every 2% of an epoch, but at least every 20 batches.
    OUTPUT_FREQ = max(int((len(train_dataset) / BATCH_SIZE) * 0.02), 20)
    parameters = {
        "batch size": BATCH_SIZE,
        "epochs": NUM_EPOCHS,
        "learning rate": LEARNING_RATE,
        "optimizer": optimizer.__class__.__name__,
        "loss": criterion.__class__.__name__,
        "training size": train_size,
        "data sampling rate": DATA_SAMPLING,
        "data": args.data,
        "sparse_evidences": args.sparse_evidences,
        "randomize": RANDOMIZE,
        "model": MODEL
    }
    # NOTE(review): hard-coded Comet API key checked into source — move to an
    # environment variable / config file.
    experiment = Experiment(api_key="YLsW4AvRTYGxzdDqlWRGCOhee",
                            project_name="clsm",
                            workspace="moinnadeem")
    experiment.add_tag("train")
    experiment.log_asset("cdssm.py")
    experiment.log_dataset_info(name=args.data)
    experiment.log_parameters(parameters)

    # Encode the hyper-parameters into the checkpoint file name.
    model_checkpoint_dir = "models/saved_model"
    for key, value in parameters.items():
        if type(value) == str:
            value = value.replace("/", "-")
        if key != "model":
            model_checkpoint_dir += "_{}-{}".format(key.replace(" ", "_"),
                                                    value)

    print("Training...")
    beginning_time = time.time()
    best_loss = torch.tensor(float("inf"),
                             dtype=torch.float)  # begin loss at infinity
    for epoch in range(NUM_EPOCHS):
        beginning_time = time.time()
        mean_train_acc = 0.0
        train_running_loss = 0.0
        train_running_accuracy = 0.0
        model.train()
        experiment.log_current_epoch(epoch)
        with experiment.train():
            for train_batch_num, inputs in enumerate(train_dataloader):
                claims_tensors, claims_text, evidences_tensors, evidences_text, labels = inputs
                claims_tensors = claims_tensors.cuda()
                evidences_tensors = evidences_tensors.cuda()
                labels = labels.cuda()

                y_pred = model(claims_tensors, evidences_tensors)
                y = (labels)
                y_pred = y_pred.squeeze()
                # NLLLoss expects class indices — convert one-hot labels.
                loss = criterion(y_pred, torch.max(y, 1)[1])

                y = y.float()
                binary_y = torch.max(y, 1)[1]
                binary_pred = torch.max(y_pred, 1)[1]
                accuracy = (binary_y == binary_pred).to("cuda")
                accuracy = accuracy.float()
                accuracy = accuracy.mean()
                train_running_accuracy += accuracy.item()
                mean_train_acc += accuracy.item()
                train_running_loss += loss.item()

                if PRINT:
                    # Verbose per-sample dump (model outputs log-probs, hence
                    # torch.exp).
                    for idx in range(len(y)):
                        print(
                            "Claim: {}, Evidence: {}, Prediction: {}, Label: {}"
                            .format(claims_text[0], evidences_text[idx],
                                    torch.exp(y_pred[idx]), y[idx]))

                if (train_batch_num %
                        OUTPUT_FREQ) == 0 and train_batch_num > 0:
                    elapsed_time = time.time() - beginning_time
                    binary_y = torch.max(y, 1)[1]
                    binary_pred = torch.max(y_pred, 1)[1]
                    print(
                        "[{}:{}:{:3f}s] training loss: {}, training accuracy: {}, training recall: {}"
                        .format(
                            epoch,
                            train_batch_num / (len(train_dataset) / BATCH_SIZE),
                            elapsed_time, train_running_loss / OUTPUT_FREQ,
                            train_running_accuracy / OUTPUT_FREQ,
                            recall_score(
                                binary_y.cpu().detach().numpy(),
                                binary_pred.cpu().detach().numpy())))

                    # 1. Log scalar values (scalar summary)
                    info = {
                        'train_loss': train_running_loss / OUTPUT_FREQ,
                        'train_accuracy': train_running_accuracy / OUTPUT_FREQ
                    }
                    for tag, value in info.items():
                        experiment.log_metric(tag,
                                              value,
                                              step=train_batch_num *
                                              (epoch + 1))
                        logger.scalar_summary(tag, value, train_batch_num + 1)

                    ## 2. Log values and gradients of the parameters (histogram summary)
                    for tag, value in model.named_parameters():
                        tag = tag.replace('.', '/')
                        logger.histo_summary(tag,
                                             value.detach().cpu().numpy(),
                                             train_batch_num + 1)
                        logger.histo_summary(tag + '/grad',
                                             value.grad.detach().cpu().numpy(),
                                             train_batch_num + 1)

                    # Reset the running windows for the next OUTPUT_FREQ span.
                    train_running_loss = 0.0
                    beginning_time = time.time()
                    train_running_accuracy = 0.0

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        print("Running validation...")
        model.eval()
        pred = []
        true = []
        avg_loss = 0.0
        val_running_accuracy = 0.0
        val_running_loss = 0.0
        beginning_time = time.time()
        with experiment.validate():
            for val_batch_num, val_inputs in enumerate(val_dataloader):
                claims_tensors, claims_text, evidences_tensors, evidences_text, labels = val_inputs
                claims_tensors = claims_tensors.cuda()
                evidences_tensors = evidences_tensors.cuda()
                labels = labels.cuda()

                y_pred = model(claims_tensors, evidences_tensors)
                y = (labels)
                y_pred = y_pred.squeeze()
                loss = criterion(y_pred, torch.max(y, 1)[1])

                y = y.float()
                binary_y = torch.max(y, 1)[1]
                binary_pred = torch.max(y_pred, 1)[1]
                true.extend(binary_y.tolist())
                pred.extend(binary_pred.tolist())
                accuracy = (binary_y == binary_pred).to("cuda")
                accuracy = accuracy.float().mean()
                val_running_accuracy += accuracy.item()
                val_running_loss += loss.item()
                avg_loss += loss.item()

                if (val_batch_num % OUTPUT_FREQ) == 0 and val_batch_num > 0:
                    elapsed_time = time.time() - beginning_time
                    print(
                        "[{}:{}:{:3f}s] validation loss: {}, accuracy: {}, recall: {}"
                        .format(
                            epoch,
                            val_batch_num / (len(val_dataset) / BATCH_SIZE),
                            elapsed_time, val_running_loss / OUTPUT_FREQ,
                            val_running_accuracy / OUTPUT_FREQ,
                            recall_score(
                                binary_y.cpu().detach().numpy(),
                                binary_pred.cpu().detach().numpy())))

                    # 1. Log scalar values (scalar summary)
                    info = {
                        'val_accuracy': val_running_accuracy / OUTPUT_FREQ
                    }
                    for tag, value in info.items():
                        experiment.log_metric(tag,
                                              value,
                                              step=val_batch_num * (epoch + 1))
                        logger.scalar_summary(tag, value, val_batch_num + 1)

                    ## 2. Log values and gradients of the parameters (histogram summary)
                    for tag, value in model.named_parameters():
                        tag = tag.replace('.', '/')
                        logger.histo_summary(tag,
                                             value.detach().cpu().numpy(),
                                             val_batch_num + 1)
                        logger.histo_summary(tag + '/grad',
                                             value.grad.detach().cpu().numpy(),
                                             val_batch_num + 1)
                    val_running_accuracy = 0.0
                    val_running_loss = 0.0
                    beginning_time = time.time()

        accuracy = accuracy_score(true, pred)
        print("[{}] mean accuracy: {}, mean loss: {}".format(
            epoch, accuracy, avg_loss / len(val_dataloader)))
        true = np.array(true).astype("int")
        pred = np.array(pred).astype("int")
        print(classification_report(true, pred))

        # Track the best (lowest) mean validation loss and checkpoint.
        best_loss = torch.tensor(
            min(avg_loss / len(val_dataloader),
                best_loss.cpu().numpy()))
        is_best = bool((avg_loss / len(val_dataloader)) <= best_loss)
        putils.save_checkpoint(
            {
                "epoch": epoch,
                "model": model,
                "best_loss": best_loss
            },
            is_best,
            filename="{}_loss_{}".format(model_checkpoint_dir,
                                         best_loss.cpu().numpy()))
def main(args):
    """Train a TransD knowledge-graph embedding with fairness discriminators.

    Builds train/valid/test splits, optional per-attribute discriminators
    and filters, jointly trains them, periodically evaluates ranking metrics
    (MR / MRR / Hits@k) and fairness, and optionally re-tests with freshly
    trained discriminators.

    Parameters
    ----------
    args : argparse.Namespace
        Full experiment configuration (dataset, paths, model sizes,
        fairness switches, logging flags, ...).
    """
    if args.dataset in ('FB15k-237', 'kinship', 'nations', 'umls', 'WN18RR',
                        'YAGO3-10'):
        S = joblib.load(args.data_path)
        train_set = FBDataset(S['train_data'], args.prefetch_to_gpu)
        # FIX: `attr_data` was referenced here before assignment (it is only
        # built further down, and only when an attribute flag is set), which
        # raised a NameError on this path. Mirror the else-branch and build
        # the eval splits without it.
        valid_set = FBDataset(S['val_data'])
        test_set = FBDataset(S['test_data'])
    else:
        train_set = FBDataset(args.data_path % 'train', args.prefetch_to_gpu)
        valid_set = FBDataset(args.data_path % 'valid')
        test_set = FBDataset(args.data_path % 'test')
    print('50 Most Commone Attributes')

    # Hash every training triple so train/eval overlap can be filtered.
    if args.prefetch_to_gpu:
        train_hash = set(
            [r.tobytes() for r in train_set.dataset.cpu().numpy()])
    else:
        train_hash = set([r.tobytes() for r in train_set.dataset])

    all_hash = train_hash.copy()
    all_hash.update(set([r.tobytes() for r in valid_set.dataset]))
    all_hash.update(set([r.tobytes() for r in test_set.dataset]))

    logdir = args.outname_base + '_logs' + '/'
    if args.remove_old_run:
        shutil.rmtree(logdir)
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    tflogger = tfLogger(logdir)

    ''' Comet Logging '''
    # NOTE(review): hard-coded API key in source — prefer an env var.
    experiment = Experiment(api_key="Ht9lkWvTm58fRo9ccgpabq5zV",
                            disabled=not args.do_log,
                            project_name="graph-invariance-icml",
                            workspace="joeybose")
    experiment.set_name(args.namestr)

    modelD = TransD(args.num_ent, args.num_rel, args.embed_dim, args.p)
    fairD_0, fairD_1, fairD_2 = None, None, None
    optimizer_fairD_0, optimizer_fairD_1, optimizer_fairD_2 = None, None, None
    filter_0, filter_1, filter_2 = None, None, None

    if args.debug:
        ipdb.set_trace()
    if args.load_transD:
        modelD.load(args.saved_path)
    if args.use_cuda:
        modelD.cuda()

    if args.use_attr:
        ''' Hard Coded to the most common attribute for now '''
        attr_data = [args.attr_mat, args.ent_to_idx, args.attr_to_idx,
                     args.reindex_attr_idx, args.attr_count]
        fairD_0 = FBDemParDisc(args.embed_dim, args.fair_att_0, '0',
                               attr_data, args.use_cross_entropy)
        fairD_1 = FBDemParDisc(args.embed_dim, args.fair_att_1, '1',
                               attr_data, args.use_cross_entropy)
        fairD_2 = FBDemParDisc(args.embed_dim, args.fair_att_2, '2',
                               attr_data, args.use_cross_entropy)
        # Print the attributes being discriminated (plain loop instead of a
        # list comprehension used for its side effect).
        for k in fairD_0.reindex_to_idx.keys():
            print(fairD_0.inv_attr_map[int(k)])

        ''' Initialize Optimizers '''
        if args.sample_mask:
            filter_0 = AttributeFilter(args.embed_dim, attribute='0')
            filter_1 = AttributeFilter(args.embed_dim, attribute='1')
            filter_2 = AttributeFilter(args.embed_dim, attribute='2')
            filter_0.cuda()
            filter_1.cuda()
            filter_2.cuda()
            optimizer_fairD_0 = optimizer(fairD_0.parameters(), 'adam',
                                          args.lr)
            optimizer_fairD_1 = optimizer(fairD_1.parameters(), 'adam',
                                          args.lr)
            optimizer_fairD_2 = optimizer(fairD_2.parameters(), 'adam',
                                          args.lr)
        elif args.use_trained_filters and not args.sample_mask:
            # Filters are loaded, not trained — no discriminator optimizers.
            filter_0 = AttributeFilter(args.embed_dim, attribute='0')
            filter_1 = AttributeFilter(args.embed_dim, attribute='1')
            filter_2 = AttributeFilter(args.embed_dim, attribute='2')
            filter_0.cuda()
            filter_1.cuda()
            filter_2.cuda()
        else:
            optimizer_fairD_0 = optimizer(fairD_0.parameters(), 'adam',
                                          args.lr)
            optimizer_fairD_1 = optimizer(fairD_1.parameters(), 'adam',
                                          args.lr)
            optimizer_fairD_2 = optimizer(fairD_2.parameters(), 'adam',
                                          args.lr)
            filter_0, filter_1, filter_2 = None, None, None
        if args.use_cuda:
            fairD_0.cuda()
            fairD_1.cuda()
            fairD_2.cuda()
    elif args.use_1_attr:
        attr_data = [args.attr_mat, args.ent_to_idx, args.attr_to_idx,
                     args.reindex_attr_idx, args.attr_count]
        fairD_1 = FBDemParDisc(args.embed_dim, args.fair_att_1, '1',
                               attr_data,
                               use_cross_entropy=args.use_cross_entropy)
        fairD_1.cuda()
        optimizer_fairD_1 = optimizer(fairD_1.parameters(), 'adam', args.lr)
    elif args.use_0_attr:
        attr_data = [args.attr_mat, args.ent_to_idx, args.attr_to_idx,
                     args.reindex_attr_idx, args.attr_count]
        fairD_0 = FBDemParDisc(args.embed_dim, args.fair_att_0, '0',
                               attr_data,
                               use_cross_entropy=args.use_cross_entropy)
        optimizer_fairD_0 = optimizer(fairD_0.parameters(), 'adam', args.lr)
        fairD_0.cuda()
    elif args.use_2_attr:
        attr_data = [args.attr_mat, args.ent_to_idx, args.attr_to_idx,
                     args.reindex_attr_idx, args.attr_count]
        fairD_2 = FBDemParDisc(args.embed_dim, args.fair_att_2, '2',
                               attr_data,
                               use_cross_entropy=args.use_cross_entropy)
        optimizer_fairD_2 = optimizer(fairD_2.parameters(), 'adam', args.lr)
        fairD_2.cuda()

    if args.load_filters:
        filter_0.load(args.filter_0_saved_path)
        filter_1.load(args.filter_1_saved_path)
        filter_2.load(args.filter_2_saved_path)

    ''' Create Sets '''
    fairD_set = [fairD_0, fairD_1, fairD_2]
    filter_set = [filter_0, filter_1, filter_2]
    optimizer_fairD_set = [optimizer_fairD_0, optimizer_fairD_1,
                           optimizer_fairD_2]
    D_monitor = OrderedDict()
    test_val_monitor = OrderedDict()

    if args.sample_mask and not args.use_trained_filters:
        # Jointly optimize the embedding model and all three filters.
        optimizerD = optimizer(list(modelD.parameters()) +
                               list(filter_0.parameters()) +
                               list(filter_1.parameters()) +
                               list(filter_2.parameters()), 'adam', args.lr)
    else:
        optimizerD = optimizer(modelD.parameters(), 'adam_hyp3', args.lr)
    schedulerD = lr_scheduler(optimizerD, args.decay_lr, args.num_epochs)
    loss_func = MarginRankingLoss(args.margin, 1)

    # Constant index tensors reused by the sampling / ranking code.
    _cst_inds = torch.LongTensor(
        np.arange(args.num_ent,
                  dtype=np.int64)[:, None]).cuda().repeat(
                      1, args.batch_size // 2)
    _cst_s = torch.LongTensor(np.arange(args.batch_size // 2)).cuda()
    _cst_s_nb = torch.LongTensor(
        np.arange(args.batch_size // 2, args.batch_size)).cuda()
    _cst_nb = torch.LongTensor(np.arange(args.batch_size)).cuda()

    if args.prefetch_to_gpu:
        # Data already lives on the GPU — no workers / pinning.
        train_loader = DataLoader(train_set, batch_size=args.batch_size,
                                  shuffle=True, drop_last=True,
                                  num_workers=0, collate_fn=collate_fn)
    else:
        train_loader = DataLoader(train_set, batch_size=args.batch_size,
                                  shuffle=True, drop_last=True,
                                  num_workers=4, pin_memory=True,
                                  collate_fn=collate_fn)

    if args.freeze_transD:
        freeze_model(modelD)

    ''' Joint Training '''
    if not args.dont_train:
        with experiment.train():
            for epoch in tqdm(range(1, args.num_epochs + 1)):
                train(train_loader, epoch, args, train_hash, modelD,
                      optimizerD, tflogger, fairD_set, optimizer_fairD_set,
                      filter_set, experiment)
                gc.collect()
                if args.decay_lr:
                    if args.decay_lr == 'ReduceLROnPlateau':
                        # TODO(review): `monitor` is undefined here, so this
                        # path raises NameError. ReduceLROnPlateau needs the
                        # epoch's average D loss — have train() return it and
                        # record it (e.g. in D_monitor) before stepping.
                        schedulerD.step(monitor['D_loss_epoch_avg'])
                    else:
                        schedulerD.step()

                if epoch % args.valid_freq == 0:
                    with torch.no_grad():
                        l_ranks, r_ranks = test(test_set, args, all_hash,
                                                modelD, tflogger, filter_set,
                                                experiment, subsample=20)
                        l_mean = l_ranks.mean()
                        r_mean = r_ranks.mean()
                        l_mrr = (1. / l_ranks).mean()
                        r_mrr = (1. / r_ranks).mean()
                        l_h10 = (l_ranks <= 10).mean()
                        r_h10 = (r_ranks <= 10).mean()
                        l_h5 = (l_ranks <= 5).mean()
                        r_h5 = (r_ranks <= 5).mean()
                        avg_mr = (l_mean + r_mean) / 2
                        avg_mrr = (l_mrr + r_mrr) / 2
                        avg_h10 = (l_h10 + r_h10) / 2
                        avg_h5 = (l_h5 + r_h5) / 2

                    if args.use_attr:
                        test_fairness(test_set, args, modelD, tflogger,
                                      fairD_0, attribute='0', epoch=epoch,
                                      experiment=experiment, filter_=filter_0)
                        test_fairness(test_set, args, modelD, tflogger,
                                      fairD_1, attribute='1', epoch=epoch,
                                      experiment=experiment, filter_=filter_1)
                        test_fairness(test_set, args, modelD, tflogger,
                                      fairD_2, attribute='2', epoch=epoch,
                                      experiment=experiment, filter_=filter_2)
                    elif args.use_0_attr:
                        test_fairness(test_set, args, modelD, tflogger,
                                      fairD_0, attribute='0', epoch=epoch,
                                      experiment=experiment, filter_=filter_0)
                    elif args.use_1_attr:
                        test_fairness(test_set, args, modelD, tflogger,
                                      fairD_1, attribute='1', epoch=epoch,
                                      experiment=experiment, filter_=filter_1)
                    elif args.use_2_attr:
                        test_fairness(test_set, args, modelD, tflogger,
                                      fairD_2, attribute='2', epoch=epoch,
                                      experiment=experiment, filter_=filter_2)

                    joblib.dump({'l_ranks': l_ranks, 'r_ranks': r_ranks},
                                args.outname_base +
                                'epoch{}_validation_ranks.pkl'.format(epoch),
                                compress=9)
                    print("Mean Rank is %f" % (float(avg_mr)))
                    if args.do_log:
                        # Tensorboard logging
                        tflogger.scalar_summary('Mean Rank', float(avg_mr),
                                                epoch)
                        tflogger.scalar_summary('Mean Reciprocal Rank',
                                                float(avg_mrr), epoch)
                        tflogger.scalar_summary('Hit @10', float(avg_h10),
                                                epoch)
                        tflogger.scalar_summary('Hit @5', float(avg_h5),
                                                epoch)
                        # FIX: `counter` was undefined (NameError) — log the
                        # metric against the current epoch instead.
                        experiment.log_metric("Mean Rank", float(avg_mr),
                                              step=epoch)
                    modelD.save(args.outname_base +
                                'D_epoch{}.pts'.format(epoch))

                if epoch % (args.valid_freq * 5) == 0:
                    l_ranks, r_ranks = test(test_set, args, all_hash, modelD,
                                            tflogger, filter_set, experiment,
                                            subsample=20)
                    l_mean = l_ranks.mean()
                    r_mean = r_ranks.mean()
                    l_mrr = (1. / l_ranks).mean()
                    r_mrr = (1. / r_ranks).mean()
                    l_h10 = (l_ranks <= 10).mean()
                    r_h10 = (r_ranks <= 10).mean()
                    l_h5 = (l_ranks <= 5).mean()
                    r_h5 = (r_ranks <= 5).mean()

    if args.sample_mask:
        filter_0.save(args.outname_base + 'Filter_0.pts')
        filter_1.save(args.outname_base + 'Filter_1.pts')
        filter_2.save(args.outname_base + 'Filter_2.pts')

    if args.test_new_disc:
        ''' Testing with fresh discriminators '''
        args.use_attr = True
        args.use_trained_filters = True
        with experiment.test():
            args.force_ce = True
            if args.use_trained_filters:
                logdir_filter = args.outname_base + '_test_2_filter_logs' + '/'
                if args.remove_old_run:
                    shutil.rmtree(logdir_filter)
                if not os.path.exists(logdir_filter):
                    os.makedirs(logdir_filter)
                tflogger_filter = tfLogger(logdir_filter)
                args.use_trained_filters = True

                ''' Test With Filters '''
                if args.use_attr:
                    retrain_disc(args, experiment, train_loader, train_hash,
                                 test_set, modelD, optimizerD,
                                 tflogger_filter, filter_2=filter_2,
                                 filter_0=None, filter_1=None, attribute='2')
                    retrain_disc(args, experiment, train_loader, train_hash,
                                 test_set, modelD, optimizerD,
                                 tflogger_filter, filter_0, filter_1=None,
                                 filter_2=None, attribute='0')
                    retrain_disc(args, experiment, train_loader, train_hash,
                                 test_set, modelD, optimizerD,
                                 tflogger_filter, filter_1=filter_1,
                                 filter_0=None, filter_2=None, attribute='1')
                elif args.use_0_attr:
                    retrain_disc(args, experiment, train_loader, train_hash,
                                 test_set, modelD, optimizerD,
                                 tflogger_filter, filter_0, filter_1=None,
                                 filter_2=None, attribute='0')
                elif args.use_1_attr:
                    # FIX: a stray second positional `experiment` was passed
                    # here (between optimizerD and tflogger_filter), shifting
                    # every following argument by one slot — removed to match
                    # the sibling calls.
                    retrain_disc(args, experiment, train_loader, train_hash,
                                 test_set, modelD, optimizerD,
                                 tflogger_filter, filter_1=filter_1,
                                 filter_0=None, filter_2=None, attribute='1')
                elif args.use_2_attr:
                    retrain_disc(args, experiment, train_loader, train_hash,
                                 test_set, modelD, optimizerD,
                                 tflogger_filter, filter_2=filter_2,
                                 filter_0=None, filter_1=None, attribute='2')

            # Repeat the re-test with filters disabled and TransD frozen.
            args.freeze_transD = True
            args.use_trained_filters = False
            logdir_no_filter = (args.outname_base +
                                '_test_no_2_filter_logs' + '/')
            if args.remove_old_run:
                shutil.rmtree(logdir_no_filter)
            if not os.path.exists(logdir_no_filter):
                os.makedirs(logdir_no_filter)
            tflogger_no_filter = tfLogger(logdir_no_filter)
class Trainer2D:
    """2-D patch trainer.

    Optionally pre-trains the backbone on a jigsaw-puzzle pretext task,
    then trains landmark regression on per-landmark axial slices,
    logging metrics and figures to Comet.
    """

    def __init__(self, config):
        # NOTE(review): hard-coded Comet API key in source — prefer an
        # environment variable.
        self.experiment = Experiment(api_key="CQ4yEzhJorcxul2hHE5gxVNGu",
                                     project_name="HIP")
        self.experiment.log_parameters(vars(config))
        self.config = config
        self.log_step = config.log_step
        self.model = conv2d.Conv2DPatches(image_size=config.image_size)
        print(self.model)
        self.d = get_dataloader2D(config)
        self.train_loader, self.test_loader = self.d
        self.train_loader_jig, self.test_loader_jig = get_dataloader2DJigSaw(
            config)
        self.net_optimizer = optim.Adam(self.model.parameters(), config.lr,
                                        [0.5, 0.9999])
        if torch.cuda.is_available():
            self.model = self.model.cuda()
        self.criterion_c = nn.CrossEntropyLoss()
        self.criterion_d = nn.MSELoss()
        self.epochs = config.epochs
        if torch.cuda.is_available():
            print("Using CUDA")
            self.model = self.model.cuda()
        # Checkpoint paths are keyed by learning rate.
        self.pre_model_path = "./artifacts/pre_models/" + str(
            config.lr) + ".pth"
        self.model_path = "./artifacts/models/" + str(config.lr) + ".pth"
        self.image_size = config.image_size

    def pre_train(self):
        """Solve the jigsaw pretext task, or load a cached pre-trained model."""
        if os.path.isfile(self.pre_model_path):
            print("Using pre-trained model for solving the jigsaw puzzle")
            self.model = torch.load(self.pre_model_path)
        else:
            print("Starting pre-training and solving the jigsaw puzzle")
            # NOTE(review): range(0) never iterates, so pre-training is
            # effectively disabled and an untrained model is saved below —
            # confirm the intended epoch count.
            for epoch in range(0):
                print("Starting epoch {}".format(epoch))
                train_loader = iter(self.train_loader_jig)
                with self.experiment.train():
                    for _ in range(len(train_loader)):
                        self.net_optimizer.zero_grad()
                        # FIX: `iterator.next()` is Python 2 only — use the
                        # built-in next() (AttributeError on Python 3).
                        data, indexes, _ = next(train_loader)
                        data, indexes = self.to_var(data), self.to_var(
                            indexes).float()
                        B, L, H, W = data.size()
                        B, L, S = indexes.size()
                        print(data.size())
                        print(indexes.size())
                        jig_out, _ = self.model(data, True)
                        loss = self.criterion_d(jig_out, indexes.view(-1, S))
                        loss.backward()
                        self.net_optimizer.step()
                        self.experiment.log_metric("pre-loss", loss.item())
                        print("loss: {}".format(loss.item()))
            torch.save(self.model, self.pre_model_path)

    def train(self):
        """Train landmark regression; periodically evaluate and checkpoint."""
        if os.path.isfile(self.model_path):
            print("Using pre-trained model")
            self.model = torch.load(self.model_path)
        # (Removed dead `if False: pass else:` scaffolding — training always
        # proceeds, exactly as before.)
        print("Starting training")
        if torch.cuda.is_available():
            self.model = self.model.cuda()
        for epoch in range(self.epochs):
            print("Starting epoch {}".format(epoch))
            train_loader = iter(self.train_loader)
            with self.experiment.train():
                for _ in range(len(train_loader)):
                    self.net_optimizer.zero_grad()
                    # FIX: Python-2 `.next()` replaced with next().
                    data, landmarks, _ = next(train_loader)
                    data, landmarks = self.to_var(data), self.to_var(
                        landmarks)
                    B, L, H, W = data.size()
                    B, L, S = landmarks.size()
                    # Per sample/landmark: pick the axial slice (axis 1 of
                    # the landmark triplet) that contains the landmark.
                    y = landmarks[:, :, 1].view(B, L)
                    y_slices = torch.zeros([B, L, H, W], dtype=torch.float32)
                    if torch.cuda.is_available():
                        y_slices = y_slices.cuda()
                    for b in range(B):  # renamed from `i` (shadowed the outer loop index)
                        y_slices[b] = data[b, y[b]]
                    jig_out, detected_points = self.model(y_slices)
                    # Regress normalised (x, z) coordinates.
                    landmarks = landmarks.float() / self.image_size
                    loss = self.criterion_d(detected_points,
                                            landmarks[:, :, [0, 2]])
                    loss.backward()
                    self.net_optimizer.step()
                    self.experiment.log_metric("loss", loss.item())
                    print("loss: {}".format(loss.item()))
            if epoch % self.log_step == 0:
                with self.experiment.test():
                    self.evaluate()
                evaluator = Evaluator(self, self.test_loader)
                evaluator.report()
                torch.save(self.model, self.model_path)
        evaluator = Evaluator(self, self.test_loader)
        evaluator.report()

    def evaluate(self):
        """Compute mean MSE over the test set and log a sample figure."""
        test_loader = iter(self.test_loader)
        with self.experiment.test():
            loss = 0
            for _ in range(len(test_loader)):
                self.net_optimizer.zero_grad()
                # FIX: Python-2 `.next()` replaced with next().
                data, landmarks, _ = next(test_loader)
                data, landmarks = self.to_var(data), self.to_var(landmarks)
                B, L, H, W = data.size()
                B, L, S = landmarks.size()
                y = landmarks[:, :, 1].view(B, L)
                y_slices = torch.zeros([B, L, H, W], dtype=torch.float32)
                if torch.cuda.is_available():
                    y_slices = y_slices.cuda()
                for b in range(B):
                    y_slices[b] = data[b, y[b]]
                jig_out, detected_points = self.model(y_slices)
                landmarks = landmarks.float() / self.image_size
                loss += self.criterion_d(detected_points,
                                         landmarks[:, :, [0, 2]]).item()
                self.plots(y_slices.cpu(), landmarks[:, :, [0, 2]],
                           detected_points)
            self.experiment.log_metric("loss", loss / len(test_loader))

    def plots(self, slices, real, predicted):
        """Plot a 4x4 grid of slices with ground-truth (red) and predicted
        (blue) landmark positions for the first sample of the batch."""
        figure, axes = plt.subplots(nrows=4, ncols=4, figsize=(15, 15))
        slices = slices[0].cpu().detach().numpy()
        real = real[0].cpu().detach().numpy()
        predicted = predicted[0].cpu().detach().numpy()
        # Undo the normalisation applied before the loss.
        real *= self.image_size
        predicted *= self.image_size
        s = 0
        for row in range(4):
            for col in range(4):
                axes[row, col].imshow(slices[s])
                x, z = real[s]
                axes[row, col].scatter(x, z, color="red")
                x, z = predicted[s]
                axes[row, col].scatter(x, z, color="blue")
                s += 1
        self.experiment.log_figure(figure=plt)
        plt.savefig("artifacts/predictions/img.png")
        plt.show()

    def to_var(self, x):
        """Converts numpy to variable."""
        if torch.cuda.is_available():
            x = x.cuda()
        return Variable(x, requires_grad=False)

    def to_data(self, x):
        """Converts variable to numpy."""
        if torch.cuda.is_available():
            x = x.cpu()
        return x.data.numpy()

    def predict(self, x):
        """Run the landmark-detection head on a single input batch."""
        if torch.cuda.is_available():
            self.model = self.model.cuda()
            x = x.cuda()
        _, x = self.model(x)
        return x
class Trainer():
    """MAC VQA trainer for CLEVR / GQA.

    Maintains a main model plus an EMA ("model_ema") copy updated by
    ``weight_moving_average``; logs to TensorBoard (SummaryWriter) and,
    optionally, Comet.
    """

    def __init__(self, log_dir, cfg):
        """Build datasets/loaders, the MAC model pair, optimizer and loggers.

        Parameters
        ----------
        log_dir : str
            Experiment output directory (models, logs, cfg dump).
        cfg
            Config object (easydict-style); drives every switch below.
        """
        self.path = log_dir
        self.cfg = cfg

        if cfg.TRAIN.FLAG:
            self.model_dir = os.path.join(self.path, 'Model')
            self.log_dir = os.path.join(self.path, 'Log')
            mkdir_p(self.model_dir)
            mkdir_p(self.log_dir)
            self.writer = SummaryWriter(log_dir=self.log_dir)
        self.logfile = os.path.join(self.path, "logfile.log")
        # Tee all prints into the log file.
        sys.stdout = Logger(logfile=self.logfile)

        self.data_dir = cfg.DATASET.DATA_DIR
        self.max_epochs = cfg.TRAIN.MAX_EPOCHS
        self.snapshot_interval = cfg.TRAIN.SNAPSHOT_INTERVAL

        s_gpus = cfg.GPU_ID.split(',')
        self.gpus = [int(ix) for ix in s_gpus]
        self.num_gpus = len(self.gpus)

        self.batch_size = cfg.TRAIN.BATCH_SIZE
        self.lr = cfg.TRAIN.LEARNING_RATE

        torch.cuda.set_device(self.gpus[0])
        cudnn.benchmark = True

        sample = cfg.SAMPLE
        self.dataset = []
        self.dataloader = []
        self.use_feats = cfg.model.use_feats
        eval_split = cfg.EVAL if cfg.EVAL else 'val'
        train_split = cfg.DATASET.train_split
        if cfg.DATASET.DATASET == 'clevr':
            clevr_collate_fn = collate_fn
            cogent = cfg.DATASET.COGENT
            if cogent:
                print(f'Using CoGenT {cogent.upper()}')

            if cfg.TRAIN.FLAG:
                self.dataset = ClevrDataset(data_dir=self.data_dir,
                                            split=train_split + cogent,
                                            sample=sample,
                                            **cfg.DATASET.params)
                self.dataloader = DataLoader(dataset=self.dataset,
                                             batch_size=cfg.TRAIN.BATCH_SIZE,
                                             shuffle=True,
                                             num_workers=cfg.WORKERS,
                                             drop_last=True,
                                             collate_fn=clevr_collate_fn)

            self.dataset_val = ClevrDataset(data_dir=self.data_dir,
                                            split=eval_split + cogent,
                                            sample=sample,
                                            **cfg.DATASET.params)
            self.dataloader_val = DataLoader(dataset=self.dataset_val,
                                             batch_size=cfg.TEST_BATCH_SIZE,
                                             drop_last=False,
                                             shuffle=False,
                                             num_workers=cfg.WORKERS,
                                             collate_fn=clevr_collate_fn)

        elif cfg.DATASET.DATASET == 'gqa':
            # Collate depends on the feature type (spatial grid vs objects).
            if self.use_feats == 'spatial':
                gqa_collate_fn = collate_fn_gqa
            elif self.use_feats == 'objects':
                gqa_collate_fn = collate_fn_gqa_objs
            if cfg.TRAIN.FLAG:
                self.dataset = GQADataset(data_dir=self.data_dir,
                                          split=train_split,
                                          sample=sample,
                                          use_feats=self.use_feats,
                                          **cfg.DATASET.params)
                self.dataloader = DataLoader(dataset=self.dataset,
                                             batch_size=cfg.TRAIN.BATCH_SIZE,
                                             shuffle=True,
                                             num_workers=cfg.WORKERS,
                                             drop_last=True,
                                             collate_fn=gqa_collate_fn)

            self.dataset_val = GQADataset(data_dir=self.data_dir,
                                          split=eval_split,
                                          sample=sample,
                                          use_feats=self.use_feats,
                                          **cfg.DATASET.params)
            self.dataloader_val = DataLoader(dataset=self.dataset_val,
                                             batch_size=cfg.TEST_BATCH_SIZE,
                                             shuffle=False,
                                             num_workers=cfg.WORKERS,
                                             drop_last=False,
                                             collate_fn=gqa_collate_fn)

        # load model (main + EMA copy; alpha=0 copies weights exactly)
        self.vocab = load_vocab(cfg)
        self.model, self.model_ema = mac.load_MAC(cfg, self.vocab)
        self.weight_moving_average(alpha=0)
        if cfg.TRAIN.RADAM:
            self.optimizer = RAdam(self.model.parameters(), lr=self.lr)
        else:
            self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

        self.start_epoch = 0
        if cfg.resume_model:
            location = 'cuda' if cfg.CUDA else 'cpu'
            state = torch.load(cfg.resume_model, map_location=location)
            self.model.load_state_dict(state['model'])
            self.optimizer.load_state_dict(state['optim'])
            self.start_epoch = state['iter'] + 1
            state = torch.load(cfg.resume_model_ema, map_location=location)
            self.model_ema.load_state_dict(state['model'])

        if cfg.start_epoch is not None:
            self.start_epoch = cfg.start_epoch

        # Best-so-far bookkeeping for early stopping / snapshots.
        self.previous_best_acc = 0.0
        self.previous_best_epoch = 0
        self.previous_best_loss = 100
        self.previous_best_loss_epoch = 0

        self.total_epoch_loss = 0
        self.prior_epoch_loss = 10

        self.print_info()
        self.loss_fn = torch.nn.CrossEntropyLoss().cuda()

        # Comet is disabled entirely unless cfg.logcomet is truthy.
        self.comet_exp = Experiment(
            project_name=cfg.COMET_PROJECT_NAME,
            api_key=os.getenv('COMET_API_KEY'),
            workspace=os.getenv('COMET_WORKSPACE'),
            disabled=cfg.logcomet is False,
        )
        if cfg.logcomet:
            exp_name = cfg_to_exp_name(cfg)
            print(exp_name)
            self.comet_exp.set_name(exp_name)
            self.comet_exp.log_parameters(flatten_json_iterative_solution(cfg))
            self.comet_exp.log_asset(self.logfile)
            self.comet_exp.log_asset_data(json.dumps(cfg, indent=4),
                                          file_name='cfg.json')
            self.comet_exp.set_model_graph(str(self.model))
        if cfg.cfg_file:
            self.comet_exp.log_asset(cfg.cfg_file)

        with open(os.path.join(self.path, 'cfg.json'), 'w') as f:
            json.dump(cfg, f, indent=4)

    def print_info(self):
        """Dump config, dataset sizes and the model to stdout/logfile."""
        print('Using config:')
        pprint.pprint(self.cfg)
        print("\n")

        pprint.pprint("Size of train dataset: {}".format(len(self.dataset)))
        pprint.pprint("Size of val dataset: {}".format(len(self.dataset_val)))
        print("\n")

        print("Using MAC-Model:")
        pprint.pprint(self.model)
        print("\n")

    def weight_moving_average(self, alpha=0.999):
        """Update the EMA model: ema = alpha * ema + (1 - alpha) * model."""
        for param1, param2 in zip(self.model_ema.parameters(),
                                  self.model.parameters()):
            param1.data *= alpha
            param1.data += (1.0 - alpha) * param2.data

    def set_mode(self, mode="train"):
        """Switch both models between train and eval mode."""
        if mode == "train":
            self.model.train()
            self.model_ema.train()
        else:
            self.model.eval()
            self.model_ema.eval()

    def reduce_lr(self):
        """Halve the LR when the epoch-loss improvement plateaus.

        Thresholds tighten as the absolute loss shrinks; each tier also has
        a floor below which the LR is no longer reduced.
        """
        epoch_loss = self.total_epoch_loss  # / float(len(self.dataset) // self.batch_size)
        lossDiff = self.prior_epoch_loss - epoch_loss
        if ((lossDiff < 0.015 and self.prior_epoch_loss < 0.5 and self.lr > 0.00002) or \
                (lossDiff < 0.008 and self.prior_epoch_loss < 0.15 and self.lr > 0.00001) or \
                (lossDiff < 0.003 and self.prior_epoch_loss < 0.10 and self.lr > 0.000005)):
            self.lr *= 0.5
            print("Reduced learning rate to {}".format(self.lr))
            for param_group in self.optimizer.param_groups:
                param_group['lr'] = self.lr
        self.prior_epoch_loss = epoch_loss
        self.total_epoch_loss = 0

    def save_models(self, iteration):
        """Checkpoint both the main model (with optimizer) and the EMA model."""
        save_model(self.model, self.optimizer, iteration, self.model_dir,
                   model_name="model")
        save_model(self.model_ema, None, iteration, self.model_dir,
                   model_name="model_ema")

    def train_epoch(self, epoch):
        """Run one training epoch; returns running avg loss/accuracy metrics."""
        cfg = self.cfg
        total_loss = 0.
        total_correct = 0
        total_samples = 0
        self.labeled_data = iter(self.dataloader)
        self.set_mode("train")

        dataset = tqdm(self.labeled_data, total=len(self.dataloader), ncols=20)
        for data in dataset:
            ######################################################
            # (1) Prepare training data
            ######################################################
            image, question, question_len, answer = data['image'], data[
                'question'], data['question_length'], data['answer']
            answer = answer.long()
            question = Variable(question)
            answer = Variable(answer)
            if cfg.CUDA:
                if self.use_feats == 'spatial':
                    image = image.cuda()
                elif self.use_feats == 'objects':
                    # Object features arrive as a list of tensors.
                    image = [e.cuda() for e in image]
                question = question.cuda()
                answer = answer.cuda().squeeze()
            else:
                question = question
                image = image
                answer = answer.squeeze()

            ############################
            # (2) Train Model
            ############################
            self.optimizer.zero_grad()

            scores = self.model(image, question, question_len)
            loss = self.loss_fn(scores, answer)
            loss.backward()
            if self.cfg.TRAIN.CLIP_GRADS:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                               self.cfg.TRAIN.CLIP)
            self.optimizer.step()
            self.weight_moving_average()

            ############################
            # (3) Log Progress
            ############################
            correct = scores.detach().argmax(1) == answer
            total_correct += correct.sum().cpu().item()
            total_loss += loss.item() * answer.size(0)
            total_samples += answer.size(0)

            avg_loss = total_loss / total_samples
            train_accuracy = total_correct / total_samples

            dataset.set_description(
                'Epoch: {}; Avg Loss: {:.5f}; Avg Train Acc: {:.5f}'.format(
                    epoch + 1, avg_loss, train_accuracy))

        self.total_epoch_loss = avg_loss

        # NOTE(review): `dict` shadows the builtin here and in train() —
        # renaming would touch a parameter name (log_results) so it is left
        # as-is; prefer `metrics` if this is ever refactored.
        dict = {
            "loss": avg_loss,
            "accuracy": train_accuracy,
            "avg_loss": avg_loss,  # For commet
            "avg_accuracy": train_accuracy,  # For commet
        }
        return dict

    def train(self):
        """Full training loop with per-epoch validation and early stopping."""
        cfg = self.cfg
        print("Start Training")
        for epoch in range(self.start_epoch, self.max_epochs):
            with self.comet_exp.train():
                dict = self.train_epoch(epoch)
                self.reduce_lr()
                dict['epoch'] = epoch + 1
                dict['lr'] = self.lr
                self.comet_exp.log_metrics(
                    dict,
                    epoch=epoch + 1,
                )
            with self.comet_exp.validate():
                dict = self.log_results(epoch, dict)
                dict['epoch'] = epoch + 1
                dict['lr'] = self.lr
                self.comet_exp.log_metrics(
                    dict,
                    epoch=epoch + 1,
                )
            # NOTE(review): attribute name mirrors the (misspelled) config key
            # TRAIN.EALRY_STOPPING — must stay in sync with the config files.
            if cfg.TRAIN.EALRY_STOPPING:
                if epoch - cfg.TRAIN.PATIENCE == self.previous_best_epoch:
                    print('Early stop')
                    break

        self.comet_exp.log_asset(self.logfile)
        self.save_models(self.max_epochs)
        self.writer.close()
        print("Finished Training")
        print(
            f"Highest validation accuracy: {self.previous_best_acc} at epoch {self.previous_best_epoch}"
        )

    def log_results(self, epoch, dict, max_eval_samples=None):
        """Write train/val scalars, track bests, snapshot; returns val metrics."""
        epoch += 1
        self.writer.add_scalar("avg_loss", dict["loss"], epoch)
        self.writer.add_scalar("train_accuracy", dict["accuracy"], epoch)

        metrics = self.calc_accuracy("validation",
                                     max_samples=max_eval_samples)
        self.writer.add_scalar("val_accuracy_ema", metrics['acc_ema'], epoch)
        self.writer.add_scalar("val_accuracy", metrics['acc'], epoch)
        self.writer.add_scalar("val_loss_ema", metrics['loss_ema'], epoch)
        self.writer.add_scalar("val_loss", metrics['loss'], epoch)

        print(
            "Epoch: {epoch}\tVal Acc: {acc},\tVal Acc EMA: {acc_ema},\tAvg Loss: {loss},\tAvg Loss EMA: {loss_ema},\tLR: {lr}"
            .format(epoch=epoch, lr=self.lr, **metrics))

        if metrics['acc'] > self.previous_best_acc:
            self.previous_best_acc = metrics['acc']
            self.previous_best_epoch = epoch
        if metrics['loss'] < self.previous_best_loss:
            self.previous_best_loss = metrics['loss']
            self.previous_best_loss_epoch = epoch

        if epoch % self.snapshot_interval == 0:
            self.save_models(epoch)

        return metrics

    def calc_accuracy(self, mode="train", max_samples=None):
        """Evaluate both models (main + EMA) over a loader.

        Parameters
        ----------
        mode : str
            "train" evaluates on the train loader; anything else uses the
            validation loader.
        max_samples
            Unused here; kept for interface compatibility with callers.

        Returns
        -------
        dict
            acc / acc_ema / loss / loss_ema averaged over all samples.
        """
        self.set_mode("validation")
        if mode == "train":
            loader = self.dataloader
        else:
            loader = self.dataloader_val

        total_correct = 0
        total_correct_ema = 0
        total_samples = 0
        total_loss = 0.
        total_loss_ema = 0.
        pbar = tqdm(loader, total=len(loader), desc=mode.upper(), ncols=20)
        for data in pbar:
            image, question, question_len, answer = data['image'], data[
                'question'], data['question_length'], data['answer']
            answer = answer.long()
            question = Variable(question)
            answer = Variable(answer)
            if self.cfg.CUDA:
                if self.use_feats == 'spatial':
                    image = image.cuda()
                elif self.use_feats == 'objects':
                    image = [e.cuda() for e in image]
                question = question.cuda()
                answer = answer.cuda().squeeze()

            with torch.no_grad():
                scores = self.model(image, question, question_len)
                scores_ema = self.model_ema(image, question, question_len)

                loss = self.loss_fn(scores, answer)
                loss_ema = self.loss_fn(scores_ema, answer)

            correct = scores.detach().argmax(1) == answer
            correct_ema = scores_ema.detach().argmax(1) == answer

            total_correct += correct.sum().cpu().item()
            total_correct_ema += correct_ema.sum().cpu().item()
            total_loss += loss.item() * answer.size(0)
            total_loss_ema += loss_ema.item() * answer.size(0)
            total_samples += answer.size(0)

            avg_acc = total_correct / total_samples
            avg_acc_ema = total_correct_ema / total_samples
            avg_loss = total_loss / total_samples
            avg_loss_ema = total_loss_ema / total_samples

            pbar.set_postfix({
                'Acc': f'{avg_acc:.5f}',
                'Acc Ema': f'{avg_acc_ema:.5f}',
                'Loss': f'{avg_loss:.5f}',
                'Loss Ema': f'{avg_loss_ema:.5f}',
            })

        return dict(acc=avg_acc, acc_ema=avg_acc_ema, loss=avg_loss,
                    loss_ema=avg_loss_ema)
def main(_):
    """Train the RPN3D (VoxelNet-style) detector with Comet logging.

    Restores from the latest checkpoint in ``save_model_dir`` when one
    exists, otherwise initializes fresh parameters. Each epoch iterates the
    training set, periodically writes train/validation summaries, supports
    cooperative pausing via ``check_if_should_pause``, checkpoints every
    epoch, and every 10 epochs dumps predictions on the validation set and
    invokes the KITTI evaluation script.

    Reads module-level state: ``args``, ``cfg``, ``save_model_dir``,
    ``log_dir``, ``train_dir``, ``val_dir``.
    """
    experiment = Experiment(api_key="xXtJguCo8yFdU7dpjEpo6YbHw",
                            project_name=args.experiment_name)
    hyper_params = {
        "learning_rate": args.lr,
        "num_epochs": args.max_epoch,
        "batch_size": args.single_batch_size,
        "alpha": args.alpha,
        "beta": args.beta,
        "gamma": args.gamma,
        "loss": args.loss
    }
    experiment.log_multiple_params(hyper_params)
    # TODO: split file support
    with tf.Graph().as_default():
        global save_model_dir
        start_epoch = 0
        global_counter = 0
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=cfg.GPU_MEMORY_FRACTION,
            visible_device_list=cfg.GPU_AVAILABLE,
            allow_growth=True)
        config = tf.ConfigProto(
            gpu_options=gpu_options,
            device_count={
                "GPU": cfg.GPU_USE_COUNT,
            },
            allow_soft_placement=True,
            log_device_placement=False,
        )
        with tf.Session(config=config) as sess:
            model = RPN3D(cls=cfg.DETECT_OBJ,
                          single_batch_size=args.single_batch_size,
                          learning_rate=args.lr,
                          max_gradient_norm=5.0,
                          alpha=args.alpha,
                          beta=args.beta,
                          gamma=args.gamma,
                          loss_type=args.loss,
                          avail_gpus=cfg.GPU_AVAILABLE.split(','))
            # Parameter init / restore: resume from the last checkpoint if any.
            if tf.train.get_checkpoint_state(save_model_dir):
                print("Reading model parameters from %s" % save_model_dir)
                model.saver.restore(sess,
                                    tf.train.latest_checkpoint(save_model_dir))
                start_epoch = model.epoch.eval() + 1
                global_counter = model.global_step.eval() + 1
            else:
                print("Created model with fresh parameters.")
                tf.global_variables_initializer().run()

            is_summary, is_summary_image, is_validate = False, False, False
            summary_interval = 5
            summary_val_interval = 10
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            experiment.set_model_graph(sess.graph)

            # Training loop; metrics inside get the Comet 'train_' prefix.
            with experiment.train():
                for epoch in range(start_epoch, args.max_epoch):
                    counter = 0
                    batch_time = time.time()
                    experiment.log_current_epoch(epoch)
                    for batch in iterate_data(
                            train_dir,
                            shuffle=True,
                            aug=True,
                            is_testset=False,
                            batch_size=args.single_batch_size *
                            cfg.GPU_USE_COUNT,
                            multi_gpu_sum=cfg.GPU_USE_COUNT):
                        counter += 1
                        global_counter += 1
                        experiment.set_step(global_counter)
                        # Only fetch summaries every summary_interval batches.
                        if counter % summary_interval == 0:
                            is_summary = True
                        else:
                            is_summary = False
                        epochs = args.max_epoch
                        start_time = time.time()
                        ret = model.train_step(sess,
                                               batch,
                                               train=True,
                                               summary=is_summary)
                        forward_time = time.time() - start_time
                        batch_time = time.time() - batch_time
                        param = ret
                        params = {
                            "loss": param[0],
                            "cls_loss": param[1],
                            "cls_pos_loss": param[2],
                            "cls_neg_loss": param[3]
                        }
                        experiment.log_multiple_metrics(params)
                        print(
                            'train: {} @ epoch:{}/{} loss: {:.4f} cls_loss: {:.4f} cls_pos_loss: {:.4f} cls_neg_loss: {:.4f} forward time: {:.4f} batch time: {:.4f}'
                            .format(counter, epoch, epochs, ret[0], ret[1],
                                    ret[2], ret[3], forward_time, batch_time))
                        if counter % summary_interval == 0:
                            print("summary_interval now")
                            summary_writer.add_summary(ret[-1], global_counter)
                        if counter % summary_val_interval == 0:
                            print("summary_val_interval now")
                            batch = sample_test_data(
                                val_dir,
                                args.single_batch_size * cfg.GPU_USE_COUNT,
                                multi_gpu_sum=cfg.GPU_USE_COUNT)
                            ret = model.validate_step(sess, batch, summary=True)
                            summary_writer.add_summary(ret[-1], global_counter)
                            # Best-effort prediction summary: skip on failure,
                            # but never swallow KeyboardInterrupt/SystemExit
                            # (the original bare `except:` did).
                            try:
                                ret = model.predict_step(sess,
                                                         batch,
                                                         summary=True)
                                summary_writer.add_summary(
                                    ret[-1], global_counter)
                            except Exception:
                                print("prediction skipped due to error")
                        # Cooperative pause: checkpoint and exit cleanly.
                        if check_if_should_pause(args.tag):
                            model.saver.save(sess,
                                             os.path.join(
                                                 save_model_dir, 'checkpoint'),
                                             global_step=model.global_step)
                            print('pause and save model @ {} steps:{}'.format(
                                save_model_dir, model.global_step.eval()))
                            sys.exit(0)
                        batch_time = time.time()

                    experiment.log_epoch_end(epoch)
                    sess.run(model.epoch_add_op)
                    model.saver.save(sess,
                                     os.path.join(save_model_dir, 'checkpoint'),
                                     global_step=model.global_step)

                    # Dump test data every 10 epochs.
                    if (epoch + 1) % 10 == 0:
                        # Create output folders.
                        os.makedirs(os.path.join(args.output_path, str(epoch)),
                                    exist_ok=True)
                        os.makedirs(os.path.join(args.output_path, str(epoch),
                                                 'data'),
                                    exist_ok=True)
                        if args.vis:
                            os.makedirs(os.path.join(args.output_path,
                                                     str(epoch), 'vis'),
                                        exist_ok=True)
                        for batch in iterate_data(
                                val_dir,
                                shuffle=False,
                                aug=False,
                                is_testset=False,
                                batch_size=args.single_batch_size *
                                cfg.GPU_USE_COUNT,
                                multi_gpu_sum=cfg.GPU_USE_COUNT):
                            if args.vis:
                                tags, results, front_images, bird_views, heatmaps = model.predict_step(
                                    sess, batch, summary=False, vis=True)
                            else:
                                tags, results = model.predict_step(
                                    sess, batch, summary=False, vis=False)
                            # Write KITTI-format label files, one per frame.
                            for tag, result in zip(tags, results):
                                of_path = os.path.join(args.output_path,
                                                       str(epoch), 'data',
                                                       tag + '.txt')
                                with open(of_path, 'w+') as f:
                                    labels = box3d_to_label(
                                        [result[:, 1:8]], [result[:, 0]],
                                        [result[:, -1]],
                                        coordinate='lidar')[0]
                                    for line in labels:
                                        f.write(line)
                                    print('write out {} objects to {}'.format(
                                        len(labels), tag))
                            # Dump visualizations.
                            if args.vis:
                                for tag, front_image, bird_view, heatmap in zip(
                                        tags, front_images, bird_views,
                                        heatmaps):
                                    front_img_path = os.path.join(
                                        args.output_path, str(epoch), 'vis',
                                        tag + '_front.jpg')
                                    bird_view_path = os.path.join(
                                        args.output_path, str(epoch), 'vis',
                                        tag + '_bv.jpg')
                                    heatmap_path = os.path.join(
                                        args.output_path, str(epoch), 'vis',
                                        tag + '_heatmap.jpg')
                                    cv2.imwrite(front_img_path, front_image)
                                    cv2.imwrite(bird_view_path, bird_view)
                                    cv2.imwrite(heatmap_path, heatmap)
                        # Execute evaluation code.
                        # NOTE(review): os.system with interpolated paths —
                        # fine for trusted local configs, but shell-injectable
                        # if args.output_path ever comes from untrusted input.
                        cmd_1 = "./kitti_eval/launch_test.sh"
                        cmd_2 = os.path.join(args.output_path, str(epoch))
                        cmd_3 = os.path.join(args.output_path, str(epoch),
                                             'log')
                        os.system(" ".join([cmd_1, cmd_2, cmd_3]))

                print('train done. total epoch:{} iter:{}'.format(
                    epoch, model.global_step.eval()))

            # Finally save model.
            model.saver.save(sess,
                             os.path.join(save_model_dir, 'checkpoint'),
                             global_step=model.global_step)
def train(args, use_comet : bool = True):
    """Train, evaluate and optionally Comet-log an IAM-lines model.

    Looks up dataset/model/network classes in the module-level ``funcs``
    registry, splits the held-out data into test/validation, trains via
    ``train_model``, then evaluates. Three paths: Comet-logged training,
    learning-rate finding (no logging), or plain training.

    Args:
        args (dict): Keys: dataset, model, network, batch_size, epochs,
            find_lr, weights, save_model; plus backbone/seq/bi for 'lstmctc'.
        use_comet (bool): Log the run to Comet when True and not finding lr.
    """
    data_cls = funcs[args['dataset']]
    model_cls = funcs[args['model']]
    network = funcs[args['network']]

    print ('[INFO] Getting dataset...')
    data = data_cls()
    data.load_data()
    (x_train, y_train), (x_test, y_test) = (data.x_train, data.y_train), (data.x_test, data.y_test)
    classes = data.mapping

    # #Used for testing only
    # x_train = x_train[:100, :, :]
    # y_train = y_train[:100, :]
    # x_test = x_test[:100, :, :]
    # y_test = y_test[:100, :]
    # print ('[INFO] Training shape: ', x_train.shape, y_train.shape)
    # print ('[INFO] Test shape: ', x_test.shape, y_test.shape)
    # #delete these lines

    # Split held-out data into test/val. NOTE(review): test_size=0.2 yields
    # an 80/20 split, not the 90/10 the original comment claimed — confirm
    # which was intended. No stratification here (unlike the EMNIST variant).
    (x_test, x_valid, y_test, y_valid) = train_test_split(x_test, y_test, test_size=0.2, random_state=42)
    print ('[INFO] Training shape: ', x_train.shape, y_train.shape)
    print ('[INFO] Validation shape: ', x_valid.shape, y_valid.shape)
    print ('[INFO] Test shape: ', x_test.shape, y_test.shape)

    print ('[INFO] Setting up the model..')
    # 'lstmctc' networks take extra constructor arguments.
    if args['network'] == 'lstmctc':
        network_args = {'backbone' : args['backbone'],
                        'seq_model' : args['seq'],
                        'bi' : args['bi']
                        }
        model = model_cls(network, data_cls, network_args)
    else:
        model = model_cls(network, data_cls)
    print (model)

    dataset = dict({
        'x_train' : x_train,
        'y_train' : y_train,
        'x_valid' : x_valid,
        'y_valid' : y_valid,
        'x_test' : x_test,
        'y_test' : y_test
    })

    if use_comet and args['find_lr'] == False:
        # Create an experiment with your api key.
        # NOTE(review): hardcoded API key in source — should come from an
        # environment variable or config, and this one should be revoked.
        experiment = Experiment(api_key='WVBNRAfMLCBWslJAAsffxM4Gz',
                                project_name='iam_lines',
                                auto_param_logging=False)

        print ('[INFO] Starting Training...')
        # Will log metrics with the prefix 'train_'.
        with experiment.train():
            _ = train_model(
                model,
                dataset,
                batch_size=args['batch_size'],
                epochs=args['epochs'],
                name=args['network']
                )

        print ('[INFO] Starting Testing...')
        # Will log metrics with the prefix 'test_'.
        with experiment.test():
            score = model.evaluate(dataset, int(args['batch_size']))
            print(f'[INFO] Test evaluation: {score*100}...')
            metrics = {
                'accuracy':score
            }
            experiment.log_metrics(metrics)

        experiment.log_parameters(args)
        experiment.log_dataset_hash(x_train)  # creates and logs a hash of your data
        experiment.end()

    elif use_comet and args['find_lr'] == True:
        # Learning-rate finder path: train only, no Comet logging.
        _ = train_model(
            model,
            dataset,
            batch_size=args['batch_size'],
            epochs=args['epochs'],
            FIND_LR=args['find_lr'],
            name=args['network']
            )

    else :
        print ('[INFO] Starting Training...')
        train_model(
            model,
            dataset,
            batch_size=args['batch_size'],
            epochs=args['epochs'],
            name=args['network']
            )
        print ('[INFO] Starting Testing...')
        score = model.evaluate(dataset, args['batch_size'])
        print(f'[INFO] Test evaluation: {score*100}...')

    if args['weights']:
        model.save_weights()

    if args['save_model']:
        model.save_model()
# NOTE(review): fragment — the lines below continue a model-construction call
# (presumably model.add(Conv1D(...)) whose opening, along with the enclosing
# function's `def`, lies outside this chunk. Code left untouched.
                 kernel_size=params['filter_size'],
                 padding='same',
                 activation=params['activation']))
model.add(Dropout(params['dropout']))
model.add(Flatten())
# Single sigmoid unit: binary classification head.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer=params['optimizer'],
              metrics=['accuracy'])

# print model.summary() to preserve automatically in `Output` tab
print(model.summary())
params.update({'total_number_of_parameters': model.count_params()})

# will log metrics with the prefix 'train_'
with experiment.train():
    model.fit(X_train, y_train,
              epochs=params['epochs'],
              batch_size=params['batch_size'],
              verbose=1,
              validation_data=(X_test, y_test))

# will log metrics with the prefix 'test_'
with experiment.test():
    loss, accuracy = model.evaluate(X_test, y_test)
    metrics = {'loss': loss, 'accuracy': accuracy}
    experiment.log_multiple_metrics(metrics)

experiment.log_multiple_params(params)
experiment.log_dataset_hash(X_train)  # creates and logs a hash of your data
def train(args, use_comet: bool = True):
    """Train, evaluate and optionally Comet-log an EMNIST model.

    Looks up dataset/model/network classes in the module-level ``funcs``
    registry, carves a stratified validation set out of the test split,
    trains via ``train_model``, then evaluates. Three paths: Comet-logged
    training, learning-rate finding (no logging), or plain training.

    Args:
        args (dict): Keys: dataset, model, network, batch_size, epochs,
            find_lr, weights, save_model.
        use_comet (bool): Log the run to Comet when True and not finding lr.
    """
    data_cls = funcs[args['dataset']]
    model_cls = funcs[args['model']]
    network = funcs[args['network']]

    print('[INFO] Getting dataset...')
    data = data_cls()
    (x_train, y_train), (x_test, y_test) = data.load_data()
    classes = data.mapping

    # One-hot rows -> integer class ids, used only to stratify the split
    # below. np.argmax is the vectorized equivalent of the original per-row
    # np.where(row == 1)[0][0] loop (assumes rows are valid one-hot vectors).
    y_test_labels = np.argmax(y_test, axis=1)

    # distribute 90% test 10% val dataset with equal class distribution
    (x_test, x_valid, y_test, y_valid) = train_test_split(
        x_test,
        y_test,
        test_size=0.1,
        stratify=y_test_labels,
        random_state=42)
    print('[INFO] Training shape: ', x_train.shape, y_train.shape)
    print('[INFO] Validation shape: ', x_valid.shape, y_valid.shape)
    print('[INFO] Test shape: ', x_test.shape, y_test.shape)

    print('[INFO] Setting up the model..')
    model = model_cls(network, data_cls)
    print(model)

    dataset = dict({
        'x_train': x_train,
        'y_train': y_train,
        'x_valid': x_valid,
        'y_valid': y_valid,
        'x_test': x_test,
        'y_test': y_test
    })

    # `find_lr` is treated as a boolean flag (was `== False`/`== True`).
    if use_comet and not args['find_lr']:
        # Create an experiment with your api key.
        # TODO(review): load the key from an env var / config instead of
        # leaving a placeholder literal in source.
        experiment = Experiment(api_key='INSERT API KEY',
                                project_name='emnist',
                                auto_param_logging=False)

        print('[INFO] Starting Training...')
        # Will log metrics with the prefix 'train_'.
        with experiment.train():
            _ = train_model(model,
                            dataset,
                            batch_size=args['batch_size'],
                            epochs=args['epochs'],
                            name=args['network'])

        print('[INFO] Starting Testing...')
        # Will log metrics with the prefix 'test_'.
        with experiment.test():
            loss, score = model.evaluate(dataset, args['batch_size'])
            print(f'[INFO] Test evaluation: {score*100}')
            metrics = {'loss': loss, 'accuracy': score}
            experiment.log_metrics(metrics)

        experiment.log_parameters(args)
        experiment.log_dataset_hash(x_train)  # creates and logs a hash of your data
        experiment.end()

    elif use_comet and args['find_lr']:
        # Learning-rate finder path: train only, no Comet logging.
        _ = train_model(model,
                        dataset,
                        batch_size=args['batch_size'],
                        epochs=args['epochs'],
                        FIND_LR=args['find_lr'],
                        name=args['network'])

    else:
        print('[INFO] Starting Training...')
        train_model(model,
                    dataset,
                    batch_size=args['batch_size'],
                    epochs=args['epochs'],
                    name=args['network'])
        print('[INFO] Starting Testing...')
        loss, score = model.evaluate(dataset, args['batch_size'])
        print(f'[INFO] Test evaluation: {score*100}')

    if args['weights']:
        model.save_weights()

    if args['save_model']:
        model.save_model()