def main(argv=None):
    """Validate command-line flags and run the selected pipeline stage.

    FLAGS.multigpu selects the single-GPU ('n') or multi-GPU ('y') code
    path; FLAGS.mode selects train / save_pb / eval. Any other multigpu
    value raises; an unrecognized mode is silently ignored, as before.
    """
    check_args(FLAGS)
    multigpu, mode = FLAGS.multigpu, FLAGS.mode
    if multigpu not in ('n', 'y'):
        raise Exception('Please use true option of multigpu')
    if multigpu == 'n':
        if mode == 'train':
            from train import train_model
            train_model(FLAGS)
        elif mode == 'save_pb':
            from save_model import load_weights_save_pb
            load_weights_save_pb(FLAGS)
        elif mode == 'eval':
            from eval import eval_model
            eval_model(FLAGS)
    else:  # multigpu == 'y'
        if mode == 'train':
            from multigpu_train import train_model
            train_model(FLAGS)
        elif mode == 'save_pb':
            from save_multigpu_model import load_weights_save_pb
            load_weights_save_pb(FLAGS)
        elif mode == 'eval':
            from eval import eval_model
            eval_model(FLAGS)
def evaluate(ema, dl_eval):
    """Evaluate both the EMA-averaged and the raw model on dl_eval.

    Returns (acc_1, acc_5, acc_1_ema, acc_5_ema): top-1/top-5 accuracy
    of the raw model followed by the same metrics for the EMA model.
    """
    ema_top1, ema_top5 = eval_model(ema.ema_model, dl_eval)
    raw_top1, raw_top5 = eval_model(ema.model, dl_eval)
    # Release cached GPU memory left behind by the two eval passes.
    torch.cuda.empty_cache()
    return raw_top1, raw_top5, ema_top1, ema_top5
def train(meta_files):
    """Train an LBPH face recognizer and report train/val/test accuracy.

    meta_files is [train_meta, val_meta, test_meta]; if fewer than three
    entries are supplied, the dataset metadata is (re)initialized via idm.
    The trained model is written to config.OUTPUT_DIR/config.OUTPUT_MODEL_FILE.
    """
    if len(meta_files) < 3:
        meta_files = idm.init_dataset_meta()
    if not os.path.exists(config.OUTPUT_DIR):
        os.mkdir(config.OUTPUT_DIR)
    recognizer = cv2.face.LBPHFaceRecognizer_create()
    [list_img, list_label], num_sample = load_batch(meta_files[0])
    if num_sample < 1:
        # BUGFIX: previously this only printed the error and fell through
        # to recognizer.train() with an empty batch; bail out instead.
        print('Err: 0 sample found')
        return
    recognizer.train(list_img, np.array(list_label))
    #recognizer.update(list_img, np.array(list_label))
    train_accuracy = eval_model(recognizer, meta_files[0])
    val_accuracy = eval_model(recognizer, meta_files[1])
    test_accuracy = eval_model(recognizer, meta_files[2])
    print('Train accuracy = %f' % (train_accuracy))
    print('Test accuracy = %f' % (test_accuracy))
    print('Validate accuracy = %f' % (val_accuracy))
    recognizer.write(os.path.join(config.OUTPUT_DIR, config.OUTPUT_MODEL_FILE))
def train(meta_files):
    """Train an LBPH face recognizer on meta_files[0] and report accuracy.

    meta_files is [train_meta, val_meta, test_meta]. The trained model is
    saved to config.OUTPUT_DIR/config.OUTPUT_MODEL_FILE. Returns early
    (silently) when the training batch is empty.
    """
    if not os.path.exists(config.OUTPUT_DIR):
        os.mkdir(config.OUTPUT_DIR)
    # Alternative recognizers kept for reference:
    #recognizer=cv2.face.EigenFaceRecognizer_create()
    #recognizer=cv2.face.FisherFaceRecognizer_create()
    recognizer = cv2.face.LBPHFaceRecognizer_create()
    # (removed unused local `global_accuracy = 0`)
    print('\nProcessing...')
    [list_img, list_label], num_sample = load_batch(meta_files[0])
    if num_sample < 1:
        return
    recognizer.train(list_img, np.array(list_label))
    train_accuracy = eval_model(recognizer, meta_files[0])
    # val_accuracy is computed but not reported; kept for parity with the
    # sibling train() that prints it — TODO confirm it is wanted here.
    val_accuracy = eval_model(recognizer, meta_files[1])
    test_accuracy = eval_model(recognizer, meta_files[2])
    print(
        '\nRESULT: Number of images: %d, Train accuracy: %g, Test accuracy: %g'
        % (num_sample, train_accuracy, test_accuracy))
    # NOTE(review): the sibling train() uses recognizer.write(); `save` is
    # the legacy alias — consider unifying on write().
    recognizer.save(os.path.join(config.OUTPUT_DIR, config.OUTPUT_MODEL_FILE))
def main(argv=None):
    """Stage data between OBS (s3://) and local cache dirs, then dispatch on FLAGS.mode."""
    check_args(FLAGS)
    root = FLAGS.local_data_root
    # Training data: copy from OBS into a local cache unless already local.
    if not FLAGS.data_url.startswith('s3://'):
        FLAGS.data_local = FLAGS.data_url
    else:
        FLAGS.data_local = os.path.join(root, 'train_data/')
        if os.path.exists(FLAGS.data_local):
            print('FLAGS.data_local: %s is already exist, skip copy' % FLAGS.data_local)
        else:
            mox.file.copy_parallel(FLAGS.data_url, FLAGS.data_local)
            # To load pretrained weights: download them manually, upload to OBS,
            # then copy next to the training code with mox.file.copy(src, dst);
            # dst must be a concrete file name, not a directory, e.g.:
            # mox.file.copy('s3://your_obs_path/imagenet_class_index.json',
            #     os.path.dirname(os.path.abspath(__file__)) + '/models/imagenet_class_index.json')
            # mox.file.copy('s3://your_obs_path/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5',
            #     os.path.dirname(os.path.abspath(__file__)) + '/models/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5')
    # Model snapshot directory.
    if not FLAGS.train_url.startswith('s3://'):
        FLAGS.train_local = FLAGS.train_url
    else:
        FLAGS.train_local = os.path.join(root, 'model_snapshots/')
        if not os.path.exists(FLAGS.train_local):
            os.mkdir(FLAGS.train_local)
    # Test data.
    if not FLAGS.test_data_url.startswith('s3://'):
        FLAGS.test_data_local = FLAGS.test_data_url
    else:
        FLAGS.test_data_local = os.path.join(root, 'test_data/')
        if os.path.exists(FLAGS.test_data_local):
            print('FLAGS.test_data_local: %s is already exist, skip copy' % FLAGS.test_data_local)
        else:
            mox.file.copy_parallel(FLAGS.test_data_url, FLAGS.test_data_local)
    # Scratch directory.
    FLAGS.tmp = os.path.join(root, 'tmp/')
    if not os.path.exists(FLAGS.tmp):
        os.mkdir(FLAGS.tmp)
    # Dispatch to the requested stage.
    mode = FLAGS.mode
    if mode == 'train':
        from train import train_model
        train_model(FLAGS)
    elif mode == 'save_pb':
        from save_model import load_weights_save_pb
        load_weights_save_pb(FLAGS)
    elif mode == 'eval':
        from eval import eval_model
        eval_model(FLAGS)
def evaluate(ema, dl_eval):
    """Evaluate both the raw and the EMA-averaged model on dl_eval.

    Returns one dict: the raw model's metrics under their plain names and
    the EMA model's metrics under '<name>_ema' keys.
    """
    # Raw (non-averaged) weights -> plain metric keys.
    metric_dict = eval_model(ema.model, dl_eval, cfg.metric)
    # EMA weights -> '_ema'-suffixed keys.
    # BUGFIX: previously the EMA model was evaluated into the plain keys and
    # the raw model into the '_ema' keys — the labels were swapped.
    metric_dict_ema = eval_model(ema.ema_model, dl_eval, cfg.metric)
    metric_dict_ema = {f'{k}_ema': v for k, v in metric_dict_ema.items()}
    metric_dict.update(metric_dict_ema)
    # Release cached GPU memory left behind by the eval passes.
    torch.cuda.empty_cache()
    return metric_dict
def main(argv=None):
    """Resolve *_url flags to local paths (OBS copies disabled) and dispatch on FLAGS.mode."""
    check_args(FLAGS)
    root = FLAGS.local_data_root
    # Training data path.
    if not FLAGS.data_url.startswith('s3://'):
        FLAGS.data_local = FLAGS.data_url
    else:
        FLAGS.data_local = os.path.join(root, 'train_data/')
        if os.path.exists(FLAGS.data_local):
            print('FLAGS.data_local: %s is already exist, skip copy' % FLAGS.data_local)
        else:
            # OBS download intentionally disabled:
            # file.copy_parallel(FLAGS.data_url, FLAGS.data_local)
            pass
    # Model snapshot directory.
    if not FLAGS.train_url.startswith('s3://'):
        FLAGS.train_local = FLAGS.train_url
    else:
        FLAGS.train_local = os.path.join(root, 'model_snapshots/')
        if not os.path.exists(FLAGS.train_local):
            os.mkdir(FLAGS.train_local)
    # Test data path.
    if not FLAGS.test_data_url.startswith('s3://'):
        FLAGS.test_data_local = FLAGS.test_data_url
    else:
        FLAGS.test_data_local = os.path.join(root, 'test_data/')
        if os.path.exists(FLAGS.test_data_local):
            print('FLAGS.test_data_local: %s is already exist, skip copy' % FLAGS.test_data_local)
        else:
            # OBS download intentionally disabled:
            # file.copy_parallel(FLAGS.test_data_url, FLAGS.test_data_local)
            pass
    # Scratch directory.
    FLAGS.tmp = os.path.join(root, 'tmp/')
    if not os.path.exists(FLAGS.tmp):
        os.mkdir(FLAGS.tmp)
    # Dispatch to the requested stage.
    mode = FLAGS.mode
    if mode == 'train':
        from train_eval import train_model
        train_model(FLAGS)
    elif mode == 'save_pb':
        from save_model import load_weights_save_pb
        load_weights_save_pb(FLAGS)
    elif mode == 'eval':
        from eval import eval_model
        eval_model(FLAGS)
def classification_model(raw_data_file, metric_col, categorical_col, target_col, test_perc, hyperopt_iterations, const_params, use_predefined_params, k_fold, tuning_metric):
    """End-to-end pipeline: preprocess, tune + fit CatBoost, evaluate, save.

    The fitted model is written in JSON format to the module-level
    save_model_dir, together with the training pool needed for export.
    """
    # 1) Preprocess the raw file into train/test splits.
    print('preprocess data:')
    data = Preproc(raw_data_file, metric_col, categorical_col, target_col, test_perc)

    # 2) Hyperparameter search (or predefined params) + final fit.
    print('hyperparams tuning and model fitting:')
    model, params = train_best_model(data.X_train, data.y_train, const_params,
                                     hyperopt_iterations, k_fold, tuning_metric,
                                     use_predefined_params)
    print('best params are {}'.format(params), file=sys.stdout)

    # 3) Hold-out evaluation (AUC kept for eval_model's side effects).
    auc = eval_model(data.X_test, data.y_test, model)

    # 4) Persist the model; categorical feature indices are inferred from dtypes.
    categorical_ix = np.where(data.X_train.dtypes == object)[0]
    model.save_model(save_model_dir,
                     format="json",
                     pool=cb.Pool(data.X_train, data.y_train,
                                  cat_features=categorical_ix))
def train():
    """Run args.epochs training epochs over module-level model/data/solver,
    reporting validation MRR after each epoch."""
    for epoch in range(args.epochs):
        model.train()
        print('\n\n-------------------------------------------')
        print('Epoch-{}'.format(epoch))
        print('-------------------------------------------')
        batches = enumerate(data.get_batches('train'))
        if not args.no_tqdm:
            # Wrap in a progress bar; total must be set manually since
            # enumerate() has no length.
            batches = tqdm(batches)
            batches.set_description_str('Training')
            batches.total = len(data.train)
        for _, minibatch in batches:
            c, c_u_m, c_m, r, r_u_m, r_m, y = minibatch
            pred = model(c, c_u_m, c_m, r, r_u_m, r_m)
            loss = criteria(pred, y)
            loss.backward()
            solver.step()
            solver.zero_grad()
        val_mrr = eval_model(model, data, 'valid')
        print('Validation MRR for this epoch:' + str(val_mrr))
def train_model(model: nn.Module, optimizer, loss_func, data_loader: DataLoader, eval_data_loader: DataLoader, eval_tgt_id2word, device: str, train_params: AttributeDict, enc_params: AttributeDict, dec_params: AttributeDict, epoch: int):
    """Run one training epoch, then evaluate on the validation loader.

    Returns (avg_loss, val_loss): the mean per-batch training loss of this
    epoch and the validation loss from eval_model.
    """
    model.train()  # enable training-mode behavior (dropout etc.)
    n_epochs = train_params.n_epochs
    batch_losses = []
    with tqdm(data_loader, total=len(data_loader), desc=f'Epoch {epoch:03d}') as progress:
        for _, batch in enumerate(progress):
            step_loss = train_step(model, device, batch, optimizer, loss_func)
            batch_losses.append(step_loss)
            progress.set_postfix_str(f'loss: {step_loss:05.3f}')
    avg_loss = np.mean(batch_losses)
    print(f'Epochs [{epoch}/{n_epochs}] avg losses: {avg_loss:05.3f}')
    val_loss = eval_model(model, loss_func, eval_data_loader, device, eval_tgt_id2word)
    return avg_loss, val_loss
def main(argv=None):
    """Validate FLAGS, map *_url flags straight to local paths, and dispatch on mode."""
    check_args(FLAGS)
    # No OBS transfer in this variant: the URL flags are used verbatim as
    # local paths.
    FLAGS.data_local = FLAGS.data_url
    FLAGS.train_local = FLAGS.train_url
    FLAGS.test_data_local = FLAGS.test_data_url
    mode = FLAGS.mode
    if mode == 'train':
        from train import train_model
        train_model(FLAGS)
    elif mode == 'save_pb':
        from save_model import load_weights_save_pb
        load_weights_save_pb(FLAGS)
    elif mode == 'eval':
        from eval import eval_model
        eval_model(FLAGS)
def main(_):
    """Train the NMS network on precomputed detections, periodically
    evaluating on the test split and checkpointing after each evaluation."""
    config = expconf.ExperimentConfig(data_dir=FLAGS.data_dir, root_log_dir=FLAGS.log_dir, config_path=FLAGS.config_path)
    learning_rate = config.learning_rate_nms
    softmax_ini_scores = False
    class_of_interest = config.config['general']['class_of_interest']
    if class_of_interest == 'all':
        # Multi-class mode: one output per real class (background column is
        # dropped later), initial scores are softmax-normalized.
        is_one_class = False
        class_ix = 0
        n_classes = TOTAL_NUMBER_OF_CLASSES - 1
        softmax_ini_scores = True
    else:
        # Single-class mode: train only on the configured class.
        is_one_class = True
        class_ix = CLASSES.index(class_of_interest)
        n_classes = 1
    config.save_results()
    logging.info("config : %s" % yaml.dump(config.config))
    logging.info('loading data..')
    logging.info('train..')
    frames_data_train = load_data(
        config.train_data_dir,
        n_bboxes=config.n_bboxes,
        use_short_features=config.use_reduced_fc_features,
        one_class=is_one_class,
        class_id=class_ix)
    train_class_instances = 0
    for fid in frames_data_train.keys():
        train_class_instances += len(frames_data_train[fid]['gt_labels'])
    logging.info("number of gt objects of class %s in train : %d" %
                 (class_of_interest, train_class_instances))
    logging.info('test..')
    frames_data_test = load_data(
        config.test_data_dir,
        n_bboxes=config.n_bboxes,
        use_short_features=config.use_reduced_fc_features,
        one_class=is_one_class,
        class_id=class_ix)
    test_class_instances = 0
    for fid in frames_data_test.keys():
        test_class_instances += len(frames_data_test[fid]['gt_labels'])
    logging.info("number of gt objects of class %s in test : %d" %
                 (class_of_interest, test_class_instances))
    if config.shuffle_train_test:
        frames_data_train, frames_data_test = shuffle_train_test(
            frames_data_train, frames_data_test)
    n_frames_train = len(frames_data_train.keys())
    n_frames_test = len(frames_data_test.keys())
    logging.info("number of bboxes per image : %d" % config.n_bboxes)
    logging.info('building model graph..')
    # Feature dimensionality is read off the first training frame.
    n_dt_features = frames_data_train[0][nms_net.DT_FEATURES].shape[1]
    in_ops = input_ops(n_classes=n_classes, n_dt_features=n_dt_features)
    nnms_model = nms_net.NMSNetwork(
        n_classes=n_classes,
        input_ops=in_ops,
        gt_match_iou_thr=0.5,
        class_ix=class_ix,
        softmax_ini_scores=softmax_ini_scores,
        **config.nms_network_config)
    lr_decay_applied = False  # NOTE(review): assigned but never used below
    with tf.Session() as sess:
        step_id = 0
        sess.run(nnms_model.init_op)
        saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=1.0)
        if not config.start_from_scratch:
            # Resume from the latest checkpoint; the global step is encoded
            # in the checkpoint file name after the '-'.
            ckpt_path = tf.train.latest_checkpoint(config.log_dir)
            if ckpt_path is not None:
                logging.info('model exists, restoring..')
                ckpt_name = ntpath.basename(ckpt_path)
                step_id = int(ckpt_name.split('-')[1])
                saver.restore(sess, ckpt_path)
        summary_writer = tf.summary.FileWriter(config.log_dir, sess.graph)
        # loss_mode = 'nms'
        # nnms_model.switch_loss('nms')
        # logging.info("current loss mode : %s" % loss_mode)
        # train_frame_probs = np.squeeze(generate_frame_probs(frames_data_train), axis=0)
        logging.info('training started..')
        for epoch_id in range(0, config.n_epochs):
            step_times = []
            for fid in shuffle_samples(n_frames_train):
                # if step_id == config.loss_change_step:
                #     learning_rate = config.learning_rate_det
                #     loss_mode = 'detection'
                #     nnms_model.switch_loss('detection')
                #     logging.info('switching loss to actual detection loss..')
                frame_data = frames_data_train[fid]
                if n_classes == 1:
                    dt_probs_ini = frame_data[nms_net.DT_SCORES]
                    gt_labels = frame_data[nms_net.GT_LABELS]
                else:
                    # Drop the background column and shift labels to match.
                    dt_probs_ini = softmax(frame_data[nms_net.DT_SCORES])[:, 1:]
                    gt_labels = frame_data[nms_net.GT_LABELS] - 1
                feed_dict = {
                    nnms_model.dt_coords: frame_data[nms_net.DT_COORDS],
                    nnms_model.dt_features: frame_data[nms_net.DT_FEATURES],
                    nnms_model.dt_probs_ini: dt_probs_ini,
                    nnms_model.gt_coords: frame_data[nms_net.GT_COORDS],
                    nnms_model.gt_labels: gt_labels,
                    nnms_model.keep_prob: config.keep_prob_train
                }
                start_step = timer()
                # Run whichever train op matches the currently active loss.
                if nnms_model.loss_type == 'nms':
                    summary, _ = sess.run([
                        nnms_model.merged_summaries, nnms_model.nms_train_step
                    ], feed_dict=feed_dict)
                else:
                    summary, _ = sess.run([
                        nnms_model.merged_summaries, nnms_model.det_train_step
                    ], feed_dict=feed_dict)
                end_step = timer()
                step_times.append(end_step - start_step)
                summary_writer.add_summary(summary, global_step=step_id)
                summary_writer.flush()
                step_id += 1
                if step_id % config.eval_step == 0:
                    logging.info('step : %d, mean time for step : %s' %
                                 (step_id, str(np.mean(step_times))))
                    # Full (slower) evaluation only every full_eval_step steps.
                    if step_id % config.full_eval_step == 0:
                        full_eval = True
                    else:
                        full_eval = False
                    # logging.info('evaluating on TRAIN..')
                    # train_out_dir = os.path.join(config.log_dir, 'train')
                    # logging.info('full evaluation : %d' % full_eval)
                    # train_loss_opt, train_loss_final = eval.eval_model(sess, nnms_model,
                    #                                                    frames_data_train,
                    #                                                    global_step=step_id,
                    #                                                    n_eval_frames=config.n_eval_frames,
                    #                                                    out_dir=train_out_dir,
                    #                                                    full_eval=full_eval,
                    #                                                    nms_thres=config.nms_thres,
                    #                                                    one_class=is_one_class,
                    #                                                    class_ix=class_ix)
                    # NOTE(review): the TRAIN evaluation above is commented out,
                    # but train_loss_opt / train_loss_final are still used below —
                    # this raises NameError on the first eval step. Either restore
                    # the train evaluation or define placeholder values.
                    write_scalar_summary(train_loss_opt, 'train_loss_opt', summary_writer, step_id)
                    logging.info('evaluating on TEST..')
                    test_out_dir = os.path.join(config.log_dir, 'test')
                    logging.info('full evaluation : %d' % full_eval)
                    test_loss_opt, test_loss_final = eval.eval_model(
                        sess,
                        nnms_model,
                        frames_data_test,
                        global_step=step_id,
                        n_eval_frames=config.n_eval_frames,
                        out_dir=test_out_dir,
                        full_eval=full_eval,
                        nms_thres=config.nms_thres,
                        one_class=is_one_class,
                        class_ix=class_ix)
                    write_scalar_summary(test_loss_opt, 'test_loss_opt', summary_writer, step_id)
                    config.update_results(step_id, train_loss_opt, train_loss_final,
                                          test_loss_opt, test_loss_final,
                                          np.mean(step_times))
                    config.save_results()
                    saver.save(sess, config.model_file, global_step=step_id)
    logging.info('all done.')
    return
# grad_norm=0.5, # grad_clipping=1 ) metrics = trainer.train() # evaluate model after training print("loading best model for evaluation") model = init_model(args, args.model, num_authors, vocab, args.encoder, args.max_vocab_size, date_span, args.ignore_time, args.num_sk) with open(args.snapshot_path + "best.th", 'rb') as f: model.load_state_dict(torch.load(f)) if args.cuda: model.cuda(args.device) print("Evaluation in validation data.") eval_model(model, vocab, val_ds, args.batch_size, args.device if args.cuda else -1) print("Evaluation in testing data.") eval_model(model, vocab, test_ds, args.batch_size, args.device if args.cuda else -1) elif args.test: # test the single model # Evaluation print("Evaluation ...") if not args.snapshot: print("No snapshot is provided!") exit(0) # auto-fill snapshot path args.snapshot = complete_snapshot(args.snapshot_path, args.snapshot)
def run_training(data_type="screw", model_dir="models", epochs=256, pretrained=True, test_epochs=10, freeze_resnet=20, learninig_rate=0.03, optim_name="SGD", batch_size=64, head_layer=8):
    """Train a CutPaste ProjectionNet on one MVTec-AD category.

    Args:
        data_type: MVTec category folder under "Data".
        model_dir: directory the final state_dict is written to.
        epochs: number of "epochs", each defined as 256 update steps (per paper).
        pretrained: start from pretrained backbone weights.
        test_epochs: run AUC evaluation every this many epochs (<= 0 disables).
        freeze_resnet: unfreeze the backbone once this epoch is reached.
        learninig_rate: optimizer learning rate (misspelled name kept for
            backward compatibility with existing callers).
        optim_name: "SGD" or "Adam" (case-insensitive).
        batch_size: training batch size.
        head_layer: number of 512-wide projection-head layers.
    """
    torch.multiprocessing.freeze_support()
    # TODO: use script params for hyperparameter
    # Temperature Hyperparameter currently not used
    temperature = 0.2
    device = "cuda"
    weight_decay = 0.00003
    momentum = 0.9
    #TODO: use f strings also for the date LOL
    model_name = f"model-{data_type}" + '-{date:%Y-%m-%d_%H_%M_%S}'.format(
        date=datetime.datetime.now())

    #augmentation:
    size = 256
    min_scale = 0.5

    # create Training Dataset and Dataloader
    after_cutpaste_transform = transforms.Compose([])
    after_cutpaste_transform.transforms.append(transforms.ToTensor())
    after_cutpaste_transform.transforms.append(
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]))
    train_transform = transforms.Compose([])
    # train_transform.transforms.append(transforms.RandomResizedCrop(size, scale=(min_scale,1)))
    # train_transform.transforms.append(transforms.GaussianBlur(int(size/10), sigma=(0.1,2.0)))
    train_transform.transforms.append(transforms.Resize((256, 256)))
    train_transform.transforms.append(
        CutPaste(transform=after_cutpaste_transform))
    train_data = MVTecAT("Data", data_type, transform=train_transform,
                         size=int(size * (1 / min_scale)))
    dataloader = DataLoader(train_data,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=8,
                            collate_fn=cut_paste_collate_fn,
                            persistent_workers=True,
                            pin_memory=True,
                            prefetch_factor=5)

    # TensorBoard writer
    writer = SummaryWriter(Path("logdirs") / model_name)

    # create Model:
    head_layers = [512] * head_layer + [128]
    print(head_layers)
    model = ProjectionNet(pretrained=pretrained, head_layers=head_layers)
    model.to(device)
    if freeze_resnet > 0:
        model.freeze_resnet()

    loss_fn = torch.nn.CrossEntropyLoss()
    # BUGFIX: the comparison used to be case-sensitive ("sgd"/"adam"), so the
    # default optim_name="SGD" matched neither branch and `optimizer` was used
    # while undefined (NameError). Compare case-insensitively and fail fast on
    # unknown names instead of printing and falling through.
    if optim_name.lower() == "sgd":
        optimizer = optim.SGD(model.parameters(),
                              lr=learninig_rate,
                              momentum=momentum,
                              weight_decay=weight_decay)
        scheduler = CosineAnnealingWarmRestarts(optimizer, epochs)
    elif optim_name.lower() == "adam":
        optimizer = optim.Adam(model.parameters(),
                               lr=learninig_rate,
                               weight_decay=weight_decay)
        scheduler = None
    else:
        raise ValueError(f"ERROR unkown optimizer: {optim_name}")

    step = 0
    num_batches = len(dataloader)

    def get_data_inf():
        # Cycle over the dataloader forever, yielding (batch_idx, batch).
        while True:
            for out in enumerate(dataloader):
                yield out

    dataloader_inf = get_data_inf()
    # From paper: "Note that, unlike conventional definition for an epoch,
    # we define 256 parameter update steps as one epoch.
    for step in tqdm(range(epochs * 256)):
        epoch = int(step / 256)
        if epoch == freeze_resnet:
            model.unfreeze()
        batch_embeds = []
        batch_idx, data = next(dataloader_inf)
        x1, x2 = data
        x1 = x1.to(device)
        x2 = x2.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # One forward pass over originals and cut-pasted copies together.
        xc = torch.cat((x1, x2), axis=0)
        embeds, logits = model(xc)

        # Labels: 0 for originals, 1 for the augmented halves.
        y = torch.tensor([0, 1], device=device)
        y = y.repeat_interleave(x1.size(0))
        loss = loss_fn(logits, y)

        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step(epoch + batch_idx / num_batches)

        writer.add_scalar('loss', loss.item(), step)
        predicted = torch.argmax(logits, axis=1)
        accuracy = torch.true_divide(torch.sum(predicted == y),
                                     predicted.size(0))
        writer.add_scalar('acc', accuracy, step)
        if scheduler is not None:
            writer.add_scalar('lr', scheduler.get_last_lr()[0], step)

        # save embed for validation:
        if test_epochs > 0 and epoch % test_epochs == 0:
            batch_embeds.append(embeds.cpu().detach())
        writer.add_scalar('epoch', epoch, step)

        # run tests
        if test_epochs > 0 and epoch % test_epochs == 0:
            # run auc calculation
            #TODO: create dataset only once.
            #TODO: train predictor here or in the model class itself. Should not be in the eval part
            model.eval()
            roc_auc = eval_model(model_name,
                                 data_type,
                                 device=device,
                                 save_plots=False,
                                 size=size,
                                 show_training_data=False,
                                 model=model)
            model.train()
            writer.add_scalar('eval_auc', roc_auc, step)

    # BUGFIX: model_dir is a plain string, so `model_dir / f"..."` raised
    # TypeError (str has no '/'). Convert to Path and ensure the directory
    # exists before saving.
    out_dir = Path(model_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    torch.save(model.state_dict(), out_dir / f"{model_name}.tch")
def train_eval_model(model, criterion, optimizer, dataloader, num_epochs, resume=False, start_epoch=0):
    """Train a graph-matching model, evaluating on the test split each epoch.

    dataloader is a dict with "train" and "test" loaders. Returns
    (model, acc_dict) where acc_dict holds per-class accuracy/F1 plus the
    mean matching accuracy and F1 of the last evaluation. With
    cfg.evaluate_only set, skips training and only evaluates.
    """
    print("Start training...")
    since = time.time()
    dataloader["train"].dataset.set_num_graphs(
        cfg.TRAIN.num_graphs_in_matching_instance)
    dataset_size = len(dataloader["train"].dataset)

    # Run on whatever device the model's parameters already live on.
    device = next(model.parameters()).device
    print("model on device: {}".format(device))

    checkpoint_path = Path(cfg.model_dir) / "params"
    if not checkpoint_path.exists():
        checkpoint_path.mkdir(parents=True)

    if resume:
        # Warm-start both model weights and optimizer state.
        params_path = os.path.join(cfg.warmstart_path, f"params.pt")
        print("Loading model parameters from {}".format(params_path))
        model.load_state_dict(torch.load(params_path))
        optim_path = os.path.join(cfg.warmstart_path, f"optim.pt")
        print("Loading optimizer state from {}".format(optim_path))
        optimizer.load_state_dict(torch.load(optim_path))

    # Evaluation only
    if cfg.evaluate_only:
        # assert resume
        print(f"Evaluating without training...")
        accs, f1_scores = eval_model(model, dataloader["test"])
        acc_dict = {
            "acc_{}".format(cls): single_acc
            for cls, single_acc in zip(dataloader["train"].dataset.classes, accs)
        }
        f1_dict = {
            "f1_{}".format(cls): single_f1_score
            for cls, single_f1_score in zip(
                dataloader["train"].dataset.classes, f1_scores)
        }
        acc_dict.update(f1_dict)
        acc_dict["matching_accuracy"] = torch.mean(accs)
        acc_dict["f1_score"] = torch.mean(f1_scores)
        time_elapsed = time.time() - since
        print("Evaluation complete in {:.0f}h {:.0f}m {:.0f}s".format(
            time_elapsed // 3600, (time_elapsed // 60) % 60, time_elapsed % 60))
        return model, acc_dict

    _, lr_milestones, lr_decay = lr_schedules[cfg.TRAIN.lr_schedule]
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                               milestones=lr_milestones,
                                               gamma=lr_decay)

    # TensorBoard writer is only created when a log dir is configured; all
    # writer uses below are guarded by the same condition.
    if cfg.log_dir:
        os.makedirs(cfg.log_dir, exist_ok=True)
        writer = SummaryWriter(cfg.log_dir)

    for epoch in range(start_epoch, num_epochs):
        print("Epoch {}/{}".format(epoch, num_epochs - 1))
        print("-" * 10)
        model.train()  # Set model to training mode
        print("lr = " + ", ".join(
            ["{:.2e}".format(x["lr"]) for x in optimizer.param_groups]))
        epoch_loss = 0.0
        running_loss = 0.0
        running_acc = 0.0
        epoch_acc = 0.0
        running_f1 = 0.0
        epoch_f1 = 0.0
        running_since = time.time()
        iter_num = 0
        # Iterate over data.
        for inputs in dataloader["train"]:
            data_list = [_.cuda() for _ in inputs["images"]]
            points_gt_list = [_.cuda() for _ in inputs["Ps"]]
            n_points_gt_list = [_.cuda() for _ in inputs["ns"]]
            edges_list = [_.to("cuda") for _ in inputs["edges"]]
            perm_mat_list = [
                perm_mat.cuda() for perm_mat in inputs["gt_perm_mat"]
            ]
            iter_num = iter_num + 1
            # zero the parameter gradients
            optimizer.zero_grad()
            with torch.set_grad_enabled(True):
                # forward
                s_pred_list = model(data_list, points_gt_list, edges_list,
                                    n_points_gt_list, perm_mat_list)
                # Mean matching loss over all graph pairs in the instance.
                loss = sum([
                    criterion(s_pred, perm_mat)
                    for s_pred, perm_mat in zip(s_pred_list, perm_mat_list)
                ])
                loss /= len(s_pred_list)
                # backward + optimize
                loss.backward()
                optimizer.step()
                tp, fp, fn = get_pos_neg_from_lists(s_pred_list, perm_mat_list)
                f1 = f1_score(tp, fp, fn)
                acc, _, __ = matching_accuracy_from_lists(
                    s_pred_list, perm_mat_list)
            # statistics
            bs = perm_mat_list[0].size(0)
            running_loss += loss.item() * bs  # multiply with batch size
            epoch_loss += loss.item() * bs
            running_acc += acc.item() * bs
            epoch_acc += acc.item() * bs
            running_f1 += f1.item() * bs
            epoch_f1 += f1.item() * bs
            # Periodic console statistics; running counters reset afterwards.
            if iter_num % cfg.STATISTIC_STEP == 0:
                running_speed = cfg.STATISTIC_STEP * bs / (time.time() - running_since)
                loss_avg = running_loss / cfg.STATISTIC_STEP / bs
                acc_avg = running_acc / cfg.STATISTIC_STEP / bs
                f1_avg = running_f1 / cfg.STATISTIC_STEP / bs
                print(
                    "Epoch {:<4} Iter {:<4} {:>4.2f}sample/s Loss={:<8.4f} Accuracy={:<2.3} F1={:<2.3}"
                    .format(epoch, iter_num, running_speed, loss_avg, acc_avg,
                            f1_avg))
                running_acc = 0.0
                running_f1 = 0.0
                running_loss = 0.0
                running_since = time.time()
        epoch_loss = epoch_loss / dataset_size
        epoch_acc = epoch_acc / dataset_size
        epoch_f1 = epoch_f1 / dataset_size
        # Checkpoint model + optimizer into a per-epoch subdirectory.
        if cfg.save_checkpoint:
            base_path = Path(checkpoint_path / "{:04}".format(epoch + 1))
            Path(base_path).mkdir(parents=True, exist_ok=True)
            path = str(base_path / "params.pt")
            torch.save(model.state_dict(), path)
            torch.save(optimizer.state_dict(), str(base_path / "optim.pt"))
        print(
            "Over whole epoch {:<4} -------- Loss: {:.4f} Accuracy: {:.3f} F1: {:.3f}"
            .format(epoch, epoch_loss, epoch_acc, epoch_f1))
        print()
        # Eval in each epoch
        accs, f1_scores = eval_model(model, dataloader["test"])
        acc_dict = {
            "acc_{}".format(cls): single_acc
            for cls, single_acc in zip(dataloader["train"].dataset.classes, accs)
        }
        f1_dict = {
            "f1_{}".format(cls): single_f1_score
            for cls, single_f1_score in zip(
                dataloader["train"].dataset.classes, f1_scores)
        }
        acc_dict.update(f1_dict)
        val_acc = torch.mean(accs)
        val_f1 = torch.mean(f1_scores)
        acc_dict["matching_accuracy"] = val_acc
        acc_dict["f1_score"] = val_f1
        # Tensorboard
        if cfg.log_dir:
            writer.add_scalar('Loss/train', epoch_loss, epoch)
            writer.add_scalar('Acc/train', epoch_acc, epoch)
            writer.add_scalar('F1/train', epoch_f1, epoch)
            writer.add_scalar('Acc/val', val_acc, epoch)
            writer.add_scalar('F1/val', val_f1, epoch)
            lr = optimizer.param_groups[0]["lr"]
            writer.add_scalar('lr', lr, epoch)
        scheduler.step()
    # Close TensorBoard writer
    if cfg.log_dir:
        writer.close()
    time_elapsed = time.time() - since
    print("Training complete in {:.0f}h {:.0f}m {:.0f}s".format(
        time_elapsed // 3600, (time_elapsed // 60) % 60, time_elapsed % 60))
    return model, acc_dict
def train_model(log_dict, data, model, loss_fn, optimizer, lr_scheduler, writer, save_home):
    """Train for log_dict.param.nepoch epochs with validation-based checkpointing.

    data is a (train_iter, valid_iter, test_iter) tuple. The best model by
    validation accuracy is checkpointed under save_home and its metrics are
    recorded in log_dict and save_home/log.json. Early-stops after
    log_dict.param.patience epochs without improvement.
    """
    best_acc1 = 0
    patience_flag = 0  # epochs elapsed since the last validation improvement
    train_iter, valid_iter, test_iter = data[0], data[1], data[2]  # data is a tuple of three iterators
    # print("Start Training")
    for epoch in range(0, log_dict.param.nepoch):
        ## train and validation
        train_loss, train_acc = train_epoch(model, train_iter, epoch, loss_fn, optimizer, log_dict)
        val_loss, val_acc, val_f1_score, val_w_f1_score, val_top3_acc = eval_model(model, valid_iter, loss_fn, log_dict)
        print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%')
        ## testing
        test_loss, test_acc, test_f1_score, test_w_f1_score, test_top3_acc = eval_model(model, test_iter, loss_fn, log_dict)
        print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}% Test F1 score: {test_f1_score:.4f}')
        ## save best model
        is_best = val_acc > best_acc1
        os.makedirs(save_home, exist_ok=True)
        save_checkpoint({'epoch': epoch + 1, 'arch': log_dict.param.arch_name, 'state_dict': model.state_dict(), 'train_acc': train_acc, "val_acc": val_acc, 'param': dict(log_dict.param), 'optimizer': optimizer.state_dict()}, is_best, save_home + "/model_best.pth.tar")
        best_acc1 = max(val_acc, best_acc1)
        # Step the LR scheduler only when a step size was configured.
        if log_dict.param.step_size != None:
            lr_scheduler.step()
        ## tensorboard runs
        writer.add_scalar('Loss/train', train_loss, epoch)
        writer.add_scalar('Accuracy/train', train_acc, epoch)
        writer.add_scalar('Loss/val', val_loss, epoch)
        writer.add_scalar('Accuracy/val', val_acc, epoch)
        ## save logs
        if is_best:
            patience_flag = 0
            # Snapshot every metric of the new best epoch into log_dict.
            log_dict.train_acc = train_acc
            log_dict.test_acc = test_acc
            log_dict.valid_acc = val_acc
            log_dict.test_f1_score = test_f1_score
            log_dict.valid_f1_score = val_f1_score
            log_dict.valid_top3_acc = val_top3_acc
            log_dict.test_top3_acc = test_top3_acc
            log_dict.train_loss = train_loss
            log_dict.test_loss = test_loss
            log_dict.valid_loss = val_loss
            log_dict.epoch = epoch + 1
            log_dict.weighted_test_f1_score = test_w_f1_score
            log_dict.weighted_valid_f1_score = val_w_f1_score
            with open(save_home + "/log.json", 'w') as fp:
                json.dump(dict(log_dict), fp, indent=4)
                fp.close()
        else:
            patience_flag += 1
        ## early stopping
        if patience_flag == log_dict.param.patience or epoch == log_dict.param.nepoch - 1:
            print(log_dict)
            break
# Entry point: train a GAN-based QDMR decomposer on the Break dataset and
# evaluate it on the dev split.
from train import train_gan
from eval import eval_model
from models import *
from config import *

if __name__ == '__main__':
    # Train on the QDMR training split, then persist the seq2seq weights.
    model = train_gan('/content/Break-dataset/QDMR/train.csv')
    model.save_internal(seq2seq_path='model.dat')
    # Evaluate the saved checkpoint against the dev split; gold/predicted
    # decompositions are written to orig.csv / pred.csv. RobertaDecomposer
    # and SEQ_LENGTH come from the wildcard imports above (models / config).
    eval_model([('model.dat', RobertaDecomposer, SEQ_LENGTH)],
               '/content/Break-dataset/QDMR/dev.csv',
               orig_filenames=["orig.csv"],
               pred_filenames=["pred.csv"])
def main(_):
    """Train the single-class NMS network on KITTI-style per-frame label and
    detection files, evaluating and checkpointing every config.eval_step
    steps and once more after the last epoch."""
    config = expconf.ExperimentConfig(data_dir=FLAGS.data_dir, root_log_dir=FLAGS.root_log_dir, config_path=FLAGS.config_path)
    logging.info("config info : %s" % config.config)
    labels_dir = os.path.join(FLAGS.data_dir, 'label_2')
    detections_dir = os.path.join(FLAGS.data_dir, 'detection_2')
    # Frame ids are the numeric stems of the label file names.
    frames_ids = np.asarray([
        int(ntpath.basename(path).split('.')[0])
        for path in os.listdir(labels_dir)
    ])
    n_frames = len(frames_ids)
    n_bboxes_test = config.n_bboxes
    n_classes = 1
    class_name = config.general_params.get('class_of_interest', 'Car')
    half = n_frames / 2  # NOTE(review): unused leftover of the random split below
    learning_rate = config.learning_rate_det
    # shuffled_samples = shuffle_samples(n_frames)
    # train_frames = frames_ids[shuffled_samples[0:half]]
    # test_frames = frames_ids[shuffled_samples[half:]]
    # Fixed train/val split read from files instead of random shuffling.
    train_frames_path = os.path.join(FLAGS.data_dir, 'train.txt')
    train_frames = np.loadtxt(train_frames_path, dtype=int)
    test_frames_path = os.path.join(FLAGS.data_dir, 'val.txt')
    test_frames = np.loadtxt(test_frames_path, dtype=int)
    train_out_dir = os.path.join(config.log_dir, 'train')
    test_out_dir = os.path.join(config.log_dir, 'test')
    n_train_samples = len(train_frames)
    n_test_samples = len(test_frames)
    logging.info('building model graph..')
    in_ops = input_ops(config.n_dt_features, n_classes)
    nnms_model = nms_net.NMSNetwork(n_classes=1,
                                    input_ops=in_ops,
                                    class_ix=0,
                                    **config.nms_network_config)
    saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=1.0)
    config.save_results()
    logging.info('training started..')
    with tf.Session() as sess:
        sess.run(nnms_model.init_op)
        step_id = 0
        step_times = []
        data_times = []
        # loss_mode = 'nms'
        # nnms_model.switch_loss('nms')
        # logging.info("current loss mode : %s" % loss_mode)
        summary_writer = tf.summary.FileWriter(config.log_dir, sess.graph)
        for epoch_id in range(0, config.n_epochs):
            # Re-shuffle the frame visiting order every epoch.
            epoch_frames = train_frames[shuffle_samples(n_train_samples)]
            for fid in epoch_frames:
                # if step_id == config.loss_change_step:
                #     learning_rate = config.learning_rate_det
                #     loss_mode = 'detection'
                #     nnms_model.switch_loss('detection')
                #     logging.info('switching loss to actual detection loss..')
                #     logging.info('learning rate to %f' % learning_rate)
                start_step = timer()
                frame_data = get_frame_data_fixed(
                    frame_id=fid,
                    labels_dir=labels_dir,
                    detections_dir=detections_dir,
                    n_detections=config.n_bboxes,
                    class_name=class_name,
                    n_features=config.n_dt_features)
                data_step = timer()  # time spent loading this frame's data
                feed_dict = {
                    nnms_model.dt_coords: frame_data['dt_coords'],
                    nnms_model.dt_features: frame_data['dt_features'],
                    nnms_model.dt_probs_ini: frame_data['dt_probs'],
                    nnms_model.gt_coords: frame_data['gt_coords'],
                    nnms_model.gt_labels: frame_data['gt_labels'],
                    nnms_model.keep_prob: config.keep_prob_train
                }
                # Run whichever train op matches the currently active loss.
                if nnms_model.loss_type == 'nms':
                    summary, _ = sess.run([
                        nnms_model.merged_summaries, nnms_model.nms_train_step
                    ], feed_dict=feed_dict)
                else:
                    summary, _ = sess.run([
                        nnms_model.merged_summaries, nnms_model.det_train_step
                    ], feed_dict=feed_dict)
                step_id += 1
                summary_writer.add_summary(summary, global_step=step_id)
                summary_writer.flush()
                end_step = timer()
                step_times.append(end_step - start_step)
                data_times.append(data_step - start_step)
                if step_id % config.eval_step == 0:
                    logging.info("learning rate %s" %
                                 str(nnms_model.learning_rate_det.eval()))
                    logging.info(
                        'curr step : %d, mean time for step : %s, for getting data : %s'
                        % (step_id, str(np.mean(step_times)), str(np.mean(data_times))))
                    logging.info("eval on TRAIN..")
                    train_loss_opt, train_loss_fin = eval.eval_model(
                        sess, nnms_model,
                        detections_dir=detections_dir,
                        labels_dir=labels_dir,
                        eval_frames=train_frames,
                        n_bboxes=config.n_bboxes,
                        n_features=config.n_dt_features,
                        global_step=step_id,
                        out_dir=train_out_dir,
                        nms_thres=config.nms_thres,
                        class_name=class_name)
                    logging.info("eval on TEST..")
                    test_loss_opt, test_loss_fin = eval.eval_model(
                        sess, nnms_model,
                        detections_dir=detections_dir,
                        labels_dir=labels_dir,
                        eval_frames=test_frames,
                        n_bboxes=config.n_bboxes,
                        n_features=config.n_dt_features,
                        global_step=step_id,
                        out_dir=test_out_dir,
                        nms_thres=config.nms_thres,
                        class_name=class_name)
                    config.update_results(step_id, train_loss_opt, train_loss_fin,
                                          test_loss_opt, test_loss_fin,
                                          np.mean(step_times))
                    config.save_results()
                    saver.save(sess, config.model_file, global_step=step_id)
        # Final evaluation + checkpoint after the last epoch.
        train_loss_opt, train_loss_fin = eval.eval_model(
            sess, nnms_model,
            detections_dir=detections_dir,
            labels_dir=labels_dir,
            eval_frames=train_frames,
            n_bboxes=config.n_bboxes,
            n_features=config.n_dt_features,
            global_step=step_id,
            out_dir=train_out_dir,
            nms_thres=config.nms_thres,
            class_name=class_name)
        test_loss_opt, test_loss_fin = eval.eval_model(
            sess, nnms_model,
            detections_dir=detections_dir,
            labels_dir=labels_dir,
            eval_frames=test_frames,
            n_bboxes=config.n_bboxes,
            n_features=config.n_dt_features,
            global_step=step_id,
            out_dir=test_out_dir,
            nms_thres=config.nms_thres,
            class_name=class_name)
        config.update_results(step_id, train_loss_opt, train_loss_fin,
                              test_loss_opt, test_loss_fin,
                              np.mean(step_times))
        config.save_results()
        saver.save(sess, config.model_file, global_step=step_id)
    return
def run_model(config, device):
    """Build, train, and evaluate the configured classifier.

    Trains on the 'train' split, then evaluates on train/dev/test and
    returns the per-split inputs, labels, predictions, and losses.

    Args:
        config: project configuration object (programsettings / hyperparams /
            modelconfig sections — schema defined elsewhere in the project).
        device: torch device the model should run on.

    Returns:
        12-tuple: (train_inputs, train_labels, train_preds, train_loss,
        dev_inputs, dev_labels, dev_loss, dev_preds,
        test_inputs, test_preds, test_labels, test_loss).
        NOTE: the dev group's (loss, preds) order differs from the train/test
        groups; preserved as-is for backward compatibility with callers.

    Raises:
        ValueError: if MODEL_NAME names no known model implementation.
    """
    model_config1 = model_config(config.modelconfig)

    # Load data and create features. If calculated features are available in
    # the cache, they are reused.
    dataprocessor = MultiClassificationProcessor()
    train_dataloader, data_len, num_labels, num_train_optimization_steps, all_label_ids = \
        dataprocessor.get_data_loader(config)

    # Select the model implementation named in the configuration.
    # BUG FIX: this was an if / if / elif chain, so an unrecognized MODEL_NAME
    # left `model` unbound and crashed later with a NameError. Use a single
    # chain and fail fast with a clear error instead.
    model_name = config.programsettings.MODEL_NAME
    if model_name == "BioBERT_fc":
        model = Biobert_fc(device, model_config1)
    elif model_name == "BioBERT_CNN_fc":
        model = Biobert_cnn_fc(device, model_config1)
    elif model_name == "BERT_Sequence":
        model = BertForSequenceClassification.from_pretrained(
            config.programsettings.BERT_MODEL,
            cache_dir=config.programsettings.CACHE_DIR,
            num_labels=num_labels)
    else:
        raise ValueError('Unknown MODEL_NAME: {}'.format(model_name))

    # Freeze BERT layers if configured. Child 0 is skipped (count > 0 in the
    # original); children 1 .. NUM_BERT_LAYERS_FREEZE-1 are frozen.
    if config.hyperparams.NUM_BERT_LAYERS_FREEZE >= 0:
        for count, child in enumerate(model.children()):
            if 0 < count < config.hyperparams.NUM_BERT_LAYERS_FREEZE:
                for param in child.parameters():
                    param.requires_grad = False

    # Optimizer + linear warmup/decay schedule over the full training run.
    optimizer = optim.AdamW(model.parameters(),
                            lr=config.hyperparams.LEARNING_RATE)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config.hyperparams.NUM_WARMP_STEPS,
        num_training_steps=num_train_optimization_steps)

    # Run the training loop.
    model = train_model(config, model, optimizer, scheduler, train_dataloader,
                        num_labels, data_len,
                        device=device,
                        model_save_path=config.programsettings.OUTPUT_DIR,
                        model_name=config.programsettings.MODEL_NAME,
                        num_epochs=config.hyperparams.NUM_TRAIN_EPOCHS)

    # Evaluate on training data.
    train_inputs, train_preds, train_labels, train_loss = eval_model(
        config, model, train_dataloader, device, num_labels)

    # Evaluate on dev data.
    dev_dataloader, dev_data_len, dev_num_labels, dev_num_train_optimization_steps, all_dev_label_ids = \
        dataprocessor.get_data_loader(config, source='dev')
    dev_inputs, dev_preds, dev_labels, dev_loss = eval_model(
        config, model, dev_dataloader, device, num_labels)

    # Evaluate on test data (fresh processor, as in the original).
    dataprocessor = MultiClassificationProcessor()
    test_dataloader, test_data_len, test_num_labels, test_num_train_optimization_steps, all_test_label_ids = \
        dataprocessor.get_data_loader(config, source='test')
    test_inputs, test_preds, test_labels, test_loss = eval_model(
        config, model, test_dataloader, device, num_labels)

    return train_inputs, train_labels, train_preds, train_loss, \
           dev_inputs, dev_labels, dev_loss, dev_preds, \
           test_inputs, test_preds, test_labels, test_loss
def train_eval_model(model,
                     permLoss,
                     optimizer,
                     dataloader,
                     num_epochs=25,
                     resume=False,
                     start_epoch=0,
                     viz=None,
                     savefiletime='time'):
    """Train a point-matching model and evaluate it on val/test every epoch.

    Args:
        model: the matching network; must already hold its parameters on the
            target device (the device is read from the first parameter).
        permLoss: permutation loss callable (s_pred, perm_mat, n1, n2).
        optimizer: torch optimizer wrapping the model parameters.
        dataloader: dict with 'train', 'val', and 'test' DataLoaders.
        num_epochs: last epoch index (exclusive).
        resume: if True, reload model/optimizer state from the checkpoint of
            `start_epoch` (which must then be non-zero).
        start_epoch: epoch to start (or resume) from.
        viz: optional visualizer with an `update(tag, epoch, dict)` method.
        savefiletime: suffix used when dumping per-epoch training metrics.

    Returns:
        The trained model (trained in place).
    """
    print('**************************************')
    print('Start training...')
    dataset_size = len(dataloader['train'].dataset)
    print('train datasize: {}'.format(dataset_size))
    since = time.time()
    # Hungarian assignment is used to discretize soft matchings for accuracy.
    lap_solver = hungarian
    optimal_acc = 0.0
    optimal_rot = np.inf
    device = next(model.parameters()).device
    print('model on device: {}'.format(device))

    # Checkpoints go under <OUTPUT_PATH>/params, created on first use.
    checkpoint_path = Path(cfg.OUTPUT_PATH) / 'params'
    if not checkpoint_path.exists():
        checkpoint_path.mkdir(parents=True)

    if resume:
        # Resuming from epoch 0 makes no sense — there is no checkpoint yet.
        assert start_epoch != 0
        model_path = str(checkpoint_path /
                         'params_{:04}.pt'.format(start_epoch))
        print('Loading model parameters from {}'.format(model_path))
        load_model(model, model_path)
        optim_path = str(checkpoint_path /
                         'optim_{:04}.pt'.format(start_epoch))
        print('Loading optimizer state from {}'.format(optim_path))
        optimizer.load_state_dict(torch.load(optim_path))

    # last_epoch keeps the LR schedule aligned when resuming mid-run.
    scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer,
        milestones=cfg.TRAIN.LR_STEP,
        gamma=cfg.TRAIN.LR_DECAY,
        last_epoch=cfg.TRAIN.START_EPOCH - 1)

    for epoch in range(start_epoch, num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        model.train()  # Set model to training mode
        print('lr = ' + ', '.join(
            ['{:.2e}'.format(x['lr']) for x in optimizer.param_groups]))

        iter_num = 0
        running_since = time.time()
        # Per-epoch metric accumulator: metric name -> list of per-batch arrays.
        all_train_metrics_np = defaultdict(list)

        # Iterate over the training data.
        for inputs in dataloader['train']:
            P1_gt, P2_gt = [_.cuda() for _ in inputs['Ps']]  # keypoint coordinates
            n1_gt, n2_gt = [_.cuda() for _ in inputs['ns']]  # keypoint counts
            A1_gt, A2_gt = [_.cuda() for _ in inputs['As']]  # edge connectivity matrices
            perm_mat = inputs['gt_perm_mat'].cuda()  # ground-truth permutation matrix
            T1_gt, T2_gt = [_.cuda() for _ in inputs['Ts']]  # rigid transforms (rotation+translation)
            Inlier_src_gt, Inlier_ref_gt = [_.cuda() for _ in inputs['Ins']]

            batch_cur_size = perm_mat.size(0)
            iter_num = iter_num + 1

            # zero the parameter gradients
            optimizer.zero_grad()

            with torch.set_grad_enabled(True):
                # forward
                s_pred, Inlier_src_pre, Inlier_ref_pre = model(
                    P1_gt, P2_gt, A1_gt, A2_gt, n1_gt, n2_gt)

                # multi_loss = []
                if cfg.DATASET.NOISE_TYPE == 'clean':
                    permloss = permLoss(s_pred, perm_mat, n1_gt, n2_gt)
                    loss = permloss
                else:
                    # With noisy data, optionally gate the soft matching by the
                    # predicted inlier scores before computing the loss.
                    if cfg.PGM.USEINLIERRATE:
                        s_pred = Inlier_src_pre * s_pred * Inlier_ref_pre.transpose(
                            2, 1).contiguous()
                    permloss = permLoss(s_pred, perm_mat, n1_gt, n2_gt)
                    loss = permloss

                # backward + optimize
                loss.backward()
                optimizer.step()

                # training accuracy statistics on the discretized matching
                s_perm_mat = lap_solver(s_pred, n1_gt, n2_gt, Inlier_src_pre,
                                        Inlier_ref_pre)
                match_metrics = matching_accuracy(s_perm_mat, perm_mat, n1_gt)
                # First 3 columns of P are xyz; T holds [R | t] in its top rows.
                perform_metrics = compute_metrics(s_perm_mat, P1_gt[:, :, :3],
                                                  P2_gt[:, :, :3],
                                                  T1_gt[:, :3, :3],
                                                  T1_gt[:, :3, 3])

                for k in match_metrics:
                    all_train_metrics_np[k].append(match_metrics[k])
                for k in perform_metrics:
                    all_train_metrics_np[k].append(perform_metrics[k])
                # np.repeat keeps the loss array length consistent with the
                # per-sample metric arrays — presumably batch size 4; verify.
                all_train_metrics_np['loss'].append(np.repeat(loss.item(), 4))

                if iter_num % cfg.STATISTIC_STEP == 0:
                    running_speed = cfg.STATISTIC_STEP * batch_cur_size / (
                        time.time() - running_since)
                    # globalstep = epoch * dataset_size + iter_num * batch_cur_size
                    # Report means over the last STATISTIC_STEP batches only.
                    print(
                        'Epoch {:<4} Iteration {:<4} {:>4.2f}sample/s Loss={:<8.4f} GT-Acc:{:.4f} Pred-Acc:{:.4f}'
                        .format(
                            epoch, iter_num, running_speed,
                            np.mean(
                                np.concatenate(all_train_metrics_np['loss'])
                                [-cfg.STATISTIC_STEP * batch_cur_size:]),
                            np.mean(
                                np.concatenate(all_train_metrics_np['acc_gt'])
                                [-cfg.STATISTIC_STEP * batch_cur_size:]),
                            np.mean(
                                np.concatenate(
                                    all_train_metrics_np['acc_pred'])
                                [-cfg.STATISTIC_STEP * batch_cur_size:])))
                    running_since = time.time()

        # Flatten each metric's per-batch arrays into one array per metric.
        all_train_metrics_np = {
            k: np.concatenate(all_train_metrics_np[k])
            for k in all_train_metrics_np
        }
        summary_metrics = summarize_metrics(all_train_metrics_np)
        print('Epoch {:<4} Mean-Loss: {:.4f} GT-Acc:{:.4f} Pred-Acc:{:.4f}'.
              format(epoch, summary_metrics['loss'], summary_metrics['acc_gt'],
                     summary_metrics['acc_pred']))
        print_metrics(summary_metrics)

        # Checkpoint model and optimizer for epoch+1 resume.
        save_model(model,
                   str(checkpoint_path / 'params_{:04}.pt'.format(epoch + 1)))
        torch.save(optimizer.state_dict(),
                   str(checkpoint_path / 'optim_{:04}.pt'.format(epoch + 1)))

        # Optionally dump raw per-sample training metrics (disabled by default).
        metric_is_save = False
        if metric_is_save:
            np.save(
                str(
                    Path(cfg.OUTPUT_PATH) /
                    ('train_log_' + savefiletime + '_metric')),
                all_train_metrics_np)

        if viz is not None:
            viz.update('train_loss', epoch, {'loss': summary_metrics['loss']})
            viz.update('train_acc', epoch, {'acc': summary_metrics['acc_gt']})
            viz.update(
                'train_metric', epoch, {
                    'r_mae': summary_metrics['r_mae'],
                    't_mae': summary_metrics['t_mae']
                })

        # Evaluate on the validation split each epoch.
        val_metrics = eval_model(model, dataloader['val'])
        if viz is not None:
            viz.update('val_acc', epoch, {'acc': val_metrics['acc_gt']})
            viz.update('val_metric', epoch, {
                'r_mae': val_metrics['r_mae'],
                't_mae': val_metrics['t_mae']
            })
        # Track (but only report) the best-accuracy and best-rotation epochs.
        if optimal_acc < val_metrics['acc_gt']:
            optimal_acc = val_metrics['acc_gt']
            print('Current best acc model is {}'.format(epoch + 1))
        if optimal_rot > val_metrics['r_mae']:
            optimal_rot = val_metrics['r_mae']
            print('Current best rotation model is {}'.format(epoch + 1))

        # Evaluate on the test split each epoch.
        test_metrics = eval_model(model, dataloader['test'])
        if viz is not None:
            viz.update('test_acc', epoch, {'acc': test_metrics['acc_gt']})
            viz.update('test_metric', epoch, {
                'r_mae': test_metrics['r_mae'],
                't_mae': test_metrics['t_mae']
            })

        scheduler.step()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed // 60) % 60, time_elapsed % 60))

    return model
def train(cmd_opt, word2vec_matrix, dataset_loader, num_classes, image_size,
          word2vec_embedding_size, image_branch, validation_examples):
    """Train a joint image/word-embedding model (TensorFlow 1.x graph mode).

    Builds the input queues, the training and evaluation sub-graphs, then runs
    `cmd_opt.numIters` optimization steps, printing loss and validation
    accuracy every `cmd_opt.displayIters` iterations.

    Args:
        cmd_opt: parsed command-line options (batchSize, embeddingSize,
            learningRate, margin, validation, numIters, displayIters, ...).
        word2vec_matrix: word-embedding matrix used for validation accuracy.
        dataset_loader: data source; also exposes word2vec_matrix and
            distance_matrix attributes consumed per training step.
        num_classes: number of output classes.
        image_size: input image size for the placeholders/batches.
        word2vec_embedding_size: dimensionality of the word vectors.
        image_branch: unused here — presumably consumed elsewhere; verify.
        validation_examples: number of examples used for validation accuracy.
    """
    global_step_tensor = tf.Variable(0, trainable=False, name="global_step")

    # Training batches use half the batch size (pairs?); validation uses full.
    # NOTE(review): the halving rationale isn't visible here — confirm.
    train_image_tensor, train_vector_tensor, train_label_tensor = \
        create_batches(dataset_loader, cmd_opt.batchSize // 2, image_size,
                       word2vec_embedding_size,)
    valid_image_tensor, valid_vector_tensor, valid_label_tensor = \
        create_batches(dataset_loader, cmd_opt.batchSize, image_size,
                       word2vec_embedding_size, is_train=False, is_valid=True)

    image_placeholder, wordvec_placeholder, groundtruth_placeholder, \
        matrix_placeholder = create_placeholders(cmd_opt.batchSize, image_size,
                                                 word2vec_embedding_size,
                                                 num_classes)

    # Training sub-graph: embeddings, loss, and the optimization op.
    im_em, word_em, loss_tensor, train_op = train_model(
        image = image_placeholder,
        word_vec = wordvec_placeholder,
        groundtruth = groundtruth_placeholder,
        embedding_size = cmd_opt.embeddingSize,
        learning_rate = cmd_opt.learningRate,
        global_step = global_step_tensor,
        loss_margin=cmd_opt.margin,
        num_classes=num_classes,
        batch_size=cmd_opt.batchSize)

    # Evaluation sub-graph; cmd_opt.validation selects word (0) vs image (1)
    # validation mode.
    label_tensor, _, embedding_inversion = eval_model(
        image_embedding = im_em,
        wordvec_embedding = word_em,
        matrix = matrix_placeholder,
        batch_size = cmd_opt.batchSize,
        num_classes = num_classes,
        word_validation = cmd_opt.validation==0,
        image_validation = cmd_opt.validation==1)

    supervisor_saver = tf.train.Saver()
    # supervisor = tf.train.Supervisor(logdir=None,#cmd_opt.expDir,
    #                                  summary_op=None,
    #                                  global_step=global_step_tensor,
    #                                  saver=supervisor_saver)
    config_proto = tf.ConfigProto()
    config_proto.gpu_options.allow_growth = True
    # config_proto.gpu_options.per_process_gpu_memory_fraction = 0.6
    session = tf.Session(config=config_proto)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord, sess=session)
    # NOTE(review): the CLI debug wrapper below looks like leftover debugging —
    # it makes every run interactive; consider removing for normal training.
    session = tf_debug.LocalCLIDebugWrapperSession(session)

    # with supervisor.managed_session(config=config_proto) as session:
    # Single-pass loop kept from the original (runs exactly once).
    for i in range(1):
        session.run(tf.global_variables_initializer())
        global_step = session.run(global_step_tensor)
        start_time = time.time()
        # `iter` shadows the builtin — kept unchanged from the original.
        for iter in range(global_step, cmd_opt.numIters):
            loss, global_step = train_iter(
                session, train_op, loss_tensor, global_step_tensor,
                train_image_tensor, train_vector_tensor, train_label_tensor,
                dataset_loader.word2vec_matrix, \
                dataset_loader.distance_matrix, image_placeholder,
                wordvec_placeholder, groundtruth_placeholder,
                permutation=True)

            if (iter+1) % cmd_opt.displayIters == 0:
                end_time = time.time()
                print ("Time per iteration: ",
                       str((end_time-start_time)/cmd_opt.displayIters))
                print ("Training Loss at " , iter+1, ": ", str(loss))
                start_time = time.time()
                # if (iter+1) % cmd_opt.validIters == 0:
                # if cmd_opt.validation == 0:
                # Validation accuracy is printed at every display interval
                # (the guards above were commented out in the original).
                print("Validation accuracy: ",
                      compute_word_accuracy(session, valid_image_tensor,
                                            valid_label_tensor, label_tensor,
                                            image_placeholder,
                                            cmd_opt.batchSize, word2vec_matrix,
                                            matrix_placeholder,
                                            validation_examples))
def train_model(model, optimizer, train_loader, epochs, scheduler, early_stopping=None, test_loader=None, eval_loader=None, device='cpu', t=1):
    """Train a classifier, tracking the best checkpoint by eval score.

    Args:
        model: torch module returning class logits.
        optimizer: optimizer over `model`'s parameters.
        train_loader: DataLoader of (x, y) training batches.
        epochs: number of epochs to run.
        scheduler: optional LR scheduler, stepped once per *batch*.
        early_stopping: optional controller with reset()/step(metric);
            step() < 0 stops training, > 0 marks a new best checkpoint.
        test_loader: DataLoader used for per-epoch test scoring.
        eval_loader: optional DataLoader; when given, its top-5 score drives
            early stopping instead of the mean training loss.
        device: device string/object the model and batches are moved to.
        t: number of stochastic forward passes to average (e.g. MC dropout).

    Returns:
        (best_model_state_dict, per_epoch_scores, best_epoch_scores,
        mean_losses). best_epoch_scores is None if training stopped before
        any epoch completed.
    """
    import copy  # local import: snapshotting the best weights needs a deep copy

    scores = []
    mean_losses = []
    # BUG FIX: state_dict() returns live references to the parameters, so the
    # previous code's "best" snapshot silently tracked the *latest* weights.
    # Deep-copy to take a real snapshot.
    best_model = copy.deepcopy(model.state_dict())
    best_model_i = 0

    model.to(device)

    if early_stopping is not None:
        early_stopping.reset()

    model.train()
    bar = tqdm(range(epochs), leave=True)
    for epoch in bar:
        model.train()
        losses = []
        for i, (x, y) in enumerate(train_loader):
            x, y = x.to(device), y.to(device)

            if t > 1:
                # Average t stochastic forward passes.
                pred = torch.stack([model(x) for _ in range(t)], dim=0)
                pred = pred.mean(0)
            else:
                pred = model(x)

            # Per-sample losses are kept for the epoch mean; the batch mean
            # drives the gradient step.
            loss = torch.nn.functional.cross_entropy(pred, y, reduction='none')
            losses.extend(loss.tolist())
            loss = loss.mean()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Per-batch scheduler step. (The original's isinstance branch and
            # hasattr branch both just called step(); collapsed to one check.)
            if scheduler is not None and hasattr(scheduler, 'step'):
                scheduler.step()

        if eval_loader is not None:
            eval_scores, _ = eval_model(model, eval_loader, topk=[1, 5],
                                        device=device)
        else:
            eval_scores = 0

        mean_loss = sum(losses) / len(losses)
        mean_losses.append(mean_loss)

        if early_stopping is not None:
            # Early stopping watches the eval top-5 score when an eval set is
            # available, otherwise the mean training loss.
            r = early_stopping.step(eval_scores[1]) if eval_loader is not None \
                else early_stopping.step(mean_loss)
            if r < 0:
                break
            elif r > 0:
                best_model = copy.deepcopy(model.state_dict())
                best_model_i = epoch
        else:
            # Without early stopping every epoch is "best so far".
            best_model = copy.deepcopy(model.state_dict())
            best_model_i = epoch

        train_scores, _ = eval_model(model, train_loader, device=device)
        test_scores, _ = eval_model(model, test_loader, device=device)

        bar.set_postfix({
            'Train score': train_scores[1],
            'Test score': test_scores[1],
            'Eval score': eval_scores[1] if eval_scores != 0 else 0,
            'Mean loss': mean_loss
        })
        scores.append((train_scores, eval_scores, test_scores))

    # BUG FIX: if early stopping broke out on the very first epoch, `scores`
    # is empty and indexing it raised IndexError.
    best_scores = scores[best_model_i] if scores else None
    return best_model, scores, best_scores, mean_losses
def train_eval_model(model, criterion, optimizer, image_dataset, dataloader, tfboard_writer, benchmark, num_epochs=25, start_epoch=0, xls_wb=None):
    """Train a graph-matching model and evaluate on the benchmark each epoch.

    Supports 2GM (two-graph) and MGM/MGM3 (multi-graph) problem types; the
    loss function is selected by cfg.TRAIN.LOSS_FUNC.

    Args:
        model: (DataParallel-wrapped) matching network.
        criterion: loss callable matching cfg.TRAIN.LOSS_FUNC's signature.
        optimizer: torch optimizer over the model parameters.
        image_dataset: dict of datasets; 'train' is re-wrapped in a fresh
            DataLoader each epoch so shuffling differs per epoch.
        dataloader: dict with 'train' and 'test' DataLoaders.
        tfboard_writer: TensorBoard SummaryWriter for losses/accuracy/speed.
        benchmark: dict with the 'test' benchmark used by eval_model.
        num_epochs: last epoch index (exclusive).
        start_epoch: epoch to start from; non-zero resumes from checkpoints.
        xls_wb: xlwt Workbook that collects one evaluation sheet per epoch.

    Returns:
        The trained model (trained in place).
    """
    print('Start training...')

    since = time.time()
    dataset_size = len(dataloader['train'].dataset)
    displacement = Displacement()

    device = next(model.parameters()).device
    print('model on device: {}'.format(device))

    checkpoint_path = Path(cfg.OUTPUT_PATH) / 'params'
    if not checkpoint_path.exists():
        checkpoint_path.mkdir(parents=True)

    # Resolve what (if anything) to load: epoch checkpoint, optionally
    # overridden by an explicit pretrained path.
    model_path, optim_path = '', ''
    if start_epoch != 0:
        model_path = str(checkpoint_path / 'params_{:04}.pt'.format(start_epoch))
        optim_path = str(checkpoint_path / 'optim_{:04}.pt'.format(start_epoch))
    if len(cfg.PRETRAINED_PATH) > 0:
        model_path = cfg.PRETRAINED_PATH
    if len(model_path) > 0:
        print('Loading model parameters from {}'.format(model_path))
        load_model(model, model_path, strict=False)
    if len(optim_path) > 0:
        print('Loading optimizer state from {}'.format(optim_path))
        optimizer.load_state_dict(torch.load(optim_path))

    scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                               milestones=cfg.TRAIN.LR_STEP,
                                               gamma=cfg.TRAIN.LR_DECAY,
                                               last_epoch=cfg.TRAIN.START_EPOCH - 1)

    for epoch in range(start_epoch, num_epochs):
        # Reset seed after evaluation per epoch
        torch.manual_seed(cfg.RANDOM_SEED + epoch + 1)
        dataloader['train'] = get_dataloader(image_dataset['train'], shuffle=True, fix_seed=False)

        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        model.train()  # Set model to training mode

        print('lr = ' + ', '.join(['{:.2e}'.format(x['lr']) for x in optimizer.param_groups]))

        epoch_loss = 0.0
        running_loss = 0.0
        running_since = time.time()
        iter_num = 0

        # Iterate over data.
        for inputs in dataloader['train']:
            if iter_num >= cfg.TRAIN.EPOCH_ITERS:
                break
            if model.module.device != torch.device('cpu'):
                inputs = data_to_cuda(inputs)

            iter_num = iter_num + 1

            # zero the parameter gradients
            optimizer.zero_grad()

            with torch.set_grad_enabled(True):
                # forward
                outputs = model(inputs)

                if cfg.PROBLEM.TYPE == '2GM':
                    assert 'ds_mat' in outputs
                    assert 'perm_mat' in outputs
                    assert 'gt_perm_mat' in outputs

                    # compute loss
                    if cfg.TRAIN.LOSS_FUNC == 'offset':
                        d_gt, grad_mask = displacement(outputs['gt_perm_mat'], *outputs['Ps'], outputs['ns'][0])
                        d_pred, _ = displacement(outputs['ds_mat'], *outputs['Ps'], outputs['ns'][0])
                        loss = criterion(d_pred, d_gt, grad_mask)
                    elif cfg.TRAIN.LOSS_FUNC in ['perm', 'ce', 'hung']:
                        loss = criterion(outputs['ds_mat'], outputs['gt_perm_mat'], *outputs['ns'])
                    elif cfg.TRAIN.LOSS_FUNC == 'hamming':
                        loss = criterion(outputs['perm_mat'], outputs['gt_perm_mat'])
                    elif cfg.TRAIN.LOSS_FUNC == 'custom':
                        loss = torch.sum(outputs['loss'])
                    else:
                        raise ValueError(
                            'Unsupported loss function {} for problem type {}'.format(cfg.TRAIN.LOSS_FUNC, cfg.PROBLEM.TYPE))

                    # compute accuracy
                    acc = matching_accuracy(outputs['perm_mat'], outputs['gt_perm_mat'], outputs['ns'][0])

                elif cfg.PROBLEM.TYPE in ['MGM', 'MGM3']:
                    assert 'ds_mat_list' in outputs
                    assert 'graph_indices' in outputs
                    assert 'perm_mat_list' in outputs
                    if not 'gt_perm_mat_list' in outputs:
                        assert 'gt_perm_mat' in outputs
                        gt_perm_mat_list = [outputs['gt_perm_mat'][idx] for idx in outputs['graph_indices']]
                    else:
                        gt_perm_mat_list = outputs['gt_perm_mat_list']

                    # compute loss & accuracy
                    # BUG FIX: the list literal was ['perm', 'ce' 'hung'] — the
                    # missing comma concatenated to 'cehung', so 'ce' and
                    # 'hung' never matched in this branch.
                    if cfg.TRAIN.LOSS_FUNC in ['perm', 'ce', 'hung']:
                        loss = torch.zeros(1, device=model.module.device)
                        ns = outputs['ns']
                        for s_pred, x_gt, (idx_src, idx_tgt) in \
                                zip(outputs['ds_mat_list'], gt_perm_mat_list, outputs['graph_indices']):
                            l = criterion(s_pred, x_gt, ns[idx_src], ns[idx_tgt])
                            loss += l
                        loss /= len(outputs['ds_mat_list'])
                    elif cfg.TRAIN.LOSS_FUNC == 'plain':
                        loss = torch.sum(outputs['loss'])
                    else:
                        raise ValueError(
                            'Unsupported loss function {} for problem type {}'.format(cfg.TRAIN.LOSS_FUNC, cfg.PROBLEM.TYPE))

                    # compute accuracy
                    acc = torch.zeros(1, device=model.module.device)
                    for x_pred, x_gt, (idx_src, idx_tgt) in \
                            zip(outputs['perm_mat_list'], gt_perm_mat_list, outputs['graph_indices']):
                        a = matching_accuracy(x_pred, x_gt, ns[idx_src])
                        acc += torch.sum(a)
                    acc /= len(outputs['perm_mat_list'])
                else:
                    raise ValueError('Unknown problem type {}'.format(cfg.PROBLEM.TYPE))

                # backward + optimize
                if cfg.FP16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                optimizer.step()

                batch_num = inputs['batch_size']

                # tfboard writer
                loss_dict = dict()
                loss_dict['loss'] = loss.item()
                tfboard_writer.add_scalars('loss', loss_dict, epoch * cfg.TRAIN.EPOCH_ITERS + iter_num)

                accdict = dict()
                accdict['matching accuracy'] = torch.mean(acc)
                tfboard_writer.add_scalars(
                    'training accuracy',
                    accdict,
                    epoch * cfg.TRAIN.EPOCH_ITERS + iter_num
                )

                # statistics
                running_loss += loss.item() * batch_num
                epoch_loss += loss.item() * batch_num

                if iter_num % cfg.STATISTIC_STEP == 0:
                    running_speed = cfg.STATISTIC_STEP * batch_num / (time.time() - running_since)
                    print('Epoch {:<4} Iteration {:<4} {:>4.2f}sample/s Loss={:<8.4f}'
                          .format(epoch, iter_num, running_speed, running_loss / cfg.STATISTIC_STEP / batch_num))
                    tfboard_writer.add_scalars(
                        'speed',
                        {'speed': running_speed},
                        epoch * cfg.TRAIN.EPOCH_ITERS + iter_num
                    )
                    tfboard_writer.add_scalars(
                        'learning rate',
                        {'lr_{}'.format(i): x['lr'] for i, x in enumerate(optimizer.param_groups)},
                        epoch * cfg.TRAIN.EPOCH_ITERS + iter_num
                    )
                    running_loss = 0.0
                    running_since = time.time()

        epoch_loss = epoch_loss / cfg.TRAIN.EPOCH_ITERS / batch_num

        save_model(model, str(checkpoint_path / 'params_{:04}.pt'.format(epoch + 1)))
        torch.save(optimizer.state_dict(), str(checkpoint_path / 'optim_{:04}.pt'.format(epoch + 1)))

        print('Epoch {:<4} Loss: {:.4f}'.format(epoch, epoch_loss))
        print()

        # Eval in each epoch; a single-class test set is evaluated on that
        # class only, otherwise on all benchmark classes.
        if dataloader['test'].dataset.cls not in ['none', 'all', None]:
            clss = [dataloader['test'].dataset.cls]
        else:
            clss = dataloader['test'].dataset.bm.classes
        l_e = (epoch == (num_epochs - 1))
        accs = eval_model(model, clss, benchmark['test'], l_e,
                          xls_sheet=xls_wb.add_sheet('epoch{}'.format(epoch + 1)))
        acc_dict = {"{}".format(cls): single_acc for cls, single_acc in zip(dataloader['test'].dataset.classes, accs)}
        acc_dict['average'] = torch.mean(accs)
        tfboard_writer.add_scalars(
            'Eval acc',
            acc_dict,
            (epoch + 1) * cfg.TRAIN.EPOCH_ITERS
        )
        # BUG FIX: this referenced an undefined global `wb`; use the `xls_wb`
        # parameter that the sheet was added to above.
        xls_wb.save(xls_wb.__save_path)

        scheduler.step()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'
          .format(time_elapsed // 3600, (time_elapsed // 60) % 60, time_elapsed % 60))

    return model
def main():
    """CLI entry point: train a compositional recognizer and track MAP scores.

    With -r/--regenerate, a fresh training set is generated every epoch;
    otherwise one training set is generated up front. The validation set is
    always generated once. If -w/--write-to is given, losses and scores are
    appended to that file as JSON after every epoch.
    """
    parser = argparse.ArgumentParser(description='Train a compositional recognizer model.')
    parser.add_argument('-c', '--class-count', type=int, default=10)
    parser.add_argument('-l', '--seq-length', type=int, default=10)
    parser.add_argument('-o', '--overlap', type=int, default=2)
    parser.add_argument('--noise', type=float, default=None)
    parser.add_argument('-s', '--hidden-size', type=int, default=512)
    parser.add_argument('-b', '--batch-size', type=int, default=8)
    parser.add_argument('-n', '--epoch-size', type=int, default=5120)
    parser.add_argument('-v', '--validation-size', type=int, default=5120)
    parser.add_argument('-e', '--epoch-count', type=int, default=100)
    parser.add_argument('-g', '--gpu_id', type=int, default=0)
    parser.add_argument('-d', '--dropout', type=float, default=0)
    parser.add_argument('-r', '--regenerate', action='store_true')
    parser.add_argument('-w', '--write-to', default=None)
    args = parser.parse_args()

    dataset = CompositionalDataset(args.class_count, args.seq_length,
                                   args.overlap, args.noise)
    model = CompositionalRecognizer(args.class_count, args.hidden_size,
                                    args.dropout)
    optimizer = torch.optim.Adam(model.parameters())
    device = torch.device('cuda:{id}'.format(id=args.gpu_id))

    if not args.regenerate:
        print('Generating training dataset...')
        dataloader = DataLoader(dataset.generate_dataset(args.epoch_size),
                                args.batch_size, drop_last=True)
    # BUG FIX: the validation set was only generated inside the
    # `not args.regenerate` branch, so running with -r crashed with a
    # NameError at eval_model. It is needed in both modes — create it always.
    print('Generating validation dataset...')
    val_dataloader = DataLoader(dataset.generate_dataset(args.validation_size),
                                256, drop_last=True)

    losses = []
    scores = []
    model = model.to(device)
    for epoch in range(1, args.epoch_count+1):
        model.train()
        total_loss = 0
        if args.regenerate:
            print('Generating training dataset for epoch {i}...'.format(i=epoch))
            dataloader = DataLoader(dataset.generate_dataset(args.epoch_size),
                                    args.batch_size, drop_last=True)
        print('Starting epoch {i}...'.format(i=epoch))
        pbar = tqdm(total=args.epoch_size, desc='Batch - (Loss = -)')
        batch = 1
        for x, labels in dataloader:
            x = x.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            loss = model.forward_loss(x, labels)
            loss_val = loss.item()
            total_loss += loss_val
            loss.backward()
            optimizer.step()
            pbar.update(args.batch_size)
            pbar.set_description('Batch {b} (Loss = {ls})'.format(
                b=batch, ls=round(loss_val, 3)))
            batch += 1
        pbar.close()

        # Mean loss over the number of full batches (drop_last=True above).
        total_loss /= args.epoch_size // args.batch_size
        losses.append(total_loss)
        print('Average epoch loss:', total_loss)

        print('Evaluating MAP score...')
        model.eval()
        map_score = eval_model(model, val_dataloader, device)
        scores.append(map_score)
        print('Epoch MAP score:', map_score)

        if args.write_to is not None:
            with open(args.write_to, 'a') as f:
                f.write(json.dumps(dict(params=vars(args), losses=losses,
                                        scores=scores)) + '\n')
def train_eval_model(model,
                     criterion,
                     optimizer,
                     dataloader,
                     tfboard_writer,
                     num_epochs=25,
                     resume=False,
                     start_epoch=0):
    """Train a keypoint-matching model with combined permutation + margin losses.

    Each epoch ramps up `score_thresh` (by 0.1 per epoch, capped at 0.5),
    trains over dataloader['train'], checkpoints the model/optimizer, and
    evaluates on dataloader['test'].

    Args:
        model: matching network; its forward pass re-filters the inputs and
            returns updated perm_mat/n1_gt/n2_gt alongside its predictions.
        criterion: weighted permutation loss
            (s_pred, perm_mat, n1, n2, weights).
        optimizer: torch optimizer over the model parameters.
        dataloader: dict with 'train' and 'test' DataLoaders.
        tfboard_writer: TensorBoard SummaryWriter for loss/speed scalars.
        num_epochs: last epoch index (exclusive).
        resume: if True, reload model/optimizer from `start_epoch` checkpoints.
        start_epoch: epoch to start (or resume) from.

    Returns:
        The trained model (trained in place).
    """
    print('Start training...')

    since = time.time()
    dataset_size = len(dataloader['train'].dataset)

    device = next(model.parameters()).device
    print('model on device: {}'.format(device))

    checkpoint_path = Path(cfg.OUTPUT_PATH) / 'params'
    if not checkpoint_path.exists():
        checkpoint_path.mkdir(parents=True)

    #model_path = str(checkpoint_path / 'params_{:04}.pt'.format(2))
    #print('Loading model parameters from {}'.format(model_path))
    #load_model(model, model_path)
    if resume:
        # Resuming from epoch 0 makes no sense — no checkpoint exists yet.
        assert start_epoch != 0
        model_path = str(checkpoint_path /
                         'params_{:04}.pt'.format(start_epoch))
        print('Loading model parameters from {}'.format(model_path))
        load_model(model, model_path)

        optim_path = str(checkpoint_path /
                         'optim_{:04}.pt'.format(start_epoch))
        print('Loading optimizer state from {}'.format(optim_path))
        optimizer.load_state_dict(torch.load(optim_path))

    # Margin losses for node embeddings (margin 30) and edge embeddings
    # (margin 1, second arg 0.3 — presumably a weight/ratio; verify in
    # MarginLoss's definition).
    margin_loss = MarginLoss(30)
    marginedge_loss = MarginLoss(1, 0.3)

    scheduler = optim.lr_scheduler.ExponentialLR(
        optimizer,
        gamma=cfg.TRAIN.LR_DECAY,
        last_epoch=cfg.TRAIN.START_EPOCH - 1)
    #scheduler.step()
    for epoch in range(start_epoch, num_epochs):
        # Curriculum: score threshold grows 0.0, 0.1, ... capped at 0.5.
        score_thresh = min(epoch * 0.1, 0.5)
        print('Epoch {}/{},score_thresh {}'.format(epoch, num_epochs - 1,
                                                   score_thresh))
        print('-' * 10)

        model.train()  # Set model to training mode

        print('lr = ' + ', '.join(
            ['{:.2e}'.format(x['lr']) for x in optimizer.param_groups]))

        epoch_loss = 0.0
        running_loss = 0.0
        running_since = time.time()
        iter_num = 0

        # Iterate over data.
        for inputs in dataloader['train']:
            data1, data2 = [_.cuda() for _ in inputs['images']]
            P1_gt, P2_gt = [_.cuda() for _ in inputs['Ps']]
            n1_gt, n2_gt = [_.cuda() for _ in inputs['ns']]
            weights = inputs['ws'].cuda()
            perm_mat = inputs['gt_perm_mat'].cuda()

            iter_num = iter_num + 1

            # zero the parameter gradients
            optimizer.zero_grad()

            with torch.set_grad_enabled(True):
                # forward — NOTE: the model also returns (filtered) perm_mat,
                # n1_gt and n2_gt, rebinding the ground-truth variables above.
                s_pred, d_pred,match_emb1,match_emb2,match_edgeemb1,match_edgeemb2,perm_mat,n1_gt,n2_gt = \
                    model(data1, data2, P1_gt, P2_gt, n1_gt, n2_gt,perm_mat=perm_mat,score_thresh=score_thresh)

                multi_loss = []
                # Weighted permutation loss plus node/edge margin losses.
                loss_lsm = criterion(s_pred, perm_mat, n1_gt, n2_gt, weights)
                loss_marg = margin_loss(match_emb1, match_emb2, perm_mat,
                                        n1_gt, n2_gt)
                loss_edgemarg = marginedge_loss(match_edgeemb1, match_edgeemb2,
                                                perm_mat, n1_gt, n2_gt)
                loss = (loss_marg + loss_edgemarg ) * 0.25 + loss_lsm  #(loss_marg)*0.5+loss_pca

                # backward + optimize
                loss.backward()
                optimizer.step()

                # tfboard writer (multi_loss is currently always empty, so
                # only the total 'loss' key is logged).
                loss_dict = {
                    'loss_{}'.format(i): l.item()
                    for i, l in enumerate(multi_loss)
                }
                loss_dict['loss'] = loss.item()
                tfboard_writer.add_scalars(
                    'loss', loss_dict, epoch * cfg.TRAIN.EPOCH_ITERS + iter_num)

                # statistics
                running_loss += loss.item() * perm_mat.size(0)
                epoch_loss += loss.item() * perm_mat.size(0)

                if iter_num % cfg.STATISTIC_STEP == 0:
                    running_speed = cfg.STATISTIC_STEP * perm_mat.size(0) / (
                        time.time() - running_since)
                    print(
                        'Epoch {:<4} Iteration {:<4} {:>4.2f}sample/s Loss={:<8.4f}'
                        .format(
                            epoch, iter_num, running_speed,
                            running_loss / cfg.STATISTIC_STEP / perm_mat.size(0)))
                    tfboard_writer.add_scalars(
                        'speed', {'speed': running_speed},
                        epoch * cfg.TRAIN.EPOCH_ITERS + iter_num)
                    running_loss = 0.0
                    running_since = time.time()

        epoch_loss = epoch_loss / dataset_size

        # Checkpoint model and optimizer for epoch+1 resume.
        save_model(model,
                   str(checkpoint_path / 'params_{:04}.pt'.format(epoch + 1)))
        torch.save(optimizer.state_dict(),
                   str(checkpoint_path / 'optim_{:04}.pt'.format(epoch + 1)))

        print('Epoch {:<4} Loss: {:.4f}'.format(epoch, epoch_loss))
        print()

        # Eval in each epoch
        accs = eval_model(model, dataloader['test'], train_epoch=epoch)

        scheduler.step()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed // 60) % 60, time_elapsed % 60))

    return model
learning_rate=args.learning_rate, cuda=cuda) start_epoch, best_accuracy = load_model(model, cuda) for epoch in range(start_epoch, args.epochs): train_model(model=model, optimizer=optimizer, train_loader=train_loader, train_dataset=train_dataset, loss_fn=loss_fn, num_epochs=args.epochs, epoch=epoch, batch_size=args.batch_size, notify=args.notify, cuda=cuda) accuracy = eval_model(model=model, test_loader=test_loader, cuda=cuda) accuracy = 100. * accuracy / len(test_loader.dataset) logging.info('Test Accuracy: {:.2f}%'.format(accuracy)) send_metrics(accuracy=accuracy) # Save checkpoint logic accuracy = torch.FloatTensor([accuracy]) best_accuracy = torch.FloatTensor(max(accuracy.numpy(), best_accuracy.numpy())) if bool(accuracy.numpy() > best_accuracy.numpy()): logging.info('Saving new state for epoch {}'.format(epoch)) state = { 'epoch': epoch + 1, 'state': model.state_dict(), 'accuracy': best_accuracy }
def train_n_epochs(model, hyper, data, data_valid, evals, n_epochs,
                   feedbacks_per_epoch=10, alpha_decay=1.0):
    """ Train the model for a desired amount of epochs. Automatically takes
        snapshots of the parameters after each epoch, and monitors the
        progress.

    Args:
        model:          The model to train.
        hyper:          (dict) hyperparameters dictionary
        data:           (str) Training data
        data_valid:     (str) Validation data
        evals:          (dict of lists) The dict that stores the losses and
                        times for each epoch
        n_epochs:       (int) Number of epochs to train for
        feedbacks_per_epoch: (int) Max number of progress printouts per epoch
        alpha_decay:    (float)(default=1.0) How much to decay the alpha by
                        after each epoch.

    Returns:
        (dict) - evals - the dictionary that monitors the losses, and times
    """
    timer = Timer()
    timer.start()

    # CALCULATE NUMBER OF STEPS NEEDED
    # Technically the following calculation for `samples_per_epoch` is
    # incorrect, since we are randomly sampling windows, and not dividing the
    # data into an even number of chunks. But it is still a useful
    # approximation, that allows us to have more variation in the training
    # data.
    # BUG FIX: this previously read the undefined name `data_train` instead of
    # the documented `data` parameter (which was never used).
    samples_per_epoch = int(len(data) // hyper["SAMPLE_LENGTH"])
    steps_per_epoch = int(samples_per_epoch / hyper["BATCH_SIZE"])
    feedback_every = int(steps_per_epoch / feedbacks_per_epoch)

    try:
        for i in range(n_epochs):
            print()
            print("=" * 60)
            print("EPOCH {}/{} ({:0.2f}%) alpha={}".format(
                i + 1, n_epochs, 100 * (i / n_epochs), model.alpha))
            print("=" * 60)

            # TRAIN OVER A SINGLE EPOCH
            # BUG FIX: also switched from `data_train` to the `data` parameter.
            train_loss, epoch_time = train_n_steps(
                model,
                hyper,
                data,
                n_steps=steps_per_epoch,
                batch_size=hyper["BATCH_SIZE"],
                feedback_every=feedback_every)
            evals["train_loss"].append(train_loss)
            evals["train_time"].append(epoch_time)
            evals["alpha"].append(model.alpha)

            # EVALUATE ON VALIDATION DATA
            eval_loss, eval_time = eval_model(
                model,
                data_valid,
                char2id,
                seq_length=hyper["SAMPLE_LENGTH"],
                batch_size=hyper["BATCH_SIZE"])
            evals["valid_loss"].append(eval_loss)
            evals["valid_time"].append(eval_time)

            # PREPARE MODEL FOR NEXT EPOCH
            model.update_learning_rate(model.alpha * alpha_decay)
            hyper["LAST_ALPHA"] = model.alpha

            # TAKE SNAPSHOTS - of parameters and evaluation dictionary
            global_epoch = len(evals["train_loss"])
            epoch_snapshot(model,
                           epoch=global_epoch,
                           loss=eval_loss,
                           name=MODEL_NAME,
                           dir=SNAPSHOTS_DIR)
            obj2pickle(evals, EVALS_FILE)
            save_hyper_params(hyper, HYPERPARAMS_FILE)

            # FEEDBACK PRINTOUTS
            # TODO: Save a sample numerous generated strings to files at each epoch
            # Print a sample of generated text
            print_sample_generation(model, char2id, exploration=0.85)
            epoch_template = "({}) TRAIN_LOSS={: 7.3f} VALID_LOSS={: 7.3f}"
            print(
                epoch_template.format(timer.elapsed_string(), train_loss,
                                      eval_loss))

            # UPDATE LEARNING CURVE PLOT
            plot_learning_curves(evals,
                                 file=LEARNING_CURVES_FILE,
                                 model_name=MODEL_NAME)

        print("- DONE")
        return evals

    # HANDLE EARLY TERMINATION
    except KeyboardInterrupt:
        print("\n A keyboard interrupt was triggered at",
              timer.elapsed_string())

        # Save parameters as a recovery file
        print("Storing Recovery parameters")
        file = os.path.join(SNAPSHOTS_DIR, MODEL_NAME + ".recovery_params")
        take_snapshot(model, file)

        # Save evals as a recovery file
        print("Storing Recovery evals")
        file = os.path.join(MODELS_DIR, MODEL_NAME + ".recovery_evals")
        obj2pickle(evals, file)

        # Save hyper parameters
        print("Saving Hyper Params")
        hyper["LAST_ALPHA"] = model.alpha
        save_hyper_params(hyper, HYPERPARAMS_FILE)

        print("OK DONE")
        return evals
def train_eval_model(model, criterion, optimizer, dataloader, tfboard_writer, num_epochs=25, resume=False, start_epoch=0):
    """Train a graph-matching model and evaluate it after every epoch.

    Args:
        model: the network to train; expected to already be on its target
            device (the device is read back from its parameters below).
        criterion: loss callable; its signature depends on
            ``cfg.TRAIN.LOSS_FUNC`` ('offset' or 'perm', see the loss branch).
        optimizer: torch optimizer wrapping ``model``'s parameters.
        dataloader: dict with 'train' and 'test' dataloaders.
        tfboard_writer: TensorBoard SummaryWriter used for loss/accuracy/speed
            scalars.
        num_epochs: total number of epochs to run (exclusive upper bound of
            the epoch loop).
        resume: if True, reload model/optimizer state saved at ``start_epoch``.
        start_epoch: epoch index to resume from; must be nonzero when
            ``resume`` is True.

    Returns:
        The trained ``model`` (trained in place).

    Side effects:
        Writes ``params_XXXX.pt`` / ``optim_XXXX.pt`` checkpoints under
        ``cfg.OUTPUT_PATH/params`` after each epoch and logs scalars to
        ``tfboard_writer``.
    """
    print('Start training...')

    since = time.time()
    dataset_size = len(dataloader['train'].dataset)
    displacement = Displacement()
    # LAP solver used to discretize the soft assignment for accuracy stats.
    lap_solver = hungarian

    # Infer the device from the model's own parameters.
    device = next(model.parameters()).device
    print('model on device: {}'.format(device))

    checkpoint_path = Path(cfg.OUTPUT_PATH) / 'params'
    if not checkpoint_path.exists():
        checkpoint_path.mkdir(parents=True)

    if resume:
        # Resuming from epoch 0 is meaningless (no checkpoint exists yet).
        assert start_epoch != 0
        model_path = str(checkpoint_path / 'params_{:04}.pt'.format(start_epoch))
        print('Loading model parameters from {}'.format(model_path))
        load_model(model, model_path)

        optim_path = str(checkpoint_path / 'optim_{:04}.pt'.format(start_epoch))
        print('Loading optimizer state from {}'.format(optim_path))
        optimizer.load_state_dict(torch.load(optim_path))

    # NOTE(review): last_epoch is taken from cfg.TRAIN.START_EPOCH rather than
    # the ``start_epoch`` argument — presumably these are kept in sync by the
    # caller; confirm.
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=cfg.TRAIN.LR_STEP, gamma=cfg.TRAIN.LR_DECAY, last_epoch=cfg.TRAIN.START_EPOCH - 1)

    for epoch in range(start_epoch, num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        model.train()  # Set model to training mode

        print('lr = ' + ', '.join(['{:.2e}'.format(x['lr']) for x in optimizer.param_groups]))

        epoch_loss = 0.0
        running_loss = 0.0
        running_since = time.time()
        iter_num = 0

        # Iterate over data.
        for inputs in dataloader['train']:
            # The dataloader yields either raw image pairs or precomputed
            # feature pairs; the model is told which via ``inp_type``.
            if 'images' in inputs:
                data1, data2 = [_.cuda() for _ in inputs['images']]
                inp_type = 'img'
            elif 'features' in inputs:
                data1, data2 = [_.cuda() for _ in inputs['features']]
                inp_type = 'feat'
            else:
                raise ValueError('no valid data key (\'images\' or \'features\') found from dataloader!')
            P1_gt, P2_gt = [_.cuda() for _ in inputs['Ps']]
            n1_gt, n2_gt = [_.cuda() for _ in inputs['ns']]
            # Graph-structured inputs (edges/incidence/affinity) are optional;
            # their presence selects the richer forward signature below.
            if 'es' in inputs:
                e1_gt, e2_gt = [_.cuda() for _ in inputs['es']]
                G1_gt, G2_gt = [_.cuda() for _ in inputs['Gs']]
                H1_gt, H2_gt = [_.cuda() for _ in inputs['Hs']]
                KG, KH = [_.cuda() for _ in inputs['Ks']]
            perm_mat = inputs['gt_perm_mat'].cuda()

            iter_num = iter_num + 1

            # zero the parameter gradients
            optimizer.zero_grad()

            with torch.set_grad_enabled(True):
                # forward
                if 'es' in inputs:
                    s_pred, d_pred = \
                        model(data1, data2, P1_gt, P2_gt, G1_gt, G2_gt, H1_gt, H2_gt, n1_gt, n2_gt, KG, KH, inp_type)
                else:
                    s_pred, d_pred = \
                        model(data1, data2, P1_gt, P2_gt, n1_gt, n2_gt)

                multi_loss = []
                if cfg.TRAIN.LOSS_FUNC == 'offset':
                    # Offset loss supervises predicted keypoint displacements.
                    d_gt, grad_mask = displacement(perm_mat, P1_gt, P2_gt, n1_gt)
                    loss = criterion(d_pred, d_gt, grad_mask)
                elif cfg.TRAIN.LOSS_FUNC == 'perm':
                    # Permutation loss supervises the soft assignment directly.
                    loss = criterion(s_pred, perm_mat, n1_gt, n2_gt)
                else:
                    raise ValueError('Unknown loss function {}'.format(cfg.TRAIN.LOSS_FUNC))

                # backward + optimize
                loss.backward()
                optimizer.step()

                if cfg.MODULE == 'NGM.hypermodel':
                    # Log this module's learned mixing weights for inspection.
                    tfboard_writer.add_scalars(
                        'weight',
                        {'w2': model.module.weight2, 'w3': model.module.weight3},
                        epoch * cfg.TRAIN.EPOCH_ITERS + iter_num
                    )

                # training accuracy statistic
                acc, _, __ = matching_accuracy(lap_solver(s_pred, n1_gt, n2_gt), perm_mat, n1_gt)

                # tfboard writer
                # NOTE(review): ``multi_loss`` is never populated above, so
                # loss_dict only ever carries the single 'loss' entry here.
                loss_dict = {'loss_{}'.format(i): l.item() for i, l in enumerate(multi_loss)}
                loss_dict['loss'] = loss.item()
                tfboard_writer.add_scalars('loss', loss_dict, epoch * cfg.TRAIN.EPOCH_ITERS + iter_num)
                accdict = dict()
                accdict['matching accuracy'] = acc
                tfboard_writer.add_scalars(
                    'training accuracy',
                    accdict,
                    epoch * cfg.TRAIN.EPOCH_ITERS + iter_num
                )

                # statistics
                # Weight per-batch loss by batch size so epoch_loss averages
                # correctly over samples, not over iterations.
                running_loss += loss.item() * perm_mat.size(0)
                epoch_loss += loss.item() * perm_mat.size(0)

                if iter_num % cfg.STATISTIC_STEP == 0:
                    running_speed = cfg.STATISTIC_STEP * perm_mat.size(0) / (time.time() - running_since)
                    print('Epoch {:<4} Iteration {:<4} {:>4.2f}sample/s Loss={:<8.4f}'
                          .format(epoch, iter_num, running_speed, running_loss / cfg.STATISTIC_STEP / perm_mat.size(0)))
                    tfboard_writer.add_scalars(
                        'speed',
                        {'speed': running_speed},
                        epoch * cfg.TRAIN.EPOCH_ITERS + iter_num
                    )
                    running_loss = 0.0
                    running_since = time.time()

        epoch_loss = epoch_loss / dataset_size

        # Checkpoint both model and optimizer after every epoch.
        save_model(model, str(checkpoint_path / 'params_{:04}.pt'.format(epoch + 1)))
        torch.save(optimizer.state_dict(), str(checkpoint_path / 'optim_{:04}.pt'.format(epoch + 1)))

        print('Epoch {:<4} Loss: {:.4f}'.format(epoch, epoch_loss))
        print()

        # Eval in each epoch
        accs = eval_model(model, dataloader['test'])
        acc_dict = {"{}".format(cls): single_acc for cls, single_acc in zip(dataloader['train'].dataset.classes, accs)}
        acc_dict['average'] = torch.mean(accs)
        tfboard_writer.add_scalars(
            'Eval acc',
            acc_dict,
            (epoch + 1) * cfg.TRAIN.EPOCH_ITERS
        )

        scheduler.step()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'
          .format(time_elapsed // 3600, (time_elapsed // 60) % 60, time_elapsed % 60))

    return model
'/home/dcg-adlr-mranzinger-data.cosmos1100/scene-text/icdar/incidental_text/', help='Path to the images to test against') args = parser.parse_args() model = EAST(False) paths = [] for dirpath, dirnames, filenames in os.walk(args.root): for dirname in dirnames: if dirname == 'checkpoints': experiment = os.path.join(dirpath, dirname) try: chk = resolve_checkpoint_path(experiment, load_best=True) paths.append(chk) except: pass paths.sort() for dataset in ['val', 'relabeled_val']: dataset = os.path.join(args.dataset, dataset) print(f'\n\n----------------------\nDataset: {dataset}') for chk in paths: print(f'\nUsing checkpoint: {chk}') submit_path = './submit' eval_model(model, chk, dataset, submit_path)
def be_model_training(model, optimizer, train_loader, epochs, scheduler, early_stopping=None, test_loader=None, eval_loader=None, device='cpu', w=1):
    """Train a masked-ensemble model with a diversity (inverse-MMD) penalty.

    Args:
        model: network whose submodules may include ``EnsembleMaskedWrapper`` /
            ``BatchEnsembleMaskedWrapper`` layers; trained in place.
        optimizer: optimizer over ``model``'s parameters.
        train_loader: dataloader yielding ``(x, y)`` batches for training.
        epochs: number of epochs to run.
        scheduler: optional LR scheduler; stepped per epoch only when it is a
            ``StepLR`` or ``MultiStepLR``.
        early_stopping: optional controller with ``reset()`` and ``step(metric)``;
            ``step`` returning < 0 stops training, > 0 marks a new best epoch.
        test_loader: dataloader used for the per-epoch test-score report.
        eval_loader: optional validation dataloader; when given, its top-1
            score (not the mean loss) drives early stopping.
        device: device string/obj to move model and batches to.
        w: weight of the diversity penalty term.

    Returns:
        Tuple ``(best_model_state, scores, best_scores, mean_losses)`` where
        ``scores`` is a per-epoch list of ``(train, eval, test)`` score tuples.
    """
    # Local import: only needed for snapshotting the best state dict.
    import copy

    def divergence():
        # Pairwise MMD between mask distributions across all wrapped layers;
        # each unordered pair is counted once (j > i).
        d = torch.tensor(0.0, device=device)
        for name, module in model.named_modules():
            if isinstance(module, (EnsembleMaskedWrapper, BatchEnsembleMaskedWrapper)):
                distr = module.distributions
                for i, d1 in enumerate(distr):
                    for j, d2 in enumerate(distr):
                        if j <= i:
                            continue
                        d += MMD(d1, d2)
        return d

    model.to(device)

    # Activate all mask distributions jointly before training.
    for name, module in model.named_modules():
        if isinstance(module, EnsembleMaskedWrapper):
            module.set_distribution('all')

    scores = []
    mean_losses = []

    # BUGFIX: ``model.state_dict()`` returns live references to the parameter
    # tensors, so a plain assignment would silently track the *current* model
    # rather than freezing the best one. Deep-copy to take a real snapshot.
    best_model = copy.deepcopy(model.state_dict())
    best_model_i = 0

    if early_stopping is not None:
        early_stopping.reset()

    model.train()
    bar = tqdm(range(epochs), leave=True, desc='Mask training')
    for epoch in bar:
        model.train()
        losses = []
        kl_losses = []
        for i, (x, y) in enumerate(train_loader):
            x, y = x.to(device), y.to(device)
            pred = model(x)
            # Per-sample losses are kept for the epoch-mean statistic before
            # reducing to the scalar training loss.
            loss = torch.nn.functional.cross_entropy(pred, y, reduction='none')
            losses.extend(loss.tolist())
            loss = loss.mean()

            # Diversity penalty: *inverse* of the total divergence, so
            # minimizing the loss pushes the mask distributions apart.
            kl = divergence()
            kl = 1 / (kl + 1e-12)
            kl *= w
            kl_losses.append(kl.item())
            loss += kl

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if scheduler is not None:
            # Only epoch-based schedulers are stepped here.
            if isinstance(scheduler, (StepLR, MultiStepLR)):
                scheduler.step()

        if eval_loader is not None:
            eval_scores, _ = eval_model(model, eval_loader, topk=[1, 5], device=device)
        else:
            eval_scores = 0

        mean_loss = sum(losses) / len(losses)
        mean_losses.append(mean_loss)

        if early_stopping is not None:
            # Early stopping tracks validation top-1 when available, otherwise
            # the mean training loss.
            r = early_stopping.step(eval_scores[1]) if eval_loader is not None else early_stopping.step(mean_loss)
            if r < 0:
                break
            elif r > 0:
                # New best epoch: snapshot (deep copy, see note above).
                best_model = copy.deepcopy(model.state_dict())
                best_model_i = epoch

        train_scores, _ = eval_model(model, train_loader, device=device)
        test_scores, _ = eval_model(model, test_loader, device=device)

        kl_losses = sum(kl_losses) / len(kl_losses)

        bar.set_postfix({'Train score': train_scores[1], 'Test score': test_scores[1],
                         'Eval score': eval_scores[1] if eval_scores != 0 else 0,
                         'Mean loss': mean_loss, 'Kl loss': kl_losses})
        scores.append((train_scores, eval_scores, test_scores))

    return best_model, scores, scores[best_model_i], mean_losses