def train(net, dl, start_epoch, end_epoch, save_frequency):
    """Run the training loop for epochs ``start_epoch + 1`` .. ``end_epoch``.

    Each epoch iterates once over ``dl``, which must yield
    ``(input, target, scrambles)`` triples, takes one Adam step per batch and
    prints overall and per-class accuracy. Every ``save_frequency`` epochs the
    whole ``net`` object is written with ``torch.save`` to the path returned
    by ``getModelPath``.

    Relies on the module-level names ``device``, ``maxScrambles``,
    ``modelDir`` and ``getModelPath``.

    :param net: Model to optimise; updated in place.
    :param dl: Iterable of ``(input, target, scrambles)`` batches.
    :param start_epoch: Number of the last epoch already completed.
    :param end_epoch: Number of the final epoch to run (inclusive).
    :param save_frequency: Save a checkpoint whenever the epoch number is a
        multiple of this value.
    """
    optimizer = torch.optim.Adam(net.parameters())
    # 'none' is handed straight to utils.CubeLoss; the result is averaged with
    # .mean() below before back-propagation.
    loss_fn = utils.CubeLoss('none').to(device)

    for e in range(start_epoch + 1, end_epoch + 1):
        stats = utils.Stats()
        perClass = utils.PerClassStats(maxScrambles)

        for batch_inputs, batch_targets, batch_scrambles in dl:
            optimizer.zero_grad()

            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)

            predictions = net(batch_inputs)
            batch_loss, batch_acc = loss_fn(predictions, batch_targets)
            batch_loss.mean().backward()
            optimizer.step()

            stats.accumulate(len(batch_targets), batch_loss, batch_acc)
            perClass.accumulate(batch_scrambles, batch_loss, batch_acc)

        print(f'Epoch {e}/{end_epoch}:')
        print(f'acc={100*stats.getAcc():.2f}%, loss={stats.getLoss():.3f}')
        print(f'acc= {perClass.accStr()}')
        print()

        # Nothing more to do unless this epoch is a checkpoint epoch.
        if e % save_frequency != 0:
            continue

        os.makedirs(modelDir, exist_ok=True)
        filePath = getModelPath(e)
        print(f'Saving to {filePath}')
        torch.save(net, filePath)
def train(
    model,
    train_loader,
    device,
    tile_size,
    epochs=10,
    batch_size=1,
    learning_rate=1e-4,
    momentum=0.9,
    weight_decay=5e-3,
):
    """Train ``model`` with Adam and log the per-step loss to TensorBoard.

    :param model: Network to train; switched to train mode and moved to
        ``device``.
    :param train_loader: Iterable yielding ``(x, y)`` batches; must support
        ``len()`` (used to compute the global step for logging).
    :param device: Device the model, criterion and batches are moved to.
    :param tile_size: Indexable whose first two entries are passed to
        ``summary`` as ``(3, tile_size[0], tile_size[1])``.
    :param epochs: Number of passes over ``train_loader``.
    :param batch_size: Only used to label the TensorBoard run.
    :param learning_rate: Adam learning rate.
    :param momentum: Accepted for backward compatibility; unused because the
        optimizer is Adam.
    :param weight_decay: Adam weight decay.
    :returns: ``(model, training_stats)`` where ``training_stats`` is the
        ``utils.Stats`` object holding the loss of every step.
    """
    writer = SummaryWriter(
        comment=f'LR_{learning_rate}_BS_{batch_size}_Epochs_{epochs}')
    since = time.time()

    try:
        model.train()
        # Move the model before building the optimizer, as recommended by the
        # torch.optim documentation.
        model = model.to(device=device)
        criterion = CrossEntropyLoss2d().to(device=device)
        optimizer = torch.optim.Adam(
            model.parameters(),
            lr=learning_rate,
            weight_decay=weight_decay,
        )
        summary(model, (3, tile_size[0], tile_size[1]))

        training_stats = utils.Stats()
        # Last input batch seen; stays None if the loader yields nothing.
        last_inputs = None

        for n in range(epochs):
            epoch_stats = utils.Stats()
            loader_with_progress = utils.loader_with_progress(
                train_loader,
                epoch_n=n,
                epoch_total=epochs,
                stats=epoch_stats,
                leave=True)

            # Anything written to stderr inside the loop (presumably the
            # progress bar) is captured in a buffer this function never reads.
            progress_bar_output = io.StringIO()
            with redirect_stderr(progress_bar_output):
                for i, (x, y) in enumerate(loader_with_progress):
                    y = y.to(device=device)
                    x = x.to(device=device)
                    last_inputs = x

                    y_pred = model(x)
                    loss = criterion(y_pred, y)

                    # Read the scalar once; every .item() call copies the
                    # value back to the host.
                    loss_value = loss.item()
                    epoch_stats.append_loss(loss_value)
                    training_stats.append_loss(loss_value)
                    loader_with_progress.set_postfix(epoch_stats.fmt_dict())

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                    writer.add_scalar("training loss", loss_value,
                                      n * len(train_loader) + i)

        time_elapsed = time.time() - since
        print("Training complete in {:.0f}m {:.0f}s".format(
            time_elapsed // 60, time_elapsed % 60))

        # The graph can only be traced with a real input batch; skip it when
        # no batch was ever produced instead of failing on an undefined name.
        if last_inputs is not None:
            writer.add_graph(model, last_inputs)
    finally:
        # Flush and release the event file even if training raised.
        writer.close()

    return model, training_stats
def do_evaluation(config,
                  datasets,
                  len_past,
                  len_future,
                  save_predictions=False,
                  verbose=0):
    """
    Evaluate the given model on all given datasets.

    For every ``(eval_key, batch_size)`` pair the default TF graph is reset,
    a new session is opened, the model is built in "validation" mode, the
    checkpoint is restored and all batches are run. Per-sample metrics are
    fed into ``utils.Stats``; a summary line per dataset is written to
    ``<eval_dir>/evaluation.txt`` and stdout through ``Logger``.

    :param config: Config to create model.
    :param datasets: List of tuples specifying name and batch size per dataset.
    :param len_past: Number of past frames to use (BiRNN only). A negative
        value evaluates the BiRNN on the whole sequence.
    :param len_future: Number of future frames to use (BiRNN only).
    :param save_predictions: Whether or not to save predictions and ground
        truth with ``np.savez_compressed`` in ``eval_dir``.
    :param verbose: Verbosity level; values > 0 print batch progress.
    :raises Exception: "Could not load variables." if building the saver or
        restoring the checkpoint fails (the original error is chained).
    :raises AssertionError: if the number of samples of a dataset is not
        divisible by its batch size.
    """
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    birnn_eval_chunks = False
    # NOTE(review): model_stamp is assembled here but never read again in
    # this function - confirm whether it can be dropped.
    model_stamp = config.get('model_id').split("-")[1]
    eval_str = ""
    if config.get("model_type") == C.MODEL_BIRNN:
        if len_past >= 0:
            print("\nBiRNN is evaluated on chunks: " + str(len_past) + "_" +
                  str(len_future))
            birnn_eval_chunks = True
            eval_str = "past_{}_future_{}_frames".format(len_past, len_future)
            model_stamp += "_p{}_f{}".format(len_past, len_future)
        else:
            print("\nBiRNN is evaluated on the whole sequence.")
            eval_str = "all_frames"
            model_stamp += "_all"

    model_cls = config.model_cls
    dataset_cls = config.dataset_cls

    # Data preprocessing configuration.
    preprocessing_ops = config.get_preprocessing_ops()

    # Logger object.
    logger = Logger(os.path.join(config.get('eval_dir'), "evaluation.txt"),
                    sys.stdout)
    performance_text_format = "*** {} (SIP error): {:.4f} (+/- {:.3f})\n"
    performance_text_over_datasets = "\nSummary of model " + config.get(
        'model_id') + "\n"

    for eval_key, batch_size in datasets:
        logger.print('------------------------------------------')
        logger.print('\nEvaluation on ' + eval_key)
        logger.print('\n------------------------------------------\n')

        # Clean slate.
        tf.reset_default_graph()

        # Check the dataset entry before paying for a session.
        eval_data = config.get(eval_key, None)
        if eval_data is None:
            print("Eval Key {} not found, continue.".format(eval_key))
            continue

        with tf.Session() as sess:
            coord = tf.train.Coordinator()
            queue_threads = []
            prediction_list = []
            gt_list = []

            eval_dataset = dataset_cls(eval_data,
                                       var_len_seq=True,
                                       preprocessing_ops=preprocessing_ops)
            assert eval_dataset.num_samples % batch_size == 0, 'number of samples ({}) must be divisible by batch size ({})'.format(
                eval_dataset.num_samples, batch_size)
            num_eval_iterations = eval_dataset.num_samples // batch_size

            with tf.name_scope(eval_key):
                eval_data_feeder = DataFeederTF(eval_dataset,
                                                1,
                                                batch_size,
                                                queue_capacity=1024,
                                                shuffle=False)
                data_placeholders = eval_data_feeder.batch_queue(
                    dynamic_pad=eval_dataset.is_dynamic,
                    queue_capacity=512,
                    queue_threads=2)

            eval_model = model_cls(config=config,
                                   session=sess,
                                   reuse=False,
                                   mode="validation",
                                   placeholders=data_placeholders,
                                   input_dims=eval_dataset.input_dims,
                                   target_dims=eval_dataset.target_dims,
                                   data_stats=None)
            eval_model.build_graph()

            # Load variables
            try:
                saver = tf.train.Saver()
                # Restore variables.
                if config.get('checkpoint_id') is None:
                    checkpoint_path = tf.train.latest_checkpoint(
                        config.get("model_dir"))
                else:
                    checkpoint_path = os.path.join(
                        config.get("model_dir"), config.get("checkpoint_id"))
                print("Loading model " + checkpoint_path)
                saver.restore(sess, checkpoint_path)
            except Exception as err:
                # Keep the original failure attached as the cause.
                raise Exception("Could not load variables.") from err

            # In case we want to use feed dictionary.
            tf_mask = tf.expand_dims(
                tf.sequence_mask(lengths=data_placeholders[C.PL_SEQ_LEN],
                                 dtype=tf.float32), -1)
            tf_data_fetch = dict()
            tf_data_fetch['targets'] = data_placeholders[C.PL_TARGET]
            tf_data_fetch['mask'] = tf_mask
            tf_data_fetch['inputs'] = data_placeholders[C.PL_INPUT]

            eval_data_feeder.init(sess, coord)
            queue_threads.extend(
                tf.train.start_queue_runners(coord=coord, sess=sess))
            queue_threads.append(eval_data_feeder.enqueue_threads)

            total_loss = 0.0
            # NOTE(review): total_loss_l2 is never accumulated below, so the
            # logged l2 loss is always 0.0 - confirm whether it is still needed.
            total_loss_l2 = 0.0
            n_data = 0
            dof = 9

            # Root replacement value: flattened 3x3 identity when dof == 9,
            # otherwise [1, 0, 0, 0]. Constant, so built once per dataset.
            imu_root = np.reshape(np.eye(3), [-1]) if dof == 9 else np.array(
                [1.0, 0.0, 0.0, 0.0])

            # where the sensors are attached
            tracking_sensors = [4, 5, 18, 19, 0, 15]
            sip_eval_sensors = [1, 2, 16, 17]

            # the remaining "sensors" are evaluation sensors
            all_sensors = utils.SMPL_MAJOR_JOINTS
            remaining_eval_sensors = [
                s for s in all_sensors
                if s not in tracking_sensors and s not in sip_eval_sensors
            ]

            # Print progress roughly five times per dataset when verbose.
            progress_every = max(num_eval_iterations // 5, 1)

            with utils.Stats(tracking_sensors, sip_eval_sensors,
                             remaining_eval_sensors, logger) as stats:
                model_evaluation_ops = dict()
                model_evaluation_ops['loss'] = eval_model.ops_loss
                model_evaluation_ops['mask'] = eval_model.seq_loss_mask
                model_evaluation_ops['targets'] = eval_model.pl_targets
                model_evaluation_ops['prediction'] = eval_model.output_sample
                model_evaluation_ops['orientation'] = eval_model.orientation
                model_evaluation_ops['acceleration'] = eval_model.acceleration

                for i in range(num_eval_iterations):
                    if verbose > 0 and (i + 1) % progress_every == 0:
                        print(str(i + 1) + "/" + str(num_eval_iterations))

                    if birnn_eval_chunks:
                        # Fetch the raw batch and let the model reconstruct
                        # it chunk by chunk.
                        np_batch = sess.run(tf_data_fetch)
                        eval_out = eval_model.model.reconstruct_chunks(
                            input_sequence=np_batch['inputs'],
                            target_sequence=np_batch['targets'],
                            len_past=len_past,
                            len_future=len_future)
                        eval_out['mask'] = np_batch['mask']
                        eval_out['targets'] = np_batch['targets']
                        eval_out['prediction'] = eval_out['sample']
                    else:
                        eval_out = sess.run(model_evaluation_ops, feed_dict={})
                        total_loss += eval_out['loss']['total_loss'] * batch_size

                    n_data += batch_size

                    seq_mask = eval_out['mask'][:, :, 0]
                    pred = undo_smpl(eval_dataset, eval_out['prediction'],
                                     seq_mask)
                    targ = undo_smpl(eval_dataset, eval_out['targets'],
                                     seq_mask)

                    if save_predictions:
                        prediction_list.extend(pred)
                        gt_list.extend(targ)

                    # replace root with sensor data
                    for j in range(batch_size):
                        pred[j][:, :dof] = imu_root
                        targ[j][:, :dof] = imu_root

                        ja_diffs, euc_diffs = utils.compute_metrics(
                            prediction=pred[j:j + 1],
                            target=targ[j:j + 1],
                            compute_positional_error=False)
                        stats.add(ja_diffs, euc_diffs)

                total_loss = total_loss / float(n_data) if n_data > 0 else 0.0
                total_loss_l2 = total_loss_l2 / float(
                    n_data) if n_data > 0 else 0.0

                logger.print('\n*** Loss ***\n')
                logger.print(
                    'average main loss per time step: {}\n'.format(total_loss))
                logger.print('average l2 loss per time step : {}\n'.format(
                    total_loss_l2))

                sip_stats = stats.get_sip_stats()
                performance_text_over_datasets += performance_text_format.format(
                    eval_key, sip_stats[0], sip_stats[1])

            if save_predictions:
                out = {"prediction": prediction_list, "gt": gt_list}
                # NOTE(review): eval_str starts as "" and is never None, so
                # the suffix branch is always taken (non-BiRNN models get a
                # trailing "_"). Left as is to keep existing file names.
                file_name = eval_key + "_" + eval_str if eval_str is not None else eval_key
                np.savez_compressed(
                    os.path.join(config.get("eval_dir"), file_name), **out)

            # Shut the input pipeline down before the session closes.
            sess.run(
                eval_data_feeder.input_queue.close(
                    cancel_pending_enqueues=True))
            coord.request_stop()
            coord.join(queue_threads,
                       ignore_live_threads=True,
                       stop_grace_period_secs=1)

    logger.print(performance_text_over_datasets)
    logger.close()