def GetStats(self, cur_iter, lr):
    eta_seconds = self.iter_timer.average_time * (cfg.SOLVER.MAX_ITER - cur_iter)
    eta = str(datetime.timedelta(seconds=int(eta_seconds)))
    mem_stats = c2_py_utils.GetGPUMemoryUsageStats()
    mem_usage = np.max(mem_stats['max_by_gpu'][:cfg.NUM_GPUS])
    stats = dict(
        iter=cur_iter,
        lr=float(lr),
        time=self.iter_timer.average_time,
        loss=self.smoothed_total_loss.GetMedianValue(),
        eta=eta,
        mb_qsize=int(np.round(self.smoothed_mb_qsize.GetMedianValue())),
        mem=int(np.ceil(mem_usage / 1024 / 1024)),
    )
    if cfg.TRAIN.DA_FADE_IN:
        stats['da_weight'] = self.model.da_fade_in.get_weight()
    if cfg.TRAIN.PADA:
        db = self.model.class_weight_db
        stats['avg_pada_weight'] = db.get_avg_pada_weight()
        stats['total_detects'] = db.total_sum_softmax.sum() / 2
        stats['KL_div'] = db.get_KL_to_init()
        stats['accuracy_fg'] = db.fg_acc.get()
        stats['acc_fg_weighted'] = db.weighted_fg_acc.get()

        target_dist = db.get_dist()
        print('target_dist: {}'.format(list(target_dist)))
        class_weights = db.class_weights
        print('class_weights: {}'.format(list(class_weights)))

        # list() keeps this correct under Python 3, where dict.values()
        # returns a view that np.array would wrap as a 0-d object array.
        classes = np.array(
            list(dummy_datasets.get_coco_dataset().classes.values()),
            dtype=str)
        # Rank once by target distribution and once by class weight; each
        # pass prints the top-5 classes with their target probability and
        # class weight.
        for dist in [target_dist, class_weights]:
            order = np.argsort(dist)[::-1]
            o_target_dist = target_dist[order]
            o_classes = classes[order]
            cwo = class_weights[order]
            print("dist tops: ", end='')
            for prob, w, c in list(zip(o_target_dist, cwo, o_classes))[:5]:
                print("{}:{:.3f} ({:.3f})".format(c, prob, w), end='; ')
            print()
        print()
    for k, v in self.smoothed_losses_and_metrics.items():
        stats[k] = v.GetMedianValue()
    return stats
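# The trackers used above (smoothed_total_loss, smoothed_mb_qsize, and the
# entries of smoothed_losses_and_metrics) expose AddValue, GetMedianValue,
# and GetAverageValue. A minimal sketch of that contract, assuming a
# fixed-size window over recent scalars -- inferred from how the training
# loop uses it, not the verbatim Detectron class:
from collections import deque

import numpy as np


class SmoothedValue(object):
    """Tracks the last `window_size` scalars and reports median/average."""

    def __init__(self, window_size):
        self.deque = deque(maxlen=window_size)

    def AddValue(self, value):
        self.deque.append(value)

    def GetMedianValue(self):
        return float(np.median(self.deque))

    def GetAverageValue(self):
        return float(np.mean(self.deque))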
def Caffe2LSTM(args):
    T = args.data_size // args.batch_size
    input_blob_shape = [args.seq_length, args.batch_size, args.input_dim]
    queue, label_queue = generate_data(
        T // args.seq_length, input_blob_shape, args.hidden_dim)

    workspace.FeedBlob(
        "seq_lengths",
        np.array([args.seq_length // 2] * args.batch_size, dtype=np.int32))

    model, output = create_model(args, queue, label_queue, input_blob_shape)
    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    last_time = time.time()
    start_time = last_time
    num_iters = T // args.seq_length
    entries_per_iter = args.seq_length * args.batch_size

    # Run the benchmark
    log.info("------ Starting benchmark ------")
    for iteration in range(0, num_iters, args.iters_to_report):
        iters_once = min(args.iters_to_report, num_iters - iteration)
        workspace.RunNet(model.net.Proto().name, iters_once)

        new_time = time.time()
        log.info("Iter: {} / {}. Entries Per Second: {}k.".format(
            iteration,
            num_iters,
            entries_per_iter * iters_once / (new_time - last_time) // 1000,
        ))
        last_time = new_time

    log.info("Done. Total EPS: {}k".format(
        entries_per_iter * num_iters / (time.time() - start_time) // 1000))

    if args.gpu:
        log.info("Memory stats:")
        stats = utils.GetGPUMemoryUsageStats()
        log.info("GPU memory:\t{} MB".format(stats['max_total'] / 1024 / 1024))
        if stats['max_total'] != stats['total']:
            log.warning(
                "Max usage differs from current total usage: {} > {}".format(
                    stats['max_total'], stats['total']))
            log.warning("This means that costly deallocations occurred.")
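# Both the benchmark and the training code only read three keys of the dict
# returned by GetGPUMemoryUsageStats(): 'max_by_gpu', 'total', and
# 'max_total'. A hypothetical stub with that shape, handy for exercising the
# logging paths on a machine without GPUs (the real Caffe2 helper queries
# the CUDA allocator):
import numpy as np


def fake_gpu_memory_stats(num_gpus=2):
    max_by_gpu = np.random.randint(1 << 28, 1 << 30, size=num_gpus)
    total = int(max_by_gpu.sum())
    return {
        'max_by_gpu': max_by_gpu,  # peak bytes per GPU
        'total': total,            # current total bytes across GPUs
        'max_total': total,        # peak total bytes across GPUs
    }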
def GetStats(self, cur_iter, lr):
    eta_seconds = self.iter_timer.average_time * (
        cfg.SOLVER.NUM_ITERATIONS - cur_iter)
    eta = str(datetime.timedelta(seconds=int(eta_seconds)))
    mem_stats = c2_py_utils.GetGPUMemoryUsageStats()
    mem_usage = np.max(mem_stats['max_by_gpu'][:cfg.NUM_DEVICES])
    stats = dict(
        iter=cur_iter,
        lr=float(lr),
        time=self.iter_timer.average_time,
        loss=self.smoothed_total_loss.GetMedianValue(),
        eta=eta,
        mb_qsize=int(np.round(self.smoothed_mb_qsize.GetMedianValue())),
        mem=int(np.ceil(mem_usage / 1024 / 1024)),
    )
    for k, v in self.smoothed_losses_and_metrics.items():
        stats[k] = v.GetMedianValue()
    return stats
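# The ETA string is just the average iteration time multiplied by the
# remaining iteration count, truncated to whole seconds. A quick check with
# hypothetical numbers:
import datetime

avg_iter_time = 0.85                  # seconds per iteration
remaining = 90000 - 20000             # NUM_ITERATIONS - cur_iter
eta_seconds = avg_iter_time * remaining
print(str(datetime.timedelta(seconds=int(eta_seconds))))  # '16:31:40'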
def LogIterStats(self, cur_iter, lr):
    """Log the tracked statistics."""
    if (cur_iter % self.LOG_PERIOD == 0 or
            cur_iter == cfg.SOLVER.MAX_ITER - 1):
        eta_seconds = self.iter_timer.average_time * (
            cfg.SOLVER.MAX_ITER - cur_iter)
        eta = str(datetime.timedelta(seconds=int(eta_seconds)))
        mem_stats = c2_py_utils.GetGPUMemoryUsageStats()
        mem_usage = np.max(mem_stats['max_by_gpu'][:cfg.NUM_GPUS])
        stats = dict(
            iter=cur_iter,
            lr=float(lr),
            time=self.iter_timer.average_time,
            loss=self.smoothed_total_loss.GetMedianValue(),
            eta=eta,
            mb_qsize=int(np.round(self.smoothed_mb_qsize.GetMedianValue())),
            mem=int(np.ceil(mem_usage / 1024 / 1024)),
        )
        for k, v in self.smoothed_losses_and_metrics.items():
            stats[k] = v.GetMedianValue()
        log_json_stats(stats)
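# log_json_stats is assumed to emit the stats dict as a single JSON line,
# optionally appending it to a log file (as net_trainer below does via
# json_out_file). A minimal sketch of that behavior, not the exact
# implementation:
import json


def log_json_stats(stats, json_out_file=None):
    line = 'json_stats: {:s}'.format(json.dumps(stats, sort_keys=True))
    print(line)
    if json_out_file is not None:
        with open(json_out_file, 'a') as f:
            f.write(line + '\n')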
def GetStats(self, cur_iter, lr):
    num_iter_per_epoch = self.model.roi_data_loader.get_num_iter_per_epoch()
    eta_seconds = self.iter_timer.average_time * (
        cfg.SOLVER.MAX_ITER * num_iter_per_epoch - cur_iter)
    eta = str(datetime.timedelta(seconds=int(eta_seconds)))
    mem_stats = c2_py_utils.GetGPUMemoryUsageStats()
    mem_usage = np.max(mem_stats['max_by_gpu'][:cfg.NUM_GPUS])
    stats = dict(
        iter=cur_iter,
        lr=float(lr),
        time=self.iter_timer.average_time,
        loss=self.smoothed_total_loss.GetAverageValue(),
        eta=eta,
        mb_qsize=int(np.round(self.smoothed_mb_qsize.GetAverageValue())),
        mem=int(np.ceil(mem_usage / 1024 / 1024)),
    )
    for k, v in self.smoothed_losses_and_metrics.items():
        stats[k] = v.GetAverageValue()
    return stats
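# In this variant cfg.SOLVER.MAX_ITER counts epochs, so the remaining work
# has to be converted to iterations before estimating the ETA. Hypothetical
# numbers showing the arithmetic:
max_epochs = 12             # cfg.SOLVER.MAX_ITER, interpreted as epochs here
num_iter_per_epoch = 7500   # reported by the ROI data loader
cur_iter = 30000
remaining_iters = max_epochs * num_iter_per_epoch - cur_iter  # 60000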
def Caffe2LSTM(args):
    T = args.data_size // args.batch_size
    input_blob_shape = [args.seq_length, args.batch_size, args.input_dim]
    queue, label_queue, entry_counts = generate_data(
        T // args.seq_length, input_blob_shape, args.hidden_dim,
        args.fixed_shape)

    workspace.FeedBlob(
        "seq_lengths",
        np.array([args.seq_length] * args.batch_size, dtype=np.int32))

    model, output = create_model(args, queue, label_queue, input_blob_shape)
    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    num_iters = T // args.seq_length
    total_iters = 0

    # Warm up before timing so allocator and executor setup costs are
    # excluded from the benchmark numbers.
    log.info("------ Warming up ------")
    workspace.RunNet(model.net.Proto().name)

    if args.gpu:
        log.info("Memory stats:")
        stats = utils.GetGPUMemoryUsageStats()
        log.info("GPU memory:\t{} MB".format(stats['max_total'] / 1024 / 1024))

    log.info("------ Starting benchmark ------")
    start_time = time.time()
    last_time = time.time()
    for iteration in range(1, num_iters, args.iters_to_report):
        iters_once = min(args.iters_to_report, num_iters - iteration)
        total_iters += iters_once
        workspace.RunNet(model.net.Proto().name, iters_once)

        new_time = time.time()
        log.info("Iter: {} / {}. Entries Per Second: {}k.".format(
            iteration,
            num_iters,
            np.sum(entry_counts[iteration:iteration + iters_once]) /
            (new_time - last_time) // 100 / 10,
        ))
        last_time = new_time

    log.info("Done. Total EPS excluding 1st iteration: {}k {}".format(
        np.sum(entry_counts[1:]) / (time.time() - start_time) // 100 / 10,
        " (with RNN executor)" if args.rnn_executor else "",
    ))

    if args.gpu:
        log.info("Memory stats:")
        stats = utils.GetGPUMemoryUsageStats()
        log.info("GPU memory:\t{} MB".format(stats['max_total'] / 1024 / 1024))
        if stats['max_total'] != stats['total']:
            log.warning(
                "Max usage differs from current total usage: {} > {}".format(
                    stats['max_total'], stats['total']))
            log.warning("This means that costly deallocations occurred.")

    return time.time() - start_time
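# The `// 100 / 10` in the logging above truncates the entries-per-second
# figure to one decimal place when expressed in thousands, whereas the older
# benchmark's `// 1000` reports whole thousands only. For example:
eps = 123456.7           # raw entries per second
print(eps // 100 / 10)   # 123.4 -> logged as "123.4k"
print(eps // 1000)       # 123.0 -> logged as "123.0k"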
def net_trainer():
    model, start_iter, checkpoints = create_model()
    if 'final' in checkpoints:
        return checkpoints

    add_model_inputs(model)

    if cfg.TRAIN.WEIGHTS:
        nu.initialize_gpu_0_from_weights_file(model, cfg.TRAIN.WEIGHTS)
    # Even if we're randomly initializing we still need to synchronize
    # parameters across GPUs
    nu.broadcast_parameters(model)
    workspace.CreateNet(model.net)

    output_dir = get_output_dir(training=True)
    logger.info('Outputs saved to: {:s}'.format(os.path.abspath(output_dir)))
    dump_proto_files(model, output_dir)
    json_out_file = os.path.join(output_dir, 'json_stats.log')

    # Start loading mini-batches and enqueuing blobs
    model.roi_data_loader.register_sigint_handler()

    # DEBUG data loading
    if cfg.DEBUG.DATA_LOADING:
        for _ in range(10000000):
            # Exercises the multiprocessing loader directly (the threaded
            # variant was _get_next_minibatch()).
            model.roi_data_loader._get_next_minibatch2(
                model.roi_data_loader.shared_readonly_dict,
                model.roi_data_loader._lock,
                model.roi_data_loader.mp_cur,
                model.roi_data_loader.mp_perm)
        sys.exit(0)

    model.roi_data_loader.start(prefill=True)

    smoothed_values = {
        key: SmoothedValue(WIN_SZ) for key in model.losses + model.metrics}
    iter_values = {key: 0 for key in model.losses + model.metrics}
    total_loss = SmoothedValue(WIN_SZ)
    iter_time = SmoothedValue(WIN_SZ)
    mb_qsize = SmoothedValue(WIN_SZ)
    iter_timer = Timer()
    checkpoints = {}

    for i in range(start_iter, cfg.SOLVER.MAX_ITER):
        iter_timer.tic()
        lr = model.UpdateWorkspaceLr(i)
        workspace.RunNet(model.net.Proto().name)
        if i == start_iter:
            nu.print_net(model)
        iter_time.AddValue(iter_timer.toc(average=False))

        # Losses are summed across GPUs; metrics are averaged.
        for k in iter_values.keys():
            if k in model.losses:
                iter_values[k] = nu.sum_multi_gpu_blob(k)
            else:
                iter_values[k] = nu.average_multi_gpu_blob(k)
        for k, v in smoothed_values.items():
            v.AddValue(iter_values[k])
        loss = np.sum(np.array([iter_values[k] for k in model.losses]))
        total_loss.AddValue(loss)
        mb_qsize.AddValue(model.roi_data_loader._minibatch_queue.qsize())

        if i % LOG_PERIOD == 0 or i == cfg.SOLVER.MAX_ITER - 1:
            eta_seconds = iter_timer.average_time * (cfg.SOLVER.MAX_ITER - i)
            eta = str(datetime.timedelta(seconds=int(eta_seconds)))
            mem_stats = c2_utils.GetGPUMemoryUsageStats()
            mem_usage = np.max(mem_stats['max_by_gpu'][:cfg.NUM_GPUS])
            stats = dict(
                iter=i,
                lr=float(lr),
                time=iter_timer.average_time,
                loss=total_loss.GetMedianValue(),
                eta=eta,
                mb_qsize=int(np.round(mb_qsize.GetMedianValue())),
                mem=int(np.ceil(mem_usage / 1024 / 1024)))
            for k, v in smoothed_values.items():
                stats[k] = v.GetMedianValue()
            log_json_stats(stats, json_out_file=json_out_file)

        if cfg.DEBUG.STOP_TRAIN_ITER:
            import pdb
            pdb.set_trace()

        if ((i + 1) % int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS) == 0 and
                i > start_iter):
            checkpoints[i] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(i))
            nu.save_model_to_weights_file(checkpoints[i], model)

        if i == start_iter + LOG_PERIOD:
            # Reset the iter timer after the first LOG_PERIOD iterations to
            # discard initial iterations that have outlier timings
            iter_timer.reset()

        if np.isnan(loss):
            logger.critical('Loss is NaN, exiting...')
            os._exit(0)  # FB: use code 0 to avoid flow retries

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
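# The training loop above assumes a Timer with tic/toc bookkeeping and a
# running average_time attribute. A minimal sketch of that contract,
# inferred from usage (tic starts a span; toc ends it and returns either the
# running average or the last span's duration):
import time


class Timer(object):
    def __init__(self):
        self.reset()

    def tic(self):
        self.start_time = time.time()

    def toc(self, average=True):
        self.diff = time.time() - self.start_time
        self.total_time += self.diff
        self.calls += 1
        self.average_time = self.total_time / self.calls
        return self.average_time if average else self.diff

    def reset(self):
        self.total_time = 0.0
        self.calls = 0
        self.start_time = 0.0
        self.diff = 0.0
        self.average_time = 0.0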