def _eval(self):
    all_results_1st = eval_coco(self.df, lambda img: detect_one_image(img, self.pred_1st))
    all_results_2nd = eval_coco(self.df, lambda img: detect_one_image(img, self.pred_2nd))
    all_results_3rd = eval_coco(self.df, lambda img: detect_one_image(img, self.pred_3rd))
    output_file_1st = os.path.join(
        logger.get_logger_dir(), '1st_outputs{}.json'.format(self.global_step))
    output_file_2nd = os.path.join(
        logger.get_logger_dir(), '2nd_outputs{}.json'.format(self.global_step))
    output_file_3rd = os.path.join(
        logger.get_logger_dir(), '3rd_outputs{}.json'.format(self.global_step))
    with open(output_file_1st, 'w') as f:
        json.dump(all_results_1st, f)
    with open(output_file_2nd, 'w') as f:
        json.dump(all_results_2nd, f)
    with open(output_file_3rd, 'w') as f:
        json.dump(all_results_3rd, f)
    try:
        scores_1st = print_evaluation_scores(output_file_1st)
        scores_2nd = print_evaluation_scores(output_file_2nd)
        scores_3rd = print_evaluation_scores(output_file_3rd)
    except Exception:
        logger.exception("Exception in COCO evaluation.")
        return
    # NOTE: all three stages report under the same metric names.
    for k, v in scores_1st.items():
        self.trainer.monitors.put_scalar(k, v)
    for k, v in scores_2nd.items():
        self.trainer.monitors.put_scalar(k, v)
    for k, v in scores_3rd.items():
        self.trainer.monitors.put_scalar(k, v)
def offline_evaluate(pred_func, output_file):
    df = get_eval_dataflow()
    all_results = eval_coco(
        df, lambda img: detect_one_image(img, pred_func))
    with open(output_file, 'w') as f:
        json.dump(all_results, f)
    print_evaluation_scores(output_file)
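# `print_evaluation_scores` is defined elsewhere in the project; the sketch below only
# shows one way the dumped detection JSON could be scored with pycocotools, assuming
# COCO-format annotations. The file paths and the 'bbox' IoU type are assumptions.
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval


def score_coco_detections(annotation_file, results_file):
    coco_gt = COCO(annotation_file)                 # ground-truth annotations
    coco_dt = coco_gt.loadRes(results_file)         # detections dumped by eval_coco
    coco_eval = COCOeval(coco_gt, coco_dt, 'bbox')  # evaluate box AP
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()                           # prints the AP/AR summary table
    return coco_eval.stats                          # the 12 standard COCO metrics


# Example (placeholder paths):
# score_coco_detections('annotations/instances_val2017.json', 'outputs1000.json')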
def _eval(self):
    logdir = args.logdir
    if cfg.TRAINER == 'replicated':
        all_results = multithread_eval_coco(self.dataflows, self.predictors)
    else:
        filenames = [os.path.join(
            logdir, 'outputs{}-part{}.json'.format(self.global_step, rank)
        ) for rank in range(hvd.local_size())]
        if self._horovod_run_eval:
            local_results = eval_coco(self.dataflow, self.predictor)
            fname = filenames[hvd.local_rank()]
            with open(fname, 'w') as f:
                json.dump(local_results, f)
        self.barrier.eval()
        if hvd.rank() > 0:
            return
        all_results = []
        for fname in filenames:
            with open(fname, 'r') as f:
                obj = json.load(f)
            all_results.extend(obj)
            os.unlink(fname)
    output_file = os.path.join(
        logdir, 'outputs{}.json'.format(self.global_step))
    with open(output_file, 'w') as f:
        json.dump(all_results, f)
    try:
        scores = print_coco_metrics(output_file)
        for k, v in scores.items():
            self.trainer.monitors.put_scalar(k, v)
    except Exception:
        logger.exception("Exception in COCO evaluation.")
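# The Horovod branch above gathers results through the filesystem: every local worker
# dumps its own 'outputs{step}-part{rank}.json', a barrier op synchronizes, and rank 0
# concatenates the parts into one file. A minimal stdlib-only sketch of that merge step,
# with simulated per-rank results standing in for eval_coco output:
import json
import os
import tempfile


def merge_partial_results(logdir, step, num_parts):
    """Concatenate per-rank part files and delete them, like the rank-0 branch above."""
    all_results = []
    for rank in range(num_parts):
        part = os.path.join(logdir, 'outputs{}-part{}.json'.format(step, rank))
        with open(part, 'r') as f:
            all_results.extend(json.load(f))
        os.unlink(part)
    merged = os.path.join(logdir, 'outputs{}.json'.format(step))
    with open(merged, 'w') as f:
        json.dump(all_results, f)
    return merged


# Simulate four workers, each writing a small partial result list:
with tempfile.TemporaryDirectory() as logdir:
    for rank in range(4):
        with open(os.path.join(logdir, 'outputs10-part{}.json'.format(rank)), 'w') as f:
            json.dump([{'image_id': rank, 'score': 0.5}], f)
    print(merge_partial_results(logdir, 10, num_parts=4))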
def _eval(self):
    all_results = eval_coco(self.df, lambda img: detect_one_image(img, self.pred))
    output_file = os.path.join(
        logger.get_logger_dir(), 'outputs{}.json'.format(self.global_step))
    with open(output_file, 'w') as f:
        json.dump(all_results, f)
    scores = print_evaluation_scores(output_file)
    for k, v in scores.items():
        self.trainer.monitors.put_scalar(k, v)
def offline_evaluate(pred_config, output_file):
    num_gpu = cfg.TRAIN.NUM_GPUS
    graph_funcs = MultiTowerOfflinePredictor(
        pred_config, list(range(num_gpu))).get_predictors()
    predictors = []
    dataflows = []
    for k in range(num_gpu):
        predictors.append(
            lambda img, pred=graph_funcs[k]: detect_one_image(img, pred))
        dataflows.append(get_eval_dataflow(shard=k, num_shards=num_gpu))
    if num_gpu > 1:
        all_results = multithread_eval_coco(dataflows, predictors)
    else:
        all_results = eval_coco(dataflows[0], predictors[0])
    with open(output_file, 'w') as f:
        json.dump(all_results, f)
    print_coco_metrics(output_file)
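# multithread_eval_coco and get_eval_dataflow come from the surrounding project; the
# sketch below only illustrates the shard-then-merge pattern with plain Python lists and
# a thread pool. The toy `fake_predict` function is a placeholder assumption standing in
# for detect_one_image.
import itertools
from concurrent.futures import ThreadPoolExecutor


def fake_predict(item):
    # Stand-in for detect_one_image(img, pred); returns one "detection" per input.
    return {'image_id': item, 'score': 1.0}


def eval_shard(shard):
    return [fake_predict(item) for item in shard]


def sharded_eval(items, num_shards):
    # Round-robin sharding, mirroring get_eval_dataflow(shard=k, num_shards=num_gpu).
    shards = [items[k::num_shards] for k in range(num_shards)]
    with ThreadPoolExecutor(max_workers=num_shards) as executor:
        futures = [executor.submit(eval_shard, s) for s in shards]
        return list(itertools.chain(*[f.result() for f in futures]))


print(len(sharded_eval(list(range(100)), num_shards=4)))  # -> 100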
def _eval(self):
    logdir = args.logdir
    if cfg.TRAINER == 'replicated':
        with ThreadPoolExecutor(max_workers=self.num_predictor,
                                thread_name_prefix='EvalWorker') as executor, \
                tqdm.tqdm(total=sum([df.size() for df in self.dataflows])) as pbar:
            futures = []
            for dataflow, pred in zip(self.dataflows, self.predictors):
                futures.append(
                    executor.submit(eval_coco, dataflow, pred, pbar))
            all_results = list(
                itertools.chain(*[fut.result() for fut in futures]))
    else:
        filenames = [
            os.path.join(
                logdir, 'outputs{}-part{}.json'.format(self.global_step, rank))
            for rank in range(hvd.local_size())
        ]
        if self._horovod_run_eval:
            local_results = eval_coco(self.dataflow, self.predictor)
            fname = filenames[hvd.local_rank()]
            with open(fname, 'w') as f:
                json.dump(local_results, f)
        self.barrier.eval()
        if hvd.rank() > 0:
            return
        all_results = []
        for fname in filenames:
            with open(fname, 'r') as f:
                obj = json.load(f)
            all_results.extend(obj)
            os.unlink(fname)
    output_file = os.path.join(logdir, 'outputs{}.json'.format(self.global_step))
    with open(output_file, 'w') as f:
        json.dump(all_results, f)
    try:
        scores = print_evaluation_scores(output_file)
        for k, v in scores.items():
            self.trainer.monitors.put_scalar(k, v)
    except Exception:
        logger.exception("Exception in COCO evaluation.")
def _eval(self):
    logdir = args.logdir
    if cfg.TRAINER == 'replicated':
        with ThreadPoolExecutor(max_workers=self.num_predictor,
                                thread_name_prefix='EvalWorker') as executor, \
                tqdm.tqdm(total=sum([df.size() for df in self.dataflows])) as pbar:
            futures = []
            for dataflow, pred in zip(self.dataflows, self.predictors):
                futures.append(executor.submit(eval_coco, dataflow, pred, pbar))
            all_results = list(itertools.chain(*[fut.result() for fut in futures]))
    else:
        if self._horovod_run_eval:
            local_results = eval_coco(self.dataflow, self.predictor)
            output_partial = os.path.join(
                logdir, 'outputs{}-part{}.json'.format(self.global_step, hvd.local_rank()))
            with open(output_partial, 'w') as f:
                json.dump(local_results, f)
        self.barrier.eval()
        if hvd.rank() > 0:
            return
        all_results = []
        for k in range(hvd.local_size()):
            output_partial = os.path.join(
                logdir, 'outputs{}-part{}.json'.format(self.global_step, k))
            with open(output_partial, 'r') as f:
                obj = json.load(f)
            all_results.extend(obj)
            os.unlink(output_partial)
    output_file = os.path.join(
        logdir, 'outputs{}.json'.format(self.global_step))
    with open(output_file, 'w') as f:
        json.dump(all_results, f)
    try:
        scores = print_evaluation_scores(output_file)
        for k, v in scores.items():
            self.trainer.monitors.put_scalar(k, v)
    except Exception:
        logger.exception("Exception in COCO evaluation.")
def offline_evaluate(pred_config, output_file):
    num_gpu = cfg.TRAIN.NUM_GPUS
    graph_funcs = MultiTowerOfflinePredictor(
        pred_config, list(range(num_gpu))).get_predictors()
    predictors = []
    for k in range(num_gpu):
        predictors.append(
            lambda img, pred=graph_funcs[k]: detect_one_image(img, pred))
    for dataset in cfg.DATA.VAL:
        logger.info("Evaluating {} ...".format(dataset))
        dataflows = [
            get_eval_dataflow(dataset, shard=k, num_shards=num_gpu)
            for k in range(num_gpu)
        ]
        if num_gpu > 1:
            all_results = multithread_eval_coco(dataflows, predictors)
        else:
            all_results = eval_coco(dataflows[0], predictors[0])
        output = output_file + '-' + dataset
        with open(output, 'w') as f:
            json.dump(all_results, f)
        print_coco_metrics(dataset, output)
def _eval(self):
    if cfg.TRAINER == 'replicated':
        with ThreadPoolExecutor(max_workers=self.num_predictor,
                                thread_name_prefix='EvalWorker') as executor, \
                tqdm.tqdm(total=sum([df.size() for df in self.dataflows])) as pbar:
            futures = []
            for dataflow, pred in zip(self.dataflows, self.predictors):
                futures.append(
                    executor.submit(eval_coco, dataflow, pred, pbar))
            all_results = list(
                itertools.chain(*[fut.result() for fut in futures]))
    else:
        local_results = eval_coco(self.dataflow, self.predictor)
        results_as_arr = np.frombuffer(dumps(local_results), dtype=np.uint8)
        sizes, concat_arrs = tf.get_default_session().run(
            [self.string_lens, self.concat_results],
            feed_dict={self.local_result_tensor: results_as_arr})
        if hvd.rank() > 0:
            return
        all_results = []
        start = 0
        for size in sizes:
            substr = concat_arrs[start:start + size]
            results = loads(substr.tobytes())
            all_results.extend(results)
            start = start + size
    output_file = os.path.join(
        logger.get_logger_dir(), 'outputs{}.json'.format(self.global_step))
    with open(output_file, 'w') as f:
        json.dump(all_results, f)
    try:
        scores = print_evaluation_scores(output_file)
        for k, v in scores.items():
            self.trainer.monitors.put_scalar(k, v)
    except Exception:
        logger.exception("Exception in COCO evaluation.")
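# The branch above avoids the filesystem: each worker serializes its results into a
# uint8 tensor, an allgather-style op (self.concat_results / self.string_lens, built
# elsewhere) concatenates the byte buffers and their lengths, and rank 0 splits them
# back apart. A NumPy-only sketch of that encode/concat/decode round trip, with pickle
# standing in for the project's dumps/loads helpers:
import pickle

import numpy as np


def encode(results):
    return np.frombuffer(pickle.dumps(results), dtype=np.uint8)


def decode_concatenated(sizes, concat_arr):
    all_results, start = [], 0
    for size in sizes:
        chunk = concat_arr[start:start + size]
        all_results.extend(pickle.loads(chunk.tobytes()))
        start += size
    return all_results


# Simulate three workers' local results:
per_worker = [[{'rank': r, 'score': 0.1 * r}] for r in range(3)]
encoded = [encode(res) for res in per_worker]
sizes = [len(arr) for arr in encoded]        # what self.string_lens would hold
concat = np.concatenate(encoded)             # what self.concat_results would hold
print(decode_concatenated(sizes, concat))    # recovers all workers' results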
def main():
    data_type = 'coco'
    data_root_dir = '/data/data_coco/'
    # model_depth = 50
    epoch_max = 100
    batch_size = 8
    if data_type == 'coco':
        dataset_train = CocoDataset(data_root_dir, set_name='train2017',
                                    transform=transforms.Compose(
                                        [Normalizer(), Augmenter(), Resizer()]))
        dataset_val = CocoDataset(data_root_dir, set_name='val2017',
                                  transform=transforms.Compose(
                                      [Normalizer(), Resizer()]))
    else:
        print('Only the COCO dataset is supported for now.')
        return
    sampler = AspectRatioBasedSampler(dataset_train, batch_size=batch_size, drop_last=True)
    loader_train = DataLoader(dataset_train, num_workers=8, collate_fn=collater,
                              batch_sampler=sampler)
    sampler_val = AspectRatioBasedSampler(dataset_val, batch_size=batch_size, drop_last=True)
    loader_val = DataLoader(dataset_val, num_workers=8, collate_fn=collater,
                            batch_sampler=sampler_val)
    retinanet = model.retinanet_50(dataset_train.num_classes(), pretrained=True)
    retinanet = retinanet.cuda()
    optimizer = torch.optim.Adam(retinanet.parameters(), lr=1e-4)
    # optimizer = torch.optim.SGD(retinanet.parameters(), lr=1e-4, momentum=0.9, weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3,
                                                           verbose=True, factor=0.5)
    model_pretrain_dir = './model/model_final.pt'
    if os.path.exists(model_pretrain_dir):
        print('Pretrained model found, loading it.')
        retinanet = torch.load(model_pretrain_dir)
    print('train images num: {}'.format(len(loader_train) * batch_size))
    for epoch_num in range(epoch_max):
        retinanet.train()
        epoch_loss = []
        for iter_num, data in enumerate(loader_train):
            optimizer.zero_grad()
            input_tensor = [data['img'].cuda().float(), data['annot']]
            classification_loss, regression_loss = retinanet(input_tensor)
            classification_loss = classification_loss.mean()
            regression_loss = regression_loss.mean()
            loss = classification_loss + regression_loss
            epoch_loss.append(float(loss))
            if loss.item() == 0:
                continue
            loss.backward()
            optimizer.step()
            print(
                'Epoch:{}/{} | Iters:{}/{} | C loss:{:.4f} | R loss:{:.4f} | Mean loss:{:.4f} | Current LR:{:.7f}'
                .format(epoch_num + 1, epoch_max, iter_num + 1, len(loader_train),
                        float(classification_loss), float(regression_loss),
                        np.mean(epoch_loss), optimizer.param_groups[0]['lr']))
            del classification_loss
            del regression_loss
        # Run COCO validation once per epoch.
        eval.eval_coco(dataset_val, retinanet)
        scheduler.step(np.mean(epoch_loss))
        torch.save(
            retinanet, './model/{}_retinanet_{}.pt'.format(data_type, epoch_num + 1))
    retinanet.eval()
    torch.save(retinanet, './model/model_final.pt')
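# main() checkpoints the whole module object (torch.save(retinanet, ...)), which ties the
# file to the exact class and module layout at save time. A common alternative, sketched
# here with a tiny placeholder model rather than the RetinaNet above, is to checkpoint
# only the state_dict:
import torch
import torch.nn as nn

model = nn.Linear(4, 2)                                  # placeholder for retinanet
torch.save(model.state_dict(), 'model_final_state.pt')   # weights only

restored = nn.Linear(4, 2)                               # rebuild the architecture first
restored.load_state_dict(torch.load('model_final_state.pt'))
restored.eval()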