def log(self, runner):
    log_dict = OrderedDict()
    log_dict['epoch'] = runner.epoch
    log_dict['losses'] = runner.losses
    if self.by_epoch:
        log_dict['iter'] = runner.inner_iter
    else:
        log_dict['iter'] = runner.iter
    # TODO: learning rate info
    # TODO: memory usage info (cpu, gpu)
    log_items = []
    for name, val in log_dict.items():
        if isinstance(val, Tensor) or is_list_of(val, Tensor):
            if isinstance(val, list):
                val = [float(item.item()) for item in val]
            else:
                val = float(val.item())
            aver_val = self.pool.update(name, val)
            if isinstance(val, list):
                val = ", ".join(["{:.5f}".format(item) for item in val])
                aver_val = ", ".join(
                    ["{:.5f}".format(item) for item in aver_val])
            else:
                val = "{:.5f}".format(val)
                aver_val = "{:.5f}".format(aver_val)
            log_items.append("{}: [{}], {}_ma: [{}]".format(
                name, val, name, aver_val))
        else:
            log_items.append(f'{name}: {val}')
    log_str = ', '.join(log_items)
    runner.logger.info(log_str)
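# Illustrative only: with the format strings above, a single log line for an
# epoch-based run with one (hypothetical) scalar loss tensor would look like
#
#     epoch: 3, losses: [0.01234], losses_ma: [0.01300], iter: 250
#
# where `losses_ma` is the moving average maintained by `self.pool`.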
def evaluate(self, results):
    """Evaluate with different metrics.

    Args:
        results (list of dict): each dict records metric -> value for one frame.

    Return:
        dict: Evaluation results dict.
    """
    assert is_list_of(results, dict), (
        f'results must be a list of dict, but got {type(results)}')
    assert len(results) >= len(self), (
        "results length should be >= dataset length, due to multi-card eval")
    self.logger.info(
        "eval samples length: {}, dataset length: {}, only select the front {} results"
        .format(len(results), len(self), len(self)))
    results = results[:len(self)]

    eval_results = defaultdict(list)  # a dict of lists
    for res in results:
        for metric, val in res.items():
            eval_results[metric].append(val)
    for metric, val_list in eval_results.items():
        assert len(val_list) == len(self), (
            f'Length of evaluation result of {metric} is {len(val_list)}, '
            f'should be {len(self)}')

    # average the results
    eval_results = {
        metric: sum(values) / len(self)
        for metric, values in eval_results.items()
    }
    return eval_results
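# A minimal sketch of the expected input/output (the dataset instance and the
# metric names are hypothetical):
#
#     results = [{'PSNR': 30.2, 'SSIM': 0.91},   # frame 0
#                {'PSNR': 29.8, 'SSIM': 0.90}]   # frame 1
#     dataset.evaluate(results)
#     # -> {'PSNR': 30.0, 'SSIM': 0.905} for a dataset of length 2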
def evaluate(self, results, save_path):
    """Evaluate with different metrics.

    Args:
        results (list of dict): each dict records metric -> value for one frame.

    Return:
        dict: Evaluation results dict.
    """
    assert is_list_of(results, dict), (
        f'results must be a list of dict, but got {type(results)}')
    assert len(results) >= len(self), (
        "results length should be >= dataset length, due to multi-card eval")
    self.logger.info(
        "eval samples length: {}, dataset length: {}, only select the front {} results"
        .format(len(results), len(self), len(self)))
    results = results[:len(self)]

    eval_results = defaultdict(list)  # a dict of lists
    for res in results:
        # find this result's class id
        class_id = res['class_id']
        # accumulate per-metric and per-class statistics
        for metric, val in res.items():
            if "id" in metric:
                continue
            eval_results[metric].append(val)
            eval_results[metric + "_" + str(class_id)].append(val)
            if val > 5:
                eval_results[metric + "_" + str(class_id) +
                             "_more_than_5_nums"].append(1.0)

    # for metric, val_list in eval_results.items():
    #     assert len(val_list) == len(self), (
    #         f'Length of evaluation result of {metric} is {len(val_list)}, '
    #         f'should be {len(self)}')

    # average the results ("..._nums" entries are counts, not averages)
    eval_results = {
        metric: (sum(values) / len(values) if "more" not in metric else sum(values))
        for metric, values in eval_results.items()
    }

    # update the moving-average pooling window
    for metric, value in eval_results.items():
        self.pooling[metric].append(value)
        if len(self.pooling[metric]) > self.moving_average_len:
            # remove the oldest entry
            self.pooling[metric].pop(0)

    # moving-averaged eval_results
    eval_results = {
        metric: sum(values) / len(values)
        for metric, values in self.pooling.items()
    }
    return eval_results
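# Note: this assumes `self.pooling` is a defaultdict(list) and
# `self.moving_average_len` is the window size. A sketch of the window update
# for one metric with moving_average_len = 3 (values are hypothetical):
#
#     window: [0.8]            -> reported 0.8
#     window: [0.8, 0.6]       -> reported 0.7
#     window: [0.8, 0.6, 0.7]  -> reported 0.7
#     window: [0.6, 0.7, 0.9]  -> oldest value dropped, reported ~0.733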
def _is_in(param_group, param_group_list):
    assert is_list_of(param_group_list, dict)
    param = set(param_group['params'])
    param_set = set()
    for group in param_group_list:
        param_set.update(set(group['params']))
    return not param.isdisjoint(param_set)
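# Usage sketch (param ids are hypothetical): `_is_in` reports whether a new
# param group shares any parameter with groups that were already registered.
#
#     existing = [{'params': ['w1', 'b1']}, {'params': ['w2']}]
#     _is_in({'params': ['w2', 'b2']}, existing)   # True  ('w2' overlaps)
#     _is_in({'params': ['w3']}, existing)         # False (disjoint)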
def run(self, data_loaders, workflow, max_epochs):
    """Start running.

    Args:
        data_loaders (list[:obj:`DataLoader`]): Dataloaders for training and test.
        workflow (list[tuple]): A list of (phase, epochs) to specify the
            running order and epochs. E.g., [('train', 10), ('test', 1)] means
            running 10 epochs for training and 1 epoch for test, iteratively.
        max_epochs (int): Total training epochs.
    """
    assert isinstance(data_loaders, list)
    assert is_list_of(workflow, tuple)
    assert len(data_loaders) == len(workflow)

    self._max_epochs = max_epochs
    for i, flow in enumerate(workflow):
        mode, epochs = flow
        if mode == 'train':
            self._max_iters = self._max_epochs * len(data_loaders[i])
            self._iter = self.epoch * len(data_loaders[i])
            self.logger.info(
                "{} iters for one epoch, trained iters: {}, total iters: {}"
                .format(len(data_loaders[i]), self._iter, self._max_iters))
            # Unlike torch and paddle, there is no need to set the epoch on the
            # distributed sampler to keep the data consistent; its indices are
            # generated by np.random.RandomState(seed).
            # data_loaders[i].sampler.epoch = self.epoch
            break

    self.logger.info(
        "Start running, work_dir: {}, workflow: {}, max epochs for train: {}"
        .format(self.work_dir, workflow, max_epochs))
    self.logger.info("registered hooks: " + str(self.hooks))
    self.call_hook('before_run')

    while self.epoch < max_epochs:
        for i, flow in enumerate(workflow):
            mode, epochs = flow
            if isinstance(mode, str):
                if not hasattr(self, mode):
                    raise ValueError(
                        f'runner has no method named "{mode}" to run an epoch')
                epoch_runner = getattr(self, mode)
            else:
                raise TypeError(
                    'mode in workflow must be a str, but got {}'.format(
                        type(mode)))
            for _ in range(epochs):
                if mode == 'train' and self.epoch >= max_epochs:
                    return
                epoch_runner(data_loaders[i])

    time.sleep(1)  # wait for some hooks like loggers to finish
    self.call_hook('after_run')
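# Usage sketch (illustrative; `runner`, `train_loader` and `test_loader` are
# assumed to be built elsewhere):
#
#     runner.run(
#         data_loaders=[train_loader, test_loader],
#         workflow=[('train', 10), ('test', 1)],
#         max_epochs=100,
#     )
#
# i.e. alternate 10 training epochs and 1 test epoch until 100 training
# epochs have been reached.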
def after_train_iter(self, runner):
    """The behavior after each train iteration.

    Args:
        runner (``edit.core.runner.BaseRunner``): The runner.
    """
    if not self.every_n_iters(runner, self.interval):
        return
    # for key, para in runner.model.generator.named_parameters():
    #     para.requires_grad = False
    self.logger.info("start to eval for iter: {}".format(runner.iter + 1))
    save_path = os.path.join(self.save_path,
                             "iter_{}".format(runner.iter + 1))
    mkdir_or_exist(save_path)
    results = []  # list of dict
    sample_nums_all_threads = 0
    for _, data in enumerate(self.dataloader):
        batchdata = data
        sample_nums_for_one_thread = batchdata[0].shape[0]
        outputs = runner.model.test_step(
            batchdata,
            save_image=self.save_image,
            save_path=save_path,
            sample_id=sample_nums_all_threads +
            sample_nums_for_one_thread * self.local_rank)
        if self.nranks > 1:
            # TODO: must run on GPU; gather outputs and data from all workers
            # gathered_outputs = xxx
            # gathered_batchdata = xxx
            pass
        else:
            gathered_outputs = outputs  # list of tensor
            gathered_batchdata = batchdata  # list of numpy
        # batch dimensions must match
        assert gathered_batchdata[0].shape[0] == gathered_outputs[0].shape[0]
        # make sure these are the gathered results
        assert gathered_batchdata[0].shape[0] == sample_nums_for_one_thread * self.nranks
        sample_nums_all_threads += gathered_outputs[0].shape[0]

        # Currently every process runs the forward pass and saves results, and
        # rank 0 computes the metrics; later, add CPU inter-process
        # communication so that metric computation is also spread across processes.
        if self.local_rank == 0:
            result = runner.model.cal_for_eval(gathered_outputs,
                                               gathered_batchdata)
            assert is_list_of(result, dict)
            # self.logger.info(result)
            results += result

    if self.local_rank == 0:
        self.evaluate(results, runner.iter + 1)
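# Illustrative walk-through of the sample_id arithmetic (numbers are
# hypothetical): with nranks = 2 and a per-rank batch of 4, the first loop
# iteration gives rank 0 sample_id 0 and rank 1 sample_id 4; after gathering,
# sample_nums_all_threads becomes 8, so the second iteration gives rank 0
# sample_id 8 and rank 1 sample_id 12, keeping saved samples globally unique.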
def after_train_iter(self, runner):
    """The behavior after each train iteration.

    Args:
        runner (``edit.core.runner.BaseRunner``): The runner.
    """
    if not self.every_n_iters(runner, self.interval):
        return
    self.logger.info("start to eval for iter: {}".format(runner.iter + 1))
    save_path = os.path.join(self.save_path,
                             "iter_{}".format(runner.iter + 1))
    results = []  # list of dict
    sample_nums_all_threads = 0
    for _, data in enumerate(self.dataloader):
        batchdata = data
        sample_nums_for_one_thread = batchdata[0].shape[0]
        # batched outputs, already gathered across ranks
        gathered_outputs = runner.model.test_step(
            batchdata,
            save_image=self.save_image,
            save_path=save_path,
            sample_id=sample_nums_all_threads +
            sample_nums_for_one_thread * self.local_rank)
        if self.nranks > 1:
            pass
            # if isinstance(runner.devide, fluid.CUDAPlace):
            #     gathered_batchdata = gpu_gather(batchdata)
            # else:
            #     raise NotImplementedError("does not support multi_thread eval in cpu!")
        else:
            gathered_batchdata = batchdata
        assert gathered_batchdata[0].shape[0] == gathered_outputs[0].shape[0]
        assert gathered_batchdata[0].shape[0] == sample_nums_for_one_thread * self.nranks
        sample_nums_all_threads += gathered_outputs[0].shape[0]

        # Currently every process runs the forward pass and saves results, and
        # rank 0 computes the metrics; later, add CPU inter-process
        # communication so that metric computation is also spread across processes.
        if self.local_rank == 0:
            result = runner.model.cal_for_eval(gathered_outputs,
                                               gathered_batchdata)
            assert is_list_of(result, dict)
            self.logger.info(result)
            results += result

    if self.local_rank == 0:
        self.evaluate(results, runner.iter + 1)
def run(self, data_loaders, workflow, max_iters):
    """Start running.

    Args:
        data_loaders (list[:obj:`DataLoader`]): Dataloaders for training and test.
        workflow (list[tuple]): A list of (phase, iters) to specify the
            running order and iterations. E.g., [('train', 10000), ('test', 1)]
            means running 10000 iterations for training and 1 iteration for
            test, iteratively.
        max_iters (int): Total training iterations.
    """
    assert isinstance(data_loaders, list)
    assert is_list_of(workflow, tuple)
    assert len(data_loaders) == len(workflow)

    self._max_iters = max_iters
    self.logger.info(
        "Start running, work_dir: {}, workflow: {}, max iters for train: {}"
        .format(self.work_dir, workflow, max_iters))
    self.logger.info("registered hooks: " + str(self.hooks))
    self.call_hook('before_run')

    iter_loaders = [IterLoader(x) for x in data_loaders]
    self.call_hook('before_epoch')
    self.model.max_iters = max_iters
    self.model.now_iter = self.iter
    while self.iter < max_iters:
        for i, flow in enumerate(workflow):
            self._inner_iter = 0
            mode, iters = flow
            if not isinstance(mode, str) or not hasattr(self, mode):
                raise ValueError(
                    'runner has no method named "{}" to run a workflow'.
                    format(mode))
            iter_runner = getattr(self, mode)
            for _ in range(iters):
                if mode == 'train' and self.iter >= max_iters:
                    return
                iter_runner(iter_loaders[i])

    time.sleep(1)  # wait for some hooks like loggers to finish
    self.call_hook('after_epoch')
    self.call_hook('after_run')
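# Usage sketch (illustrative; loaders and iteration counts are hypothetical):
# for an iteration-based schedule that runs a test pass every 5000 training
# iterations,
#
#     runner.run(
#         data_loaders=[train_loader, test_loader],
#         workflow=[('train', 5000), ('test', 1)],
#         max_iters=300000,
#     )
#
# Each loader is wrapped in an IterLoader, which is assumed here to restart
# the underlying DataLoader when it is exhausted.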
def evaluate(self, results, save_path):
    """Evaluate with different metrics.

    Args:
        results (list of dict): each dict records metric -> value for one frame.

    Return:
        dict: Evaluation results dict.
    """
    save_SVG_path = osp.join(save_path, "SVG")
    mkdir_or_exist(save_SVG_path)
    assert is_list_of(results, dict), (
        f'results must be a list of dict, but got {type(results)}')
    assert len(results) >= len(self), (
        "results length should be >= dataset length, due to multi-card eval")
    self.logger.info(
        "eval samples length: {}, dataset length: {}, only select the front {} results"
        .format(len(results), len(self), len(self)))
    results = results[:len(self)]

    clip_names = sorted(self.frame_num.keys())  # e.g. ['city', 'walk']
    frame_nums = [self.frame_num[clip] for clip in clip_names]

    eval_results = defaultdict(list)  # a dict of lists
    do_frames = 0
    now_clip_idx = 0
    eval_results_one_clip = defaultdict(list)
    for res in results:
        for metric, val in res.items():
            eval_results_one_clip[metric].append(val)
        do_frames += 1
        if do_frames == frame_nums[now_clip_idx]:
            # one clip is complete, summarize it
            clip_name = clip_names[now_clip_idx]
            self.logger.info("{}: {} is ok".format(now_clip_idx, clip_name))
            for metric, values in eval_results_one_clip.items():
                # save an SVG curve of this metric over the clip
                average = sum(values) / len(values)
                save_filename = clip_name + "_" + metric
                title = "{} for {}, length: {}, average: {:.4f}".format(
                    metric, clip_name, len(values), average)
                plt.figure(figsize=(len(values) // 4 + 1, 8))
                # assumes clips have at most a few thousand frames
                plt.plot(list(range(len(values))), values, label=metric)
                plt.title(title)
                plt.xlabel('frame idx')
                plt.ylabel('{} value'.format(metric))
                plt.legend()
                fig = plt.gcf()
                fig.savefig(osp.join(save_SVG_path, save_filename + '.svg'),
                            dpi=600, bbox_inches='tight')
                plt.clf()
                plt.close()
                eval_results[metric].append(average)
            do_frames = 0
            now_clip_idx += 1
            eval_results_one_clip = defaultdict(list)

    for metric, val_list in eval_results.items():
        assert len(val_list) == len(clip_names), (
            f'Length of evaluation result of {metric} is {len(val_list)}, '
            f'should be {len(clip_names)}')

    # average the per-clip results
    eval_results = {
        metric: sum(values) / len(values)
        for metric, values in eval_results.items()
    }
    return eval_results
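# Illustrative example (clip names, frame counts and the metric name are
# hypothetical): with self.frame_num = {'city': 30, 'walk': 45}, the incoming
# results are consumed as 30 frames for 'city' followed by 45 frames for
# 'walk'; for a metric named 'PSNR', city_PSNR.svg and walk_PSNR.svg are
# written under <save_path>/SVG, and the returned dict averages the per-clip
# averages.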
def evaluate(self, results, save_path):
    """Evaluate with different metrics.

    Args:
        results (list of dict): each dict records metric -> value for one frame.

    Return:
        dict: Evaluation results dict.
    """
    assert is_list_of(results, dict), (
        f'results must be a list of dict, but got {type(results)}')
    assert len(results) >= len(self), (
        "results length should be >= dataset length, due to multi-card eval")
    self.logger.info(
        "eval samples length: {}, dataset length: {}, only select the front {} results"
        .format(len(results), len(self), len(self)))
    results = results[:len(self)]

    eval_results = defaultdict(list)  # a dict of lists
    for res in results:
        class_id = res['class_id']
        file_id = res['file_id']
        # accumulate per-metric and per-class statistics
        for metric, val in res.items():
            if "id" in metric:
                continue
            eval_results[metric].append(val)
            eval_results[metric + "_" + str(class_id)].append(val)
            if val > 5:
                # log unusually large values
                self.logger.info("{} value: {}".format(metric, val))
                eval_results[metric + "_" + str(class_id) +
                             "_more_than_5_nums"].append(1.0)
            else:
                # bucket the value by its first decimal digit, e.g. 3.77 -> "3.7"
                integer = str(int(val))
                point = str(int(val * 10) % 10)
                eval_results[metric + "_" + "{}.{}".format(integer, point) +
                             "_nums"].append(1.0)

    # compute the competition score from eval_results["dis_" + str(class_id)]
    def get_score_by_dis(x):
        if x > 5:
            return 0
        return 6 - x

    now_score = 0
    best_score = 0
    for class_id, diff_score in self.difficulty_score.items():
        key = "dis_" + str(class_id)
        if eval_results.get(key) is None:
            self.logger.info("do not have class index: {}".format(class_id))
        else:
            thre = self.threshold[class_id]
            list_0_1 = [(dis_value <= thre) for dis_value in eval_results[key]]
            Lambda = sum(list_0_1) * 1.0 / len(list_0_1)
            for dis_value in eval_results[key]:
                now_score += get_score_by_dis(dis_value) * diff_score * Lambda
            best_score += 6 * diff_score * 1.0

    now_score_percent = now_score * 100 / best_score
    self.logger.info("now competition score: {}".format(now_score_percent))

    ans = {}
    ans['competition_score'] = now_score_percent
    for metric, values in eval_results.items():
        if "nums" not in metric:
            ans[metric] = sum(values) / len(values)
        else:
            ans[metric] = sum(values)
        if metric == "dis":
            self.logger.info("now dis: {}".format(ans[metric]))

    # update the moving-average pooling window
    for metric, value in ans.items():
        self.pooling[metric].append(value)
        if len(self.pooling[metric]) > self.moving_average_len:
            # remove the oldest entry
            self.pooling[metric].pop(0)

    # moving-averaged eval_results
    eval_results = {
        metric: sum(values) / len(values)
        for metric, values in self.pooling.items()
    }
    return eval_results
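# Worked example of the scoring as implemented (all numbers are hypothetical):
# for a single class with difficulty_score 2, threshold 3 and dis values
# [2.0, 4.0], Lambda = 1/2 (one of the two values is within the threshold), so
# now_score = (6 - 2.0) * 2 * 0.5 + (6 - 4.0) * 2 * 0.5 = 6,
# best_score = 6 * 2 = 12, and the reported competition_score is
# 6 * 100 / 12 = 50.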