def make_report(chrono: MultiStageChrono, args: Namespace, version: str, batch_loss: RingBuffer,
                epoch_loss: RingBuffer, metrics, remote_logger):
    if args is not None:
        args = args.__dict__
    else:
        args = {}

    if args['report'] is None:
        args['report'] = os.environ.get('REPORT_PATH')

    filename = args['report']
    # Each GPU has its own report; they are consolidated later
    filename = f'{filename}_{args["jr_id"]}.json'

    args['version'] = version

    unique_id = hashlib.sha256()

    # sort the arguments to make the id deterministic
    items = list(args.items())
    items.sort()

    for k, w in items:
        if k in not_parameter:
            continue

        unique_id.update(str(k).encode('utf-8'))
        unique_id.update(str(w).encode('utf-8'))

    # we do not want the arguments to be tampered with; if the id does not match, the run is disqualified
    args['unique_id'] = unique_id.hexdigest()

    # Try to identify vendors so we can find them more easily
    if args['cuda']:
        args['gpu'] = get_gpu_name()

    args['hostname'] = socket.gethostname()
    args['batch_loss'] = batch_loss.to_list()
    args['epoch_loss'] = epoch_loss.to_list()
    args['metrics'] = metrics

    for excluded in excluded_arguments:
        args.pop(excluded, None)

    remote_logger.log_parameters(args)
    report_dict = chrono.to_dict(args)

    # train is the default name for batched stuff
    if 'train' in report_dict:
        train_data = report_dict['train']

        item_count = report_dict['batch_size'] * report_dict['number']
        min_item = item_count / train_data['max']
        max_item = item_count / train_data['min']

        train_item = {
            'avg': item_count / train_data['avg'],
            'max': max_item,
            'min': min_item,
            'range': max_item - min_item,
            'unit': 'items/sec'
        }

        report_dict['train_item'] = train_item

    print('-' * 80)
    json_report = json.dumps(report_dict, sort_keys=True, indent=4, separators=(',', ': '))
    print(json_report)

    # open the JSON array on first write; the consolidation step is expected to close it
    if not os.path.exists(filename):
        with open(filename, 'w') as report_file:
            report_file.write('[')

    with open(filename, 'a') as report_file:
        report_file.write(json_report)
        report_file.write(',')

    print('-' * 80)
class Experiment:
    """Store all the information we care about during an experiment.

        chrono            : performance timer
        args              : arguments passed to the script
        name              : name of the experiment
        batch_loss_buffer : batch loss values (just making sure we do not get NaNs)
        epoch_loss_buffer : epoch loss values, same as the batch loss
    """

    def __init__(self, module, skip_obs=10):
        self.name, self.version = get_experience_descriptor(module)
        self._chrono = None
        self.skip_obs = skip_obs
        self.args = None
        self.batch_loss_buffer = RingBuffer(100, torch.float32)
        self.epoch_loss_buffer = RingBuffer(10, torch.float32)
        self.metrics = {}
        self.remote_logger = None

        try:
            self.remote_logger = make_log(
                api_key=os.environ.get("CML_API_KEY"),
                project_name=self.name,
                workspace=os.environ.get("CML_WORKSPACE"))
        except Exception as e:
            print(e)
            self.remote_logger = make_log()

    def get_arguments(self, parser, *args, **kwargs):
        self.args = parser.get_arguments()
        self._chrono = MultiStageChrono(name=self.name, skip_obs=self.skip_obs, sync=get_sync(self.args))
        return self.args

    def chrono(self):
        return self._chrono

    def log_batch_loss(self, val):
        self.batch_loss_buffer.append(val)
        self.remote_logger.log_metric('batch_loss', val)

    def log_epoch_loss(self, val):
        self.epoch_loss_buffer.append(val)
        self.remote_logger.log_metric('epoch_loss', val)

    def log_metric(self, name, val):
        metric = self.metrics.get(name)

        if metric is None:
            metric = RingBuffer(10, torch.float32)
            self.metrics[name] = metric

        metric.append(val)
        self.remote_logger.log_metric(name, val)

    def report(self):
        if self.args is not None:
            args = self.args.__dict__
        else:
            args = {}

        if args['report'] is None:
            args['report'] = os.environ.get('REPORT_PATH')

        filename = args['report']
        # Each GPU has its own report; they are consolidated later
        filename = f'{filename}_{args["jr_id"]}.json'

        args['version'] = self.version

        unique_id = hashlib.sha256()

        # sort the arguments to make the id deterministic
        items = list(args.items())
        items.sort()

        for k, w in items:
            if k in not_parameter:
                continue

            unique_id.update(str(k).encode('utf-8'))
            unique_id.update(str(w).encode('utf-8'))

        # we do not want the arguments to be tampered with; if the id does not match, the run is disqualified
        args['unique_id'] = unique_id.hexdigest()

        # Try to identify vendors so we can find them more easily
        if args['cuda']:
            args['gpu'] = get_gpu_name()

        args['hostname'] = socket.gethostname()
        args['batch_loss'] = self.batch_loss_buffer.to_list()
        args['epoch_loss'] = self.epoch_loss_buffer.to_list()
        args['metrics'] = self.metrics

        for excluded in excluded_arguments:
            args.pop(excluded, None)

        self.remote_logger.log_parameters(args)
        report_dict = self._chrono.to_dict(args)

        # train is the default name for batched stuff
        if 'train' in report_dict:
            train_data = report_dict['train']

            item_count = report_dict['batch_size'] * report_dict['number']
            min_item = item_count / train_data['max']
            max_item = item_count / train_data['min']

            train_item = {
                'avg': item_count / train_data['avg'],
                'max': max_item,
                'min': min_item,
                'range': max_item - min_item,
                'unit': 'items/sec'
            }

            report_dict['train_item'] = train_item

        print('-' * 80)
        json_report = json.dumps(report_dict, sort_keys=True, indent=4, separators=(',', ': '))
        print(json_report)

        # open the JSON array on first write; the consolidation step is expected to close it
        if not os.path.exists(filename):
            with open(filename, 'w') as report_file:
                report_file.write('[')

        with open(filename, 'a') as report_file:
            report_file.write(json_report)
            report_file.write(',')

        print('-' * 80)

    def get_device(self):
        return init_torch(self.args)

    def get_time(self, stream):
        if stream.avg == 0:
            return stream.val
        return stream.avg
    def show_eta(self, epoch_id, timer, msg=''):
        if msg:
            msg = ' | ' + msg

        loss = self.batch_loss_buffer.last()
        if loss is not None:
            loss = f'| Batch Loss {loss:8.4f}'
        else:
            loss = ''

        print(
            f'[{epoch_id:3d}/{self.args.repeat:3d}] '
            f'| ETA: {self.get_time(timer) * (self.args.repeat - (epoch_id + 1)) / 60:6.2f} min '
            + loss + msg)
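# Minimal usage sketch (illustrative only, kept as a comment): it shows the intended
# call order of the Experiment helpers defined above. The module argument shown as
# __file__, the `parser` object, and the `train_one_epoch` helper are assumptions
# about the calling script, not part of this module; the timer stream expected by
# show_eta depends on how MultiStageChrono exposes its stages and is omitted here.
#
#   experiment = Experiment(__file__)
#   args = experiment.get_arguments(parser)   # parser is assumed to expose get_arguments()
#   device = experiment.get_device()
#
#   for epoch in range(args.repeat):
#       loss = train_one_epoch(device, args)  # hypothetical training step
#       experiment.log_epoch_loss(loss)
#
#   experiment.report()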