def _func():
    # `run_hash` (a list) and `self` are captured from the enclosing test's scope
    run = Run(system_tracking_interval=None)
    run_hash.append(run.hash)
    self.assertIsNone(run.end_time)
    for i in range(10):
        run.track(i, name='seq')
    self.assertIsNone(run.end_time)
def _track_legacy_run_step(run: Run, metric_name: str, context: dict, val):
    (value, step, epoch, timestamp) = val

    from aim.storage.context import Context, Metric

    if context is None:
        context = {}

    ctx = Context(context)
    metric = Metric(metric_name, ctx)

    # register the context on first use
    if ctx not in run.contexts:
        run.meta_tree['contexts', ctx.idx] = ctx.to_dict()
        run.meta_run_tree['contexts', ctx.idx] = ctx.to_dict()
        run.contexts[ctx] = ctx.idx
        run._idx_to_ctx[ctx.idx] = ctx

    time_view = run.series_run_tree.view(metric.selector).array('time').allocate()
    val_view = run.series_run_tree.view(metric.selector).array('val').allocate()
    epoch_view = run.series_run_tree.view(metric.selector).array('epoch').allocate()

    max_idx = run.series_counters.get((ctx, metric_name), None)
    if max_idx is None:
        max_idx = len(val_view)
    if max_idx == 0:
        # first record for this trace; register it in the meta tree
        run.meta_tree['traces', ctx.idx, metric_name] = 1
    run.meta_run_tree['traces', ctx.idx, metric_name, 'last'] = value

    run.series_counters[ctx, metric_name] = max_idx + 1

    time_view[step] = timestamp
    val_view[step] = value
    epoch_view[step] = epoch
async def custom_aligned_metrics_streamer(requested_runs: List[AlignedRunIn],
                                          x_axis: str) -> bytes:
    for run_data in requested_runs:
        run_hashname = run_data.run_id
        requested_traces = run_data.traces
        run = Run(hashname=run_hashname)

        traces_list = []
        for trace_data in requested_traces:
            context = Context(trace_data.context)
            trace = run.get_trace(metric_name=trace_data.metric_name,
                                  context=context)
            x_axis_trace = run.get_trace(metric_name=x_axis, context=context)
            if not (trace and x_axis_trace):
                continue

            _slice = slice(*trace_data.slice)
            iters = trace.values.sparse_numpy()[0]
            sliced_iters = sliced_np_array(iters, _slice)
            x_axis_iters, x_axis_values = collect_x_axis_data(x_axis_trace,
                                                              sliced_iters)
            traces_list.append({
                'metric_name': trace.name,
                'context': trace.context.to_dict(),
                'x_axis_values': x_axis_values,
                'x_axis_iters': x_axis_iters,
            })

        run_dict = {run_hashname: traces_list}
        encoded_tree = encode_tree(run_dict)
        yield collect_run_streamable_data(encoded_tree)
def test_explicit_run_delete(self):
    run = Run(system_tracking_interval=None)
    run_hash = run.hash
    for i in range(10):
        run.track(i, name='seq')
    # deleting the Run object should finalize it in storage
    del run
    time.sleep(.1)
    self.assertIsNotNone(self._query_run_finalized_at(run_hash))
def test_different_types_on_different_contexts_and_runs(self):
    run = Run(system_tracking_interval=None)
    # same sequence name, different contexts
    run.track(1., name='numbers', context={'type': 'float'})
    run.track(1, name='numbers', context={'type': 'integer'})

    run2 = Run(system_tracking_interval=None)
    # same sequence name, different runs
    run2.track(1, name='numbers', context={'type': 'float'})
class _XgboostCallback(TrainingCallback):
    def __init__(self,
                 repo: Optional[str] = None,
                 experiment: Optional[str] = None,
                 system_tracking_interval: Optional[int] = DEFAULT_SYSTEM_TRACKING_INT):
        super().__init__()

        self.repo = repo
        self.experiment = experiment
        self.system_tracking_interval = system_tracking_interval
        self.initialized = False
        self.aim_run = None

    def before_training(self, model):
        self.aim_run = Run(
            repo=self.repo,
            experiment=self.experiment,
            system_tracking_interval=self.system_tracking_interval)
        self.initialized = True
        return model

    def after_iteration(self, model, epoch: int,
                        evals_log: CallbackContainer.EvalsLog) -> bool:
        if not evals_log:
            return False

        for data, metric in evals_log.items():
            for metric_name, log in metric.items():
                stdv: Optional[float] = None
                if isinstance(log[-1], tuple):
                    score = log[-1][0]
                    stdv = log[-1][1]
                else:
                    score = log[-1]

                self.aim_run.track(score, step=0, name=metric_name,
                                   context={'stdv': False})
                if stdv is not None:
                    # track the standard deviation itself, not the score again
                    self.aim_run.track(stdv, step=0, name=metric_name,
                                       context={'stdv': True})

        return False

    def after_training(self, model):
        if self.initialized and self.aim_run:
            del self.aim_run
            self.aim_run = None
        return model
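# A minimal usage sketch (an assumption, not part of the module; requires an
# initialized Aim repository). xgboost's `train` accepts TrainingCallback
# instances via its `callbacks` argument, so the callback above can be attached
# directly; the synthetic data keeps the sketch self-contained.
import numpy as np
import xgboost as xgb

X = np.random.rand(100, 4)
y = np.random.randint(2, size=100)
dtrain = xgb.DMatrix(X, label=y)

booster = xgb.train(
    {'objective': 'binary:logistic'},
    dtrain,
    num_boost_round=10,
    evals=[(dtrain, 'train')],  # populates the evals_log consumed in after_iteration
    callbacks=[_XgboostCallback(experiment='xgb-demo')],
)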
def test_incompatible_type_after_tracking_restart(self):
    run = Run(system_tracking_interval=None)
    run_hash = run.hash
    run.track(1., name='numbers', context={})
    run.finalize()
    del run

    new_run = Run(run_hash=run_hash, system_tracking_interval=None)
    with self.assertRaises(ValueError) as cm:
        new_run.track(1, name='numbers', context={})
    exception = cm.exception
    self.assertEqual(
        'Cannot log value \'1\' on sequence \'numbers\'. Incompatible data types.',
        exception.args[0])
def collect_requested_traces(run: Run, requested_traces: List[TraceBase],
                             steps_num: int = 200) -> List[dict]:
    processed_traces_list = []
    for requested_trace in requested_traces:
        metric_name = requested_trace.metric_name
        context = Context(requested_trace.context)
        trace = run.get_trace(metric_name=metric_name, context=context)
        if not trace:
            continue

        iters, values = trace.values.sparse_list()
        num_records = len(values)
        step = (num_records // steps_num) or 1
        _slice = slice(0, num_records, step)

        processed_traces_list.append({
            'metric_name': trace.name,
            'context': trace.context.to_dict(),
            'values': sliced_array(values, _slice),
            'iters': sliced_array(iters, _slice),
        })

    return processed_traces_list
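# A standalone illustration (hypothetical values) of the downsampling slice built
# in collect_requested_traces: it keeps at most ~steps_num evenly spaced points
# out of the full trace.
num_records = 1000
steps_num = 200
step = (num_records // steps_num) or 1  # -> 5; the `or 1` guards short traces
_slice = slice(0, num_records, step)

values = list(range(num_records))
assert len(values[_slice]) == 200  # every 5th value is kept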
def experiment(self) -> Run:
    if self._run is None:
        self._run = Run(repo=self._repo_path,
                        experiment=self._experiment_name,
                        system_tracking_interval=self._system_tracking_interval)
    return self._run
def requested_figure_object_traces_streamer(
        run: Run,
        requested_traces: List[TraceBase],
        rec_range,
        rec_num: int = 50
) -> List[dict]:
    for requested_trace in requested_traces:
        trace_name = requested_trace.name
        context = Context(requested_trace.context)
        trace = run.get_figure_sequence(name=trace_name, context=context)
        if not trace:
            continue

        record_range_missing = rec_range.start is None or rec_range.stop is None
        if record_range_missing:
            rec_range = IndexRange(trace.first_step(), trace.last_step() + 1)

        steps = []
        values = []
        steps_vals = trace.values.items_in_range(
            rec_range.start, rec_range.stop, rec_num
        )
        for step, val in steps_vals:
            steps.append(step)
            values.append(preparer(val, trace, step, decode=True))

        trace_dict = {
            'name': trace.name,
            'context': trace.context.to_dict(),
            'values': values,
            'iters': steps,
            'record_range': (trace.first_step(), trace.last_step() + 1),
        }
        encoded_tree = encode_tree(trace_dict)
        yield collect_run_streamable_data(encoded_tree)
def __init__(self, repo: Optional[str] = None,
             experiment: Optional[str] = None,
             run: Optional[Run] = None):
    super(KerasTrackerCallback, self).__init__()

    if run is None:
        if repo is None and experiment is None:
            self._run = Run()
        else:
            self._run = Run(repo=repo, experiment=experiment)
    else:
        print('Passing Run instance to AimCallback will be '
              'deprecated in future versions, '
              'pass the callback arguments explicitly')
        self._run = run
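# A minimal usage sketch (an assumption, not part of the module; requires an
# initialized Aim repository, and assumes KerasTrackerCallback subclasses
# keras' Callback as its name suggests): the callback plugs into Keras'
# standard `callbacks` list on `fit`. The toy model and data are placeholders.
import numpy as np
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
model.compile(optimizer='adam', loss='mse')

x, y = np.random.rand(64, 4), np.random.rand(64, 1)
model.fit(x, y, epochs=2,
          callbacks=[KerasTrackerCallback(experiment='keras-demo')])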
def fill_up_test_data():
    remove_test_data()

    # put dummy data into the test repo: 10 runs tracking 2 metrics over 3 contexts
    repo = Repo.default_repo()
    run_hashes = [hex(random.getrandbits(64))[-7:] for _ in range(10)]

    contexts = [{'is_training': True, 'subset': 'train'},
                {'is_training': True, 'subset': 'val'},
                {'is_training': False}]
    metrics = ['loss', 'accuracy']

    with repo.structured_db:
        try:
            for idx, hash_name in enumerate(run_hashes):
                run = Run(hashname=hash_name, repo=repo,
                          system_tracking_interval=None)
                run['hparams'] = create_run_params()
                run['run_index'] = idx
                run['start_time'] = datetime.datetime.utcnow().isoformat()
                run['name'] = f'Run # {idx}'
                run.props.name = run['name']

                metric_contexts = itertools.product(metrics, contexts)
                for metric_context in metric_contexts:
                    metric = metric_context[0]
                    context = metric_context[1]
                    if metric == 'accuracy' and 'subset' in context:
                        continue
                    # track 100 values per metric/context pair
                    for step in range(100):
                        val = 1.0 - 1.0 / (step + 1)
                        run.track(val, name=metric, step=step, epoch=1,
                                  context=context)
        finally:
            del run
def iter_runs(self) -> Iterator['Run']:
    """Iterate over Repo runs.

    Yields:
        next :obj:`Run` in readonly mode.
    """
    self.meta_tree.preload()
    for run_name in self.meta_tree.subtree('chunks').keys():
        yield Run(run_name, repo=self, read_only=True)
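# A minimal usage sketch (assumes a default Aim repo exists on disk and that
# `Repo` is importable from the `aim` package): iterating all runs in
# read-only mode through the method above.
from aim import Repo

repo = Repo.default_repo()
for run in repo.iter_runs():
    print(run.hash, run.end_time)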
def finalize_stalled_runs(repo: 'Repo', runs: set):
    runs_in_progress = []
    for run_hash in tqdm.tqdm(runs, desc='Finalizing stalled runs', total=len(runs)):
        try:
            run = Run(run_hash=run_hash, repo=repo, system_tracking_interval=None)
        except filelock.Timeout:
            # the run is locked by a live process; skip it
            runs_in_progress.append(run_hash)
        else:
            # TODO: [AT] handle lock timeout on index db (retry logic).
            run.finalize()

    if runs_in_progress:
        click.echo('Skipped indexing for the following runs in progress:')
        for run_hash in runs_in_progress:
            click.secho(f'\t\'{run_hash}\'', fg='yellow')
def __init__(self, repo: Optional[str] = None,
             experiment: Optional[str] = None,
             flush_frequency: int = 0,  # unused
             block_termination: bool = True,  # unused
             run: Optional[str] = None,
             system_tracking_interval: Optional[int] = DEFAULT_SYSTEM_TRACKING_INT):
    self._repo = Repo.from_path(repo) if repo else Repo.default_repo()
    self._repo_path = self._repo.path
    self._run = Run(run, repo=self._repo, experiment=experiment,
                    system_tracking_interval=system_tracking_interval)
    self._run_hash = self._run.hashname
    self.active = True

    Session.sessions.setdefault(self._repo_path, [])
    Session.sessions[self._repo_path].append(self)

    # Bind signal listeners
    self._set_exit_handlers()
def setUpClass(cls) -> None:
    super().setUpClass()
    cls.image_blobs = {}
    run = Run(run_hash=cls.run_hash, read_only=True)
    empty_context = Context({})
    for step in range(10):
        for idx in range(5):
            img_view = run.series_run_tree.subtree(
                (empty_context.idx, 'random_images', 'val', step, idx))
            cls.image_blobs[img_view['caption']] = img_view['data'].load()
def test_explicit_run_finalize(self):
    run = Run(system_tracking_interval=None)
    for i in range(10):
        run.track(i, name='seq')
    self.assertIsNone(run.end_time)
    run.finalize()
    self.assertIsNotNone(run.end_time)
def iter_runs_from_cache(self, offset: str = None) -> Iterator['Run']:
    db = self.structured_db
    cache = db.caches.get('runs_cache')
    if cache:
        run_names = cache.keys()
        try:
            offset_idx = run_names.index(offset) + 1
        except ValueError:
            offset_idx = 0
        for run_name in run_names[offset_idx:]:
            yield Run(run_name, repo=self, read_only=True)
    else:
        # PEP 479: raising StopIteration inside a generator is turned into a
        # RuntimeError; end the generator with a plain return instead.
        return
def get_run(self, run_hash: str) -> Optional['Run']:
    """Get run if exists.

    Args:
        run_hash (str): Run hash.

    Returns:
        :obj:`Run` object if hash is found in repository. `None` otherwise.
    """
    # TODO: [MV] optimize existence check for run
    if run_hash is None or run_hash not in self.meta_tree.subtree('chunks').keys():
        return None
    else:
        return Run(run_hash, repo=self, read_only=True)
def test_incompatible_type_during_tracking(self):
    run = Run(system_tracking_interval=None)
    run.track(1., name='numbers', context={})
    with self.assertRaises(ValueError) as cm:
        run.track(1, name='numbers', context={})
    exception = cm.exception
    self.assertEqual(
        'Cannot log value \'1\' on sequence \'numbers\'. Incompatible data types.',
        exception.args[0])
def setUpClass(cls) -> None:
    super().setUpClass()
    run = Run(repo=cls.repo)
    run['images_per_step'] = 16
    for step in range(100):
        images = generate_image_set(img_count=16, caption_prefix=f'Image {step}')
        run.track(images, name='random_images')
        run.track(random.random(), name='random_values')
    cls.run_hash = run.hash
def convert_run(lrun: LegacyRun, repo: Repo, legacy_run_map, skip_failed):
    try:
        # do not track system metrics as they are already logged if needed
        run = Run(repo=repo, system_tracking_interval=None)
        lrun.open_storage()

        if lrun.params.get(AIM_MAP_METRICS_KEYWORD):
            # set internally; no need to copy
            del lrun.params[AIM_MAP_METRICS_KEYWORD]
        run[...] = lrun.params
        run['v2_params'] = {'run_hash': lrun.run_hash}
        if 'process' in lrun.config:
            run['v2_params', 'start_date'] = lrun.config['process']['start_date']
            run['v2_params', 'finish_date'] = lrun.config['process']['finish_date']
        run.experiment = lrun.experiment_name
        if lrun.config.get('archived'):
            run.archived = True

        run_metrics = {}
        legacy_run_map[lrun.run_hash] = run_metrics
        for metric in lrun.get_all_metrics().values():
            try:
                metric.open_artifact()
                run_metrics[metric.name] = []
                for trace in metric.get_all_traces():
                    metric_name = metric.name
                    context = trace.context
                    run_metrics[metric.name].append(context)
                    for r in trace.read_records(slice(0, None, 1)):
                        step_record, metric_record = deserialize_pb(r)
                        # no need to track in a separate thread; use _track_impl directly
                        run._track_impl(metric_record.value,
                                        step_record.timestamp,
                                        metric_name,
                                        step_record.step,
                                        step_record.epoch,
                                        context=context)
            except Exception:
                metric.close_artifact()
                raise
            finally:
                metric.close_artifact()
        del run
    except Exception as e:
        click.echo(f'\nFailed to convert run {lrun.run_hash}. Reason: {str(e)}.',
                   err=True)
        if not skip_failed:
            raise
    finally:
        lrun.close_storage()
def setUpClass(cls) -> None:
    super().setUpClass()
    run = Run(system_tracking_interval=None)
    cls.run_hash = run.hash

    for step in range(5):
        images = generate_image_set(img_count=5, caption_prefix=f'Image {step}')
        run.track(images, name='image_lists', context={})
        run.track(images[0], name='single_images', context={})
def requested_image_traces_streamer(run: Run,
                                    requested_traces: List[TraceBase],
                                    rec_range, idx_range,
                                    rec_num: int = 50,
                                    idx_num: int = 5) -> List[dict]:
    for requested_trace in requested_traces:
        trace_name = requested_trace.name
        context = Context(requested_trace.context)
        trace = run.get_image_sequence(name=trace_name, context=context)
        if not trace:
            continue

        record_range_missing = rec_range.start is None or rec_range.stop is None
        if record_range_missing:
            rec_range = IndexRange(trace.first_step(), trace.last_step() + 1)
        index_range_missing = idx_range.start is None or idx_range.stop is None
        if index_range_missing:
            idx_range = IndexRange(0, trace.record_length() or 1)

        rec_length = trace.record_length() or 1
        idx_step = rec_length // idx_num or 1
        idx_slice = slice(idx_range.start, idx_range.stop, idx_step)

        steps_vals = trace.values.items_in_range(rec_range.start,
                                                 rec_range.stop,
                                                 rec_num)
        steps = []
        values = []
        for step, val in steps_vals:
            steps.append(step)
            if isinstance(val, list):
                values.append(
                    img_collection_record_to_encodable(
                        sliced_custom_object_record(val, idx_slice), trace, step)
                )
            elif idx_slice.start == 0:
                values.append(img_record_to_encodable(val, trace, step))
            else:
                values.append([])

        trace_dict = {
            'record_range': (trace.first_step(), trace.last_step() + 1),
            'index_range': (0, rec_length),
            'name': trace.name,
            'context': trace.context.to_dict(),
            'values': values,
            'iters': steps,
        }
        encoded_tree = encode_tree(trace_dict)
        yield collect_run_streamable_data(encoded_tree)
def _pack_run_data(run_: Run, traces_: list):
    _rec_range = (
        trcs_rec_range
        if record_range_missing or calc_total_ranges
        else rec_range
    )
    run_dict = {
        run_.hash: {
            'ranges': {
                'record_range': [_rec_range.start, _rec_range.stop],
                'record_slice': [rec_slice.start, rec_slice.stop, rec_slice.step],
            },
            'params': run_.get(...),
            'traces': traces_,
            'props': get_run_props(run_),
        }
    }
    encoded_tree = encode_tree(run_dict)
    return collect_run_streamable_data(encoded_tree)
async def get_experiment_runs_api(exp_id: str,
                                  limit: Optional[int] = None,
                                  offset: Optional[str] = None,
                                  factory=Depends(object_factory)):
    project = Project()

    exp = factory.find_experiment(exp_id)
    if not exp:
        raise HTTPException(status_code=404)

    from aim.sdk.run import Run

    cache_name = 'exp_runs'
    project.repo.run_props_cache_hint = cache_name
    project.repo.structured_db.invalidate_cache(cache_name)
    project.repo.structured_db.init_cache(cache_name, exp.get_runs,
                                          lambda run_: run_.hash)
    exp_runs = []

    run_hashes = [run.hash for run in exp.runs]
    offset_idx = 0
    if offset:
        try:
            offset_idx = run_hashes.index(offset) + 1
        except ValueError:
            pass
    # apply the offset even when no limit is given
    run_hashes = (run_hashes[offset_idx:offset_idx + limit]
                  if limit else run_hashes[offset_idx:])

    for run_hash in run_hashes:
        run = Run(run_hash, repo=project.repo, read_only=True)
        exp_runs.append({
            'run_id': run.hash,
            'name': run.name,
            'creation_time': run.creation_time,
            'end_time': run.end_time
        })

    project.repo.structured_db.invalidate_cache(cache_name)
    project.repo.run_props_cache_hint = None

    response = {'id': exp.uuid, 'runs': exp_runs}
    return response
async def get_tagged_runs_api(tag_id: str, factory=Depends(object_factory)):
    project = Project()

    tag = factory.find_tag(tag_id)
    if not tag:
        # raise an instance with an explicit status, matching the handler above
        raise HTTPException(status_code=404)

    from aim.sdk.run import Run

    cache_name = 'tag_runs'
    project.repo.run_props_cache_hint = cache_name
    project.repo.structured_db.invalidate_cache(cache_name)
    project.repo.structured_db.init_cache(cache_name, tag.get_runs,
                                          lambda run_: run_.hash)
    tag_runs = []

    for tagged_run in tag.runs:
        run = Run(tagged_run.hash, repo=project.repo, read_only=True)
        tag_runs.append({
            'run_id': tagged_run.hash,
            'name': tagged_run.name,
            'creation_time': run.creation_time,
            'end_time': run.end_time,
            'experiment': tagged_run.experiment if tagged_run.experiment else None
        })

    project.repo.structured_db.invalidate_cache(cache_name)
    project.repo.run_props_cache_hint = None

    response = {'id': tag.uuid, 'runs': tag_runs}
    return response
class _HuggingFaceCallback(TrainerCallback):
    def __init__(self,
                 repo: Optional[str] = None,
                 experiment: Optional[str] = None,
                 system_tracking_interval: Optional[int] = DEFAULT_SYSTEM_TRACKING_INT):
        self._repo_path = repo
        self._experiment_name = experiment
        self._system_tracking_interval = system_tracking_interval
        self._initialized = False
        self._current_shift = None
        self._run = None

    def setup(self, args, state, model):
        self._initialized = True

        self._run = Run(
            repo=self._repo_path,
            experiment=self._experiment_name,
            system_tracking_interval=self._system_tracking_interval,
        )

        combined_dict = {**args.to_sanitized_dict()}
        self._run['hparams'] = combined_dict

        # Store model configs as well
        # if hasattr(model, 'config') and model.config is not None:
        #     model_config = model.config.to_dict()
        #     self._run['model'] = model_config

    def on_train_begin(self, args, state, control, model=None, **kwargs):
        if not self._initialized:
            self.setup(args, state, model)
        self._current_shift = 'train'

    def on_evaluate(self, args, state, control, **kwargs):
        self._current_shift = 'val'

    def on_prediction_step(self, args, state, control, **kwargs):
        self._current_shift = 'pred'

    def on_log(self, args, state, control, model=None, logs=None, **kwargs):
        if not self._initialized:
            self.setup(args, state, model)

        context = {
            'subset': self._current_shift,
        }
        for log_name, log_value in logs.items():
            self._run.track(log_value, name=log_name, context=context)

    def on_epoch_end(self, args, state, control, **kwargs):
        pass

    def __del__(self):
        if self._initialized and self._run:
            del self._run
            self._run = None
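# A minimal usage sketch (an assumption, not part of the module): the callback
# is passed through the Trainer's standard `callbacks` argument. `model` and
# `train_dataset` are placeholders for a prepared model and dataset, so the
# sketch is left commented out rather than runnable as-is.
#
#   from transformers import Trainer, TrainingArguments
#
#   trainer = Trainer(
#       model=model,
#       args=TrainingArguments(output_dir='./out'),
#       train_dataset=train_dataset,
#       callbacks=[_HuggingFaceCallback(experiment='hf-demo')],
#   )
#   trainer.train()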