def json_spec_from_fpath(json_spec_path, json_fname, store_duplicates=False):
    """ load a json spec, searching from the given directory up through parent directories
        until a directory named 'json_args' is reached """
    base_dir = json_spec_path
    if not os.path.isdir(base_dir):
        raise Exception('base_dir {} does not exist.'.format(base_dir))
    json_spec = ivy.Container()
    while True:
        fpath = os.path.normpath(os.path.join(base_dir, json_fname))
        if os.path.isfile(fpath):
            if store_duplicates:
                parsed_json_cont = ivy.Container(parse_json_to_cont(fpath))
                duplicate_key_chains = list()

                def map_fn(x, kc):
                    if kc in json_spec:
                        duplicate_key_chains.append(kc)
                        return ivy.Container(duplicated={'parent_dir': json_spec[kc], 'this_dir': x})
                    else:
                        return x

                parsed_json_cont = parsed_json_cont.map(map_fn)
                json_spec = ivy.Container.combine(
                    parsed_json_cont, json_spec.prune_key_chains(duplicate_key_chains))
            else:
                json_spec = ivy.Container.combine(ivy.Container(parse_json_to_cont(fpath)), json_spec)
        if base_dir.split('/')[-1] == 'json_args':
            return json_spec
        base_dir = os.path.normpath(os.path.join(base_dir, '..'))
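# Hypothetical usage sketch for json_spec_from_fpath (directory names illustrative).
# The search starts in the given directory and walks up parent directories until a
# directory named 'json_args' is reached, combining any file found with the given
# filename along the way. Assuming ivy.Container.combine prioritises its right-most
# argument, values from deeper directories override those of their parents:
#
#   json_args/network_args.json               -> {"num_layers": 4}
#   json_args/experiment_1/network_args.json  -> {"num_layers": 8}
#
#   spec = json_spec_from_fpath('json_args/experiment_1', 'network_args.json')
#   spec.num_layers  # -> 8, the deeper directory wins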
def build_network_specification(dataset_dirs_args=None,
                                dataset_dirs_class=None,
                                dataset_dirs=None,
                                dataset_spec_args=None,
                                dataset_spec_class=None,
                                dataset_spec=None,
                                network_spec_args=None,
                                network_spec_class=None,
                                json_spec_path=None,
                                spec_cont=None,
                                class_priority=False):
    """ build network specification """

    # build dataset specification
    dataset_spec = ivy.default(
        dataset_spec,
        build_dataset_spec(
            dataset_dirs_args=dataset_dirs_args,
            dataset_dirs_class=dataset_dirs_class,
            dataset_dirs=dataset_dirs,
            dataset_spec_args=dataset_spec_args,
            dataset_spec_class=dataset_spec_class,
            json_spec_path=json_spec_path,
            spec_cont=spec_cont))

    # define network specification arguments
    if network_spec_args is None:
        network_spec_args = dict()
    network_spec_args = ivy.Container(network_spec_args)
    network_spec_args = ivy.Container.combine(network_spec_args, ivy.Container(dataset_spec=dataset_spec))

    # load json file
    if isinstance(json_spec_path, str):
        json_spec = json_spec_from_fpath(json_spec_path, 'network_args.json')
    else:
        json_spec = ivy.Container()

    # load from spec dict
    this_spec_cont =\
        ivy.Container(spec_cont['network']) if isinstance(spec_cont, dict) and 'network' in spec_cont \
        else ivy.Container()

    # combine args
    network_spec_args = ivy.Container.combine(json_spec, this_spec_cont, network_spec_args)

    # override network_spec_class if specified in network_spec_args
    network_spec_class = ivy.default(ivy.default(
        _import_arg_specified_class_if_present(network_spec_args, 'network_spec_class'),
        network_spec_class, rev=class_priority), NetworkSpec)

    # return network specification
    return network_spec_class(**network_spec_args)
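# Hypothetical usage sketch for build_network_specification (argument values and
# the existing_dataset_spec name are illustrative). ivy.Container.combine is
# assumed to give priority to its right-most argument, so explicitly passed
# network_spec_args override both the network_args.json entries and the
# spec_cont entries:
#
#   net_spec = build_network_specification(
#       dataset_spec=existing_dataset_spec,      # skips re-building the dataset spec
#       network_spec_args={'num_layers': 4},     # wins over json and spec_cont values
#       json_spec_path='json_args/experiment_1')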
def build_data_loader_spec(dataset_dirs_args=None,
                           dataset_dirs_class=None,
                           dataset_dirs=None,
                           dataset_spec_args=None,
                           dataset_spec_class=None,
                           dataset_spec=None,
                           data_loader_spec_args=None,
                           data_loader_spec_class=None,
                           json_spec_path=None,
                           spec_cont=None,
                           class_priority=False):
    """ build data loader specification """

    # build dataset specification
    dataset_spec = ivy.default(
        dataset_spec,
        build_dataset_spec(
            dataset_dirs_args=dataset_dirs_args,
            dataset_dirs_class=dataset_dirs_class,
            dataset_dirs=dataset_dirs,
            dataset_spec_args=dataset_spec_args,
            dataset_spec_class=dataset_spec_class,
            json_spec_path=json_spec_path,
            spec_cont=spec_cont))

    # define data loader specification arguments
    if data_loader_spec_args is None:
        data_loader_spec_args = dict()
    data_loader_spec_args = ivy.Container(data_loader_spec_args)
    data_loader_spec_args = ivy.Container.combine(data_loader_spec_args,
                                                  ivy.Container(dataset_spec=dataset_spec))

    # load json file
    if isinstance(json_spec_path, str):
        json_spec = json_spec_from_fpath(json_spec_path, 'data_loader_args.json')
    else:
        json_spec = ivy.Container()

    # load from spec dict
    this_spec_cont =\
        ivy.Container(spec_cont['data_loader']) if isinstance(spec_cont, dict) and 'data_loader' in spec_cont \
        else ivy.Container()

    # combine args
    data_loader_spec_args = ivy.Container.combine(json_spec, this_spec_cont, data_loader_spec_args)

    # override data_loader_spec_class if specified in data_loader_spec_args
    data_loader_spec_class = ivy.default(ivy.default(
        _import_arg_specified_class_if_present(data_loader_spec_args, 'data_loader_spec_class'),
        data_loader_spec_class, rev=class_priority), DataLoaderSpec)

    # return data loader specification
    return data_loader_spec_class(**data_loader_spec_args)
def command_line_str_to_spec_cont(spec_str):
    """ parse a command line specification string into an ivy Container """
    if spec_str is not None:
        spec_cont = ivy.Container(json.loads(spec_str.replace("'", '"')))
    else:
        spec_cont = ivy.Container()
    all_keys = ['dataset_dirs', 'dataset', 'data_loader', 'network', 'trainer', 'tuner']
    for key in spec_cont.keys():
        if key not in all_keys:
            raise Exception('spec dict keys must all be one of {}, but found {}'.format(all_keys, key))
    for key in all_keys:
        if key not in spec_cont:
            spec_cont[key] = ivy.Container()
    return spec_cont
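# Hypothetical usage sketch for command_line_str_to_spec_cont (values illustrative).
# Single quotes in the command-line string are swapped for double quotes before
# json.loads, and any missing top-level keys are filled with empty containers:
#
#   spec_cont = command_line_str_to_spec_cont("{'trainer': {'total_iterations': 100}}")
#   spec_cont.trainer.total_iterations  # -> 100
#   'network' in spec_cont              # -> True, filled in as an empty container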
def test_format_dataset_containers(dev_str, call):
    this_dir = os.path.dirname(os.path.realpath(__file__))
    orig_cont_dir = os.path.join(this_dir, 'dataset/containers')
    cont_to_format_dir = os.path.join(this_dir, 'dataset/containers_to_format')
    shutil.rmtree(cont_to_format_dir)
    shutil.copytree(orig_cont_dir, cont_to_format_dir)

    # from format file
    cont_format_fpath = os.path.join(this_dir, 'dataset/new_container_format.json')
    main(cont_to_format_dir, cont_format_fpath=cont_format_fpath)
    new_cont_format = ivy.Container.from_disk_as_json(cont_format_fpath)
    new_cont_fnames = os.listdir(cont_to_format_dir)
    for new_cont_fname in new_cont_fnames:
        new_cont_fpath = os.path.join(cont_to_format_dir, new_cont_fname)
        new_cont = ivy.Container.from_disk_as_json(new_cont_fpath)
        assert ivy.Container.identical([new_cont, new_cont_format], check_types=False)
    shutil.rmtree(cont_to_format_dir)
    shutil.copytree(orig_cont_dir, cont_to_format_dir)

    # from format string
    cont_format_as_str = '{"discounts": true, "rewards": true, "step_types": true, "array": true}'
    main(cont_to_format_dir, cont_format_as_str)
    new_cont_format = ivy.Container(json.loads(cont_format_as_str))
    new_cont_fnames = os.listdir(cont_to_format_dir)
    for new_cont_fname in new_cont_fnames:
        new_cont_fpath = os.path.join(cont_to_format_dir, new_cont_fname)
        new_cont = ivy.Container.from_disk_as_json(new_cont_fpath)
        assert ivy.Container.identical([new_cont, new_cont_format], check_types=False)
    shutil.rmtree(cont_to_format_dir)
    shutil.copytree(orig_cont_dir, cont_to_format_dir)
def get_json_args(json_spec_path, keys_to_ignore, keychains_to_ignore, keychain_to_show, defaults=False,
                  store_duplicates=False, current_dir_only=False, spec_names=None):
    spec_names = ivy.default(spec_names, [item.split('.json')[0] for item in os.listdir(json_spec_path)
                                          if '.json' in item])
    if defaults:
        defaults = '.defaults'
    else:
        defaults = ''
    cont = ivy.Container()
    if current_dir_only:
        for spec_name in spec_names:
            fpath = os.path.join(json_spec_path, spec_name + '.json' + defaults)
            if os.path.isfile(fpath):
                cont[spec_name] = parse_json_to_cont(fpath)
    else:
        for spec_name in spec_names:
            cont[spec_name] = \
                json_spec_from_fpath(json_spec_path, spec_name + '.json' + defaults, store_duplicates)
    for keychain_to_ignore in keychains_to_ignore:
        if keychain_to_ignore in cont:
            cont[keychain_to_ignore] = 'not_shown'
    cont = cont.set_at_keys(dict(zip(keys_to_ignore, ['not_shown'] * len(keys_to_ignore))))
    if ivy.exists(keychain_to_show):
        cont = cont[keychain_to_show]
    return cont
def __init__(self, spec: NetworkSpec, v=None) -> None:
    """ base class for any networks """
    self._v_in = v
    self._spec = spec
    self._subnets = ivy.Container()
    super(NetworkGroup, self).__init__(spec, v=v)
def get_next_batch(self, dataset_key='training'):
    data = self._data[dataset_key]
    if self._spec.shuffle:
        self._i = np.random.randint(0, self._num_examples)
    else:
        self._i = (self._i + 1) % self._num_examples
    return ivy.Container(input=data.input[self._i:self._i + 1],
                         target=data.targets[self._i:self._i + 1])
def trainer_to_spec_args_dict(trainer):
    args_dict = dict()
    args_dict['data_loader_class'] = _obj_to_class_str(trainer.spec.data_loader)
    args_dict['network_class'] = _obj_to_class_str(trainer.spec.network)
    args_dict['trainer_class'] = _obj_to_class_str(trainer)
    args_dict['dataset_dirs_args'] = ivy.Container(trainer.spec.data_loader.spec.dataset_spec.dirs.kwargs).to_dict()
    args_dict['dataset_dirs_class'] = _obj_to_class_str(trainer.spec.data_loader.spec.dataset_spec.dirs)
    args_dict['dataset_spec_args'] = ivy.Container(trainer.spec.data_loader.spec.dataset_spec.kwargs).to_dict()
    args_dict['dataset_spec_class'] = _obj_to_class_str(trainer.spec.data_loader.spec.dataset_spec)
    args_dict['data_loader_spec_args'] = ivy.Container(trainer.spec.data_loader.spec.kwargs).to_dict()
    args_dict['data_loader_spec_class'] = _obj_to_class_str(trainer.spec.data_loader.spec)
    args_dict['network_spec_args'] = ivy.Container(trainer.spec.network.spec.kwargs).to_dict()
    args_dict['network_spec_class'] = _obj_to_class_str(trainer.spec.network.spec)
    args_dict['trainer_spec_args'] = ivy.Container(trainer.spec.kwargs).prune_key_chains(
        ['data_loader', 'network']).to_dict()
    args_dict['trainer_spec_class'] = _obj_to_class_str(trainer.spec)
    return args_dict
def save(self, checkpoint_path: str) -> None:
    """
    save the network weights and optimizer state in checkpoint file

    :param checkpoint_path: path of the checkpoint file for saving the weights and optimizer state
    """
    checkpoint = ivy.Container({'network': self._network.v,
                                'optimizer': self._optimizer.state})
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
    checkpoint.to_disk_as_hdf5(checkpoint_path)
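# Hypothetical restore sketch mirroring the save above. The from_disk_as_hdf5
# reader is assumed to be the counterpart of to_disk_as_hdf5, and the network
# and optimizer attribute/method names are assumptions, not taken from the
# library:
#
#   checkpoint = ivy.Container.from_disk_as_hdf5(checkpoint_path)
#   self._network.v = checkpoint.network
#   self._optimizer.set_state(checkpoint.optimizer)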
def save(self, step):
    checkpoint = ivy.Container({'network': self._checkpoint.net.v,
                                'optimizer': self._checkpoint.optimizer.state})
    self._latest_checkpoint_fpath = os.path.join(self._directory, 'chkpt-{}.hdf5'.format(step))
    checkpoint.to_disk_as_hdf5(self._latest_checkpoint_fpath)
def _load_json_files(containers):
    read_files = list()
    for j_fpath in containers.fpaths:
        if j_fpath != '':
            with open(j_fpath, 'r') as file:
                read_str = file.read()
        else:
            read_str = ''
        read_files.append(read_str)
    return ivy.Container({'json_str': read_files})
def _init(self, num_processes):
    self._x = [ivy.array([0, 1]), ivy.array([2, 3, 4, 5, 6, 7, 8, 9])]
    dataset_container = ivy.Container({'x': self._x})
    dataset = Dataset(dataset_container, 'base', dataset_container.shape[0],
                      num_processes=num_processes)
    dataset = dataset.unbatch('unbatched', num_processes=num_processes)
    self._dataset = dataset.batch('batched', 3, num_processes=num_processes)
def _build_subnets(self, *args, **kwargs) -> ivy.Container:
    """ Build the network subnets, returning a container of the per-subnet build results. """
    built_rets = list()
    for k, subnet_spec in self._spec.subnets.items():
        subnet = subnet_spec.network_class(subnet_spec,
                                           v=ivy.default(lambda: self._v_in[k], None, True))
        built_rets.append(subnet.build(*args, dev_str=self._dev_str, **kwargs))
        self._subnets[k] = subnet
    return ivy.Container(dict(zip(self._spec.subnets.keys(), built_rets)))
def main(compile_mode=False):
    current_dir = os.path.dirname(os.path.abspath(__file__))

    # dataset dirs specification
    dataset_dirs_args = dict()

    # dataset specification
    dataset_spec_filepath = os.path.join(current_dir, 'json_specs', 'dataset_spec.json.example')
    dataset_spec_args = builder.parse_json_to_cont(dataset_spec_filepath)

    # data loader specification
    data_loader_spec_filepath = os.path.join(current_dir, 'json_specs', 'data_loader_spec.json.example')
    data_loader_spec_args = builder.parse_json_to_cont(data_loader_spec_filepath)

    # network specification
    network_spec_filepath = os.path.join(current_dir, 'json_specs', 'network_spec.json.example')
    network_spec_args = builder.parse_json_to_cont(network_spec_filepath)

    # trainer specification
    trainer_spec_filepath = os.path.join(current_dir, 'json_specs', 'trainer_spec.json.example')
    trainer_spec_args = builder.parse_json_to_cont(trainer_spec_filepath)

    # In all the above cases, the user could override the loaded json file dicts with command line args if desired,
    # before passing them into the TrainingJob for specification class construction, after which they are read-only
    trainer = builder.build_trainer(ExampleDataLoader, ExampleNetwork, ExampleTrainer,
                                    dataset_dirs_args=dataset_dirs_args,
                                    dataset_dirs_class=ExampleDatasetDirs,
                                    dataset_spec_args=dataset_spec_args,
                                    dataset_spec_class=ExampleDatasetSpec,
                                    data_loader_spec_args=data_loader_spec_args,
                                    data_loader_spec_class=ExampleDataLoaderSpec,
                                    network_spec_args=network_spec_args,
                                    network_spec_class=ExampleNetworkSpec,
                                    trainer_spec_args=trainer_spec_args,
                                    spec_cont=ivy.Container({'trainer': {'compile_mode': compile_mode}}))
    trainer.setup()
    trainer.train()
    trainer.close()
    print("Finished complete example!")
def _init(self, array_shape, num_processes):
    x = [ivy.array([[0], [1], [2]]),
         ivy.array([[3], [4], [5]]),
         ivy.array([[6], [7], [8]])]
    self._x = [ivy.reshape(item, array_shape) for item in x]
    # build the dataset container from the reshaped arrays
    dataset_container = ivy.Container({'x': self._x})
    dataset = Dataset(dataset_container, 'base', dataset_container.shape[0],
                      num_processes=num_processes)
    self._dataset = dataset.unbatch('unbatched', num_processes=num_processes)
def __init__(self, data_loader_spec):
    super().__init__(data_loader_spec)

    # dataset size
    self._num_examples = self._spec.dataset_spec.num_examples

    # counter
    self._i = 0

    # load vector data
    vector_dim = self._spec.dataset_spec.vector_dim
    self._targets = ivy.zeros((self._num_examples, vector_dim, 1))

    # load image data
    image_dims = self._spec.dataset_spec.image_dims
    self._input = ivy.ones((self._num_examples, image_dims[0], image_dims[1], 3))

    self._training_data = ivy.Container(targets=self._targets, input=self._input)
    self._validation_data = ivy.Container(targets=self._targets, input=self._input)
    self._data = ivy.Container(training=self._training_data,
                               validation=self._validation_data)
def _init(self, array_shape, num_processes):
    x = [ivy.array(0), ivy.array(1), ivy.array(2),
         ivy.array(3), ivy.array(4), ivy.array(5),
         ivy.array(6), ivy.array(7), ivy.array(8)]
    self._x = [ivy.reshape(item, array_shape) for item in x]
    dataset_container = ivy.Container({'x': self._x})
    self._dataset = Dataset(dataset_container, 'base', dataset_container.shape[0],
                            num_processes=num_processes)
def _compute_num_workers(self):

    # init
    num_workers = self._total_num_workers
    self._num_workers = ivy.Container()

    # prefetch
    self._num_workers.prefetch = int(self._spec.with_prefetching) + 1
    num_workers = math.ceil(num_workers / self._num_workers.prefetch)

    # post processed
    self._num_workers.post_processed = 1

    # from numpy
    self._num_workers.from_np = 1

    # batched
    self._num_workers.batched = 1

    # loaded data
    self._num_workers.loaded_data = min(num_workers, self._batch_size)

    # ToDo: add multi-processing support for these lower level datasets

    # shuffled
    self._num_workers.shuffled = 1

    # unbatched
    self._num_workers.unbatched = 1

    # windowed
    self._num_workers.windowed = 1

    # valid first frames
    self._num_workers.valid_first_frames = 1

    # keychain pruned
    self._num_workers.keychain_pruned = 1

    # parsed json
    self._num_workers.parsed_json = 1

    # loaded json
    self._num_workers.loaded_json = 1
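# Worked example of the arithmetic above (numbers illustrative): with
# self._total_num_workers = 8, with_prefetching = True and self._batch_size = 3:
#
#   prefetch    = int(True) + 1  -> 2
#   num_workers = ceil(8 / 2)    -> 4
#   loaded_data = min(4, 3)      -> 3
#
# so the prefetch and loaded_data levels share the worker budget, while every
# other dataset level is currently pinned to a single worker.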
def _init(self, array_shape, num_processes):
    x = [ivy.array(0), ivy.array(1), ivy.array(2), ivy.array(3), ivy.array(4),
         ivy.array(5), ivy.array(6), ivy.array(7), ivy.array(8), ivy.array(9)]
    self._x = [ivy.reshape(item, array_shape) for item in x]

    def sleep_fn(cont):
        start_time = time.perf_counter()
        while True:
            if time.perf_counter() - start_time > 0.011:
                return cont

    dataset_container = ivy.Container({'x': self._x})

    # without pre-fetch
    dataset_wo_prefetch = Dataset(copy.deepcopy(dataset_container), 'base', dataset_container.shape[0],
                                  with_caching=False, cache_size=0, num_processes=num_processes)
    self._dataset_wo_prefetch = dataset_wo_prefetch.map('sleep', sleep_fn)

    # with pre-fetch
    dataset_w_prefetch = Dataset(copy.deepcopy(dataset_container), 'base', dataset_container.shape[0],
                                 with_caching=False, cache_size=0, num_processes=num_processes)
    dataset_w_prefetch = dataset_w_prefetch.map('sleep', sleep_fn)
    self._dataset_w_prefetch = dataset_w_prefetch.prefetch('prefetch', 1)
def parse_json_to_cont(json_filepath):
    """ return the data from the json file in the form of an ivy Container """
    return_cont = ivy.Container()
    with open(json_filepath) as json_data_file:
        loaded_dict = json.load(json_data_file)
    for k, v in loaded_dict.items():
        if k == 'parents':
            rel_fpaths = v
            for rel_fpath in rel_fpaths:
                if rel_fpath[-5:] == '.json':
                    parent_json_fname = rel_fpath.split('/')[-1]
                else:
                    parent_json_fname = json_filepath.split('/')[-1]
                    rel_fpath = os.path.join(rel_fpath, parent_json_fname)
                rel_fpath = os.path.normpath(rel_fpath)
                fpath = os.path.normpath(os.path.join('/'.join(json_filepath.split('/')[:-1]), rel_fpath))
                fdir = '/'.join(fpath.split('/')[:-1])
                return_cont = ivy.Container.combine(return_cont,
                                                    json_spec_from_fpath(fdir, parent_json_fname))
    return ivy.Container.combine(return_cont, loaded_dict)
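# Hypothetical usage sketch for parse_json_to_cont (file contents illustrative).
# A json file may declare 'parents' as a list of relative paths; each parent is
# resolved via json_spec_from_fpath and combined in, with the current file's own
# keys winning since it is the right-most argument to ivy.Container.combine
# (assumed to prioritise later arguments):
#
#   json_args/specs/base.json   -> {"lr": 0.1, "parents": ["../shared"]}
#   json_args/shared/base.json  -> {"lr": 0.01, "batch_size": 32}
#
#   cont = parse_json_to_cont('json_args/specs/base.json')
#   cont.lr          # -> 0.1, the child file overrides the parent
#   cont.batch_size  # -> 32, inherited from the parent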
def format_containers(cont_dir, cont_format, cont_format_file):
    if cont_format:
        key_chains = ivy.Container(json.loads(cont_format))
    else:
        key_chains = ivy.Container.from_disk_as_json(cont_format_file)
    cont_fnames = os.listdir(cont_dir)
    cont_fnames.sort()
    num_conts = len(cont_fnames)
    num_logs = 100
    log_freq = max(num_conts // num_logs, 1)
    for i, cont_fname in enumerate(cont_fnames):
        if i % log_freq == 0:
            logging.info('reformatting container {} of {}...'.format(i, num_conts))
        cont_fpath = os.path.join(cont_dir, cont_fname)
        cont = ivy.Container.from_disk_as_json(cont_fpath)
        cont = cont.at_key_chains(key_chains)
        cont.to_disk_as_json(cont_fpath)
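# Hypothetical usage sketch for format_containers (paths illustrative): rewrite
# every container json file in a directory so that it only keeps the key-chains
# named in the format, here passed directly as a json string rather than a file:
#
#   format_containers('dataset/containers', '{"rewards": true, "array": true}', None)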
def build_tuner_spec(data_loader_class=None,
                     network_class=None,
                     trainer_class=None,
                     dataset_dirs_args=None,
                     dataset_dirs_class=None,
                     dataset_dirs=None,
                     dataset_spec_args=None,
                     dataset_spec_class=None,
                     dataset_spec=None,
                     data_loader_spec_args=None,
                     data_loader_spec_class=None,
                     data_loader_spec=None,
                     data_loader=None,
                     network_spec_args=None,
                     network_spec_class=None,
                     network_spec=None,
                     network=None,
                     trainer_spec_args=None,
                     trainer_spec_class=None,
                     trainer_spec=None,
                     trainer=None,
                     tuner_spec_args=None,
                     tuner_spec_class=None,
                     json_spec_path=None,
                     spec_cont=None,
                     class_priority=False):
    """ build tuner specification """

    # define tuner specification arguments
    if tuner_spec_args is None:
        tuner_spec_args = dict()
    tuner_spec_args = ivy.Container(tuner_spec_args)

    # load json file
    if isinstance(json_spec_path, str):
        json_spec = json_spec_from_fpath(json_spec_path, 'tuner_args.json')
    else:
        json_spec = ivy.Container()

    # load from spec dict
    this_spec_cont =\
        ivy.Container(spec_cont['tuner']) if isinstance(spec_cont, dict) and 'tuner' in spec_cont \
        else ivy.Container()

    # combine args
    tuner_spec_args = ivy.Container.combine(json_spec, this_spec_cont, tuner_spec_args)

    # override tuner_spec_class if specified in tuner_spec_args
    tuner_spec_class = ivy.default(ivy.default(
        _import_arg_specified_class_if_present(tuner_spec_args, 'tuner_spec_class'),
        tuner_spec_class, rev=class_priority), TunerSpec)

    # set framework
    ivy.set_framework(tuner_spec_class(None, **tuner_spec_args).framework)

    # build trainer
    trainer = ivy.default(
        trainer,
        build_trainer(
            data_loader_class=data_loader_class,
            network_class=network_class,
            trainer_class=trainer_class,
            dataset_dirs_args=dataset_dirs_args,
            dataset_dirs_class=dataset_dirs_class,
            dataset_dirs=dataset_dirs,
            dataset_spec_args=dataset_spec_args,
            dataset_spec_class=dataset_spec_class,
            dataset_spec=dataset_spec,
            data_loader_spec_args=data_loader_spec_args,
            data_loader_spec_class=data_loader_spec_class,
            data_loader_spec=data_loader_spec,
            data_loader=data_loader,
            network_spec_args=network_spec_args,
            network_spec_class=network_spec_class,
            network_spec=network_spec,
            network=network,
            trainer_spec_args=trainer_spec_args,
            trainer_spec_class=trainer_spec_class,
            trainer_spec=trainer_spec,
            json_spec_path=json_spec_path,
            spec_cont=spec_cont))

    # return tuner specification
    return tuner_spec_class(trainer, **tuner_spec_args)
def __getitem__(self, slice_obj):
    if not self._workers_initialized:
        self._initialize_all_workers()
    if self._numpy_loading:
        ivy.set_framework('numpy')
    if self._num_processes < 2 or isinstance(slice_obj, numbers.Number):
        ret = self._get_item(slice_obj)
        if self._numpy_loading:
            ivy.unset_framework()
        self._first_pass = False
        return ret
    slice_size = int(round(slice_obj.stop - slice_obj.start))
    num_sub_slices = min(slice_size, self._num_processes)
    slice_points = np.linspace(slice_obj.start, slice_obj.stop, num_sub_slices + 1)
    slice_sizes = np.round(slice_points[1:] - slice_points[:-1]).astype(np.int32)
    if Dataset._is_int(slice_obj.start) and Dataset._is_int(slice_obj.stop):
        slice_points = np.round(slice_points)
    sub_slices = [slice(slice_points[i], slice_points[i + 1], 1.)
                  for i in range(num_sub_slices)]
    if self._prefetching:
        self._queue_offset = int(not self._queue_offset)
    else:
        self._queue_offset = np.random.randint(0, self._num_processes)
    q_idxs = [int((i + self._queue_offset) % self._num_processes)
              for i in range(len(sub_slices))]
    slice_queues = [self._slice_queues[qi] for qi in q_idxs]
    output_queues = [self._output_queues[qi] for qi in q_idxs]
    if self._prefetching:
        if self._first_pass:
            [slice_queue.put(sub_slice)
             for slice_queue, sub_slice in zip(slice_queues, sub_slices)]
        else:
            slice_queues[-1].put(sub_slices[-1])
        if self._numpy_loading:
            ivy.unset_framework()
        self._first_pass = False
        return ivy.Container(queues=output_queues,
                             queue_load_sizes=slice_sizes,
                             queue_timeout=self._queue_timeout)
    else:
        [slice_queue.put(sub_slice)
         for slice_queue, sub_slice in zip(slice_queues, sub_slices)]
        if ivy.wrapped_mode():
            items_as_lists = [ivy.Container(output_queue.get(timeout=self._queue_timeout)).to_ivy()
                              for output_queue in output_queues]
        else:
            items_as_lists = [ivy.Container(output_queue.get(timeout=self._queue_timeout))
                              for output_queue in output_queues]
        if self._numpy_loading:
            ivy.unset_framework()
        self._first_pass = False
        return ivy.Container.list_join(items_as_lists)
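# Worked example of the slice splitting above (numbers illustrative): with
# slice_obj = slice(0, 10) and self._num_processes = 4:
#
#   slice_size     = 10
#   num_sub_slices = min(10, 4)  -> 4
#   slice_points   = [0.0, 2.5, 5.0, 7.5, 10.0]
#                    -> rounded to [0., 2., 5., 8., 10.] for integer bounds
#                       (np.round rounds half to even, so 2.5 -> 2 and 7.5 -> 8)
#   sub_slices     = slice(0, 2), slice(2, 5), slice(5, 8), slice(8, 10)
#
# each sub-slice is pushed to one worker's slice queue, and the per-worker
# results are re-joined with ivy.Container.list_join (or returned as queues
# when prefetching).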
def get_first_batch(self, dataset_key=None):
    return ivy.Container(x=ivy.array([[1.]] * self._spec.batch_size, dev_str=self._spec.dev_strs[0]),
                         target=ivy.array([[0.]] * self._spec.batch_size, dev_str=self._spec.dev_strs[0]))
def _get_dataset(self, starting_example, ending_example):

    class ContainerIdxMap:

        def __init__(self, sizes, fpath_template=None, seq_idxs=None, start=None, end=None,
                     max_seq_len=None, conts_to_skip=None, pruned_sizes=None):
            if isinstance(sizes, (tuple, list)):
                pruned_sizes = ivy.default(pruned_sizes, [
                    SeqDataLoader._compute_seq_len(i, sl, conts_to_skip) for i, sl in enumerate(sizes)])
                num_empty = sum([ps == 0 for ps in pruned_sizes])
                self._raw_sizes = dict(zip(range(start, end + 1 + num_empty),
                                           sizes[start:end + 1 + num_empty]))
                self._pruned_sizes = dict(zip(range(start, end + 1 + num_empty),
                                              pruned_sizes[start:end + 1 + num_empty]))
            elif isinstance(sizes, (int, dict)):
                self._raw_sizes = sizes
                self._pruned_sizes = ivy.default(pruned_sizes, sizes)
                if isinstance(self._pruned_sizes, int):
                    pruned_dict = dict()
                    for seq_idx, win_idx in conts_to_skip:
                        if seq_idx not in pruned_dict:
                            pruned_dict[seq_idx] = list()
                        pruned_dict[seq_idx].append(win_idx)
                    pruned_dict = {k: len(set(v)) for k, v in pruned_dict.items()}
                    pruned_sizes_dict = {k: self._pruned_sizes - num_pruned
                                         for k, num_pruned in pruned_dict.items()}
                    num_empty = sum([size == 0 for size in pruned_sizes_dict.values()])
                    pruned_sizes = collections.defaultdict(lambda: self._pruned_sizes, pruned_sizes_dict)
                else:
                    num_empty = sum([ps == 0 for ps in self._pruned_sizes])
            else:
                raise Exception('Invalid type for sizes, expected one of int, dict, tuple or list, '
                                'but found {} of type {}'.format(sizes, type(sizes)))
            self._constant_size = isinstance(self._raw_sizes, int)
            if max_seq_len:
                self._max_seq_len = max_seq_len
            else:
                self._max_seq_len = self._pruned_sizes if self._constant_size \
                    else max(self._pruned_sizes.values())
            self._fpath_template = fpath_template
            self._conts_to_skip = conts_to_skip
            if seq_idxs:
                self._seq_idxs = seq_idxs
            else:
                vals = [v for i, v in enumerate(range(start, end + 1 + num_empty))
                        if pruned_sizes[i] > 0]
                keys = range(0, min(end - start + 1 + num_empty, len(vals)))
                self._seq_idxs = dict(zip(keys, vals))

        def __getitem__(self, slice_obj):
            if isinstance(slice_obj, slice):
                seq_idxs = collections.OrderedDict([
                    (i, self._seq_idxs[idx]) for i, idx in enumerate(
                        range(slice_obj.start, slice_obj.stop, ivy.default(slice_obj.step, 1)))])
            elif isinstance(slice_obj, int):
                seq_idxs = collections.OrderedDict({0: self._seq_idxs[slice_obj]})
            else:
                raise Exception('Invalid type for slice_obj, expected either slice or int, '
                                'but found {} of type {}'.format(slice_obj, type(slice_obj)))
            if self._constant_size:
                sizes = self._raw_sizes
            else:
                sizes = collections.OrderedDict({seq_idx: self._raw_sizes[seq_idx]
                                                 for seq_idx in seq_idxs.values()})
            return ContainerIdxMap(sizes,
                                   self._fpath_template,
                                   seq_idxs,
                                   max_seq_len=self._max_seq_len,
                                   conts_to_skip=self._conts_to_skip,
                                   pruned_sizes=self._pruned_sizes)

        def __len__(self):
            return len(self._seq_idxs)

        def shuffle(self):
            mapped_idxs = list(self._seq_idxs.values())
            np.random.shuffle(mapped_idxs)
            self._seq_idxs = collections.OrderedDict(zip(self._seq_idxs.keys(), mapped_idxs))

        def to_idxs(self):
            seq_idxs = self._seq_idxs.values()
            sizes = [self._raw_sizes if self._constant_size else self._raw_sizes[seq_idx]
                     for seq_idx in seq_idxs]
            rets = [[(seq_idx, win_idx) for win_idx in range(size)
                     if not SeqDataLoader._skip_cont(seq_idx, win_idx, self._conts_to_skip)]
                    for seq_idx, size in zip(seq_idxs, sizes)]
            return [r + [(None, None)] * (self._max_seq_len - len(r))
                    for r in rets if list(set(r)) != [None]]

        def to_filepaths(self):
            if not ivy.exists(self._fpath_template):
                raise Exception('to_filepaths method is not valid if fpath_template has not been specified '
                                'in the constructor.')
            seq_idxs = self._seq_idxs.values()
            sizes = [self._raw_sizes if self._constant_size else self._raw_sizes[seq_idx]
                     for seq_idx in seq_idxs]
            rets = [[self._fpath_template % (seq_idx, win_idx) for win_idx in range(size)
                     if not SeqDataLoader._skip_cont(seq_idx, win_idx, self._conts_to_skip)]
                    for seq_idx, size in zip(seq_idxs, sizes)]
            return [r + [''] * (self._max_seq_len - len(r))
                    for r in rets if ''.join(r) != '']

        @property
        def sizes(self):
            return self._pruned_sizes

    # container filepaths
    if self._spec.container_load_mode in ['preload', 'dynamic']:
        fpath_template = os.path.join(self._container_data_dir,
                                      self._spec.dataset_spec.cont_fname_template)
    else:
        fpath_template = None
    container_idx_map = ContainerIdxMap(self._spec.dataset_spec.unpruned_sequence_lengths,
                                        fpath_template,
                                        start=starting_example,
                                        end=ending_example,
                                        conts_to_skip=self._spec.containers_to_skip)
    if self._spec.num_sequences != -1:
        container_idx_map = container_idx_map[0:self._spec.num_sequences]

    # shuffle sequences
    if self._spec.preshuffle_data:
        container_idx_map.shuffle()

    # extract sequence lengths
    if self._fixed_sequence_length:
        self._sequence_lengths = collections.OrderedDict(
            zip(range(len(container_idx_map)),
                [self._spec.dataset_spec.sequence_lengths] * len(container_idx_map)))
        self._windows_per_seq = self._sequence_lengths[0] - self._window_size + 1

        # windowing values
        window_idxs_per_seq = ivy.reshape(ivy.arange(self._windows_per_seq, 0, 1),
                                          (self._windows_per_seq, 1))
        gather_idxs_list = list()
        for x in window_idxs_per_seq:
            gather_idxs_list.append(ivy.expand_dims(ivy.arange(x[0] + self._window_size, x[0], 1), 0))
        gather_idxs = ivy.concatenate(gather_idxs_list, 0)
        self._gather_idxs = ivy.to_numpy(
            ivy.reshape(gather_idxs, (self._windows_per_seq * self._window_size, 1))).tolist()
    else:
        self._sequence_lengths = container_idx_map.sizes

    # maybe pre-load containers
    if self._spec.container_load_mode == 'preload':

        # load containers with vector data and image filepath entries
        container_slices = self._get_containers_w_filepath_img_entries_as_tensor_slices(
            container_idx_map.to_filepaths())
        if self._first_frame_validity_fn is not None:
            container_slices = self._first_frame_validity_fn(
                container_slices, [ending_example - starting_example + 1])

        # prune unwanted chains of keys
        if 'unused_key_chains' in self._spec:
            container_slices = self._prune_unused_key_chains(container_slices)

        dataset = Dataset(ivy.Container.list_stack(
            [c[0] for c in container_slices.unstack(0, container_slices.shape[0])], 0),
            'base',
            container_slices.shape[0],
            numpy_loading=True,
            cache_size=self._base_cache_size,
            queue_timeout=self._spec.queue_timeout)
    else:
        if self._spec.container_load_mode == 'dynamic':

            # load containers with filepath entries
            dataset = Dataset(ivy.Container({'fpaths': container_idx_map}),
                              'base',
                              len(container_idx_map),
                              trans_fn=lambda cont: cont.map(lambda x_, kc: x_.to_filepaths()),
                              elementwise_query_fn=False,
                              numpy_loading=True,
                              cache_size=self._base_cache_size,
                              queue_timeout=self._spec.queue_timeout)
            dataset = dataset.map('loaded_json', self._load_json_files, self._num_workers.loaded_json)
            dataset = dataset.map('parsed_json', self._parse_json_strings, self._num_workers.parsed_json)
        else:
            dataset = Dataset(ivy.Container({'idx_map': container_idx_map}),
                              'base',
                              len(container_idx_map),
                              trans_fn=lambda cont: self._spec.custom_container_load_fn(self, cont),
                              elementwise_query_fn=False,
                              numpy_loading=True,
                              cache_size=self._base_cache_size,
                              queue_timeout=self._spec.queue_timeout)
        if 'unused_key_chains' in self._spec:
            dataset = dataset.map('keychain_pruned', self._prune_unused_key_chains,
                                  self._num_workers.keychain_pruned)
        if self._first_frame_validity_fn is not None:
            dataset = dataset.map('valid_first_frames',
                                  lambda x_: self._first_frame_validity_fn(x_, None),
                                  self._num_workers.valid_first_frames)
    if not (self._spec.dataset_spec.sequence_lengths == 1 and self._window_size == 1):
        # ToDo: add other conditionals which make the loading more efficient if only one of the
        #  above two conditions is True
        dataset = dataset.map('windowed', self._group_container_into_windowed_container,
                              self._num_workers.windowed)
        dataset = dataset.unbatch('unbatched',
                                  self._num_workers.unbatched,
                                  batch_sizes=[max(seq_len, self._window_size) - self._window_size + 1
                                               for seq_len in self._sequence_lengths.values()
                                               if seq_len > 0])
    if self._spec.shuffle_buffer_size > 0:
        dataset = dataset.shuffle('shuffled', self._spec.shuffle_buffer_size, self._num_workers.shuffled)
    dataset = dataset.map('loaded_data', self._load_data_from_filepath_tensors,
                          self._num_workers.loaded_data)
    dataset = dataset.batch('batched', self._batch_size, self._num_workers.batched)
    dataset = dataset.map('from_np',
                          lambda cont: cont.map(lambda x_, kc: ivy.array(x_, dev_str='cpu')),
                          self._num_workers.from_np,
                          numpy_loading=False)
    if ivy.exists(self._spec.post_proc_fn):
        dataset = dataset.map('post_processed', self._spec.post_proc_fn,
                              self._num_workers.post_processed)
    if self._spec.with_prefetching:
        dataset = dataset.prefetch('prefetch')
    # ToDo: find way to make pre-fetching to GPU actually pre-fetch, ideally using multi-processing.
    #  For example, swapping prefetch and to_gpu ops around would work if to_gpu could accept self._num_workers.
    if self._spec.prefetch_to_devs:
        if isinstance(self._spec.prefetch_to_devs, str):
            dataset = dataset.to_dev('to_dev', self._spec.prefetch_to_devs)
        elif len(self._spec.prefetch_to_devs) == 1:
            dataset = dataset.to_dev('to_dev', self._spec.prefetch_to_devs[0])
        else:
            dataset = dataset.to_devs('to_devs', self._spec.prefetch_to_devs)
    return dataset
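# Summary of the three container load modes handled above (drawn from the code;
# mode names as used in the spec):
#   'preload' - all containers are loaded up front and served as tensor slices
#   'dynamic' - only filepaths are stored, and the json files are read and
#               parsed per query via the loaded_json / parsed_json map stages
#   otherwise - a user-provided custom_container_load_fn loads each container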
def build_trainer_spec(data_loader_class=None,
                       network_class=None,
                       dataset_dirs_args=None,
                       dataset_dirs_class=None,
                       dataset_dirs=None,
                       dataset_spec_args=None,
                       dataset_spec_class=None,
                       dataset_spec=None,
                       data_loader_spec_args=None,
                       data_loader_spec_class=None,
                       data_loader_spec=None,
                       data_loader=None,
                       network_spec_args=None,
                       network_spec_class=None,
                       network_spec=None,
                       network=None,
                       trainer_spec_args=None,
                       trainer_spec_class=None,
                       json_spec_path=None,
                       spec_cont=None,
                       class_priority=False):
    """ build trainer specification """

    # build data loader
    data_loader = ivy.default(
        data_loader,
        build_data_loader(
            data_loader_class=data_loader_class,
            dataset_dirs_args=dataset_dirs_args,
            dataset_dirs_class=dataset_dirs_class,
            dataset_dirs=dataset_dirs,
            dataset_spec_args=dataset_spec_args,
            dataset_spec_class=dataset_spec_class,
            dataset_spec=dataset_spec,
            data_loader_spec_args=data_loader_spec_args,
            data_loader_spec_class=data_loader_spec_class,
            data_loader_spec=data_loader_spec,
            json_spec_path=json_spec_path,
            spec_cont=spec_cont))

    # build network
    network = ivy.default(
        network,
        build_network(
            network_class=network_class,
            dataset_dirs_args=dataset_dirs_args,
            dataset_dirs_class=dataset_dirs_class,
            dataset_spec_args=dataset_spec_args,
            dataset_spec_class=dataset_spec_class,
            network_spec_args=network_spec_args,
            network_spec_class=network_spec_class,
            network_spec=network_spec,
            json_spec_path=json_spec_path,
            spec_cont=spec_cont))

    # define trainer specification arguments
    if trainer_spec_args is None:
        trainer_spec_args = dict()
    trainer_spec_args = ivy.Container(trainer_spec_args)
    trainer_spec_args = ivy.Container.combine(trainer_spec_args,
                                              ivy.Container(data_loader=data_loader, network=network))

    # load json file
    if isinstance(json_spec_path, str):
        json_spec = json_spec_from_fpath(json_spec_path, 'trainer_args.json')
    else:
        json_spec = ivy.Container()

    # load from spec dict
    this_spec_cont =\
        ivy.Container(spec_cont['trainer']) if isinstance(spec_cont, dict) and 'trainer' in spec_cont \
        else ivy.Container()

    # combine args
    trainer_spec_args = ivy.Container.combine(json_spec, this_spec_cont, trainer_spec_args)

    # override trainer_spec_class if specified in trainer_spec_args
    trainer_spec_class = ivy.default(ivy.default(
        _import_arg_specified_class_if_present(trainer_spec_args, 'trainer_spec_class'),
        trainer_spec_class, rev=class_priority), TrainerSpec)

    # return trainer specification
    return trainer_spec_class(**trainer_spec_args)
def get_first_batch(self, dataset_key='training'):
    data = self._data[dataset_key]
    return ivy.Container(input=data.input[0:1], target=data.targets[0:1])
def print_json_args(base_dir=None, keys_to_ignore=None, keychains_to_ignore=None):
    if not ivy.exists(base_dir):
        base_dir = os.getcwd()
    ivy.set_framework('numpy')
    parser = argparse.ArgumentParser()
    parser.add_argument('-sd', '--sub_directory', type=str,
                        help='A sub-directory to print the json args for, default is base_dir passed in.')
    parser.add_argument('-dd', '--diff_directory', type=str,
                        help='The directory from which to compare the difference in specifications.')
    parser.add_argument('-kti', '--keys_to_ignore', type=str, default=keys_to_ignore,
                        help='Keys to ignore when printing the specification.')
    parser.add_argument('-kcti', '--keychains_to_ignore', type=str, default=keychains_to_ignore,
                        help='Key-chains to ignore when printing the specification.')
    parser.add_argument('-kcts', '--keychain_to_show', type=str,
                        help='The key-chain to show. Default is None, in which case all key-chains are shown.')
    parser.add_argument('-sn', '--spec_names', type=str,
                        help='The specification names for the json files. Default is the ivy_builder defaults of '
                             '[ dataset_dirs | dataset | data_loader | network | trainer ]')
    parser.add_argument('-d', '--show_defaults', action='store_true',
                        help='Whether to show the default json arguments. '
                             'If unset then the current arguments are shown, not the default values.')
    parser.add_argument('-c', '--current_dir_only', action='store_true',
                        help='Whether to only show the json arguments for the current directory, '
                             'without searching through parent directories also.')
    parser.add_argument('-sdo', '--show_diff_only', action='store_true',
                        help='Whether to only show the difference between the current directory '
                             'and the diff directory.')
    parser.add_argument('-sso', '--show_same_only', action='store_true',
                        help='Whether to only show the same entries between the current directory '
                             'and the diff directory.')
    parsed_args = parser.parse_args()
    if (parsed_args.show_diff_only or parsed_args.show_same_only) and not parsed_args.diff_directory:
        raise Exception('show_diff_only and show_same_only flags are only applicable if diff_directory is set.')
    if parsed_args.show_diff_only and parsed_args.show_same_only:
        raise Exception('show_diff_only and show_same_only cannot both be set, please choose one to set.')
    if ivy.exists(parsed_args.spec_names):
        spec_names = [kc[1:-1] for kc in ''.join(parsed_args.spec_names[1:-1]).split(', ')]
    else:
        spec_names = None
    if ivy.exists(parsed_args.sub_directory):
        sub_dir = os.path.normpath(os.path.join(base_dir, parsed_args.sub_directory))
    else:
        sub_dir = base_dir
    if ivy.exists(parsed_args.keys_to_ignore):
        keys_to_ignore = [kc[1:-1] for kc in ''.join(parsed_args.keys_to_ignore[1:-1]).split(', ')]
    else:
        keys_to_ignore = list()
    if ivy.exists(parsed_args.keychains_to_ignore):
        keychains_to_ignore = [kc[1:-1] for kc in ''.join(parsed_args.keychains_to_ignore[1:-1]).split(',')]
    else:
        keychains_to_ignore = list()
    these_json_args = get_json_args(
        sub_dir, keys_to_ignore, keychains_to_ignore, parsed_args.keychain_to_show, parsed_args.show_defaults,
        store_duplicates=True, current_dir_only=parsed_args.current_dir_only, spec_names=spec_names)
    if ivy.exists(parsed_args.diff_directory):
        other_sub_dir = os.path.normpath(os.path.join(base_dir, parsed_args.diff_directory))
        if other_sub_dir == sub_dir:
            raise Exception('Invalid diff_directory {} selected, it is the same as the sub_directory {}.'.format(
                other_sub_dir, sub_dir))
        other_json_args = get_json_args(
            other_sub_dir, keys_to_ignore, keychains_to_ignore, parsed_args.keychain_to_show,
            parsed_args.show_defaults, store_duplicates=True, current_dir_only=parsed_args.current_dir_only,
            spec_names=spec_names)
        diff_keys = 'diff'
        for sub_folder, other_sub_folder in zip(sub_dir.split('/'), other_sub_dir.split('/')):
            if sub_folder != other_sub_folder:
                diff_keys = [sub_folder, other_sub_folder]
                break
        if parsed_args.show_diff_only:
            mode = 'diff_only'
        elif parsed_args.show_same_only:
            mode = 'same_only'
        else:
            mode = 'all'
        diff_json_args = ivy.Container.diff(these_json_args, other_json_args, mode=mode, diff_keys=diff_keys)
        keyword_color_dict = {'duplicated': 'magenta'}
        if isinstance(diff_keys, list):
            diff_keys_dict = dict(zip(diff_keys, ['red'] * 2))
            keyword_color_dict = {**keyword_color_dict, **diff_keys_dict}
        print(ivy.Container(diff_json_args, keyword_color_dict=keyword_color_dict))
    else:
        print(ivy.Container(these_json_args, keyword_color_dict={'duplicated': 'magenta'}))
    ivy.unset_framework()