def test_full_trainers(dev_str, call, compile_mode):
    if call is helpers.np_call:
        # numpy does not support gradients, required for training
        pytest.skip()
    if call is helpers.jnp_call and ivy.wrapped_mode():
        # Jax does not support ivy.Array instances when calling _jax.grad()
        pytest.skip()

    # currently only PyTorch supports graph compilation
    compile_mode = compile_mode if ivy.current_framework_str() == 'torch' else False

    # test
    builder_helpers.remove_dirs()
    full_example.main(compile_mode=compile_mode)
    builder_helpers.remove_dirs()
def _worker_fn(index_queue, output_queue, dataset, numpy_loading):
    while True:
        # poll the index queue, looping on timeout so the worker stays responsive
        try:
            slice_obj = index_queue.get(timeout=1.0)
        except queue.Empty:
            continue
        if slice_obj is None:
            # a None entry is the shutdown sentinel: close the dataset and exit
            dataset.close()
            return
        if numpy_loading:
            ivy.set_framework('numpy')
        item = Dataset._slice_dataset_with_error_checks(dataset, slice_obj)
        if numpy_loading:
            ivy.unset_framework()
        if ivy.wrapped_mode():
            item = item.to_native(nested=True)
        # serialize the Container as a dict before sending it back to the parent
        output_queue.put(item.to_dict())
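# A minimal standalone sketch (not library code) of the queue protocol that
# _worker_fn implements: the parent pushes slice requests onto an index queue,
# the worker replies on an output queue, and a None entry tells the worker to
# shut down. The toy echo_worker below is hypothetical and stands in for
# _worker_fn; the real worker also handles framework switching and Container
# serialization.
import multiprocessing


def echo_worker(index_queue, output_queue):
    while True:
        slice_obj = index_queue.get()
        if slice_obj is None:
            # shutdown sentinel, mirroring _worker_fn
            return
        output_queue.put((slice_obj.start, slice_obj.stop))


if __name__ == '__main__':
    idx_q, out_q = multiprocessing.Queue(), multiprocessing.Queue()
    proc = multiprocessing.Process(target=echo_worker, args=(idx_q, out_q))
    proc.start()
    idx_q.put(slice(0, 2))
    print(out_q.get())  # (0, 2)
    idx_q.put(None)  # ask the worker to exit
    proc.join()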
def __getitem__(self, slice_obj):
    if not self._workers_initialized:
        self._initialize_all_workers()
    if self._numpy_loading:
        ivy.set_framework('numpy')

    # single-process or single-item requests are served directly
    if self._num_processes < 2 or isinstance(slice_obj, numbers.Number):
        ret = self._get_item(slice_obj)
        if self._numpy_loading:
            ivy.unset_framework()
        self._first_pass = False
        return ret

    # split the requested slice into one sub-slice per worker process
    slice_size = int(round(slice_obj.stop - slice_obj.start))
    num_sub_slices = min(slice_size, self._num_processes)
    slice_points = np.linspace(slice_obj.start, slice_obj.stop, num_sub_slices + 1)
    slice_sizes = np.round(slice_points[1:] - slice_points[:-1]).astype(np.int32)
    if Dataset._is_int(slice_obj.start) and Dataset._is_int(slice_obj.stop):
        slice_points = np.round(slice_points)
    sub_slices = [
        slice(slice_points[i], slice_points[i + 1], 1.)
        for i in range(num_sub_slices)
    ]

    # alternate the queue offset between passes when prefetching,
    # otherwise pick a random offset
    if self._prefetching:
        self._queue_offset = int(not self._queue_offset)
    else:
        self._queue_offset = np.random.randint(0, self._num_processes)
    q_idxs = [
        int((i + self._queue_offset) % self._num_processes)
        for i in range(len(sub_slices))
    ]
    slice_queues = [self._slice_queues[qi] for qi in q_idxs]
    output_queues = [self._output_queues[qi] for qi in q_idxs]

    if self._prefetching:
        # when prefetching, return the queues themselves rather than waiting
        # on the results; only the final sub-slice is re-queued after the
        # first pass
        if self._first_pass:
            for slice_queue, sub_slice in zip(slice_queues, sub_slices):
                slice_queue.put(sub_slice)
        else:
            slice_queues[-1].put(sub_slices[-1])
        if self._numpy_loading:
            ivy.unset_framework()
        self._first_pass = False
        return ivy.Container(queues=output_queues,
                             queue_load_sizes=slice_sizes,
                             queue_timeout=self._queue_timeout)
    else:
        # otherwise, dispatch all sub-slices and block on the results
        for slice_queue, sub_slice in zip(slice_queues, sub_slices):
            slice_queue.put(sub_slice)
        if ivy.wrapped_mode():
            items_as_lists = [
                ivy.Container(output_queue.get(timeout=self._queue_timeout)).to_ivy()
                for output_queue in output_queues
            ]
        else:
            items_as_lists = [
                ivy.Container(output_queue.get(timeout=self._queue_timeout))
                for output_queue in output_queues
            ]
        if self._numpy_loading:
            ivy.unset_framework()
        self._first_pass = False
        return ivy.Container.list_join(items_as_lists)
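# A standalone sketch (illustrative only, not library code) of the
# slice-splitting arithmetic used in __getitem__ above: a request for
# slice(0, 12) across 4 worker processes is divided at np.linspace
# boundaries, giving one equally-sized sub-slice per worker.
import numpy as np

num_processes = 4
slice_obj = slice(0, 12)
slice_size = int(round(slice_obj.stop - slice_obj.start))
num_sub_slices = min(slice_size, num_processes)
slice_points = np.linspace(slice_obj.start, slice_obj.stop, num_sub_slices + 1)
slice_sizes = np.round(slice_points[1:] - slice_points[:-1]).astype(np.int32)
print(slice_points)  # [ 0.  3.  6.  9. 12.]
print(slice_sizes)   # [3 3 3 3]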
def test_reduced_cost_after_checkpoint_load(dev_str, call, compile_mode):
    if call is helpers.np_call:
        # numpy does not support gradients, required for training
        pytest.skip()
    if call is helpers.jnp_call and ivy.wrapped_mode():
        # Jax does not support ivy.Array instances when calling _jax.grad()
        pytest.skip()

    example_dir = os.path.relpath(
        os.path.join(os.path.dirname(os.path.abspath(__file__)),
                     '../ivy_builder_demos'))

    # currently only PyTorch supports graph compilation
    compile_mode = compile_mode if ivy.current_framework_str() == 'torch' else False

    # dataset dirs specification
    dataset_dirs_args = dict()

    # dataset specification
    dataset_spec_filepath = os.path.join(example_dir, 'json_specs',
                                         'dataset_spec.json.example')
    dataset_spec_args = builder.parse_json_to_cont(dataset_spec_filepath)

    # data loader specification
    data_loader_spec_filepath = os.path.join(example_dir, 'json_specs',
                                             'data_loader_spec.json.example')
    data_loader_spec_args = builder.parse_json_to_cont(data_loader_spec_filepath)

    # network specification
    network_spec_filepath = os.path.join(example_dir, 'json_specs',
                                         'network_spec.json.example')
    network_spec_args = builder.parse_json_to_cont(network_spec_filepath)

    builder_helpers.remove_dirs()

    # train for a single iteration from scratch, saving a checkpoint
    ivy.seed(0)
    trainer_spec_args = {'total_iterations': 1, 'ld_chkpt': False,
                         'save_freq': 1, 'compile_mode': compile_mode}
    trainer = builder.build_trainer(
        ExampleDataLoader, ExampleNetwork, ExampleTrainer,
        dataset_dirs_args=dataset_dirs_args,
        dataset_dirs_class=ExampleDatasetDirs,
        dataset_spec_args=dataset_spec_args,
        dataset_spec_class=ExampleDatasetSpec,
        data_loader_spec_args=data_loader_spec_args,
        data_loader_spec_class=ExampleDataLoaderSpec,
        network_spec_args=network_spec_args,
        network_spec_class=ExampleNetworkSpec,
        trainer_spec_args=trainer_spec_args)
    trainer.setup()
    trainer.train()
    initial_cost = trainer._total_cost
    assert trainer._global_step == 1
    trainer.close()

    # train for ten iterations from scratch, with the same seed
    ivy.seed(0)
    steps_to_take_first = 10
    trainer_spec_args = {'total_iterations': steps_to_take_first,
                         'ld_chkpt': False, 'save_freq': 1,
                         'compile_mode': compile_mode}
    trainer = builder.build_trainer(
        ExampleDataLoader, ExampleNetwork, ExampleTrainer,
        dataset_dirs_args=dataset_dirs_args,
        dataset_dirs_class=ExampleDatasetDirs,
        dataset_spec_args=dataset_spec_args,
        dataset_spec_class=ExampleDatasetSpec,
        data_loader_spec_args=data_loader_spec_args,
        data_loader_spec_class=ExampleDataLoaderSpec,
        network_spec_args=network_spec_args,
        network_spec_class=ExampleNetworkSpec,
        trainer_spec_args=trainer_spec_args)
    trainer.setup()
    trainer.train()
    ten_step_cost = trainer._total_cost
    assert trainer._global_step == steps_to_take_first
    trainer.close()
    assert initial_cost > ten_step_cost

    # resume from the ten-step checkpoint and train up to twenty iterations
    steps_to_take_second = 20
    trainer_spec_args = {'total_iterations': steps_to_take_second,
                         'ld_chkpt': True, 'save_freq': 1,
                         'compile_mode': compile_mode}
    trainer = builder.build_trainer(
        ExampleDataLoader, ExampleNetwork, ExampleTrainer,
        dataset_dirs_args=dataset_dirs_args,
        dataset_dirs_class=ExampleDatasetDirs,
        dataset_spec_args=dataset_spec_args,
        dataset_spec_class=ExampleDatasetSpec,
        data_loader_spec_args=data_loader_spec_args,
        data_loader_spec_class=ExampleDataLoaderSpec,
        network_spec_args=network_spec_args,
        network_spec_class=ExampleNetworkSpec,
        trainer_spec_args=trainer_spec_args)
    trainer.setup()
    trainer.train()
    twenty_step_cost = trainer._total_cost
    assert trainer._global_step == steps_to_take_second
    trainer.close()
    assert ten_step_cost > twenty_step_cost
    builder_helpers.remove_dirs()
def test_tune_resume_training(dev_str, call):
    if call is not helpers.torch_call:
        # ToDo: work out why the backend framework is fixed for tune after
        #  the first call, and include other frameworks in test once this
        #  is fixed
        pytest.skip()
    if ivy.wrapped_mode():
        # this test fails when running all tests for some reason, need to
        #  further investigate
        pytest.skip()
    builder_helpers.remove_dirs()

    # tuner spec args
    train_steps_per_tune_step = 2
    data_loader_spec_args = {'batch_size': 1}
    tuner_spec_args = {
        'framework': ivy.current_framework_str(),
        'train_steps_per_tune_step': train_steps_per_tune_step,
        'trainer_spec': {
            'initial_learning_rate': {
                'min': 10 ** -5,
                'max': 10 ** -4,
                'num_grid_samples': 2,
                'grid': True
            }
        },
        'name': 'tune',
        'num_samples': 1,
        'parallel_trials': 1,
        'grace_period': -1,
        'checkpoint_freq': 0
    }

    # first run
    total_iterations = 5
    trainer_spec_args = {'total_iterations': total_iterations,
                         'ld_chkpt': False, 'log_freq': 1,
                         'log_dir': os.path.join(THIS_DIR, 'log'),
                         'save_freq': 1}
    tuner = builder.build_tuner(ExampleDataLoader, ExampleNetwork,
                                ExampleTrainer,
                                data_loader_spec_args=data_loader_spec_args,
                                trainer_spec_args=trainer_spec_args,
                                tuner_spec_args=tuner_spec_args)
    first_results = ivy.Container(tuner.tune().results)
    first_losses = first_results.at_keys('cost').to_flat_list()

    # second run
    trainer_spec_args = {'total_iterations': total_iterations * 2,
                         'ld_chkpt': True, 'log_freq': 1,
                         'log_dir': os.path.join(THIS_DIR, 'log'),
                         'save_freq': 1}
    tuner = builder.build_tuner(ExampleDataLoader, ExampleNetwork,
                                ExampleTrainer,
                                data_loader_spec_args=data_loader_spec_args,
                                trainer_spec_args=trainer_spec_args,
                                tuner_spec_args=tuner_spec_args)
    second_results = ivy.Container(tuner.tune().results)
    second_losses = second_results.at_keys('cost').to_flat_list()

    # assertion

    # first session ends training at ceil(5/2)=3 timesteps
    first_timestep = int(
        math.ceil(total_iterations / train_steps_per_tune_step))
    assert min([
        fts == first_timestep
        for fts in first_results.at_keys('timestep').to_flat_list()
    ])

    # second session ends training at ceil(10/2)=5 timesteps
    second_timestep = int(
        math.ceil(total_iterations * 2 / train_steps_per_tune_step))
    assert min([
        sts == second_timestep
        for sts in second_results.at_keys('timestep').to_flat_list()
    ])

    # both sessions trained for ceil(5/2)=3 training iterations
    training_iteration = int(
        math.ceil(total_iterations / train_steps_per_tune_step))
    assert min([
        fti == sti == training_iteration for fti, sti in zip(
            first_results.at_keys('training_iteration').to_flat_list(),
            second_results.at_keys('training_iteration').to_flat_list())
    ])

    # the loss is lower for the second session, after the checkpoint load
    # from the first
    assert min([
        second_loss < first_loss
        for first_loss, second_loss in zip(first_losses, second_losses)
    ])

    # end
    builder_helpers.remove_dirs()
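# A quick standalone check (not part of the test) of the timestep arithmetic
# asserted above: with 2 training steps per tune step, the fresh 5-iteration
# run reports ceil(5 / 2) = 3 tune timesteps, and the resumed 10-iteration
# run reports ceil(10 / 2) = 5.
import math

assert int(math.ceil(5 / 2)) == 3
assert int(math.ceil(10 / 2)) == 5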