def _create_cache(self):
    """Dump every sample of the data source into cache files, then write
    the cache_index.csv (and, for .npy caches, cache_info.csv) bookkeeping
    files into the cache directory."""
    self._cache_positions = []
    self._position = 0

    # Only the representative process reports progress.
    if single_or_rankzero():
        progress(None)
    while self._position < self._data_source._size:
        if single_or_rankzero():
            progress('Create cache',
                     self._position * 1.0 / self._data_source._size)
        self._store_data_to_cache_buffer(self._position)
        self._position += 1
    # Flush whatever is still buffered into a final cache file.
    if self._cache_positions:
        self._save_cache_to_file()
    if single_or_rankzero():
        progress(None)

    # Adjust data size into reseted position. In most case it means
    # multiple of bunch(mini-batch) size: trim the bookkeeping lists to the
    # number of cache files actually produced, and shorten the last file's
    # order list when the source size is not a multiple of the cache size.
    file_count = int(
        numpy.ceil(float(self._data_source._size) / self._cache_size))
    self._cache_file_order = self._cache_file_order[:file_count]
    self._cache_file_data_orders = self._cache_file_data_orders[:file_count]
    remainder = self._data_source._size % self._cache_size
    if remainder != 0:
        tail = self._cache_file_data_orders[file_count - 1]
        self._cache_file_data_orders[file_count - 1] = tail[:remainder]

    # Index file: one "<basename>,<sample count>" row per cache file.
    index_filename = os.path.join(self._cache_dir, "cache_index.csv")
    with open(index_filename, 'w') as f:
        writer = csv.writer(f, lineterminator='\n')
        for fn, orders in zip(self._cache_file_names,
                              self._cache_file_data_orders):
            writer.writerow((os.path.basename(fn), len(orders)))

    # .npy caches additionally need the variable names recorded.
    if self._cache_file_format == ".npy":
        info_filename = os.path.join(self._cache_dir, "cache_info.csv")
        with open(info_filename, 'w') as f:
            writer = csv.writer(f, lineterminator='\n')
            for variable in self._variables:
                writer.writerow((variable, ))
def _create_dataset(uri, batch_size, shuffle, no_image_normalization, cache_dir, overwrite_cache, create_cache_explicitly, prepare_data_iterator):
    """Build a dataset descriptor for ``uri``.

    Returns an ad-hoc object with attributes:
      uri           -- the dataset URI as given.
      normalize     -- True unless ``no_image_normalization`` is set.
      data_iterator -- a zero-argument factory returning a data iterator,
                       or None when ``prepare_data_iterator`` is falsy.
    """
    class Dataset:
        pass
    dataset = Dataset()
    dataset.uri = uri
    dataset.normalize = not no_image_normalization

    comm = current_communicator()

    # use same random state for each process until slice is called
    rng = numpy.random.RandomState(0)
    # Memory cache only helps when a single process owns the data.
    use_memory_cache = comm.size == 1 if comm else True

    if prepare_data_iterator:
        if cache_dir == '':
            cache_dir = None

        # Disable implicit cache creation when MPI is available.
        if cache_dir and (create_cache_explicitly or comm):
            cache_index = os.path.join(cache_dir, "cache_index.csv")
            if not os.path.exists(cache_index) or overwrite_cache:
                if single_or_rankzero():
                    logger.log(99, 'Creating cache data for "' + uri + '"')
                    try:
                        os.makedirs(cache_dir)
                    except OSError:
                        pass  # python2 does not support exists_ok arg
                    # Opening the CSV iterator with cache_dir set appears to
                    # build the on-disk cache as a side effect; the iterator
                    # itself is discarded. TODO confirm against
                    # data_iterator_csv_dataset docs.
                    with data_iterator_csv_dataset(uri, batch_size, shuffle, rng=rng, normalize=False, cache_dir=cache_dir, with_memory_cache=False) as di:
                        pass
            # Re-seed so every rank starts the cache iterator identically.
            rng = numpy.random.RandomState(0)
            dataset.data_iterator = (lambda: data_iterator_cache(
                cache_dir, batch_size, shuffle, rng=rng, normalize=dataset.normalize, with_memory_cache=use_memory_cache))
        elif not cache_dir or overwrite_cache or not os.path.exists(cache_dir) or len(os.listdir(cache_dir)) == 0:
            # No usable cache: fall back to reading the CSV directly,
            # creating the cache implicitly (single-process only).
            if comm:
                logger.critical(
                    'Implicit cache creation does not support with MPI')
                import sys
                sys.exit(-1)
            else:
                if cache_dir:
                    try:
                        os.makedirs(cache_dir)
                    except OSError:
                        pass  # python2 does not support exists_ok arg
                dataset.data_iterator = (lambda: data_iterator_csv_dataset(
                    uri, batch_size, shuffle, rng=rng, normalize=dataset.normalize, cache_dir=cache_dir))
        else:
            # Cache directory already populated: serve straight from it.
            dataset.data_iterator = (lambda: data_iterator_cache(
                cache_dir, batch_size, shuffle, rng=rng, normalize=dataset.normalize, with_memory_cache=use_memory_cache))
    else:
        dataset.data_iterator = None
    return dataset
def train_command(args):
    """Entry point of the ``train`` CLI subcommand.

    Loads the network/optimizer/monitor configuration from ``args.config``,
    instantiates data iterators (shared per dataset, sliced per MPI rank),
    and runs the training loop via ``_train``.  Returns True; failure is
    reported through log messages and callback status instead of the
    return value.
    """
    callback.update_status(args)

    if single_or_rankzero():
        configure_progress(os.path.join(args.outdir, 'progress.txt'))

    info = load.load([args.config], prepare_data_iterator=None,
                     exclude_parameter=True)

    # Check dataset uri is empty.
    dataset_error = False
    for dataset in info.datasets.values():
        if dataset.uri.strip() == '':
            dataset_error = True
    if dataset_error:
        logger.log(99, 'Fatal error. Dataset URI is empty.')
        return False

    class TrainConfig:
        pass
    config = TrainConfig()
    config.timelimit = -1
    if args.param:
        # Load pre-trained parameters before training starts.
        load.load([args.param], parameter_only=True)

    config.timelimit = callback.get_timelimit(args)

    config.global_config = info.global_config
    config.training_config = info.training_config

    if single_or_rankzero():
        logger.log(99, 'Train with contexts {}'.format(available_contexts))

    class OptConfig:
        pass
    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterators = []
        config.optimizers[name] = o

    class MonConfig:
        pass
    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterators = []
        config.monitors[name] = m

    # Training
    comm = current_communicator()
    # Each rank processes 1/size of an epoch.
    config.training_config.iter_per_epoch //= comm.size if comm else 1
    max_iteration = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch

    # Remember where the network definition came from so that saved
    # parameter files can embed/reference it later.
    global _save_parameter_info
    _save_parameter_info = {}
    _, config_ext = os.path.splitext(args.config)
    if config_ext == '.prototxt' or config_ext == '.nntxt':
        _save_parameter_info['config'] = args.config
    elif config_ext == '.nnp':
        with zipfile.ZipFile(args.config, 'r') as nnp:
            for name in nnp.namelist():
                _, ext = os.path.splitext(name)
                if ext == '.nntxt' or ext == '.prototxt':
                    nnp.extract(name, args.outdir)
                    _save_parameter_info['config'] = os.path.join(
                        args.outdir, name)

    result = False
    restart = False
    if max_iteration > 0:
        rng = np.random.RandomState(comm.rank if comm else 0)
        with ExitStack() as stack:
            # Create data_iterator instance only once for each dataset in
            # optimizers.
            optimizer_data_iterators = {}
            for name, o in config.optimizers.items():
                for di in o.optimizer.data_iterators.values():
                    if di not in optimizer_data_iterators:
                        di_instance = stack.enter_context(di())
                        if comm and comm.size > 1:
                            # Give each rank its own shard of the data.
                            di_instance = di_instance.slice(
                                rng, comm.size, comm.rank)
                        optimizer_data_iterators[di] = di_instance
                    else:
                        di_instance = optimizer_data_iterators[di]
                    o.data_iterators.append(di_instance)

            # Create data_iterator instance only once for each dataset in
            # monitors.
            monitor_data_iterators = {}
            for name, m in config.monitors.items():
                for di in m.monitor.data_iterators.values():
                    if di not in monitor_data_iterators:
                        di_instance = stack.enter_context(di())
                        if comm and comm.size > 1:
                            di_instance = di_instance.slice(
                                rng, comm.size, comm.rank)
                        monitor_data_iterators[di] = di_instance
                    else:
                        di_instance = monitor_data_iterators[di]
                    m.data_iterators.append(di_instance)
            monitor_data_iterators.update(optimizer_data_iterators)
            result, restart = _train(args, config)
    else:
        # save parameters without training (0 epoch learning)
        logger.log(99, '0 epoch learning. (Just save parameter.)')
        if single_or_rankzero():
            _save_parameters(args, None, 0, config, True)
        result = True

    if single_or_rankzero() and not restart:
        if result:
            logger.log(99, 'Training Completed.')
            callback.update_status('finished')
        else:
            logger.log(99, 'Training Incompleted.')
            callback.update_status('failed')
    if single_or_rankzero():
        progress(None)
    return True
def _train(args, config):
    """Run the main training loop.

    Returns a ``(result, restart)`` pair: ``result`` is True on normal
    completion; ``restart`` is True when training stopped early because
    of the time-limit callback (parameters were saved for resumption).
    """
    global _save_parameter_info
    comm = current_communicator()
    _CGLOAD_LOG_INTERVAL = 20

    best_epoch = None
    best_error = None
    last_epoch = 0
    if args.resume:
        # Restore the epoch counters / best error from saved parameters.
        last_epoch, best_epoch, best_error = _get_current_parameter(args)
        if best_epoch is not None:
            logger.log(
                99, "Best error {} recorded at epoch {} in previous training.".format(
                    best_error, best_epoch))
            if best_epoch > last_epoch:
                logger.log(
                    99, "Resumed epoch is {} but this training keep this result.".format(
                        last_epoch))
        logger.log(99, "Resume from epoch {}".format(last_epoch + 1))

    callback.update_status(('epoch.max', config.training_config.max_epoch))
    callback.update_status(('epoch.current', last_epoch + 1
                            if last_epoch < config.training_config.max_epoch
                            else config.training_config.max_epoch))

    max_iteration = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch
    if single_or_rankzero():
        logger.log(
            99, 'Training epoch {} of {} begin'.format(
                last_epoch + 1, config.training_config.max_epoch))

    # Mutable accumulator passed through _update each iteration.
    class Cost:
        pass
    cost = Cost()
    cost.sum_epoch = 0.0
    cost.num_iteration = 0
    cost.sum_iteration = 0.0
    cost.variables = None

    class TimeInfo:
        pass
    timeinfo = TimeInfo()
    timeinfo.past_time = 0
    timeinfo.estimate_time = 0
    timeinfo.last_past_time = None

    if max_iteration > 0:
        last_iteration = last_epoch * config.training_config.iter_per_epoch
        if last_iteration < max_iteration:

            timeinfo.start_time = time.time()
            timeinfo.last_epoch_start_time = timeinfo.start_time

            callback.update_status('processing', True, timeinfo.start_time)

            for iteration in range(last_iteration, max_iteration):

                # instant load measurement
                measure_cpu_gpu_instant_load()

                cost = _update(iteration, config, cost)

                if np.isnan(cost.sum_epoch) or np.isinf(cost.sum_epoch):
                    logger.log(99, 'Cost is Nan')
                    return False, False

                timeinfo = _calc_estimate_time(
                    timeinfo, max_iteration, last_iteration, iteration + 1)
                callback.update_time_train(prediction=timeinfo.estimate_time)

                # Abort early if the projected total time exceeds the limit.
                if 0 < config.timelimit < timeinfo.estimate_time:
                    logger.log(
                        99, 'Expected training time ({:.3f}s) will exceed time limit ({}s).'.format(
                            timeinfo.estimate_time, config.timelimit))
                    return False, False

                if (iteration + 1) % config.training_config.iter_per_epoch == 0:
                    # NOTE(review): this binds a *local* that is never read;
                    # possibly `timeinfo.last_past_time = -1` was intended —
                    # confirm against _calc_estimate_time.
                    last_past_time = -1
                    # End of epoch
                    epoch = iteration // config.training_config.iter_per_epoch + 1
                    cost_avg_epoch = cost.sum_epoch / cost.num_iteration if cost.num_iteration else 0
                    cost.sum_epoch = 0.0
                    cost.num_iteration = 0
                    monitoring_report = []

                    # Evaluation
                    error_str = ''
                    if epoch % config.training_config.monitor_interval == 0 or epoch <= 5:
                        best_error, error_str = _evaluate(
                            args, config, monitoring_report, best_error, epoch)

                    # Cpu/Gpu average load
                    cg_load_str = ''
                    cgload_log = ''
                    cg_load = get_cpu_gpu_average_load()
                    if cg_load:
                        cg_load_str = 'epoch {} average_load_matrix: {}'.format(
                            epoch, cg_load)
                        span = _calc_epoch_span(timeinfo)
                        if span > _CGLOAD_LOG_INTERVAL:
                            cgload_log = _format_cgload_log(cg_load)

                    if single_or_rankzero():
                        # Write to monitoring_report.yml
                        f = open(
                            os.path.join(args.outdir, 'monitoring_report.yml'), 'a')
                        f.write('{}:\n'.format(epoch - 1))
                        f.write(' cost: {}\n'.format(cost_avg_epoch))
                        for s in monitoring_report:
                            f.write(s)
                        f.close()

                        callback.update_status(
                            (['monitoring_report', epoch, 'cost'], cost_avg_epoch))

                        _save_parameters(args, 'current', epoch, config)

                        callback.update_status(('epoch.current', epoch))
                        callback.update_status()

                        logger.log(
                            99, 'epoch {} of {} cost={:.6f} {} time=({:.1f}s /{:.1f}s) {}'.format(
                                epoch, config.training_config.max_epoch,
                                cost_avg_epoch, error_str,
                                timeinfo.past_time, timeinfo.estimate_time,
                                cgload_log))

                        if cg_load_str:
                            # cpu_gpu_average_load record at epoch level
                            callback.update_status(
                                (['cpu_gpu_epoch_load', epoch], cg_load))
                            progress(cg_load_str, 1)

                    # Time-limit callback: save state and request restart.
                    if not callback.check_training_time(
                            args, config, timeinfo, epoch, last_epoch):
                        _save_parameters(args, 'current', epoch, config, True)
                        return False, True

    if single_or_rankzero():
        _save_parameters(args, 'current', epoch, config, True)
    return True, False
def _evaluate(args, config, monitoring_report, best_error, epoch):
    """Run every monitor over its dataset and collect errors.

    Appends one YAML line per monitor to ``monitoring_report`` and, on the
    representative rank, saves the 'best' parameter snapshot when
    ``valid_error`` improves.  Returns ``(best_error, error_str)`` where
    ``error_str`` is a human-readable summary like " {valid_error=0.1}".
    """
    comm = current_communicator()
    error_str = ''
    valid_error = 0.0

    def _sum_error(sum, error):
        # All-reduce the per-rank error so every rank sees the global sum.
        ret = None
        if comm:
            # logger.log(99, "Calc error with communicator")
            var = [nn.NdArray()]
            var[0].data = error
            _all_reduce(comm, var, division=False, inplace=True)
            ret = sum + var[0].data
        else:
            ret = sum + error
        return ret

    for name, mon in config.monitors.items():
        m = mon.monitor
        error_sum_monitor = 0.0
        error_count = 0
        data_size = max([di.size for di in mon.data_iterators])
        batch_size = max([di.batch_size for di in mon.data_iterators])

        for i in range(data_size // batch_size):
            # Load dataset
            data = OrderedDict()
            for di in mon.data_iterators:
                data.update(zip(di.variables, di.next()))

            # Set data to variable
            for v, d in m.dataset_assign.items():
                dest_context = config.global_config.default_context if not m.forward_sequence or v not in m.forward_sequence[
                    0].inputs else None
                let_data_to_variable(v.variable_instance, data[d],
                                     ctx=dest_context,
                                     data_name=d, variable_name=v.name)

            # Generate data
            for v, generator in m.generator_assign.items():
                dest_context = config.global_config.default_context if not m.forward_sequence or v not in m.forward_sequence[
                    0].inputs else None
                let_data_to_variable(v.variable_instance,
                                     data=generator(v.shape),
                                     ctx=dest_context,
                                     variable_name=v.name)

            # Sum error before forward to prepare input data while processing
            # on GPU
            if error_count > 0:
                error_sum = 0.0
                for v in m.monitor_variables:
                    error_sum += np.mean(v.variable_instance.d)
                    # v.variable_instance.data.zero()
                error_sum_monitor = _sum_error(error_sum_monitor, error_sum)
                if single_or_rankzero():
                    progress('Evaluating "{0}"'.format(
                        name) + ' : error={0:0.6f}'.format(
                        error_sum_monitor / error_count),
                        di.position * 1.0 / di.size)
            error_count += comm.size if comm else 1

            # Forward recursive
            m.network.forward(m.forward_sequence)

        # Sum error at the end of dataset
        error_sum = 0.0
        for v in m.monitor_variables:
            error_sum += np.mean(v.variable_instance.d)
            # v.variable_instance.data.zero()
        error_sum_monitor = _sum_error(error_sum_monitor, error_sum)

        if error_count == 0:
            error = 0
        else:
            error = error_sum_monitor / error_count

        if np.isnan(error) or np.isinf(error):
            logger.log(99, 'Validation error is Nan')
            error = 0.0

        monitoring_report.append(' {}: {}\n'.format(name, error))

        callback.update_status((['monitoring_report', epoch, name], error))
        callback.update_status((['last', name], error))  # save last value

        if error_str != '':
            error_str += ', '
        else:
            error_str = ' {'
        error_str += '{}={:.6f}'.format(name, error)
        if name == 'valid_error':
            valid_error = error

    if error_str != '':
        error_str += '}'

    # Save Parameters
    if single_or_rankzero():
        # NOTE(review): `not best_error` is also true when best_error == 0.0,
        # so a perfect best score can be overwritten — confirm intended.
        if (not config.training_config.save_best) or \
           (not best_error) or \
           (best_error is not None and valid_error <= best_error):
            best_error = valid_error
            callback.update_status(('best.valid_error', best_error))
            callback.update_status(('best.epoch', epoch))
            _save_parameters(args, 'best', epoch, config, True)

    return best_error, error_str
def _update(iter, config, cost):
    """Execute one training iteration for every active optimizer.

    Feeds data (or reserved %-variables), runs forward/backward, applies
    weight decay / learning-rate schedule / solver update, and accumulates
    the loss into ``cost``.  Returns the updated ``cost`` object.
    """
    comm = current_communicator()

    loaded_data = {}
    is_first_optimizer = True

    def _sum_cost():
        # Fold the per-iteration loss into the epoch totals, all-reducing
        # across ranks when a communicator is present.
        if comm:
            # logger.log(99, "Calc cost with communicator")
            var = [nn.NdArray()]
            var[0].data = cost.sum_iteration
            _all_reduce(comm, var, division=False, inplace=True)
            cost.sum_epoch += var[0].data
            cost.num_iteration += comm.size
        else:
            cost.sum_epoch += cost.sum_iteration
            cost.num_iteration += 1

    def _get_reserved_variable(shape, reserved_variable_name, iter, iter_per_epoch, max_epoch):
        # Map a "%name" dataset placeholder to its scalar value.
        if reserved_variable_name == "%iter":
            value = iter
        elif reserved_variable_name == "%max_iter":
            value = max_epoch * iter_per_epoch
        elif reserved_variable_name == "%epoch":
            value = iter // iter_per_epoch
        elif reserved_variable_name == "%epochf":
            value = iter * 1.0 / iter_per_epoch
        elif reserved_variable_name == "%max_epoch":
            value = max_epoch
        elif reserved_variable_name == "%progress":
            value = (iter * 1.0 / iter_per_epoch) / max_epoch
        else:
            raise ValueError(
                "Unknown reserved variable {}".format(reserved_variable_name))
        return value

    for opt in config.optimizers.values():
        o = opt.optimizer
        # Skip optimizers outside their [start_iter, end_iter] window
        # (0 means unbounded on that side).
        if (o.start_iter == 0 or iter + 1 >= o.start_iter) and (o.end_iter == 0 or iter + 1 <= o.end_iter):
            # Load dataset
            data = OrderedDict()
            for di in opt.data_iterators:
                # Fetch each iterator at most once per call, even when it is
                # shared by several optimizers.
                if di not in loaded_data:
                    loaded_data[di] = di.next()
                data.update(zip(di.variables, loaded_data[di]))
            for v, d in o.dataset_assign.items():
                dest_context = config.global_config.default_context if not o.forward_sequence or v not in o.forward_sequence[
                    0].inputs else None
                if d not in data and d[0] == "%":
                    value = _get_reserved_variable(
                        v.variable_instance.shape, d, iter,
                        config.training_config.iter_per_epoch,
                        config.training_config.max_epoch)
                    v.variable_instance.data.fill(value)
                elif d in data:
                    let_data_to_variable(v.variable_instance, data[d],
                                         ctx=dest_context,
                                         data_name=d, variable_name=v.name)
                else:
                    raise ValueError(
                        'Variable "{}" is not found in dataset "{}", optimizer "{}"'.format(
                            d, ', '.join(o.data_iterators.keys()), o.name))

            # Generate data
            for v, generator in o.generator_assign.items():
                dest_context = config.global_config.default_context if not o.forward_sequence or v not in o.forward_sequence[
                    0].inputs else None
                let_data_to_variable(v.variable_instance,
                                     data=generator(v.shape),
                                     ctx=dest_context,
                                     variable_name=v.name)

            # Monitor loss before forward to prepare input data while processing on
            # GPU
            if cost.variables:
                for l in cost.variables:
                    cost.sum_iteration += np.mean(l.variable_instance.d)
                    # l.variable_instance.data.zero()
                if is_first_optimizer:
                    is_first_optimizer = False
                    _sum_cost()
                    if single_or_rankzero():
                        progress(
                            "Training : cost={0:0.6f}".format(
                                cost.sum_iteration),
                            (iter % config.training_config.iter_per_epoch) * 1.0 / config.training_config.iter_per_epoch)
                    cost.sum_iteration = 0.0

            with nodeTimeCollector.collect_cost_time(comm, iter):
                # Forward
                o.network.forward(o.forward_sequence)

                # Backward
                o.network.backward(o.backward_sequence, iter % o.update_interval == 0)

            # Update
            if iter % o.update_interval == o.update_interval - 1:
                if o.weight_decay > 0:
                    o.solver.weight_decay(o.weight_decay)

                if o.comm:  # Updated param with communicator
                    params = [x.grad for x in o.parameters.values()]
                    _all_reduce(o.comm, params, division=True, inplace=True)

                if o.scheduler is not None:
                    o.solver.set_learning_rate(
                        o.scheduler.get_learning_rate(iter))
                o.solver.update()

            # Sync w sometimes
            if iter % 10 == 9:  # TODO: change the interval
                if o.comm:
                    params = [x.data for x in o.parameters.values()]
                    _all_reduce(o.comm, params, division=True, inplace=True)

            # Reserve monitor loss
            cost.variables = o.loss_variables

    # Monitor loss at the end of epoch
    if iter % config.training_config.iter_per_epoch == config.training_config.iter_per_epoch - 1 and cost.variables:
        for l in cost.variables:
            cost.sum_iteration += np.mean(l.variable_instance.d)
            # l.variable_instance.data.zero()
        _sum_cost()
        cost.variables = None
        cost.sum_iteration = 0.0

    return cost
def train_command(args):
    """Entry point of the ``train`` CLI subcommand (older variant: one data
    iterator per optimizer/monitor, no status callbacks).

    Loads configuration from ``args.config``, builds the per-optimizer and
    per-monitor data iterators (sliced per MPI rank), and runs ``_train``.
    Always returns True; failures are reported via log messages.
    """
    if single_or_rankzero():
        configure_progress(os.path.join(args.outdir, 'progress.txt'))

    info = load.load([args.config], exclude_parameter=True)

    # Check dataset uri is empty.
    dataset_error = False
    for dataset in info.datasets.values():
        if dataset.uri.strip() == '':
            dataset_error = True
    if dataset_error:
        logger.log(99, 'Fatal error. Dataset URI is empty.')
        return False

    class TrainConfig:
        pass
    config = TrainConfig()
    config.timelimit = -1
    if args.param:
        # Load pre-trained parameters before training starts.
        load.load([args.param], parameter_only=True)

    config.global_config = info.global_config
    config.training_config = info.training_config

    if single_or_rankzero():
        logger.log(99, 'Train with contexts {}'.format(available_contexts))

    class OptConfig:
        pass
    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterator = None
        config.optimizers[name] = o

    class MonConfig:
        pass
    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterator = None
        config.monitors[name] = m

    # Training
    comm = current_communicator()
    # Each rank processes 1/size of an epoch.
    config.training_config.iter_per_epoch //= comm.size if comm else 1
    max_iteration = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch

    # Remember where the network definition came from so that saved
    # parameter files can embed/reference it later.
    global _save_parameter_info
    _save_parameter_info = {}
    _, config_ext = os.path.splitext(args.config)
    if config_ext == '.prototxt' or config_ext == '.nntxt':
        _save_parameter_info['config'] = args.config
    elif config_ext == '.nnp':
        with zipfile.ZipFile(args.config, 'r') as nnp:
            for name in nnp.namelist():
                _, ext = os.path.splitext(name)
                if ext == '.nntxt' or ext == '.prototxt':
                    nnp.extract(name, args.outdir)
                    _save_parameter_info['config'] = os.path.join(
                        args.outdir, name)

    result = False
    if max_iteration > 0:
        # NOTE(review): `data_iterators` is never read afterwards — appears
        # to be dead; confirm before removing.
        data_iterators = {'optimizer': {}, 'monitor': {}}
        rng = np.random.RandomState(comm.rank if comm else 0)
        with ExitStack() as stack:
            for name, o in config.optimizers.items():
                o.data_iterator = stack.enter_context(
                    o.optimizer.data_iterator())
                if comm and comm.size > 1:
                    # Give each rank its own shard of the data.
                    o.data_iterator = o.data_iterator.slice(
                        rng, comm.size, comm.rank)
            for name, m in config.monitors.items():
                m.data_iterator = stack.enter_context(
                    m.monitor.data_iterator())
                if comm and comm.size > 1:
                    m.data_iterator = m.data_iterator.slice(
                        rng, comm.size, comm.rank)
            result = _train(args, config)
    else:
        # save parameters without training (0 epoch learning)
        logger.log(99, '0 epoch learning. (Just save parameter.)')
        if single_or_rankzero():
            _save_parameters(args, 'current', 0, True)
        result = True

    if single_or_rankzero():
        if result:
            logger.log(99, 'Training Completed.')
        else:
            logger.log(99, 'Training Incompleted.')
    if single_or_rankzero():
        progress(None)
    return True
def _train(args, config):
    """Run the main training loop (older variant).

    Returns True on normal completion, False when the projected training
    time exceeds ``config.timelimit``.

    Fixes vs. the previous revision:
    * guard the per-epoch cost average against ``cost.num_iteration == 0``
      (ZeroDivisionError), matching the newer ``_train`` variant;
    * initialize ``timeinfo.past_time`` / ``timeinfo.estimate_time`` so the
      epoch-end log cannot hit an AttributeError before
      ``_calc_estimate_time`` has run (e.g. when iter_per_epoch == 1);
    * write monitoring_report.yml through a ``with`` block so the file is
      closed even if a write fails.
    """
    global _save_parameter_info
    comm = current_communicator()

    last_epoch = 0
    if args.resume:
        # Restore the epoch counter from saved parameters.
        last_epoch = _get_current_parameter(args)
        logger.log(99, "Resume from epoch {}".format(last_epoch + 1))

    max_iteration = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch
    if single_or_rankzero():
        logger.log(
            99, 'Training epoch {} of {} begin'.format(
                last_epoch + 1, config.training_config.max_epoch))

    # Mutable accumulator passed through _update each iteration.
    class Cost:
        pass
    cost = Cost()
    cost.sum_epoch = 0.0
    cost.num_iteration = 0
    cost.sum_iteration = 0.0
    cost.variables = None
    best_error = None

    class TimeInfo:
        pass
    timeinfo = TimeInfo()
    # Defaults so the epoch-end log is safe before the first
    # _calc_estimate_time call (consistent with the newer _train variant).
    timeinfo.past_time = 0
    timeinfo.estimate_time = 0
    timeinfo.last_past_time = None

    if max_iteration > 0:
        last_iteration = last_epoch * config.training_config.iter_per_epoch
        if last_iteration < max_iteration:

            timeinfo.start_time = time.time()

            for iteration in range(last_iteration, max_iteration):
                cost = _update(iteration, config, cost)

                if (iteration - last_iteration) > 0:
                    timeinfo = _calc_estimate_time(
                        timeinfo, max_iteration, last_iteration, iteration)
                    # Abort when the projected total time exceeds the limit.
                    if config.timelimit > 0 and timeinfo.estimate_time > config.timelimit:
                        logger.log(
                            99, 'Expected training time ({:.3f}s) will exceed time limit ({}s).'.format(
                                timeinfo.estimate_time, config.timelimit))
                        return False

                if (iteration + 1) % config.training_config.iter_per_epoch == 0:
                    # NOTE(review): local never read; possibly
                    # `timeinfo.last_past_time = -1` was intended — confirm.
                    last_past_time = -1
                    # End of epoch
                    epoch = iteration // config.training_config.iter_per_epoch + 1
                    # Guarded division: _update may not have accumulated any
                    # iteration (e.g. no active optimizer this epoch).
                    cost_avg_epoch = cost.sum_epoch / cost.num_iteration if cost.num_iteration else 0
                    cost.sum_epoch = 0.0
                    cost.num_iteration = 0
                    monitoring_report = []

                    # Evaluation
                    error_str = ''
                    if epoch % config.training_config.monitor_interval == 0 or epoch <= 5:
                        best_error, error_str = _evaluate(
                            args, config, monitoring_report, best_error, epoch)

                    if single_or_rankzero():
                        # Write to monitoring_report.yml
                        with open(os.path.join(args.outdir,
                                               'monitoring_report.yml'), 'a') as f:
                            f.write('{}:\n'.format(epoch - 1))
                            f.write(' cost: {}\n'.format(cost_avg_epoch))
                            for s in monitoring_report:
                                f.write(s)

                        _save_parameters(args, 'current', epoch)

                        logger.log(
                            99, 'epoch {} of {} cost={:.6f} {} time=({:.1f}s /{:.1f}s)'.format(
                                epoch, config.training_config.max_epoch,
                                cost_avg_epoch, error_str,
                                timeinfo.past_time, timeinfo.estimate_time))

    if single_or_rankzero():
        _save_parameters(args, 'current', epoch, True)
    return True
def _update(iter, config, cost):
    """Execute one training iteration for every optimizer (older variant:
    one iterator per optimizer, positional data lookup, no start/end-iter
    window).  Returns the updated ``cost`` accumulator.
    """
    comm = current_communicator()

    loaded_data = {}
    is_first_optimizer = True

    def _sum_cost():
        # Fold the per-iteration loss into the epoch totals, all-reducing
        # across ranks when a communicator is present.
        if comm:
            # logger.log(99, "Calc cost with communicator")
            var = [nn.NdArray()]
            var[0].data = cost.sum_iteration
            _all_reduce(comm, var, division=False, inplace=True)
            cost.sum_epoch += var[0].data
            cost.num_iteration += comm.size
        else:
            cost.sum_epoch += cost.sum_iteration
            cost.num_iteration += 1

    for opt in config.optimizers.values():
        o = opt.optimizer

        # Load dataset; fetch each underlying iterator at most once per
        # call, even when it is shared by several optimizers.
        di = opt.data_iterator
        if o.data_iterator not in loaded_data:
            loaded_data[o.data_iterator] = di.next()
        data = loaded_data[o.data_iterator]
        for v, d in o.dataset_assign.items():
            dest_context = config.global_config.default_context if not o.forward_sequence or v not in o.forward_sequence[
                0].inputs else None
            let_data_to_variable(v.variable_instance,
                                 data[di.variables.index(d)],
                                 ctx=dest_context,
                                 data_name=d, variable_name=v.name)

        # Generate data
        for v, generator in o.generator_assign.items():
            dest_context = config.global_config.default_context if not o.forward_sequence or v not in o.forward_sequence[
                0].inputs else None
            let_data_to_variable(v.variable_instance,
                                 data=generator(v.shape),
                                 ctx=dest_context,
                                 variable_name=v.name)

        # Monitor loss before forward to prepare input data while processing on
        # GPU
        if cost.variables:
            for l in cost.variables:
                cost.sum_iteration += np.mean(l.variable_instance.d)
                l.variable_instance.data.zero()
            if is_first_optimizer:
                is_first_optimizer = False
                _sum_cost()
                if single_or_rankzero():
                    progress(
                        "Training : cost={0:0.6f}".format(cost.sum_iteration),
                        (iter % config.training_config.iter_per_epoch) * 1.0 / config.training_config.iter_per_epoch)
                cost.sum_iteration = 0.0

        # Forward
        o.network.forward(o.forward_sequence)

        # Backward
        o.network.backward(o.backward_sequence, iter % o.update_interval == 0)

        # Update
        if iter % o.update_interval == o.update_interval - 1:
            if o.weight_decay > 0:
                o.solver.weight_decay(o.weight_decay)

            if o.comm:  # Updated param with communicator
                params = [x.grad for x in o.parameters.values()]
                _all_reduce(o.comm, params, division=True, inplace=True)

            if o.scheduler is not None:
                o.solver.set_learning_rate(o.scheduler.get_learning_rate(iter))
            o.solver.update()

        # Sync w sometimes
        if iter % 10 == 9:  # TODO: change the interval
            if o.comm:
                params = [x.data for x in o.parameters.values()]
                _all_reduce(o.comm, params, division=True, inplace=True)

        # Reserve monitor loss
        cost.variables = o.loss_variables

    # Monitor loss at the end of iteration
    if iter % config.training_config.iter_per_epoch == config.training_config.iter_per_epoch - 1 and cost.variables:
        for l in cost.variables:
            cost.sum_iteration += np.mean(l.variable_instance.d)
            l.variable_instance.data.zero()
        _sum_cost()
        cost.variables = None
        cost.sum_iteration = 0.0

    return cost
def _create_cache(self):
    """Dump every sample of the data source into cache file(s) and write the
    bookkeeping files (cache_index.csv, cache_info.csv for .npy caches,
    plus copies of the original CSV and sample order) into the cache dir.
    """
    # Save all data into cache file(s).
    self._cache_positions = []
    self._position = 0

    percent = 0

    # Only the representative process reports progress.
    if single_or_rankzero():
        progress(None)
    while self._position < self._data_source._size:
        if single_or_rankzero():
            progress('Create cache',
                     self._position * 1.0 / self._data_source._size)
        self._store_data_to_cache_buffer(self._position)
        self._position += 1
    # Flush whatever is still buffered into a final cache file.
    if len(self._cache_positions) > 0:
        self._save_cache_to_file()
    if single_or_rankzero():
        progress(None)

    # Adjust data size into reseted position. In most case it means
    # multiple of bunch(mini-batch) size.
    num_of_cache_files = int(
        numpy.ceil(float(self._data_source._size) / self._cache_size))
    self._cache_file_order = self._cache_file_order[0:num_of_cache_files]
    self._cache_file_data_orders = self._cache_file_data_orders[
        0:num_of_cache_files]
    # Last file holds only the remainder when size is not a multiple of
    # the cache size.
    if self._data_source._size % self._cache_size != 0:
        self._cache_file_data_orders[num_of_cache_files - 1] = self._cache_file_data_orders[
            num_of_cache_files - 1][0:self._data_source._size % self._cache_size]

    # Create Index: one "<basename>,<sample count>" row per cache file.
    index_filename = os.path.join(self._cache_dir, "cache_index.csv")
    with open(index_filename, 'w') as f:
        writer = csv.writer(f, lineterminator='\n')
        for fn, orders in zip(self._cache_file_names,
                              self._cache_file_data_orders):
            writer.writerow((os.path.basename(fn), len(orders)))

    # Create Info: .npy caches additionally record the variable names.
    if self._cache_file_format == ".npy":
        info_filename = os.path.join(self._cache_dir, "cache_info.csv")
        with open(info_filename, 'w') as f:
            writer = csv.writer(f, lineterminator='\n')
            for variable in self._variables:
                writer.writerow((variable, ))

    # Create original.csv: keep a copy of the source CSV beside the cache.
    if self._data_source._original_source_uri is not None:
        fr = FileReader(self._data_source._original_source_uri)
        with fr.open() as f:
            csv_lines = [x.decode('utf-8') for x in f.readlines()]
            with open(os.path.join(self._cache_dir, "original.csv"), 'w') as o:
                for l in csv_lines:
                    o.write(l)

    # Create order.csv: mapping between original and shuffled sample order.
    if self._data_source._order is not None and \
            self._data_source._original_order is not None:
        with open(os.path.join(self._cache_dir, "order.csv"), 'w') as o:
            writer = csv.writer(o, lineterminator='\n')
            for orders in zip(self._data_source._original_order,
                              self._data_source._order):
                writer.writerow(list(orders))
def _save_cache(self, args):
    """Convert one chunk of CSV rows into a single cache file.

    args -- ``(position, cache_csv)``: ``position`` is the source index of
            the chunk's last row; ``cache_csv`` is the list of raw rows.
    Returns ``(cache_filename, number_of_rows_saved)``.

    Fixes vs. the previous revision: the bare ``except:`` is narrowed to
    ``except Exception:`` (the diagnostic-and-reraise block no longer
    intercepts KeyboardInterrupt/SystemExit), the unused ``enumerate``
    index is dropped, and the HDF5 file is managed with a ``with`` block
    so it is closed even when a ``create_dataset`` call fails.
    """
    position = args[0]
    cache_csv = args[1]

    # conv dataset
    cache_data = [tuple(self._process_row(row)) for row in cache_csv]

    start_position = position + 1 - len(cache_data)
    end_position = position
    cache_filename = os.path.join(
        self._cache_dir, '{}_{:08d}_{:08d}{}'.format(self._cache_file_name_prefix,
                                                     start_position,
                                                     end_position,
                                                     self._cache_file_format))

    logger.info('Creating cache file {}'.format(cache_filename))

    # Regroup column-wise: one list of arrays per variable name.
    data = collections.OrderedDict(
        [(n, []) for n in self._variables])
    for cd in cache_data:
        for i, n in enumerate(self._variables):
            if isinstance(cd[i], numpy.ndarray):
                d = cd[i]
            else:
                d = numpy.array(cd[i]).astype(numpy.float32)
            data[n].append(d)

    try:
        if self._cache_file_format == ".h5":
            with h5py.File(cache_filename, 'w') as h5:
                for k, v in data.items():
                    h5.create_dataset(k, data=v)
        else:
            # .npy writes occasionally fail transiently (e.g. on network
            # filesystems); retry up to 10 times before giving up.
            retry_count = 1
            is_create_cache_incomplete = True
            while is_create_cache_incomplete:
                try:
                    with open(cache_filename, 'wb') as f:
                        for v in data.values():
                            numpy.save(f, v)
                    is_create_cache_incomplete = False
                except OSError:
                    retry_count += 1
                    if retry_count > 10:
                        raise
                    logger.info(
                        'Creating cache retry {}/10'.format(retry_count))
    except Exception:
        logger.critical(
            'An error occurred while creating cache file from dataset.')
        # Report any ragged variable (a common cause) before re-raising
        # the original error.
        for k, v in data.items():
            size = v[0].shape
            for d in v:
                if size != d.shape:
                    logger.critical('The sizes of data "{}" are not the same. ({} != {})'.format(
                        k, size, d.shape))
        raise

    self.current_cache_position += 1
    if single_or_rankzero():
        # Throttle progress output to roughly 20 updates per run.
        if self.current_cache_position % int(self.num_of_cache_file/20+1) == 0:
            progress('Create cache',
                     self.current_cache_position / self.num_of_cache_file)
    return cache_filename, len(cache_data)
def create(self, output_cache_dirname, normalize=True, cache_file_name_prefix='cache'):
    """Create the full cache for this CSV data source in
    ``output_cache_dirname``.

    Splits the (ordered) rows into chunks of ``self._cache_size``, converts
    the chunks to cache files in parallel via ``_save_cache``, then writes
    cache_index.csv, cache_info.csv (.npy format only), original.csv and
    order.csv.
    """
    self._normalize = normalize
    self._cache_file_name_prefix = cache_file_name_prefix

    self._cache_dir = output_cache_dirname

    self._cache_file_format = nnabla_config.get(
        'DATA_ITERATOR', 'cache_file_format')
    logger.info('Cache file format is {}'.format(self._cache_file_format))

    progress(None)

    # Chunk the rows: each entry is (index of chunk's last row, rows).
    csv_position_and_data = []
    csv_row = []
    for _position in range(self._size):
        csv_row.append(self._csv_data[self._order[_position]])
        if len(csv_row) == self._cache_size:
            csv_position_and_data.append((_position, csv_row))
            csv_row = []
    # Trailing partial chunk.
    if len(csv_row):
        csv_position_and_data.append((self._size-1, csv_row))

    self.num_of_cache_file = len(csv_position_and_data)
    self.current_cache_position = 0
    if single_or_rankzero():
        progress('Create cache', 0)
    # Convert chunks to cache files in parallel; each worker returns
    # (file_path, data_nums) for the index file.
    with closing(ThreadPool(processes=self._num_of_threads)) as pool:
        cache_index_rows = pool.map(
            self._save_cache, csv_position_and_data)
    if single_or_rankzero():
        progress('Create cache', 1.0)

    # Create Index
    index_filename = os.path.join(output_cache_dirname, "cache_index.csv")
    with open(index_filename, 'w') as f:
        writer = csv.writer(f, lineterminator='\n')
        for row in cache_index_rows:
            if row:
                # row: (file_path, data_nums)
                writer.writerow((os.path.basename(row[0]), row[1]))

    # Create Info: .npy caches additionally record the variable names.
    if self._cache_file_format == ".npy":
        info_filename = os.path.join(
            output_cache_dirname, "cache_info.csv")
        with open(info_filename, 'w') as f:
            writer = csv.writer(f, lineterminator='\n')
            for variable in self._variables:
                writer.writerow((variable, ))

    # Create original.csv: keep a copy of the source CSV beside the cache.
    if self._original_source_uri is not None:
        shutil.copy(self._original_source_uri, os.path.join(
            output_cache_dirname, "original.csv"))

    # Create order.csv: mapping between original and shuffled sample order.
    if self._order is not None and \
            self._original_order is not None:
        with open(os.path.join(output_cache_dirname, "order.csv"), 'w') as o:
            writer = csv.writer(o, lineterminator='\n')
            for orders in zip(self._original_order, self._order):
                writer.writerow(list(orders))
def train(args):
    """
    Multi-Device Training

    NOTE: the communicator exposes low-level interfaces

    Steps:
    * Instantiate a communicator and set parameter variables.
    * Specify contexts for computation.
    * Initialize DataIterator.
    * Construct a computation graph for training and one for validation.
    * Initialize solver and set parameter variables to that.
    * Load checkpoint to resume previous training.
    * Create monitor instances for saving and displaying training stats.
    * Training loop
      * Computate error rate for validation data (periodically)
      * Get a next minibatch.
      * Execute forwardprop
      * Set parameter gradients zero
      * Execute backprop.
      * AllReduce for gradients
      * Solver updates parameters by using gradients computed by
        backprop and all reduce.
      * Compute training error

    Raises:
        ValueError: if ``args.net`` names an unsupported architecture.
    """
    # Create Communicator and Context
    comm = create_communicator(ignore_error=True)
    if comm:
        n_devices = comm.size
        mpi_rank = comm.rank
        device_id = comm.local_rank
    else:
        n_devices = 1
        mpi_rank = 0
        device_id = args.device_id
    if args.context == 'cpu':
        import nnabla_ext.cpu
        context = nnabla_ext.cpu.context()
    else:
        import nnabla_ext.cudnn
        context = nnabla_ext.cudnn.context(device_id=device_id)
    nn.set_default_context(context)

    # CIFAR-10/100 dataset sizes (train / validation).
    n_train_samples = 50000
    n_valid_samples = 10000
    bs_valid = args.batch_size
    iter_per_epoch = int(n_train_samples / args.batch_size / n_devices)

    # Model
    rng = np.random.RandomState(313)
    comm_syncbn = comm if args.sync_bn else None
    if args.net == "cifar10_resnet23":
        prediction = functools.partial(resnet23_prediction,
                                       rng=rng,
                                       ncls=10,
                                       nmaps=64,
                                       act=F.relu,
                                       comm=comm_syncbn)
        data_iterator = data_iterator_cifar10
    elif args.net == "cifar100_resnet23":
        prediction = functools.partial(resnet23_prediction,
                                       rng=rng,
                                       ncls=100,
                                       nmaps=384,
                                       act=F.elu,
                                       comm=comm_syncbn)
        data_iterator = data_iterator_cifar100
    else:
        # Fail fast with a clear message; previously an unknown net name
        # left `prediction`/`data_iterator` unbound and crashed later
        # with a confusing NameError.
        raise ValueError('Unknown network architecture: {}'.format(args.net))

    # Create training graphs
    image_train = nn.Variable((args.batch_size, 3, 32, 32))
    label_train = nn.Variable((args.batch_size, 1))
    pred_train = prediction(image_train, test=False)
    pred_train.persistent = True
    # Divide by n_devices so that the all-reduced gradient is an average.
    loss_train = (loss_function(pred_train, label_train) /
                  n_devices).apply(persistent=True)
    error_train = F.mean(F.top_n_error(pred_train, label_train,
                                       axis=1)).apply(persistent=True)
    loss_error_train = F.sink(loss_train, error_train)

    # Create validation graphs
    image_valid = nn.Variable((bs_valid, 3, 32, 32))
    label_valid = nn.Variable((bs_valid, 1))
    pred_valid = prediction(image_valid, test=True)
    error_valid = F.mean(F.top_n_error(pred_valid, label_valid, axis=1))

    # Solvers
    solver = S.Adam()
    solver.set_parameters(nn.get_parameters())
    base_lr = args.learning_rate
    warmup_iter = iter_per_epoch * args.warmup_epoch
    # Linear warmup ramps the lr from base_lr to base_lr * n_devices over
    # warmup_iter iterations. Guard the slope against division by zero
    # when warmup is disabled (warmup_epoch == 0).
    warmup_slope = (base_lr * (n_devices - 1) / warmup_iter
                    if warmup_iter > 0 else 0.0)
    solver.set_learning_rate(base_lr)

    # load checkpoint if file exist.
    start_point = 0
    if args.use_latest_checkpoint:
        files = glob.glob(f'{args.model_save_path}/checkpoint_*.json')
        if files:
            # Pick the checkpoint with the highest iteration number.
            index = max(
                int(re.sub(r'.*checkpoint_(\d+).json', '\\1', f))
                for f in files)
            # load weights and solver state info from specified checkpoint file.
            start_point = load_checkpoint(
                f'{args.model_save_path}/checkpoint_{index}.json', solver)
            print(f'checkpoint is loaded. start iteration from {start_point}')

    # Create monitor
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = MonitorSeries("Training error", monitor, interval=10)
    monitor_time = MonitorTimeElapsed("Training time", monitor, interval=10)
    monitor_verr = MonitorSeries("Validation error", monitor, interval=1)
    monitor_vtime = MonitorTimeElapsed("Validation time", monitor, interval=1)

    # Data Iterator
    # If the data does not exist, it will try to download it from the server
    # and prepare it. When executing multiple processes on the same host, it is
    # necessary to execute initial data preparation by the representative
    # process (rank is 0) on the host.

    # Download dataset by rank-0 process
    if single_or_rankzero():
        rng = np.random.RandomState(mpi_rank)
        _, tdata = data_iterator(args.batch_size, True, rng)
        vsource, vdata = data_iterator(bs_valid, False)

    # Wait for data to be prepared without watchdog
    if comm:
        comm.barrier()

    # Prepare dataset for remaining process
    if not single_or_rankzero():
        rng = np.random.RandomState(mpi_rank)
        _, tdata = data_iterator(args.batch_size, True, rng)
        vsource, vdata = data_iterator(bs_valid, False)

    # Training-loop
    ve = nn.Variable()
    for i in range(start_point // n_devices, args.epochs * iter_per_epoch):
        # Validation
        if i % iter_per_epoch == 0:
            ve_local = 0.
            k = 0
            idx = np.random.permutation(n_valid_samples)
            val_images = vsource.images[idx]
            val_labels = vsource.labels[idx]
            # Each rank validates its own shard of the permuted set.
            for j in range(int(n_valid_samples / n_devices * mpi_rank),
                           int(n_valid_samples / n_devices * (mpi_rank + 1)),
                           bs_valid):
                image = val_images[j:j + bs_valid]
                label = val_labels[j:j + bs_valid]
                if len(image) != bs_valid:  # note that smaller batch is ignored
                    continue
                image_valid.d = image
                label_valid.d = label
                error_valid.forward(clear_buffer=True)
                ve_local += error_valid.d.copy()
                k += 1
            ve_local /= k
            ve.d = ve_local
            if comm:
                comm.all_reduce(ve.data, division=True, inplace=True)

            # Monitoring error and elapsed time
            if single_or_rankzero():
                monitor_verr.add(i * n_devices, ve.d.copy())
                monitor_vtime.add(i * n_devices)

        # Save model
        if single_or_rankzero():
            if i % (args.model_save_interval // n_devices) == 0:
                cur_iter = i * n_devices  # avoid shadowing builtin `iter`
                nn.save_parameters(
                    os.path.join(args.model_save_path,
                                 'params_%06d.h5' % cur_iter))
                if args.use_latest_checkpoint:
                    save_checkpoint(args.model_save_path, cur_iter, solver)

        # Forward/Zerograd
        image, label = tdata.next()
        image_train.d = image
        label_train.d = label
        loss_error_train.forward(clear_no_need_grad=True)
        solver.zero_grad()

        # Backward/AllReduce
        backward_and_all_reduce(
            loss_error_train,
            comm,
            with_all_reduce_callback=args.with_all_reduce_callback)

        # Solvers update
        solver.update()

        # Linear Warmup
        if i <= warmup_iter:
            lr = base_lr + warmup_slope * i
            solver.set_learning_rate(lr)

        # Monitoring loss, error and elapsed time
        if single_or_rankzero():
            monitor_loss.add(i * n_devices, loss_train.d.copy())
            monitor_err.add(i * n_devices, error_train.d.copy())
            monitor_time.add(i * n_devices)

    # Save nnp last epoch
    if single_or_rankzero():
        runtime_contents = {
            'networks': [{
                'name': 'Validation',
                'batch_size': args.batch_size,
                'outputs': {
                    'y': pred_valid
                },
                'names': {
                    'x': image_valid
                }
            }],
            'executors': [{
                'name': 'Runtime',
                'network': 'Validation',
                'data': ['x'],
                'output': ['y']
            }]
        }
        cur_iter = args.epochs * iter_per_epoch
        nn.save_parameters(
            os.path.join(args.model_save_path, 'params_%06d.h5' % cur_iter))
        nnabla.utils.save.save(
            os.path.join(args.model_save_path, f'{args.net}_result.nnp'),
            runtime_contents)
    if comm:
        comm.barrier()