def _create_dataset(uri, batch_size, shuffle, no_image_normalization, cache_dir, overwrite_cache, create_cache_explicitly, prepare_data_iterator): class Dataset: pass dataset = Dataset() dataset.uri = uri dataset.normalize = not no_image_normalization if prepare_data_iterator: if cache_dir == '': cache_dir = None if cache_dir and create_cache_explicitly: if not os.path.exists(cache_dir) or len(os.listdir(cache_dir)) == 0 or overwrite_cache: if not os.path.exists(cache_dir): os.mkdir(cache_dir) logger.log(99, 'Creating cache data for "' + uri + '"') with data_iterator_csv_dataset(uri, batch_size, shuffle, normalize=False, cache_dir=cache_dir) as di: index = 0 while index < di.size: progress('', (1.0 * di.position) / di.size) di.next() index += batch_size dataset.data_iterator = (lambda: data_iterator_cache( cache_dir, batch_size, shuffle, normalize=dataset.normalize)) elif not cache_dir or overwrite_cache or not os.path.exists(cache_dir) or len(os.listdir(cache_dir)) == 0: if cache_dir and not os.path.exists(cache_dir): os.mkdir(cache_dir) dataset.data_iterator = (lambda: data_iterator_csv_dataset( uri, batch_size, shuffle, normalize=dataset.normalize, cache_dir=cache_dir)) else: dataset.data_iterator = (lambda: data_iterator_cache( cache_dir, batch_size, shuffle, normalize=dataset.normalize)) else: dataset.data_iterator = None return dataset
def _create_cache(self):
    """Write every sample of the wrapped data source into cache file(s).

    Walks the data source once, buffering samples via
    ``_store_data_to_cache_buffer`` and flushing any remainder with
    ``_save_cache_to_file``, then trims the cache bookkeeping lists to
    the number of cache files actually needed.
    """
    # Save all data into cache file(s).
    self._position = 0
    logger.info('Creating cache start')
    percent = 0
    while self._position < self._data_source._size:
        # Log progress once per 10% step; `progress` is called every sample.
        current_percent = self._position * 10 // self._data_source._size
        progress('', self._position * 1.0 / self._data_source._size)
        if current_percent != percent:
            percent = current_percent
            logger.info('Creating cache {}0% finished.'.format(percent))
        self._store_data_to_cache_buffer(self._position)
        self._position += 1
    # Flush the final, possibly partial, buffer.
    if len(self._cache_data) > 0:
        self._save_cache_to_file()
    logger.info('Creating cache end')

    # Adjust data size into reset position. In most cases it means a
    # multiple of bunch (mini-batch) size.
    num_of_cache_files = int(numpy.ceil(
        float(self._data_source._size) / self._cache_size))
    self._cache_file_order = self._cache_file_order[
        0:num_of_cache_files]
    self._cache_file_data_orders = self._cache_file_data_orders[
        0:num_of_cache_files]
    # The last cache file holds only size % cache_size samples when the
    # total size is not an exact multiple of the cache size.
    if self._data_source._size % self._cache_size != 0:
        self._cache_file_data_orders[num_of_cache_files - 1] = self._cache_file_data_orders[
            num_of_cache_files - 1][0:self._data_source._size % self._cache_size]
def _create_dataset(uri, batch_size, shuffle, no_image_normalization, cache_dir, overwrite_cache, create_cache_explicitly, prepare_data_iterator): class Dataset: pass dataset = Dataset() dataset.uri = uri dataset.normalize = not no_image_normalization if prepare_data_iterator: if cache_dir == '': cache_dir = None if cache_dir and create_cache_explicitly: if not os.path.exists(cache_dir) or overwrite_cache: if not os.path.exists(cache_dir): os.mkdir(cache_dir) logger.info('Creating cache data for "' + uri + '"') with data_iterator_csv_dataset(uri, batch_size, shuffle, normalize=False, cache_dir=cache_dir) as di: index = 0 while index < di.size: progress('', (1.0 * di.position) / di.size) di.next() index += batch_size dataset.data_iterator = (lambda: data_iterator_cache( cache_dir, batch_size, shuffle, normalize=dataset.normalize)) elif not cache_dir or overwrite_cache or not os.path.exists(cache_dir): if cache_dir and not os.path.exists(cache_dir): os.mkdir(cache_dir) dataset.data_iterator = (lambda: data_iterator_csv_dataset( uri, batch_size, shuffle, normalize=dataset.normalize, cache_dir=cache_dir)) else: dataset.data_iterator = (lambda: data_iterator_cache( cache_dir, batch_size, shuffle, normalize=dataset.normalize)) else: dataset.data_iterator = None return dataset
def train_command(args):
    """Run the training CLI command.

    Reads the network definition from ``args.config`` (plus optional
    parameters from ``args.param``), wraps the loaded optimizers and
    monitors in small config holders, opens all data iterators inside a
    single ``ExitStack`` and runs ``train``.  When ``max_epoch *
    iter_per_epoch`` is zero, only the parameters are saved.
    """
    configure_progress(os.path.join(args.outdir, 'progress.txt'))
    files = []
    files.append(args.config)
    if args.param:
        files.append(args.param)

    class TrainConfig:
        pass
    config = TrainConfig()
    info = load.load(files)

    logger.log(99, 'Train with contexts {}'.format(available_contexts))

    config.global_config = info.global_config
    config.training_config = info.training_config

    # One holder per optimizer; data_iterator is opened lazily below.
    class OptConfig:
        pass
    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterator = None
        config.optimizers[name] = o

    class MonConfig:
        pass
    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterator = None
        config.monitors[name] = m

    # Training
    max_iter = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch
    if max_iter > 0:
        data_iterators = {'optimizer': {}, 'monitor': {}}
        # ExitStack guarantees every opened iterator is closed, even if
        # train() raises.
        with ExitStack() as stack:
            for name, o in config.optimizers.items():
                o.data_iterator = stack.enter_context(
                    o.optimizer.data_iterator())
            for name, m in config.monitors.items():
                m.data_iterator = stack.enter_context(
                    m.monitor.data_iterator())
            train(args, config)
    else:
        # save parameters without training (0 epoch learning)
        save_parameters(os.path.join(args.outdir, 'parameters.h5'))

    logger.log(99, 'Training Completed.')
    progress(None)
def profile_command(args):
    """Run the profiling CLI command.

    Loads the configuration from ``args.config``, builds optimizer and
    monitor holders, profiles the optimizers (with a device
    ``synchronize`` callback so GPU work is included in timings —
    presumably; confirm against ``profile_optimizer``) and writes the
    timing table to ``<outdir>/profile.csv``.  Returns ``True``.
    """
    configure_progress(os.path.join(args.outdir, 'progress.txt'))
    files = []
    files.append(args.config)

    class TrainConfig:
        pass
    config = TrainConfig()
    info = load.load(files)
    config.global_config = info.global_config
    config.training_config = info.training_config

    class OptConfig:
        pass
    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterator = None
        config.optimizers[name] = o

    class MonConfig:
        pass
    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterator = None
        config.monitors[name] = m

    # Resolve the extension module from the backend name, e.g. "cudnn:float"
    # -> "cudnn".
    ext_module = import_extension_module(
        config.global_config.default_context.backend[0].split(':')[0])

    def synchronize():
        # Block until the default device finishes queued work.
        return ext_module.synchronize(
            device_id=config.global_config.default_context.device_id)

    result_array = [['time in ms']]

    # Profile Optimizer
    with ExitStack() as stack:
        for name, o in config.optimizers.items():
            o.data_iterator = stack.enter_context(o.optimizer.data_iterator())
        result_array = profile_optimizer(config, result_array, synchronize)

    # Write profiling result
    import csv
    with open(args.outdir + os.sep + 'profile.csv', 'w') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerows(result_array)

    logger.log(99, 'Profile Completed.')
    progress(None)
    return True
def create(self, output_cache_dirname, normalize=True, cache_file_name_prefix='cache'):
    """Create cache files for this CSV data source, single-threaded.

    Re-reads each CSV row (in ``self._order`` order) via stored line
    offsets, processes it with ``self._process_row``, and flushes batches
    of ``self._cache_size`` rows through ``self._save_cache``.  Also
    writes ``cache_index.csv``, ``cache_info.csv`` (npy format only),
    ``original.csv`` and ``order.csv`` into the cache directory.
    """
    self._normalize = normalize
    self._cache_file_name_prefix = cache_file_name_prefix
    self._cache_file_format = nnabla_config.get('DATA_ITERATOR', 'cache_file_format')
    logger.info('Cache file format is {}'.format(self._cache_file_format))
    self._cache_dir = output_cache_dirname
    progress(None)
    self._cache_data = []
    for self._position in range(self._size):
        progress('Create cache', self._position * 1.0 / self._size)
        # Seek to the stored byte offset of the (possibly shuffled) row.
        self._file.seek(self._line_positions[self._order[self._position]])
        line = self._file.readline().decode('utf-8')
        csvreader = csv.reader([line])
        row = next(csvreader)
        self._cache_data.append(tuple(self._process_row(row)))
        if len(self._cache_data) >= self._cache_size:
            self._save_cache()
            self._cache_data = []
    # Flush the final, possibly partial, batch.
    self._save_cache()

    # Create Index
    index_filename = os.path.join(self._cache_dir, "cache_index.csv")
    with open(index_filename, 'w') as f:
        writer = csv.writer(f, lineterminator='\n')
        for fn, orders in zip(self._cache_file_names, self._cache_file_data_orders):
            writer.writerow((os.path.basename(fn), len(orders)))
    # Create Info
    if self._cache_file_format == ".npy":
        info_filename = os.path.join(self._cache_dir, "cache_info.csv")
        with open(info_filename, 'w') as f:
            writer = csv.writer(f, lineterminator='\n')
            for variable in self._variables:
                writer.writerow((variable, ))
    # Create original.csv
    if self._original_source_uri is not None:
        shutil.copy(self._original_source_uri,
                    os.path.join(self._cache_dir, "original.csv"))
    # Create order.csv
    if self._order is not None and \
            self._original_order is not None:
        with open(os.path.join(self._cache_dir, "order.csv"), 'w') as o:
            writer = csv.writer(o, lineterminator='\n')
            for orders in zip(self._original_order, self._order):
                writer.writerow(list(orders))
def train_command(args):
    """Run the training CLI command.

    Reads the network definition from ``args.config`` (plus optional
    parameters from ``args.param``), wraps the loaded optimizers and
    monitors in small config holders, opens all data iterators inside a
    single ``ExitStack`` and runs ``train``.  When ``max_epoch *
    iter_per_epoch`` is zero, only the parameters are saved.
    """
    logger.log(99, 'Train with contexts {}'.format(available_contexts))
    configure_progress(os.path.join(args.outdir, 'progress.txt'))
    files = []
    files.append(args.config)
    if args.param:
        files.append(args.param)

    class TrainConfig:
        pass
    config = TrainConfig()
    info = load.load(files)
    config.global_config = info.global_config
    config.training_config = info.training_config

    # One holder per optimizer; data_iterator is opened lazily below.
    class OptConfig:
        pass
    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterator = None
        config.optimizers[name] = o

    class MonConfig:
        pass
    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterator = None
        config.monitors[name] = m

    # Training
    max_iter = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch
    if max_iter > 0:
        data_iterators = {'optimizer': {}, 'monitor': {}}
        # ExitStack guarantees every opened iterator is closed, even if
        # train() raises.
        with ExitStack() as stack:
            for name, o in config.optimizers.items():
                o.data_iterator = stack.enter_context(
                    o.optimizer.data_iterator())
            for name, m in config.monitors.items():
                m.data_iterator = stack.enter_context(
                    m.monitor.data_iterator())
            train(args, config)
    else:
        # save parameters without training (0 epoch learning)
        save_parameters(os.path.join(
            args.outdir, 'parameters.h5'))

    logger.log(99, 'Training Completed.')
    progress(None)
def profile_command(args):
    """Run the profiling CLI command.

    Loads the configuration from ``args.config``, builds optimizer and
    monitor holders, profiles the optimizers and writes the timing table
    to ``<outdir>/profile.csv``.
    """
    configure_progress(os.path.join(args.outdir, 'progress.txt'))

    config_files = [args.config]

    # Plain attribute holders for loaded configuration pieces.
    class TrainConfig:
        pass

    class OptConfig:
        pass

    class MonConfig:
        pass

    config = TrainConfig()
    info = load.load(config_files)
    config.global_config = info.global_config
    config.training_config = info.training_config

    config.optimizers = OrderedDict()
    for opt_name, optimizer_proto in info.optimizers.items():
        holder = OptConfig()
        holder.optimizer = optimizer_proto
        holder.data_iterator = None
        config.optimizers[opt_name] = holder

    config.monitors = OrderedDict()
    for mon_name, monitor_proto in info.monitors.items():
        holder = MonConfig()
        holder.monitor = monitor_proto
        holder.data_iterator = None
        config.monitors[mon_name] = holder

    result_array = [['time in ms']]

    # Profile Optimizer
    # ExitStack closes every opened iterator even if profiling raises.
    with ExitStack() as stack:
        for holder in config.optimizers.values():
            holder.data_iterator = stack.enter_context(
                holder.optimizer.data_iterator())
        result_array = profile_optimizer(config, result_array)

    # Write profiling result
    import csv
    with open(args.outdir + os.sep + 'profile.csv', 'w') as f:
        csv.writer(f, lineterminator='\n').writerows(result_array)

    logger.log(99, 'Profile Completed.')
    progress(None)
def _create_cache(self):
    """Write every sample of the wrapped data source into cache file(s),
    then emit ``cache_index.csv`` (and ``cache_info.csv`` for npy format).

    Progress is reported only on the single process / rank-zero worker.
    """
    # Save all data into cache file(s).
    self._cache_positions = []
    self._position = 0
    percent = 0
    while self._position < self._data_source._size:
        if single_or_rankzero():
            progress('Create cache',
                     self._position * 1.0 / self._data_source._size)
        self._store_data_to_cache_buffer(self._position)
        self._position += 1
    # Flush the final, possibly partial, buffer.
    if len(self._cache_positions) > 0:
        self._save_cache_to_file()

    # Adjust data size into reset position. In most cases it means a
    # multiple of bunch (mini-batch) size.
    num_of_cache_files = int(
        numpy.ceil(float(self._data_source._size) / self._cache_size))
    self._cache_file_order = self._cache_file_order[0:num_of_cache_files]
    self._cache_file_data_orders = self._cache_file_data_orders[
        0:num_of_cache_files]
    # The last cache file holds only size % cache_size samples when the
    # total size is not an exact multiple of the cache size.
    if self._data_source._size % self._cache_size != 0:
        self._cache_file_data_orders[num_of_cache_files - 1] = self._cache_file_data_orders[
            num_of_cache_files - 1][0:self._data_source._size % self._cache_size]

    # Write one (file name, sample count) row per cache file.
    index_filename = os.path.join(self._cache_dir, "cache_index.csv")
    with open(index_filename, 'w') as f:
        writer = csv.writer(f, lineterminator='\n')
        for fn, orders in zip(self._cache_file_names, self._cache_file_data_orders):
            writer.writerow((os.path.basename(fn), len(orders)))
    # Variable names are only recorded for the npy cache format.
    if self._cache_file_format == ".npy":
        info_filename = os.path.join(self._cache_dir, "cache_info.csv")
        with open(info_filename, 'w') as f:
            writer = csv.writer(f, lineterminator='\n')
            for variable in self._variables:
                writer.writerow((variable, ))
def create(self, output_cache_dirname, normalize=True, cache_file_name_prefix='cache'):
    """Create cache files for this CSV data source using a thread pool.

    Rows (in ``self._order`` order) are grouped into chunks of
    ``self._cache_size``; each chunk, tagged with its last position, is
    written by ``self._save_cache`` on a worker thread.  Afterwards the
    index/info/original/order bookkeeping files are written.
    """
    self._normalize = normalize
    self._cache_file_name_prefix = cache_file_name_prefix
    self._cache_dir = output_cache_dirname

    self._cache_file_format = nnabla_config.get('DATA_ITERATOR', 'cache_file_format')
    logger.info('Cache file format is {}'.format(self._cache_file_format))

    progress(None)

    # Bucket the (shuffled) rows into cache-sized chunks; each entry is
    # (position of the chunk's last row, list of rows).
    csv_position_and_data = []
    csv_row = []
    for _position in range(self._size):
        csv_row.append(self._csv_data[self._order[_position]])
        if len(csv_row) == self._cache_size:
            csv_position_and_data.append((_position, csv_row))
            csv_row = []
    # Trailing partial chunk, tagged with the final position.
    if len(csv_row):
        csv_position_and_data.append((self._size - 1, csv_row))

    progress('Create cache', 0)
    with closing(ThreadPool(processes=self._num_of_threads)) as pool:
        cache_index_rows = pool.map(self._save_cache, csv_position_and_data)
    progress('Create cache', 1.0)

    # Create Index
    index_filename = os.path.join(output_cache_dirname, "cache_index.csv")
    with open(index_filename, 'w') as f:
        writer = csv.writer(f, lineterminator='\n')
        for row in cache_index_rows:
            if row:
                # row: (file_path, data_nums)
                writer.writerow((os.path.basename(row[0]), row[1]))

    # Create Info
    if self._cache_file_format == ".npy":
        info_filename = os.path.join(output_cache_dirname, "cache_info.csv")
        with open(info_filename, 'w') as f:
            writer = csv.writer(f, lineterminator='\n')
            for variable in self._variables:
                writer.writerow((variable, ))

    # Create original.csv
    if self._original_source_uri is not None:
        shutil.copy(self._original_source_uri,
                    os.path.join(output_cache_dirname, "original.csv"))

    # Create order.csv
    if self._order is not None and \
            self._original_order is not None:
        with open(os.path.join(output_cache_dirname, "order.csv"), 'w') as o:
            writer = csv.writer(o, lineterminator='\n')
            for orders in zip(self._original_order, self._order):
                writer.writerow(list(orders))
def train_command(args):
    """Run the (multi-process-aware) training CLI command.

    Validates dataset URIs, loads config/parameters, scales
    ``iter_per_epoch`` by the communicator size, records where the config
    came from for later parameter saving, opens and (for multi-rank runs)
    slices the data iterators, then runs ``_train``.  Returns ``False``
    on an empty dataset URI, otherwise ``True``.
    """
    if single_or_rankzero():
        configure_progress(os.path.join(args.outdir, 'progress.txt'))

    info = load.load([args.config], exclude_parameter=True)

    # Check dataset uri is empty.
    dataset_error = False
    for dataset in info.datasets.values():
        if dataset.uri.strip() == '':
            dataset_error = True
    if dataset_error:
        logger.log(99, 'Fatal error. Dataset URI is empty.')
        return False

    class TrainConfig:
        pass
    config = TrainConfig()
    config.timelimit = -1
    if args.param:
        # Parameters are loaded into the global parameter scope as a side
        # effect; the return value is not needed here.
        load.load([args.param], parameter_only=True)

    config.global_config = info.global_config
    config.training_config = info.training_config

    if single_or_rankzero():
        logger.log(99, 'Train with contexts {}'.format(available_contexts))

    class OptConfig:
        pass
    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterator = None
        config.optimizers[name] = o

    class MonConfig:
        pass
    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterator = None
        config.monitors[name] = m

    # Training
    comm = current_communicator()
    # Each rank performs a 1/size share of every epoch's iterations.
    config.training_config.iter_per_epoch //= comm.size if comm else 1
    max_iteration = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch

    # Remember which config file the parameters belong to so that
    # _save_parameters can bundle it later.
    global _save_parameter_info
    _save_parameter_info = {}
    _, config_ext = os.path.splitext(args.config)
    if config_ext == '.prototxt' or config_ext == '.nntxt':
        _save_parameter_info['config'] = args.config
    elif config_ext == '.nnp':
        # Extract the network definition text file from the nnp archive.
        with zipfile.ZipFile(args.config, 'r') as nnp:
            for name in nnp.namelist():
                _, ext = os.path.splitext(name)
                if ext == '.nntxt' or ext == '.prototxt':
                    nnp.extract(name, args.outdir)
                    _save_parameter_info['config'] = os.path.join(
                        args.outdir, name)

    result = False
    if max_iteration > 0:
        data_iterators = {'optimizer': {}, 'monitor': {}}
        # Same seed per rank index so slices are deterministic.
        rng = np.random.RandomState(comm.rank if comm else 0)
        with ExitStack() as stack:
            for name, o in config.optimizers.items():
                o.data_iterator = stack.enter_context(
                    o.optimizer.data_iterator())
                if comm and comm.size > 1:
                    # Each rank iterates only its own shard of the data.
                    o.data_iterator = o.data_iterator.slice(
                        rng, comm.size, comm.rank)
            for name, m in config.monitors.items():
                m.data_iterator = stack.enter_context(
                    m.monitor.data_iterator())
                if comm and comm.size > 1:
                    m.data_iterator = m.data_iterator.slice(
                        rng, comm.size, comm.rank)
            result = _train(args, config)
    else:
        # save parameters without training (0 epoch learning)
        logger.log(99, '0 epoch learning. (Just save parameter.)')
        if single_or_rankzero():
            _save_parameters(args, 'current', 0, True)
        result = True

    if single_or_rankzero():
        if result:
            logger.log(99, 'Training Completed.')
        else:
            logger.log(99, 'Training Incompleted.')
    if single_or_rankzero():
        progress(None)
    return True
def _update(iter, config, cost):
    """Perform one training iteration over every configured optimizer.

    For each optimizer: feed dataset/generator data into its input
    variables, accumulate the previous iteration's loss (read *before*
    forward so data transfer overlaps compute), run forward/backward and
    apply the solver update on the optimizer's update interval.  Mutates
    and returns ``cost`` (fields ``sum_iter``, ``sum_epoch``,
    ``variables``).
    """
    loaded_datas = {}
    is_first_optimizer = True

    for opt in config.optimizers.values():
        o = opt.optimizer
        # Load dataset
        # NOTE(review): the cache is keyed by o.data_iterator but advanced
        # via opt.data_iterator — presumably a one-to-one pairing; confirm.
        di = opt.data_iterator
        if o.data_iterator not in loaded_datas:
            loaded_datas[o.data_iterator] = di.next()
        datas = loaded_datas[o.data_iterator]
        for v, d in o.dataset_assign.items():
            # Only inputs of the first forward function get the default
            # context explicitly; others inherit their context.
            dest_context = config.global_config.default_context if not o.forward_sequence or v not in o.forward_sequence[
                0].inputs else None
            let_data_to_variable(v.variable_instance, datas[
                                 di.variables.index(d)], ctx=dest_context)

        # Generate data
        for v, generator in o.generator_assign.items():
            dest_context = config.global_config.default_context if not o.forward_sequence or v not in o.forward_sequence[
                0].inputs else None
            let_data_to_variable(v.variable_instance,
                                 data=generator(v.shape), ctx=dest_context)

        # Monitor loss before forward to prepare input data while processing on
        # GPU
        if cost.variables:
            for l in cost.variables:
                cost.sum_iter += np.mean(l.variable_instance.d)
            if is_first_optimizer:
                is_first_optimizer = False
                progress("Training : cost={0:0.6f}".format(cost.sum_iter),
                         (iter % config.training_config.iter_per_epoch) * 1.0 / config.training_config.iter_per_epoch)
                cost.sum_epoch += cost.sum_iter
                cost.sum_iter = 0.0

        # Forward
        o.network.forward(o.forward_sequence)

        # Backward
        o.network.backward(o.backward_sequence, iter % o.update_interval == 0)

        # Update
        if iter % o.update_interval == o.update_interval - 1:
            if o.weight_decay > 0:
                o.solver.weight_decay(o.weight_decay)
            o.solver.update()
        if o.lr_decay != 1.0 and iter % o.lr_decay_interval == o.lr_decay_interval - 1:
            o.solver.set_learning_rate(o.solver.learning_rate() * o.lr_decay)

        # Reserve monitor loss
        cost.variables = o.loss_variables

    # Monitor loss at the end of iteration
    if iter % config.training_config.iter_per_epoch == config.training_config.iter_per_epoch - 1 and cost.variables:
        for l in cost.variables:
            cost.sum_iter += np.mean(l.variable_instance.d)
        cost.sum_epoch += cost.sum_iter
        cost.variables = None
        cost.sum_iter = 0.0

    return cost
def create(self, output_cache_dirname, normalize=True, cache_file_name_prefix='cache'):
    """Create cache files for this CSV data source using worker processes.

    Rows (in ``self._order`` order) are grouped into chunks of
    ``self._cache_size`` and written by ``multiprocess_save_cache`` in a
    ``multiprocessing.Pool``; results are collected through a
    ``Manager().list()`` shared across processes.  Afterwards the
    index/info/original/order bookkeeping files are written.
    """
    cache_file_format = nnabla_config.get('DATA_ITERATOR', 'cache_file_format')
    logger.info('Cache file format is {}'.format(cache_file_format))

    progress(None)

    # Shared, process-safe list the workers append (file name, count) to.
    cache_file_name_and_data_nums_list = multiprocessing.Manager().list()

    # Bucket the (shuffled) rows into cache-sized chunks; each entry is
    # (position of the chunk's last row, list of rows).
    csv_position_and_data = []
    csv_row = []
    for _position in range(self._size):
        csv_row.append(self._csv_data[self._order[_position]])
        if len(csv_row) == self._cache_size:
            csv_position_and_data.append((_position, csv_row))
            csv_row = []
    # Trailing partial chunk, tagged with the final position.
    if len(csv_row):
        csv_position_and_data.append((self._size - 1, csv_row))

    # Everything a worker needs, as plain picklable values (self is not
    # sent to the child processes).
    self_args = {
        '_cache_file_name_prefix': cache_file_name_prefix,
        '_cache_file_format': cache_file_format,
        '_cache_file_name_and_data_nums_list': cache_file_name_and_data_nums_list,
        '_output_cache_dirname': output_cache_dirname,
        '_variables': self._variables,
        '_filereader': self._filereader,
        '_normalize': normalize,
        '_columns': self._columns,
        '_cache_file_count': len(csv_position_and_data)
    }

    # Notice:
    #   Here, we have to place a gc.collect(), since we found
    #   python might perform garbage collection operation in
    #   a child process, which tends to release some objects
    #   created by its parent process, thus, it might touch
    #   cuda APIs which has not initialized in child process.
    #   Place a gc.collect() here can avoid such cases.
    gc.collect()
    progress('Create cache', 0)
    with closing(multiprocessing.Pool(self._process_num)) as pool:
        pool.map(multiprocess_save_cache,
                 ((i, self_args) for i in csv_position_and_data))
    progress('Create cache', 1.0)

    logger.info('The total of cache files is {}'.format(
        len(cache_file_name_and_data_nums_list)))

    # Create Index
    index_filename = os.path.join(output_cache_dirname, "cache_index.csv")
    # Workers finish in arbitrary order; sort rows by file name/position.
    cache_index_rows = sorted(cache_file_name_and_data_nums_list,
                              key=lambda x: x[0])
    with open(index_filename, 'w') as f:
        writer = csv.writer(f, lineterminator='\n')
        for file_name, data_nums in cache_index_rows:
            writer.writerow((os.path.basename(file_name), data_nums))

    # Create Info
    if cache_file_format == ".npy":
        info_filename = os.path.join(output_cache_dirname, "cache_info.csv")
        with open(info_filename, 'w') as f:
            writer = csv.writer(f, lineterminator='\n')
            for variable in self._variables:
                writer.writerow((variable, ))

    # Create original.csv
    if self._original_source_uri is not None:
        shutil.copy(self._original_source_uri,
                    os.path.join(output_cache_dirname, "original.csv"))

    # Create order.csv
    if self._order is not None and \
            self._original_order is not None:
        with open(os.path.join(output_cache_dirname, "order.csv"), 'w') as o:
            writer = csv.writer(o, lineterminator='\n')
            for orders in zip(self._original_order, self._order):
                writer.writerow(list(orders))
def _update(iter, config, cost):
    """Perform one training iteration over every configured optimizer,
    with optional multi-rank gradient/cost reduction via the communicator.

    Mutates and returns ``cost`` (fields ``sum_iteration``, ``sum_epoch``,
    ``num_iteration``, ``variables``).
    """
    comm = current_communicator()

    loaded_data = {}
    is_first_optimizer = True

    def _sum_cost():
        # Fold the per-iteration cost into the epoch total; with a
        # communicator the cost is all-reduced across ranks first.
        if comm:
            # logger.log(99, "Calc cost with communicator")
            var = [nn.NdArray()]
            var[0].data = cost.sum_iteration
            _all_reduce(comm, var, division=False, inplace=True)
            cost.sum_epoch += var[0].data
            cost.num_iteration += comm.size
        else:
            cost.sum_epoch += cost.sum_iteration
            cost.num_iteration += 1

    for opt in config.optimizers.values():
        o = opt.optimizer
        # Load dataset
        # NOTE(review): the cache is keyed by o.data_iterator but advanced
        # via opt.data_iterator — presumably a one-to-one pairing; confirm.
        di = opt.data_iterator
        if o.data_iterator not in loaded_data:
            loaded_data[o.data_iterator] = di.next()
        data = loaded_data[o.data_iterator]
        for v, d in o.dataset_assign.items():
            # Only inputs of the first forward function get the default
            # context explicitly; others inherit their context.
            dest_context = config.global_config.default_context if not o.forward_sequence or v not in o.forward_sequence[
                0].inputs else None
            let_data_to_variable(v.variable_instance,
                                 data[di.variables.index(d)],
                                 ctx=dest_context, data_name=d,
                                 variable_name=v.name)

        # Generate data
        for v, generator in o.generator_assign.items():
            dest_context = config.global_config.default_context if not o.forward_sequence or v not in o.forward_sequence[
                0].inputs else None
            let_data_to_variable(v.variable_instance,
                                 data=generator(v.shape), ctx=dest_context,
                                 variable_name=v.name)

        # Monitor loss before forward to prepare input data while processing on
        # GPU
        if cost.variables:
            for l in cost.variables:
                cost.sum_iteration += np.mean(l.variable_instance.d)
                l.variable_instance.data.zero()
            if is_first_optimizer:
                is_first_optimizer = False
                _sum_cost()
                if single_or_rankzero():
                    progress(
                        "Training : cost={0:0.6f}".format(cost.sum_iteration),
                        (iter % config.training_config.iter_per_epoch) * 1.0 / config.training_config.iter_per_epoch)
                cost.sum_iteration = 0.0

        # Forward
        o.network.forward(o.forward_sequence)

        # Backward
        o.network.backward(o.backward_sequence, iter % o.update_interval == 0)

        # Update
        if iter % o.update_interval == o.update_interval - 1:
            if o.weight_decay > 0:
                o.solver.weight_decay(o.weight_decay)

            if o.comm:  # Updated param with communicator
                # All-reduce gradients across ranks before the solver step.
                params = [x.grad for x in o.parameters.values()]
                _all_reduce(o.comm, params, division=True, inplace=True)

            if o.scheduler is not None:
                o.solver.set_learning_rate(o.scheduler.get_learning_rate(iter))
            o.solver.update()

        # Sync w sometimes
        if iter % 10 == 9:
            # TODO: change the interval
            if o.comm:
                # Periodically average the weights themselves to counter
                # numerical drift between ranks.
                params = [x.data for x in o.parameters.values()]
                _all_reduce(o.comm, params, division=True, inplace=True)

        # Reserve monitor loss
        cost.variables = o.loss_variables

    # Monitor loss at the end of iteration
    if iter % config.training_config.iter_per_epoch == config.training_config.iter_per_epoch - 1 and cost.variables:
        for l in cost.variables:
            cost.sum_iteration += np.mean(l.variable_instance.d)
            l.variable_instance.data.zero()
        _sum_cost()
        cost.variables = None
        cost.sum_iteration = 0.0

    return cost
def _update(iter, config, cost):
    """Perform one training iteration over every configured optimizer.

    For each optimizer: feed dataset/generator data into its input
    variables, accumulate the previous iteration's loss (read *before*
    forward so data transfer overlaps compute), run forward/backward and
    apply the solver update on the optimizer's update interval.  Mutates
    and returns ``cost`` (fields ``sum_iter``, ``sum_epoch``,
    ``variables``).
    """
    loaded_datas = {}
    is_first_optimizer = True

    for opt in config.optimizers.values():
        o = opt.optimizer
        # Load dataset
        # NOTE(review): the cache is keyed by o.data_iterator but advanced
        # via opt.data_iterator — presumably a one-to-one pairing; confirm.
        di = opt.data_iterator
        if o.data_iterator not in loaded_datas:
            loaded_datas[o.data_iterator] = di.next()
        datas = loaded_datas[o.data_iterator]
        for v, d in o.dataset_assign.items():
            # Only inputs of the first forward function get the default
            # context explicitly; others inherit their context.
            dest_context = config.global_config.default_context if not o.forward_sequence or v not in o.forward_sequence[
                0].inputs else None
            let_data_to_variable(v.variable_instance,
                                 datas[di.variables.index(d)],
                                 ctx=dest_context)

        # Generate data
        for v, generator in o.generator_assign.items():
            dest_context = config.global_config.default_context if not o.forward_sequence or v not in o.forward_sequence[
                0].inputs else None
            let_data_to_variable(v.variable_instance,
                                 data=generator(v.shape), ctx=dest_context)

        # Monitor loss before forward to prepare input data while processing on
        # GPU
        if cost.variables:
            for l in cost.variables:
                cost.sum_iter += np.mean(l.variable_instance.d)
            if is_first_optimizer:
                is_first_optimizer = False
                progress("Training : cost={0:0.6f}".format(cost.sum_iter),
                         (iter % config.training_config.iter_per_epoch) * 1.0 / config.training_config.iter_per_epoch)
                cost.sum_epoch += cost.sum_iter
                cost.sum_iter = 0.0

        # Forward
        o.network.forward(o.forward_sequence)

        # Backward
        o.network.backward(o.backward_sequence, iter % o.update_interval == 0)

        # Update
        if iter % o.update_interval == o.update_interval - 1:
            if o.weight_decay > 0:
                o.solver.weight_decay(o.weight_decay)
            o.solver.update()
        if o.lr_decay != 1.0 and iter % o.lr_decay_interval == o.lr_decay_interval - 1:
            o.solver.set_learning_rate(o.solver.learning_rate() * o.lr_decay)

        # Reserve monitor loss
        cost.variables = o.loss_variables

    # Monitor loss at the end of iteration
    if iter % config.training_config.iter_per_epoch == config.training_config.iter_per_epoch - 1 and cost.variables:
        for l in cost.variables:
            cost.sum_iter += np.mean(l.variable_instance.d)
        cost.sum_epoch += cost.sum_iter
        cost.variables = None
        cost.sum_iter = 0.0

    return cost
def _update(iter, config, cost):
    """Perform one training iteration over every configured optimizer.

    Extends the basic loop with: multiple data iterators per optimizer,
    reserved dataset variables ("%iter", "%epoch", ...), per-optimizer
    start/end iteration gating, cost-time collection, and optional
    multi-rank gradient/cost reduction via the communicator.  Mutates and
    returns ``cost``.
    """
    comm = current_communicator()

    loaded_data = {}
    is_first_optimizer = True

    def _sum_cost():
        # Fold the per-iteration cost into the epoch total; with a
        # communicator the cost is all-reduced across ranks first.
        if comm:
            # logger.log(99, "Calc cost with communicator")
            var = [nn.NdArray()]
            var[0].data = cost.sum_iteration
            _all_reduce(comm, var, division=False, inplace=True)
            cost.sum_epoch += var[0].data
            cost.num_iteration += comm.size
        else:
            cost.sum_epoch += cost.sum_iteration
            cost.num_iteration += 1

    def _get_reserved_variable(shape, reserved_variable_name, iter,
                               iter_per_epoch, max_epoch):
        # Map a "%..." dataset name to its scalar value for this iteration.
        if reserved_variable_name == "%iter":
            value = iter
        elif reserved_variable_name == "%max_iter":
            value = max_epoch * iter_per_epoch
        elif reserved_variable_name == "%epoch":
            value = iter // iter_per_epoch
        elif reserved_variable_name == "%epochf":
            value = iter * 1.0 / iter_per_epoch
        elif reserved_variable_name == "%max_epoch":
            value = max_epoch
        elif reserved_variable_name == "%progress":
            value = (iter * 1.0 / iter_per_epoch) / max_epoch
        else:
            raise ValueError(
                "Unknown reserved variable {}".format(reserved_variable_name))
        return value

    for opt in config.optimizers.values():
        o = opt.optimizer
        # Skip optimizers outside their [start_iter, end_iter] window
        # (0 means unbounded on that side).
        if (o.start_iter == 0 or iter + 1 >= o.start_iter) and (o.end_iter == 0 or iter + 1 <= o.end_iter):
            # Load dataset
            data = OrderedDict()
            for di in opt.data_iterators:
                if di not in loaded_data:
                    loaded_data[di] = di.next()
                data.update(zip(di.variables, loaded_data[di]))

            for v, d in o.dataset_assign.items():
                # Only inputs of the first forward function get the default
                # context explicitly; others inherit their context.
                dest_context = config.global_config.default_context if not o.forward_sequence or v not in o.forward_sequence[
                    0].inputs else None
                if d not in data and d[0] == "%":
                    value = _get_reserved_variable(
                        v.variable_instance.shape, d, iter,
                        config.training_config.iter_per_epoch,
                        config.training_config.max_epoch)
                    v.variable_instance.data.fill(value)
                elif d in data:
                    let_data_to_variable(v.variable_instance, data[d],
                                         ctx=dest_context, data_name=d,
                                         variable_name=v.name)
                else:
                    # NOTE(review): message joins o.data_iterators.keys();
                    # opt.data_iterators above is iterated as a sequence —
                    # verify this attribute is a mapping here.
                    raise ValueError(
                        'Variable "{}" is not found in dataset "{}", optimizer "{}"'.format(
                            d, ', '.join(o.data_iterators.keys()), o.name))

            # Generate data
            for v, generator in o.generator_assign.items():
                dest_context = config.global_config.default_context if not o.forward_sequence or v not in o.forward_sequence[
                    0].inputs else None
                let_data_to_variable(v.variable_instance,
                                     data=generator(v.shape),
                                     ctx=dest_context,
                                     variable_name=v.name)

            # Monitor loss before forward to prepare input data while processing on
            # GPU
            if cost.variables:
                for l in cost.variables:
                    cost.sum_iteration += np.mean(l.variable_instance.d)
                    # l.variable_instance.data.zero()
                if is_first_optimizer:
                    is_first_optimizer = False
                    _sum_cost()
                    if single_or_rankzero():
                        progress(
                            "Training : cost={0:0.6f}".format(
                                cost.sum_iteration),
                            (iter % config.training_config.iter_per_epoch) * 1.0 / config.training_config.iter_per_epoch)
                    cost.sum_iteration = 0.0

            with nodeTimeCollector.collect_cost_time(comm, iter):
                # Forward
                o.network.forward(o.forward_sequence)

                # Backward
                o.network.backward(o.backward_sequence,
                                   iter % o.update_interval == 0)

            # Update
            if iter % o.update_interval == o.update_interval - 1:
                if o.weight_decay > 0:
                    o.solver.weight_decay(o.weight_decay)

                if o.comm:  # Updated param with communicator
                    # All-reduce gradients across ranks before the step.
                    params = [x.grad for x in o.parameters.values()]
                    _all_reduce(o.comm, params, division=True, inplace=True)

                if o.scheduler is not None:
                    o.solver.set_learning_rate(
                        o.scheduler.get_learning_rate(iter))
                o.solver.update()

            # Sync w sometimes
            if iter % 10 == 9:
                # TODO: change the interval
                if o.comm:
                    # Periodically average the weights themselves to counter
                    # numerical drift between ranks.
                    params = [x.data for x in o.parameters.values()]
                    _all_reduce(o.comm, params, division=True, inplace=True)

            # Reserve monitor loss
            cost.variables = o.loss_variables

    # Monitor loss at the end of epoch
    if iter % config.training_config.iter_per_epoch == config.training_config.iter_per_epoch - 1 and cost.variables:
        for l in cost.variables:
            cost.sum_iteration += np.mean(l.variable_instance.d)
            # l.variable_instance.data.zero()
        _sum_cost()
        cost.variables = None
        cost.sum_iteration = 0.0

    return cost
def create(self, output_cache_dirname, normalize=True,
           cache_file_name_prefix='cache'):
    """Build the on-disk cache for this CSV data source using a process pool.

    Splits the (ordered) CSV rows into chunks of `self._cache_size`, converts
    each chunk to a cache file via `multiprocess_save_cache`, then writes the
    bookkeeping files (cache_index.csv, cache_info.csv for .npy format,
    original.csv, order.csv) into `output_cache_dirname`.

    Args:
        output_cache_dirname: destination directory for all cache files.
        normalize: whether image data is normalized when loaded.
        cache_file_name_prefix: prefix for generated cache file names.
    """
    self._normalize = normalize
    self._cache_file_name_prefix = cache_file_name_prefix

    self._cache_file_format = nnabla_config.get('DATA_ITERATOR',
                                                'cache_file_format')
    logger.info('Cache file format is {}'.format(self._cache_file_format))

    self._cache_dir = output_cache_dirname

    progress(None)

    # Workers push (cache_filename, num_data) tuples onto this queue;
    # a Manager queue is used so it is shareable across processes.
    self._cache_file_name_and_data_nums_q = multiprocessing.Manager().Queue()

    # Chunk the shuffled row set: each entry is (last_position, rows).
    self._csv_position_and_data = []
    csv_row = []
    for _position in range(self._size):
        csv_row.append(self._csv_data[self._order[_position]])
        if len(csv_row) == self._cache_size:
            self._csv_position_and_data.append((_position, csv_row))
            csv_row = []
    if len(csv_row):
        # Trailing partial chunk.
        self._csv_position_and_data.append((self._size - 1, csv_row))

    # Plain-dict snapshot of the state the workers need (picklable).
    # NOTE(review): the multiprocess_save_cache variant visible in this file
    # reads cc_args._output_cache_dirname and appends to
    # cc_args._cache_file_name_and_data_nums_list, while the keys passed here
    # are '_cache_dir' and a queue — confirm which worker this caller pairs
    # with.
    self_args = {
        '_cache_file_name_prefix': self._cache_file_name_prefix,
        '_cache_file_format': self._cache_file_format,
        '_cache_file_name_and_data_nums_q':
            self._cache_file_name_and_data_nums_q,
        '_cache_dir': self._cache_dir,
        '_variables': self._variables,
        '_filereader': self._filereader,
        '_normalize': self._normalize,
        '_columns': self._columns,
        '_cache_file_count': len(self._csv_position_and_data)
    }

    progress('Create cache', 0)
    with closing(multiprocessing.Pool(self._process_num)) as pool:
        pool.map(multiprocess_save_cache,
                 ((i, self_args) for i in self._csv_position_and_data))
    progress('Create cache', 1.0)

    logger.info('The total of cache files is {}'.format(
        self._cache_file_name_and_data_nums_q.qsize()))

    # Create Index
    index_filename = os.path.join(self._cache_dir, "cache_index.csv")
    cache_index_rows = []
    # Drain the queue without blocking; an empty queue raises and ends the loop.
    while True:
        try:
            cache_index_rows.append(
                self._cache_file_name_and_data_nums_q.get(block=False))
        except Exception:
            break
    # Sort by cache file name so the index is in chunk order.
    cache_index_rows = sorted(cache_index_rows, key=lambda x: x[0])
    with open(index_filename, 'w') as f:
        writer = csv.writer(f, lineterminator='\n')
        for file_name, data_nums in cache_index_rows:
            writer.writerow((os.path.basename(file_name), data_nums))

    # Create Info
    if self._cache_file_format == ".npy":
        info_filename = os.path.join(self._cache_dir, "cache_info.csv")
        with open(info_filename, 'w') as f:
            writer = csv.writer(f, lineterminator='\n')
            for variable in self._variables:
                writer.writerow((variable, ))

    # Create original.csv
    if self._original_source_uri is not None:
        shutil.copy(self._original_source_uri,
                    os.path.join(self._cache_dir, "original.csv"))

    # Create order.csv
    if self._order is not None and \
            self._original_order is not None:
        with open(os.path.join(self._cache_dir, "order.csv"), 'w') as o:
            writer = csv.writer(o, lineterminator='\n')
            for orders in zip(self._original_order, self._order):
                writer.writerow(list(orders))
def _create_cache(self):
    """Stream the whole underlying data source into cache files.

    Iterates every position of `self._data_source`, buffers rows via
    `_store_data_to_cache_buffer`, flushes the remainder, trims the cache
    bookkeeping to the real data size, then writes cache_index.csv,
    cache_info.csv (.npy format only), original.csv and order.csv into
    `self._cache_dir`. Progress is only reported from rank 0.
    """
    # Save all data into cache file(s).
    self._cache_positions = []
    self._position = 0

    # NOTE(review): `percent` is never read in this variant — looks like a
    # leftover from a sibling implementation.
    percent = 0

    if single_or_rankzero():
        progress(None)

    while self._position < self._data_source._size:
        if single_or_rankzero():
            progress('Create cache',
                     self._position * 1.0 / self._data_source._size)
        self._store_data_to_cache_buffer(self._position)
        self._position += 1
    if len(self._cache_positions) > 0:
        # Flush the trailing partial buffer.
        self._save_cache_to_file()

    if single_or_rankzero():
        progress(None)

    # Adjust data size into reseted position. In most case it means
    # multiple of bunch(mini-batch) size.
    num_of_cache_files = int(
        numpy.ceil(float(self._data_source._size) / self._cache_size))
    self._cache_file_order = self._cache_file_order[0:num_of_cache_files]
    self._cache_file_data_orders = self._cache_file_data_orders[
        0:num_of_cache_files]
    if self._data_source._size % self._cache_size != 0:
        # The last cache file holds only the remainder of the data.
        self._cache_file_data_orders[
            num_of_cache_files - 1] = self._cache_file_data_orders[
                num_of_cache_files - 1][
                    0:self._data_source._size % self._cache_size]

    # Create Index
    index_filename = os.path.join(self._cache_dir, "cache_index.csv")
    with open(index_filename, 'w') as f:
        writer = csv.writer(f, lineterminator='\n')
        for fn, orders in zip(self._cache_file_names,
                              self._cache_file_data_orders):
            writer.writerow((os.path.basename(fn), len(orders)))

    # Create Info
    if self._cache_file_format == ".npy":
        info_filename = os.path.join(self._cache_dir, "cache_info.csv")
        with open(info_filename, 'w') as f:
            writer = csv.writer(f, lineterminator='\n')
            for variable in self._variables:
                writer.writerow((variable, ))

    # Create original.csv
    if self._data_source._original_source_uri is not None:
        fr = FileReader(self._data_source._original_source_uri)
        with fr.open() as f:
            csv_lines = [x.decode('utf-8') for x in f.readlines()]
            with open(os.path.join(self._cache_dir, "original.csv"),
                      'w') as o:
                for l in csv_lines:
                    o.write(l)

    # Create order.csv
    if self._data_source._order is not None and \
            self._data_source._original_order is not None:
        with open(os.path.join(self._cache_dir, "order.csv"), 'w') as o:
            writer = csv.writer(o, lineterminator='\n')
            for orders in zip(self._data_source._original_order,
                              self._data_source._order):
                writer.writerow(list(orders))
def compare_with_cpu_command(args):
    """Compare optimizer results between the current context and a CPU run.

    Loads `args.config` under the 'current' parameter scope and `args.config2`
    under the 'cpu' scope, runs `compare_optimizer` over both, and writes the
    1-Correl table to `<outdir>/compare_with_cpu.csv`.

    Returns:
        True on completion.
    """
    configure_progress(os.path.join(args.outdir, 'progress.txt'))

    class TrainConfig:
        pass

    class OptConfig:
        pass

    class MonConfig:
        pass

    def _load_train_config(config_path, scope_name):
        # Load one config file under its own parameter scope and wrap the
        # result in a TrainConfig with per-optimizer/monitor slots.
        with nn.parameter_scope(scope_name):
            info = load.load([config_path])
            params = get_parameters(grad_only=False)

        cfg = TrainConfig()
        cfg.global_config = info.global_config
        cfg.training_config = info.training_config

        cfg.optimizers = OrderedDict()
        for opt_name, opt in info.optimizers.items():
            oc = OptConfig()
            oc.optimizer = opt
            oc.data_iterator = None
            cfg.optimizers[opt_name] = oc

        cfg.monitors = OrderedDict()
        for mon_name, mon in info.monitors.items():
            mc = MonConfig()
            mc.monitor = mon
            mc.data_iterator = None
            cfg.monitors[mon_name] = mc

        return cfg, params

    # Load config with current context
    config, parameters = _load_train_config(args.config, 'current')
    # Load config with cpu context
    config_cpu, cpu_parameters = _load_train_config(args.config2, 'cpu')

    result_array = [['1-Correl']]

    # Profile Optimizer: open every data iterator for the lifetime of the
    # comparison, then let ExitStack close them all.
    with ExitStack() as stack:
        for cfg in (config, config_cpu):
            for _, oc in cfg.optimizers.items():
                oc.data_iterator = stack.enter_context(
                    oc.optimizer.data_iterator())
        result_array = compare_optimizer(config, parameters, config_cpu,
                                         cpu_parameters, result_array)

    # Write profiling result
    import csv
    with open(args.outdir + os.sep + 'compare_with_cpu.csv', 'w') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerows(result_array)

    logger.log(99, 'Compare with CPU Completed.')
    progress(None)
    return True
def _train(args, config):
    """Main training loop.

    Resumes from saved parameters when requested, iterates `_update` up to
    max_epoch * iter_per_epoch, and at each epoch boundary evaluates,
    reports monitoring data, and saves parameters.

    Returns:
        (succeeded, time_limit_reached) tuple of bools.
    """
    global _save_parameter_info
    comm = current_communicator()
    # Minimum epoch duration (seconds) before CPU/GPU load gets logged.
    _CGLOAD_LOG_INTERVAL = 20

    best_epoch = None
    best_error = None
    last_epoch = 0
    if args.resume:
        last_epoch, best_epoch, best_error = _get_current_parameter(args)
        if best_epoch is not None:
            logger.log(
                99, "Best error {} recorded at epoch {} in previous "
                "training.".format(best_error, best_epoch))
            if best_epoch > last_epoch:
                logger.log(
                    99, "Resumed epoch is {} but this training keep this "
                    "result.".format(last_epoch))
        logger.log(99, "Resume from epoch {}".format(last_epoch + 1))

    callback.update_status(('epoch.max', config.training_config.max_epoch))
    callback.update_status(('epoch.current', last_epoch + 1
                            if last_epoch < config.training_config.max_epoch
                            else config.training_config.max_epoch))

    max_iteration = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch
    if single_or_rankzero():
        logger.log(99, 'Training epoch {} of {} begin'.format(
            last_epoch + 1, config.training_config.max_epoch))

    class Cost:
        pass
    cost = Cost()
    cost.sum_epoch = 0.0
    cost.num_iteration = 0
    cost.sum_iteration = 0.0
    cost.variables = None

    class TimeInfo:
        pass
    timeinfo = TimeInfo()
    timeinfo.past_time = 0
    timeinfo.estimate_time = 0
    timeinfo.last_past_time = None

    if max_iteration > 0:
        last_iteration = last_epoch * config.training_config.iter_per_epoch
        if last_iteration < max_iteration:

            timeinfo.start_time = time.time()
            timeinfo.last_epoch_start_time = timeinfo.start_time

            callback.update_status('processing', True, timeinfo.start_time)

            for iteration in range(last_iteration, max_iteration):

                # instant load measurement
                measure_cpu_gpu_instant_load()

                cost = _update(iteration, config, cost)

                if np.isnan(cost.sum_epoch) or np.isinf(cost.sum_epoch):
                    logger.log(99, 'Cost is Nan')
                    return False, False

                timeinfo = _calc_estimate_time(timeinfo, max_iteration,
                                               last_iteration, iteration + 1)
                callback.update_time_train(prediction=timeinfo.estimate_time)

                if 0 < config.timelimit < timeinfo.estimate_time:
                    logger.log(
                        99,
                        'Expected training time ({:.3f}s) will exceed time '
                        'limit ({}s).'.format(timeinfo.estimate_time,
                                              config.timelimit))
                    return False, False

                if (iteration + 1) % config.training_config.iter_per_epoch == 0:
                    # NOTE(review): this local is never read afterwards —
                    # looks like a leftover (timeinfo.last_past_time is the
                    # attribute used elsewhere). Confirm before removing.
                    last_past_time = -1
                    # End of epoch
                    epoch = iteration // config.training_config.iter_per_epoch + 1
                    cost_avg_epoch = cost.sum_epoch / cost.num_iteration \
                        if cost.num_iteration else 0
                    cost.sum_epoch = 0.0
                    cost.num_iteration = 0
                    monitoring_report = []

                    # Evaluation: every monitor_interval epochs, plus the
                    # first five epochs unconditionally.
                    error_str = ''
                    if epoch % config.training_config.monitor_interval == 0 \
                            or epoch <= 5:
                        best_error, error_str = _evaluate(
                            args, config, monitoring_report, best_error,
                            epoch)

                    # Cpu/Gpu average load
                    cg_load_str = ''
                    cgload_log = ''
                    cg_load = get_cpu_gpu_average_load()
                    if cg_load:
                        cg_load_str = 'epoch {} average_load_matrix: {}'.format(
                            epoch, cg_load)
                        span = _calc_epoch_span(timeinfo)
                        if span > _CGLOAD_LOG_INTERVAL:
                            cgload_log = _format_cgload_log(cg_load)

                    if single_or_rankzero():
                        # Write to monitoring_report.yml
                        f = open(
                            os.path.join(args.outdir,
                                         'monitoring_report.yml'), 'a')
                        f.write('{}:\n'.format(epoch - 1))
                        f.write(' cost: {}\n'.format(cost_avg_epoch))
                        for s in monitoring_report:
                            f.write(s)
                        f.close()

                        callback.update_status(
                            (['monitoring_report', epoch, 'cost'],
                             cost_avg_epoch))

                        _save_parameters(args, 'current', epoch, config)

                        callback.update_status(('epoch.current', epoch))
                        callback.update_status()

                        logger.log(
                            99,
                            'epoch {} of {} cost={:.6f} {} time=({:.1f}s '
                            '/{:.1f}s) {}'.format(
                                epoch, config.training_config.max_epoch,
                                cost_avg_epoch, error_str,
                                timeinfo.past_time, timeinfo.estimate_time,
                                cgload_log))

                        if cg_load_str:
                            # cpu_gpu_average_load record at epoch level
                            callback.update_status(
                                (['cpu_gpu_epoch_load', epoch], cg_load))
                            progress(cg_load_str, 1)

                        if not callback.check_training_time(
                                args, config, timeinfo, epoch, last_epoch):
                            # Time budget exhausted: save and report
                            # "stopped by time limit".
                            _save_parameters(args, 'current', epoch, config,
                                             True)
                            return False, True

            if single_or_rankzero():
                _save_parameters(args, 'current', epoch, config, True)
    return True, False
def create(self, output_cache_dirname, normalize=True,
           cache_file_name_prefix='cache'):
    """Build the on-disk cache for this CSV data source (single process).

    Re-reads each CSV line in shuffled order from the already-open file,
    converts it via `_process_row`, buffers `self._cache_size` rows per cache
    file, then writes cache_index.csv, cache_info.csv (.npy only),
    original.csv and order.csv into `output_cache_dirname`.

    Args:
        output_cache_dirname: destination directory for all cache files.
        normalize: whether image data is normalized when loaded.
        cache_file_name_prefix: prefix for generated cache file names.
    """
    self._normalize = normalize
    self._cache_file_name_prefix = cache_file_name_prefix

    self._cache_file_format = nnabla_config.get('DATA_ITERATOR',
                                                'cache_file_format')
    logger.info('Cache file format is {}'.format(self._cache_file_format))

    self._cache_dir = output_cache_dirname

    progress(None)

    self._cache_file_order = []
    self._cache_file_data_orders = []
    self._cache_file_names = []

    self._cache_data = []
    progress('Create cache', 0)
    last_time = time.time()
    for self._position in range(self._size):
        # Throttle progress reporting to roughly once per second.
        if time.time() >= last_time + 1.0:
            progress('Create cache', self._position / self._size)
            last_time = time.time()
        # Seek to the raw byte offset of the (shuffled) row and re-parse it.
        self._file.seek(self._line_positions[self._order[self._position]])
        line = self._file.readline().decode('utf-8')
        csvreader = csv.reader([line])
        row = next(csvreader)
        self._cache_data.append(tuple(self._process_row(row)))

        if len(self._cache_data) >= self._cache_size:
            self._save_cache()
            self._cache_data = []

    # Flush the trailing partial buffer.
    self._save_cache()
    progress('Create cache', 1.0)

    # Adjust data size into reseted position. In most case it means
    # multiple of bunch(mini-batch) size.
    num_of_cache_files = int(
        numpy.ceil(float(self._size) / self._cache_size))
    self._cache_file_order = self._cache_file_order[0:num_of_cache_files]
    self._cache_file_data_orders = self._cache_file_data_orders[
        0:num_of_cache_files]
    if self._size % self._cache_size != 0:
        # The last cache file holds only the remainder of the data.
        self._cache_file_data_orders[
            num_of_cache_files - 1] = self._cache_file_data_orders[
                num_of_cache_files - 1][0:self._size % self._cache_size]

    # Create Index
    index_filename = os.path.join(self._cache_dir, "cache_index.csv")
    with open(index_filename, 'w') as f:
        writer = csv.writer(f, lineterminator='\n')
        for fn, orders in zip(self._cache_file_names,
                              self._cache_file_data_orders):
            writer.writerow((os.path.basename(fn), len(orders)))

    # Create Info
    if self._cache_file_format == ".npy":
        info_filename = os.path.join(self._cache_dir, "cache_info.csv")
        with open(info_filename, 'w') as f:
            writer = csv.writer(f, lineterminator='\n')
            for variable in self._variables:
                writer.writerow((variable, ))

    # Create original.csv
    if self._original_source_uri is not None:
        shutil.copy(self._original_source_uri,
                    os.path.join(self._cache_dir, "original.csv"))

    # Create order.csv
    if self._order is not None and \
            self._original_order is not None:
        with open(os.path.join(self._cache_dir, "order.csv"), 'w') as o:
            writer = csv.writer(o, lineterminator='\n')
            for orders in zip(self._original_order, self._order):
                writer.writerow(list(orders))
def forward_command(args):
    """Run inference over `args.dataset` and write `output_result.csv`.

    Loads the network from `args.config` (plus optional `args.param`),
    forwards every row of the CSV dataset through the first network, appends
    the outputs as extra columns, and writes the augmented CSV to
    `<outdir>/output_result.csv`.
    """
    configure_progress(os.path.join(args.outdir, 'progress.txt'))
    files = []
    files.append(args.config)
    if args.param:
        files.append(args.param)

    class ForwardConfig:
        pass
    config = ForwardConfig
    info = load.load(files, prepare_data_iterator=False)
    config.global_config = info.global_config

    config.executors = info.executors.values()

    config.networks = []
    for e in config.executors:
        if e.network.name in info.networks.keys():
            config.networks.append(info.networks[e.network.name])
        else:
            # FIX: the original logged 'Network {} does not found.' and read
            # `config.executor.network.name`, which does not exist on
            # ForwardConfig and would raise AttributeError on this error
            # path. Log the actual missing network name instead.
            logger.critical('Network {} is not found.'.format(e.network.name))
            return

    # Use the dataset's normalization setting when the URI matches.
    normalize = True
    for d in info.datasets.values():
        if d.uri == args.dataset:
            normalize = d.normalize
    data_iterator = (lambda: data_iterator_csv_dataset(
        args.dataset, config.networks[0].batch_size, False,
        normalize=normalize))

    # load dataset as csv
    with open(args.dataset, 'rt') as f:
        rows = [row for row in csv.reader(f)]
    row0 = rows.pop(0)
    root_path = os.path.dirname(args.dataset)
    # NOTE(review): replace('/|\\', ...) substitutes the literal three-char
    # string, not "either separator" — kept as-is for parity with the other
    # forward_command variants in this file.
    root_path = os.path.abspath(root_path.replace('/|\\', os.path.sep))
    # Resolve non-numeric cells (file references) to absolute paths.
    rows = list(map(lambda row: list(map(lambda x: x if is_float(
        x) else compute_full_path(root_path, x), row)), rows))

    with data_iterator() as di:
        index = 0
        while index < di.size:
            data = di.next()
            result, outputs = forward(args, index, config, data, di.variables)
            if index == 0:
                # Extend the header once with one column per output element.
                for name, dim in zip(result.names, result.dims):
                    if dim == 1:
                        row0.append(name)
                    else:
                        for d in range(dim):
                            row0.append(name + '__' + str(d))
            for i, output in enumerate(outputs):
                # The last batch may be padded; ignore rows past the dataset.
                if index + i < len(rows):
                    rows[index + i].extend(output)
            index += len(outputs)
            logger.log(
                99,
                'data {} / {}'.format(min([index, len(rows)]), len(rows)))

    with open(os.path.join(args.outdir, 'output_result.csv'), 'w') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(row0)
        writer.writerows(rows)

    logger.log(99, 'Forward Completed.')
    progress(None)
def multiprocess_save_cache(create_cache_args):
    """Pool worker: convert one chunk of CSV rows into a single cache file.

    Args:
        create_cache_args: ((position, cache_csv), cc_args_dict) where
            `position` is the index of the chunk's last row, `cache_csv` is
            the list of raw CSV rows, and `cc_args_dict` carries the parent
            data source's settings (variables, columns, file format, ...).
    """

    def _process_row(row, args):
        # Convert one CSV row into per-variable values, following
        # args._columns (variable name, vector index, label).
        def _get_value(value, is_vector=False):
            try:
                if is_vector:
                    value = [float(value)]
                else:
                    value = float(value)
                return value
            except ValueError:
                # Not a number: treat the cell as a file reference and load
                # it by extension.
                pass
            ext = (os.path.splitext(value)[1]).lower()
            with args._filereader.open(value) as f:
                value = load(ext)(f, normalize=args._normalize)
            return value

        values = collections.OrderedDict()
        # Rows with an unexpected column count are silently skipped.
        if len(row) == len(args._columns):
            for column, column_value in enumerate(row):
                variable, index, label = args._columns[column]
                if index is None:
                    values[variable] = _get_value(column_value,
                                                  is_vector=True)
                else:
                    if variable not in values:
                        values[variable] = []
                    values[variable].append(_get_value(column_value))
        return values.values()

    (position, cache_csv), cc_args = create_cache_args
    cc_args = SimpleNamespace(**cc_args)
    cache_data = []
    for row in cache_csv:
        cache_data.append(tuple(_process_row(row, cc_args)))

    if len(cache_data) > 0:
        start_position = position + 1 - len(cache_data)
        end_position = position
        # NOTE(review): expects cc_args._output_cache_dirname, but the
        # `create()` caller visible in this file passes '_cache_dir' —
        # confirm which caller variant this worker is paired with.
        cache_filename = os.path.join(
            cc_args._output_cache_dirname,
            '{}_{:08d}_{:08d}{}'.format(cc_args._cache_file_name_prefix,
                                        start_position, end_position,
                                        cc_args._cache_file_format))

        logger.info('Creating cache file {}'.format(cache_filename))

        data = collections.OrderedDict([(n, []) for n in cc_args._variables])
        for _, cd in enumerate(cache_data):
            for i, n in enumerate(cc_args._variables):
                if isinstance(cd[i], numpy.ndarray):
                    d = cd[i]
                else:
                    d = numpy.array(cd[i]).astype(numpy.float32)
                data[n].append(d)
        try:
            if cc_args._cache_file_format == ".h5":
                h5 = h5py.File(cache_filename, 'w')
                for k, v in data.items():
                    h5.create_dataset(k, data=v)
                h5.close()
            else:
                # .npy path: retry up to 10 times on OSError (e.g. flaky
                # network storage) before giving up.
                retry_count = 1
                is_create_cache_incomplete = True
                while is_create_cache_incomplete:
                    try:
                        with open(cache_filename, 'wb') as f:
                            for v in data.values():
                                numpy.save(f, v)
                        is_create_cache_incomplete = False
                    except OSError:
                        retry_count += 1
                        if retry_count > 10:
                            raise
                        logger.info(
                            'Creating cache retry {}/10'.format(retry_count))
        except:
            # Log a shape-mismatch diagnostic (the usual cause) and re-raise.
            logger.critical(
                'An error occurred while creating cache file from dataset.')
            for k, v in data.items():
                size = v[0].shape
                for d in v:
                    if size != d.shape:
                        logger.critical(
                            'The sizes of data "{}" are not the same. '
                            '({} != {})'.format(k, size, d.shape))
            raise
        # NOTE(review): appends to a list attribute, while the visible
        # caller shares a Manager Queue ('..._q', which has put(), not
        # append()) — verify against the paired caller.
        cc_args._cache_file_name_and_data_nums_list.append(
            (cache_filename, len(cache_data)))
        progress(
            'Create cache',
            len(cc_args._cache_file_name_and_data_nums_list) /
            cc_args._cache_file_count)
def forward_command(args):
    """Run inference over a CSV or cache dataset and write the result CSV.

    Supports two dataset forms: a '.csv' file (read directly) or a '.cache'
    directory (iterated via the cache reader; original.csv/order.csv are used
    to reconstruct the source rows and their order). Outputs go to
    `<outdir>/<outfile>`; status is streamed through `callback`.

    Returns:
        True on completion.
    """
    callback.update_status(args)

    configure_progress(os.path.join(args.outdir, 'progress.txt'))
    files = []
    files.append(args.config)
    if args.param:
        files.append(args.param)
    batch_size = args.batch_size
    if batch_size < 1:
        # Non-positive means "use the batch size from the config".
        batch_size = None

    class ForwardConfig:
        pass
    config = ForwardConfig
    info = load.load(files, prepare_data_iterator=False,
                     batch_size=batch_size)
    config.global_config = info.global_config

    config.executors = info.executors.values()

    config.networks = []
    for e in config.executors:
        if e.network.name in info.networks.keys():
            config.networks.append(info.networks[e.network.name])
        else:
            # NOTE(review): config.executor is never assigned on
            # ForwardConfig — this error path would raise AttributeError.
            logger.critical('Network {} is not found.'.format(
                config.executor.network.name))
            return False

    # Combine the dataset's normalize flag with each executor's
    # no_image_normalization override.
    normalize = True
    for d in info.datasets.values():
        if d.uri == args.dataset or d.cache_dir == args.dataset:
            normalize = d.normalize
    for e in config.executors:
        normalize = normalize and not e.no_image_normalization

    orders = {}
    # With CSV
    if os.path.splitext(args.dataset)[1] == '.csv':
        data_iterator = (lambda: data_iterator_csv_dataset(
            uri=args.dataset,
            batch_size=config.networks[0].batch_size,
            shuffle=False,
            normalize=normalize,
            with_memory_cache=False,
            with_file_cache=False))

        # load dataset as csv
        filereader = FileReader(args.dataset)
        with filereader.open(textmode=True, encoding='utf-8-sig') as f:
            rows = [row for row in csv.reader(f)]
        row0 = rows.pop(0)
        if args.replace_path:
            root_path = os.path.dirname(args.dataset)
            root_path = os.path.abspath(
                root_path.replace('/|\\', os.path.sep))
        else:
            root_path = '.'
        # Drop empty lines, then resolve non-numeric cells to full paths
        # (columns whose header starts with '#' are left untouched).
        rows = [row for row in rows if len(row)]
        rows = list(
            map(
                lambda row: list(
                    map(
                        lambda i, x: x if row0[i][0] == '#' or is_float(
                            x) else compute_full_path(root_path, x),
                        range(len(row)), row)), rows))
        for i in range(len(rows)):
            orders[i] = i
    # With Cache
    elif os.path.splitext(args.dataset)[1] == '.cache':
        data_iterator = (lambda: data_iterator_cache(
            uri=args.dataset,
            batch_size=config.networks[0].batch_size,
            shuffle=False,
            normalize=normalize))

        # Get original CSV
        original_csv = os.path.join(args.dataset, 'original.csv')
        try:
            # load dataset as csv
            filereader = FileReader(original_csv)
            with filereader.open(textmode=True, encoding='utf-8-sig') as f:
                rows = [row for row in csv.reader(f)]
            row0 = rows.pop(0)
            root_path = '.'
            rows = list(
                map(
                    lambda row: list(
                        map(
                            lambda x: x if is_float(x) else
                            compute_full_path(root_path, x), row)), rows))
        except:
            print('Cannot open', original_csv)
            pass

        # Get original Data order.
        order_csv = os.path.join(args.dataset, 'order.csv')
        try:
            filereader = FileReader(order_csv)
            with filereader.open(textmode=True) as f:
                # Each row maps original index -> shuffled index.
                for original, shuffled in [[int(x) for x in row]
                                           for row in csv.reader(f)]:
                    orders[original] = shuffled
        except:
            # No order file: assume identity ordering.
            print('Cannot open', order_csv)
            for i in range(len(rows)):
                orders[i] = i
    else:
        print('Unsupported extension "{}" in "{}".'.format(
            os.path.splitext(args.dataset)[1], args.dataset))

    callback.update_status(('data.max', len(rows)))
    callback.update_status(('data.current', 0))
    callback.update_status('processing', True)

    result_csv_filename = os.path.join(args.outdir, args.outfile)
    with open(result_csv_filename, 'w', encoding='utf-8') as f:
        writer = csv.writer(f, lineterminator='\n')
        with data_iterator() as di:
            index = 0
            while index < di.size:
                data = di.next()
                result, outputs = _forward(args, index, config, data,
                                           di.variables)
                if index == 0:
                    # Extend the header once with one column per output
                    # element. NOTE(review): `e` here is the last executor
                    # from the loops above — confirm that reuse is intended.
                    for name, dim in zip(result.names, result.dims):
                        if dim == 1:
                            if e.repeat_evaluation_type == "std":
                                name = "Uncertainty(Std)"
                            row0.append(name)
                        else:
                            for d in range(dim):
                                row0.append(name + '__' + str(d))
                    writer.writerow(row0)
                for i, output in enumerate(outputs):
                    # The last batch may be padded; skip rows past the data.
                    if index + i < len(rows):
                        import copy
                        row = copy.deepcopy(rows[orders[index + i]])
                        row.extend(output)
                        writer.writerow(row)
                index += len(outputs)

                callback.update_status(('data.current',
                                        min([index, len(rows)])))
                callback.update_forward_time()
                callback.update_status()

                logger.log(
                    99, 'data {} / {}'.format(min([index, len(rows)]),
                                              len(rows)))

    callback.process_evaluation_result(args.outdir, result_csv_filename)

    logger.log(99, 'Forward Completed.')
    progress(None)

    callback.update_status(('output_result.csv_header', ','.join(row0)))
    callback.update_status(('output_result.column_num', len(row0)))
    callback.update_status(('output_result.data_num', len(rows)))
    callback.update_status('finished')

    return True
def compare_with_cpu_command(args):
    """Compare optimizer results between the current context and a CPU run.

    Loads `args.config` under the 'current' parameter scope and
    `args.config2` under the 'cpu' scope, runs `compare_optimizer` over
    both, and writes the 1-Correl table to
    `<outdir>/compare_with_cpu.csv`.

    NOTE(review): near-duplicate of another compare_with_cpu_command in this
    file that additionally returns True; this variant returns None.
    """
    configure_progress(os.path.join(args.outdir, 'progress.txt'))

    class TrainConfig:
        pass

    class OptConfig:
        pass

    class MonConfig:
        pass

    # Load config with current context
    files = []
    files.append(args.config)

    with nn.parameter_scope('current'):
        info = load.load(files)
        parameters = get_parameters(grad_only=False)

    config = TrainConfig()
    config.global_config = info.global_config
    config.training_config = info.training_config

    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterator = None
        config.optimizers[name] = o

    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterator = None
        config.monitors[name] = m

    # Load config with cpu context
    files = []
    files.append(args.config2)

    with nn.parameter_scope('cpu'):
        info_cpu = load.load(files)
        cpu_parameters = get_parameters(grad_only=False)

    config_cpu = TrainConfig()
    config_cpu.global_config = info_cpu.global_config
    config_cpu.training_config = info_cpu.training_config

    config_cpu.optimizers = OrderedDict()
    for name, opt in info_cpu.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterator = None
        config_cpu.optimizers[name] = o

    config_cpu.monitors = OrderedDict()
    for name, mon in info_cpu.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterator = None
        config_cpu.monitors[name] = m

    result_array = [['1-Correl']]

    # Profile Optimizer: open all data iterators for the comparison and let
    # ExitStack close them afterwards.
    with ExitStack() as stack:
        for name, o in config.optimizers.items():
            o.data_iterator = stack.enter_context(
                o.optimizer.data_iterator())
        for name, o in config_cpu.optimizers.items():
            o.data_iterator = stack.enter_context(
                o.optimizer.data_iterator())
        result_array = compare_optimizer(
            config, parameters, config_cpu, cpu_parameters, result_array)

    # Write profiling result
    import csv
    with open(args.outdir + os.sep + 'compare_with_cpu.csv', 'w') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerows(result_array)

    logger.log(99, 'Compare with CPU Completed.')
    progress(None)
def _save_cache(self, args):
    """Write one chunk of processed rows to a single cache file.

    Args:
        args: (position, cache_csv) — `position` is the index of the chunk's
            last row; `cache_csv` is the list of raw CSV rows to convert.

    Returns:
        (cache_filename, number_of_rows_written) tuple.

    Raises:
        Whatever the underlying h5py/numpy write raised, after logging a
        shape-mismatch diagnostic.
    """
    position = args[0]
    cache_csv = args[1]
    # conv dataset
    cache_data = [tuple(self._process_row(row)) for row in cache_csv]

    start_position = position + 1 - len(cache_data)
    end_position = position
    cache_filename = os.path.join(
        self._cache_dir, '{}_{:08d}_{:08d}{}'.format(
            self._cache_file_name_prefix,
            start_position,
            end_position,
            self._cache_file_format))

    logger.info('Creating cache file {}'.format(cache_filename))

    # Transpose row-major cache_data into per-variable arrays.
    data = collections.OrderedDict(
        [(n, []) for n in self._variables])
    for _, cd in enumerate(cache_data):
        for i, n in enumerate(self._variables):
            if isinstance(cd[i], numpy.ndarray):
                d = cd[i]
            else:
                d = numpy.array(cd[i]).astype(numpy.float32)
            data[n].append(d)
    try:
        if self._cache_file_format == ".h5":
            # FIX: use a context manager so the HDF5 handle is closed even
            # when create_dataset raises (the original leaked the open file
            # on the error path).
            with h5py.File(cache_filename, 'w') as h5:
                for k, v in data.items():
                    h5.create_dataset(k, data=v)
        else:
            # .npy path: retry up to 10 times on OSError (e.g. flaky
            # network storage) before giving up.
            retry_count = 1
            is_create_cache_incomplete = True
            while is_create_cache_incomplete:
                try:
                    with open(cache_filename, 'wb') as f:
                        for v in data.values():
                            numpy.save(f, v)
                    is_create_cache_incomplete = False
                except OSError:
                    retry_count += 1
                    if retry_count > 10:
                        raise
                    logger.info(
                        'Creating cache retry {}/10'.format(retry_count))
    except Exception:
        # FIX: narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are not intercepted; the diagnostic + re-raise behavior is kept.
        logger.critical(
            'An error occurred while creating cache file from dataset.')
        for k, v in data.items():
            size = v[0].shape
            for d in v:
                if size != d.shape:
                    logger.critical('The sizes of data "{}" are not the same. ({} != {})'.format(
                        k, size, d.shape))
        raise

    self.current_cache_position += 1
    if single_or_rankzero():
        # Report progress roughly every 5% of the expected cache files.
        if self.current_cache_position % int(self.num_of_cache_file/20+1) == 0:
            progress('Create cache',
                     self.current_cache_position / self.num_of_cache_file)
    return cache_filename, len(cache_data)
def _evaluate(args, config, monitoring_report, best_error):
    """Evaluate every monitor over one full pass of its data iterator.

    Appends per-monitor error lines to `monitoring_report`, and saves
    parameters when save_best is off or the 'valid_error' monitor improved.

    Returns:
        (best_error, error_str) — updated best error and a display string
        like ' {name=0.123456, ...}'.
    """
    error_str = ''
    valid_error = 0.0
    for name, mon in config.monitors.items():
        m = mon.monitor
        error_sum_monitor = 0.0
        error_count = 0
        di = mon.data_iterator
        # One full epoch of the iterator: loop until its epoch counter ticks.
        dp_epoch = di.epoch
        while dp_epoch == di.epoch:
            # Set data to variable
            datas = di.next()
            for v, d in m.dataset_assign.items():
                # Only first-function inputs go to the default context.
                dest_context = config.global_config.default_context if not m.forward_sequence or v not in m.forward_sequence[
                    0].inputs else None
                let_data_to_variable(v.variable_instance, datas[
                    di.variables.index(d)], ctx=dest_context)

            # Generate data
            for v, generator in m.generator_assign.items():
                dest_context = config.global_config.default_context if not m.forward_sequence or v not in m.forward_sequence[
                    0].inputs else None
                let_data_to_variable(v.variable_instance,
                                     data=generator(v.shape),
                                     ctx=dest_context)

            # Sum error before forward to prepare input data while processing
            # on GPU
            if error_count > 0:
                for v in m.monitor_variables:
                    error_sum_monitor += np.mean(v.variable_instance.d)
                progress('Evaluating "{0}"'.format(
                    name) + ' : error={0:0.6f}'.format(
                        error_sum_monitor / error_count),
                    di.position * 1.0 / di.size)
            error_count += 1

            # Forward recursive
            m.network.forward(m.forward_sequence)

        # Sum error at the end of dataset
        for v in m.monitor_variables:
            error_sum_monitor += np.mean(v.variable_instance.d)
        error = error_sum_monitor / error_count

        monitoring_report.append(' {}: {}\n'.format(name, error))

        if error_str != '':
            error_str += ', '
        else:
            error_str = ' {'
        error_str += '{}={:.6f}'.format(name, error)
        if name == 'valid_error':
            valid_error = error

    if error_str != '':
        error_str += '}'

    # Save Parameters
    # NOTE(review): `not best_error` is also true for best_error == 0.0,
    # which forces a save — confirm that is intended.
    if (not config.training_config.save_best) or \
       (not best_error) or \
       (best_error is not None and valid_error <= best_error):
        best_error = valid_error
        save_parameters(os.path.join(args.outdir, 'parameters.h5'))

    return best_error, error_str
def _evaluate(args, config, monitoring_report, best_error):
    """Evaluate every monitor over one full pass of its data iterator.

    NOTE(review): byte-near-duplicate of another _evaluate in this file —
    only formatting differs; consider consolidating.

    Returns:
        (best_error, error_str) — updated best error and a display string.
    """
    error_str = ''
    valid_error = 0.0
    for name, mon in config.monitors.items():
        m = mon.monitor
        error_sum_monitor = 0.0
        error_count = 0
        di = mon.data_iterator
        # One full epoch of the iterator: loop until its epoch counter ticks.
        dp_epoch = di.epoch
        while dp_epoch == di.epoch:
            # Set data to variable
            datas = di.next()
            for v, d in m.dataset_assign.items():
                # Only first-function inputs go to the default context.
                dest_context = config.global_config.default_context if not m.forward_sequence or v not in m.forward_sequence[
                    0].inputs else None
                let_data_to_variable(v.variable_instance,
                                     datas[di.variables.index(d)],
                                     ctx=dest_context)

            # Generate data
            for v, generator in m.generator_assign.items():
                dest_context = config.global_config.default_context if not m.forward_sequence or v not in m.forward_sequence[
                    0].inputs else None
                let_data_to_variable(v.variable_instance,
                                     data=generator(v.shape),
                                     ctx=dest_context)

            # Sum error before forward to prepare input data while processing
            # on GPU
            if error_count > 0:
                for v in m.monitor_variables:
                    error_sum_monitor += np.mean(v.variable_instance.d)
                progress(
                    'Evaluating "{0}"'.format(name) +
                    ' : error={0:0.6f}'.format(
                        error_sum_monitor / error_count),
                    di.position * 1.0 / di.size)
            error_count += 1

            # Forward recursive
            m.network.forward(m.forward_sequence)

        # Sum error at the end of dataset
        for v in m.monitor_variables:
            error_sum_monitor += np.mean(v.variable_instance.d)
        error = error_sum_monitor / error_count

        monitoring_report.append(' {}: {}\n'.format(name, error))

        if error_str != '':
            error_str += ', '
        else:
            error_str = ' {'
        error_str += '{}={:.6f}'.format(name, error)
        if name == 'valid_error':
            valid_error = error

    if error_str != '':
        error_str += '}'

    # Save Parameters
    # NOTE(review): `not best_error` is also true for best_error == 0.0,
    # which forces a save — confirm that is intended.
    if (not config.training_config.save_best) or \
       (not best_error) or \
       (best_error is not None and valid_error <= best_error):
        best_error = valid_error
        save_parameters(os.path.join(args.outdir, 'parameters.h5'))

    return best_error, error_str
def forward_command(args):
    """Run inference over a CSV dataset and write `output_result.csv`.

    Loads the network from `args.config` (plus optional `args.param`),
    forwards every dataset row through the first network, and streams the
    augmented rows (original columns + output columns) into
    `<outdir>/output_result.csv`.

    Returns:
        True on completion, False when the executor's network is missing.
    """
    configure_progress(os.path.join(args.outdir, 'progress.txt'))
    files = []
    files.append(args.config)
    if args.param:
        files.append(args.param)
    batch_size = args.batch_size
    if batch_size < 1:
        # Non-positive means "use the batch size from the config".
        batch_size = None

    class ForwardConfig:
        pass
    config = ForwardConfig
    info = load.load(files, prepare_data_iterator=False,
                     batch_size=batch_size)
    config.global_config = info.global_config

    config.executors = info.executors.values()

    config.networks = []
    for e in config.executors:
        if e.network.name in info.networks.keys():
            config.networks.append(info.networks[e.network.name])
        else:
            # NOTE(review): config.executor is never assigned on
            # ForwardConfig — this error path would raise AttributeError.
            logger.critical('Network {} is not found.'.format(
                config.executor.network.name))
            return False

    # Combine the dataset's normalize flag with each executor's
    # no_image_normalization override.
    normalize = True
    for d in info.datasets.values():
        if d.uri == args.dataset:
            normalize = d.normalize
    for e in config.executors:
        normalize = normalize and not e.no_image_normalization

    data_iterator = (lambda: data_iterator_csv_dataset(
        uri=args.dataset,
        batch_size=config.networks[0].batch_size,
        shuffle=False,
        normalize=normalize,
        with_memory_cache=False,
        with_file_cache=False))

    # load dataset as csv
    filereader = FileReader(args.dataset)
    with filereader.open(textmode=True) as f:
        rows = [row for row in csv.reader(f)]
    row0 = rows.pop(0)
    root_path = os.path.dirname(args.dataset)
    # NOTE(review): replace('/|\\', ...) substitutes the literal three-char
    # string, not "either separator" — kept for parity with the sibling
    # variants in this file.
    root_path = os.path.abspath(root_path.replace('/|\\', os.path.sep))
    # Resolve non-numeric cells (file references) to absolute paths.
    rows = list(
        map(
            lambda row: list(
                map(
                    lambda x: x
                    if is_float(x) else compute_full_path(root_path, x),
                    row)), rows))

    with open(os.path.join(args.outdir, 'output_result.csv'), 'w') as f:
        writer = csv.writer(f, lineterminator='\n')
        with data_iterator() as di:
            index = 0
            while index < di.size:
                data = di.next()
                result, outputs = _forward(args, index, config, data,
                                           di.variables)
                if index == 0:
                    # Extend the header once with one column per output
                    # element.
                    for name, dim in zip(result.names, result.dims):
                        if dim == 1:
                            row0.append(name)
                        else:
                            for d in range(dim):
                                row0.append(name + '__' + str(d))
                    writer.writerow(row0)
                for i, output in enumerate(outputs):
                    # The last batch may be padded; skip rows past the data.
                    if index + i < len(rows):
                        import copy
                        row = copy.deepcopy(rows[index + i])
                        row.extend(output)
                        writer.writerow(row)
                index += len(outputs)
                logger.log(
                    99, 'data {} / {}'.format(min([index, len(rows)]),
                                              len(rows)))

    logger.log(99, 'Forward Completed.')
    progress(None)
    return True
def _evaluate(args, config, monitoring_report, best_error, epoch):
    """Run every configured monitor over its dataset and report errors.

    Appends one "  name: error" line per monitor to ``monitoring_report``,
    pushes values through ``callback.update_status`` and, on the best
    ``valid_error`` so far (or unconditionally when save_best is off), saves
    parameters via ``_save_parameters``.

    Returns:
        (best_error, error_str): the possibly-updated best validation error
        and a human-readable " {name=value, ...}" summary string.
    """
    comm = current_communicator()
    error_str = ''
    valid_error = 0.0

    def _sum_error(sum, error):
        # Accumulate an error value; with a communicator the value is
        # all-reduced across ranks first (no division - counts are scaled by
        # comm.size below instead).
        ret = None
        if comm:
            # logger.log(99, "Calc error with communicator")
            var = [nn.NdArray()]
            var[0].data = error
            _all_reduce(comm, var, division=False, inplace=True)
            ret = sum + var[0].data
        else:
            ret = sum + error
        return ret

    for name, mon in config.monitors.items():
        m = mon.monitor
        error_sum_monitor = 0.0
        error_count = 0
        # A monitor may draw from several iterators; size/batch are taken as
        # the max across them (assumes they are aligned - TODO confirm).
        data_size = max([di.size for di in mon.data_iterators])
        batch_size = max([di.batch_size for di in mon.data_iterators])

        for i in range(data_size // batch_size):
            # Load dataset
            data = OrderedDict()
            for di in mon.data_iterators:
                data.update(zip(di.variables, di.next()))

            # Set data to variable.  Data fed to the first step of the
            # forward sequence stays on the host (ctx=None); everything else
            # goes to the default context.
            for v, d in m.dataset_assign.items():
                dest_context = config.global_config.default_context if not m.forward_sequence or v not in m.forward_sequence[
                    0].inputs else None
                let_data_to_variable(v.variable_instance, data[d],
                                     ctx=dest_context,
                                     data_name=d, variable_name=v.name)

            # Generate data
            for v, generator in m.generator_assign.items():
                dest_context = config.global_config.default_context if not m.forward_sequence or v not in m.forward_sequence[
                    0].inputs else None
                let_data_to_variable(v.variable_instance,
                                     data=generator(v.shape),
                                     ctx=dest_context,
                                     variable_name=v.name)

            # Sum error before forward to prepare input data while processing
            # on GPU
            # NOTE: errors are read one batch late (the values summed here
            # belong to the PREVIOUS forward); the final batch is picked up
            # after the loop.
            if error_count > 0:
                error_sum = 0.0
                for v in m.monitor_variables:
                    error_sum += np.mean(v.variable_instance.d)
                    # v.variable_instance.data.zero()
                error_sum_monitor = _sum_error(error_sum_monitor, error_sum)
                if single_or_rankzero():
                    progress('Evaluating "{0}"'.format(
                        name) + ' : error={0:0.6f}'.format(
                        error_sum_monitor / error_count),
                        di.position * 1.0 / di.size)
            # Each logical step covers comm.size samples' worth of batches
            # when running distributed.
            error_count += comm.size if comm else 1

            # Forward recursive
            m.network.forward(m.forward_sequence)

        # Sum error at the end of dataset (the last batch's contribution).
        error_sum = 0.0
        for v in m.monitor_variables:
            error_sum += np.mean(v.variable_instance.d)
            # v.variable_instance.data.zero()
        error_sum_monitor = _sum_error(error_sum_monitor, error_sum)

        if error_count == 0:
            error = 0
        else:
            error = error_sum_monitor / error_count

        # Guard against NaN/Inf so a diverged run still reports a number.
        if np.isnan(error) or np.isinf(error):
            logger.log(99, 'Validation error is Nan')
            error = 0.0

        monitoring_report.append(' {}: {}\n'.format(name, error))

        callback.update_status((['monitoring_report', epoch, name], error))
        callback.update_status((['last', name], error))  # save last value

        # Build the " {a=..., b=...}" summary incrementally.
        if error_str != '':
            error_str += ', '
        else:
            error_str = ' {'
        error_str += '{}={:.6f}'.format(name, error)
        if name == 'valid_error':
            valid_error = error

    if error_str != '':
        error_str += '}'

    # Save Parameters
    if single_or_rankzero():
        if (not config.training_config.save_best) or \
           (not best_error) or \
           (best_error is not None and valid_error <= best_error):
            best_error = valid_error
            callback.update_status(('best.valid_error', best_error))
            callback.update_status(('best.epoch', epoch))
            _save_parameters(args, 'best', epoch, config, True)

    return best_error, error_str
def forward_command(args):
    """Run forward inference over a CSV dataset and write an augmented CSV.

    Legacy variant: reads the whole dataset into memory, appends network
    outputs to each row in place, then writes ``output_result.csv`` to
    ``args.outdir`` in one pass at the end.

    Returns:
        None on success; returns early when a referenced network is missing.
    """
    configure_progress(os.path.join(args.outdir, 'progress.txt'))
    files = []
    files.append(args.config)
    if args.param:
        files.append(args.param)

    class ForwardConfig:
        pass
    # Bug fix: original bound the class object itself; use an instance.
    config = ForwardConfig()
    info = load.load(files, prepare_data_iterator=False)
    config.global_config = info.global_config

    config.executors = info.executors.values()

    config.networks = []
    for e in config.executors:
        if e.network.name in info.networks.keys():
            config.networks.append(info.networks[e.network.name])
        else:
            # Bug fix: original formatted the non-existent
            # ``config.executor.network.name`` (AttributeError) and had a
            # grammar error ("does not found").
            logger.critical('Network {} is not found.'.format(e.network.name))
            return

    normalize = True
    for d in info.datasets.values():
        if d.uri == args.dataset:
            normalize = d.normalize

    data_iterator = (lambda: data_iterator_csv_dataset(
        args.dataset, config.networks[0].batch_size, False,
        padding=True, normalize=normalize))

    # load dataset as csv
    with open(args.dataset, 'rt') as f:
        rows = [row for row in csv.reader(f)]
    row0 = rows.pop(0)
    root_path = os.path.dirname(args.dataset)
    root_path = os.path.abspath(root_path.replace('/|\\', os.path.sep))
    # Bug fix: on Python 3 ``map`` returns a lazy iterator, but ``rows`` is
    # measured with len(), indexed and mutated via .extend() below, which
    # raises TypeError.  Materialize nested lists instead.
    rows = [[x if is_float(x) else compute_full_path(root_path, x)
             for x in row] for row in rows]

    with data_iterator() as di:
        index = 0
        while index < di.size:
            data = di.next()
            result, outputs = forward(args, index, config, data, di.variables)
            if index == 0:
                # First batch: extend the header with one column per output
                # dimension.
                for name, dim in zip(result.names, result.dims):
                    if dim == 1:
                        row0.append(name)
                    else:
                        for d in range(dim):
                            row0.append(name + '__' + str(d))
            for i, output in enumerate(outputs):
                # Padding can make the last batch overrun the row count;
                # ignore the padded tail.
                if index + i < len(rows):
                    rows[index + i].extend(output)
            index += len(outputs)
            logger.log(
                99, 'data {} / {}'.format(min([index, len(rows)]), len(rows)))

    with open(os.path.join(args.outdir, 'output_result.csv'), 'w') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(row0)
        writer.writerows(rows)

    logger.log(99, 'Forward Completed.')
    progress(None)
def train_command(args):
    """Entry point for training: load config, build iterators, run _train.

    Sets up optimizer/monitor configs from the loaded info, creates each
    dataset iterator exactly once (shared between optimizers/monitors that
    reference the same dataset), slices iterators across ranks when a
    communicator is active, then delegates to ``_train``.

    Returns:
        bool: always True on normal return; False on an empty dataset URI.
    """
    callback.update_status(args)

    if single_or_rankzero():
        configure_progress(os.path.join(args.outdir, 'progress.txt'))

    info = load.load([args.config], prepare_data_iterator=None,
                     exclude_parameter=True)

    # Check dataset uri is empty.
    dataset_error = False
    for dataset in info.datasets.values():
        if dataset.uri.strip() == '':
            dataset_error = True
    if dataset_error:
        logger.log(99, 'Fatal error. Dataset URI is empty.')
        return False

    class TrainConfig:
        pass
    config = TrainConfig()
    config.timelimit = -1
    if args.param:
        # Parameters are loaded into the global parameter scope as a side
        # effect; the return value is intentionally unused.
        load.load([args.param], parameter_only=True)

    config.timelimit = callback.get_timelimit(args)

    config.global_config = info.global_config
    config.training_config = info.training_config

    if single_or_rankzero():
        logger.log(99, 'Train with contexts {}'.format(available_contexts))

    class OptConfig:
        pass
    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterators = []
        config.optimizers[name] = o

    class MonConfig:
        pass
    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterators = []
        config.monitors[name] = m

    # Training
    comm = current_communicator()
    # Each rank only runs its share of the iterations per epoch.
    config.training_config.iter_per_epoch //= comm.size if comm else 1
    max_iteration = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch

    # Remember where the network definition came from so saved snapshots can
    # embed it (consumed by _save_parameters).
    global _save_parameter_info
    _save_parameter_info = {}
    _, config_ext = os.path.splitext(args.config)
    if config_ext == '.prototxt' or config_ext == '.nntxt':
        _save_parameter_info['config'] = args.config
    elif config_ext == '.nnp':
        # Extract the network text file bundled inside the .nnp archive.
        with zipfile.ZipFile(args.config, 'r') as nnp:
            for name in nnp.namelist():
                _, ext = os.path.splitext(name)
                if ext == '.nntxt' or ext == '.prototxt':
                    nnp.extract(name, args.outdir)
                    _save_parameter_info['config'] = os.path.join(
                        args.outdir, name)

    result = False
    restart = False
    if max_iteration > 0:
        # Seed per rank so each rank slices the dataset differently but
        # deterministically.
        rng = np.random.RandomState(comm.rank if comm else 0)
        with ExitStack() as stack:
            # Create data_iterator instance only once for each dataset in
            # optimizers
            optimizer_data_iterators = {}
            for name, o in config.optimizers.items():
                for di in o.optimizer.data_iterators.values():
                    if di not in optimizer_data_iterators:
                        di_instance = stack.enter_context(di())
                        if comm and comm.size > 1:
                            di_instance = di_instance.slice(
                                rng, comm.size, comm.rank)
                        optimizer_data_iterators[di] = di_instance
                    else:
                        di_instance = optimizer_data_iterators[di]
                    o.data_iterators.append(di_instance)

            # Create data_iterator instance only once for each dataset in
            # monitors
            monitor_data_iterators = {}
            for name, m in config.monitors.items():
                for di in m.monitor.data_iterators.values():
                    if di not in monitor_data_iterators:
                        di_instance = stack.enter_context(di())
                        if comm and comm.size > 1:
                            di_instance = di_instance.slice(
                                rng, comm.size, comm.rank)
                        monitor_data_iterators[di] = di_instance
                    else:
                        di_instance = monitor_data_iterators[di]
                    m.data_iterators.append(di_instance)
            monitor_data_iterators.update(optimizer_data_iterators)
            result, restart = _train(args, config)
    else:
        # save parameters without training (0 epoch learning)
        logger.log(99, '0 epoch learning. (Just save parameter.)')
        if single_or_rankzero():
            _save_parameters(args, None, 0, config, True)
        result = True

    # On restart the status/logging is deferred to the resumed run.
    if single_or_rankzero() and not restart:
        if result:
            logger.log(99, 'Training Completed.')
            callback.update_status('finished')
        else:
            logger.log(99, 'Training Incompleted.')
            callback.update_status('failed')
    if single_or_rankzero():
        progress(None)
    return True
def profile_command(args):
    """Profile the optimizers defined in the config and dump timings to CSV.

    Builds the same optimizer/monitor config scaffolding as training, runs
    ``profile_optimizer`` with a backend-level synchronize hook, and writes
    the collected timings (in milliseconds) to ``profile.csv`` in
    ``args.outdir``.

    Returns:
        bool: True on completion.
    """
    callback.update_status(args)
    configure_progress(os.path.join(args.outdir, 'progress.txt'))

    class TrainConfig:
        pass
    config = TrainConfig()
    info = load.load(args.config)

    config.global_config = info.global_config
    config.training_config = info.training_config

    class OptConfig:
        pass

    class MonConfig:
        pass

    # One OptConfig wrapper per optimizer; iterators are attached later.
    config.optimizers = OrderedDict()
    for opt_name, optimizer in info.optimizers.items():
        wrapper = OptConfig()
        wrapper.optimizer = optimizer
        wrapper.data_iterators = []
        config.optimizers[opt_name] = wrapper

    # Same scaffolding for monitors.
    config.monitors = OrderedDict()
    for mon_name, monitor in info.monitors.items():
        wrapper = MonConfig()
        wrapper.monitor = monitor
        wrapper.data_iterators = []
        config.monitors[mon_name] = wrapper

    # Device synchronization hook for accurate timing on async backends.
    backend_name = config.global_config.default_context.backend[0].split(':')[0]
    ext_module = import_extension_module(backend_name)

    def synchronize():
        return ext_module.synchronize(
            device_id=config.global_config.default_context.device_id)

    result_array = [['time in ms']]

    callback.update_status('processing', True)

    # Profile Optimizer
    with ExitStack() as stack:
        # Instantiate each dataset iterator exactly once, even when several
        # optimizers share the same dataset.
        iterator_cache = {}
        for _, wrapper in config.optimizers.items():
            for factory in wrapper.optimizer.data_iterators.values():
                if factory in iterator_cache:
                    iterator = iterator_cache[factory]
                else:
                    iterator = stack.enter_context(factory())
                    iterator_cache[factory] = iterator
                wrapper.data_iterators.append(iterator)
        result_array = profile_optimizer(config, result_array, synchronize)

    # Write profiling result
    import csv
    with open(args.outdir + os.sep + 'profile.csv', 'w') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerows(result_array)

    logger.log(99, 'Profile Completed.')
    progress(None)
    callback.update_status('finished')
    return True