def initialize_cache_files(self, filename):
    """Register one cache file: validate its variable set, record its
    sample count, and update the maximum per-file length seen so far.

    Args:
        filename (str): Path of the cache file to register.

    Raises:
        RuntimeError: When running under MPI and this cache file's variable
            set differs from the set established by the first cache file.
    """
    length = -1
    with self._filereader.open_cache(filename) as cache:
        # The first cache file fixes the expected variable set; under MPI
        # every subsequent file must expose exactly the same variables.
        if self._variables is None:
            self._variables = list(cache.keys())
        else:
            if current_communicator():
                if not set(self._variables) == set(cache.keys()):
                    logger.log(
                        99, 'Error at worker {} {} {}'.format(
                            current_communicator().rank,
                            set(self._variables),
                            set(cache.keys())))
                    # The original bare `raise` had no active exception and
                    # produced an uninformative RuntimeError; raise one
                    # explicitly with context instead.
                    raise RuntimeError(
                        'Cache file "{}" has a variable set inconsistent '
                        'with previously registered cache files.'.format(
                            filename))
        # Every variable inside one cache file must have the same length.
        for k, v in cache.items():
            if length < 0:
                length = len(v)
            else:
                assert (length == len(v))
        self._cache_files.append((filename, length))
        logger.info('{} {}'.format(filename, length))
        if length > self._max_length:
            self._max_length = length
def _get_data(self, position):
    """Return the list of variable values at `position` of the shuffled
    order, loading (and optionally prefetching) the backing cache file
    when it changes.

    Args:
        position (int): Index into ``self._order``.

    Returns:
        list: One entry per name in ``self.variables``.
    """
    self._position = position
    if current_communicator():
        # Under MPI the order table may transiently be inconsistent while
        # workers synchronize; keep retrying.  Use a loop rather than the
        # original self-recursion so a long outage cannot raise
        # RecursionError from unbounded stack growth.
        while True:
            try:
                filename, index = self._order[position]
                break
            except IndexError:
                logger.log(
                    99, '_get_data() fails at worker {} retrying.'.format(
                        current_communicator().rank))
                sleep(0.01)
    else:
        filename, index = self._order[position]
    if filename != self._current_filename:
        file_names_to_prefetch = None
        # Prefetching of upcoming cache files is only supported for the
        # .npy cache format.
        if self._cache_type == ".npy" and self._num_of_threads > 0:
            file_names_to_prefetch = [
                o[0] for o in self._order[
                    position + self._max_length:
                    position + self._max_length * self._num_of_threads:
                    self._max_length]]
        self._current_data = self._get_next_data(
            filename, file_names_to_prefetch)
        self._current_filename = filename
    data = [self._current_data[v][index] for v in self.variables]
    if self._normalize:
        # Scale 8-bit image data into [0, 1]; other dtypes pass through.
        data = [d.astype(numpy.float32) * (1.0 / 255.0)
                if d.dtype == numpy.uint8 else d for d in data]
    return data
def collect_and_shape_result(c_load, g_load):
    """Gather per-worker CPU/GPU load figures and shape them into rows.

    Args:
        c_load (float): CPU load percentage, e.g. 58.5.
        g_load (list): ``[[nvidia_device_id, gpu_load]]``; only the first
            entry (if any) is used.

    Returns:
        list: One flat row per worker, rounded to one decimal place and
        sorted by rank. Without a communicator a single row for rank 0.
    """
    comm = current_communicator()
    if comm:
        local = np.array([[comm.rank, c_load], *g_load[:1]]).reshape(-1)
        src = nn.Variable([len(local), ])
        src.d = local
        gathered = [nn.Variable([len(local), ]) for _ in range(comm.size)]
        # Exchange every worker's load vector with all other workers.
        comm.all_gather(src.data, [v.data for v in gathered])
        rows = [[*np.round(v.d.astype(float), decimals=1)] for v in gathered]
    else:
        flat = np.array([[0, c_load], *g_load[:1]]).reshape(-1)
        rows = [[*np.round(flat, decimals=1).astype(float)]]
    return sorted(rows, key=lambda row: row[0])
def train(info, config):
    """Populate `config` from `info`, instantiate data iterators, and
    delegate training to ``_train`` (this is a generator: progress is
    yielded through from ``_train``).

    Args:
        info: Loaded network information (optimizers, monitors, configs).
        config: Mutable training configuration object to populate.
    """
    config.global_config = info.global_config
    config.training_config = info.training_config

    class OptConfig:
        pass
    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterators = []
        config.optimizers[name] = o

    class MonConfig:
        pass
    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterators = []
        config.monitors[name] = m

    # Training
    comm = current_communicator()
    config.training_config.iter_per_epoch //= comm.size if comm else 1
    rng = np.random.RandomState(comm.rank if comm else 0)
    with ExitStack() as stack:
        def _instantiate_iterators(configs, source_attr):
            # Create each dataset's data_iterator only once within this
            # group; slice per worker when running distributed.  Returns
            # the {iterator-factory: instance} map for this group.
            instances = {}
            for cfg in configs.values():
                for di in getattr(cfg, source_attr).data_iterators.values():
                    if di not in instances:
                        di_instance = stack.enter_context(di())
                        if comm and comm.size > 1:
                            di_instance = di_instance.slice(
                                rng, comm.size, comm.rank)
                        instances[di] = di_instance
                    cfg.data_iterators.append(instances[di])
            return instances

        optimizer_data_iterators = _instantiate_iterators(
            config.optimizers, 'optimizer')
        # Monitors intentionally get their own instances even for datasets
        # also used by optimizers (separate iteration state), matching the
        # original two-pass behavior.
        monitor_data_iterators = _instantiate_iterators(
            config.monitors, 'monitor')
        monitor_data_iterators.update(optimizer_data_iterators)
        yield from _train(config)
def _create_dataset(uri, batch_size, shuffle, no_image_normalization, cache_dir, overwrite_cache, create_cache_explicitly, prepare_data_iterator):
    """Build a dataset holder whose ``data_iterator`` attribute is a
    zero-argument factory (or None when iterators are not prepared).

    Depending on the cache settings, data is served either from a cache
    directory (created up front if needed) or directly from the CSV at
    ``uri``.
    """
    class Dataset:
        pass
    dataset = Dataset()
    dataset.uri = uri
    dataset.normalize = not no_image_normalization
    comm = current_communicator()
    # use same random state for each process until slice is called
    rng = numpy.random.RandomState(0)
    # Memory cache is only safe/useful for single-process runs.
    use_memory_cache = comm.size == 1 if comm else True
    if prepare_data_iterator:
        if cache_dir == '':
            cache_dir = None
        # Disable implicit cache creation when MPI is available.
        if cache_dir and (create_cache_explicitly or comm):
            cache_index = os.path.join(cache_dir, "cache_index.csv")
            if not os.path.exists(cache_index) or overwrite_cache:
                # Only rank 0 materializes the cache; iterating the CSV
                # dataset once (the empty `with` body) writes the cache files.
                if single_or_rankzero():
                    logger.log(99, 'Creating cache data for "' + uri + '"')
                    try:
                        os.makedirs(cache_dir)
                    except OSError:
                        pass  # python2 does not support exists_ok arg
                    with data_iterator_csv_dataset(uri, batch_size, shuffle, rng=rng, normalize=False, cache_dir=cache_dir, with_memory_cache=False) as di:
                        pass
            # Reset the RNG so the cache iterator starts from the same state.
            rng = numpy.random.RandomState(0)
            dataset.data_iterator = (lambda: data_iterator_cache(cache_dir, batch_size, shuffle, rng=rng, normalize=dataset.normalize, with_memory_cache=use_memory_cache))
        elif not cache_dir or overwrite_cache or not os.path.exists(cache_dir) or len(os.listdir(cache_dir)) == 0:
            # No usable cache: fall back to reading the CSV directly
            # (implicit cache creation), which is unsupported under MPI.
            if comm:
                logger.critical('Implicit cache creation does not support with MPI')
                import sys
                sys.exit(-1)
            else:
                if cache_dir:
                    try:
                        os.makedirs(cache_dir)
                    except OSError:
                        pass  # python2 does not support exists_ok arg
                dataset.data_iterator = (lambda: data_iterator_csv_dataset(uri, batch_size, shuffle, rng=rng, normalize=dataset.normalize, cache_dir=cache_dir))
        else:
            # A populated cache directory already exists: serve from it.
            dataset.data_iterator = (lambda: data_iterator_cache(cache_dir, batch_size, shuffle, rng=rng, normalize=dataset.normalize, with_memory_cache=use_memory_cache))
    else:
        dataset.data_iterator = None
    return dataset
def _get_next_data(self, filename, file_names_to_prefetch, retry=1):
    """Load one cache file's contents, retrying under MPI when the file is
    observed with an incomplete variable set.

    Args:
        filename (str): Cache file to load.
        file_names_to_prefetch (list or None): Upcoming .npy cache files to
            prefetch in the background.
        retry (int): Current retry count (internal; capped at 10).

    Returns:
        dict: Variable name -> loaded array data.

    Raises:
        RuntimeError: After 10 failed retries.
    """
    if retry > 10:
        logger.log(99, '_get_next_data() retry count over give up.')
        # The original bare `raise` had no active exception; raise an
        # explicit RuntimeError (the type a bare raise surfaced as).
        raise RuntimeError(
            '_get_next_data() gave up after 10 retries for "{}".'.format(
                filename))
    if self._cache_type == '.npy':
        # .npy caches support background prefetch of upcoming files.
        next_data = self._cache_reader_with_prefetch.open_and_prefetch_cache(
            filename, file_names_to_prefetch)
    else:
        # h5 format
        next_data = {}
        with self._filereader.open_cache(filename) as cache:
            for k, v in cache.items():
                next_data[k] = v[()]
    if current_communicator():
        # Under MPI a partially written cache may be observed; back off
        # briefly and retry (bounded by the retry cap above).
        if set(self._variables) != set(next_data.keys()):
            logger.log(
                99,
                '_get_next_data() fails at worker {} retrying count {}/10.'.format(
                    current_communicator().rank, retry))
            sleep(0.01)
            return self._get_next_data(
                filename, file_names_to_prefetch, retry + 1)
    return next_data
def _context(proto):
    """Build an ``nn.Context`` from a protobuf context message, upgrading
    old-style (backend/compute_backend string) contexts to the new format.

    Under MPI the device id is taken from the communicator's local rank.

    Raises:
        ValueError: For an unrecognized backend or compute_backend.
    """
    comm = current_communicator()
    if not proto.backends:
        logger.warn('Old-style context. Updating to new format.')
        # Update from old Context
        backends = [x.strip() for x in proto.backend.split('|')]
        compute_backends = [
            x.strip() for x in proto.compute_backend.split('|')
        ]
        if 'cuda' in backends:
            device_id = str(proto.device_id)
            if comm:
                # Each MPI worker binds to its node-local GPU.
                device_id = str(comm.local_rank)
            # Map the compute backend to the extension module name; the
            # ValueError is raised before any import attempt, as before.
            if 'cudnn' in compute_backends:
                ext_name = 'cudnn'
            elif 'default' in compute_backends:
                ext_name = 'cuda'
            else:
                raise ValueError('Invalid compute_backend {}'.format(
                    proto.compute_backend))
            try:
                import importlib
                ext = importlib.import_module('nnabla_ext.' + ext_name)
                ctx = ext.context(device_id=device_id)
            except ImportError:
                logger.warn('Fallback to CPU context.')
                import nnabla_ext.cpu
                ctx = nnabla_ext.cpu.context()
        elif 'cpu' in backends:
            import nnabla_ext.cpu
            ctx = nnabla_ext.cpu.context()
        else:
            raise ValueError('Invalid context {}'.format(proto))
        ctx.array_class = str(proto.array_class)
        return ctx
    # New-style context: copy fields straight from the protobuf message.
    ctx = nn.Context()
    ctx.backend = proto.backends
    ctx.array_class = str(proto.array_class)
    if comm:
        ctx.device_id = str(comm.local_rank)
    else:
        ctx.device_id = str(proto.device_id)
    return ctx
def measure_cpu_gpu_instant_load():
    # Get current cpu gpu load, as
    # load = [rank, cpu_load, nvidia_device_id, gpu_load]
    # result_arr: [load, load, ...]
    #
    # Samples the instantaneous CPU and GPU utilization and pushes the
    # gathered result to the status callback.  Also maintains a running
    # average of GPU load in the module-global `gpu_a_load`.
    gpu_load = []
    if gpu_load_backend_ok:
        global gpu_a_load
        global gpu_m_count
        # gpu_m_count counts samples taken so far (for the running average).
        gpu_m_count += 1
        try:
            comm = current_communicator()
            if comm:
                index = comm.local_rank
            elif 'cuda' in str(nn.get_current_context().backend):
                index = 0
            else:
                # No GPU in use: jump to the except handler, which leaves
                # gpu_load empty (deliberate exception-as-control-flow).
                raise Exception
            handler = pynvml.nvmlDeviceGetHandleByIndex(index)
            gpu_load = [[
                index,
                pynvml.nvmlDeviceGetUtilizationRates(handler).gpu
            ]]
            if index in gpu_a_load.keys():
                gpu_a_load[index]['name'] = pynvml.nvmlDeviceGetName(
                    handler).decode("utf-8")
                o_load = gpu_a_load[index]['load']
                n_load = gpu_load[0][1]
                # Incremental running average over gpu_m_count samples.
                gpu_a_load[index]['load'] = (
                    (gpu_m_count - 1) * o_load + n_load) / gpu_m_count
            else:
                gpu_a_load[index] = {
                    'name': pynvml.nvmlDeviceGetName(handler).decode("utf-8"),
                    'load': gpu_load[0][1]
                }
        except Exception:
            # Any NVML failure (or no GPU) degrades to an empty GPU sample.
            gpu_load = []
    if cpu_load_backend_ok:
        global p_handler
        # p_handler is a process handle (psutil-style) created elsewhere.
        cpu_load = p_handler.cpu_percent()
        callback.update_status(
            ('cpu_gpu_load', collect_and_shape_result(cpu_load, gpu_load)))
def load(filenames, prepare_data_iterator=True, batch_size=None, exclude_parameter=False, parameter_only=False, extension=".nntxt"):
    '''load
    Load network information from files.

    Args:
        filenames (list): file-like object or List of filenames.
        extension: if filenames is file-like object, extension is one of
            ".nntxt", ".prototxt", ".protobuf", ".h5", ".nnp".
    Returns:
        dict: Network information.
    '''
    class Info:
        pass
    info = Info()
    proto = nnabla_pb2.NNablaProtoBuf()
    # optimizer checkpoint
    opti_proto = nnabla_pb2.NNablaProtoBuf()
    OPTI_BUF_EXT = ['.optimizer']
    opti_h5_files = {}
    # Scratch directory for .h5 optimizer states extracted from .nnp files.
    tmpdir = tempfile.mkdtemp()
    # Normalize `filenames` to a list.
    if isinstance(filenames, list) or isinstance(filenames, tuple):
        pass
    elif isinstance(filenames, str) or hasattr(filenames, 'read'):
        filenames = [filenames]
    for filename in filenames:
        if isinstance(filename, str):
            _, ext = os.path.splitext(filename)
        else:
            # File-like object: trust the caller-supplied extension.
            ext = extension
        # TODO: Here is some known problems.
        #   - Even when protobuf file includes network structure,
        #     it will not loaded.
        #   - Even when prototxt file includes parameter,
        #     it will not loaded.
        if ext in ['.nntxt', '.prototxt']:
            if not parameter_only:
                with get_file_handle_load(filename, ext) as f:
                    try:
                        text_format.Merge(f.read(), proto)
                    except:
                        logger.critical('Failed to read {}.'.format(filename))
                        logger.critical(
                            '2 byte characters may be used for file name or folder name.'
                        )
                        raise
            if len(proto.parameter) > 0:
                if not exclude_parameter:
                    nn.load_parameters(filename, extension=ext)
        elif ext in ['.protobuf', '.h5']:
            if not exclude_parameter:
                nn.load_parameters(filename, extension=ext)
            else:
                logger.info('Skip loading parameter.')
        elif ext == '.nnp':
            # .nnp is a zip archive; dispatch on each member's extension.
            with get_file_handle_load(filename, ext) as nnp:
                for name in nnp.namelist():
                    _, ext = os.path.splitext(name)
                    if name == 'nnp_version.txt':
                        pass  # TODO currently do nothing with version.
                    elif ext in ['.nntxt', '.prototxt']:
                        if not parameter_only:
                            with nnp.open(name, 'r') as f:
                                text_format.Merge(f.read(), proto)
                        if len(proto.parameter) > 0:
                            if not exclude_parameter:
                                with nnp.open(name, 'r') as f:
                                    nn.load_parameters(f, extension=ext)
                    elif ext in ['.protobuf', '.h5']:
                        if not exclude_parameter:
                            with nnp.open(name, 'r') as f:
                                nn.load_parameters(f, extension=ext)
                        else:
                            logger.info('Skip loading parameter.')
                    elif ext in OPTI_BUF_EXT:
                        # Optimizer checkpoint: merge protobuf states, or
                        # stage .h5 states in tmpdir for later loading.
                        buf_type = get_buf_type(name)
                        if buf_type == 'protobuf':
                            with nnp.open(name, 'r') as f:
                                with get_file_handle_load(
                                        f, '.protobuf') as opti_p:
                                    opti_proto.MergeFromString(opti_p.read())
                        elif buf_type == 'h5':
                            nnp.extract(name, tmpdir)
                            opti_h5_files[name] = os.path.join(tmpdir, name)
    # Resolve the default context from global_config, with CPU fallback.
    default_context = None
    if proto.HasField('global_config'):
        info.global_config = _global_config(proto)
        default_context = info.global_config.default_context
        if 'cuda' in default_context.backend:
            import nnabla_ext.cudnn
        elif 'cuda:float' in default_context.backend:
            try:
                import nnabla_ext.cudnn
            except:
                pass
            # Probe the context with a tiny forward pass; fall back to CPU
            # if the CUDA context is unusable.
            try:
                x = nn.Variable()
                y = nn.Variable()
                func = F.ReLU(default_context, inplace=True)
                func.setup([x], [y])
                func.forward([x], [y])
            except:
                logger.warn('Fallback to CPU context.')
                import nnabla_ext.cpu
                default_context = nnabla_ext.cpu.context()
    else:
        import nnabla_ext.cpu
        default_context = nnabla_ext.cpu.context()
    comm = current_communicator()
    if comm:
        # Each MPI worker binds to its node-local GPU.
        default_context.device_id = str(comm.local_rank)
    if proto.HasField('training_config'):
        info.training_config = _training_config(proto)
    info.datasets = _datasets(
        proto,
        prepare_data_iterator if prepare_data_iterator is not None
        else info.training_config.max_epoch > 0)
    info.networks = _networks(proto, default_context, batch_size)
    info.optimizers = _optimizers(
        proto, default_context, info.networks, info.datasets)
    _load_optimizer_checkpoint(opti_proto, opti_h5_files, info)
    # NOTE(review): tmpdir is not cleaned up if an exception is raised
    # above — presumably acceptable for this tool; confirm.
    shutil.rmtree(tmpdir)
    info.monitors = _monitors(
        proto, default_context, info.networks, info.datasets)
    info.executors = _executors(proto, info.networks)
    return info
def train_command(args):
    # CLI entry point for training: loads the config, prepares per-process
    # data iterators, runs _train(), and reports completion status.
    # Returns False on a fatal configuration error, True otherwise.
    if single_or_rankzero():
        configure_progress(os.path.join(args.outdir, 'progress.txt'))
    info = load.load([args.config], exclude_parameter=True)
    # Check dataset uri is empty.
    dataset_error = False
    for dataset in info.datasets.values():
        if dataset.uri.strip() == '':
            dataset_error = True
    if dataset_error:
        logger.log(99, 'Fatal error. Dataset URI is empty.')
        return False

    class TrainConfig:
        pass
    config = TrainConfig()
    config.timelimit = -1
    if args.param:
        # Load initial parameters (weights) separately from the config.
        load.load([args.param], parameter_only=True)
    config.global_config = info.global_config
    config.training_config = info.training_config
    if single_or_rankzero():
        logger.log(99, 'Train with contexts {}'.format(available_contexts))

    class OptConfig:
        pass
    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterator = None
        config.optimizers[name] = o

    class MonConfig:
        pass
    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterator = None
        config.monitors[name] = m

    # Training
    comm = current_communicator()
    # Split each epoch's iterations evenly across MPI workers.
    config.training_config.iter_per_epoch //= comm.size if comm else 1
    max_iteration = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch
    # Record where the config came from so saved .nnp files can embed it.
    global _save_parameter_info
    _save_parameter_info = {}
    _, config_ext = os.path.splitext(args.config)
    if config_ext == '.prototxt' or config_ext == '.nntxt':
        _save_parameter_info['config'] = args.config
    elif config_ext == '.nnp':
        with zipfile.ZipFile(args.config, 'r') as nnp:
            for name in nnp.namelist():
                _, ext = os.path.splitext(name)
                if ext == '.nntxt' or ext == '.prototxt':
                    nnp.extract(name, args.outdir)
                    _save_parameter_info['config'] = os.path.join(
                        args.outdir, name)
    result = False
    if max_iteration > 0:
        data_iterators = {'optimizer': {}, 'monitor': {}}
        # Same seed derivation on every worker; slice() then partitions data.
        rng = np.random.RandomState(comm.rank if comm else 0)
        with ExitStack() as stack:
            for name, o in config.optimizers.items():
                o.data_iterator = stack.enter_context(
                    o.optimizer.data_iterator())
                if comm and comm.size > 1:
                    o.data_iterator = o.data_iterator.slice(
                        rng, comm.size, comm.rank)
            for name, m in config.monitors.items():
                m.data_iterator = stack.enter_context(
                    m.monitor.data_iterator())
                if comm and comm.size > 1:
                    m.data_iterator = m.data_iterator.slice(
                        rng, comm.size, comm.rank)
            result = _train(args, config)
    else:
        # save parameters without training (0 epoch learning)
        logger.log(99, '0 epoch learning. (Just save parameter.)')
        if single_or_rankzero():
            _save_parameters(args, 'current', 0, True)
        result = True
    if single_or_rankzero():
        if result:
            logger.log(99, 'Training Completed.')
        else:
            logger.log(99, 'Training Incompleted.')
    if single_or_rankzero():
        progress(None)
    return True
def _create_optimizer(ctx, o, networks, datasets):
    # Build an Optimizer holder from the protobuf optimizer message `o`:
    # solver instance, variable assignments, learning-rate scheduler, and
    # forward/backward sequences.  Iteration counts and scheduler intervals
    # are divided by the MPI communicator size for distributed training.
    class Optimizer:
        pass
    optimizer = Optimizer()
    optimizer.comm = current_communicator()
    comm_size = optimizer.comm.size if optimizer.comm else 1
    # Rescale start/end iterations to per-worker iteration counts.
    optimizer.start_iter = (o.start_iter - 1) // comm_size + \
        1 if o.start_iter > 0 else 0
    optimizer.end_iter = (o.end_iter - 1) // comm_size + \
        1 if o.end_iter > 0 else 0
    optimizer.name = o.name
    optimizer.order = o.order
    optimizer.update_interval = o.update_interval if o.update_interval > 0 else 1
    optimizer.network = networks[o.network_name]
    optimizer.data_iterators = OrderedDict()
    for d in o.dataset_name:
        optimizer.data_iterators[d] = datasets[d].data_iterator
    # Map network variables to dataset column names.
    optimizer.dataset_assign = OrderedDict()
    for d in o.data_variable:
        optimizer.dataset_assign[optimizer.network.variables[
            d.variable_name]] = d.data_name
    # Map network variables to data generators (e.g. noise sources).
    optimizer.generator_assign = OrderedDict()
    for g in o.generator_variable:
        optimizer.generator_assign[optimizer.network.variables[
            g.variable_name]] = _get_generator(g)
    optimizer.loss_variables = []
    for l in o.loss_variable:
        optimizer.loss_variables.append(
            optimizer.network.variables[l.variable_name])
    # Per-parameter LR multipliers; variable names may contain wildcards.
    optimizer.parameter_learning_rate_multipliers = OrderedDict()
    for p in o.parameter_variable:
        param_variable_names = _get_matching_variable_names(
            p.variable_name, optimizer.network.variables.keys())
        for v_name in param_variable_names:
            optimizer.parameter_learning_rate_multipliers[
                optimizer.network.variables[v_name]] = p.learning_rate_multiplier
    # Instantiate the solver under the target context; `init_lr` records
    # the solver's base learning rate for the scheduler below.
    with nn.context_scope(ctx):
        if o.solver.type == 'Adagrad':
            optimizer.solver = S.Adagrad(o.solver.adagrad_param.lr,
                                         o.solver.adagrad_param.eps)
            init_lr = o.solver.adagrad_param.lr
        elif o.solver.type == 'Adadelta':
            optimizer.solver = S.Adadelta(o.solver.adadelta_param.lr,
                                          o.solver.adadelta_param.decay,
                                          o.solver.adadelta_param.eps)
            init_lr = o.solver.adadelta_param.lr
        elif o.solver.type == 'Adam':
            optimizer.solver = S.Adam(o.solver.adam_param.alpha,
                                      o.solver.adam_param.beta1,
                                      o.solver.adam_param.beta2,
                                      o.solver.adam_param.eps)
            init_lr = o.solver.adam_param.alpha
        elif o.solver.type == 'Adamax':
            optimizer.solver = S.Adamax(o.solver.adamax_param.alpha,
                                        o.solver.adamax_param.beta1,
                                        o.solver.adamax_param.beta2,
                                        o.solver.adamax_param.eps)
            init_lr = o.solver.adamax_param.alpha
        elif o.solver.type == 'AdaBound':
            optimizer.solver = S.AdaBound(o.solver.adabound_param.alpha,
                                          o.solver.adabound_param.beta1,
                                          o.solver.adabound_param.beta2,
                                          o.solver.adabound_param.eps,
                                          o.solver.adabound_param.final_lr,
                                          o.solver.adabound_param.gamma)
            init_lr = o.solver.adabound_param.alpha
        elif o.solver.type == 'AMSGRAD':
            optimizer.solver = S.AMSGRAD(o.solver.amsgrad_param.alpha,
                                         o.solver.amsgrad_param.beta1,
                                         o.solver.amsgrad_param.beta2,
                                         o.solver.amsgrad_param.eps)
            init_lr = o.solver.amsgrad_param.alpha
        elif o.solver.type == 'AMSBound':
            optimizer.solver = S.AMSBound(o.solver.amsbound_param.alpha,
                                          o.solver.amsbound_param.beta1,
                                          o.solver.amsbound_param.beta2,
                                          o.solver.amsbound_param.eps,
                                          o.solver.amsbound_param.final_lr,
                                          o.solver.amsbound_param.gamma)
            init_lr = o.solver.amsbound_param.alpha
        elif o.solver.type == 'Eve':
            p = o.solver.eve_param
            optimizer.solver = S.Eve(p.alpha, p.beta1, p.beta2, p.beta3,
                                     p.k, p.k2, p.eps)
            init_lr = p.alpha
        elif o.solver.type == 'Momentum':
            optimizer.solver = S.Momentum(o.solver.momentum_param.lr,
                                          o.solver.momentum_param.momentum)
            init_lr = o.solver.momentum_param.lr
        elif o.solver.type == 'Nesterov':
            optimizer.solver = S.Nesterov(o.solver.nesterov_param.lr,
                                          o.solver.nesterov_param.momentum)
            init_lr = o.solver.nesterov_param.lr
        elif o.solver.type == 'RMSprop':
            optimizer.solver = S.RMSprop(o.solver.rmsprop_param.lr,
                                         o.solver.rmsprop_param.decay,
                                         o.solver.rmsprop_param.eps)
            init_lr = o.solver.rmsprop_param.lr
        elif o.solver.type == 'Sgd' or o.solver.type == 'SGD':
            optimizer.solver = S.Sgd(o.solver.sgd_param.lr)
            init_lr = o.solver.sgd_param.lr
        else:
            raise ValueError('Solver "' + o.solver.type +
                             '" is not supported.')
    # Only parameters with a positive LR multiplier are trained.
    parameters = {
        v.name: v.variable_instance
        for v, local_lr in
        optimizer.parameter_learning_rate_multipliers.items()
        if local_lr > 0.0
    }
    optimizer.solver.set_parameters(parameters)
    optimizer.parameters = OrderedDict(
        sorted(parameters.items(), key=lambda x: x[0]))
    optimizer.weight_decay = o.solver.weight_decay
    # keep following 2 lines for backward compatibility
    optimizer.lr_decay = o.solver.lr_decay if o.solver.lr_decay > 0.0 else 1.0
    optimizer.lr_decay_interval = o.solver.lr_decay_interval if o.solver.lr_decay_interval > 0 else 1
    # Restore solver state (momentum buffers etc.) from the checkpoint.
    optimizer.solver.set_states_from_protobuf(o)
    optimizer.comm = current_communicator()
    comm_size = optimizer.comm.size if optimizer.comm else 1
    # Default scheduler keeps the LR constant; specific types override it.
    # Scheduler horizons are divided by comm_size (per-worker iterations).
    optimizer.scheduler = ExponentialScheduler(init_lr, 1.0, 1)
    if o.solver.lr_scheduler_type == 'Polynomial':
        if o.solver.polynomial_scheduler_param.power != 0.0:
            optimizer.scheduler = PolynomialScheduler(
                init_lr,
                o.solver.polynomial_scheduler_param.max_iter // comm_size,
                o.solver.polynomial_scheduler_param.power)
    elif o.solver.lr_scheduler_type == 'Cosine':
        optimizer.scheduler = CosineScheduler(
            init_lr,
            o.solver.cosine_scheduler_param.max_iter // comm_size)
    elif o.solver.lr_scheduler_type == 'Exponential':
        if o.solver.exponential_scheduler_param.gamma != 1.0:
            optimizer.scheduler = ExponentialScheduler(
                init_lr, o.solver.exponential_scheduler_param.gamma,
                o.solver.exponential_scheduler_param.iter_interval //
                comm_size
                if o.solver.exponential_scheduler_param.iter_interval >
                comm_size else 1)
    elif o.solver.lr_scheduler_type == 'Step':
        if o.solver.step_scheduler_param.gamma != 1.0 and len(
                o.solver.step_scheduler_param.iter_steps) > 0:
            optimizer.scheduler = StepScheduler(
                init_lr, o.solver.step_scheduler_param.gamma, [
                    step // comm_size
                    for step in o.solver.step_scheduler_param.iter_steps
                ])
    elif o.solver.lr_scheduler_type == 'Custom':
        # ToDo
        raise NotImplementedError()
    elif o.solver.lr_scheduler_type == '':
        # Legacy configs specify lr decay without a scheduler type.
        if o.solver.lr_decay_interval != 0 or o.solver.lr_decay != 0.0:
            optimizer.scheduler = ExponentialScheduler(
                init_lr,
                o.solver.lr_decay if o.solver.lr_decay > 0.0 else 1.0,
                o.solver.lr_decay_interval // comm_size
                if o.solver.lr_decay_interval > comm_size else 1)
    else:
        raise ValueError('Learning Rate Scheduler "' +
                         o.solver.lr_scheduler_type +
                         '" is not supported.')
    # Optional linear warmup wraps whichever scheduler was chosen above.
    if o.solver.lr_warmup_scheduler_type == 'Linear':
        if o.solver.linear_warmup_scheduler_param.warmup_iter >= comm_size:
            optimizer.scheduler = LinearWarmupScheduler(
                optimizer.scheduler,
                o.solver.linear_warmup_scheduler_param.warmup_iter //
                comm_size)
    optimizer.forward_sequence = optimizer.network.get_forward_sequence(
        optimizer.loss_variables)
    optimizer.backward_sequence = optimizer.network.get_backward_sequence(
        optimizer.loss_variables,
        optimizer.parameter_learning_rate_multipliers)
    return optimizer
def _train(args, config):
    # Main training loop.  Returns a (completed, saved_before_abort) pair of
    # booleans: (True, False) on normal completion, (False, False) on NaN
    # cost or time-limit overrun, (False, True) when the time-check callback
    # requested a stop after saving.
    global _save_parameter_info
    comm = current_communicator()
    _CGLOAD_LOG_INTERVAL = 20
    best_epoch = None
    best_error = None
    last_epoch = 0
    if args.resume:
        # Resume bookkeeping from the previously saved parameter files.
        last_epoch, best_epoch, best_error = _get_current_parameter(args)
        if best_epoch is not None:
            logger.log(
                99, "Best error {} recorded at epoch {} in previous training.".
                format(best_error, best_epoch))
            if best_epoch > last_epoch:
                logger.log(
                    99, "Resumed epoch is {} but this training keep this result.".
                    format(last_epoch))
        logger.log(99, "Resume from epoch {}".format(last_epoch + 1))
    callback.update_status(('epoch.max', config.training_config.max_epoch))
    callback.update_status(
        ('epoch.current',
         last_epoch + 1 if last_epoch < config.training_config.max_epoch
         else config.training_config.max_epoch))
    max_iteration = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch
    if single_or_rankzero():
        logger.log(
            99, 'Training epoch {} of {} begin'.format(
                last_epoch + 1, config.training_config.max_epoch))

    # Accumulators for per-epoch / per-iteration cost.
    class Cost:
        pass
    cost = Cost()
    cost.sum_epoch = 0.0
    cost.num_iteration = 0
    cost.sum_iteration = 0.0
    cost.variables = None

    # Elapsed/remaining time estimation state.
    class TimeInfo:
        pass
    timeinfo = TimeInfo()
    timeinfo.past_time = 0
    timeinfo.estimate_time = 0
    timeinfo.last_past_time = None
    if max_iteration > 0:
        last_iteration = last_epoch * config.training_config.iter_per_epoch
        if last_iteration < max_iteration:
            timeinfo.start_time = time.time()
            timeinfo.last_epoch_start_time = timeinfo.start_time
            callback.update_status('processing', True, timeinfo.start_time)
            for iteration in range(last_iteration, max_iteration):
                # instant load measurement
                measure_cpu_gpu_instant_load()
                cost = _update(iteration, config, cost)
                if np.isnan(cost.sum_epoch) or np.isinf(cost.sum_epoch):
                    logger.log(99, 'Cost is Nan')
                    return False, False
                timeinfo = _calc_estimate_time(timeinfo, max_iteration,
                                               last_iteration, iteration + 1)
                callback.update_time_train(prediction=timeinfo.estimate_time)
                if 0 < config.timelimit < timeinfo.estimate_time:
                    logger.log(
                        99,
                        'Expected training time ({:.3f}s) will exceed time limit ({}s).'
                        .format(timeinfo.estimate_time, config.timelimit))
                    return False, False
                if (iteration + 1) % config.training_config.iter_per_epoch == 0:
                    last_past_time = -1
                    # End of epoch
                    epoch = iteration // config.training_config.iter_per_epoch + 1
                    cost_avg_epoch = cost.sum_epoch / cost.num_iteration if cost.num_iteration else 0
                    cost.sum_epoch = 0.0
                    cost.num_iteration = 0
                    monitoring_report = []
                    # Evaluation (always for the first 5 epochs, then at
                    # the configured monitor interval).
                    error_str = ''
                    if epoch % config.training_config.monitor_interval == 0 or epoch <= 5:
                        best_error, error_str = _evaluate(
                            args, config, monitoring_report, best_error, epoch)
                    # Cpu/Gpu average load
                    cg_load_str = ''
                    cgload_log = ''
                    cg_load = get_cpu_gpu_average_load()
                    if cg_load:
                        cg_load_str = 'epoch {} average_load_matrix: {}'.format(
                            epoch, cg_load)
                        span = _calc_epoch_span(timeinfo)
                        if span > _CGLOAD_LOG_INTERVAL:
                            cgload_log = _format_cgload_log(cg_load)
                    if single_or_rankzero():
                        # Write to monitoring_report.yml
                        f = open(
                            os.path.join(args.outdir, 'monitoring_report.yml'),
                            'a')
                        f.write('{}:\n'.format(epoch - 1))
                        f.write(' cost: {}\n'.format(cost_avg_epoch))
                        for s in monitoring_report:
                            f.write(s)
                        f.close()
                        callback.update_status(
                            (['monitoring_report', epoch, 'cost'],
                             cost_avg_epoch))
                        _save_parameters(args, 'current', epoch, config)
                        callback.update_status(('epoch.current', epoch))
                        callback.update_status()
                        logger.log(
                            99,
                            'epoch {} of {} cost={:.6f} {} time=({:.1f}s /{:.1f}s) {}'
                            .format(epoch, config.training_config.max_epoch,
                                    cost_avg_epoch, error_str,
                                    timeinfo.past_time,
                                    timeinfo.estimate_time, cgload_log))
                        if cg_load_str:
                            # cpu_gpu_average_load record at epoch level
                            callback.update_status(
                                (['cpu_gpu_epoch_load', epoch], cg_load))
                            progress(cg_load_str, 1)
                    if not callback.check_training_time(
                            args, config, timeinfo, epoch, last_epoch):
                        _save_parameters(args, 'current', epoch, config, True)
                        return False, True
    if single_or_rankzero():
        # NOTE(review): `epoch` is only bound inside the loop above; if the
        # loop body never runs (max_iteration == 0 or nothing left to
        # train), this line would raise NameError — confirm callers always
        # enter the loop before reaching here.
        _save_parameters(args, 'current', epoch, config, True)
    return True, False
def _create_optimizer(ctx, o, networks, datasets):
    # Legacy variant: builds an Optimizer holder (solver, variable
    # assignments, forward/backward sequences) from the protobuf optimizer
    # message `o`.  Only the LR decay interval is rescaled for MPI; there is
    # no LR scheduler support in this variant.
    class Optimizer:
        pass
    optimizer = Optimizer()
    optimizer.name = o.name
    optimizer.order = o.order
    optimizer.update_interval = o.update_interval if o.update_interval > 0 else 1
    optimizer.network = networks[o.network_name]
    optimizer.data_iterator = datasets[o.dataset_name].data_iterator
    # Map network variables to dataset column names.
    optimizer.dataset_assign = OrderedDict()
    for d in o.data_variable:
        optimizer.dataset_assign[optimizer.network.variables[
            d.variable_name]] = d.data_name
    # Map network variables to data generators (e.g. noise sources).
    optimizer.generator_assign = OrderedDict()
    for g in o.generator_variable:
        optimizer.generator_assign[optimizer.network.variables[
            g.variable_name]] = _get_generator(g)
    optimizer.loss_variables = []
    for l in o.loss_variable:
        optimizer.loss_variables.append(
            optimizer.network.variables[l.variable_name])
    # Per-parameter LR multipliers; variable names may contain wildcards.
    optimizer.parameter_learning_rate_multipliers = OrderedDict()
    for p in o.parameter_variable:
        param_variable_names = _get_matching_variable_names(
            p.variable_name, optimizer.network.variables.keys())
        for v_name in param_variable_names:
            optimizer.parameter_learning_rate_multipliers[
                optimizer.network.variables[v_name]] = p.learning_rate_multiplier
    # Instantiate the solver under the target context.
    with nn.context_scope(ctx):
        if o.solver.type == 'Adagrad':
            optimizer.solver = S.Adagrad(o.solver.adagrad_param.lr,
                                         o.solver.adagrad_param.eps)
        elif o.solver.type == 'Adadelta':
            optimizer.solver = S.Adadelta(o.solver.adadelta_param.lr,
                                          o.solver.adadelta_param.decay,
                                          o.solver.adadelta_param.eps)
        elif o.solver.type == 'Adam':
            optimizer.solver = S.Adam(o.solver.adam_param.alpha,
                                      o.solver.adam_param.beta1,
                                      o.solver.adam_param.beta2,
                                      o.solver.adam_param.eps)
        elif o.solver.type == 'Adamax':
            optimizer.solver = S.Adamax(o.solver.adamax_param.alpha,
                                        o.solver.adamax_param.beta1,
                                        o.solver.adamax_param.beta2,
                                        o.solver.adamax_param.eps)
        elif o.solver.type == 'Eve':
            p = o.solver.eve_param
            optimizer.solver = S.Eve(p.alpha, p.beta1, p.beta2, p.beta3,
                                     p.k, p.k2, p.eps)
        elif o.solver.type == 'Momentum':
            optimizer.solver = S.Momentum(o.solver.momentum_param.lr,
                                          o.solver.momentum_param.momentum)
        elif o.solver.type == 'Nesterov':
            optimizer.solver = S.Nesterov(o.solver.nesterov_param.lr,
                                          o.solver.nesterov_param.momentum)
        elif o.solver.type == 'RMSprop':
            optimizer.solver = S.RMSprop(o.solver.rmsprop_param.lr,
                                         o.solver.rmsprop_param.decay,
                                         o.solver.rmsprop_param.eps)
        elif o.solver.type == 'Sgd' or o.solver.type == 'SGD':
            optimizer.solver = S.Sgd(o.solver.sgd_param.lr)
        else:
            raise ValueError('Solver "' + o.solver.type +
                             '" is not supported.')
    # Only parameters with a positive LR multiplier are trained.
    parameters = {
        v.name: v.variable_instance
        for v, local_lr in
        optimizer.parameter_learning_rate_multipliers.items()
        if local_lr > 0.0
    }
    optimizer.solver.set_parameters(parameters)
    optimizer.parameters = OrderedDict(
        sorted(parameters.items(), key=lambda x: x[0]))
    optimizer.weight_decay = o.solver.weight_decay
    optimizer.lr_decay = o.solver.lr_decay if o.solver.lr_decay > 0.0 else 1.0
    optimizer.lr_decay_interval = o.solver.lr_decay_interval if o.solver.lr_decay_interval > 0 else 1
    optimizer.comm = current_communicator()
    if optimizer.comm is not None:
        # Distributed run: shrink the decay interval proportionally to the
        # number of workers (iterations per epoch are split across them).
        new_interval = optimizer.lr_decay_interval // optimizer.comm.size
        if new_interval == 0:
            new_interval = 1
        logger.log(
            99, 'LR Decay interval divide by {} ({} -> {})'.format(
                optimizer.comm.size, optimizer.lr_decay_interval,
                new_interval))
        optimizer.lr_decay_interval = new_interval
    optimizer.forward_sequence = optimizer.network.get_forward_sequence(
        optimizer.loss_variables)
    optimizer.backward_sequence = optimizer.network.get_backward_sequence(
        optimizer.loss_variables,
        optimizer.parameter_learning_rate_multipliers)
    return optimizer
def lms_scheduler(ctx, use_lms, gpu_memory_size=None, window_length=None):
    """Create an out-of-core (OoC) swap-in/swap-out scheduler for large
    model training, or a no-op dummy scheduler when OoC is unavailable.

    Args:
        ctx: Execution context; OoC requires a cuda/cudnn backend.
        use_lms (bool): Whether OoC was requested.
        gpu_memory_size (int or None): GPU memory budget in bytes;
            defaults to 70% of the device's total memory (6GB fallback).
        window_length (int or None): Prefetch window in bytes; defaults
            to 1.5x the memory budget.

    Returns:
        SwapInOutScheduler or a DummyScheduler exposing the same hooks.
    """
    _check_list = [x.split(":")[0] for x in ctx.backend]
    if "cudnn" not in _check_list and "cuda" not in _check_list:
        logger.warn(
            "ctx passed to scheduler doesn't have cuda/cudnn backend. lms scheduler will not be used."
        )
        use_lms = False
    comm = current_communicator()
    if comm:
        # OoC and multi-GPU training are currently mutually exclusive.
        logger.log(99, f'[OoC] Currently OoC is disabled for Multi-GPU training.')
        use_lms = False
    if use_lms:
        gpu_index = 0
        if 'cuda' in str(ctx.backend):
            gpu_index = int(ctx.device_id)
        else:
            logger.log(99, f'[OoC] OoC is only enabled for GPU training.')
            raise Exception('[OoC] OoC is only enabled for GPU training.')
        if gpu_memory_size is None or gpu_memory_size == 0:
            try:
                handle = nvml.nvmlDeviceGetHandleByIndex(gpu_index)
                total_memory = nvml.nvmlDeviceGetMemoryInfo(handle).total
                gpu_memory_size = int(total_memory * 0.7)
            except Exception:
                # Was a bare `except:`, which would also swallow
                # KeyboardInterrupt/SystemExit; NVML failures degrade to
                # the 6GB default instead.
                logger.log(
                    99,
                    f'[OoC] Could not get GPU memory size using default value(6GB).'
                )
                gpu_memory_size = 6e9  # default 6 GiB
        if window_length is None or window_length == 0:
            window_length = int(gpu_memory_size * 1.5)
        logger.log(
            99,
            f'[OoC] gpu_memory_limit: {gpu_memory_size / 1e9}GB, prefetch_window_length: {window_length / 1e9}GB'
        )
        # Change array preference so that lms works well.
        # import nnabla_ext.cuda.init as cuda_init
        # cuda_init.prefer_cpu_pinned_array()
        # cuda_init.prefer_cuda_virtual_array()
        from nnabla.ext_utils import get_extension_context
        be, tc = ctx.backend[0].split(":")
        cpu_ctx = get_extension_context("cpu", device_id="", type_config=tc)
        return SwapInOutScheduler(cpu_ctx, ctx, gpu_memory_size, window_length)
    else:
        # No-op stand-in so callers can use the same hook/context-manager
        # protocol unconditionally.
        class DummyScheduler(object):
            function_pre_hook = None
            function_post_hook = None
            update_pre_hook = None
            update_post_hook = None

            def start_scheduling(self):
                return None

            def end_scheduling(self):
                return None

            def __enter__(self):
                return self

            def __exit__(self, exc_type, exc_val, exc_tb):
                pass

        return DummyScheduler()
def load(filenames, prepare_data_iterator=True, batch_size=None, exclude_parameter=False, parameter_only=False, extension=".nntxt", context=None):
    '''load
    Load network information from files.

    Args:
        filenames (list): file-like object or List of filenames.
        prepare_data_iterator (bool or None): Build data iterators for the
            datasets; when None, decided by ``max_epoch > 0``.
        batch_size (int, optional): Override batch size.
        exclude_parameter (bool): Skip loading parameter values.
        parameter_only (bool): Load only parameter values.
        extension: if filenames is file-like object, extension is one of
            ".nntxt", ".prototxt", ".protobuf", ".h5", ".nnp".
        context (str, optional): Explicit context name ('cpu' or
            'cudnn[:device_id]') overriding the proto's global_config.
    Returns:
        dict: Network information.
    '''
    class Info:
        pass
    info = Info()
    info.prepare_data_iterator = prepare_data_iterator
    info.batch_size = batch_size
    info.exclude_parameter = exclude_parameter
    info.parameter_only = parameter_only
    info.proto = nnabla_pb2.NNablaProtoBuf()

    # first stage file loaders
    file_loaders = get_initial_file_loader()

    # using global parameter scope, keep consistency with legacy implementation.
    # To avoid to surprise previous developers, but it is better using
    # stand-alone OrderedDict() instance.
    info.parameter_scope = nn.parameter.get_current_parameter_scope()
    load_files(info, file_loaders, filenames, extension)

    default_context = None
    if context:
        # Explicit context requested by the caller takes precedence over
        # whatever the loaded proto declares.
        if context == 'cpu':
            import nnabla_ext.cpu
            default_context = nnabla_ext.cpu.context()
        else:
            cs = context.split(':')
            if cs[0] == 'cudnn':
                if len(cs) == 1:
                    devid = 0
                else:
                    devid = int(cs[1])
                import nnabla_ext.cudnn
                default_context = nnabla_ext.cudnn.context(device_id=devid)
        if default_context is None:
            logger.warn('Invalid context [{}]'.format(context))
        elif info.proto.HasField('global_config'):
            # BUGFIX: was `_global_config(proto)`, but no local name `proto`
            # exists in this function (NameError); the loaded proto lives in
            # `info.proto`.
            info.global_config = _global_config(info.proto)
            info.global_config.default_context = default_context

    if default_context is None:
        # No (valid) explicit context: derive one from the proto, or fall
        # back to the plain CPU context.
        if info.proto.HasField('global_config'):
            info.global_config = _global_config(info.proto)
            default_context = info.global_config.default_context
            if 'cuda' in default_context.backend:
                import nnabla_ext.cudnn
            elif 'cuda:float' in default_context.backend:
                # Best effort: prefer cudnn when available, silently keep
                # going without it otherwise.
                try:
                    import nnabla_ext.cudnn
                except:
                    pass
        else:
            import nnabla_ext.cpu
            default_context = nnabla_ext.cpu.context()
            info.global_config = _global_config(
                None, default_context=default_context)

    default_context = _check_context(default_context)
    logger.log(99, 'Using context "{}"'.format(default_context))
    comm = current_communicator()
    if comm:
        # In distributed training each local rank drives its own device.
        default_context.device_id = str(comm.local_rank)

    if info.proto.HasField('training_config'):
        info.training_config = _training_config(info.proto)

    info.default_context = default_context
    info.datasets = _datasets(
        info.proto,
        prepare_data_iterator if prepare_data_iterator is not None
        else info.training_config.max_epoch > 0)

    info.renamed_variables = {}
    # Fixed RNG seed keeps graph construction deterministic across ranks.
    info.networks = _networks(info, nn.graph_def.ProtoGraph.from_proto(
        info.proto, param_scope=info.parameter_scope,
        rng=numpy.random.RandomState(0)))

    info.optimizers = _optimizers(info)
    info.monitors = _monitors(info)
    info.executors = _executors(info)

    return info
def _update(iter, config, cost):
    """Run one training iteration for every active optimizer.

    Feeds data, runs forward/backward, applies solver updates, and folds the
    *previous* iteration's loss into ``cost`` (the loss read is deliberately
    deferred one iteration so input feeding overlaps GPU compute).

    Args:
        iter (int): 0-based global iteration index.
        config: TrainConfig with ``optimizers``, ``training_config``,
            ``global_config``.
        cost: accumulator object (``sum_iteration``, ``sum_epoch``,
            ``num_iteration``, ``variables``), mutated in place.

    Returns:
        The same ``cost`` object, updated.
    """
    comm = current_communicator()

    loaded_data = {}
    is_first_optimizer = True

    def _sum_cost():
        # Fold sum_iteration into the epoch total; with a communicator the
        # per-rank sums are all-reduced first and the iteration count grows
        # by the world size instead of 1.
        if comm:
            # logger.log(99, "Calc cost with communicator")
            var = [nn.NdArray()]
            var[0].data = cost.sum_iteration
            _all_reduce(comm, var, division=False, inplace=True)
            cost.sum_epoch += var[0].data
            cost.num_iteration += comm.size
        else:
            cost.sum_epoch += cost.sum_iteration
            cost.num_iteration += 1

    def _get_reserved_variable(shape, reserved_variable_name, iter, iter_per_epoch, max_epoch):
        # Map "%"-prefixed reserved dataset names to scalar schedule values.
        if reserved_variable_name == "%iter":
            value = iter
        elif reserved_variable_name == "%max_iter":
            value = max_epoch * iter_per_epoch
        elif reserved_variable_name == "%epoch":
            value = iter // iter_per_epoch
        elif reserved_variable_name == "%epochf":
            value = iter * 1.0 / iter_per_epoch
        elif reserved_variable_name == "%max_epoch":
            value = max_epoch
        elif reserved_variable_name == "%progress":
            value = (iter * 1.0 / iter_per_epoch) / max_epoch
        else:
            raise ValueError(
                "Unknown reserved variable {}".format(reserved_variable_name))
        return value

    for opt in config.optimizers.values():
        o = opt.optimizer
        # Optimizer is active only inside its [start_iter, end_iter] window
        # (0 means "unbounded" on either side).
        if (o.start_iter == 0 or iter + 1 >= o.start_iter) and (o.end_iter == 0 or iter + 1 <= o.end_iter):
            # Load dataset; each iterator is advanced at most once per call,
            # even if several optimizers share it.
            data = OrderedDict()
            for di in opt.data_iterators:
                if di not in loaded_data:
                    loaded_data[di] = di.next()
                data.update(zip(di.variables, loaded_data[di]))
            for v, d in o.dataset_assign.items():
                # Data can be fed on CPU unless the variable is consumed by
                # the first forward function (then let the function's own
                # context decide).
                dest_context = config.global_config.default_context if not o.forward_sequence or v not in o.forward_sequence[
                    0].inputs else None
                if d not in data and d[0] == "%":
                    value = _get_reserved_variable(
                        v.variable_instance.shape, d, iter,
                        config.training_config.iter_per_epoch,
                        config.training_config.max_epoch)
                    v.variable_instance.data.fill(value)
                elif d in data:
                    let_data_to_variable(v.variable_instance, data[d],
                                         ctx=dest_context, data_name=d,
                                         variable_name=v.name)
                else:
                    # NOTE(review): message uses o.data_iterators.keys();
                    # elsewhere the iterators live on opt.data_iterators —
                    # confirm the attribute exists on the optimizer.
                    raise ValueError(
                        'Variable "{}" is not found in dataset "{}", optimizer "{}"'
                        .format(d, ', '.join(o.data_iterators.keys()), o.name))

            # Generate data (random generators etc. assigned to variables).
            for v, generator in o.generator_assign.items():
                dest_context = config.global_config.default_context if not o.forward_sequence or v not in o.forward_sequence[
                    0].inputs else None
                let_data_to_variable(v.variable_instance,
                                     data=generator(v.shape),
                                     ctx=dest_context, variable_name=v.name)

            # Monitor loss before forward to prepare input data while processing on
            # GPU
            if cost.variables:
                for l in cost.variables:
                    cost.sum_iteration += np.mean(l.variable_instance.d)
                    # l.variable_instance.data.zero()
                if is_first_optimizer:
                    is_first_optimizer = False
                    _sum_cost()
                    if single_or_rankzero():
                        progress(
                            "Training : cost={0:0.6f}".format(
                                cost.sum_iteration),
                            (iter % config.training_config.iter_per_epoch) * 1.0
                            / config.training_config.iter_per_epoch)
                    cost.sum_iteration = 0.0

            with nodeTimeCollector.collect_cost_time(comm, iter):
                # Forward
                o.network.forward(o.forward_sequence)
                # Backward; gradients are cleared only on the first iteration
                # of each update_interval (accumulated otherwise).
                o.network.backward(o.backward_sequence,
                                   iter % o.update_interval == 0)

            # Update: apply solver step at the end of each update_interval.
            if iter % o.update_interval == o.update_interval - 1:
                if o.weight_decay > 0:
                    o.solver.weight_decay(o.weight_decay)
                if o.comm:
                    # Updated param with communicator
                    params = [x.grad for x in o.parameters.values()]
                    _all_reduce(o.comm, params, division=True, inplace=True)
                if o.scheduler is not None:
                    o.solver.set_learning_rate(
                        o.scheduler.get_learning_rate(iter))
                o.solver.update()

            # Sync w sometimes: periodically average weights themselves to
            # counter numerical drift across ranks.
            if iter % 10 == 9:
                # TODO: change the interval
                if o.comm:
                    params = [x.data for x in o.parameters.values()]
                    _all_reduce(o.comm, params, division=True, inplace=True)

            # Reserve monitor loss: read next iteration, after forward done.
            cost.variables = o.loss_variables

    # Monitor loss at the end of epoch (last iteration has no following
    # iteration to fold its loss in, so do it here).
    if iter % config.training_config.iter_per_epoch == config.training_config.iter_per_epoch - 1 and cost.variables:
        for l in cost.variables:
            cost.sum_iteration += np.mean(l.variable_instance.d)
            # l.variable_instance.data.zero()
        _sum_cost()
        cost.variables = None
        cost.sum_iteration = 0.0

    return cost
def _evaluate(args, config, monitoring_report, best_error, epoch):
    """Evaluate every configured monitor and save the best parameters.

    Loss reads are deferred one minibatch behind the forward pass so data
    feeding overlaps GPU compute, mirroring ``_update``.

    Args:
        args: command-line args (used for output dir when saving).
        config: TrainConfig with ``monitors`` and ``global_config``.
        monitoring_report (list): YAML lines, appended in place.
        best_error (float or None): best validation error so far.
        epoch (int): 1-based epoch number, used for status reporting.

    Returns:
        tuple: (updated ``best_error``, formatted ``error_str`` summary).
    """
    comm = current_communicator()
    error_str = ''
    valid_error = 0.0

    def _sum_error(sum, error):
        # Add `error` to `sum`; under a communicator the per-rank errors are
        # all-reduced first so every rank accumulates the global total.
        ret = None
        if comm:
            # logger.log(99, "Calc error with communicator")
            var = [nn.NdArray()]
            var[0].data = error
            _all_reduce(comm, var, division=False, inplace=True)
            ret = sum + var[0].data
        else:
            ret = sum + error
        return ret

    for name, mon in config.monitors.items():
        m = mon.monitor
        error_sum_monitor = 0.0
        error_count = 0
        # Iterate over the largest dataset among this monitor's iterators.
        data_size = max([di.size for di in mon.data_iterators])
        batch_size = max([di.batch_size for di in mon.data_iterators])

        for i in range(data_size // batch_size):
            # Load dataset
            data = OrderedDict()
            for di in mon.data_iterators:
                data.update(zip(di.variables, di.next()))

            # Set data to variable; feed on the default (CPU) context unless
            # the variable feeds the first forward function directly.
            for v, d in m.dataset_assign.items():
                dest_context = config.global_config.default_context if not m.forward_sequence or v not in m.forward_sequence[
                    0].inputs else None
                let_data_to_variable(v.variable_instance, data[d],
                                     ctx=dest_context, data_name=d,
                                     variable_name=v.name)

            # Generate data
            for v, generator in m.generator_assign.items():
                dest_context = config.global_config.default_context if not m.forward_sequence or v not in m.forward_sequence[
                    0].inputs else None
                let_data_to_variable(v.variable_instance,
                                     data=generator(v.shape),
                                     ctx=dest_context, variable_name=v.name)

            # Sum error before forward to prepare input data while processing
            # on GPU (reads the error of the PREVIOUS minibatch).
            if error_count > 0:
                error_sum = 0.0
                for v in m.monitor_variables:
                    error_sum += np.mean(v.variable_instance.d)
                    # v.variable_instance.data.zero()
                error_sum_monitor = _sum_error(error_sum_monitor, error_sum)
                if single_or_rankzero():
                    progress(
                        'Evaluating "{0}"'.format(name)
                        + ' : error={0:0.6f}'.format(
                            error_sum_monitor / error_count),
                        di.position * 1.0 / di.size)
            # Each iteration contributes comm.size minibatches globally.
            error_count += comm.size if comm else 1

            # Forward recursive
            m.network.forward(m.forward_sequence)

        # Sum error at the end of dataset (the last minibatch's error).
        error_sum = 0.0
        for v in m.monitor_variables:
            error_sum += np.mean(v.variable_instance.d)
            # v.variable_instance.data.zero()
        error_sum_monitor = _sum_error(error_sum_monitor, error_sum)

        if error_count == 0:
            error = 0
        else:
            error = error_sum_monitor / error_count

        if np.isnan(error) or np.isinf(error):
            logger.log(99, 'Validation error is Nan')
            error = 0.0

        monitoring_report.append(' {}: {}\n'.format(name, error))

        callback.update_status((['monitoring_report', epoch, name], error))
        callback.update_status((['last', name], error))  # save last value

        # Build " {name1=..., name2=...}" summary string incrementally.
        if error_str != '':
            error_str += ', '
        else:
            error_str = ' {'
        error_str += '{}={:.6f}'.format(name, error)
        if name == 'valid_error':
            valid_error = error

    if error_str != '':
        error_str += '}'

    # Save Parameters when this epoch's valid_error is the best so far (or
    # save_best is disabled); only rank 0 writes.
    if single_or_rankzero():
        if (not config.training_config.save_best) or \
           (not best_error) or \
           (best_error is not None and valid_error <= best_error):
            best_error = valid_error
            callback.update_status(('best.valid_error', best_error))
            callback.update_status(('best.epoch', epoch))
            _save_parameters(args, 'best', epoch, config, True)

    return best_error, error_str
def _update(iter, config, cost):
    """Run one training iteration (legacy single-data_iterator variant).

    Feeds data, runs forward/backward, applies solver updates, and folds the
    *previous* iteration's loss into ``cost`` (loss read is deferred one
    iteration so input feeding overlaps GPU compute).

    Args:
        iter (int): 0-based global iteration index.
        config: TrainConfig with ``optimizers``, ``training_config``,
            ``global_config``.
        cost: accumulator object (``sum_iteration``, ``sum_epoch``,
            ``num_iteration``, ``variables``), mutated in place.

    Returns:
        The same ``cost`` object, updated.
    """
    comm = current_communicator()

    loaded_data = {}
    is_first_optimizer = True

    def _sum_cost():
        # Fold sum_iteration into the epoch total; with a communicator the
        # per-rank sums are all-reduced first and the iteration count grows
        # by the world size instead of 1.
        if comm:
            # logger.log(99, "Calc cost with communicator")
            var = [nn.NdArray()]
            var[0].data = cost.sum_iteration
            _all_reduce(comm, var, division=False, inplace=True)
            cost.sum_epoch += var[0].data
            cost.num_iteration += comm.size
        else:
            cost.sum_epoch += cost.sum_iteration
            cost.num_iteration += 1

    for opt in config.optimizers.values():
        o = opt.optimizer
        # Load dataset; a shared iterator is advanced at most once per call.
        di = opt.data_iterator
        if o.data_iterator not in loaded_data:
            loaded_data[o.data_iterator] = di.next()
        data = loaded_data[o.data_iterator]
        for v, d in o.dataset_assign.items():
            # Feed on the default (CPU) context unless the variable feeds
            # the first forward function directly.
            dest_context = config.global_config.default_context if not o.forward_sequence or v not in o.forward_sequence[
                0].inputs else None
            let_data_to_variable(v.variable_instance,
                                 data[di.variables.index(d)],
                                 ctx=dest_context, data_name=d,
                                 variable_name=v.name)

        # Generate data (random generators etc. assigned to variables).
        for v, generator in o.generator_assign.items():
            dest_context = config.global_config.default_context if not o.forward_sequence or v not in o.forward_sequence[
                0].inputs else None
            let_data_to_variable(v.variable_instance,
                                 data=generator(v.shape),
                                 ctx=dest_context, variable_name=v.name)

        # Monitor loss before forward to prepare input data while processing on
        # GPU
        if cost.variables:
            for l in cost.variables:
                cost.sum_iteration += np.mean(l.variable_instance.d)
                l.variable_instance.data.zero()
            if is_first_optimizer:
                is_first_optimizer = False
                _sum_cost()
                if single_or_rankzero():
                    progress(
                        "Training : cost={0:0.6f}".format(cost.sum_iteration),
                        (iter % config.training_config.iter_per_epoch) * 1.0
                        / config.training_config.iter_per_epoch)
                cost.sum_iteration = 0.0

        # Forward
        o.network.forward(o.forward_sequence)

        # Backward; gradients cleared only on the first iteration of each
        # update_interval (accumulated otherwise).
        o.network.backward(o.backward_sequence, iter % o.update_interval == 0)

        # Update: apply solver step at the end of each update_interval.
        if iter % o.update_interval == o.update_interval - 1:
            if o.weight_decay > 0:
                o.solver.weight_decay(o.weight_decay)
            if o.comm:
                # Updated param with communicator
                params = [x.grad for x in o.parameters.values()]
                _all_reduce(o.comm, params, division=True, inplace=True)
            if o.scheduler is not None:
                o.solver.set_learning_rate(o.scheduler.get_learning_rate(iter))
            o.solver.update()

        # Sync w sometimes: periodically average weights themselves to
        # counter numerical drift across ranks.
        if iter % 10 == 9:
            # TODO: change the interval
            if o.comm:
                params = [x.data for x in o.parameters.values()]
                _all_reduce(o.comm, params, division=True, inplace=True)

        # Reserve monitor loss: read next iteration, after forward is done.
        cost.variables = o.loss_variables

    # Monitor loss at the end of iteration (last iteration of the epoch has
    # no following iteration to fold its loss in, so do it here).
    if iter % config.training_config.iter_per_epoch == config.training_config.iter_per_epoch - 1 and cost.variables:
        for l in cost.variables:
            cost.sum_iteration += np.mean(l.variable_instance.d)
            l.variable_instance.data.zero()
        _sum_cost()
        cost.variables = None
        cost.sum_iteration = 0.0

    return cost
def train_command(args):
    """Entry point of the `train` CLI subcommand.

    Loads the network/training configuration, builds per-optimizer and
    per-monitor configs, instantiates (and slices, for multi-GPU) the data
    iterators, then runs the training loop via ``_train``.

    Args:
        args: parsed command-line arguments; uses ``config``, ``param``,
            ``outdir``, and whatever ``callback.get_timelimit`` reads.

    Returns:
        bool: True on normal exit (even when training reported failure —
        status is propagated through ``callback.update_status`` instead).
    """
    callback.update_status(args)

    if single_or_rankzero():
        configure_progress(os.path.join(args.outdir, 'progress.txt'))

    # Parameters are loaded separately below, so exclude them here.
    info = load.load([args.config], prepare_data_iterator=None,
                     exclude_parameter=True)

    # Check dataset uri is empty.
    dataset_error = False
    for dataset in info.datasets.values():
        if dataset.uri.strip() == '':
            dataset_error = True
    if dataset_error:
        logger.log(99, 'Fatal error. Dataset URI is empty.')
        return False

    class TrainConfig:
        pass
    config = TrainConfig()
    config.timelimit = -1
    if args.param:
        # Load initial parameter values from a separate file if given.
        load.load([args.param], parameter_only=True)

    config.timelimit = callback.get_timelimit(args)

    config.global_config = info.global_config
    config.training_config = info.training_config

    if single_or_rankzero():
        logger.log(99, 'Train with contexts {}'.format(available_contexts))

    class OptConfig:
        pass
    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterators = []
        config.optimizers[name] = o

    class MonConfig:
        pass
    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterators = []
        config.monitors[name] = m

    # Training
    comm = current_communicator()
    # Each rank sees 1/size of the data, so an epoch needs fewer iterations.
    config.training_config.iter_per_epoch //= comm.size if comm else 1
    max_iteration = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch

    # Remember where the config lives so saved snapshots can embed it.
    global _save_parameter_info
    _save_parameter_info = {}
    _, config_ext = os.path.splitext(args.config)
    if config_ext == '.prototxt' or config_ext == '.nntxt':
        _save_parameter_info['config'] = args.config
    elif config_ext == '.nnp':
        # Extract the network text file from the nnp archive into outdir.
        with zipfile.ZipFile(args.config, 'r') as nnp:
            for name in nnp.namelist():
                _, ext = os.path.splitext(name)
                if ext == '.nntxt' or ext == '.prototxt':
                    nnp.extract(name, args.outdir)
                    _save_parameter_info['config'] = os.path.join(
                        args.outdir, name)

    result = False
    restart = False
    if max_iteration > 0:
        # Seed per rank so sliced iterators shuffle differently per process.
        rng = np.random.RandomState(comm.rank if comm else 0)
        with ExitStack() as stack:
            # Create data_iterator instance only once for each dataset in
            # optimizers; ExitStack owns their lifetime for the whole run.
            optimizer_data_iterators = {}
            for name, o in config.optimizers.items():
                for di in o.optimizer.data_iterators.values():
                    if di not in optimizer_data_iterators:
                        di_instance = stack.enter_context(di())
                        if comm and comm.size > 1:
                            di_instance = di_instance.slice(
                                rng, comm.size, comm.rank)
                        optimizer_data_iterators[di] = di_instance
                    else:
                        di_instance = optimizer_data_iterators[di]
                    o.data_iterators.append(di_instance)

            # Create data_iterator instance only once for each dataset in monitors
            monitor_data_iterators = {}
            for name, m in config.monitors.items():
                for di in m.monitor.data_iterators.values():
                    if di not in monitor_data_iterators:
                        di_instance = stack.enter_context(di())
                        if comm and comm.size > 1:
                            di_instance = di_instance.slice(
                                rng, comm.size, comm.rank)
                        monitor_data_iterators[di] = di_instance
                    else:
                        di_instance = monitor_data_iterators[di]
                    m.data_iterators.append(di_instance)

            monitor_data_iterators.update(optimizer_data_iterators)
            result, restart = _train(args, config)
    else:
        # save parameters without training (0 epoch learning)
        logger.log(99, '0 epoch learning. (Just save parameter.)')
        if single_or_rankzero():
            _save_parameters(args, None, 0, config, True)
        result = True

    if single_or_rankzero() and not restart:
        if result:
            logger.log(99, 'Training Completed.')
            callback.update_status('finished')
        else:
            logger.log(99, 'Training Incompleted.')
            callback.update_status('failed')
    if single_or_rankzero():
        progress(None)
    return True
def _train(args, config):
    """Main training loop: iterate, evaluate per epoch, save snapshots.

    Args:
        args: parsed command-line arguments (``resume``, ``outdir``, ...).
        config: TrainConfig with ``training_config``, ``timelimit``, etc.

    Returns:
        bool: True on completion, False when the time limit would be
        exceeded.

    NOTE(review): ``train_command`` elsewhere in this file unpacks
    ``result, restart = _train(...)`` (a 2-tuple) while this implementation
    returns a bare bool — confirm which version of the pair is current.
    """
    global _save_parameter_info
    comm = current_communicator()

    last_epoch = 0
    if args.resume:
        last_epoch = _get_current_parameter(args)
        logger.log(99, "Resume from epoch {}".format(last_epoch + 1))

    max_iteration = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch

    if single_or_rankzero():
        logger.log(
            99, 'Training epoch {} of {} begin'.format(
                last_epoch + 1, config.training_config.max_epoch))

    class Cost:
        pass
    cost = Cost()
    cost.sum_epoch = 0.0
    cost.num_iteration = 0
    cost.sum_iteration = 0.0
    cost.variables = None

    best_error = None

    class TimeInfo:
        pass
    timeinfo = TimeInfo()
    timeinfo.last_past_time = None

    # BUGFIX: `epoch` must be defined even when the loop below never runs
    # (e.g. resuming at or past the final epoch); otherwise the trailing
    # _save_parameters call raised NameError.
    epoch = last_epoch

    if max_iteration > 0:
        last_iteration = last_epoch * config.training_config.iter_per_epoch
        if last_iteration < max_iteration:
            timeinfo.start_time = time.time()

            for iteration in range(last_iteration, max_iteration):
                cost = _update(iteration, config, cost)

                if (iteration - last_iteration) > 0:
                    timeinfo = _calc_estimate_time(
                        timeinfo, max_iteration, last_iteration, iteration)
                    # Abort early when the projected total time would blow
                    # the configured time limit.
                    if config.timelimit > 0 and timeinfo.estimate_time > config.timelimit:
                        logger.log(
                            99,
                            'Expected training time ({:.3f}s) will exceed time limit ({}s).'
                            .format(timeinfo.estimate_time, config.timelimit))
                        return False

                if (iteration + 1) % config.training_config.iter_per_epoch == 0:
                    # End of epoch
                    # (removed dead local `last_past_time = -1`; it was never
                    # read — the live field is timeinfo.last_past_time)
                    epoch = iteration // config.training_config.iter_per_epoch + 1
                    cost_avg_epoch = cost.sum_epoch / cost.num_iteration
                    cost.sum_epoch = 0.0
                    cost.num_iteration = 0
                    monitoring_report = []

                    # Evaluation
                    error_str = ''
                    if epoch % config.training_config.monitor_interval == 0 or epoch <= 5:
                        best_error, error_str = _evaluate(
                            args, config, monitoring_report, best_error, epoch)

                    if single_or_rankzero():
                        # Write to monitoring_report.yml
                        with open(os.path.join(args.outdir,
                                               'monitoring_report.yml'), 'a') as f:
                            f.write('{}:\n'.format(epoch - 1))
                            f.write(' cost: {}\n'.format(cost_avg_epoch))
                            for s in monitoring_report:
                                f.write(s)

                        _save_parameters(args, 'current', epoch)

                        logger.log(
                            99,
                            'epoch {} of {} cost={:.6f} {} time=({:.1f}s /{:.1f}s)'
                            .format(epoch, config.training_config.max_epoch,
                                    cost_avg_epoch, error_str,
                                    timeinfo.past_time, timeinfo.estimate_time))

    if single_or_rankzero():
        _save_parameters(args, 'current', epoch, True)
    return True
def load(filenames, prepare_data_iterator=True, batch_size=None, exclude_parameter=False, parameter_only=False):
    '''load
    Load network information from files.

    Args:
        filenames (list): List of filenames.
        prepare_data_iterator (bool or None): Build data iterators for the
            datasets; when None, decided by ``max_epoch > 0``.
        batch_size (int, optional): Override batch size.
        exclude_parameter (bool): Skip loading parameter values.
        parameter_only (bool): Load only parameter values.
    Returns:
        dict: Network information.
    '''
    class Info:
        pass
    info = Info()

    proto = nnabla_pb2.NNablaProtoBuf()
    for filename in filenames:
        _, ext = os.path.splitext(filename)

        # TODO: Here is some known problems.
        #   - Even when protobuf file includes network structure,
        #     it will not loaded.
        #   - Even when prototxt file includes parameter,
        #     it will not loaded.

        if ext in ['.nntxt', '.prototxt']:
            if not parameter_only:
                with open(filename, 'rt') as f:
                    try:
                        text_format.Merge(f.read(), proto)
                    except:
                        logger.critical('Failed to read {}.'.format(filename))
                        logger.critical(
                            '2 byte characters may be used for file name or folder name.'
                        )
                        raise
            if len(proto.parameter) > 0:
                if not exclude_parameter:
                    nn.load_parameters(filename)
        elif ext in ['.protobuf', '.h5']:
            if not exclude_parameter:
                nn.load_parameters(filename)
            else:
                logger.info('Skip loading parameter.')
        elif ext == '.nnp':
            # BUGFIX: mkdtemp() used to live inside the try, so a failure to
            # create the directory made the finally-clause rmtree raise
            # NameError on `tmpdir`. Allocate it before entering the try.
            tmpdir = tempfile.mkdtemp()
            try:
                with zipfile.ZipFile(filename, 'r') as nnp:
                    for name in nnp.namelist():
                        _, ext = os.path.splitext(name)
                        if name == 'nnp_version.txt':
                            nnp.extract(name, tmpdir)
                            with open(os.path.join(tmpdir, name), 'rt') as f:
                                pass  # TODO currently do nothing with version.
                        elif ext in ['.nntxt', '.prototxt']:
                            nnp.extract(name, tmpdir)
                            if not parameter_only:
                                with open(os.path.join(tmpdir, name), 'rt') as f:
                                    text_format.Merge(f.read(), proto)
                            if len(proto.parameter) > 0:
                                if not exclude_parameter:
                                    nn.load_parameters(
                                        os.path.join(tmpdir, name))
                        elif ext in ['.protobuf', '.h5']:
                            nnp.extract(name, tmpdir)
                            if not exclude_parameter:
                                nn.load_parameters(os.path.join(tmpdir, name))
                            else:
                                logger.info('Skip loading parameter.')
            finally:
                # Always clean up the extraction directory.
                shutil.rmtree(tmpdir)

    default_context = None
    if proto.HasField('global_config'):
        info.global_config = _global_config(proto)
        default_context = info.global_config.default_context
        if 'cuda' in default_context.backend:
            import nnabla_ext.cudnn
        elif 'cuda:float' in default_context.backend:
            # Best effort: prefer cudnn when available, keep going without.
            try:
                import nnabla_ext.cudnn
            except:
                pass
    else:
        import nnabla_ext.cpu
        default_context = nnabla_ext.cpu.context()

    comm = current_communicator()
    if comm:
        # Each rank drives its own device in distributed training.
        default_context.device_id = str(comm.rank)

    if proto.HasField('training_config'):
        info.training_config = _training_config(proto)

    info.datasets = _datasets(
        proto,
        prepare_data_iterator if prepare_data_iterator is not None
        else info.training_config.max_epoch > 0)

    info.networks = _networks(proto, default_context, batch_size)

    info.optimizers = _optimizers(
        proto, default_context, info.networks, info.datasets)
    info.monitors = _monitors(
        proto, default_context, info.networks, info.datasets)
    info.executors = _executors(proto, info.networks)

    return info
def lms_scheduler(ctx, use_lms, gpu_memory_size=None, window_length=None):
    """Create an out-of-core (LMS / OoC) scheduler for the given context.

    Args:
        ctx: nnabla extension context; its ``backend`` list decides whether
            the swap-in/swap-out scheduler can be used.
        use_lms (bool): Request LMS scheduling. Silently disabled when the
            context has no cuda/cudnn backend or when a communicator
            (multi-GPU training) is active.
        gpu_memory_size (int, optional): GPU memory budget in bytes. When
            None or 0, estimated via ``nvidia-smi`` as 70% of total device
            memory, falling back to 6 GB on failure.
        window_length (int, optional): Prefetch window in bytes. When None
            or 0, defaults to 1.5x ``gpu_memory_size``.

    Returns:
        ``SwapInOutScheduler`` when LMS ends up enabled, otherwise a no-op
        object exposing the same hooks and context-manager interface.

    Raises:
        Exception: if LMS is requested but the context is not a CUDA one.
    """
    _check_list = [x.split(":")[0] for x in ctx.backend]

    if "cudnn" not in _check_list and "cuda" not in _check_list:
        logger.warn(
            "ctx passed to scheduler doesn't have cuda/cudnn backend. lms scheduler will not be used."
        )
        use_lms = False

    comm = current_communicator()
    if comm:
        # OoC and multi-GPU all-reduce do not cooperate yet.
        logger.log(99, f'[OoC] Currently OoC is disabled for Multi-GPU training.')
        use_lms = False

    if use_lms:
        gpu_index = 0
        if 'cuda' in str(ctx.backend):
            gpu_index = int(ctx.device_id)
        else:
            logger.log(99, f'[OoC] OoC is only enabled for GPU training.')
            raise Exception

        # It is better to use nvml to get GPU infomation but due to windows
        # problem, temporarily get information with `nvidia-smi`.
        if gpu_memory_size is None or gpu_memory_size == 0:
            try:
                import subprocess
                # BUGFIX: the command was passed as ONE string with
                # shell=False, which on POSIX tries to exec the whole string
                # as a program name and always fails (silently hitting the
                # 6GB fallback). Pass an argument list instead — works on
                # both POSIX and Windows.
                smi_output = subprocess.check_output(
                    ['nvidia-smi',
                     '--query-gpu=index,memory.total',
                     '--format=csv']).decode()
                # Skip the CSV header, pick this GPU's row, take the
                # "memory.total" column, e.g. " 16280 MiB" -> 16280.
                total_mib = int(
                    smi_output.splitlines()[1:][gpu_index]
                    .split(',')[1].strip().split()[0])
                # Keep ~30% headroom for allocations outside the scheduler.
                gpu_memory_size = int(total_mib * (1024**2) * 0.7)
            except Exception:
                # Best effort: nvidia-smi may be missing or unparsable; fall
                # back to a fixed budget instead of aborting.
                logger.log(
                    99,
                    f'[OoC] Could not get GPU memory size using default value(6GB).'
                )
                gpu_memory_size = 6e9  # default 6 GB

        if window_length is None or window_length == 0:
            window_length = int(gpu_memory_size * 1.5)

        logger.log(
            99,
            f'[OoC] gpu_memory_limit: {gpu_memory_size / 1e9}GB, prefetch_window_length: {window_length / 1e9}GB'
        )
        # Change array preference so that lms works well.
        # import nnabla_ext.cuda.init as cuda_init
        # cuda_init.prefer_cpu_pinned_array()
        # cuda_init.prefer_cuda_virtual_array()
        from nnabla.ext_utils import get_extension_context
        be, tc = ctx.backend[0].split(":")
        cpu_ctx = get_extension_context("cpu", device_id="", type_config=tc)
        return SwapInOutScheduler(cpu_ctx, ctx, gpu_memory_size, window_length)

    # LMS disabled: return an inert stand-in with the same interface so the
    # training loop can use it unconditionally.
    class DummyScheduler(object):
        function_pre_hook = None
        function_post_hook = None
        update_pre_hook = None
        update_post_hook = None

        def start_scheduling(self):
            return None

        def end_scheduling(self):
            return None

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            pass

    return DummyScheduler()