def progress(state, progress=0.0):
    if len(state_file_name):
        global last_state_datetime
        # Throttle writes: update at most once per second, except for the
        # final update where state is None.
        if last_state_datetime < datetime.now() + timedelta(
                milliseconds=-1000) or state is None:
            last_state_datetime = datetime.now()
            retry = 1
            while True:
                try:
                    with open(state_file_name, 'w') as f:
                        if state is not None:
                            f.write(state +
                                    ' ({0:3.2f}%)'.format(progress * 100))
                    break
                except Exception:
                    # The file may be temporarily inaccessible (e.g. held by
                    # another process); retry for up to ten seconds.
                    retry += 1
                    if retry > 100:
                        logger.critical(
                            'Failed to write to {}.'.format(state_file_name))
                        raise
                    time.sleep(0.1)
            callback.update_progress(
                '{0} ({1:3.2f}%)'.format(state, progress * 100))
            if cg_load_backend_ok:
                callback.update_status()
    if state_callback is not None:
        state_callback(state, progress)
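
# Illustrative sketch (not part of the original module): the datetime guard in
# progress() throttles state-file writes to at most one per second, since
# datetime.now() + timedelta(milliseconds=-1000) is "one second ago". A
# minimal standalone equivalent of that check:
def _should_write_sketch(last_write):
    from datetime import datetime, timedelta
    # True when more than one second has elapsed since the last write,
    # mirroring the condition guarding the state-file update above.
    return last_write < datetime.now() + timedelta(milliseconds=-1000)
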
def measure_cpu_gpu_instant_load():
    # Get current cpu gpu load, as
    # load = [rank, cpu_load, nvidia_device_id, gpu_load]
    # result_arr: [load, load, ...]
    gpu_load = []
    if gpu_load_backend_ok:
        global gpu_a_load
        global gpu_m_count

        gpu_m_count += 1
        try:
            comm = current_communicator()
            if comm:
                index = comm.local_rank
            elif 'cuda' in str(nn.get_current_context().backend):
                index = 0
            else:
                raise Exception
            handler = pynvml.nvmlDeviceGetHandleByIndex(index)
            gpu_load = [[
                index, pynvml.nvmlDeviceGetUtilizationRates(handler).gpu
            ]]

            if index in gpu_a_load.keys():
                gpu_a_load[index]['name'] = pynvml.nvmlDeviceGetName(
                    handler).decode("utf-8")
                o_load = gpu_a_load[index]['load']
                n_load = gpu_load[0][1]
                gpu_a_load[index]['load'] = (
                    (gpu_m_count - 1) * o_load + n_load) / gpu_m_count
            else:
                gpu_a_load[index] = {
                    'name': pynvml.nvmlDeviceGetName(handler).decode("utf-8"),
                    'load': gpu_load[0][1]
                }
        except Exception:
            gpu_load = []

    if cpu_load_backend_ok:
        global p_handler
        cpu_load = p_handler.cpu_percent()
        callback.update_status(
            ('cpu_gpu_load', collect_and_shape_result(cpu_load, gpu_load)))
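
# Illustrative sketch (not part of the original module): gpu_a_load keeps an
# incremental mean of utilization readings. The update
#     avg_n = ((n - 1) * avg_{n-1} + x_n) / n
# equals the arithmetic mean of all n readings, so no history is stored.
def _running_mean_sketch(readings):
    avg = 0.0
    for n, x in enumerate(readings, start=1):
        # Same update as gpu_a_load[index]['load'] above.
        avg = ((n - 1) * avg + x) / n
    return avg

# _running_mean_sketch([10, 20, 30]) == 20.0
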
def train_command(args):
    callback.update_status(args)

    if single_or_rankzero():
        configure_progress(os.path.join(args.outdir, 'progress.txt'))

    info = load.load([args.config],
                     prepare_data_iterator=None,
                     exclude_parameter=True)

    # Check dataset uri is empty.
    dataset_error = False
    for dataset in info.datasets.values():
        if dataset.uri.strip() == '':
            dataset_error = True
    if dataset_error:
        logger.log(99, 'Fatal error. Dataset URI is empty.')
        return False

    class TrainConfig:
        pass

    config = TrainConfig()
    config.timelimit = -1
    if args.param:
        load.load([args.param], parameter_only=True)

    config.timelimit = callback.get_timelimit(args)

    config.global_config = info.global_config
    config.training_config = info.training_config

    if single_or_rankzero():
        logger.log(99, 'Train with contexts {}'.format(available_contexts))

    class OptConfig:
        pass

    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterators = []
        config.optimizers[name] = o

    class MonConfig:
        pass

    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterators = []
        config.monitors[name] = m

    # Training
    comm = current_communicator()
    config.training_config.iter_per_epoch //= comm.size if comm else 1
    max_iteration = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch

    global _save_parameter_info
    _save_parameter_info = {}
    _, config_ext = os.path.splitext(args.config)
    if config_ext == '.prototxt' or config_ext == '.nntxt':
        _save_parameter_info['config'] = args.config
    elif config_ext == '.nnp':
        with zipfile.ZipFile(args.config, 'r') as nnp:
            for name in nnp.namelist():
                _, ext = os.path.splitext(name)
                if ext == '.nntxt' or ext == '.prototxt':
                    nnp.extract(name, args.outdir)
                    _save_parameter_info['config'] = os.path.join(
                        args.outdir, name)

    result = False
    restart = False
    if max_iteration > 0:
        rng = np.random.RandomState(comm.rank if comm else 0)
        with ExitStack() as stack:
            # Create data_iterator instance only once for each dataset in optimizers
            optimizer_data_iterators = {}
            for name, o in config.optimizers.items():
                for di in o.optimizer.data_iterators.values():
                    if di not in optimizer_data_iterators:
                        di_instance = stack.enter_context(di())
                        if comm and comm.size > 1:
                            di_instance = di_instance.slice(
                                rng, comm.size, comm.rank)
                        optimizer_data_iterators[di] = di_instance
                    else:
                        di_instance = optimizer_data_iterators[di]
                    o.data_iterators.append(di_instance)

            # Create data_iterator instance only once for each dataset in monitors
            monitor_data_iterators = {}
            for name, m in config.monitors.items():
                for di in m.monitor.data_iterators.values():
                    if di not in monitor_data_iterators:
                        di_instance = stack.enter_context(di())
                        if comm and comm.size > 1:
                            di_instance = di_instance.slice(
                                rng, comm.size, comm.rank)
                        monitor_data_iterators[di] = di_instance
                    else:
                        di_instance = monitor_data_iterators[di]
                    m.data_iterators.append(di_instance)

            monitor_data_iterators.update(optimizer_data_iterators)
            result, restart = _train(args, config)
    else:
        # save parameters without training (0 epoch learning)
        logger.log(99, '0 epoch learning. (Just save parameter.)')
        if single_or_rankzero():
            _save_parameters(args, None, 0, config, True)
        result = True

    if single_or_rankzero() and not restart:
        if result:
            logger.log(99, 'Training Completed.')
            callback.update_status('finished')
        else:
            logger.log(99, 'Training Incomplete.')
            callback.update_status('failed')
    if single_or_rankzero():
        progress(None)
    return True
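
# Illustrative sketch (not part of the original module): under a multi-process
# communicator each rank slices out 1/size of every data iterator
# (di_instance.slice(rng, comm.size, comm.rank)), which is why iter_per_epoch
# is floor-divided by comm.size above. Hypothetical numbers:
def _per_rank_iters_sketch(dataset_size, batch_size, comm_size):
    iter_per_epoch = dataset_size // batch_size
    # Each rank walks its own shard, so one "epoch" needs 1/comm_size as
    # many iterations per process.
    return iter_per_epoch // comm_size

# _per_rank_iters_sketch(60000, 100, 4) == 150, vs. 600 on a single process.
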
def _train(args, config):
    global _save_parameter_info
    comm = current_communicator()
    _CGLOAD_LOG_INTERVAL = 20

    best_epoch = None
    best_error = None
    last_epoch = 0
    if args.resume:
        last_epoch, best_epoch, best_error = _get_current_parameter(args)
        if best_epoch is not None:
            logger.log(
                99, "Best error {} recorded at epoch {} in previous training.".
                format(best_error, best_epoch))
            if best_epoch > last_epoch:
                logger.log(
                    99, "Resumed epoch is {} but this training keeps that result.".
                    format(last_epoch))
        logger.log(99, "Resume from epoch {}".format(last_epoch + 1))

    callback.update_status(('epoch.max', config.training_config.max_epoch))
    callback.update_status(
        ('epoch.current', last_epoch + 1
         if last_epoch < config.training_config.max_epoch
         else config.training_config.max_epoch))

    max_iteration = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch
    if single_or_rankzero():
        logger.log(
            99, 'Training epoch {} of {} begin'.format(
                last_epoch + 1, config.training_config.max_epoch))

    class Cost:
        pass

    cost = Cost()
    cost.sum_epoch = 0.0
    cost.num_iteration = 0
    cost.sum_iteration = 0.0
    cost.variables = None

    class TimeInfo:
        pass

    timeinfo = TimeInfo()
    timeinfo.past_time = 0
    timeinfo.estimate_time = 0
    timeinfo.last_past_time = None

    if max_iteration > 0:
        last_iteration = last_epoch * config.training_config.iter_per_epoch
        if last_iteration < max_iteration:
            timeinfo.start_time = time.time()
            timeinfo.last_epoch_start_time = timeinfo.start_time

            callback.update_status('processing', True, timeinfo.start_time)

            for iteration in range(last_iteration, max_iteration):
                # instant load measurement
                measure_cpu_gpu_instant_load()

                cost = _update(iteration, config, cost)

                if np.isnan(cost.sum_epoch) or np.isinf(cost.sum_epoch):
                    logger.log(99, 'Cost is NaN')
                    return False, False

                timeinfo = _calc_estimate_time(timeinfo, max_iteration,
                                               last_iteration, iteration + 1)
                callback.update_time_train(prediction=timeinfo.estimate_time)

                if 0 < config.timelimit < timeinfo.estimate_time:
                    logger.log(
                        99,
                        'Expected training time ({:.3f}s) will exceed time limit ({}s).'
                        .format(timeinfo.estimate_time, config.timelimit))
                    return False, False

                if (iteration + 1) % config.training_config.iter_per_epoch == 0:
                    last_past_time = -1
                    # End of epoch
                    epoch = iteration // config.training_config.iter_per_epoch + 1
                    cost_avg_epoch = cost.sum_epoch / cost.num_iteration \
                        if cost.num_iteration else 0
                    cost.sum_epoch = 0.0
                    cost.num_iteration = 0
                    monitoring_report = []

                    # Evaluation
                    error_str = ''
                    if epoch % config.training_config.monitor_interval == 0 \
                            or epoch <= 5:
                        best_error, error_str = _evaluate(
                            args, config, monitoring_report, best_error, epoch)

                    # Cpu/Gpu average load
                    cg_load_str = ''
                    cgload_log = ''
                    cg_load = get_cpu_gpu_average_load()
                    if cg_load:
                        cg_load_str = 'epoch {} average_load_matrix: {}'.format(
                            epoch, cg_load)
                        span = _calc_epoch_span(timeinfo)
                        if span > _CGLOAD_LOG_INTERVAL:
                            cgload_log = _format_cgload_log(cg_load)

                    if single_or_rankzero():
                        # Write to monitoring_report.yml
                        f = open(
                            os.path.join(args.outdir, 'monitoring_report.yml'),
                            'a')
                        f.write('{}:\n'.format(epoch - 1))
                        f.write('  cost: {}\n'.format(cost_avg_epoch))
                        for s in monitoring_report:
                            f.write(s)
                        f.close()

                        callback.update_status(
                            (['monitoring_report', epoch, 'cost'],
                             cost_avg_epoch))

                        _save_parameters(args, 'current', epoch, config)

                        callback.update_status(('epoch.current', epoch))
                        callback.update_status()

                        logger.log(
                            99,
                            'epoch {} of {} cost={:.6f} {} time=({:.1f}s /{:.1f}s) {}'
                            .format(epoch, config.training_config.max_epoch,
                                    cost_avg_epoch, error_str,
                                    timeinfo.past_time, timeinfo.estimate_time,
                                    cgload_log))

                        if cg_load_str:
                            # cpu_gpu_average_load record at epoch level
                            callback.update_status(
                                (['cpu_gpu_epoch_load', epoch], cg_load))
                            progress(cg_load_str, 1)

                        if not callback.check_training_time(
                                args, config, timeinfo, epoch, last_epoch):
                            _save_parameters(args, 'current', epoch, config,
                                             True)
                            return False, True

            if single_or_rankzero():
                _save_parameters(args, 'current', epoch, config, True)
    return True, False
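
# Illustrative sketch (hypothetical; _calc_estimate_time is defined elsewhere
# in this module): the time-limit check above only needs a linear
# extrapolation of total wall time from the iterations completed so far,
# along the lines of:
def _estimate_total_time_sketch(past_time, max_iteration, last_iteration,
                                current_iteration):
    done = current_iteration - last_iteration
    # Projected total seconds for the whole run at the current pace.
    return past_time * (max_iteration - last_iteration) / done

# 100 of 1000 iterations done in 50 s projects 50 * 1000 / 100 = 500 s total.
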
def _evaluate(args, config, monitoring_report, best_error, epoch):
    comm = current_communicator()
    error_str = ''
    valid_error = 0.0

    def _sum_error(sum, error):
        ret = None
        if comm:
            # logger.log(99, "Calc error with communicator")
            var = [nn.NdArray()]
            var[0].data = error
            _all_reduce(comm, var, division=False, inplace=True)
            ret = sum + var[0].data
        else:
            ret = sum + error
        return ret

    for name, mon in config.monitors.items():
        m = mon.monitor
        error_sum_monitor = 0.0
        error_count = 0
        data_size = max([di.size for di in mon.data_iterators])
        batch_size = max([di.batch_size for di in mon.data_iterators])

        for i in range(data_size // batch_size):
            # Load dataset
            data = OrderedDict()
            for di in mon.data_iterators:
                data.update(zip(di.variables, di.next()))

            # Set data to variable
            for v, d in m.dataset_assign.items():
                dest_context = config.global_config.default_context \
                    if not m.forward_sequence or \
                    v not in m.forward_sequence[0].inputs else None
                let_data_to_variable(v.variable_instance, data[d],
                                     ctx=dest_context,
                                     data_name=d, variable_name=v.name)

            # Generate data
            for v, generator in m.generator_assign.items():
                dest_context = config.global_config.default_context \
                    if not m.forward_sequence or \
                    v not in m.forward_sequence[0].inputs else None
                let_data_to_variable(v.variable_instance,
                                     data=generator(v.shape),
                                     ctx=dest_context,
                                     variable_name=v.name)

            # Sum error before forward to prepare input data while processing
            # on GPU
            if error_count > 0:
                error_sum = 0.0
                for v in m.monitor_variables:
                    error_sum += np.mean(v.variable_instance.d)
                    # v.variable_instance.data.zero()
                error_sum_monitor = _sum_error(error_sum_monitor, error_sum)
                if single_or_rankzero():
                    progress('Evaluating "{0}"'.format(name) +
                             ' : error={0:0.6f}'.format(
                                 error_sum_monitor / error_count),
                             di.position * 1.0 / di.size)
            error_count += comm.size if comm else 1

            # Forward recursive
            m.network.forward(m.forward_sequence)

        # Sum error at the end of dataset
        error_sum = 0.0
        for v in m.monitor_variables:
            error_sum += np.mean(v.variable_instance.d)
            # v.variable_instance.data.zero()
        error_sum_monitor = _sum_error(error_sum_monitor, error_sum)

        if error_count == 0:
            error = 0
        else:
            error = error_sum_monitor / error_count

        if np.isnan(error) or np.isinf(error):
            logger.log(99, 'Validation error is NaN')
            error = 0.0

        monitoring_report.append('  {}: {}\n'.format(name, error))

        callback.update_status((['monitoring_report', epoch, name], error))
        callback.update_status((['last', name], error))  # save last value

        if error_str != '':
            error_str += ', '
        else:
            error_str = ' {'
        error_str += '{}={:.6f}'.format(name, error)
        if name == 'valid_error':
            valid_error = error

    if error_str != '':
        error_str += '}'

    # Save Parameters
    if single_or_rankzero():
        if (not config.training_config.save_best) or \
           (not best_error) or \
           (best_error is not None and valid_error <= best_error):
            best_error = valid_error
            callback.update_status(('best.valid_error', best_error))
            callback.update_status(('best.epoch', epoch))
            _save_parameters(args, 'best', epoch, config, True)

    return best_error, error_str
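
# Illustrative numeric example (hypothetical, two ranks): _sum_error
# all-reduces the per-rank error sums while error_count advances by comm.size
# per minibatch, so error_sum_monitor / error_count stays a mean over every
# rank's samples:
#
#   rank 0 batch errors: 0.2, 0.4    rank 1 batch errors: 0.3, 0.5
#   all-reduced sums per step: 0.5 and 0.9 -> error_sum_monitor = 1.4
#   error_count = 2 ranks * 2 steps = 4  -> reported error = 1.4 / 4 = 0.35
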
def forward_command(args):
    callback.update_status(args)

    configure_progress(os.path.join(args.outdir, 'progress.txt'))
    files = []
    files.append(args.config)
    if args.param:
        files.append(args.param)
    batch_size = args.batch_size
    if batch_size < 1:
        batch_size = None

    class ForwardConfig:
        pass

    config = ForwardConfig()
    info = load.load(files, prepare_data_iterator=False,
                     batch_size=batch_size)
    config.global_config = info.global_config

    config.executors = info.executors.values()

    config.networks = []
    for e in config.executors:
        if e.network.name in info.networks.keys():
            config.networks.append(info.networks[e.network.name])
        else:
            logger.critical('Network {} is not found.'.format(
                e.network.name))
            return False

    normalize = True
    for d in info.datasets.values():
        if d.uri == args.dataset or d.cache_dir == args.dataset:
            normalize = d.normalize
    for e in config.executors:
        normalize = normalize and not e.no_image_normalization

    orders = {}
    # With CSV
    if os.path.splitext(args.dataset)[1] == '.csv':
        data_iterator = (lambda: data_iterator_csv_dataset(
            uri=args.dataset,
            batch_size=config.networks[0].batch_size,
            shuffle=False,
            normalize=normalize,
            with_memory_cache=False,
            with_file_cache=False))

        # load dataset as csv
        filereader = FileReader(args.dataset)
        with filereader.open(textmode=True, encoding='utf-8-sig') as f:
            rows = [row for row in csv.reader(f)]
        row0 = rows.pop(0)
        if args.replace_path:
            root_path = os.path.dirname(args.dataset)
            # Normalize both separator styles to the platform separator.
            root_path = os.path.abspath(
                root_path.replace('/', os.path.sep).replace(
                    '\\', os.path.sep))
        else:
            root_path = '.'
        rows = [row for row in rows if len(row)]
        rows = list(
            map(
                lambda row: list(
                    map(
                        lambda i, x: x if row0[i][0] == '#' or is_float(
                            x) else compute_full_path(root_path, x),
                        range(len(row)), row)), rows))
        for i in range(len(rows)):
            orders[i] = i
    # With Cache
    elif os.path.splitext(args.dataset)[1] == '.cache':
        data_iterator = (lambda: data_iterator_cache(
            uri=args.dataset,
            batch_size=config.networks[0].batch_size,
            shuffle=False,
            normalize=normalize))

        # Get original CSV
        original_csv = os.path.join(args.dataset, 'original.csv')
        try:
            # load dataset as csv
            filereader = FileReader(original_csv)
            with filereader.open(textmode=True, encoding='utf-8-sig') as f:
                rows = [row for row in csv.reader(f)]
            row0 = rows.pop(0)
            root_path = '.'
            rows = list(
                map(
                    lambda row: list(
                        map(
                            lambda x: x if is_float(x) else compute_full_path(
                                root_path, x), row)), rows))
        except Exception:
            print('Cannot open', original_csv)

        # Get original Data order.
        order_csv = os.path.join(args.dataset, 'order.csv')
        try:
            filereader = FileReader(order_csv)
            with filereader.open(textmode=True) as f:
                for original, shuffled in [[int(x) for x in row]
                                           for row in csv.reader(f)]:
                    orders[original] = shuffled
        except Exception:
            print('Cannot open', order_csv)
            for i in range(len(rows)):
                orders[i] = i
    else:
        print('Unsupported extension "{}" in "{}".'.format(
            os.path.splitext(args.dataset)[1], args.dataset))
        # Without a dataset, 'rows' and 'data_iterator' are undefined.
        return False

    callback.update_status(('data.max', len(rows)))
    callback.update_status(('data.current', 0))
    callback.update_status('processing', True)

    result_csv_filename = os.path.join(args.outdir, args.outfile)
    with open(result_csv_filename, 'w', encoding='utf-8') as f:
        writer = csv.writer(f, lineterminator='\n')
        with data_iterator() as di:
            index = 0
            while index < di.size:
                data = di.next()
                result, outputs = _forward(args, index, config, data,
                                           di.variables)
                if index == 0:
                    # Add result column headers on the first batch.
                    for name, dim in zip(result.names, result.dims):
                        if dim == 1:
                            if e.repeat_evaluation_type == "std":
                                name = "Uncertainty(Std)"
                            row0.append(name)
                        else:
                            for d in range(dim):
                                row0.append(name + '__' + str(d))
                    writer.writerow(row0)
                for i, output in enumerate(outputs):
                    if index + i < len(rows):
                        import copy
                        row = copy.deepcopy(rows[orders[index + i]])
                        row.extend(output)
                        writer.writerow(row)
                index += len(outputs)

                callback.update_status(('data.current',
                                        min([index, len(rows)])))
                callback.update_forward_time()
                callback.update_status()

                logger.log(
                    99, 'data {} / {}'.format(min([index, len(rows)]),
                                              len(rows)))

    callback.process_evaluation_result(args.outdir, result_csv_filename)

    logger.log(99, 'Forward Completed.')
    progress(None)

    callback.update_status(('output_result.csv_header', ','.join(row0)))
    callback.update_status(('output_result.column_num', len(row0)))
    callback.update_status(('output_result.data_num', len(rows)))
    callback.update_status('finished')

    return True
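
# Illustrative sketch (hypothetical data, not part of the original module):
# for cache datasets, order.csv holds (original, shuffled) index pairs, so
# cache outputs can be matched back to rows of the source CSV:
#
#   order.csv rows: (0, 2), (1, 0), (2, 1)  ->  orders = {0: 2, 1: 0, 2: 1}
#   the i-th output of a batch starting at `index` is appended to
#   rows[orders[index + i]] before being written to the result CSV.
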
def profile_command(args):
    callback.update_status(args)

    configure_progress(os.path.join(args.outdir, 'progress.txt'))

    class TrainConfig:
        pass

    config = TrainConfig()
    info = load.load(args.config)

    config.global_config = info.global_config
    config.training_config = info.training_config

    class OptConfig:
        pass

    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterators = []
        config.optimizers[name] = o

    class MonConfig:
        pass

    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterators = []
        config.monitors[name] = m

    ext_module = import_extension_module(
        config.global_config.default_context.backend[0].split(':')[0])

    def synchronize():
        return ext_module.synchronize(
            device_id=config.global_config.default_context.device_id)

    result_array = [['time in ms']]

    callback.update_status('processing', True)

    # Profile Optimizer
    with ExitStack() as stack:
        # Create data_iterator instance only once for each dataset in optimizers
        optimizer_data_iterators = {}
        for name, o in config.optimizers.items():
            for di in o.optimizer.data_iterators.values():
                if di not in optimizer_data_iterators:
                    di_instance = stack.enter_context(di())
                    optimizer_data_iterators[di] = di_instance
                else:
                    di_instance = optimizer_data_iterators[di]
                o.data_iterators.append(di_instance)
        result_array = profile_optimizer(config, result_array, synchronize)

    # Write profiling result
    import csv
    with open(os.path.join(args.outdir, 'profile.csv'), 'w') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerows(result_array)

    logger.log(99, 'Profile Completed.')
    progress(None)
    callback.update_status('finished')
    return True
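
# Illustrative sketch (not part of the original module): GPU work is launched
# asynchronously, so wall-clock numbers are only meaningful after a device
# synchronize. The synchronize() closure above is handed to profile_optimizer
# for that purpose; a generic timing pattern looks like:
def _timed_ms_sketch(f, synchronize, n=100):
    import time
    synchronize()  # drain pending work before starting the clock
    start = time.time()
    for _ in range(n):
        f()
    synchronize()  # ensure all n runs actually finished
    return (time.time() - start) * 1000 / n  # mean time per run, in ms
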