def load(self):
    """Load all saved numpy arrays (memory-mapped) and build the sample index.

    No-op if the dataset is already loaded. Arrays are opened with
    mmap_mode='r' so samples are read lazily instead of pulled into RAM.
    Uses a saved index file when present; otherwise builds a sequential index.
    """
    # Prevent double-loading the dataset
    if self.is_loaded:
        return

    # sorted() already returns a list, so no extra list() wrapper is needed
    data_files = sorted(iterate_files(self.folder, pattern=f'.*{self.extension}'))
    if not data_files:
        print('WARNING: No data files found.')
        return

    self._arrays = [np.load(data_file, mmap_mode='r') for data_file in data_files]

    # Each sample occupies len(self._fields) consecutive rows, so the number
    # of samples per array is the row count floor-divided by the field count.
    self._array_lengths = [len(arr) // len(self._fields) for arr in self._arrays]
    self.set_length(sum(self._array_lengths))
    self._ids = list(range(self.length))

    # Retrieve saved index or build sequential index if none given
    index_file = os.path.join(self.folder, INDEX_FILE)
    if os.path.exists(index_file):
        self._index = read_by_file_suffix(index_file)
        self._ids = sorted(self._index.keys())
    else:
        for sample_id in self._ids:
            self._index[sample_id] = self._get_array_index(sample_id)

    self.set_loaded(True)
def save(comparison: List[float], baseline_log_path: str):
    """Record summary statistics of an energy comparison, keyed by the baseline's name.

    Args:
        comparison: Raw per-sample comparison values to summarize.
        baseline_log_path: Path of the baseline log; its folder receives the
            merged comparison log and its file name supplies the entry key.
    """
    output_folder, baseline_file = os.path.split(baseline_log_path)

    # Load the existing comparison log when present; start fresh otherwise.
    comparison_log_path = os.path.join(output_folder, 'energy_comparison.jsonl.gz')
    comparison_log = dict()
    if os.path.exists(comparison_log_path):
        comparison_log = list(read_by_file_suffix(comparison_log_path))[0]

    # Key the results by "<MODEL> <POLICY>", both parsed from the file name.
    tokens = baseline_file.split('-')
    policy, model = tokens[1].upper(), tokens[2].upper()
    key = '{0} {1}'.format(model, policy)

    comparison_log[key] = {
        'mean': np.average(comparison),
        'std': np.std(comparison),
        'median': np.median(comparison),
        'raw': comparison
    }

    save_by_file_suffix([comparison_log], comparison_log_path)
def model_test(path: str, batch_size: Optional[int], max_num_batches: Optional[int],
               dataset_folder: Optional[str], series: str):
    """Evaluate a saved model, recovering hyper-parameters (and, if needed,
    the dataset folder) from files next to the model.

    Args:
        path: Path to the serialized model file.
        batch_size: Optional batch size override for evaluation.
        max_num_batches: Optional cap on the number of evaluation batches.
        dataset_folder: Optional dataset location; recovered from training
            metadata when omitted.
        series: Name of the data series to evaluate on (e.g. 'test').
    """
    save_folder, model_file = os.path.split(path)

    model_name = extract_model_name(model_file)
    assert model_name is not None, f'Could not extract name from file: {model_file}'

    # Extract hyperparameters
    hypers_path = os.path.join(save_folder, HYPERS_PATH.format(model_name))
    hypers = HyperParameters.create_from_file(hypers_path)

    # When no dataset folder was supplied, recover it from the training metadata
    if dataset_folder is None:
        metadata_file = os.path.join(save_folder, METADATA_PATH.format(model_name))
        metadata = read_by_file_suffix(metadata_file)
        train_folder = metadata['data_folders'][TRAIN.upper()]
        dataset_folder, _ = os.path.split(train_folder)

    assert os.path.exists(dataset_folder), f'The folder {dataset_folder} does not exist!'

    test(model_name=model_name,
         dataset_folder=dataset_folder,
         save_folder=save_folder,
         hypers=hypers,
         batch_size=batch_size,
         max_num_batches=max_num_batches,
         series=DataSeries[series.upper()])
def data_generator(data_folder: str) -> Iterable[Dict[str, Any]]:
    """Yield every sample stored in the `.jsonl.gz` archives under a folder.

    Args:
        data_folder: Directory containing the compressed data files.

    Yields:
        One (unmodified) sample dictionary at a time.
    """
    # Removed a stale block of commented-out sub-sampling code that was
    # dead weight here; samples are yielded as-is.
    for data_file in iterate_files(data_folder, pattern=r'.*jsonl.gz'):
        for sample in read_by_file_suffix(data_file):
            yield sample
def restore(self, name: str, is_train: bool, is_frozen: bool): """ Restore model metadata, hyper-parameters, and trainable parameters. """ # Restore hyperparameters params_path = os.path.join(self.save_folder, HYPERS_PATH.format(name)) self.hypers = HyperParameters.create_from_file(params_path) # Restore metadata metadata_path = os.path.join(self.save_folder, METADATA_PATH.format(name)) train_metadata = read_by_file_suffix(metadata_path) self.metadata = train_metadata['metadata'] # Build the model self.make(is_train=is_train, is_frozen=is_frozen) # Initialize all variables (some may not be trainable) self.init() # Restore the trainable parameters with self.sess.graph.as_default(): model_path = os.path.join(self.save_folder, MODEL_PATH.format(name)) vars_dict = read_by_file_suffix(model_path) # Collect all saved variables assign_ops = [] for trainable_var in self.trainable_vars: saved_value = vars_dict.get(trainable_var.name) if saved_value is None: print('WARNING: No value for {0}'.format( trainable_var.name)) else: assign_op = trainable_var.assign(saved_value, use_locking=True, read_value=False) assign_ops.append(assign_op) # Execute assignment self.sess.run(assign_ops) if is_frozen: self.freeze()
def load(self):
    """Read every matching data file into the in-memory dataset, exactly once."""
    # Already loaded -> nothing to do
    if self.is_loaded:
        return

    # Extend the dataset with the contents of each file, in sorted file order
    for data_file in sorted(iterate_files(self.folder, pattern=f'.*{self.extension}')):
        self._dataset.extend(read_by_file_suffix(data_file))

    self.set_length(len(self._dataset))
    self._ids = list(range(self.length))
    self.set_loaded(True)
def count_samples(data_folder: str, file_type: str, num_fields: int):
    """Counts the number of samples in the given archive and prints the total.

    Args:
        data_folder: Folder holding the data files.
        file_type: File extension; 'npz' archives are counted as rows / fields,
            any other type is treated as an iterable of samples.
        num_fields: Number of array entries that make up one sample in an
            npz archive.
    """
    count = 0
    # Raw f-string so '\.' reaches the regex engine as a literal dot
    # (the original non-raw string triggered an invalid-escape warning).
    for data_file in iterate_files(data_folder, pattern=rf'.*\.{file_type}'):
        data = read_by_file_suffix(data_file)

        if file_type == 'npz':
            # npz archives store each sample as `num_fields` consecutive entries
            count += len(data) // num_fields
        else:
            count += sum(1 for _ in data)

    print(f'Total number of samples: {count}')
def merge_datasets(folders: List[str], output_folder: str, file_prefix: str,
                   file_suffix: str, chunk_size: int):
    """Merge the samples from multiple dataset folders into one chunked dataset.

    Samples are re-numbered with sequential ids so the merged dataset has a
    consistent SAMPLE_ID field.

    Args:
        folders: Input folders whose data files should be merged.
        output_folder: Destination folder for the merged dataset.
        file_prefix: Prefix for the output file names.
        file_suffix: File suffix used both to match inputs and name outputs.
        chunk_size: Number of samples per output chunk.
    """
    with DataWriter(output_folder, file_prefix=file_prefix, file_suffix=file_suffix, chunk_size=chunk_size) as writer:
        data_files = chain(*(iterate_files(folder, pattern=f'.*{file_suffix}') for folder in folders))

        sample_id = 0
        for data_file in data_files:
            for sample in read_by_file_suffix(data_file):
                sample[SAMPLE_ID] = sample_id
                writer.add(sample)
                sample_id += 1

                # After the increment, sample_id equals the number of samples
                # written, so report progress on exact chunk boundaries. The
                # original tested (sample_id + 1) here, overstating the count
                # by one and firing one sample early.
                if sample_id % chunk_size == 0:
                    print('Completed {0} samples.'.format(sample_id), end='\r')

        print()
def save_test_log(accuracy: float, power: float, valid_accuracy: Optional[float],
                  budget: float, system_name: str, key: str, output_file: str):
    """Append one evaluation record to the (possibly pre-existing) test log file.

    Args:
        accuracy: Test accuracy of the system.
        power: Average power consumption observed during the run.
        valid_accuracy: Optional validation accuracy.
        budget: Energy budget this record corresponds to.
        system_name: Human-readable system identifier stored in the record.
        key: Top-level log key under which the record is filed.
        output_file: Path of the log file to read from and write back to.
    """
    # Start from the existing log when one is already on disk
    test_log: Dict[str, Dict[str, Any]] = dict()
    if os.path.exists(output_file):
        test_log = list(read_by_file_suffix(output_file))[0]

    test_log.setdefault(key, dict())

    # Budgets are keyed by a fixed-precision string for stable lookups
    budget_str = '{0:.4f}'.format(budget)
    test_log[key][budget_str] = {
        'ACCURACY': accuracy,
        'AVG_POWER': power,
        'VALID_ACCURACY': valid_accuracy,
        'BUDGET': budget,
        'SYSTEM_NAME': system_name
    }

    save_by_file_suffix([test_log], output_file)
# Script: accumulate training wall-clock time across all model train logs
# in a folder.
parser = ArgumentParser()
parser.add_argument(
    '--input-folder',
    type=str,
    required=True,
    help='Folder containing RNN models (of the same type) to measure.')
args = parser.parse_args()

# Accumulators for total training time and per-log start times
total_time = timedelta()
times_list: List[datetime] = []
model_count = 0

# NOTE(review): this chunk appears truncated -- total_time, model_count and
# time_delta are set up / computed here but their consumers are not visible.
for train_log_path in iterate_files(
        args.input_folder, pattern=r'.*model-train-log-.*\.pkl\.gz'):
    train_log = read_by_file_suffix(train_log_path)

    if 'start_time' not in train_log:
        # Older logs lack timestamps; fall back to the timestamp embedded
        # in the log file name.
        match = TIME_REGEX.match(train_log_path)
        start_date = datetime.strptime(match.group(1), '%Y-%m-%d-%H-%M-%S')
        times_list.append(start_date)
    else:
        start_time, end_time = train_log['start_time'], train_log[
            'end_time']
        start_date = datetime.strptime(start_time, '%Y-%m-%d-%H-%M-%S')
        end_date = datetime.strptime(end_time, '%Y-%m-%d-%H-%M-%S')
        time_delta = (end_date - start_date)
# --- Tail of a plotting routine; its `def` lies above this chunk ---

# Set gridline to denote the x-axis
ax.axhline(0, linestyle='-', color='k', linewidth=1)

# Create a vertical line to denote the `All` category
ax.axvline((xs[-2] + xs[-1]) / 2, linestyle='--', color='k', linewidth=0.5)

ax.legend(fontsize=16)
ax.set_title('Mean Normalized Budget Required for Accuracy Equal to the Budget RNN', fontsize=22)
ax.set_xlabel('Dataset', fontsize=18)
ax.set_ylabel('Mean Normalized Energy Budget', fontsize=18)

plt.tight_layout()

if output_file is None:
    plt.show()
else:
    # Fix: matplotlib's keyword is `bbox_inches`, not `bbox_type` -- the
    # original argument was not a valid savefig parameter.
    plt.savefig(output_file, bbox_inches='tight', transparent=True)


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--input-folders', type=str, required=True, nargs='+', help='Paths to the merge simulation results')
    parser.add_argument('--output-file', type=str, help='Optional output file to save the plot')
    args = parser.parse_args()

    # Derive a readable dataset name from each folder path (handles a
    # trailing '/' by falling back to the second-to-last path component)
    dataset_names = [t.split('/')[-1] if len(t.split('/')[-1]) > 0 else t.split('/')[-2] for t in args.input_folders]

    # Each folder contributes one merged energy-comparison log
    comparison_logs = [list(read_by_file_suffix(os.path.join(log, 'energy_comparison.jsonl.gz')))[0] for log in args.input_folders]

    merged = merge(comparison_logs, datasets=dataset_names)
    plot(merged, output_file=args.output_file)
out_file.write("#pragma PERSISTENT(LABEL_COUNTS)\n") out_file.write(label_counts_variable + '\n') out_file.write('\n\n'.join(weight_variables)) out_file.write('\n#endif\n') if __name__ == '__main__': parser = ArgumentParser( 'Compresses the neural network and converts the parameters into a C header file.' ) parser.add_argument('--model-path', type=str, required=True) parser.add_argument('--sensor-type', type=str, required=True, choices=['bluetooth', 'temperature']) parser.add_argument('--precision', type=int, required=True) parser.add_argument('--msp', action='store_true') args = parser.parse_args() assert args.precision > 0 and args.precision < 16, 'The precision must be in [1, 15]' model_parameters = read_by_file_suffix(args.model_path) convert_network(args.model_path, model_parameters, precision=args.precision, sensor_type=args.sensor_type, is_msp=args.msp)
# NOTE(review): chunk begins mid-call -- these are the trailing keyword
# arguments of a system-construction call whose opening lies above this view.
seq_length=seq_length, num_levels=num_levels, num_classes=num_classes)

# Register the adaptive system under its budget-RNN key
key = 'BUDGET_RNN({0}) ADAPTIVE'.format(model.stride_length)
adaptive_result_dict[key] = test_results
adaptive_system_dict[key] = system
power_system_dict[key] = power_system

# Make the noise generator to get the log key
noise_params = dict(noise_type='gaussian', loc=0.0, scale=0.05)
noise_generator = list(get_noise_generator(noise_params, max_time=max_time))[0]
noise_type = str(noise_generator)

# Load the adaptive testing log; entries are keyed by the noise generator's
# string form
adaptive_log = list(read_by_file_suffix(args.adaptive_log))[0]
adaptive_results = adaptive_log[noise_type]

for baseline_log_file in args.baseline_logs:
    # Load the baseline testing log
    baseline_log = list(read_by_file_suffix(baseline_log_file))[0]
    baseline_results = baseline_log[noise_type]

    if args.should_print:
        log_file_name = os.path.split(baseline_log_file)[1]
        print('==========')
        print('Starting Comparison To {0}'.format(log_file_name))

    # Perform the comparison
    # NOTE(review): this call is truncated at the end of the chunk
    energy_diff = energy_comparison(adaptive_results=adaptive_results,
                                    baseline_results=baseline_results,
# NOTE(review): chunk begins mid-expression -- the left-hand side of this
# clipped step-count computation lies above this view.
((budget * max_time) / fixed_power).astype(int), max_time)  # [S]
# Scale the fixed model's accuracy by the fraction of time steps it can
# afford under the budget
adjusted_fixed_accuracy = (fixed_accuracy * time_steps) / max_time  # [S]
fixed_valid_accuracy[budget] = adjusted_fixed_accuracy

adaptive_model_accuracy.append(adaptive_valid_accuracy)
fixed_model_accuracy.append(fixed_valid_accuracy)

# Get the simulation logs (one per policy type) for this model
adaptive_log_file_name = LOG_FILE_FMT.format('adaptive', model_name,
                                             args.power_system_type)
adaptive_log_path = os.path.join(args.log_folder, adaptive_log_file_name)
adaptive_log = list(read_by_file_suffix(adaptive_log_path))[0]
adaptive_logs.append(adaptive_log)

fixed_budget_log_file_name = LOG_FILE_FMT.format(
    'fixed_under_budget', model_name, args.power_system_type)
fixed_budget_log_path = os.path.join(args.log_folder,
                                     fixed_budget_log_file_name)
fixed_budget_log = list(read_by_file_suffix(fixed_budget_log_path))[0]
fixed_budget_logs.append(fixed_budget_log)

randomized_log_file_name = LOG_FILE_FMT.format('randomized', model_name,
                                               args.power_system_type)
randomized_log_path = os.path.join(args.log_folder,
                                   randomized_log_file_name)
# NOTE(review): chunk ends here -- the randomized log is loaded but its
# append to a result list is not visible.
randomized_log = list(read_by_file_suffix(randomized_log_path))[0]
def convert_network(model_path: str, model_parameters: Dict[str, np.ndarray],
                    precision: int, sensor_type: str, is_msp: bool):
    """Convert a trained sequence model into the C header 'neural_network_parameters.h'.

    Emits fixed-point weight matrices, Sample-RNN controller thresholds /
    budgets / label counts, per-level energy estimates, and the compile-time
    constants expected by the embedded inference code.

    Args:
        model_path: Path to the serialized model; used to locate the
            hyper-parameter, metadata and (optional) controller files.
        model_parameters: Mapping of trainable-variable name -> weight array.
        precision: Number of fractional bits for fixed-point conversion.
        sensor_type: Power-system type ('bluetooth' or 'temperature').
        is_msp: Whether to emit MSP430-specific output (#pragma PERSISTENT,
            two-column vectors).
    """
    # Extract the model meta-data and hyper-parameters
    hypers = get_hyperparameters(model_path)
    metadata = get_metadata(model_path)
    model_type = SequenceModelType[hypers.model_params['model_type'].upper()]

    # Holds a list of C variable declarations for each trainable parameter
    weight_variables: List[str] = []

    # Estimate LEA RAM consumption for model weights
    lea_ram_estimation = 0

    # Create C declarations for all trainable variables
    for var_name, var_value in model_parameters.items():
        layer_name, weight_type = parse_variable_name(var_name)

        c_declaration = None
        if layer_name in (EMBEDDING_NAME, STOP_PREDICTION, OUTPUT_LAYER_NAME, RNN_CELL_NAME, AGGREGATION_NAME):
            c_declaration = weight_matrix_conversion(layer_name, weight_type, var_value, precision=precision, is_msp=is_msp)
        else:
            raise ValueError('Unknown layer name: {0}'.format(layer_name))

        # 2 bytes per element: LEA operands are 16-bit fixed-point values
        if isinstance(var_value, np.ndarray) and should_use_lea_ram(var_value):
            lea_ram_estimation += 2 * var_value.shape[0] * var_value.shape[1]

        weight_variables.append(c_declaration)

    # Extract meta-data and hyper-parameters to create the computational graph
    state_size = hypers.model_params['state_size']
    num_outputs = hypers.model_params.get('num_outputs', 1)
    stride_length = hypers.model_params.get('stride_length', 1)
    rnn_cell_type = hypers.model_params['rnn_cell_type']
    seq_length = metadata[SEQ_LENGTH]
    num_input_features = metadata[INPUT_SHAPE][0]

    if hypers.model_params['output_type'] == 'multi_classification':
        num_output_features = metadata[NUM_CLASSES]
    else:
        num_output_features = metadata[NUM_OUTPUT_FEATURES]

    # Defaults used when no Sample-RNN controller file is available
    thresholds = np.zeros(shape=(1, num_outputs))
    budgets = np.array([0])
    label_counts = np.zeros(shape=(1, 1, num_outputs))
    avg_energy = np.zeros(shape=(1, ))

    # Get thresholds for Sample RNNs
    if model_type == SequenceModelType.SAMPLE_RNN:
        save_folder, model_file_name = os.path.split(model_path)
        model_name = extract_model_name(model_file_name)
        controller_path = os.path.join(
            save_folder, CONTROLLER_PATH.format(sensor_type, model_name))

        if os.path.exists(controller_path):
            controller_info = read_by_file_suffix(controller_path)
            thresholds = controller_info['thresholds']
            budgets = controller_info['budgets']

            # Estimate the energy level for each threshold set
            avg_level_counts = controller_info['avg_level_counts']
            power_system = make_power_system(mode=PowerType[sensor_type.upper()],
                                             num_levels=num_outputs,
                                             seq_length=seq_length)
            power_estimates = power_system.get_power_estimates()
            avg_energy = np.sum(avg_level_counts * np.expand_dims(power_estimates, axis=0), axis=-1)
            # (removed a stray debug `print(avg_energy)` left from development)
            avg_energy = tensor_to_fixed_point(avg_energy, precision=precision)

            # Stack the per-budget label distributions into a single int tensor
            distribution = controller_info['label_distribution']
            label_counts_lst: List[np.ndarray] = []
            for budget in budgets:
                budget_counts: List[np.ndarray] = []
                for _, counts in sorted(distribution[budget].items()):
                    budget_counts.append(np.expand_dims(counts, axis=0))

                label_counts_lst.append(
                    np.expand_dims(np.vstack(budget_counts), axis=0))

            label_counts = np.vstack(label_counts_lst).astype(int)

    # Get the power estimates
    num_levels = num_outputs if model_type == SequenceModelType.SAMPLE_RNN else seq_length
    power_system = make_power_system(mode=PowerType[sensor_type.upper()],
                                     num_levels=num_levels,
                                     seq_length=seq_length)
    power_estimates = power_system.get_power_estimates()  # [L]
    power_estimates = tensor_to_fixed_point(power_estimates, precision=precision)

    with open('neural_network_parameters.h', 'w') as out_file:
        # Include necessary header files
        out_file.write('#include <stdint.h>\n')
        out_file.write('#include "math/matrix.h"\n\n')

        # Create header guard
        out_file.write('#ifndef NEURAL_NETWORK_PARAMS_GUARD\n')
        out_file.write('#define NEURAL_NETWORK_PARAMS_GUARD\n\n')

        # Create constants used during graph construction
        out_file.write(create_constant('FIXED_POINT_PRECISION', precision))
        out_file.write(create_constant('STATE_SIZE', state_size))
        out_file.write(create_constant('NUM_INPUT_FEATURES', num_input_features))
        out_file.write(create_constant('NUM_OUTPUT_FEATURES', num_output_features))
        out_file.write(create_constant('SEQ_LENGTH', seq_length))
        out_file.write(create_constant('NUM_OUTPUTS', num_outputs))
        out_file.write(create_constant('STRIDE_LENGTH', stride_length))
        out_file.write(create_constant('SAMPLES_PER_SEQ', seq_length // num_outputs))
        out_file.write(create_constant('NUM_BUDGETS', len(budgets)))
        out_file.write(create_constant('{0}_TRANSFORM'.format(rnn_cell_type.upper()), value=None))
        out_file.write(create_constant('IS_{0}'.format(model_type.name.upper()), value=None))

        if 'on_fraction' in hypers.model_params:
            on_fraction = float_to_fixed_point(hypers.model_params['on_fraction'], precision)
            out_file.write(create_constant('ON_FRACTION', value=on_fraction))

        if is_msp:
            out_file.write(create_constant('IS_MSP', value=None))
            out_file.write(create_constant('VECTOR_COLS', value=2))
        else:
            out_file.write(create_constant('VECTOR_COLS', value=1))

        out_file.write('\n')

        # Convert controller tensors to fixed point and emit them as C arrays
        thresholds = tensor_to_fixed_point(thresholds, precision=precision)
        thresholds_variable = create_array(thresholds, name='THRESHOLDS', dtype='int16_t')
        out_file.write(thresholds_variable + '\n')

        budgets = tensor_to_fixed_point(budgets, precision=precision)
        budgets_variable = create_array(budgets, name='BUDGETS', dtype='int32_t')
        out_file.write(budgets_variable + '\n')

        avg_energy_variable = create_array(avg_energy, name='AVG_ENERGY', dtype='int16_t')
        out_file.write(avg_energy_variable + '\n')

        power_estimates_variable = create_array(power_estimates, name='ENERGY_ESTIMATES', dtype='int32_t')
        out_file.write(power_estimates_variable + '\n')

        label_counts_variable = create_array(label_counts, name='LABEL_COUNTS', dtype='int16_t')
        if is_msp:
            # Place the (large) label-count table in persistent FRAM on MSP430
            out_file.write("#pragma PERSISTENT(LABEL_COUNTS)\n")
        out_file.write(label_counts_variable + '\n')

        out_file.write('\n\n'.join(weight_variables))
        out_file.write('\n#endif\n')
import re
from argparse import ArgumentParser

from utils.file_utils import read_by_file_suffix

if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--test-log',
                        type=str,
                        required=True,
                        help='Path to the test log.')
    args = parser.parse_args()

    test_log = list(read_by_file_suffix(args.test_log))[0]

    # Matches keys like 'prediction', 'prediction_3', 'prediction-3';
    # group 1 captures the (possibly empty) level digits.
    prediction_key_regex = re.compile('prediction[_-]*([0-9]*)')

    prediction_keys = []
    for key in test_log.keys():
        match = prediction_key_regex.match(key)
        if match is not None:
            # Fix: the pattern always has exactly one group, so the original
            # `len(match.groups()) > 1` check was always False and every level
            # collapsed to 0. Test the captured text instead: it is empty for
            # a bare 'prediction' key and numeric otherwise.
            digits = match.group(1)
            level = int(digits) if digits else 0
            prediction_keys.append((key, level))

    # Print accuracies ordered by prediction level
    for prediction_name, _ in sorted(prediction_keys, key=lambda t: t[1]):
        prediction_results = test_log[prediction_name]
        print('{0}: Accuracy -> {1:.5f}'.format(
            prediction_name, prediction_results['ACCURACY']))
runtime_systems.extend(skip_rnn_systems) # Add the Phased RNN models if provided phased_rnn_folder = args.phased_model_folder if phased_rnn_folder is not None: phased_rnn_systems = create_multi_model_systems( folder=phased_rnn_folder, model_type='PHASED_RNN', power_system_type=power_system_type) runtime_systems.extend(phased_rnn_systems) # Max time equals the number of test samples max_time = dataset.dataset[DataSeries.TEST].length for noise_params_path in args.noise_params: noise_params = read_by_file_suffix(noise_params_path) # Create the noise generator for the given parameters for noise_generator in get_noise_generator(noise_params=noise_params, max_time=max_time): # Run the simulation on each budget for budget in sorted(budgets): print('===== Starting budget: {0:.4f} ====='.format(budget)) result, noise_terms = run_simulation( runtime_systems=runtime_systems, max_time=max_time, noise_generator=noise_generator, budget=budget)
def get_results(
    input_folders: List[str], noise_generator: NoiseGenerator, model_type: str
) -> Dict[str, DefaultDict[float, Dict[str, List[ModelResult]]]]:
    """
    Gets the results for all models in the given folders under the given
    noise generator.

    Args:
        input_folders: A list of input folders containing model results.
        noise_generator: Noise generator whose string form keys the test logs.
        model_type: The model type (currently unused here; kept for interface
            compatibility with callers).
    Returns:
        A dictionary of the following format.
            Key: Dataset Name
            Value: A dictionary of the format below.
                Key: Budget
                Value: Dictionary of System Name -> List of ModelResults.
    """
    # Create the key for this series
    noise_key = str(noise_generator)

    baseline_mode = 'under_budget'
    fixed_type = 'fixed_{0}'.format(baseline_mode)

    # Fixed annotation: values stored are ModelResult instances (the original
    # annotation incorrectly said List[float]).
    model_results: Dict[str, DefaultDict[float, Dict[str, List[ModelResult]]]] = dict()
    for folder in input_folders:
        for file_name in iterate_files(folder, pattern=r'.*\.jsonl\.gz'):
            model_info = get_model_and_type(file_name)
            if model_info is None:
                continue

            system_type, model_name, dataset_name = model_info

            # Initialize new dataset entry
            dataset_name = normalize_dataset_name(dataset_name)
            if dataset_name not in model_results:
                model_results[dataset_name] = defaultdict(dict)

            # Skip all systems which don't match the criteria
            if system_type.lower() not in ('adaptive', fixed_type, 'randomized'):
                continue

            # Read the test log and get the accuracy for each budget
            test_log = list(read_by_file_suffix(file_name))[0]
            noise_test_log = test_log[noise_key]

            for log_entry in noise_test_log.values():
                budget = log_entry['BUDGET']

                # Get the accuracy and power
                accuracy = log_entry['ACCURACY']
                power = log_entry['AVG_POWER']
                valid_accuracy = log_entry.get('VALID_ACCURACY')

                model_result = ModelResult(power=power,
                                           accuracy=accuracy,
                                           validation_accuracy=valid_accuracy)

                system_name = log_entry.get(
                    'SYSTEM_NAME',
                    '{0} {1}'.format(system_type, model_name)).upper()

                # Append accuracy to the per-system results
                if system_name not in model_results[dataset_name][budget]:
                    model_results[dataset_name][budget][system_name] = []
                model_results[dataset_name][budget][system_name].append(model_result)

    return model_results
# Script fragment: total up training iterations (epochs * batches-per-epoch)
# across every model train log in the input folder.
training_iterations = 0
model_count = 0
for train_log_path in iterate_files(args.input_folder,
                                    pattern=r'model-train-log.*pkl.gz'):
    match = MODEL_NAME_REGEX.match(train_log_path)
    assert match is not None, 'Could not match {0}'.format(train_log_path)

    # Get the batch size from the hyperparameters
    name = match.group(1)
    save_folder, _ = os.path.split(train_log_path)
    hypers_path = os.path.join(
        save_folder, 'model-hyper-params-{0}_model_best.pkl.gz'.format(name))
    hypers = read_by_file_suffix(hypers_path)
    batch_size = hypers['batch_size']
    # NOTE(review): train_size is defined above this view -- presumably the
    # number of training samples; confirm against the surrounding script.
    batches_per_epoch = int(math.ceil(train_size / batch_size))

    # Use the number of batches to calculate the total number of iterations
    train_log = read_by_file_suffix(train_log_path)
    train_epochs = len(train_log['loss']['train'])
    training_iterations += train_epochs * batches_per_epoch

    # If there is a controller present, then we add these iterations to the result
    controller_path = os.path.join(
        save_folder, 'model-controller-temp-{0}_model_best.pkl.gz'.format(name))
    # NOTE(review): chunk is truncated -- the body of this conditional is
    # not visible here.
    if os.path.exists(controller_path):