def load(self):
    # Prevent double-loading the dataset
    if self.is_loaded:
        return

    data_files = list(sorted(iterate_files(self.folder, pattern=f'.*{self.extension}')))
    if len(data_files) == 0:
        print('WARNING: No data files found.')
        return

    self._arrays = [np.load(data_file, mmap_mode='r') for data_file in data_files]
    self._array_lengths = [int(len(arr) / len(self._fields)) for arr in self._arrays]
    self.set_length(sum(self._array_lengths))
    self._ids = list(range(self.length))

    # Retrieve the saved index or build a sequential index if none is given
    index_file = os.path.join(self.folder, INDEX_FILE)
    if os.path.exists(index_file):
        self._index = read_by_file_suffix(index_file)
        self._ids = list(sorted(self._index.keys()))
    else:
        for sample_id in self._ids:
            self._index[sample_id] = self._get_array_index(sample_id)

    self.set_loaded(True)

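# A minimal sketch (not the original implementation) of how a helper like
# _get_array_index could map a global sample id to an (array index, local offset)
# pair using the per-file lengths computed above. The standalone name, signature,
# and return format here are assumptions for illustration only.
from typing import List, Tuple

def _get_array_index_sketch(sample_id: int, array_lengths: List[int]) -> Tuple[int, int]:
    # Walk the per-array lengths until the sample id falls inside one array
    offset = sample_id
    for array_idx, length in enumerate(array_lengths):
        if offset < length:
            return array_idx, offset  # Sample lives in this memory-mapped array
        offset -= length
    raise IndexError('Sample id {0} is out of range.'.format(sample_id))
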
def create_multi_model_systems(folder: str, model_type: str, power_system_type: PowerType) -> List[RuntimeSystem]:
    model_type = model_type.upper()
    assert model_type in ('SKIP_RNN', 'PHASED_RNN'), 'Unknown model type: {0}'.format(model_type)

    runtime_systems: List[RuntimeSystem] = []
    valid_results: List[ModelResults] = []
    test_results: List[ModelResults] = []
    model_paths: List[str] = []

    # NOTE: dataset_folder, seq_length, and num_classes are not parameters of this
    # function; as written, they must be defined at the enclosing module scope.
    for model_path in iterate_files(folder, pattern=r'model-{0}-.*model_best\.pkl\.gz'.format(model_type)):
        model, dataset = restore_neural_network(model_path, dataset_folder=dataset_folder)

        if model_type == 'SKIP_RNN':
            valid_result = execute_skip_rnn_model(model, dataset, series=DataSeries.VALID)
            test_result = execute_skip_rnn_model(model, dataset, series=DataSeries.TEST)
        else:
            valid_result = execute_phased_rnn_model(model, dataset, series=DataSeries.VALID)
            test_result = execute_phased_rnn_model(model, dataset, series=DataSeries.TEST)

        valid_results.append(valid_result)
        test_results.append(test_result)
        model_paths.append(model_path)

    # Concatenate the results from each model
    test_results_concat = concat_model_results(test_results)
    valid_results_concat = concat_model_results(valid_results)

    power_system = make_power_system(num_levels=seq_length,
                                     seq_length=seq_length,
                                     model_type=model.model_type,
                                     power_type=power_system_type)

    under_budget = RuntimeSystem(test_results=test_results_concat,
                                 valid_results=valid_results_concat,
                                 system_type=SystemType.FIXED_UNDER_BUDGET,
                                 model_path=model_paths[0],
                                 dataset_folder=dataset_folder,
                                 seq_length=seq_length,
                                 num_levels=len(model_paths),
                                 num_classes=num_classes,
                                 power_system=power_system)
    runtime_systems.append(under_budget)

    return runtime_systems

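# Hypothetical usage sketch for create_multi_model_systems. The folder path and
# the PowerType member are illustrative assumptions, and the module-scope values
# noted above (dataset_folder, seq_length, num_classes) must already be set.
systems = create_multi_model_systems(folder='saved_models/skip_rnn',
                                     model_type='skip_rnn',
                                     power_system_type=PowerType.BLUETOOTH)
print('Created {0} runtime system(s).'.format(len(systems)))
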
def data_generator(data_folder: str) -> Iterable[Dict[str, Any]]:
    for data_file in iterate_files(data_folder, pattern=r'.*jsonl.gz'):
        for sample in read_by_file_suffix(data_file):
            # indices = list(range(len(sample[INPUTS])))
            # sampled_indices = np.sort(np.random.choice(indices, size=seq_length, replace=False))
            # sample[INPUTS] =
            yield sample

def load(self):
    # Prevent double loading
    if self.is_loaded:
        return

    data_files = sorted(iterate_files(self.folder, pattern=f'.*{self.extension}'))
    for data_file in data_files:
        self._dataset.extend(read_by_file_suffix(data_file))

    self.set_length(len(self._dataset))
    self._ids = list(range(self.length))
    self.set_loaded(True)

def count_samples(data_folder: str, file_type: str, num_fields: int):
    """
    Counts the number of samples in the given data folder.
    """
    count = 0
    for data_file in iterate_files(data_folder, pattern=rf'.*\.{file_type}'):
        data = read_by_file_suffix(data_file)

        if file_type == 'npz':
            # Flattened .npz archives store num_fields entries per sample
            count += int(len(data) / num_fields)
        else:
            count += sum((1 for _ in data))

    print(f'Total number of samples: {count}')

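# Hypothetical invocation of count_samples (the folder path is an assumption).
# For gzipped JSON-lines files each record counts as one sample, so num_fields is
# ignored; for flattened .npz archives the entry count is divided by num_fields.
count_samples('datasets/train', file_type='jsonl.gz', num_fields=2)
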
def merge_datasets(folders: List[str], output_folder: str, file_prefix: str, file_suffix: str, chunk_size: int):
    with DataWriter(output_folder, file_prefix=file_prefix, file_suffix=file_suffix, chunk_size=chunk_size) as writer:
        data_files = chain(*(iterate_files(folder, pattern=f'.*{file_suffix}') for folder in folders))

        sample_id = 0
        for data_file in data_files:
            for sample in read_by_file_suffix(data_file):
                sample[SAMPLE_ID] = sample_id
                writer.add(sample)
                sample_id += 1

                if (sample_id + 1) % chunk_size == 0:
                    print('Completed {0} samples.'.format(sample_id + 1), end='\r')

        print()

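# Hypothetical invocation of merge_datasets (folder names and chunk size are
# assumptions): merge two processed folds into one output folder, re-numbering
# the sample ids and writing compressed chunks of 10,000 samples.
merge_datasets(folders=['datasets/fold_0', 'datasets/fold_1'],
               output_folder='datasets/merged',
               file_prefix='data',
               file_suffix='jsonl.gz',
               chunk_size=10000)
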
if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--input-folder',
                        type=str,
                        required=True,
                        help='Folder containing RNN models (of the same type) to measure.')
    args = parser.parse_args()

    total_time = timedelta()
    times_list: List[datetime] = []
    model_count = 0

    for train_log_path in iterate_files(args.input_folder, pattern=r'.*model-train-log-.*\.pkl\.gz'):
        train_log = read_by_file_suffix(train_log_path)

        if 'start_time' not in train_log:
            match = TIME_REGEX.match(train_log_path)
            start_date = datetime.strptime(match.group(1), '%Y-%m-%d-%H-%M-%S')
            times_list.append(start_date)
        else:
            start_time, end_time = train_log['start_time'], train_log['end_time']
            start_date = datetime.strptime(start_time, '%Y-%m-%d-%H-%M-%S')
            end_date = datetime.strptime(end_time, '%Y-%m-%d-%H-%M-%S')

# Load the target data-set
dataset = get_dataset(dataset_type='standard', data_folder=args.dataset_folder)

# Unpack the power system type
power_type = PowerType[args.sensor_type.upper()]

# Load the adaptive model results and controllers
adaptive_result_dict: Dict[str, ModelResults] = dict()
adaptive_system_dict: Dict[str, RuntimeSystem] = dict()
power_system_dict: Dict[str, PowerSystem] = dict()

# Expand the model paths by unpacking directories
model_paths: List[str] = []
for model_path in args.adaptive_model_paths:
    if os.path.isdir(model_path):
        model_paths.extend(iterate_files(model_path, pattern=r'.*model-SAMPLE_RNN-.*'))
        model_paths.extend(iterate_files(model_path, pattern=r'.*model-BUDGET_RNN-.*'))
    else:
        model_paths.append(model_path)

for model_path in model_paths:
    model, _ = restore_neural_network(model_path=model_path, dataset_folder=args.dataset_folder)

    seq_length = model.metadata[SEQ_LENGTH]
    num_classes = model.metadata[NUM_CLASSES]
    num_levels = model.num_outputs

    # Get the validation and test results
    valid_results = execute_adaptive_model(model=model, dataset=dataset, series=DataSeries.VALID)

def get_results(input_folders: List[str],
                noise_generator: NoiseGenerator,
                model_type: str) -> Dict[str, DefaultDict[float, Dict[str, List[ModelResult]]]]:
    """
    Gets the results for all models in the given folders under the given noise generator.

    Args:
        input_folders: A list of input folders containing model results.
        noise_generator: The noise generator whose serialized key selects the result series.
        model_type: The type of model to extract results for.
    Returns:
        A dictionary of the following format.
        Key: Dataset Name
        Value: A dictionary of the format below.
            Key: Budget
            Value: Dictionary of System Name -> List of model results.
    """
    # Create the key for this series
    noise_key = str(noise_generator)

    baseline_mode = 'under_budget'
    fixed_type = 'fixed_{0}'.format(baseline_mode)

    model_results: Dict[str, DefaultDict[float, Dict[str, List[ModelResult]]]] = dict()
    for folder in input_folders:
        for file_name in iterate_files(folder, pattern=r'.*\.jsonl\.gz'):
            model_info = get_model_and_type(file_name)

            if model_info is None:
                continue

            system_type, model_name, dataset_name = model_info

            # Initialize a new dataset entry
            dataset_name = normalize_dataset_name(dataset_name)
            if dataset_name not in model_results:
                model_results[dataset_name] = defaultdict(dict)

            # Skip all systems which don't match the criteria
            if system_type.lower() not in ('adaptive', fixed_type, 'randomized'):
                continue

            # Read the test log and get the results for each budget under the provided noise key
            test_log = list(read_by_file_suffix(file_name))[0]
            noise_test_log = test_log[noise_key]

            for log_entry in noise_test_log.values():
                budget = log_entry['BUDGET']

                # Get the accuracy and power
                accuracy = log_entry['ACCURACY']
                power = log_entry['AVG_POWER']
                valid_accuracy = log_entry.get('VALID_ACCURACY')

                model_result = ModelResult(power=power, accuracy=accuracy, validation_accuracy=valid_accuracy)

                system_name = log_entry.get('SYSTEM_NAME', '{0} {1}'.format(system_type, model_name)).upper()

                # Append the result to the list for this system
                if system_name not in model_results[dataset_name][budget]:
                    model_results[dataset_name][budget][system_name] = []

                model_results[dataset_name][budget][system_name].append(model_result)

    return model_results

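# A minimal sketch (the function name is an assumption) of consuming the nested
# dictionary returned by get_results: average the accuracy of every system at
# each budget for one dataset, assuming ModelResult exposes an 'accuracy' field.
import numpy as np

def summarize_accuracy(results: Dict[str, DefaultDict[float, Dict[str, List[ModelResult]]]], dataset_name: str):
    for budget, system_results in sorted(results[dataset_name].items()):
        for system_name, model_results in system_results.items():
            avg_accuracy = np.mean([r.accuracy for r in model_results])
            print('{0} | budget {1:.2f} | {2}: {3:.4f}'.format(dataset_name, budget, system_name, avg_accuracy))
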
parser.add_argument('--dataset-folder',
                    type=str,
                    required=True,
                    help='Folder containing the dataset used for training and evaluation.')
args = parser.parse_args()

dataset = get_dataset(dataset_type='standard', data_folder=args.dataset_folder)
dataset.dataset[DataSeries.TRAIN].load()
train_size = dataset.dataset[DataSeries.TRAIN].length

training_iterations = 0
model_count = 0
for train_log_path in iterate_files(args.input_folder, pattern=r'model-train-log.*pkl.gz'):
    match = MODEL_NAME_REGEX.match(train_log_path)
    assert match is not None, 'Could not match {0}'.format(train_log_path)

    # Get the batch size from the hyperparameters
    name = match.group(1)
    save_folder, _ = os.path.split(train_log_path)
    hypers_path = os.path.join(save_folder, 'model-hyper-params-{0}_model_best.pkl.gz'.format(name))

    hypers = read_by_file_suffix(hypers_path)
    batch_size = hypers['batch_size']

    batches_per_epoch = int(math.ceil(train_size / batch_size))

assert os.path.exists(train_folder), f'The folder {train_folder} does not exist!'

valid_folder = os.path.join(data_folder, VALID)
assert os.path.exists(valid_folder), f'The folder {valid_folder} does not exist!'

test_folder = os.path.join(data_folder, TEST)
assert os.path.exists(test_folder), f'The folder {test_folder} does not exist!'

# Unpack and validate the params files (to fail fast)
params_files: List[str] = []
for params_file in args.params_files:
    if os.path.isdir(params_file):
        params_files.extend(iterate_files(params_file, pattern=r'.*json'))
    else:
        params_files.append(params_file)

for params_file in params_files:
    assert os.path.exists(params_file), f'The file {params_file} does not exist!'
    assert params_file.endswith('.json'), f'The params file must be a JSON file: {params_file}'

trials = max(args.trials, 1)
num_models = trials * len(params_files)

# Create the save folder (if necessary)
base_save_folder = args.save_folder
make_dir(base_save_folder)