Example 1
    def load(self):
        # Prevent double-loading the dataset
        if self.is_loaded:
            return

        data_files = sorted(iterate_files(self.folder, pattern=f'.*{self.extension}'))

        if len(data_files) == 0:
            print('WARNING: No data files found.')
            return

        self._arrays = [np.load(data_file, mmap_mode='r') for data_file in data_files]

        self._array_lengths = [len(arr) // len(self._fields) for arr in self._arrays]
        self.set_length(sum(self._array_lengths))

        self._ids = list(range(self.length))

        # Retrieve saved index or build sequential index if none given
        index_file = os.path.join(self.folder, INDEX_FILE)
        if os.path.exists(index_file):
            self._index = read_by_file_suffix(index_file)
            self._ids = list(sorted(self._index.keys()))
        else:
            for sample_id in self._ids:
                self._index[sample_id] = self._get_array_index(sample_id)

        self.set_loaded(True)
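
Every example in this listing calls an iterate_files(folder, pattern=...) helper that yields the paths of files whose names match a regular expression. The helper itself is not shown here; the version below is only a minimal stand-in for illustration, assuming a flat (non-recursive) scan with Python's standard os and re modules. Callers such as Example 1 sort the results themselves, so no ordering is assumed.

import os
import re
from typing import Iterator


def iterate_files(folder: str, pattern: str = r'.*') -> Iterator[str]:
    # Yield the full paths of files in `folder` whose names match `pattern`.
    # The real helper may match against the full path or recurse into
    # subdirectories; this sketch only checks the file name.
    regex = re.compile(pattern)
    for file_name in os.listdir(folder):
        path = os.path.join(folder, file_name)
        if os.path.isfile(path) and regex.match(file_name) is not None:
            yield path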
Example 2
def create_multi_model_systems(
        folder: str, model_type: str,
        power_system_type: PowerType,
        dataset_folder: str, seq_length: int,
        num_classes: int) -> List[RuntimeSystem]:
    # NOTE: dataset_folder, seq_length, and num_classes were free variables in the
    # original snippet; they are assumed here to be supplied by the caller.
    model_type = model_type.upper()
    assert model_type in (
        'SKIP_RNN', 'PHASED_RNN'), 'Unknown model type: {0}'.format(model_type)

    runtime_systems: List[RuntimeSystem] = []

    valid_results: List[ModelResults] = []
    test_results: List[ModelResults] = []
    model_paths: List[str] = []
    for model_path in iterate_files(
            folder,
            pattern=r'model-{0}-.*model_best\.pkl\.gz'.format(model_type)):
        model, dataset = restore_neural_network(model_path,
                                                dataset_folder=dataset_folder)

        if model_type == 'SKIP_RNN':
            valid_result = execute_skip_rnn_model(model,
                                                  dataset,
                                                  series=DataSeries.VALID)
            test_result = execute_skip_rnn_model(model,
                                                 dataset,
                                                 series=DataSeries.TEST)
        else:
            valid_result = execute_phased_rnn_model(model,
                                                    dataset,
                                                    series=DataSeries.VALID)
            test_result = execute_phased_rnn_model(model,
                                                   dataset,
                                                   series=DataSeries.TEST)

        valid_results.append(valid_result)
        test_results.append(test_result)

        model_paths.append(model_path)

    # At least one matching model is required; `model` below refers to the last
    # model restored in the loop above.
    assert len(model_paths) > 0, 'Found no {0} models in {1}'.format(model_type, folder)

    # Concatenate the results from each model
    test_results_concat = concat_model_results(test_results)
    valid_results_concat = concat_model_results(valid_results)

    power_system = make_power_system(num_levels=seq_length,
                                     seq_length=seq_length,
                                     model_type=model.model_type,
                                     power_type=power_system_type)

    under_budget = RuntimeSystem(test_results=test_results_concat,
                                 valid_results=valid_results_concat,
                                 system_type=SystemType.FIXED_UNDER_BUDGET,
                                 model_path=model_paths[0],
                                 dataset_folder=dataset_folder,
                                 seq_length=seq_length,
                                 num_levels=len(model_paths),
                                 num_classes=num_classes,
                                 power_system=power_system)
    runtime_systems.append(under_budget)

    return runtime_systems
Example 3
def data_generator(data_folder: str) -> Iterable[Dict[str, Any]]:
    for data_file in iterate_files(data_folder, pattern=r'.*jsonl.gz'):
        for sample in read_by_file_suffix(data_file):
            # indices = list(range(len(sample[INPUTS])))
            # sampled_indices = np.sort(np.random.choice(indices, size=seq_length, replace=False))
            # sample[INPUTS] = 

            yield sample
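
The generator above yields one dictionary per line of each gzipped JSON-lines file. A brief usage sketch follows; the folder name is a placeholder.

# Print the keys of the first few samples; 'data/train' is a hypothetical folder
for i, sample in enumerate(data_generator('data/train')):
    if i >= 3:
        break
    print(sorted(sample.keys()))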
Example 4
    def load(self):
        if self.is_loaded:
            return  # Prevent double loading

        data_files = sorted(iterate_files(self.folder, pattern=f'.*{self.extension}'))
        for data_file in data_files:
            self._dataset.extend(read_by_file_suffix(data_file))

        self.set_length(len(self._dataset))
        self._ids = list(range(self.length))

        self.set_loaded(True)
Example 5
def count_samples(data_folder: str, file_type: str, num_fields: int):
    """
    Counts the number of samples across all data files of the given type in the folder.
    """
    count = 0
    for data_file in iterate_files(data_folder, pattern=rf'.*\.{file_type}'):
        data = read_by_file_suffix(data_file)

        if file_type == 'npz':
            count += int(len(data) / num_fields)
        else:
            count += sum((1 for _ in data))

    print(f'Total number of samples: {count}')
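
The read_by_file_suffix helper used throughout this listing dispatches on the file extension. Its implementation is not shown; judging from the callers, .npz files behave like NumPy archives, .jsonl.gz files yield JSON dictionaries line by line, and .pkl.gz files hold pickled objects. The sketch below is a plausible reading under those assumptions, not the library's actual code.

import gzip
import json
import pickle
from typing import Any

import numpy as np


def read_by_file_suffix(path: str) -> Any:
    # Dispatch on the file suffix; the formats are inferred from the callers above.
    if path.endswith('.npz') or path.endswith('.npy'):
        return np.load(path)
    if path.endswith('.jsonl.gz'):
        def _iter_jsonl():
            # Yield one JSON dictionary per non-empty line
            with gzip.open(path, 'rt') as f:
                for line in f:
                    line = line.strip()
                    if line:
                        yield json.loads(line)
        return _iter_jsonl()
    if path.endswith('.pkl.gz'):
        with gzip.open(path, 'rb') as f:
            return pickle.load(f)
    raise ValueError('Unsupported file suffix: {0}'.format(path))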
Example 6
def merge_datasets(folders: List[str], output_folder: str, file_prefix: str,
                   file_suffix: str, chunk_size: int):
    with DataWriter(output_folder,
                    file_prefix=file_prefix,
                    file_suffix=file_suffix,
                    chunk_size=chunk_size) as writer:

        data_files = chain(*(iterate_files(folder, pattern=f'.*{file_suffix}')
                             for folder in folders))

        sample_id = 0
        for data_file in data_files:
            for sample in read_by_file_suffix(data_file):
                sample[SAMPLE_ID] = sample_id
                writer.add(sample)
                sample_id += 1

                # sample_id already counts the completed samples at this point
                if sample_id % chunk_size == 0:
                    print('Completed {0} samples.'.format(sample_id),
                          end='\r')
        print()
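
Example 6 streams the merged samples through a DataWriter context manager that accepts one dictionary at a time and writes chunks of at most chunk_size samples. The class is not part of this listing; the sketch below is an illustrative minimal version that assumes gzipped JSON-lines output and a hypothetical {prefix}{index}.{suffix} file-naming scheme.

import gzip
import json
import os
from typing import Any, Dict, List


class DataWriter:
    # Minimal illustrative writer: buffers dictionaries and flushes them to
    # gzipped JSON-lines files of at most `chunk_size` samples each.
    def __init__(self, output_folder: str, file_prefix: str,
                 file_suffix: str, chunk_size: int):
        self.output_folder = output_folder
        self.file_prefix = file_prefix
        self.file_suffix = file_suffix
        self.chunk_size = chunk_size
        self._buffer: List[Dict[str, Any]] = []
        self._chunk_index = 0
        os.makedirs(output_folder, exist_ok=True)

    def add(self, sample: Dict[str, Any]):
        self._buffer.append(sample)
        if len(self._buffer) >= self.chunk_size:
            self.flush()

    def flush(self):
        if not self._buffer:
            return
        file_name = '{0}{1:03d}.{2}'.format(self.file_prefix,
                                            self._chunk_index,
                                            self.file_suffix)
        path = os.path.join(self.output_folder, file_name)
        with gzip.open(path, 'wt') as f:
            for sample in self._buffer:
                f.write(json.dumps(sample) + '\n')
        self._buffer = []
        self._chunk_index += 1

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.flush()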
Example 7
if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument(
        '--input-folder',
        type=str,
        required=True,
        help='Folder containing RNN models (of the same type) to measure.')
    args = parser.parse_args()

    total_time = timedelta()

    times_list: List[datetime] = []
    model_count = 0

    for train_log_path in iterate_files(
            args.input_folder, pattern=r'.*model-train-log-.*\.pkl\.gz'):
        train_log = read_by_file_suffix(train_log_path)

        if 'start_time' not in train_log:
            match = TIME_REGEX.match(train_log_path)
            assert match is not None, 'Could not match {0}'.format(train_log_path)
            start_date = datetime.strptime(match.group(1), '%Y-%m-%d-%H-%M-%S')

            times_list.append(start_date)
        else:
            start_time, end_time = train_log['start_time'], train_log[
                'end_time']

            start_date = datetime.strptime(start_time, '%Y-%m-%d-%H-%M-%S')
            end_date = datetime.strptime(end_time, '%Y-%m-%d-%H-%M-%S')

            # Accumulate the measured training time for this model
            total_time += end_date - start_date

        model_count += 1
Example 8
    # Load the target dataset
    dataset = get_dataset(dataset_type='standard', data_folder=args.dataset_folder)

    # Unpack the power system type
    power_type = PowerType[args.sensor_type.upper()]

    # Load the adaptive model results and controllers
    adaptive_result_dict: Dict[str, ModelResults] = dict()
    adaptive_system_dict: Dict[str, RuntimeSystem] = dict()
    power_system_dict: Dict[str, PowerSystem] = dict()

    # Expand the model paths by unpacking directories
    model_paths: List[str] = []
    for model_path in args.adaptive_model_paths:
        if os.path.isdir(model_path):
            model_paths.extend(iterate_files(model_path, pattern=r'.*model-SAMPLE_RNN-.*'))
            model_paths.extend(iterate_files(model_path, pattern=r'.*model-BUDGET_RNN-.*'))
        else:
            model_paths.append(model_path)

    for model_path in model_paths:
        model, _ = restore_neural_network(model_path=model_path,
                                          dataset_folder=args.dataset_folder)
        seq_length = model.metadata[SEQ_LENGTH]
        num_classes = model.metadata[NUM_CLASSES]
        num_levels = model.num_outputs

        # Get the validation and test results
        valid_results = execute_adaptive_model(model=model,
                                               dataset=dataset,
                                               series=DataSeries.VALID)
Example 9
def get_results(
    input_folders: List[str], noise_generator: NoiseGenerator, model_type: str
) -> Dict[str, DefaultDict[float, Dict[str, List[ModelResult]]]]:
    """
    Gets the results for all models in the given folder with the given power shift value.

    Args:
        input_folders: A list of input folders containing model results.
        target_shift: The power shift to extract results from
    Returns:
        A dictionary of the following format.
        Key: Dataset Name
        Value: A dictionary of the format below.
            Key: Budget
            Value: Dictionary of Model Name -> List of accuracy values.
    """
    # Create the key for this series
    noise_key = str(noise_generator)
    baseline_mode = 'under_budget'
    fixed_type = 'fixed_{0}'.format(baseline_mode)

    model_results: Dict[str, DefaultDict[float, Dict[str,
                                                     List[ModelResult]]]] = dict()

    for folder in input_folders:

        for file_name in iterate_files(folder, pattern=r'.*\.jsonl\.gz'):

            model_info = get_model_and_type(file_name)

            if model_info is None:
                continue

            system_type, model_name, dataset_name = model_info

            # Initialize new dataset entry
            dataset_name = normalize_dataset_name(dataset_name)
            if dataset_name not in model_results:
                model_results[dataset_name] = defaultdict(dict)

            # Skip all systems which don't match the criteria
            if system_type.lower() not in ('adaptive', fixed_type,
                                           'randomized'):
                continue

            # Read the test log and get the accuracy for each budget matching the provided shift
            test_log = list(read_by_file_suffix(file_name))[0]
            noise_test_log = test_log[noise_key]

            for log_entry in noise_test_log.values():

                budget = log_entry['BUDGET']

                # Get the accuracy and power
                accuracy = log_entry['ACCURACY']
                power = log_entry['AVG_POWER']
                valid_accuracy = log_entry.get('VALID_ACCURACY')
                model_result = ModelResult(power=power,
                                           accuracy=accuracy,
                                           validation_accuracy=valid_accuracy)

                system_name = log_entry.get(
                    'SYSTEM_NAME', '{0} {1}'.format(system_type,
                                                    model_name)).upper()

                # Append accuracy to the adaptive model results
                if system_name not in model_results[dataset_name][budget]:
                    model_results[dataset_name][budget][system_name] = []

                model_results[dataset_name][budget][system_name].append(
                    model_result)

    return model_results
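
The dictionary returned by get_results is keyed by dataset name, then budget, then system name. A short usage sketch follows; the input folder and the noise_generator value are placeholders that would come from the surrounding script.

# `noise_generator` is assumed to be constructed elsewhere; the folder is hypothetical
results = get_results(input_folders=['results/pen_digits'],
                      noise_generator=noise_generator,
                      model_type='SAMPLE_RNN')

for dataset_name, budget_dict in results.items():
    for budget, system_dict in sorted(budget_dict.items()):
        for system_name, model_result_list in system_dict.items():
            accuracies = [r.accuracy for r in model_result_list]
            print('{0} | budget {1} | {2}: mean accuracy {3:.4f}'.format(
                dataset_name, budget, system_name,
                sum(accuracies) / len(accuracies)))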
Example 10
        '--dataset-folder',
        type=str,
        required=True,
        help='Folder containing the dataset used for training and evaluation.')
    args = parser.parse_args()

    dataset = get_dataset(dataset_type='standard',
                          data_folder=args.dataset_folder)

    dataset.dataset[DataSeries.TRAIN].load()
    train_size = dataset.dataset[DataSeries.TRAIN].length

    training_iterations = 0
    model_count = 0

    for train_log_path in iterate_files(args.input_folder,
                                        pattern=r'model-train-log.*pkl.gz'):
        match = MODEL_NAME_REGEX.match(train_log_path)
        assert match is not None, 'Could not match {0}'.format(train_log_path)

        # Get the batch size from the hyperparameters
        name = match.group(1)

        save_folder, _ = os.path.split(train_log_path)
        hypers_path = os.path.join(
            save_folder,
            'model-hyper-params-{0}_model_best.pkl.gz'.format(name))
        hypers = read_by_file_suffix(hypers_path)

        batch_size = hypers['batch_size']
        batches_per_epoch = int(math.ceil(train_size / batch_size))
Example 11
        assert os.path.exists(
            train_folder), f'The folder {train_folder} does not exist!'

        valid_folder = os.path.join(data_folder, VALID)
        assert os.path.exists(
            valid_folder), f'The folder {valid_folder} does not exist!'

        test_folder = os.path.join(data_folder, TEST)
        assert os.path.exists(
            test_folder), f'The folder {test_folder} does not exist!'

    # Unpack and Validate params files (to fail fast)
    params_files: List[str] = []
    for params_file in args.params_files:
        if os.path.isdir(params_file):
            params_files.extend(iterate_files(params_file, pattern=r'.*json'))
        else:
            params_files.append(params_file)

    for params_file in params_files:
        assert os.path.exists(
            params_file), f'The file {params_file} does not exist!'
        assert params_file.endswith(
            '.json'), f'The params file must be a JSON file: {params_file}'

    trials = max(args.trials, 1)
    num_models = trials * len(params_files)

    # Create save folder (if necessary)
    base_save_folder = args.save_folder
    make_dir(base_save_folder)