def db_trainer(estimator, dataset, build_name, limit, db_uri, evaluate):
    # Fetch the runs for the given build from the subunit2sql database
    runs = gather_results.get_runs_by_name(db_uri, build_name=build_name)
    model_config = {'build_name': build_name}
    gather_results.save_model_config(dataset, model_config)
    if limit > 0:
        runs = runs[:limit]
    gather_results.save_run_uuids(dataset, runs)
    for run in runs:
        if estimator == 'tf.estimator.DNNClassifier':
            # Fetch and cache the run data only
            gather_results.get_subunit_results_for_run(run, '1s', db_uri,
                                                       use_cache=True)
            print('Acquired run %s' % run.uuid)
        else:
            result = gather_results.get_subunit_results_for_run(
                run, '1s', db_uri)[0]
            print('Acquired run %s' % run.uuid)
            try:
                features, labels = nn_trainer.normalize_data(result)
            except TypeError:
                print('Unable to normalize data in run %s, '
                      'skipping' % run.uuid)
                continue
            if not evaluate:
                nn_trainer.train_model(features, labels,
                                       dataset_name=dataset)
            else:
                nn_trainer.evaluate_model(features, labels,
                                          dataset_name=dataset)
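# Usage sketch (not from the source): db_trainer points at a subunit2sql
# database, caches the runs for a build and trains or evaluates a per-run
# model. The dataset name, build name and DB URI below are placeholder
# assumptions; substitute values from your own environment.
def _example_db_trainer_usage():
    db_trainer(estimator='tf.estimator.DNNClassifier',
               dataset='example-dataset',
               build_name='tempest-full',
               limit=10,
               db_uri='mysql+pymysql://user:password@localhost/subunit2sql',
               evaluate=False)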
def build_dataset(dataset, build_name, slicer, sample_interval,
                  features_regex, class_label, tdt_split, force, visualize,
                  data_path, target_data_path, s3_profile, s3_url,
                  data_plots_folder, aggregation_functions):
    # s3 support
    s3 = gather_results.get_s3_client(s3_url=s3_url, s3_profile=s3_profile)
    # Prevent overwrite by mistake
    if gather_results.load_model_config(
            dataset, data_path=target_data_path, s3=s3) and not force:
        print("Dataset %s already configured" % dataset)
        sys.exit(1)
    # Validate tdt-split
    training, dev, test = map(lambda x: x / 10, tdt_split)
    if not sum(tdt_split) == 10:
        print("Training (%d) + dev (%d) + test (%d) != 10" % tdt_split)
        sys.exit(1)
    # Load available run ids for the build name (from s3)
    runs = gather_results.load_run_uuids('.raw', name=build_name,
                                         data_path=data_path, s3=s3)

    # Apply the slice
    def slice_fn(x):
        return int(x.strip()) if x.strip() else None

    slice_object = slice(*map(slice_fn, slicer.split(":")))
    runs = np.array(runs[slice_object])
    print("Obtained %d runs for build %s" % (len(runs), build_name))
    # Split the runs in training, dev and test
    training_idx, dev_idx, test_idx = dataset_split_filters(
        len(runs), training, dev, data_path=target_data_path, s3=s3)
    np_runs = np.array(runs)
    # Save dataset metadata
    gather_results.save_run_uuids(dataset, np_runs[training_idx],
                                  name='training',
                                  data_path=target_data_path, s3=s3)
    gather_results.save_run_uuids(dataset, np_runs[dev_idx], name='dev',
                                  data_path=target_data_path, s3=s3)
    gather_results.save_run_uuids(dataset, np_runs[test_idx], name='test',
                                  data_path=target_data_path, s3=s3)
    # Calculate normalized and filtered dimensions and labels
    normalized_length, num_dstat_features, labels = data_sizes_and_labels(
        runs[0], features_regex, sample_interval,
        aggregation_functions=aggregation_functions, data_path=data_path,
        s3=s3)
    model_config = {
        'build_name': build_name,
        'sample_interval': sample_interval,
        'features_regex': features_regex,
        'class_label': class_label,
        'aggregation_functions': aggregation_functions,
        'training_set': training,
        'dev_set': dev,
        'test_set': test,
        'normalized_length': normalized_length,
        'labels': labels,
        'num_columns': num_dstat_features,
        'num_features': len(labels)
    }
    # Save the config and complete list of run uuids
    gather_results.save_run_uuids(dataset, runs,
                                  data_path=target_data_path, s3=s3)
    gather_results.save_model_config(dataset, model_config,
                                     data_path=target_data_path, s3=s3)
    print("Stored %d run IDs in the model %s config" % (len(runs), dataset))
    # Resolve the aggregation function names to functions
    resolved_agg_fn = [resolve_aggregation_function(x)
                       for x in aggregation_functions]
    datasets = {}
    # Training must come first so we calculate normalization params
    for data_type in ['training', 'dev', 'test']:
        data, _figure_sizes = prepare_dataset(
            dataset, normalized_length, num_dstat_features, data_type,
            features_regex=features_regex, sample_interval=sample_interval,
            class_label=class_label, aggregation_functions=resolved_agg_fn,
            visualize=visualize, data_path=data_path,
            target_data_path=target_data_path, s3=s3)
        datasets[data_type] = data
        examples = data['examples']
        if len(examples) == 0:
            continue
        # Perform dataset-wise normalization
        if data_type == 'training':
            n_examples, normalization_params = normalize_dataset(
                examples, labels)
            # We cache normalization parameters from the training data set
            # to normalize the dev and test set, as well as other input data
            model_config['normalization_params'] = normalization_params
            gather_results.save_model_config(dataset, model_config,
                                             data_path=target_data_path,
                                             s3=s3)
            # Save figure sizes as well for training only
            figure_sizes = _figure_sizes
        else:
            # Perform dataset-wise normalization
            n_examples, normalization_params = normalize_dataset(
                examples, labels, model_config['normalization_params'])
        # Replace examples with normalized ones
        datasets[data_type]['examples'] = n_examples
        # Store the normalized data to disk
        gather_results.save_dataset(dataset, name=data_type,
                                    data_path=target_data_path, s3=s3,
                                    **datasets[data_type])
        # Plot some more figures
        if visualize and not aggregation_functions:
            for n in range(n_examples.shape[0]):
                figure_name = sample_interval + "_%s_" + str(n)
                unrolled_norm_plot = pd.Series(n_examples[n]).plot()
                fig = unrolled_norm_plot.get_figure()
                axes = plt.gca()
                axes.set_ylim([-1, 1])
                fig.savefig(os.sep.join([data_plots_folder] +
                                        [figure_name % "normalized"]))
                plt.close(fig)
            df = pd.DataFrame(figure_sizes, columns=['size', 'status'])
            size_plot = df.plot.scatter(x='size', y='status')
            fig = size_plot.get_figure()
            fig.savefig(os.sep.join([data_plots_folder] +
                                    ['sizes_by_result.png']))
            plt.close(fig)
    # Store labels to disk
    gather_results.save_dataset(dataset, name='labels',
                                data_path=target_data_path, s3=s3,
                                labels=labels)
    print("Done creating dataset %s" % model_config)
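# Usage sketch (not from the source): build_dataset slices the cached runs
# for a build, splits them into training/dev/test according to tdt_split and
# stores the normalized dataset. Every argument value below is a placeholder
# assumption, including the sample interval and features regex.
def _example_build_dataset_usage():
    build_dataset(dataset='example-dataset',
                  build_name='tempest-full',
                  slicer=':1000',
                  sample_interval='1min',
                  features_regex='usr|used',
                  class_label='status',
                  tdt_split=(7, 2, 1),
                  force=False,
                  visualize=False,
                  data_path=None,
                  target_data_path=None,
                  s3_profile=None,
                  s3_url=None,
                  data_plots_folder='plots',
                  aggregation_functions=[])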
def prepare_dataset(dataset, normalized_length, num_dstat_features,
                    data_type, features_regex, sample_interval='1s',
                    class_label='status', aggregation_functions=None,
                    visualize=False, data_path=None, target_data_path=None,
                    s3=None):
    """Load, filter and prepare the data for one dataset split.

    Loads the run ids from the dataset configuration.
    Loads the data (dsv + meta) for every run from cache.
    Builds the unrolled examples as a numpy ndarray.
    Builds the classes as a numpy array.
    Saves the data setup to the dataset config.
    Does some visualization (if enabled).
    """
    if visualize:
        data_plots_folder = [
            os.path.dirname(os.path.realpath(__file__)), os.pardir, 'data',
            dataset, 'plots'
        ]
        os.makedirs(os.sep.join(data_plots_folder), exist_ok=True)
    # Load the list of runs and base labels
    runs = gather_results.load_run_uuids(dataset, name=data_type,
                                         data_path=target_data_path, s3=s3)
    # run_uuids are the example_ids
    sizes = []
    # The data for each example.
    examples = examples_ndarray(len(runs), num_dstat_features,
                                normalized_length)
    # The test result for each example
    classes = []
    skips = []
    print("Loading %s data:" % data_type, end='\r', flush=True)
    for count, run in enumerate(runs):
        print("Loading %s data: %d of %d" % (data_type, count + 1,
                                             len(runs)),
              end='\r', flush=True)
        result = gather_results.get_subunit_results_for_run(
            run, sample_interval, data_path=data_path, s3=s3)
        # For one run_uuid we must only get one example (result)
        if not result:
            skips.append(run.uuid)
            continue
        # Apply column filtering
        result = filter_example(result, features_regex)
        # Normalize data
        example = fixed_lenght_example(result, normalized_length,
                                       aggregation_functions)
        vector = unroll_example(example, normalized_length)
        # Normalize status
        status = get_class(result, class_label)
        # examples is an np ndarray
        examples[count] = vector.values
        classes.append(status)
        # Plot some figures
        if visualize and not aggregation_functions:
            # Prepare some more data if we are going to visualize
            sizes.append((result['dstat'].shape[0], status))
            figure_name = sample_interval + "_%s_" + str(count)
            # Plot un-normalized data
            data_plot = result['dstat'].plot()
            fig = data_plot.get_figure()
            fig.savefig(os.sep.join(data_plots_folder +
                                    [figure_name % "downsampled"]))
            plt.close(fig)
            # Plot fixed size data
            fixed_plot = example.plot()
            fig = fixed_plot.get_figure()
            fig.savefig(os.sep.join(data_plots_folder +
                                    [figure_name % "fixedsize"]))
            plt.close(fig)
            # Plot unrolled data
            unrolled_plot = pd.Series(vector).plot()
            fig = unrolled_plot.get_figure()
            fig.savefig(os.sep.join(data_plots_folder +
                                    [figure_name % "unrolled"]))
            plt.close(fig)
    print("Loading %s data: %d done!" % (data_type, len(runs)))
    # Check that everything went well
    if len(skips) > 0:
        print('Unable to train model because of missing runs %s' % skips)
        safe_runs = [run.uuid for run in runs if run.uuid not in skips]
        gather_results.save_run_uuids(dataset, safe_runs)
        print('The model has been updated to exclude those runs.')
        print('Please re-run the training step.')
        sys.exit(1)
    classes = np.array(classes)
    figure_sizes = np.array(sizes)
    example_ids = np.array(runs)
    print("%s set: examples: %s, classes: %s, example IDs: %s" % (
        data_type, str(examples.shape), str(classes.shape),
        str(example_ids.shape)))
    data = {
        'examples': examples,
        'example_ids': example_ids,
        'classes': classes
    }
    if visualize and aggregation_functions and len(examples) > 0:
        if len(aggregation_functions) > 3:
            print('Visualization skipped, cannot represent more than 3D')
            sys.exit(1)
        else:
            fig = plt.figure()
            if len(aggregation_functions) == 3:
                ax = fig.add_subplot(111, projection='3d')
            else:
                ax = fig.add_subplot(111)
            # Build a dict [class] -> [int ID]
            unique_classes = list(set(classes))
            dict_classes = dict(
                zip(unique_classes, list(range(len(unique_classes)))))
            # Setup colours
            cm = plt.get_cmap('jet')
            cNorm = pltcolors.Normalize(vmin=0, vmax=len(unique_classes))
            scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=cm)
            # Scatter the data
            for ii in range(len(examples)):
                ax.scatter(*examples[ii], marker='o',
                           c=scalarMap.to_rgba(dict_classes[classes[ii]]))
            # Set axis labels
            ax.set_xlabel(aggregation_functions[0].__name__)
            if len(aggregation_functions) > 1:
                ax.set_ylabel(aggregation_functions[1].__name__)
            if len(aggregation_functions) > 2:
                ax.set_zlabel(aggregation_functions[2].__name__)
            # scalarMap.set_array(classes)
            # fig.colorbar(scalarMap)
            # Save the plot
            fig.savefig(os.sep.join(data_plots_folder +
                                    [data_type + "_3d_plot"]))
            plt.close(fig)
    return data, figure_sizes
def local_trainer(train, estimator, dataset, sample_interval, features_regex,
                  class_label, visualize, steps, gpu, debug):
    # Normalized length before resampling
    normalized_length = 5500
    if sample_interval:
        # Calculate the desired normalized length after resampling
        normalized_length = get_downsampled_example_lenght(
            sample_interval, normalized_length)
    data_plots_folder = [
        os.path.dirname(os.path.realpath(__file__)), os.pardir, 'data',
        dataset, 'plots'
    ]
    os.makedirs(os.sep.join(data_plots_folder), exist_ok=True)
    runs = gather_results.load_run_uuids(dataset)
    # run_uuids are the example_ids
    sizes = []
    # The data for each example. We don't know yet the pre-set shape, so
    # wait until the first result comes in
    examples = []
    # Model configuration. We need to cache sample_interval, features_regex
    # and the normalization parameters for each feature so we can re-use
    # them during prediction.
    model_config = {
        'sample_interval': sample_interval,
        'features_regex': features_regex,
        'normalized_length': normalized_length
    }
    # The test result for each example
    classes = []
    labels = []
    idx = 0
    skips = []
    for run in runs:
        results = gather_results.get_subunit_results_for_run(
            run, sample_interval)
        # For one run_uuid we must only get one example (result)
        result = results[0]
        if not result:
            skips.append(run.uuid)
            continue
        # Filtering by columns
        df = result['dstat']
        if features_regex:
            col_regex = re.compile(features_regex)
            result['dstat'] = df[list(filter(col_regex.search, df.columns))]
        # Setup the numpy matrix and sizes
        if len(examples) == 0:
            # Adjust normalized_length to the actual re-sampled one
            examples = np.ndarray(shape=(len(runs),
                                         len(result['dstat'].columns) *
                                         normalized_length))
            model_config['num_columns'] = len(result['dstat'].columns)
            model_config['num_features'] = (len(result['dstat'].columns) *
                                            normalized_length)
        # Normalize data
        example = fixed_lenght_example(result, normalized_length)
        # Normalize status
        status = get_class(result, class_label)
        vector, new_labels = unroll_example(example, normalized_length,
                                            labels)
        # Only calculate labels for the first example
        if len(labels) == 0:
            labels = new_labels
            model_config['labels'] = labels
        print("Normalized example %d of %d" % (runs.index(run) + 1,
                                               len(runs)),
              end='\r', flush=True)
        # examples is an np ndarray
        examples[idx] = vector.values
        classes.append(status)
        if visualize:
            # Prepare some more data if we are going to visualize
            sizes.append((result['dstat'].shape[0], status))
            figure_name = sample_interval + "_%s_" + str(idx)
            # Plot un-normalized data
            data_plot = result['dstat'].plot()
            fig = data_plot.get_figure()
            fig.savefig(os.sep.join(data_plots_folder +
                                    [figure_name % "downsampled"]))
            plt.close(fig)
            # Plot fixed size data
            fixed_plot = example.plot()
            fig = fixed_plot.get_figure()
            fig.savefig(os.sep.join(data_plots_folder +
                                    [figure_name % "fixedsize"]))
            plt.close(fig)
            # Plot unrolled data
            unrolled_plot = pd.Series(vector).plot()
            fig = unrolled_plot.get_figure()
            fig.savefig(os.sep.join(data_plots_folder +
                                    [figure_name % "unrolled"]))
            plt.close(fig)
        idx += 1
    if len(skips) > 0:
        print('Unable to train model because of missing runs %s' % skips)
        safe_runs = [run for run in runs if run.uuid not in skips]
        gather_results.save_run_uuids(dataset, safe_runs)
        print('The model has been updated to exclude those runs.')
        print('Please re-run the training step.')
        sys.exit(1)
    # Perform dataset-wise normalization
    # NOTE(andreaf) When we train the model we ignore any saved normalization
    # parameters, since the sample interval and features may be different.
    n_examples, normalization_params = normalize_dataset(examples, labels)
    # We do cache the result to normalize the prediction set.
    model_config['normalization_params'] = normalization_params
    gather_results.save_model_config(dataset, model_config)
    if visualize:
        for n in range(len(runs)):
            figure_name = sample_interval + "_%s_" + str(n)
            unrolled_norm_plot = pd.Series(n_examples[n]).plot()
            fig = unrolled_norm_plot.get_figure()
            fig.savefig(os.sep.join(data_plots_folder +
                                    [figure_name % "normalized"]))
            plt.close(fig)
        np_sizes = np.array(sizes)
        df = pd.DataFrame(np_sizes, columns=['size', 'status'])
        size_plot = df.plot.scatter(x='size', y='status')
        fig = size_plot.get_figure()
        fig.savefig(os.sep.join(data_plots_folder + ['sizes_by_result.png']))
        plt.close(fig)
    # Now do the training
    example_ids = [run.uuid for run in runs]
    classes = np.array(classes)
    print("\nTraining data shape: (%d, %d)" % n_examples.shape)
    if train:
        if debug:
            tf.logging.set_verbosity(tf.logging.DEBUG)
        config = tf.ConfigProto(log_device_placement=True)
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        model = svm_trainer.SVMTrainer(n_examples, example_ids, labels,
                                       classes, dataset_name=dataset,
                                       force_gpu=gpu)
        model.train(steps=steps)
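# Usage sketch (not from the source): local_trainer builds the example
# matrix from locally cached runs and trains the SVM-based model. Every
# argument value below is a placeholder assumption.
def _example_local_trainer_usage():
    local_trainer(train=True,
                  estimator='svm',
                  dataset='example-dataset',
                  sample_interval='1min',
                  features_regex='usr|used',
                  class_label='status',
                  visualize=False,
                  steps=300,
                  gpu=False,
                  debug=False)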
def train_model(build_name):
    global estimator
    dataset = estimator
    global model_dir
    with session_scope() as session:
        if not os.path.isfile(os.sep.join([model_dir, 'data', dataset,
                                           'runs.json.gz'])):
            runs = gather_results.get_runs_by_name(None,
                                                   build_name=build_name,
                                                   session=session)
            model_config = {'build_name': build_name}
            gather_results.save_model_config(dataset, model_config,
                                             data_path=model_dir)
            gather_results.save_run_uuids(dataset, runs, data_path=model_dir)
        else:
            runs = gather_results.load_run_uuids(dataset,
                                                 data_path=model_dir)
        normalized_length = 5500
        if estimator == 'svm':
            skips = []
            classes = []
            labels = []
            examples = []
            class_label = 'status'
            features_regex = None
            sample_interval = None
            idx = 0
            # Model configuration. We need to cache sample_interval,
            # features_regex and the normalization parameters for each
            # feature so we can re-use them during prediction.
            model_config = {
                'sample_interval': sample_interval,
                'features_regex': features_regex,
                'normalized_length': normalized_length
            }
            for run in runs:
                results = gather_results.get_subunit_results_for_run(
                    run, '1s', session=None, data_path=model_dir,
                    use_cache=True)
                print('Acquired run %s' % run.uuid)
                # For one run_uuid we must only get one example (result)
                result = results[0]
                if not result:
                    skips.append(run.uuid)
                    continue
                # Setup the numpy matrix and sizes
                if len(examples) == 0:
                    # Adjust normalized_length to the actual re-sampled one
                    examples = np.ndarray(
                        shape=(len(runs),
                               len(result['dstat'].columns) *
                               normalized_length))
                    model_config['num_columns'] = len(
                        result['dstat'].columns)
                    model_config['num_features'] = (
                        len(result['dstat'].columns) * normalized_length)
                # Normalize data
                example = fixed_lenght_example(result, normalized_length)
                # Normalize status
                status = get_class(result, class_label)
                vector, new_labels = unroll_example(
                    example, normalized_length, labels)
                # Only calculate labels for the first example
                if len(labels) == 0:
                    labels = new_labels
                    model_config['labels'] = labels
                # examples is an np ndarray
                examples[idx] = vector.values
                classes.append(status)
                idx += 1
            if len(skips) > 0:
                print('Unable to train model because of missing '
                      'runs %s' % skips)
                safe_runs = [run for run in runs if run.uuid not in skips]
                gather_results.save_run_uuids(dataset, safe_runs,
                                              data_path=model_dir)
                message = ('The model has been updated to exclude '
                           'those runs. Please re-run the training'
                           ' step.')
                abort(make_response(message, 400))

            def run_training():
                # Perform dataset-wise normalization
                # NOTE(andreaf) When we train the model we ignore any saved
                # normalization parameters, since the sample interval and
                # features may be different.
                n_examples, normalization_params = normalize_dataset(
                    examples, labels)
                # We do cache the result to normalize the prediction set.
                model_config['normalization_params'] = normalization_params
                gather_results.save_model_config(dataset, model_config,
                                                 data_path=model_dir)
                # Now do the training
                example_ids = [run.uuid for run in runs]
                outclasses = np.array(classes)
                svm_trainer.SVMTrainer(n_examples, example_ids, labels,
                                       outclasses, dataset_name=dataset,
                                       model_path=model_dir)

            thread = threading.Thread(target=run_training)
            thread.start()
            return "training started", 202
        else:
            def run_nn_training():
                for run in runs:
                    uuid = run.uuid
                    result = gather_results.get_subunit_results_for_run(
                        run, '1s', session=session, use_cache=False,
                        data_path=model_dir)[0]
                    try:
                        features, labels = nn_trainer.normalize_data(result)
                    except TypeError:
                        print('Unable to normalize data in run %s, '
                              'skipping' % uuid)
                        continue
                    nn_trainer.train_model(features, labels,
                                           dataset_name=dataset,
                                           model_path=model_dir)
                print('done')

            thread = threading.Thread(target=run_nn_training)
            thread.start()
            return "training started", 202
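# Wiring sketch (assumption, not from the source): train_model returns a
# (body, status) tuple and uses abort()/make_response(), which suggests it
# is served as a Flask view. The route path below is hypothetical; register
# it against whatever Flask app object the surrounding module defines.
def _example_register_train_route(app):
    @app.route('/train/<build_name>', methods=['POST'])
    def _train(build_name):
        return train_model(build_name)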
def prepare_dataset(dataset, normalized_length, num_dstat_features,
                    data_type, features_regex, sample_interval='1s',
                    class_label='status', visualize=False, data_path=None,
                    target_data_path=None, s3=None):
    """Load, filter and prepare the data for one dataset split.

    Loads the run ids from the dataset configuration.
    Loads the data (dsv + meta) for every run from cache.
    Builds the unrolled examples as a numpy ndarray.
    Builds the classes as a numpy array.
    Saves the data setup to the dataset config.
    Does some visualization (if enabled).
    """
    if visualize:
        data_plots_folder = [
            os.path.dirname(os.path.realpath(__file__)), os.pardir, 'data',
            dataset, 'plots'
        ]
        os.makedirs(os.sep.join(data_plots_folder), exist_ok=True)
    # Load the list of runs and base labels
    runs = gather_results.load_run_uuids(dataset, name=data_type,
                                         data_path=target_data_path, s3=s3)
    # run_uuids are the example_ids
    sizes = []
    # The data for each example.
    examples = examples_ndarray(len(runs), num_dstat_features,
                                normalized_length)
    # The test result for each example
    classes = []
    skips = []
    print("Loading %s data:" % data_type, end='\r', flush=True)
    for count, run in enumerate(runs):
        print("Loading %s data: %d of %d" % (data_type, count + 1,
                                             len(runs)),
              end='\r', flush=True)
        result = gather_results.get_subunit_results_for_run(
            run, sample_interval, data_path=data_path, s3=s3)
        # For one run_uuid we must only get one example (result)
        if not result:
            skips.append(run.uuid)
            continue
        # Apply column filtering
        result = filter_example(result, features_regex)
        # Normalize data
        example = fixed_lenght_example(result, normalized_length)
        vector = unroll_example(example, normalized_length)
        # Normalize status
        status = get_class(result, class_label)
        # examples is an np ndarray
        examples[count] = vector.values
        classes.append(status)
        # Plot some figures
        if visualize:
            # Prepare some more data if we are going to visualize
            sizes.append((result['dstat'].shape[0], status))
            figure_name = sample_interval + "_%s_" + str(count)
            # Plot un-normalized data
            data_plot = result['dstat'].plot()
            fig = data_plot.get_figure()
            fig.savefig(os.sep.join(data_plots_folder +
                                    [figure_name % "downsampled"]))
            plt.close(fig)
            # Plot fixed size data
            fixed_plot = example.plot()
            fig = fixed_plot.get_figure()
            fig.savefig(os.sep.join(data_plots_folder +
                                    [figure_name % "fixedsize"]))
            plt.close(fig)
            # Plot unrolled data
            unrolled_plot = pd.Series(vector).plot()
            fig = unrolled_plot.get_figure()
            fig.savefig(os.sep.join(data_plots_folder +
                                    [figure_name % "unrolled"]))
            plt.close(fig)
    print("Loading %s data: %d done!" % (data_type, len(runs)))
    # Check that everything went well
    if len(skips) > 0:
        print('Unable to train model because of missing runs %s' % skips)
        safe_runs = [run.uuid for run in runs if run.uuid not in skips]
        gather_results.save_run_uuids(dataset, safe_runs)
        print('The model has been updated to exclude those runs.')
        print('Please re-run the training step.')
        sys.exit(1)
    classes = np.array(classes)
    figure_sizes = np.array(sizes)
    example_ids = np.array(runs)
    print("%s set: examples: %s, classes: %s, example IDs: %s" % (
        data_type, str(examples.shape), str(classes.shape),
        str(example_ids.shape)))
    data = {
        'examples': examples,
        'example_ids': example_ids,
        'classes': classes
    }
    return data, figure_sizes