Ejemplo n.º 1
0
def build_dataset(dataset, build_name, slicer, sample_interval, features_regex,
                  class_label, tdt_split, force, visualize, data_path,
                  target_data_path, s3_profile, s3_url, data_plots_folder,
                  aggregation_functions):
    """Build a dataset out of the stored runs of a build and save it.

    The run ids of ``build_name`` are loaded, sliced, split into training,
    dev and test sets and saved under ``dataset``. Each set is then prepared
    with ``prepare_dataset``, normalized and stored. Normalization parameters
    are computed on the training set only and re-used for dev and test.

    :param dataset: name under which configuration, run ids and data are
        saved
    :param build_name: name used to load the raw run ids
    :param slicer: ``start:stop:step`` string applied to the list of runs;
        empty fields are treated as ``None``
    :param sample_interval: forwarded to the data loading helpers and used
        as prefix of the plot file names
    :param features_regex: forwarded to the data loading helpers
    :param class_label: forwarded to ``prepare_dataset``
    :param tdt_split: three numbers (training, dev, test) that must add up
        to 10
    :param force: when false, an already configured dataset aborts the build
    :param visualize: when true (and no aggregation functions are used) the
        normalized examples and the example sizes are plotted
    :param data_path: location the raw runs are loaded from
    :param target_data_path: location the dataset is saved to
    :param s3_profile: passed to ``gather_results.get_s3_client``
    :param s3_url: passed to ``gather_results.get_s3_client``
    :param data_plots_folder: folder where the plots are written
    :param aggregation_functions: names of aggregation functions, resolved
        through ``resolve_aggregation_function``

    The process exits with status 1 when the dataset already exists (and
    ``force`` is not set), when the split is invalid, when no runs are
    selected, or when dev/test data exist but the training set is empty.
    """
    # s3 support
    s3 = gather_results.get_s3_client(s3_url=s3_url, s3_profile=s3_profile)

    # Prevent overwrite by mistake
    existing_config = gather_results.load_model_config(
        dataset, data_path=target_data_path, s3=s3)
    if existing_config and not force:
        print("Dataset %s already configured" % dataset)
        sys.exit(1)

    # Validate tdt-split. The split is turned into a tuple first because
    # "%" formatting with several placeholders only accepts a tuple.
    split = tuple(tdt_split)
    training, dev, test = (part / 10 for part in split)
    if sum(split) != 10:
        print("Training (%d) + dev (%d) + test (%d) != 10" % split)
        sys.exit(1)

    # Load available run ids for the build name (from s3)
    runs = gather_results.load_run_uuids('.raw',
                                         name=build_name,
                                         data_path=data_path,
                                         s3=s3)

    # Apply the slice
    def slice_fn(x):
        return int(x.strip()) if x.strip() else None

    slice_object = slice(*map(slice_fn, slicer.split(":")))
    runs = np.array(runs[slice_object])
    print("Obtained %d runs for build %s" % (len(runs), build_name))

    # Without runs there is no first example to size the dataset from, so
    # stop before anything is written.
    if len(runs) == 0:
        print("No runs selected for build %s, nothing to build" % build_name)
        sys.exit(1)

    # Split the runs in training, dev and test
    training_idx, dev_idx, test_idx = dataset_split_filters(
        len(runs), training, dev, data_path=target_data_path, s3=s3)

    # Saving dataset metadata
    split_filters = (('training', training_idx),
                     ('dev', dev_idx),
                     ('test', test_idx))
    for split_name, split_idx in split_filters:
        gather_results.save_run_uuids(dataset,
                                      runs[split_idx],
                                      name=split_name,
                                      data_path=target_data_path,
                                      s3=s3)

    # Calculate normalized and filtered dimensions and labels
    normalized_length, num_dstat_features, labels = \
        data_sizes_and_labels(runs[0], features_regex, sample_interval,
                              aggregation_functions=aggregation_functions,
                              data_path=data_path, s3=s3)
    model_config = {
        'build_name': build_name,
        'sample_interval': sample_interval,
        'features_regex': features_regex,
        'class_label': class_label,
        'aggregation_functions': aggregation_functions,
        'training_set': training,
        'dev_set': dev,
        'test_set': test,
        'normalized_length': normalized_length,
        'labels': labels,
        'num_columns': num_dstat_features,
        'num_features': len(labels)
    }

    # Save the config and complete list of run uuids
    gather_results.save_run_uuids(dataset,
                                  runs,
                                  data_path=target_data_path,
                                  s3=s3)
    gather_results.save_model_config(dataset,
                                     model_config,
                                     data_path=target_data_path,
                                     s3=s3)
    print("Stored %d run IDs in the model %s config" % (len(runs), dataset))

    # Resolve the aggregation function names to functions
    resolved_agg_fn = [
        resolve_aggregation_function(x) for x in aggregation_functions or ()
    ]

    datasets = {}
    # Only set once the corresponding data has actually been produced, so
    # the plotting step below can tell whether there is anything to plot.
    figure_sizes = None
    n_examples = None
    # Training must come first so we calculate normalization params
    for data_type in ['training', 'dev', 'test']:
        data, _figure_sizes = prepare_dataset(
            dataset,
            normalized_length,
            num_dstat_features,
            data_type,
            features_regex=features_regex,
            sample_interval=sample_interval,
            class_label=class_label,
            aggregation_functions=resolved_agg_fn,
            visualize=visualize,
            data_path=data_path,
            target_data_path=target_data_path,
            s3=s3)
        datasets[data_type] = data
        examples = data['examples']
        if len(examples) == 0:
            continue

        # Perform dataset-wise normalization
        if data_type == 'training':
            n_examples, normalization_params = normalize_dataset(
                examples, labels)

            # We cache normalization parameters from the training data set
            # to normalize the dev and test set, as well as other input data
            model_config['normalization_params'] = normalization_params
            gather_results.save_model_config(dataset,
                                             model_config,
                                             data_path=target_data_path,
                                             s3=s3)

            # Save figure sizes as well for training only
            figure_sizes = _figure_sizes
        else:
            # The parameters only exist if the training set was not empty
            if 'normalization_params' not in model_config:
                print("Cannot normalize the %s set: the training set is "
                      "empty, no normalization parameters are available" %
                      data_type)
                sys.exit(1)
            n_examples, _ = normalize_dataset(
                examples, labels, model_config['normalization_params'])

        # Replace examples with normalized ones
        datasets[data_type]['examples'] = n_examples

        # Store the normalized data to disk
        gather_results.save_dataset(dataset,
                                    name=data_type,
                                    data_path=target_data_path,
                                    s3=s3,
                                    **datasets[data_type])

    # Plot some more figures
    if visualize and not aggregation_functions:
        # NOTE(review): n_examples is the set normalized last (test when it
        # is not empty) while figure_sizes comes from training - confirm
        # which set these plots are meant to show.
        if n_examples is not None:
            for n in range(n_examples.shape[0]):
                figure_name = sample_interval + "_%s_" + str(n)
                unrolled_norm_plot = pd.Series(n_examples[n]).plot()
                fig = unrolled_norm_plot.get_figure()
                axes = plt.gca()
                axes.set_ylim([-1, 1])
                fig.savefig(
                    os.sep.join([data_plots_folder] +
                                [figure_name % "normalized"]))
                plt.close(fig)

        if figure_sizes is not None:
            df = pd.DataFrame(figure_sizes, columns=['size', 'status'])
            size_plot = df.plot.scatter(x='size', y='status')
            fig = size_plot.get_figure()
            fig.savefig(
                os.sep.join([data_plots_folder] + ['sizes_by_result.png']))
            plt.close(fig)

    # Store labels to disk
    gather_results.save_dataset(dataset,
                                name='labels',
                                data_path=target_data_path,
                                s3=s3,
                                labels=labels)
    print("Done creating dataset %s" % model_config)
Ejemplo n.º 2
0
def prepare_dataset(dataset,
                    normalized_length,
                    num_dstat_features,
                    data_type,
                    features_regex,
                    sample_interval='1s',
                    class_label='status',
                    aggregation_functions=None,
                    visualize=False,
                    data_path=None,
                    target_data_path=None,
                    s3=None):
    """Load the runs of one part of a dataset and build its examples.

    Loads the run ids stored for ``data_type`` in the dataset.
    Loads the data for every run.
    Builds the unrolled examples as a numpy ndarray.
    Builds the classes as a numpy array.
    Does some visualization (if enabled).

    :param dataset: name of the dataset the run ids belong to
    :param normalized_length: length every example is brought to
    :param num_dstat_features: used, with ``normalized_length``, to size the
        examples array
    :param data_type: name of the run list to load (the caller in this file
        passes 'training', 'dev' or 'test')
    :param features_regex: passed to ``filter_example``
    :param sample_interval: passed to the result loader; also the prefix of
        the per-run plot file names
    :param class_label: passed to ``get_class``
    :param aggregation_functions: passed to ``fixed_lenght_example``; when
        set, per-run plots are replaced by one scatter plot of all examples
    :param visualize: enable the plots
    :param data_path: location the run data is loaded from
    :param target_data_path: location the run ids are loaded from
    :param s3: passed through to the ``gather_results`` helpers
    :returns: a tuple ``(data, figure_sizes)``; ``data`` is a dict with the
        keys 'examples', 'example_ids' and 'classes', ``figure_sizes`` is an
        array of ``(size, status)`` pairs (empty unless per-run plots are
        enabled)

    When data is missing for some runs, the run list is saved without them
    and the process exits with status 1.
    """
    data_plots_folder = None
    if visualize:
        data_plots_folder = [
            os.path.dirname(os.path.realpath(__file__)), os.pardir, 'data',
            dataset, 'plots'
        ]
        os.makedirs(os.sep.join(data_plots_folder), exist_ok=True)

    def save_plot(axes, file_name):
        """Save the figure that owns ``axes`` in the plots folder, close it."""
        fig = axes.get_figure()
        fig.savefig(os.sep.join(data_plots_folder + [file_name]))
        plt.close(fig)

    # Load the list of runs and base labels
    runs = gather_results.load_run_uuids(dataset,
                                         name=data_type,
                                         data_path=target_data_path,
                                         s3=s3)

    # run_uuids are the example_ids
    sizes = []
    # The data for each example.
    examples = examples_ndarray(len(runs), num_dstat_features,
                                normalized_length)

    # Per-run plots only make sense for non aggregated data
    plot_runs = visualize and not aggregation_functions

    # The test result for each example
    classes = []
    skips = []
    print("Loading %s data:" % data_type, end='\r', flush=True)
    for count, run in enumerate(runs):
        print("Loading %s data: %d of %d" % (data_type, count + 1, len(runs)),
              end='\r',
              flush=True)
        result = gather_results.get_subunit_results_for_run(
            run, sample_interval, data_path=data_path, s3=s3)
        # Runs without data are recorded and reported after the loop
        if not result:
            skips.append(run.uuid)
            continue

        # Apply column filtering
        result = filter_example(result, features_regex)

        # Normalize data
        example = fixed_lenght_example(result, normalized_length,
                                       aggregation_functions)

        vector = unroll_example(example, normalized_length)

        # Normalize status
        status = get_class(result, class_label)

        # Examples is an np ndarrays
        examples[count] = vector.values
        classes.append(status)

        # Plot from figures
        if plot_runs:
            # Prepare some more data if we are going to visualize
            sizes.append((result['dstat'].shape[0], status))
            figure_name = sample_interval + "_%s_" + str(count)
            # Plot un-normalized data
            save_plot(result['dstat'].plot(), figure_name % "downsampled")
            # Plot fixed size data
            save_plot(example.plot(), figure_name % "fixedsize")
            # Plot unrolled data
            save_plot(pd.Series(vector).plot(), figure_name % "unrolled")

    print("Loading %s data: %d done!" % (data_type, len(runs)))
    # Check that everything went well
    if len(skips) > 0:
        print('Unable to train model because of missing runs %s' % skips)
        safe_runs = [run.uuid for run in runs if run.uuid not in skips]
        # Write back to the same list the runs were loaded from. Without
        # name, data_path and s3 the default run list would be overwritten
        # with this subset instead.
        gather_results.save_run_uuids(dataset,
                                      safe_runs,
                                      name=data_type,
                                      data_path=target_data_path,
                                      s3=s3)
        print('The model has been updated to exclude those runs.')
        print('Please re-run the training step.')
        sys.exit(1)

    classes = np.array(classes)
    figure_sizes = np.array(sizes)
    example_ids = np.array(runs)

    print("%s set: examples: %s, classes: %s, example IDs: %s" %
          (data_type, str(examples.shape), str(
              classes.shape), str(example_ids.shape)))

    data = {
        'examples': examples,
        'example_ids': example_ids,
        'classes': classes
    }

    if visualize and aggregation_functions and len(examples) > 0:
        if len(aggregation_functions) > 3:
            # NOTE(review): the message says "skipped" but the process is
            # terminated - confirm that aborting is intended.
            print('Visualization skipped, cannot represent more than 3D')
            sys.exit(1)

        fig = plt.figure()
        if len(aggregation_functions) == 3:
            ax = fig.add_subplot(111, projection='3d')
        else:
            ax = fig.add_subplot(111)

        # Build a dict [class] -> [int ID]
        unique_classes = list(set(classes))
        dict_classes = {
            cls: class_id for class_id, cls in enumerate(unique_classes)
        }

        # Setup colours
        cm = plt.get_cmap('jet')
        cNorm = pltcolors.Normalize(vmin=0, vmax=len(unique_classes))
        scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=cm)

        # Scatter the data, one point per example coloured by its class
        for example, example_class in zip(examples, classes):
            ax.scatter(*example,
                       marker='o',
                       c=scalarMap.to_rgba(dict_classes[example_class]))

        # Set axis labels
        ax.set_xlabel(aggregation_functions[0].__name__)
        if len(aggregation_functions) > 1:
            ax.set_ylabel(aggregation_functions[1].__name__)
        if len(aggregation_functions) > 2:
            ax.set_zlabel(aggregation_functions[2].__name__)

        # Save the plot
        save_plot(ax, data_type + "_3d_plot")

    return data, figure_sizes
Ejemplo n.º 3
0
def db_batch_predict(db_uri, dataset, slice, gpu, debug):
    """Run predict on all DB items not included in the dataset yet

    Takes a dataset. It builds the list of runs in the DB that fit the build
    name stored in the dataset configuration, and that are not listed in the
    run ids of the dataset. It runs prediction on all of them and prints a
    summary of the prediction errors.

    :param db_uri: URI of the DB the runs and their results are read from
    :param dataset: name of the dataset whose configuration and run ids are
        loaded
    :param slice: selection applied to the list of runs to predict; either a
        slice object, a ``start:stop:step`` string (empty fields mean
        ``None``), or ``None`` / an empty string to keep all runs
    :param gpu: passed to the trainer as ``force_gpu``
    :param debug: when true, TF logging verbosity is set to DEBUG

    The process exits with status 0 when there is nothing to predict.
    """
    # Local import: the `slice` parameter shadows the builtin of that name
    import builtins

    if debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    # Get the configuration for the model
    model_config = gather_results.load_model_config(dataset)
    # Get the list of runs from the dataset
    run_uuids = gather_results.load_run_uuids(dataset)
    # Get the list of runs from the DB
    runs = gather_results.get_runs_by_name(
        db_uri=db_uri, build_name=model_config['build_name'])
    # Run a predict loop, include all runs not in the train dataset
    predict_runs = [r for r in runs if r.uuid not in run_uuids]

    # Apply the requested slice to the candidate runs
    run_slice = slice
    if run_slice is None or run_slice == '':
        run_slice = builtins.slice(None)
    elif isinstance(run_slice, str):
        bounds = [int(part) if part.strip() else None
                  for part in run_slice.split(":")]
        run_slice = builtins.slice(*bounds)
    predict_runs = predict_runs[run_slice]

    if len(predict_runs) == 0:
        print("Empty prediction set, nothing to do.")
        sys.exit(0)

    # Vectors, ids and classes are collected side by side so that they stay
    # aligned even when a run yields no usable result, or more than one.
    vectors = []
    example_ids = []
    classes = []
    labels = []
    print("All runs: %d, dataset size: %d, predict size: %d" % (
        len(runs), len(run_uuids), len(predict_runs)))

    # Compile the column filter once, it is the same for every result
    col_regex = None
    if model_config['features_regex']:
        col_regex = re.compile(model_config['features_regex'])

    for run in predict_runs:
        # This will also store new runs in cache. In future we may want to
        # train on those as well, but for now let's try to predict only
        results = gather_results.get_subunit_results_for_run(
            run, model_config['sample_interval'], db_uri=db_uri)
        for result in results or []:
            # Skip runs with no data
            if result is None:
                continue
            if col_regex is not None:
                df = result['dstat']
                result['dstat'] = df[list(filter(
                    col_regex.search, df.columns))]
            # Normalize examples
            vector, status, labels = trainer.normalize_example(
                result, model_config['normalized_length'],
                model_config['labels'])
            vectors.append(vector.values)
            example_ids.append(run.uuid)
            classes.append(status)

    if not vectors:
        print("Empty prediction set, nothing to do.")
        sys.exit(0)

    # One row per example; reshape enforces the configured feature count
    examples = np.array(vectors, dtype=float).reshape(
        len(vectors), model_config['num_features'])

    # Normalize dataset
    n_examples, _ = trainer.normalize_dataset(
        examples, labels, params=model_config['normalization_params'])
    # Prepare other arrays
    classes = np.array(classes)
    # Configure TF
    # NOTE(review): this config is built but never handed to the trainer.
    config = tf.ConfigProto(log_device_placement=True,)
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    # Now do the prediction
    model = svm_trainer.SVMTrainer(n_examples, example_ids, labels,
                                   classes, dataset_name=dataset,
                                   force_gpu=gpu)
    predictions = model.predict()
    errors = [(prediction, actual)
              for prediction, actual in zip(predictions, classes)
              if prediction['classes'] != actual]
    print("Prediction of %d inputs completed." % len(classes))
    print("Input set composition: %d PASS, %s FAIL" % (
        sum(1 for x in classes if x == 0),
        sum(1 for x in classes if x == 1)))
    if len(errors) > 0:
        print("There were some prediction errors: %s" % errors)
    else:
        print("All predicted correctly.")
Ejemplo n.º 4
0
def local_trainer(train, estimator, dataset, sample_interval, features_regex,
                  class_label, visualize, steps, gpu, debug):
    """Build the examples of a dataset, normalize them and train on them.

    Loads the results of every run of ``dataset``, brings each of them to a
    fixed length, unrolls it into a vector and normalizes the whole set.
    The model configuration (sample interval, features regex, sizes, labels
    and normalization parameters) is saved so that it can be re-used during
    prediction.

    :param train: when true, an SVM trainer is trained on the examples
    :param estimator: not used by this function
    :param dataset: name of the dataset whose run ids are loaded
    :param sample_interval: passed to the result loader; when set, the
        normalized length is recalculated for it. Also the prefix of the
        plot file names (an empty prefix is used when it is not set)
    :param features_regex: when set, only the 'dstat' columns whose name
        matches are kept
    :param class_label: passed to ``get_class``
    :param visualize: enable the plots
    :param steps: passed to the trainer's ``train``
    :param gpu: passed to the trainer as ``force_gpu``
    :param debug: when true (and training), TF logging is set to DEBUG

    When data is missing for some runs, the run list is saved without them
    and the process exits with status 1.
    """
    # Normalized length before resampling
    normalized_length = 5500
    if sample_interval:
        # Calculate the desired normalized length after resample
        normalized_length = get_downsampled_example_lenght(
            sample_interval, normalized_length)

    data_plots_folder = [
        os.path.dirname(os.path.realpath(__file__)), os.pardir, 'data',
        dataset, 'plots'
    ]
    os.makedirs(os.sep.join(data_plots_folder), exist_ok=True)

    def save_plot(axes, file_name):
        """Save the figure that owns ``axes`` in the plots folder, close it."""
        fig = axes.get_figure()
        fig.savefig(os.sep.join(data_plots_folder + [file_name]))
        plt.close(fig)

    # sample_interval may be unset (see above); plot names then get an empty
    # prefix instead of failing on None + str.
    figure_prefix = sample_interval or ""

    runs = gather_results.load_run_uuids(dataset)

    # run_uuids are the example_ids
    sizes = []
    # The data for each example. We don't know yet the pre-set shape, so
    # wait until the first result comes in
    examples = []

    # Model configuration. We need to cache sample_interval, features-regex and
    # the normalization parameters for each feature so we can re-use them
    # during prediction.
    model_config = {
        'sample_interval': sample_interval,
        'features_regex': features_regex,
        'normalized_length': normalized_length
    }

    # The column filter is the same for every run, compile it once
    col_regex = re.compile(features_regex) if features_regex else None

    # The test result for each example
    classes = []
    labels = []
    idx = 0
    skips = []
    for position, run in enumerate(runs, start=1):
        results = gather_results.get_subunit_results_for_run(
            run, sample_interval)
        # For one run_uuid we must only get one example (result). An empty
        # list of results is handled like a missing result.
        result = results[0] if results else None
        if not result:
            skips.append(run.uuid)
            continue
        # Filtering by columns
        if col_regex is not None:
            df = result['dstat']
            result['dstat'] = df[list(filter(col_regex.search, df.columns))]
        # Setup the numpy matrix and sizes
        if len(examples) == 0:
            num_columns = len(result['dstat'].columns)
            num_features = num_columns * normalized_length
            examples = np.ndarray(shape=(len(runs), num_features))
            model_config['num_columns'] = num_columns
            model_config['num_features'] = num_features
        # Normalize data
        example = fixed_lenght_example(result, normalized_length)
        # Normalize status
        status = get_class(result, class_label)
        vector, new_labels = unroll_example(example, normalized_length, labels)
        # Only calculate labels for the first example
        if len(labels) == 0:
            labels = new_labels
            model_config['labels'] = labels
        print("Normalized example %d of %d" % (position, len(runs)),
              end='\r',
              flush=True)
        # Examples is an np ndarrays
        examples[idx] = vector.values
        classes.append(status)
        if visualize:
            # Prepare some more data if we are going to visualize
            sizes.append((result['dstat'].shape[0], status))
            figure_name = figure_prefix + "_%s_" + str(idx)
            # Plot un-normalized data
            save_plot(result['dstat'].plot(), figure_name % "downsampled")
            # Plot fixed size data
            save_plot(example.plot(), figure_name % "fixedsize")
            # Plot unrolled data
            save_plot(pd.Series(vector).plot(), figure_name % "unrolled")
        idx += 1
    if len(skips) > 0:
        print('Unable to train model because of missing runs %s' % skips)
        safe_runs = [run for run in runs if run.uuid not in skips]
        gather_results.save_run_uuids(dataset, safe_runs)
        print('The model has been updated to exclude those runs.')
        print('Please re-run the training step.')
        sys.exit(1)
    # Perform dataset-wise normalization
    # NOTE(andreaf) When we train the model we ignore any saved normalization
    # parameter, since the sample interval and features may be different.
    n_examples, normalization_params = normalize_dataset(examples, labels)
    # We do cache the result to normalize the prediction set.
    model_config['normalization_params'] = normalization_params
    gather_results.save_model_config(dataset, model_config)
    if visualize:
        for n in range(len(runs)):
            figure_name = figure_prefix + "_%s_" + str(n)
            save_plot(pd.Series(n_examples[n]).plot(),
                      figure_name % "normalized")

        df = pd.DataFrame(np.array(sizes), columns=['size', 'status'])
        save_plot(df.plot.scatter(x='size', y='status'),
                  'sizes_by_result.png')

    # Now do the training
    example_ids = [run.uuid for run in runs]
    classes = np.array(classes)
    print("\nTraining data shape: (%d, %d)" % n_examples.shape)
    if train:
        if debug:
            tf.logging.set_verbosity(tf.logging.DEBUG)
        # NOTE(review): this config is built but never handed to the trainer.
        config = tf.ConfigProto(log_device_placement=True, )
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        model = svm_trainer.SVMTrainer(n_examples,
                                       example_ids,
                                       labels,
                                       classes,
                                       dataset_name=dataset,
                                       force_gpu=gpu)
        model.train(steps=steps)
Ejemplo n.º 5
0
def train_model(build_name):
    """Start training the model selected by the global ``estimator``.

    The list of runs is loaded from the runs file under ``model_dir`` or,
    when that file does not exist yet, fetched by build name and saved.
    For the 'svm' estimator all the examples are built first; for any other
    estimator the data of each run is loaded by the training thread itself.
    In both cases the remaining work is handed to a background thread.

    :param build_name: name of the build whose runs are used
    :returns: the tuple ``("training started", 202)``

    In the 'svm' case, when data is missing for some runs, the run list is
    saved without them and the request is aborted with a 400 response.
    """
    global estimator
    global model_dir
    # The dataset is named after the estimator
    dataset = estimator
    with session_scope() as session:
        runs_file = os.sep.join([model_dir, 'data', dataset, 'runs.json.gz'])
        if not os.path.isfile(runs_file):
            runs = gather_results.get_runs_by_name(None,
                                                   build_name=build_name,
                                                   session=session)
            model_config = {'build_name': build_name}
            gather_results.save_model_config(dataset, model_config,
                                             data_path=model_dir)
            gather_results.save_run_uuids(dataset, runs,
                                          data_path=model_dir)
        else:
            runs = gather_results.load_run_uuids(dataset,
                                                 data_path=model_dir)
        normalized_length = 5500
        if estimator == 'svm':
            skips = []
            classes = []
            labels = []
            examples = []
            class_label = 'status'
            features_regex = None
            sample_interval = None
            idx = 0
            # Model configuration. We need to cache sample_interval,
            # features-regex and the normalization parameters for each
            # feature so we can re-use them during prediction.
            # build_name is kept so that it is still part of the
            # configuration saved by the training thread.
            # NOTE(review): sample_interval is stored as None while the
            # data below is loaded with '1s' - confirm which one is meant.
            model_config = {
                'build_name': build_name,
                'sample_interval': sample_interval,
                'features_regex': features_regex,
                'normalized_length': normalized_length
            }
            for run in runs:
                results = gather_results.get_subunit_results_for_run(
                    run, '1s', session=None, data_path=model_dir,
                    use_cache=True)
                print('Acquired run %s' % run.uuid)
                # For one run_uuid we must only get one example (result).
                # An empty list of results counts as a missing result.
                result = results[0] if results else None
                if not result:
                    skips.append(run.uuid)
                    continue
                # Setup the numpy matrix and sizes on the first result
                if len(examples) == 0:
                    num_columns = len(result['dstat'].columns)
                    num_features = num_columns * normalized_length
                    examples = np.ndarray(shape=(len(runs), num_features))
                    model_config['num_columns'] = num_columns
                    model_config['num_features'] = num_features
                # The steps below must run for every run, not only for the
                # first one, otherwise most rows of examples are never set.
                # Normalize data
                example = fixed_lenght_example(result, normalized_length)
                # Normalize status
                status = get_class(result, class_label)
                vector, new_labels = unroll_example(
                    example, normalized_length, labels)
                # Only calculate labels for the first example
                if len(labels) == 0:
                    labels = new_labels
                    model_config['labels'] = labels
                # Examples is an np ndarrays
                examples[idx] = vector.values
                classes.append(status)
                idx += 1
            if len(skips) > 0:
                print('Unable to train model because of missing '
                      'runs %s' % skips)
                safe_runs = [
                    run for run in runs if run.uuid not in skips]
                gather_results.save_run_uuids(dataset, safe_runs,
                                              data_path=model_dir)
                message = ('The model has been updated to exclude '
                           'those runs. Please re-run the training'
                           ' step.')
                abort(make_response(message, 400))

            def run_training():
                # Perform dataset-wise normalization
                # NOTE(andreaf) When we train the model we ignore any saved
                # normalization
                # parameter, since the sample interval and features may be
                # different.
                n_examples, normalization_params = normalize_dataset(
                    examples, labels)
                # We do cache the result to normalize the prediction set.
                model_config['normalization_params'] = normalization_params
                gather_results.save_model_config(dataset, model_config,
                                                 data_path=model_dir)
                # Now do the training
                example_ids = [run.uuid for run in runs]
                outclasses = np.array(classes)
                # NOTE(review): the trainer is only constructed here, no
                # train call follows - confirm that this is enough.
                svm_trainer.SVMTrainer(n_examples, example_ids, labels,
                                       outclasses, dataset_name=dataset,
                                       model_path=model_dir)
            thread = threading.Thread(target=run_training)
            thread.start()
            return "training started", 202
        else:
            def run_nn_training():
                # NOTE(review): this thread keeps using `session` after the
                # enclosing session_scope block has been left - confirm the
                # session is still usable at that point.
                for run in runs:
                    uuid = run.uuid
                    results = gather_results.get_subunit_results_for_run(
                        run, '1s', session=session, use_cache=False,
                        data_path=model_dir)
                    # An empty list of results is treated like a None result
                    result = results[0] if results else None
                    try:
                        features, labels = nn_trainer.normalize_data(result)
                    except TypeError:
                        print('Unable to normalize data in run %s, '
                              'skipping' % uuid)
                        continue
                    nn_trainer.train_model(features, labels,
                                           dataset_name=dataset,
                                           model_path=model_dir)
                print('done')
            thread = threading.Thread(target=run_nn_training)
            thread.start()
            return "training started", 202
Ejemplo n.º 6
0
def prepare_dataset(dataset,
                    normalized_length,
                    num_dstat_features,
                    data_type,
                    features_regex,
                    sample_interval='1s',
                    class_label='status',
                    visualize=False,
                    data_path=None,
                    target_data_path=None,
                    s3=None):
    """Load the runs of one part of a dataset and build its examples.

    Loads the run ids stored for ``data_type`` in the dataset.
    Loads the data for every run.
    Builds the unrolled examples as a numpy ndarray.
    Builds the classes as a numpy array.
    Does some visualization (if enabled).

    :param dataset: name of the dataset the run ids belong to
    :param normalized_length: length every example is brought to
    :param num_dstat_features: used, with ``normalized_length``, to size the
        examples array
    :param data_type: name of the run list to load
    :param features_regex: passed to ``filter_example``
    :param sample_interval: passed to the result loader; also the prefix of
        the plot file names
    :param class_label: passed to ``get_class``
    :param visualize: enable the per-run plots
    :param data_path: location the run data is loaded from
    :param target_data_path: location the run ids are loaded from
    :param s3: passed through to the ``gather_results`` helpers
    :returns: a tuple ``(data, figure_sizes)``; ``data`` is a dict with the
        keys 'examples', 'example_ids' and 'classes', ``figure_sizes`` is an
        array of ``(size, status)`` pairs (empty unless ``visualize`` is set)

    When data is missing for some runs, the run list is saved without them
    and the process exits with status 1.
    """
    data_plots_folder = None
    if visualize:
        data_plots_folder = [
            os.path.dirname(os.path.realpath(__file__)), os.pardir, 'data',
            dataset, 'plots'
        ]
        os.makedirs(os.sep.join(data_plots_folder), exist_ok=True)

    def save_plot(axes, file_name):
        """Save the figure that owns ``axes`` in the plots folder, close it."""
        fig = axes.get_figure()
        fig.savefig(os.sep.join(data_plots_folder + [file_name]))
        plt.close(fig)

    # Load the list of runs and base labels
    runs = gather_results.load_run_uuids(dataset,
                                         name=data_type,
                                         data_path=target_data_path,
                                         s3=s3)

    # run_uuids are the example_ids
    sizes = []
    # The data for each example.
    examples = examples_ndarray(len(runs), num_dstat_features,
                                normalized_length)

    # The test result for each example
    classes = []
    skips = []
    print("Loading %s data:" % data_type, end='\r', flush=True)
    for count, run in enumerate(runs):
        print("Loading %s data: %d of %d" % (data_type, count + 1, len(runs)),
              end='\r',
              flush=True)
        result = gather_results.get_subunit_results_for_run(
            run, sample_interval, data_path=data_path, s3=s3)
        # Runs without data are recorded and reported after the loop
        if not result:
            skips.append(run.uuid)
            continue

        # Apply column filtering
        result = filter_example(result, features_regex)

        # Normalize data
        example = fixed_lenght_example(result, normalized_length)

        vector = unroll_example(example, normalized_length)

        # Normalize status
        status = get_class(result, class_label)

        # Examples is an np ndarrays
        examples[count] = vector.values
        classes.append(status)

        # Plot from figures
        if visualize:
            # Prepare some more data if we are going to visualize
            sizes.append((result['dstat'].shape[0], status))
            figure_name = sample_interval + "_%s_" + str(count)
            # Plot un-normalized data
            save_plot(result['dstat'].plot(), figure_name % "downsampled")
            # Plot fixed size data
            save_plot(example.plot(), figure_name % "fixedsize")
            # Plot unrolled data
            save_plot(pd.Series(vector).plot(), figure_name % "unrolled")

    print("Loading %s data: %d done!" % (data_type, len(runs)))
    # Check that everything went well
    if len(skips) > 0:
        print('Unable to train model because of missing runs %s' % skips)
        safe_runs = [run.uuid for run in runs if run.uuid not in skips]
        # Write back to the same list the runs were loaded from. Without
        # name, data_path and s3 the default run list would be overwritten
        # with this subset instead.
        gather_results.save_run_uuids(dataset,
                                      safe_runs,
                                      name=data_type,
                                      data_path=target_data_path,
                                      s3=s3)
        print('The model has been updated to exclude those runs.')
        print('Please re-run the training step.')
        sys.exit(1)

    classes = np.array(classes)
    figure_sizes = np.array(sizes)
    example_ids = np.array(runs)

    print("%s set: examples: %s, classes: %s, example IDs: %s" %
          (data_type, str(examples.shape), str(
              classes.shape), str(example_ids.shape)))

    data = {
        'examples': examples,
        'example_ids': example_ids,
        'classes': classes
    }

    return data, figure_sizes