Esempio n. 1
0
def mqtt_trainer():
    """Train an SVM model on every build announced over MQTT.

    Starts a ``listener.MQTTSubscribe`` thread subscribed to
    ``gearman-subunit/#`` on ``firehose.openstack.org`` and then loops
    forever: for each event taken from the queue it gathers the subunit
    results of the build, normalizes them, builds an
    ``svm_trainer.SVMTrainer`` and calls ``train()`` on it.

    This function never returns.
    """
    event_queue = queue.Queue()
    listen_thread = listener.MQTTSubscribe(event_queue,
                                           'firehose.openstack.org',
                                           'gearman-subunit/#')
    listen_thread.start()
    while True:
        # Blocks until the listener thread puts the next event on the queue.
        event = event_queue.get()
        results = gather_results.get_subunit_results(event['build_uuid'],
                                                     'mqtt-dataset', '1s',
                                                     default_db_uri)
        examples = []
        classes = []
        labels_list = []
        for result in results:
            # normalize_example is unpacked into three values here, like at
            # the other call sites in this file; the labels were previously
            # read without ever being bound.
            vector, status, labels = normalize_example(result)
            examples = vector
            classes = status
            labels_list = labels
            # Stop at the first result for which all three values are truthy.
            # NOTE(review): this relies on truthiness of whatever
            # normalize_example returns - confirm it behaves as intended for
            # array-like vectors and for a zero status.
            if vector and labels and status:
                break
        run_uuids = [event['build_uuid']] * len(examples)
        dstat_model = svm_trainer.SVMTrainer(examples, run_uuids, labels_list,
                                             classes)
        dstat_model.train()
Esempio n. 2
0
def mqtt_predict(db_uri, mqtt_hostname, topic, dataset, sample_interval,
                 build_name, debug, model_dir):
    """Run a prediction for every build announced on an MQTT topic.

    Subscribes to ``topic`` on ``mqtt_hostname`` through a
    ``listener.MQTTSubscribe`` thread and loops forever. For each received
    event the subunit results of the build are gathered (cache disabled,
    data stored under ``model_dir``) and a prediction is run on every
    result with an ``svm_trainer.SVMTrainer``.

    When ``debug`` is truthy, progress messages are printed and the
    TensorFlow log verbosity is raised to DEBUG.

    This function never returns.
    """
    pending_events = queue.Queue()
    if debug:
        print('Starting MQTT listener')
    subscriber = listener.MQTTSubscribe(pending_events, mqtt_hostname, topic)
    subscriber.start()
    if debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
        print('Entering main loop')
    while True:
        # Blocks until the subscriber thread delivers an event.
        event = pending_events.get()
        build_uuid = event['build_uuid']
        if debug:
            print('Received event with build uuid %s' % build_uuid)
        results = gather_results.get_subunit_results(
            build_uuid, dataset, sample_interval, db_uri, build_name,
            data_path=model_dir, use_cache=False)
        if results:
            message = 'Obtained dstat file for %s'
        else:
            message = 'Build uuid: %s is not of proper build_name, skipping'
        print(message % build_uuid)
        # An empty result set simply makes this loop a no-op.
        for res in results:
            vector, status, labels = trainer.normalize_example(res)
            svm_trainer.SVMTrainer(
                vector, [build_uuid] * len(results), labels, [status],
                dataset_name=dataset, model_path=model_dir).predict()
Esempio n. 3
0
def db_predict(db_uri, dataset, sample_interval, build_name, debug,
               build_uuid):
    """Run a prediction for a single build taken from the DB.

    Gathers the subunit results for ``build_uuid`` and runs
    ``svm_trainer.SVMTrainer.predict`` once per result. When ``debug`` is
    truthy the TensorFlow log verbosity is raised to DEBUG first.
    """
    if debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    results = gather_results.get_subunit_results(
        build_uuid, dataset, sample_interval, db_uri, build_name)
    # NOTE(review): the "skipping" message says "build_uuid" where
    # "build_name" was presumably meant - confirm before changing the text.
    outcome = ('Obtained dstat file for %s' if results else
               'Build uuid: %s is not of proper build_uuid, skipping')
    print(outcome % build_uuid)
    # An empty result set simply makes this loop a no-op.
    for res in results:
        vector, status, labels = trainer.normalize_example(res)
        svm_trainer.SVMTrainer(
            vector, [build_uuid] * len(results), labels, [status],
            dataset_name=dataset).predict()
Esempio n. 4
0
 def run_training():
     """Normalize the dataset, save the model config and build the trainer.

     Reads ``examples``, ``labels``, ``model_config``, ``dataset``,
     ``model_dir``, ``runs`` and ``classes`` as free names resolved outside
     this function.
     """
     # NOTE(andreaf) Any previously saved normalization parameter is
     # ignored at training time, since the sample interval and the features
     # may differ from the ones used back then.
     normalized, norm_params = normalize_dataset(examples, labels)
     # The fresh parameters are cached so that the prediction set can be
     # normalized the same way.
     model_config['normalization_params'] = norm_params
     gather_results.save_model_config(
         dataset, model_config, data_path=model_dir)
     # Build the trainer.
     # NOTE(review): the instance is not kept and no train() call is
     # visible in this fragment - confirm that this is intended.
     svm_trainer.SVMTrainer(
         normalized, [run.uuid for run in runs], labels, np.array(classes),
         dataset_name=dataset, model_path=model_dir)
Esempio n. 5
0
def _select_slice(items, spec):
    """Return the part of ``items`` selected by ``spec``.

    ``spec`` may be:

    * ``None`` or an empty string: ``items`` is returned unchanged;
    * a builtin ``slice`` object: applied as is;
    * an integer: the first ``spec`` items are kept;
    * a ``"start:stop[:step]"`` string, where an empty field means
      "unbounded" and a single number means "the first N items".

    Raises:
        ValueError: if a string ``spec`` is not a valid slice expression.
    """
    if spec is None or spec == '':
        return items
    if isinstance(spec, slice):
        return items[spec]
    if isinstance(spec, int):
        return items[:spec]
    fields = str(spec).split(':')
    if len(fields) > 3:
        raise ValueError("Invalid slice specification: %r" % (spec,))
    try:
        bounds = [int(field) if field.strip() else None for field in fields]
    except ValueError:
        raise ValueError("Invalid slice specification: %r" % (spec,))
    return items[slice(*bounds)]


def db_batch_predict(db_uri, dataset, slice, gpu, debug):
    """Run predict on all DB items not included in the dataset yet.

    Loads the model configuration saved for ``dataset``, builds the list of
    runs in the DB that fit the build name stored in that configuration and
    that are not part of the dataset, and runs prediction on all of them.
    A summary of the outcome is printed.

    Args:
        db_uri: URI of the DB to read the runs from.
        dataset: name of the dataset whose model is used for prediction.
        slice: restricts the prediction set; see ``_select_slice`` for the
            accepted forms. NOTE(review): the format used by callers is not
            visible here - confirm it matches one of the supported forms.
            The parameter shadows the builtin ``slice`` inside this
            function.
        gpu: forwarded to ``SVMTrainer`` as ``force_gpu``.
        debug: when truthy, raise the TensorFlow log verbosity to DEBUG.

    Exits the process with status 0 when there is nothing to predict.
    """
    if debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    # Get the configuration for the model
    model_config = gather_results.load_model_config(dataset)
    # Get the list of runs from the dataset
    run_uuids = gather_results.load_run_uuids(dataset)
    # Get the list of runs from the DB
    runs = gather_results.get_runs_by_name(
        db_uri=db_uri, build_name=model_config['build_name'])
    # Run a predict loop, include all runs not in the train dataset
    predict_runs = [r for r in runs if r.uuid not in run_uuids]
    predict_runs = _select_slice(predict_runs, slice)
    if len(predict_runs) == 0:
        print("Empty prediction set, nothing to do.")
        sys.exit(0)
    # Allocate one row per candidate run; rows left unused because a run
    # had no data are trimmed after the loop.
    examples = np.ndarray(
        shape=(len(predict_runs), model_config['num_features']))
    idx = 0
    classes = []
    labels = []
    # uuid of the run behind each stored example, kept aligned with
    # ``classes`` and with the rows of ``examples``.
    example_uuids = []
    print("All runs: %d, dataset size: %d, predict size: %d" % (
        len(runs), len(run_uuids), len(predict_runs)))
    # Compile the column filter once instead of once per result.
    features_regex = model_config['features_regex']
    col_regex = re.compile(features_regex) if features_regex else None
    for run in predict_runs:
        # This will also store new runs in cache. In future we may want to
        # train on those as well, but for now let's try to predict only
        results = gather_results.get_subunit_results_for_run(
            run, model_config['sample_interval'], db_uri=db_uri)
        for result in results:
            # Skip runs with no data
            if result is None:
                continue
            if col_regex is not None:
                df = result['dstat']
                result['dstat'] = df[list(filter(
                    col_regex.search, df.columns))]
            # Normalize examples
            vector, status, labels = trainer.normalize_example(
                result, model_config['normalized_length'],
                model_config['labels'])
            examples[idx] = vector.values
            classes.append(status)
            example_uuids.append(run.uuid)
            idx += 1
    if idx == 0:
        # Every candidate run was skipped for lack of data.
        print("Empty prediction set, nothing to do.")
        sys.exit(0)
    # Drop the rows that were never filled so that no uninitialised memory
    # is fed to the model.
    examples = examples[:idx]
    # Normalize dataset
    n_examples, _ = trainer.normalize_dataset(
        examples, labels, params=model_config['normalization_params'])
    # Prepare other arrays
    classes = np.array(classes)
    # Configure TF
    # NOTE(review): this config object is built but never handed to the
    # trainer - confirm whether it should be.
    config = tf.ConfigProto(log_device_placement=True,)
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    # Now do the prediction
    model = svm_trainer.SVMTrainer(n_examples, example_uuids, labels,
                                   classes, dataset_name=dataset,
                                   force_gpu=gpu)
    predictions = model.predict()
    errors = []
    for prediction, actual in zip(predictions, classes):
        if prediction['classes'] != actual:
            errors.append((prediction, actual))
    print("Prediction of %d inputs completed." % len(classes))
    print("Input set composition: %d PASS, %s FAIL" % (
        len([x for x in classes if x == 0]),
        len([x for x in classes if x == 1])))
    if len(errors) > 0:
        print("There were some prediction errors: %s" % errors)
    else:
        print("All predicted correctly.")
Esempio n. 6
0
def _save_figure(axes, path_parts):
    """Save the figure owning ``axes`` to the joined path, then close it."""
    fig = axes.get_figure()
    fig.savefig(os.sep.join(path_parts))
    plt.close(fig)


def local_trainer(train, estimator, dataset, sample_interval, features_regex,
                  class_label, visualize, steps, gpu, debug):
    """Build the training set for ``dataset`` and optionally train on it.

    Loads the runs of the dataset, turns the first result of each run into
    a fixed-length unrolled example, normalizes the whole set and saves the
    model configuration (sample interval, features regex, normalized
    length, labels, normalization parameters) for use during prediction.

    Args:
        train: when truthy, build an ``SVMTrainer`` and call
            ``train(steps=steps)`` on it.
        estimator: not used by this function.
        dataset: name of the dataset to load and to save the config for.
        sample_interval: resampling interval; when falsy the default
            normalized length is used without downsampling.
        features_regex: when set, only the dstat columns whose name matches
            this regular expression are kept.
        class_label: forwarded to ``get_class`` to compute the class of
            each example.
        visualize: when truthy, plots are saved under the dataset's
            ``plots`` folder.
        steps: number of training steps, forwarded to ``train``.
        gpu: forwarded to ``SVMTrainer`` as ``force_gpu``.
        debug: when truthy (and ``train`` is set), raise the TensorFlow log
            verbosity to DEBUG.

    Exits the process with status 1 when some runs have no data; the saved
    run list is updated to exclude them before exiting.
    """
    # Normalized length before resampling
    normalized_length = 5500
    if sample_interval:
        # Calculate the desired normalized length after resample
        normalized_length = get_downsampled_example_lenght(
            sample_interval, normalized_length)

    data_plots_folder = [
        os.path.dirname(os.path.realpath(__file__)), os.pardir, 'data',
        dataset, 'plots'
    ]
    os.makedirs(os.sep.join(data_plots_folder), exist_ok=True)
    runs = gather_results.load_run_uuids(dataset)
    total_runs = len(runs)

    # (size, status) pairs, only collected when visualizing
    sizes = []
    # The data for each example. We don't know yet the pre-set shape, so
    # wait until the first result comes in
    examples = []

    # Model configuration. We need to cache sample_interval, features-regex and
    # the normalization parameters for each feature so we can re-use them
    # during prediction.
    model_config = {
        'sample_interval': sample_interval,
        'features_regex': features_regex,
        'normalized_length': normalized_length
    }

    # The test result for each example
    classes = []
    labels = []
    idx = 0
    skips = []
    # Compile the column filter once instead of once per run.
    col_regex = re.compile(features_regex) if features_regex else None
    # enumerate gives the progress counter directly, without searching the
    # run list on every iteration.
    for position, run in enumerate(runs, start=1):
        results = gather_results.get_subunit_results_for_run(
            run, sample_interval)
        # For one run_uuid we must only get one example (result). A run
        # with no result at all is treated like a run with an empty result.
        result = results[0] if results else None
        if not result:
            skips.append(run.uuid)
            continue
        # Filtering by columns
        if col_regex is not None:
            df = result['dstat']
            result['dstat'] = df[list(filter(col_regex.search, df.columns))]
        # Setup the numpy matrix and sizes
        if len(examples) == 0:
            num_columns = len(result['dstat'].columns)
            # One row per run, one column per (dstat column, time step)
            examples = np.ndarray(
                shape=(total_runs, num_columns * normalized_length))
            model_config['num_columns'] = num_columns
            model_config['num_features'] = num_columns * normalized_length
        # Normalize data
        example = fixed_lenght_example(result, normalized_length)
        # Normalize status
        status = get_class(result, class_label)
        vector, new_labels = unroll_example(example, normalized_length, labels)
        # Only calculate labels for the first example
        if len(labels) == 0:
            labels = new_labels
            model_config['labels'] = labels
        print("Normalized example %d of %d" % (position, total_runs),
              end='\r',
              flush=True)
        # Examples is an np ndarray
        examples[idx] = vector.values
        classes.append(status)
        if visualize:
            # Prepare some more data if we are going to visualize
            sizes.append((result['dstat'].shape[0], status))
            # format() is used so that a sample_interval of None does not
            # break the file name construction.
            figure_name = "{}_{{}}_{}".format(sample_interval, idx)
            # Plot un-normalized data
            _save_figure(
                result['dstat'].plot(),
                data_plots_folder + [figure_name.format("downsampled")])
            # Plot fixed size data
            _save_figure(
                example.plot(),
                data_plots_folder + [figure_name.format("fixedsize")])
            # Plot unrolled data
            _save_figure(
                pd.Series(vector).plot(),
                data_plots_folder + [figure_name.format("unrolled")])
        idx += 1
    if len(skips) > 0:
        print('Unable to train model because of missing runs %s' % skips)
        safe_runs = [run for run in runs if run.uuid not in skips]
        gather_results.save_run_uuids(dataset, safe_runs)
        print('The model has been updated to exclude those runs.')
        print('Please re-run the training step.')
        sys.exit(1)
    # Perform dataset-wise normalization
    # NOTE(andreaf) When we train the model we ignore any saved normalization
    # parameter, since the sample interval and features may be different.
    n_examples, normalization_params = normalize_dataset(examples, labels)
    # We do cache the result to normalize the prediction set.
    model_config['normalization_params'] = normalization_params
    gather_results.save_model_config(dataset, model_config)
    if visualize:
        for n in range(total_runs):
            figure_name = "{}_{}_{}".format(sample_interval, "normalized", n)
            _save_figure(
                pd.Series(n_examples[n]).plot(),
                data_plots_folder + [figure_name])

        np_sizes = np.array(sizes)
        df = pd.DataFrame(np_sizes, columns=['size', 'status'])
        _save_figure(
            df.plot.scatter(x='size', y='status'),
            data_plots_folder + ['sizes_by_result.png'])

    # Now do the training
    example_ids = [run.uuid for run in runs]
    classes = np.array(classes)
    print("\nTraining data shape: (%d, %d)" % n_examples.shape)
    if train:
        if debug:
            tf.logging.set_verbosity(tf.logging.DEBUG)
        # NOTE(review): this config object is built but never handed to the
        # trainer - confirm whether it should be.
        config = tf.ConfigProto(log_device_placement=True, )
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        model = svm_trainer.SVMTrainer(n_examples,
                                       example_ids,
                                       labels,
                                       classes,
                                       dataset_name=dataset,
                                       force_gpu=gpu)
        model.train(steps=steps)