Example #1
def build_dataset(dataset, build_name, slicer, sample_interval, features_regex,
                  class_label, tdt_split, force, visualize, data_path,
                  target_data_path, s3_profile, s3_url, data_plots_folder,
                  aggregation_functions):
    # s3 support
    s3 = gather_results.get_s3_client(s3_url=s3_url, s3_profile=s3_profile)

    # Prevent overwrite by mistake
    if gather_results.load_model_config(
            dataset, data_path=target_data_path, s3=s3) and not force:
        print("Dataset %s already configured" % dataset)
        sys.exit(1)

    # Validate tdt-split
    training, dev, test = map(lambda x: x / 10, tdt_split)
    if not sum(tdt_split) == 10:
        print("Training (%d) + dev (%d) + test (%d) != 10" % tdt_split)
        sys.exit(1)

    # Load available run ids for the build name (from s3)
    runs = gather_results.load_run_uuids('.raw',
                                         name=build_name,
                                         data_path=data_path,
                                         s3=s3)

    # Apply the slice
    def slice_fn(x):
        return int(x.strip()) if x.strip() else None

    slice_object = slice(*map(slice_fn, slicer.split(":")))
    runs = np.array(runs[slice_object])
    print("Obtained %d runs for build %s" % (len(runs), build_name))

    # Split the runs into training, dev and test sets
    training_idx, dev_idx, test_idx = dataset_split_filters(
        len(runs), training, dev, data_path=target_data_path, s3=s3)
    np_runs = np.array(runs)
    # Saving dataset metadata
    gather_results.save_run_uuids(dataset,
                                  np_runs[training_idx],
                                  name='training',
                                  data_path=target_data_path,
                                  s3=s3)
    gather_results.save_run_uuids(dataset,
                                  np_runs[dev_idx],
                                  name='dev',
                                  data_path=target_data_path,
                                  s3=s3)
    gather_results.save_run_uuids(dataset,
                                  np_runs[test_idx],
                                  name='test',
                                  data_path=target_data_path,
                                  s3=s3)

    # Calculate normalized and filtered dimensions and labels
    normalized_length, num_dstat_features, labels = \
        data_sizes_and_labels(runs[0], features_regex, sample_interval,
                              aggregation_functions=aggregation_functions,
                              data_path=data_path, s3=s3)
    model_config = {
        'build_name': build_name,
        'sample_interval': sample_interval,
        'features_regex': features_regex,
        'class_label': class_label,
        'aggregation_functions': aggregation_functions,
        'training_set': training,
        'dev_set': dev,
        'test_set': test,
        'normalized_length': normalized_length,
        'labels': labels,
        'num_columns': num_dstat_features,
        'num_features': len(labels)
    }

    # Save the config and complete list of run uuids
    gather_results.save_run_uuids(dataset,
                                  runs,
                                  data_path=target_data_path,
                                  s3=s3)
    gather_results.save_model_config(dataset,
                                     model_config,
                                     data_path=target_data_path,
                                     s3=s3)
    print("Stored %d run IDs in the model %s config" % (len(runs), dataset))

    # Resolve the aggregation function names to functions
    resolved_agg_fn = [
        resolve_aggregation_function(x) for x in aggregation_functions
    ]

    datasets = {}
    # Training must come first so we calculate normalization params
    for data_type in ['training', 'dev', 'test']:
        data, _figure_sizes = prepare_dataset(
            dataset,
            normalized_length,
            num_dstat_features,
            data_type,
            features_regex=features_regex,
            sample_interval=sample_interval,
            class_label=class_label,
            aggregation_functions=resolved_agg_fn,
            visualize=visualize,
            data_path=data_path,
            target_data_path=target_data_path,
            s3=s3)
        datasets[data_type] = data
        examples = data['examples']
        if len(examples) == 0:
            continue

        # Perform dataset-wise normalization
        if data_type == 'training':
            n_examples, normalization_params = normalize_dataset(
                examples, labels)

            # We cache normalization parameters from the training data set
            # to normalize the dev and test set, as well as other input data
            model_config['normalization_params'] = normalization_params
            gather_results.save_model_config(dataset,
                                             model_config,
                                             data_path=target_data_path,
                                             s3=s3)

            # Save figure sizes as well for training only
            figure_sizes = _figure_sizes
        else:
            # Perform dataset-wise normalization
            n_examples, normalization_params = normalize_dataset(
                examples, labels, model_config['normalization_params'])

        # Replace examples with normalized ones
        datasets[data_type]['examples'] = n_examples

        # Store the normalized data to disk
        gather_results.save_dataset(dataset,
                                    name=data_type,
                                    data_path=target_data_path,
                                    s3=s3,
                                    **datasets[data_type])

    # Plot some more figures
    if visualize and not aggregation_functions:
        for n in range(n_examples.shape[0]):
            figure_name = sample_interval + "_%s_" + str(n)
            unrolled_norm_plot = pd.Series(n_examples[n]).plot()
            fig = unrolled_norm_plot.get_figure()
            axes = plt.gca()
            axes.set_ylim([-1, 1])
            fig.savefig(
                os.sep.join([data_plots_folder] +
                            [figure_name % "normalized"]))
            plt.close(fig)

        df = pd.DataFrame(figure_sizes, columns=['size', 'status'])
        size_plot = df.plot.scatter(x='size', y='status')
        fig = size_plot.get_figure()
        fig.savefig(os.sep.join([data_plots_folder] + ['sizes_by_result.png']))
        plt.close(fig)

    # Store labels to disk
    gather_results.save_dataset(dataset,
                                name='labels',
                                data_path=target_data_path,
                                s3=s3,
                                labels=labels)
    print("Done creating dataset %s" % model_config)
Example #2
def local_trainer(dataset, experiment, eval_dataset, gpu, debug, data_path,
                  s3_profile, s3_url):
    # s3 support. When using s3, the dataset and the experiment must be
    # stored in the same bucket
    s3 = gather_results.get_s3_client(s3_url=s3_url, s3_profile=s3_profile)

    # Load experiment data
    experiment_data = gather_results.load_experiment(experiment,
                                                     data_path=data_path,
                                                     s3=s3)
    if not experiment_data:
        print("Experiment %s not found" % experiment)
        sys.exit(1)

    # Load dataset data
    dataset_data = gather_results.load_model_config(dataset,
                                                    data_path=data_path,
                                                    s3=s3)
    if not dataset_data:
        print("Dataset %s not found" % dataset)
        sys.exit(1)

    # Read hyper_params and params
    estimator = experiment_data['estimator']
    hyper_params = experiment_data['hyper_params']
    params = experiment_data['params']
    steps = int(hyper_params['steps'])
    num_epochs = int(hyper_params['epochs'])
    batch_size = int(hyper_params['batch_size'])
    optimizer = hyper_params['optimizer']
    learning_rate = float(hyper_params['learning_rate'])
    class_label = dataset_data['class_label']

    if debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)

    # Load the normalized data
    labels = gather_results.load_dataset(dataset,
                                         'labels',
                                         data_path=data_path,
                                         s3=s3)['labels']
    training_data = gather_results.load_dataset(dataset,
                                                'training',
                                                data_path=data_path,
                                                s3=s3)
    test_data = gather_results.load_dataset(dataset,
                                            'test',
                                            data_path=data_path,
                                            s3=s3)
    print("Training data shape: (%d, %d)" % training_data['examples'].shape)

    if class_label == 'node_provider':
        label_vocabulary = set([
            'rax', 'ovh', 'packethost-us-west-1', 'vexxhost',
            'limestone-regionone', 'inap-mtl01', 'fortnebula-regionone'
        ])
    elif class_label == 'node_provider_all':
        label_vocabulary = set([
            'rax-iad', 'ovh-bhs1', 'packethost-us-west-1', 'rax-dfw',
            'vexxhost-ca-ymq-1', 'ovh-gra1', 'limestone-regionone',
            'inap-mtl01', 'rax-ord', 'vexxhost-sjc1', 'fortnebula-regionone'
        ])
    else:
        label_vocabulary = None

    # Get the estimator
    model_dir = gather_results.get_model_folder(dataset, experiment)
    estimator = tf_trainer.get_estimator(
        estimator,
        hyper_params,
        params,
        labels,
        model_dir,
        optimizer=_OPTIMIZER_CLS_NAMES[optimizer](learning_rate=learning_rate),
        label_vocabulary=label_vocabulary,
        gpu=gpu)

    def train_and_eval():
        # Train
        tf_trainer.get_training_method(estimator)(
            input_fn=tf_trainer.get_input_fn(shuffle=True,
                                             batch_size=batch_size,
                                             num_epochs=num_epochs,
                                             labels=labels,
                                             **training_data),
            steps=steps)
        # Eval on the experiment dataset + any other requested
        eval_sets = [dataset]
        eval_sets.extend(eval_dataset)
        for eval_dataset_name in eval_sets:
            eval_data = gather_results.load_dataset(eval_dataset_name,
                                                    'test',
                                                    data_path=data_path,
                                                    s3=s3)
            eval_size = len(eval_data['example_ids'])

            # Run tf evaluation and store the metrics
            print("Evaluation data shape: (%d, %d)" %
                  eval_data['examples'].shape)
            eval_loss = estimator.evaluate(
                input_fn=tf_trainer.get_input_fn(batch_size=eval_size,
                                                 num_epochs=1,
                                                 labels=labels,
                                                 **eval_data),
                name=eval_dataset_name)
            # Save and log the loss
            print('Training eval data for %s: %r' %
                  (eval_dataset_name, eval_loss))
            eval_name = "eval_" + eval_dataset_name
            gather_results.save_data_json(dataset,
                                          eval_loss,
                                          eval_name,
                                          sub_folder=experiment)

        # Run a prediction on the "dev" set, which we use as prod, and store it
        prod_data = gather_results.load_dataset(dataset,
                                                'dev',
                                                data_path=data_path,
                                                s3=s3)
        prod_size = len(prod_data['example_ids'])

        prediction = estimator.predict(input_fn=tf_trainer.get_input_fn(
            batch_size=prod_size, num_epochs=1, labels=labels, **prod_data))

        # Convert bytes fields to string for serialization
        serializable_pred = []
        for pred in prediction:
            _classes = pred['classes']
            pred['classes'] = [x.decode("utf-8") for x in _classes]
            serializable_pred.append(pred)

        prediction_name = "prediction_" + dataset
        pred_data = zip(prod_data['example_ids'], serializable_pred,
                        prod_data['classes'])
        gather_results.save_data_json(dataset, [x for x in pred_data],
                                      prediction_name,
                                      sub_folder=experiment)

    # Now do the training and evaluation
    if gpu:
        with tf.device('/gpu:0'):
            eval_loss = train_and_eval()
    else:
        eval_loss = train_and_eval()
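
A minimal invocation sketch for local_trainer, again with hypothetical names;
it assumes the dataset was created by build_dataset above and that an
experiment definition with estimator, hyper_params and params was stored
beforehand:

local_trainer(dataset='my-dataset',      # hypothetical dataset name
              experiment='dnn-baseline', # hypothetical experiment name
              eval_dataset=[],           # no extra datasets to evaluate on
              gpu=False,
              debug=True,                # enables TF debug logging
              data_path='data',
              s3_profile=None,
              s3_url=None)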
Example #3
def db_batch_predict(db_uri, dataset, slice, gpu, debug):
    """Run predict on all DB items on included in the dataset yet

    Takes a dataset and a build name. It builds the list of runs in the DB
    that fit the specified build name, and that are not yet used for training
    in the specified dataset. It runs prediction on all of them.
    """
    if debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    # Get the configuration for the model
    model_config = gather_results.load_model_config(dataset)
    # Get the list of runs from the dataset
    run_uuids = gather_results.load_run_uuids(dataset)
    # Get the list of runs from the DB
    runs = gather_results.get_runs_by_name(
        db_uri=db_uri, build_name=model_config['build_name'])
    # Run a predict loop, include all runs not in the train dataset
    predict_runs = [r for r in runs if r.uuid not in run_uuids]
    if len(predict_runs) == 0:
        print("Empty prediction set, nothing to do.")
        sys.exit(0)
    # Initialize the array
    examples = np.ndarray(
        shape=(len(predict_runs), model_config['num_features']))
    idx = 0
    classes = []
    labels = []
    print("All runs: %d, dataset size: %d, predict size: %d" % (
        len(runs), len(run_uuids), len(predict_runs)))
    for run in predict_runs:
        # This will also store new runs in the cache. In the future we may
        # want to train on those as well, but for now let's only predict
        results = gather_results.get_subunit_results_for_run(
            run, model_config['sample_interval'], db_uri=db_uri)
        for result in results:
            # Skip runs with no data
            if result is None:
                continue
            if model_config['features_regex']:
                df = result['dstat']
                col_regex = re.compile(model_config['features_regex'])
                result['dstat'] = df[list(filter(
                    col_regex.search, df.columns))]
            # Normalize examples
            vector, status, labels = trainer.normalize_example(
                result, model_config['normalized_length'],
                model_config['labels'])
            examples[idx] = vector.values
            classes.append(status)
            idx += 1
    # Normalize dataset
    n_examples, _ = trainer.normalize_dataset(
        examples, labels, params=model_config['normalization_params'])
    # Prepare other arrays
    classes = np.array(classes)
    run_uuids = [r.uuid for r in predict_runs]
    # Configure TF
    config = tf.ConfigProto(log_device_placement=True,)
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    # Now do the prediction
    model = svm_trainer.SVMTrainer(n_examples, run_uuids, labels,
                                   classes, dataset_name=dataset,
                                   force_gpu=gpu)
    predictions = model.predict()
    errors = []
    for prediction, actual in zip(predictions, classes):
        if prediction['classes'] != actual:
            errors.append((prediction, actual))
    print("Prediction of %d inputs completed." % len(classes))
    print("Input set composition: %d PASS, %s FAIL" % (
        len([x for x in classes if x == 0]),
        len([x for x in classes if x == 1])))
    if len(errors) > 0:
        print("There were some prediction errors: %s" % errors)
    else:
        print("All predicted correctly.")