Example #1
0
def setup_experiment(experiment, estimator, hidden_layers, steps, batch_size,
                     epochs, optimizer, learning_rate, force, data_path,
                     s3_profile, s3_url):
    """Define experiment parameters and hyper parameters

    Builds the experiment configuration (estimator name, params and hyper
    parameters) and stores it via gather_results.save_experiment. Exits
    with status 1 if the experiment already exists and ``force`` is falsy.

    Supported optimizers:
    * 'Adagrad': Returns an `AdagradOptimizer`.
    * 'Adam': Returns an `AdamOptimizer`.
    * 'Ftrl': Returns an `FtrlOptimizer`.
    * 'RMSProp': Returns an `RMSPropOptimizer`.
    * 'SGD': Returns a `GradientDescentOptimizer`.
    """
    # s3 support, only for loading the dataset
    s3 = gather_results.get_s3_client(s3_url=s3_url, s3_profile=s3_profile)

    # Prevent overwrite by mistake
    if gather_results.load_experiment(experiment, data_path=data_path,
                                      s3=s3) and not force:
        print("Experiment %s already configured" % experiment)
        sys.exit(1)
    params = {}
    hyper_params = {
        'steps': steps,
        'batch_size': batch_size,
        'epochs': epochs,
        # hidden_layers is a '/'-separated list of layer sizes, e.g. "128/64"
        'hidden_units': [int(x) for x in hidden_layers.split('/')],
        'optimizer': optimizer,
        'learning_rate': learning_rate
    }
    experiment_data = {
        'estimator': estimator,
        'params': params,
        'hyper_params': hyper_params
    }
    # Store the experiment to disk
    gather_results.save_experiment(experiment_data,
                                   experiment,
                                   data_path=data_path,
                                   s3=s3)
    print("Experiment %s saved successfully." % experiment)
    print("\testimator: %s" % estimator)
    print("\tparameters: %s" % params)
    print("\thyper parameters: %s" % hyper_params)
Example #2
0
def local_trainer(dataset, experiment, eval_dataset, gpu, debug, data_path,
                  s3_profile, s3_url):
    """Train and evaluate a TF estimator locally for a stored experiment.

    Loads the experiment and dataset configuration, builds the estimator,
    trains on the 'training' split, evaluates on the experiment dataset's
    'test' split plus any extra datasets in ``eval_dataset``, then runs a
    prediction pass on the 'dev' split and stores all results as JSON.
    Exits with status 1 if the experiment or dataset cannot be loaded.
    """
    # s3 support. When both using s3, dataset and experiment must stored
    # in the same bucket
    s3 = gather_results.get_s3_client(s3_url=s3_url, s3_profile=s3_profile)

    # Load experiment data
    experiment_data = gather_results.load_experiment(experiment,
                                                     data_path=data_path,
                                                     s3=s3)
    if not experiment_data:
        print("Experiment %s not found" % experiment)
        sys.exit(1)

    # Load dataset data
    dataset_data = gather_results.load_model_config(dataset,
                                                    data_path=data_path,
                                                    s3=s3)
    if not dataset_data:
        print("Dataset %s not found" % dataset)
        sys.exit(1)

    # Read hyper_params and params
    # NOTE: at this point `estimator` is the estimator *name* (a string);
    # it is rebound to the actual Estimator object further below.
    estimator = experiment_data['estimator']
    hyper_params = experiment_data['hyper_params']
    params = experiment_data['params']
    steps = int(hyper_params['steps'])
    num_epochs = int(hyper_params['epochs'])
    batch_size = int(hyper_params['batch_size'])
    optimizer = hyper_params['optimizer']
    learning_rate = float(hyper_params['learning_rate'])
    class_label = dataset_data['class_label']

    if debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)

    # Load the normalized data
    labels = gather_results.load_dataset(dataset,
                                         'labels',
                                         data_path=data_path,
                                         s3=s3)['labels']
    training_data = gather_results.load_dataset(dataset,
                                                'training',
                                                data_path=data_path,
                                                s3=s3)
    test_data = gather_results.load_dataset(dataset,
                                            'test',
                                            data_path=data_path,
                                            s3=s3)
    print("Training data shape: (%d, %d)" % training_data['examples'].shape)

    # Classification targets: fixed vocabulary of CI node providers when
    # classifying by provider, otherwise let the estimator infer labels.
    if class_label == 'node_provider':
        label_vocabulary = set([
            'rax', 'ovh', 'packethost-us-west-1', 'vexxhost',
            'limestone-regionone', 'inap-mtl01', 'fortnebula-regionone'
        ])
    elif class_label == 'node_provider_all':
        label_vocabulary = set([
            'rax-iad', 'ovh-bhs1', 'packethost-us-west-1', 'rax-dfw',
            'vexxhost-ca-ymq-1', 'ovh-gra1', 'limestone-regionone',
            'inap-mtl01', 'rax-ord', 'vexxhost-sjc1', 'fortnebula-regionone'
        ])
    else:
        label_vocabulary = None

    # Get the estimator (rebinds `estimator` from name string to object)
    model_dir = gather_results.get_model_folder(dataset, experiment)
    estimator = tf_trainer.get_estimator(
        estimator,
        hyper_params,
        params,
        labels,
        model_dir,
        optimizer=_OPTIMIZER_CLS_NAMES[optimizer](learning_rate=learning_rate),
        label_vocabulary=label_vocabulary,
        gpu=gpu)

    def train_and_eval():
        # Train, evaluate, predict; store eval metrics and predictions as
        # JSON side effects. Returns None.
        # Train
        tf_trainer.get_training_method(estimator)(
            input_fn=tf_trainer.get_input_fn(shuffle=True,
                                             batch_size=batch_size,
                                             num_epochs=num_epochs,
                                             labels=labels,
                                             **training_data),
            steps=steps)
        # Eval on the experiment dataset + any other requested
        eval_sets = [dataset]
        eval_sets.extend(eval_dataset)
        for eval_dataset_name in eval_sets:
            eval_data = gather_results.load_dataset(eval_dataset_name,
                                                    'test',
                                                    data_path=data_path,
                                                    s3=s3)
            # Single batch covering the full eval set (num_epochs=1 below)
            eval_size = len(eval_data['example_ids'])

            # Run tf evaluation and store the metrics
            print("Evaluation data shape: (%d, %d)" %
                  eval_data['examples'].shape)
            eval_loss = estimator.evaluate(
                input_fn=tf_trainer.get_input_fn(batch_size=eval_size,
                                                 num_epochs=1,
                                                 labels=labels,
                                                 **eval_data),
                name=eval_dataset_name)
            # Saving and Logging loss
            print('Training eval data for %s: %r' %
                  (eval_dataset_name, eval_loss))
            eval_name = "eval_" + eval_dataset_name
            gather_results.save_data_json(dataset,
                                          eval_loss,
                                          eval_name,
                                          sub_folder=experiment)

        # Run a prediction on the "dev" set, which we use as prod, and store it
        prod_data = gather_results.load_dataset(dataset,
                                                'dev',
                                                data_path=data_path,
                                                s3=s3)
        prod_size = len(prod_data['example_ids'])

        prediction = estimator.predict(input_fn=tf_trainer.get_input_fn(
            batch_size=prod_size, num_epochs=1, labels=labels, **prod_data))

        # Convert bytes fields to string for serialization
        serializable_pred = []
        for pred in prediction:
            _classes = pred['classes']
            pred['classes'] = [x.decode("utf-8") for x in _classes]
            serializable_pred.append(pred)

        prediction_name = "prediction_" + dataset
        # Triples of (example_id, prediction, expected class) for storage
        pred_data = zip(prod_data['example_ids'], serializable_pred,
                        prod_data['classes'])
        gather_results.save_data_json(dataset, [x for x in pred_data],
                                      prediction_name,
                                      sub_folder=experiment)

    # Now do the training and evalutation
    # NOTE(review): train_and_eval() has no return statement, so eval_loss
    # here is always None; results are persisted via save_data_json instead.
    if gpu:
        with tf.device('/gpu:0'):
            eval_loss = train_and_eval()
    else:
        eval_loss = train_and_eval()
Example #3
0
def build_dataset(dataset, build_name, slicer, sample_interval, features_regex,
                  class_label, tdt_split, force, visualize, data_path,
                  target_data_path, s3_profile, s3_url, data_plots_folder,
                  aggregation_functions):
    """Build a normalized training/dev/test dataset from raw CI run data.

    Loads run uuids for ``build_name``, slices them with ``slicer``
    (Python slice syntax "start:stop:step"), splits them according to
    ``tdt_split`` (three ints that must sum to 10), then prepares and
    normalizes each split. Normalization parameters are computed from the
    training split and reused for dev/test. The model config, run uuid
    lists, normalized splits and labels are all saved to
    ``target_data_path`` (and/or s3). Exits with status 1 on an existing
    dataset (unless ``force``) or an invalid split.
    """
    # s3 support
    s3 = gather_results.get_s3_client(s3_url=s3_url, s3_profile=s3_profile)

    # Prevent overwrite by mistake
    if gather_results.load_model_config(
            dataset, data_path=target_data_path, s3=s3) and not force:
        print("Dataset %s already configured" % dataset)
        sys.exit(1)

    # Validate tdt-split
    # Fractions of the data for training/dev/test (e.g. (7, 1, 2) -> 70/10/20)
    training, dev, test = map(lambda x: x / 10, tdt_split)
    if not sum(tdt_split) == 10:
        print("Training (%d) + dev (%d) + test (%d) != 10" % tdt_split)
        sys.exit(1)

    # Load available run ids for the build name (from s3)
    runs = gather_results.load_run_uuids('.raw',
                                         name=build_name,
                                         data_path=data_path,
                                         s3=s3)

    # Apply the slice
    def slice_fn(x):
        # Map empty slice components ("", as in "10:") to None so that
        # slice() uses its defaults for that position.
        return int(x.strip()) if x.strip() else None

    slice_object = slice(*map(slice_fn, slicer.split(":")))
    runs = np.array(runs[slice_object])
    print("Obtained %d runs for build %s" % (len(runs), build_name))

    # Split the runs in training, dev and test
    training_idx, dev_idx, test_idx = dataset_split_filters(
        len(runs), training, dev, data_path=target_data_path, s3=s3)
    np_runs = np.array(runs)
    # Saving dataset metadata
    gather_results.save_run_uuids(dataset,
                                  np_runs[training_idx],
                                  name='training',
                                  data_path=target_data_path,
                                  s3=s3)
    gather_results.save_run_uuids(dataset,
                                  np_runs[dev_idx],
                                  name='dev',
                                  data_path=target_data_path,
                                  s3=s3)
    gather_results.save_run_uuids(dataset,
                                  np_runs[test_idx],
                                  name='test',
                                  data_path=target_data_path,
                                  s3=s3)

    # Calculate normalized and filtered dimensions and labels
    # Sizing is derived from the first run; presumably all runs share the
    # same feature layout — TODO confirm.
    normalized_length, num_dstat_features, labels = \
        data_sizes_and_labels(runs[0], features_regex, sample_interval,
                              aggregation_functions=aggregation_functions,
                              data_path=data_path, s3=s3)
    model_config = {
        'build_name': build_name,
        'sample_interval': sample_interval,
        'features_regex': features_regex,
        'class_label': class_label,
        'aggregation_functions': aggregation_functions,
        'training_set': training,
        'dev_set': dev,
        'test_set': test,
        'normalized_length': normalized_length,
        'labels': labels,
        'num_columns': num_dstat_features,
        'num_features': len(labels)
    }

    # Save the config and complete list of run uuids
    gather_results.save_run_uuids(dataset,
                                  runs,
                                  data_path=target_data_path,
                                  s3=s3)
    gather_results.save_model_config(dataset,
                                     model_config,
                                     data_path=target_data_path,
                                     s3=s3)
    print("Stored %d run IDs in the model %s config" % (len(runs), dataset))

    # Resolve the aggregation function names to functions
    resolved_agg_fn = [
        resolve_aggregation_function(x) for x in aggregation_functions
    ]

    datasets = {}
    # Training must come first so we calculate normalization params
    for data_type in ['training', 'dev', 'test']:
        data, _figure_sizes = prepare_dataset(
            dataset,
            normalized_length,
            num_dstat_features,
            data_type,
            features_regex=features_regex,
            sample_interval=sample_interval,
            class_label=class_label,
            aggregation_functions=resolved_agg_fn,
            visualize=visualize,
            data_path=data_path,
            target_data_path=target_data_path,
            s3=s3)
        datasets[data_type] = data
        examples = data['examples']
        if len(examples) == 0:
            continue

        # Perform dataset-wise normalization
        if data_type == 'training':
            n_examples, normalization_params = normalize_dataset(
                examples, labels)

            # We cache normalization parameters from the training data set
            # to normalize the dev and test set, as well as other input data
            model_config['normalization_params'] = normalization_params
            gather_results.save_model_config(dataset,
                                             model_config,
                                             data_path=target_data_path,
                                             s3=s3)

            # Save figure sizes as well for training only
            figure_sizes = _figure_sizes
        else:
            # Perform dataset-wise normalization
            # NOTE(review): relies on the training split being non-empty so
            # 'normalization_params' is already in model_config — otherwise
            # this raises KeyError.
            n_examples, normalization_params = normalize_dataset(
                examples, labels, model_config['normalization_params'])

        # Replace examples with normalized ones
        datasets[data_type]['examples'] = n_examples

        # Store the normalized data to disk
        gather_results.save_dataset(dataset,
                                    name=data_type,
                                    data_path=target_data_path,
                                    s3=s3,
                                    **datasets[data_type])

    # Plot some more figures
    # NOTE(review): n_examples and figure_sizes are only bound if the loop
    # above processed a non-empty split; with all-empty splits this block
    # would raise UnboundLocalError.
    if visualize and not aggregation_functions:
        for n in range(n_examples.shape[0]):
            figure_name = sample_interval + "_%s_" + str(n)
            unrolled_norm_plot = pd.Series(n_examples[n]).plot()
            fig = unrolled_norm_plot.get_figure()
            axes = plt.gca()
            axes.set_ylim([-1, 1])
            fig.savefig(
                os.sep.join([data_plots_folder] +
                            [figure_name % "normalized"]))
            plt.close(fig)

        df = pd.DataFrame(figure_sizes, columns=['size', 'status'])
        size_plot = df.plot.scatter(x='size', y='status')
        fig = size_plot.get_figure()
        fig.savefig(os.sep.join([data_plots_folder] + ['sizes_by_result.png']))
        plt.close(fig)

    # Store labels to disk
    gather_results.save_dataset(dataset,
                                name='labels',
                                data_path=target_data_path,
                                s3=s3,
                                labels=labels)
    print("Done creating dataset %s" % model_config)