Example #1
import sys

# gather_results is a project-local helper module (not shown here); the import
# below assumes it is importable from the surrounding package.
import gather_results
def setup_experiment(experiment, estimator, hidden_layers, steps, batch_size,
                     epochs, optimizer, learning_rate, force, data_path,
                     s3_profile, s3_url):
    """Define experiment parameters and hyper parameters

    Supported optimizers:
    * 'Adagrad': Returns an `AdagradOptimizer`.
    * 'Adam': Returns an `AdamOptimizer`.
    * 'Ftrl': Returns an `FtrlOptimizer`.
    * 'RMSProp': Returns an `RMSPropOptimizer`.
    * 'SGD': Returns a `GradientDescentOptimizer`.
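
    The name-to-class mapping (`_OPTIMIZER_CLS_NAMES`) used by the trainer is
    sketched after this example.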
    """
    # s3 support, only for loading the dataset
    s3 = gather_results.get_s3_client(s3_url=s3_url, s3_profile=s3_profile)

    # Prevent overwrite by mistake
    if gather_results.load_experiment(experiment, data_path=data_path,
                                      s3=s3) and not force:
        print("Experiment %s already configured" % experiment)
        sys.exit(1)
    params = {}
    hyper_params = {
        'steps': steps,
        'batch_size': batch_size,
        'epochs': epochs,
        'hidden_units': [int(x) for x in hidden_layers.split('/')],
        'optimizer': optimizer,
        'learning_rate': learning_rate
    }
    experiment_data = {
        'estimator': estimator,
        'params': params,
        'hyper_params': hyper_params
    }
    # Store the experiment to disk
    gather_results.save_experiment(experiment_data,
                                   experiment,
                                   data_path=data_path,
                                   s3=s3)
    print("Experiment %s saved successfully." % experiment)
    print("\testimator: %s" % estimator)
    print("\tparameters: %s" % params)
    print("\thyper parameters: %s" % hyper_params)
Example #2
import sys

import tensorflow as tf

# Project-local helpers (not shown here); the imports below assume they are
# importable from the surrounding package. _OPTIMIZER_CLS_NAMES is a
# module-level name-to-optimizer mapping, sketched between the two examples.
import gather_results
import tf_trainer
def local_trainer(dataset, experiment, eval_dataset, gpu, debug, data_path,
                  s3_profile, s3_url):
    # S3 support. When using S3, the dataset and the experiment must be stored
    # in the same bucket.
    s3 = gather_results.get_s3_client(s3_url=s3_url, s3_profile=s3_profile)

    # Load experiment data
    experiment_data = gather_results.load_experiment(experiment,
                                                     data_path=data_path,
                                                     s3=s3)
    if not experiment_data:
        print("Experiment %s not found" % experiment)
        sys.exit(1)

    # Load dataset data
    dataset_data = gather_results.load_model_config(dataset,
                                                    data_path=data_path,
                                                    s3=s3)
    if not dataset_data:
        print("Dataset %s not found" % dataset)
        sys.exit(1)

    # Read hyper_params and params
    estimator = experiment_data['estimator']
    hyper_params = experiment_data['hyper_params']
    params = experiment_data['params']
    steps = int(hyper_params['steps'])
    num_epochs = int(hyper_params['epochs'])
    batch_size = int(hyper_params['batch_size'])
    optimizer = hyper_params['optimizer']
    learning_rate = float(hyper_params['learning_rate'])
    class_label = dataset_data['class_label']

    if debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)

    # Load the normalized data
    labels = gather_results.load_dataset(dataset,
                                         'labels',
                                         data_path=data_path,
                                         s3=s3)['labels']
    training_data = gather_results.load_dataset(dataset,
                                                'training',
                                                data_path=data_path,
                                                s3=s3)
    test_data = gather_results.load_dataset(dataset,
                                            'test',
                                            data_path=data_path,
                                            s3=s3)
    print("Training data shape: (%d, %d)" % training_data['examples'].shape)

    if class_label == 'node_provider':
        label_vocabulary = set([
            'rax', 'ovh', 'packethost-us-west-1', 'vexxhost',
            'limestone-regionone', 'inap-mtl01', 'fortnebula-regionone'
        ])
    elif class_label == 'node_provider_all':
        label_vocabulary = set([
            'rax-iad', 'ovh-bhs1', 'packethost-us-west-1', 'rax-dfw',
            'vexxhost-ca-ymq-1', 'ovh-gra1', 'limestone-regionone',
            'inap-mtl01', 'rax-ord', 'vexxhost-sjc1', 'fortnebula-regionone'
        ])
    else:
        label_vocabulary = None

    # Get the estimator
    model_dir = gather_results.get_model_folder(dataset, experiment)
    estimator = tf_trainer.get_estimator(
        estimator,
        hyper_params,
        params,
        labels,
        model_dir,
        optimizer=_OPTIMIZER_CLS_NAMES[optimizer](learning_rate=learning_rate),
        label_vocabulary=label_vocabulary,
        gpu=gpu)

    def train_and_eval():
        # Train
        tf_trainer.get_training_method(estimator)(
            input_fn=tf_trainer.get_input_fn(shuffle=True,
                                             batch_size=batch_size,
                                             num_epochs=num_epochs,
                                             labels=labels,
                                             **training_data),
            steps=steps)
        # Eval on the experiment dataset + any other requested
        eval_sets = [dataset]
        eval_sets.extend(eval_dataset)
        for eval_dataset_name in eval_sets:
            eval_data = gather_results.load_dataset(eval_dataset_name,
                                                    'test',
                                                    data_path=data_path,
                                                    s3=s3)
            eval_size = len(eval_data['example_ids'])

            # Run tf evaluation and store the metrics
            print("Evaluation data shape: (%d, %d)" %
                  eval_data['examples'].shape)
            eval_loss = estimator.evaluate(
                input_fn=tf_trainer.get_input_fn(batch_size=eval_size,
                                                 num_epochs=1,
                                                 labels=labels,
                                                 **eval_data),
                name=eval_dataset_name)
            # Save and log the evaluation loss
            print('Training eval data for %s: %r' %
                  (eval_dataset_name, eval_loss))
            eval_name = "eval_" + eval_dataset_name
            gather_results.save_data_json(dataset,
                                          eval_loss,
                                          eval_name,
                                          sub_folder=experiment)

        # Run a prediction on the "dev" set, which we use as prod, and store it
        prod_data = gather_results.load_dataset(dataset,
                                                'dev',
                                                data_path=data_path,
                                                s3=s3)
        prod_size = len(prod_data['example_ids'])

        prediction = estimator.predict(input_fn=tf_trainer.get_input_fn(
            batch_size=prod_size, num_epochs=1, labels=labels, **prod_data))

        # Convert bytes fields to string for serialization
        serializable_pred = []
        for pred in prediction:
            _classes = pred['classes']
            pred['classes'] = [x.decode("utf-8") for x in _classes]
            serializable_pred.append(pred)

        prediction_name = "prediction_" + dataset
        pred_data = zip(prod_data['example_ids'], serializable_pred,
                        prod_data['classes'])
        gather_results.save_data_json(dataset, list(pred_data),
                                      prediction_name,
                                      sub_folder=experiment)

    # Now do the training and evaluation
    if gpu:
        with tf.device('/gpu:0'):
            train_and_eval()
    else:
        train_and_eval()
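
A hypothetical end-to-end run of the two functions above: the experiment is configured and saved first, then trained and evaluated locally. All argument values (experiment and dataset names, paths, estimator name) are illustrative assumptions; the accepted estimator names depend on tf_trainer.get_estimator, which is not shown.

# Illustrative values only; dataset and experiment names, paths and the
# estimator name are assumptions, not taken from the project.
setup_experiment(
    experiment='dnn-provider',
    estimator='dnn',              # resolved by tf_trainer.get_estimator (not shown)
    hidden_layers='100/50/10',    # parsed into hidden_units [100, 50, 10]
    steps=1000,
    batch_size=128,
    epochs=10,
    optimizer='Adam',             # one of the names listed in the docstring above
    learning_rate=0.001,
    force=False,
    data_path='data',
    s3_profile=None,
    s3_url=None)

local_trainer(
    dataset='gate-jobs',
    experiment='dnn-provider',
    eval_dataset=[],              # extra datasets to evaluate on, if any
    gpu=False,
    debug=False,
    data_path='data',
    s3_profile=None,
    s3_url=None)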