# Imports inferred from usage in this section; the module paths for the
# project-internal helpers (gather_results, tf_trainer) are assumptions.
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from ciml import gather_results
from ciml import tf_trainer


def setup_experiment(experiment, estimator, hidden_layers, steps, batch_size,
                     epochs, optimizer, learning_rate, force, data_path,
                     s3_profile, s3_url):
    """Define experiment parameters and hyper parameters.

    Supported optimizers:

    * 'Adagrad': Returns an `AdagradOptimizer`.
    * 'Adam': Returns an `AdamOptimizer`.
    * 'Ftrl': Returns an `FtrlOptimizer`.
    * 'RMSProp': Returns an `RMSPropOptimizer`.
    * 'SGD': Returns a `GradientDescentOptimizer`.
    """
    # s3 support, only for loading the dataset
    s3 = gather_results.get_s3_client(s3_url=s3_url, s3_profile=s3_profile)
    # Prevent overwriting an existing experiment by mistake
    if gather_results.load_experiment(
            experiment, data_path=data_path, s3=s3) and not force:
        print("Experiment %s already configured" % experiment)
        sys.exit(1)
    params = {}
    hyper_params = {
        'steps': steps,
        'batch_size': batch_size,
        'epochs': epochs,
        # Hidden layer sizes are passed as a /-separated string, e.g. "100/50"
        'hidden_units': [int(x) for x in hidden_layers.split('/')],
        'optimizer': optimizer,
        'learning_rate': learning_rate
    }
    experiment_data = {
        'estimator': estimator,
        'params': params,
        'hyper_params': hyper_params
    }
    # Store the experiment to disk
    gather_results.save_experiment(experiment_data, experiment,
                                   data_path=data_path, s3=s3)
    print("Experiment %s saved successfully." % experiment)
    print("\testimator: %s" % estimator)
    print("\tparameters: %s" % params)
    print("\thyper parameters: %s" % hyper_params)
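# The docstring above lists the optimizer names that local_trainer() resolves
# through _OPTIMIZER_CLS_NAMES. That mapping is not part of this section, so
# the sketch below is an assumption, built from the TensorFlow 1.x tf.train
# optimizer classes the docstring names.
_OPTIMIZER_CLS_NAMES = {
    'Adagrad': tf.train.AdagradOptimizer,
    'Adam': tf.train.AdamOptimizer,
    'Ftrl': tf.train.FtrlOptimizer,
    'RMSProp': tf.train.RMSPropOptimizer,
    'SGD': tf.train.GradientDescentOptimizer,
}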
def local_trainer(dataset, experiment, eval_dataset, gpu, debug, data_path,
                  s3_profile, s3_url):
    # s3 support. When using s3, the dataset and the experiment must be
    # stored in the same bucket
    s3 = gather_results.get_s3_client(s3_url=s3_url, s3_profile=s3_profile)
    # Load experiment data
    experiment_data = gather_results.load_experiment(
        experiment, data_path=data_path, s3=s3)
    if not experiment_data:
        print("Experiment %s not found" % experiment)
        sys.exit(1)
    # Load dataset data
    dataset_data = gather_results.load_model_config(
        dataset, data_path=data_path, s3=s3)
    if not dataset_data:
        print("Dataset %s not found" % dataset)
        sys.exit(1)
    # Read hyper_params and params
    estimator = experiment_data['estimator']
    hyper_params = experiment_data['hyper_params']
    params = experiment_data['params']
    steps = int(hyper_params['steps'])
    num_epochs = int(hyper_params['epochs'])
    batch_size = int(hyper_params['batch_size'])
    optimizer = hyper_params['optimizer']
    learning_rate = float(hyper_params['learning_rate'])
    class_label = dataset_data['class_label']
    if debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    # Load the normalized data
    labels = gather_results.load_dataset(
        dataset, 'labels', data_path=data_path, s3=s3)['labels']
    training_data = gather_results.load_dataset(
        dataset, 'training', data_path=data_path, s3=s3)
    test_data = gather_results.load_dataset(
        dataset, 'test', data_path=data_path, s3=s3)
    print("Training data shape: (%d, %d)" % training_data['examples'].shape)
    # Label vocabularies for the provider classification tasks
    if class_label == 'node_provider':
        label_vocabulary = set([
            'rax', 'ovh', 'packethost-us-west-1', 'vexxhost',
            'limestone-regionone', 'inap-mtl01', 'fortnebula-regionone'
        ])
    elif class_label == 'node_provider_all':
        label_vocabulary = set([
            'rax-iad', 'ovh-bhs1', 'packethost-us-west-1', 'rax-dfw',
            'vexxhost-ca-ymq-1', 'ovh-gra1', 'limestone-regionone',
            'inap-mtl01', 'rax-ord', 'vexxhost-sjc1', 'fortnebula-regionone'
        ])
    else:
        label_vocabulary = None
    # Get the estimator
    model_dir = gather_results.get_model_folder(dataset, experiment)
    estimator = tf_trainer.get_estimator(
        estimator, hyper_params, params, labels, model_dir,
        optimizer=_OPTIMIZER_CLS_NAMES[optimizer](
            learning_rate=learning_rate),
        label_vocabulary=label_vocabulary, gpu=gpu)

    def train_and_eval():
        # Train
        tf_trainer.get_training_method(estimator)(
            input_fn=tf_trainer.get_input_fn(
                shuffle=True, batch_size=batch_size, num_epochs=num_epochs,
                labels=labels, **training_data),
            steps=steps)
        # Eval on the experiment dataset + any other requested
        eval_sets = [dataset]
        eval_sets.extend(eval_dataset)
        for eval_dataset_name in eval_sets:
            eval_data = gather_results.load_dataset(
                eval_dataset_name, 'test', data_path=data_path, s3=s3)
            eval_size = len(eval_data['example_ids'])
            # Run tf evaluation and store the metrics
            print("Evaluation data shape: (%d, %d)" %
                  eval_data['examples'].shape)
            eval_loss = estimator.evaluate(
                input_fn=tf_trainer.get_input_fn(
                    batch_size=eval_size, num_epochs=1, labels=labels,
                    **eval_data),
                name=eval_dataset_name)
            # Saving and logging loss
            print('Training eval data for %s: %r' % (eval_dataset_name,
                                                     eval_loss))
            eval_name = "eval_" + eval_dataset_name
            gather_results.save_data_json(dataset, eval_loss, eval_name,
                                          sub_folder=experiment)
        # Run a prediction on the "dev" set, which we use as prod,
        # and store it
        prod_data = gather_results.load_dataset(dataset, 'dev',
                                                data_path=data_path, s3=s3)
        prod_size = len(prod_data['example_ids'])
        prediction = estimator.predict(
            input_fn=tf_trainer.get_input_fn(
                batch_size=prod_size, num_epochs=1, labels=labels,
                **prod_data))
        # Convert bytes fields to string for serialization
        serializable_pred = []
        for pred in prediction:
            _classes = pred['classes']
            pred['classes'] = [x.decode("utf-8") for x in _classes]
            serializable_pred.append(pred)
        prediction_name = "prediction_" + dataset
        pred_data = zip(prod_data['example_ids'], serializable_pred,
                        prod_data['classes'])
        gather_results.save_data_json(dataset, list(pred_data),
                                      prediction_name, sub_folder=experiment)

    # Now do the training and evaluation; pin to the first GPU when
    # requested. train_and_eval() returns nothing, so there is no value
    # to capture here.
    if gpu:
        with tf.device('/gpu:0'):
            train_and_eval()
    else:
        train_and_eval()
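# A minimal usage sketch tying the two commands above together: configure an
# experiment, then train it locally against an existing dataset. Every
# literal value here (names, layer sizes, estimator, paths) is a hypothetical
# example, not a default taken from this module.
def example_local_run():
    setup_experiment(experiment='exp-dnn-1', estimator='dnn',
                     hidden_layers='100/50', steps=1000, batch_size=128,
                     epochs=10, optimizer='Adam', learning_rate=0.001,
                     force=False, data_path='data',
                     s3_profile=None, s3_url=None)
    local_trainer(dataset='dataset-1', experiment='exp-dnn-1',
                  eval_dataset=[], gpu=False, debug=False,
                  data_path='data', s3_profile=None, s3_url=None)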
def build_dataset(dataset, build_name, slicer, sample_interval,
                  features_regex, class_label, tdt_split, force, visualize,
                  data_path, target_data_path, s3_profile, s3_url,
                  data_plots_folder, aggregation_functions):
    # s3 support
    s3 = gather_results.get_s3_client(s3_url=s3_url, s3_profile=s3_profile)
    # Prevent overwriting an existing dataset by mistake
    if gather_results.load_model_config(
            dataset, data_path=target_data_path, s3=s3) and not force:
        print("Dataset %s already configured" % dataset)
        sys.exit(1)
    # Validate the training/dev/test split: it must sum to 10
    training, dev, test = map(lambda x: x / 10, tdt_split)
    if sum(tdt_split) != 10:
        print("Training (%d) + dev (%d) + test (%d) != 10" % tuple(tdt_split))
        sys.exit(1)
    # Load available run ids for the build name (from s3)
    runs = gather_results.load_run_uuids('.raw', name=build_name,
                                         data_path=data_path, s3=s3)

    # Apply the slice, using Python slice syntax ("start:stop[:step]")
    def slice_fn(x):
        return int(x.strip()) if x.strip() else None

    slice_object = slice(*map(slice_fn, slicer.split(":")))
    runs = np.array(runs[slice_object])
    print("Obtained %d runs for build %s" % (len(runs), build_name))

    # Split the runs into training, dev and test
    training_idx, dev_idx, test_idx = dataset_split_filters(
        len(runs), training, dev, data_path=target_data_path, s3=s3)
    np_runs = np.array(runs)
    # Saving dataset metadata
    gather_results.save_run_uuids(dataset, np_runs[training_idx],
                                  name='training',
                                  data_path=target_data_path, s3=s3)
    gather_results.save_run_uuids(dataset, np_runs[dev_idx], name='dev',
                                  data_path=target_data_path, s3=s3)
    gather_results.save_run_uuids(dataset, np_runs[test_idx], name='test',
                                  data_path=target_data_path, s3=s3)
    # Calculate normalized and filtered dimensions and labels
    normalized_length, num_dstat_features, labels = \
        data_sizes_and_labels(runs[0], features_regex, sample_interval,
                              aggregation_functions=aggregation_functions,
                              data_path=data_path, s3=s3)
    model_config = {
        'build_name': build_name,
        'sample_interval': sample_interval,
        'features_regex': features_regex,
        'class_label': class_label,
        'aggregation_functions': aggregation_functions,
        'training_set': training,
        'dev_set': dev,
        'test_set': test,
        'normalized_length': normalized_length,
        'labels': labels,
        'num_columns': num_dstat_features,
        'num_features': len(labels)
    }
    # Save the config and complete list of run uuids
    gather_results.save_run_uuids(dataset, runs,
                                  data_path=target_data_path, s3=s3)
    gather_results.save_model_config(dataset, model_config,
                                     data_path=target_data_path, s3=s3)
    print("Stored %d run IDs in the model %s config" % (len(runs), dataset))
    # Resolve the aggregation function names to functions
    resolved_agg_fn = [
        resolve_aggregation_function(x) for x in aggregation_functions
    ]
    datasets = {}
    # Training must come first so we calculate normalization params
    for data_type in ['training', 'dev', 'test']:
        data, _figure_sizes = prepare_dataset(
            dataset, normalized_length, num_dstat_features, data_type,
            features_regex=features_regex, sample_interval=sample_interval,
            class_label=class_label, aggregation_functions=resolved_agg_fn,
            visualize=visualize, data_path=data_path,
            target_data_path=target_data_path, s3=s3)
        datasets[data_type] = data
        examples = data['examples']
        if len(examples) == 0:
            continue
        # Perform dataset-wise normalization
        if data_type == 'training':
            n_examples, normalization_params = normalize_dataset(
                examples, labels)
            # We cache normalization parameters from the training data set
            # to normalize the dev and test set, as well as other input data
            model_config['normalization_params'] = normalization_params
            gather_results.save_model_config(dataset, model_config,
                                             data_path=target_data_path,
                                             s3=s3)
            # Save figure sizes as well for training only
            figure_sizes = _figure_sizes
        else:
            # Normalize dev and test with the cached training parameters
            n_examples, normalization_params = normalize_dataset(
                examples, labels, model_config['normalization_params'])
        # Replace examples with normalized ones
        datasets[data_type]['examples'] = n_examples
        # Store the normalized data to disk
        gather_results.save_dataset(dataset, name=data_type,
                                    data_path=target_data_path, s3=s3,
                                    **datasets[data_type])
    # Plot some more figures
    if visualize and not aggregation_functions:
        for n in range(n_examples.shape[0]):
            figure_name = sample_interval + "_%s_" + str(n)
            unrolled_norm_plot = pd.Series(n_examples[n]).plot()
            fig = unrolled_norm_plot.get_figure()
            axes = plt.gca()
            axes.set_ylim([-1, 1])
            fig.savefig(os.sep.join([data_plots_folder] +
                                    [figure_name % "normalized"]))
            plt.close(fig)
        df = pd.DataFrame(figure_sizes, columns=['size', 'status'])
        size_plot = df.plot.scatter(x='size', y='status')
        fig = size_plot.get_figure()
        fig.savefig(os.sep.join([data_plots_folder] +
                                ['sizes_by_result.png']))
        plt.close(fig)
    # Store labels to disk
    gather_results.save_dataset(dataset, name='labels',
                                data_path=target_data_path, s3=s3,
                                labels=labels)
    print("Done creating dataset %s" % model_config)
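# Usage sketch for build_dataset: take the first 1000 runs of a build and
# split them 70/20/10 into training/dev/test. tdt_split must sum to 10, and
# slicer uses Python slice syntax ("start:stop[:step]"). All literal values
# below (build name, regex, sample interval, paths) are hypothetical
# examples, not defaults from this module.
def example_build():
    build_dataset(dataset='dataset-1', build_name='tempest-full',
                  slicer=':1000', sample_interval='1s',
                  features_regex='(usr|used)', class_label='status',
                  tdt_split=(7, 2, 1), force=False, visualize=False,
                  data_path='data', target_data_path='data',
                  s3_profile=None, s3_url=None,
                  data_plots_folder='plots', aggregation_functions=[])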