def build_dataset(dataset, build_name, slicer, sample_interval, features_regex,
                  class_label, tdt_split, force, visualize, data_path,
                  target_data_path, s3_profile, s3_url, data_plots_folder,
                  aggregation_functions):
    # s3 support
    s3 = gather_results.get_s3_client(s3_url=s3_url, s3_profile=s3_profile)
    # Prevent overwrite by mistake
    if gather_results.load_model_config(
            dataset, data_path=target_data_path, s3=s3) and not force:
        print("Dataset %s already configured" % dataset)
        sys.exit(1)
    # Validate tdt-split
    training, dev, test = map(lambda x: x / 10, tdt_split)
    if not sum(tdt_split) == 10:
        print("Training (%d) + dev (%d) + test (%d) != 10" % tdt_split)
        sys.exit(1)
    # Load available run ids for the build name (from s3)
    runs = gather_results.load_run_uuids('.raw', name=build_name,
                                         data_path=data_path, s3=s3)

    # Apply the slice
    def slice_fn(x):
        return int(x.strip()) if x.strip() else None

    slice_object = slice(*map(slice_fn, slicer.split(":")))
    runs = np.array(runs[slice_object])
    print("Obtained %d runs for build %s" % (len(runs), build_name))

    # Split the runs in training, dev and test
    training_idx, dev_idx, test_idx = dataset_split_filters(
        len(runs), training, dev, data_path=target_data_path, s3=s3)
    np_runs = np.array(runs)
    # Saving dataset metadata
    gather_results.save_run_uuids(dataset, np_runs[training_idx],
                                  name='training',
                                  data_path=target_data_path, s3=s3)
    gather_results.save_run_uuids(dataset, np_runs[dev_idx], name='dev',
                                  data_path=target_data_path, s3=s3)
    gather_results.save_run_uuids(dataset, np_runs[test_idx], name='test',
                                  data_path=target_data_path, s3=s3)

    # Calculate normalized and filtered dimensions and labels
    normalized_length, num_dstat_features, labels = \
        data_sizes_and_labels(runs[0], features_regex, sample_interval,
                              aggregation_functions=aggregation_functions,
                              data_path=data_path, s3=s3)
    model_config = {
        'build_name': build_name,
        'sample_interval': sample_interval,
        'features_regex': features_regex,
        'class_label': class_label,
        'aggregation_functions': aggregation_functions,
        'training_set': training,
        'dev_set': dev,
        'test_set': test,
        'normalized_length': normalized_length,
        'labels': labels,
        'num_columns': num_dstat_features,
        'num_features': len(labels)
    }

    # Save the config and complete list of run uuids
    gather_results.save_run_uuids(dataset, runs, data_path=target_data_path,
                                  s3=s3)
    gather_results.save_model_config(dataset, model_config,
                                     data_path=target_data_path, s3=s3)
    print("Stored %d run IDs in the model %s config" % (len(runs), dataset))

    # Resolve the aggregation function names to functions
    resolved_agg_fn = [
        resolve_aggregation_function(x) for x in aggregation_functions
    ]

    datasets = {}
    # Training must come first so we calculate normalization params
    for data_type in ['training', 'dev', 'test']:
        data, _figure_sizes = prepare_dataset(
            dataset, normalized_length, num_dstat_features, data_type,
            features_regex=features_regex,
            sample_interval=sample_interval,
            class_label=class_label,
            aggregation_functions=resolved_agg_fn,
            visualize=visualize,
            data_path=data_path,
            target_data_path=target_data_path,
            s3=s3)
        datasets[data_type] = data
        examples = data['examples']
        if len(examples) == 0:
            continue

        # Perform dataset-wise normalization
        if data_type == 'training':
            n_examples, normalization_params = normalize_dataset(
                examples, labels)
            # We cache normalization parameters from the training data set
            # to normalize the dev and test set, as well as other input data
            model_config['normalization_params'] = normalization_params
            gather_results.save_model_config(dataset, model_config,
                                             data_path=target_data_path,
                                             s3=s3)
            # Save figure sizes as well for training only
            figure_sizes = _figure_sizes
        else:
            # Perform dataset-wise normalization
            n_examples, normalization_params = normalize_dataset(
                examples, labels, model_config['normalization_params'])

        # Replace examples with normalized ones
        datasets[data_type]['examples'] = n_examples

        # Store the normalized data to disk
        gather_results.save_dataset(dataset, name=data_type,
                                    data_path=target_data_path, s3=s3,
                                    **datasets[data_type])

        # Plot some more figures
        if visualize and not aggregation_functions:
            for n in range(n_examples.shape[0]):
                figure_name = sample_interval + "_%s_" + str(n)
                unrolled_norm_plot = pd.Series(n_examples[n]).plot()
                fig = unrolled_norm_plot.get_figure()
                axes = plt.gca()
                axes.set_ylim([-1, 1])
                fig.savefig(os.sep.join(
                    [data_plots_folder] + [figure_name % "normalized"]))
                plt.close(fig)
            df = pd.DataFrame(figure_sizes, columns=['size', 'status'])
            size_plot = df.plot.scatter(x='size', y='status')
            fig = size_plot.get_figure()
            fig.savefig(os.sep.join(
                [data_plots_folder] + ['sizes_by_result.png']))
            plt.close(fig)

    # Store labels to disk
    gather_results.save_dataset(dataset, name='labels',
                                data_path=target_data_path, s3=s3,
                                labels=labels)
    print("Done creating dataset %s" % model_config)
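

# A minimal, self-contained sketch (not part of the original module) of how
# the slicer string and the tdt_split fractions consumed by build_dataset
# above partition the available runs. The helper name and the
# contiguous-range strategy are assumptions for illustration only; the real
# split is produced by dataset_split_filters.
def _example_tdt_partition(num_runs, tdt_split=(7, 2, 1), slicer=":"):
    """Return example (training, dev, test) index lists for num_runs runs."""
    def slice_fn(x):
        return int(x.strip()) if x.strip() else None

    # Apply the "start:stop" slicer exactly as build_dataset does
    indices = list(range(num_runs))[slice(*map(slice_fn, slicer.split(":")))]
    # tdt_split is expressed in tenths, e.g. (7, 2, 1) -> 70% / 20% / 10%
    training, dev, _test = (x / 10 for x in tdt_split)
    train_end = int(len(indices) * training)
    dev_end = train_end + int(len(indices) * dev)
    return indices[:train_end], indices[train_end:dev_end], indices[dev_end:]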


def local_trainer(dataset, experiment, eval_dataset, gpu, debug, data_path,
                  s3_profile, s3_url):
    # s3 support. When using s3, dataset and experiment must be stored
    # in the same bucket
    s3 = gather_results.get_s3_client(s3_url=s3_url, s3_profile=s3_profile)
    # Load experiment data
    experiment_data = gather_results.load_experiment(experiment,
                                                     data_path=data_path,
                                                     s3=s3)
    if not experiment_data:
        print("Experiment %s not found" % experiment)
        sys.exit(1)
    # Load dataset data
    dataset_data = gather_results.load_model_config(dataset,
                                                    data_path=data_path,
                                                    s3=s3)
    if not dataset_data:
        print("Dataset %s not found" % dataset)
        sys.exit(1)
    # Read hyper_params and params
    estimator = experiment_data['estimator']
    hyper_params = experiment_data['hyper_params']
    params = experiment_data['params']
    steps = int(hyper_params['steps'])
    num_epochs = int(hyper_params['epochs'])
    batch_size = int(hyper_params['batch_size'])
    optimizer = hyper_params['optimizer']
    learning_rate = float(hyper_params['learning_rate'])
    class_label = dataset_data['class_label']

    if debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)

    # Load the normalized data
    labels = gather_results.load_dataset(dataset, 'labels',
                                         data_path=data_path,
                                         s3=s3)['labels']
    training_data = gather_results.load_dataset(dataset, 'training',
                                                data_path=data_path, s3=s3)
    test_data = gather_results.load_dataset(dataset, 'test',
                                            data_path=data_path, s3=s3)
    print("Training data shape: (%d, %d)" % training_data['examples'].shape)

    if class_label == 'node_provider':
        label_vocabulary = set([
            'rax', 'ovh', 'packethost-us-west-1', 'vexxhost',
            'limestone-regionone', 'inap-mtl01', 'fortnebula-regionone'
        ])
    elif class_label == 'node_provider_all':
        label_vocabulary = set([
            'rax-iad', 'ovh-bhs1', 'packethost-us-west-1', 'rax-dfw',
            'vexxhost-ca-ymq-1', 'ovh-gra1', 'limestone-regionone',
            'inap-mtl01', 'rax-ord', 'vexxhost-sjc1', 'fortnebula-regionone'
        ])
    else:
        label_vocabulary = None

    # Get the estimator
    model_dir = gather_results.get_model_folder(dataset, experiment)
    estimator = tf_trainer.get_estimator(
        estimator, hyper_params, params, labels, model_dir,
        optimizer=_OPTIMIZER_CLS_NAMES[optimizer](learning_rate=learning_rate),
        label_vocabulary=label_vocabulary, gpu=gpu)

    def train_and_eval():
        # Train
        tf_trainer.get_training_method(estimator)(
            input_fn=tf_trainer.get_input_fn(shuffle=True,
                                             batch_size=batch_size,
                                             num_epochs=num_epochs,
                                             labels=labels,
                                             **training_data),
            steps=steps)
        # Eval on the experiment dataset + any other requested
        eval_sets = [dataset]
        eval_sets.extend(eval_dataset)
        for eval_dataset_name in eval_sets:
            eval_data = gather_results.load_dataset(eval_dataset_name, 'test',
                                                    data_path=data_path,
                                                    s3=s3)
            eval_size = len(eval_data['example_ids'])
            # Run tf evaluation and store the metrics
            print("Evaluation data shape: (%d, %d)" %
                  eval_data['examples'].shape)
            eval_loss = estimator.evaluate(
                input_fn=tf_trainer.get_input_fn(batch_size=eval_size,
                                                 num_epochs=1,
                                                 labels=labels,
                                                 **eval_data),
                name=eval_dataset_name)
            # Saving and Logging loss
            print('Training eval data for %s: %r' % (eval_dataset_name,
                                                     eval_loss))
            eval_name = "eval_" + eval_dataset_name
            gather_results.save_data_json(dataset, eval_loss, eval_name,
                                          sub_folder=experiment)

        # Run a prediction on the "dev" set, which we use as prod, and store it
        prod_data = gather_results.load_dataset(dataset, 'dev',
                                                data_path=data_path, s3=s3)
        prod_size = len(prod_data['example_ids'])
        prediction = estimator.predict(input_fn=tf_trainer.get_input_fn(
            batch_size=prod_size, num_epochs=1, labels=labels, **prod_data))
        # Convert bytes fields to string for serialization
        serializable_pred = []
        for pred in prediction:
            _classes = pred['classes']
            pred['classes'] = [x.decode("utf-8") for x in _classes]
            serializable_pred.append(pred)
        prediction_name = "prediction_" + dataset
        pred_data = zip(prod_data['example_ids'], serializable_pred,
                        prod_data['classes'])
        gather_results.save_data_json(dataset, [x for x in pred_data],
                                      prediction_name, sub_folder=experiment)

    # Now do the training and evaluation
    if gpu:
        with tf.device('/gpu:0'):
            eval_loss = train_and_eval()
    else:
        eval_loss = train_and_eval()
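

# local_trainer resolves the optimizer by name through _OPTIMIZER_CLS_NAMES,
# which is defined elsewhere in this module. The mapping below is a
# hypothetical sketch of what such a table could look like using the TF 1.x
# optimizer classes this file already relies on; the example name is
# intentionally different so it does not shadow the real table, and the
# actual entries may differ.
_EXAMPLE_OPTIMIZER_CLS_NAMES = {
    'Adagrad': tf.train.AdagradOptimizer,
    'Adam': tf.train.AdamOptimizer,
    'RMSProp': tf.train.RMSPropOptimizer,
    'SGD': tf.train.GradientDescentOptimizer,
}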


def db_batch_predict(db_uri, dataset, slice, gpu, debug):
    """Run predict on all DB items not included in the dataset yet

    Takes a dataset and a build name. It builds the list of runs in the DB
    that fit the specified build name, and that are not yet used for training
    in the specified dataset. It runs prediction on all of them.
    """
    if debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    # Get the configuration for the model
    model_config = gather_results.load_model_config(dataset)
    # Get the list of runs from the dataset
    run_uuids = gather_results.load_run_uuids(dataset)
    # Get the list of runs from the DB
    runs = gather_results.get_runs_by_name(
        db_uri=db_uri, build_name=model_config['build_name'])
    # Run a predict loop, include all runs not in the train dataset
    predict_runs = [r for r in runs if r.uuid not in run_uuids]

    # Apply the requested slice to the prediction set. The original expression
    # here was incomplete; this assumes ``slice`` is a "start:stop" string,
    # mirroring the slicer argument of build_dataset.
    def slice_fn(x):
        return int(x.strip()) if x.strip() else None

    start, stop = map(slice_fn, slice.split(":"))
    predict_runs = predict_runs[start:stop]
    if len(predict_runs) == 0:
        print("Empty prediction set, nothing to do.")
        sys.exit(0)
    # Initialize the array
    examples = np.ndarray(
        shape=(len(predict_runs), model_config['num_features']))
    idx = 0
    classes = []
    labels = []
    print("All runs: %d, dataset size: %d, predict size: %d" % (
        len(runs), len(run_uuids), len(predict_runs)))
    for run in predict_runs:
        # This will also store new runs in cache. In future we may want to
        # train on those as well, but for now let's try to predict only
        results = gather_results.get_subunit_results_for_run(
            run, model_config['sample_interval'], db_uri=db_uri)
        for result in results:
            # Skip runs with no data
            if result is None:
                continue
            if model_config['features_regex']:
                df = result['dstat']
                col_regex = re.compile(model_config['features_regex'])
                result['dstat'] = df[list(filter(
                    col_regex.search, df.columns))]
            # Normalize examples
            vector, status, labels = trainer.normalize_example(
                result, model_config['normalized_length'],
                model_config['labels'])
            examples[idx] = vector.values
            classes.append(status)
            idx += 1
    # Normalize dataset
    n_examples, _ = trainer.normalize_dataset(
        examples, labels, params=model_config['normalization_params'])
    # Prepare other arrays
    classes = np.array(classes)
    run_uuids = [r.uuid for r in predict_runs]
    # Configure TF
    config = tf.ConfigProto(log_device_placement=True,)
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    # Now do the prediction
    model = svm_trainer.SVMTrainer(n_examples, run_uuids, labels, classes,
                                   dataset_name=dataset, force_gpu=gpu)
    predictions = model.predict()
    errors = []
    for prediction, actual in zip(predictions, classes):
        if prediction['classes'] != actual:
            errors.append((prediction, actual))
    print("Prediction of %d inputs completed." % len(classes))
    print("Input set composition: %d PASS, %d FAIL" % (
        len([x for x in classes if x == 0]),
        len([x for x in classes if x == 1])))
    if len(errors) > 0:
        print("There were some prediction errors: %s" % errors)
    else:
        print("All predicted correctly.")
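

# A minimal illustration (not part of the pipeline) of the features_regex
# filtering applied to each run's dstat DataFrame in db_batch_predict above.
# The column names and regex are made up for the example; ``pd`` and ``re``
# are the module-level imports already used in this file.
def _example_filter_dstat_columns():
    """Keep only the dstat columns whose names match the regex."""
    df = pd.DataFrame({'usr': [1, 2], 'sys': [3, 4], 'memory_used': [5, 6]})
    col_regex = re.compile('usr|sys')
    return df[list(filter(col_regex.search, df.columns))]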