def db_trainer(estimator, dataset, build_name, limit, db_uri, evaluate):
    # Fetch the runs for the given build from the subunit2sql database
    runs = gather_results.get_runs_by_name(db_uri, build_name=build_name)
    model_config = {'build_name': build_name}
    gather_results.save_model_config(dataset, model_config)
    if limit > 0:
        runs = runs[:limit]
    gather_results.save_run_uuids(dataset, runs)
    for run in runs:
        if estimator == 'tf.estimator.DNNClassifier':
            # Fetch and cache the run data only
            gather_results.get_subunit_results_for_run(run, '1s', db_uri,
                                                       use_cache=True)
            print('Acquired run %s' % run.uuid)
        else:
            result = gather_results.get_subunit_results_for_run(
                run, '1s', db_uri)[0]
            print('Acquired run %s' % run.uuid)
            try:
                features, labels = nn_trainer.normalize_data(result)
            except TypeError:
                print('Unable to normalize data in run %s, '
                      'skipping' % run.uuid)
                continue
            if not evaluate:
                nn_trainer.train_model(features, labels,
                                       dataset_name=dataset)
            else:
                nn_trainer.evaluate_model(features, labels,
                                          dataset_name=dataset)
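# Usage sketch (not from the source): db_trainer points at a subunit2sql
# database, caches the runs for a build and trains or evaluates a per-run
# model. The dataset name, build name and DB URI below are placeholder
# assumptions; substitute values from your own environment.
def _example_db_trainer_usage():
    db_trainer(estimator='tf.estimator.DNNClassifier',
               dataset='example-dataset',
               build_name='tempest-full',
               limit=10,
               db_uri='mysql+pymysql://user:password@localhost/subunit2sql',
               evaluate=False)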
def build_dataset(dataset, build_name, slicer, sample_interval,
                  features_regex, class_label, tdt_split, force, visualize,
                  data_path, target_data_path, s3_profile, s3_url,
                  data_plots_folder, aggregation_functions):
    # s3 support
    s3 = gather_results.get_s3_client(s3_url=s3_url, s3_profile=s3_profile)
    # Prevent overwrite by mistake
    if gather_results.load_model_config(
            dataset, data_path=target_data_path, s3=s3) and not force:
        print("Dataset %s already configured" % dataset)
        sys.exit(1)
    # Validate tdt-split
    training, dev, test = map(lambda x: x / 10, tdt_split)
    if not sum(tdt_split) == 10:
        print("Training (%d) + dev (%d) + test (%d) != 10" % tdt_split)
        sys.exit(1)
    # Load available run ids for the build name (from s3)
    runs = gather_results.load_run_uuids('.raw', name=build_name,
                                         data_path=data_path, s3=s3)

    # Apply the slice
    def slice_fn(x):
        return int(x.strip()) if x.strip() else None

    slice_object = slice(*map(slice_fn, slicer.split(":")))
    runs = np.array(runs[slice_object])
    print("Obtained %d runs for build %s" % (len(runs), build_name))
    # Split the runs in training, dev and test
    training_idx, dev_idx, test_idx = dataset_split_filters(
        len(runs), training, dev, data_path=target_data_path, s3=s3)
    np_runs = np.array(runs)
    # Save dataset metadata
    gather_results.save_run_uuids(dataset, np_runs[training_idx],
                                  name='training',
                                  data_path=target_data_path, s3=s3)
    gather_results.save_run_uuids(dataset, np_runs[dev_idx], name='dev',
                                  data_path=target_data_path, s3=s3)
    gather_results.save_run_uuids(dataset, np_runs[test_idx], name='test',
                                  data_path=target_data_path, s3=s3)
    # Calculate normalized and filtered dimensions and labels
    normalized_length, num_dstat_features, labels = data_sizes_and_labels(
        runs[0], features_regex, sample_interval,
        aggregation_functions=aggregation_functions, data_path=data_path,
        s3=s3)
    model_config = {
        'build_name': build_name,
        'sample_interval': sample_interval,
        'features_regex': features_regex,
        'class_label': class_label,
        'aggregation_functions': aggregation_functions,
        'training_set': training,
        'dev_set': dev,
        'test_set': test,
        'normalized_length': normalized_length,
        'labels': labels,
        'num_columns': num_dstat_features,
        'num_features': len(labels)
    }
    # Save the config and complete list of run uuids
    gather_results.save_run_uuids(dataset, runs,
                                  data_path=target_data_path, s3=s3)
    gather_results.save_model_config(dataset, model_config,
                                     data_path=target_data_path, s3=s3)
    print("Stored %d run IDs in the model %s config" % (len(runs), dataset))
    # Resolve the aggregation function names to functions
    resolved_agg_fn = [resolve_aggregation_function(x)
                       for x in aggregation_functions]
    datasets = {}
    # Training must come first so we calculate normalization params
    for data_type in ['training', 'dev', 'test']:
        data, _figure_sizes = prepare_dataset(
            dataset, normalized_length, num_dstat_features, data_type,
            features_regex=features_regex, sample_interval=sample_interval,
            class_label=class_label, aggregation_functions=resolved_agg_fn,
            visualize=visualize, data_path=data_path,
            target_data_path=target_data_path, s3=s3)
        datasets[data_type] = data
        examples = data['examples']
        if len(examples) == 0:
            continue
        # Perform dataset-wise normalization
        if data_type == 'training':
            n_examples, normalization_params = normalize_dataset(
                examples, labels)
            # We cache normalization parameters from the training data set
            # to normalize the dev and test set, as well as other input data
            model_config['normalization_params'] = normalization_params
            gather_results.save_model_config(dataset, model_config,
                                             data_path=target_data_path,
                                             s3=s3)
            # Save figure sizes as well for training only
            figure_sizes = _figure_sizes
        else:
            # Perform dataset-wise normalization
            n_examples, normalization_params = normalize_dataset(
                examples, labels, model_config['normalization_params'])
        # Replace examples with normalized ones
        datasets[data_type]['examples'] = n_examples
        # Store the normalized data to disk
        gather_results.save_dataset(dataset, name=data_type,
                                    data_path=target_data_path, s3=s3,
                                    **datasets[data_type])
        # Plot some more figures
        if visualize and not aggregation_functions:
            for n in range(n_examples.shape[0]):
                figure_name = sample_interval + "_%s_" + str(n)
                unrolled_norm_plot = pd.Series(n_examples[n]).plot()
                fig = unrolled_norm_plot.get_figure()
                axes = plt.gca()
                axes.set_ylim([-1, 1])
                fig.savefig(os.sep.join([data_plots_folder] +
                                        [figure_name % "normalized"]))
                plt.close(fig)
            df = pd.DataFrame(figure_sizes, columns=['size', 'status'])
            size_plot = df.plot.scatter(x='size', y='status')
            fig = size_plot.get_figure()
            fig.savefig(os.sep.join([data_plots_folder] +
                                    ['sizes_by_result.png']))
            plt.close(fig)
    # Store labels to disk
    gather_results.save_dataset(dataset, name='labels',
                                data_path=target_data_path, s3=s3,
                                labels=labels)
    print("Done creating dataset %s" % model_config)
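# Usage sketch (not from the source): build_dataset slices the cached runs
# for a build, splits them into training/dev/test according to tdt_split and
# stores the normalized dataset. Every argument value below is a placeholder
# assumption, including the sample interval and features regex.
def _example_build_dataset_usage():
    build_dataset(dataset='example-dataset',
                  build_name='tempest-full',
                  slicer=':1000',
                  sample_interval='1min',
                  features_regex='usr|used',
                  class_label='status',
                  tdt_split=(7, 2, 1),
                  force=False,
                  visualize=False,
                  data_path=None,
                  target_data_path=None,
                  s3_profile=None,
                  s3_url=None,
                  data_plots_folder='plots',
                  aggregation_functions=[])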
def prepare_dataset(dataset, normalized_length, num_dstat_features,
                    data_type, features_regex, sample_interval='1s',
                    class_label='status', aggregation_functions=None,
                    visualize=False, data_path=None, target_data_path=None,
                    s3=None):
    """Load, filter and prepare the data for one dataset split.

    Loads the run ids from the dataset configuration.
    Loads the data (dsv + meta) for every run from cache.
    Builds the unrolled examples as a numpy ndarray.
    Builds the classes as a numpy array.
    Saves the data setup to the dataset config.
    Does some visualization (if enabled).
    """
    if visualize:
        data_plots_folder = [
            os.path.dirname(os.path.realpath(__file__)), os.pardir, 'data',
            dataset, 'plots'
        ]
        os.makedirs(os.sep.join(data_plots_folder), exist_ok=True)
    # Load the list of runs and base labels
    runs = gather_results.load_run_uuids(dataset, name=data_type,
                                         data_path=target_data_path, s3=s3)
    # run_uuids are the example_ids
    sizes = []
    # The data for each example.
    examples = examples_ndarray(len(runs), num_dstat_features,
                                normalized_length)
    # The test result for each example
    classes = []
    skips = []
    print("Loading %s data:" % data_type, end='\r', flush=True)
    for count, run in enumerate(runs):
        print("Loading %s data: %d of %d" % (data_type, count + 1,
                                             len(runs)),
              end='\r', flush=True)
        result = gather_results.get_subunit_results_for_run(
            run, sample_interval, data_path=data_path, s3=s3)
        # For one run_uuid we must only get one example (result)
        if not result:
            skips.append(run.uuid)
            continue
        # Apply column filtering
        result = filter_example(result, features_regex)
        # Normalize data
        example = fixed_lenght_example(result, normalized_length,
                                       aggregation_functions)
        vector = unroll_example(example, normalized_length)
        # Normalize status
        status = get_class(result, class_label)
        # examples is an np ndarray
        examples[count] = vector.values
        classes.append(status)
        # Plot some figures
        if visualize and not aggregation_functions:
            # Prepare some more data if we are going to visualize
            sizes.append((result['dstat'].shape[0], status))
            figure_name = sample_interval + "_%s_" + str(count)
            # Plot un-normalized data
            data_plot = result['dstat'].plot()
            fig = data_plot.get_figure()
            fig.savefig(os.sep.join(data_plots_folder +
                                    [figure_name % "downsampled"]))
            plt.close(fig)
            # Plot fixed size data
            fixed_plot = example.plot()
            fig = fixed_plot.get_figure()
            fig.savefig(os.sep.join(data_plots_folder +
                                    [figure_name % "fixedsize"]))
            plt.close(fig)
            # Plot unrolled data
            unrolled_plot = pd.Series(vector).plot()
            fig = unrolled_plot.get_figure()
            fig.savefig(os.sep.join(data_plots_folder +
                                    [figure_name % "unrolled"]))
            plt.close(fig)
    print("Loading %s data: %d done!" % (data_type, len(runs)))
    # Check that everything went well
    if len(skips) > 0:
        print('Unable to train model because of missing runs %s' % skips)
        safe_runs = [run.uuid for run in runs if run.uuid not in skips]
        gather_results.save_run_uuids(dataset, safe_runs)
        print('The model has been updated to exclude those runs.')
        print('Please re-run the training step.')
        sys.exit(1)
    classes = np.array(classes)
    figure_sizes = np.array(sizes)
    example_ids = np.array(runs)
    print("%s set: examples: %s, classes: %s, example IDs: %s" % (
        data_type, str(examples.shape), str(classes.shape),
        str(example_ids.shape)))
    data = {
        'examples': examples,
        'example_ids': example_ids,
        'classes': classes
    }
    if visualize and aggregation_functions and len(examples) > 0:
        if len(aggregation_functions) > 3:
            print('Visualization skipped, cannot represent more than 3D')
            sys.exit(1)
        else:
            fig = plt.figure()
            if len(aggregation_functions) == 3:
                ax = fig.add_subplot(111, projection='3d')
            else:
                ax = fig.add_subplot(111)
            # Build a dict [class] -> [int ID]
            unique_classes = list(set(classes))
            dict_classes = dict(
                zip(unique_classes, list(range(len(unique_classes)))))
            # Setup colours
            cm = plt.get_cmap('jet')
            cNorm = pltcolors.Normalize(vmin=0, vmax=len(unique_classes))
            scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=cm)
            # Scatter the data
            for ii in range(len(examples)):
                ax.scatter(*examples[ii], marker='o',
                           c=scalarMap.to_rgba(dict_classes[classes[ii]]))
            # Set axis labels
            ax.set_xlabel(aggregation_functions[0].__name__)
            if len(aggregation_functions) > 1:
                ax.set_ylabel(aggregation_functions[1].__name__)
            if len(aggregation_functions) > 2:
                ax.set_zlabel(aggregation_functions[2].__name__)
            # scalarMap.set_array(classes)
            # fig.colorbar(scalarMap)
            # Save the plot
            fig.savefig(os.sep.join(data_plots_folder +
                                    [data_type + "_3d_plot"]))
            plt.close(fig)
    return data, figure_sizes
def local_trainer(train, estimator, dataset, sample_interval, features_regex,
                  class_label, visualize, steps, gpu, debug):
    # Normalized length before resampling
    normalized_length = 5500
    if sample_interval:
        # Calculate the desired normalized length after resampling
        normalized_length = get_downsampled_example_lenght(
            sample_interval, normalized_length)
    data_plots_folder = [
        os.path.dirname(os.path.realpath(__file__)), os.pardir, 'data',
        dataset, 'plots'
    ]
    os.makedirs(os.sep.join(data_plots_folder), exist_ok=True)
    runs = gather_results.load_run_uuids(dataset)
    # run_uuids are the example_ids
    sizes = []
    # The data for each example. We don't know yet the pre-set shape, so
    # wait until the first result comes in
    examples = []
    # Model configuration. We need to cache sample_interval, features_regex
    # and the normalization parameters for each feature so we can re-use
    # them during prediction.
    model_config = {
        'sample_interval': sample_interval,
        'features_regex': features_regex,
        'normalized_length': normalized_length
    }
    # The test result for each example
    classes = []
    labels = []
    idx = 0
    skips = []
    for run in runs:
        results = gather_results.get_subunit_results_for_run(
            run, sample_interval)
        # For one run_uuid we must only get one example (result)
        result = results[0]
        if not result:
            skips.append(run.uuid)
            continue
        # Filtering by columns
        df = result['dstat']
        if features_regex:
            col_regex = re.compile(features_regex)
            result['dstat'] = df[list(filter(col_regex.search, df.columns))]
        # Setup the numpy matrix and sizes
        if len(examples) == 0:
            # Adjust normalized_length to the actual re-sampled one
            examples = np.ndarray(shape=(len(runs),
                                         len(result['dstat'].columns) *
                                         normalized_length))
            model_config['num_columns'] = len(result['dstat'].columns)
            model_config['num_features'] = (len(result['dstat'].columns) *
                                            normalized_length)
        # Normalize data
        example = fixed_lenght_example(result, normalized_length)
        # Normalize status
        status = get_class(result, class_label)
        vector, new_labels = unroll_example(example, normalized_length,
                                            labels)
        # Only calculate labels for the first example
        if len(labels) == 0:
            labels = new_labels
            model_config['labels'] = labels
        print("Normalized example %d of %d" % (runs.index(run) + 1,
                                               len(runs)),
              end='\r', flush=True)
        # examples is an np ndarray
        examples[idx] = vector.values
        classes.append(status)
        if visualize:
            # Prepare some more data if we are going to visualize
            sizes.append((result['dstat'].shape[0], status))
            figure_name = sample_interval + "_%s_" + str(idx)
            # Plot un-normalized data
            data_plot = result['dstat'].plot()
            fig = data_plot.get_figure()
            fig.savefig(os.sep.join(data_plots_folder +
                                    [figure_name % "downsampled"]))
            plt.close(fig)
            # Plot fixed size data
            fixed_plot = example.plot()
            fig = fixed_plot.get_figure()
            fig.savefig(os.sep.join(data_plots_folder +
                                    [figure_name % "fixedsize"]))
            plt.close(fig)
            # Plot unrolled data
            unrolled_plot = pd.Series(vector).plot()
            fig = unrolled_plot.get_figure()
            fig.savefig(os.sep.join(data_plots_folder +
                                    [figure_name % "unrolled"]))
            plt.close(fig)
        idx += 1
    if len(skips) > 0:
        print('Unable to train model because of missing runs %s' % skips)
        safe_runs = [run for run in runs if run.uuid not in skips]
        gather_results.save_run_uuids(dataset, safe_runs)
        print('The model has been updated to exclude those runs.')
        print('Please re-run the training step.')
        sys.exit(1)
    # Perform dataset-wise normalization
    # NOTE(andreaf) When we train the model we ignore any saved normalization
    # parameters, since the sample interval and features may be different.
    n_examples, normalization_params = normalize_dataset(examples, labels)
    # We do cache the result to normalize the prediction set.
    model_config['normalization_params'] = normalization_params
    gather_results.save_model_config(dataset, model_config)
    if visualize:
        for n in range(len(runs)):
            figure_name = sample_interval + "_%s_" + str(n)
            unrolled_norm_plot = pd.Series(n_examples[n]).plot()
            fig = unrolled_norm_plot.get_figure()
            fig.savefig(os.sep.join(data_plots_folder +
                                    [figure_name % "normalized"]))
            plt.close(fig)
        np_sizes = np.array(sizes)
        df = pd.DataFrame(np_sizes, columns=['size', 'status'])
        size_plot = df.plot.scatter(x='size', y='status')
        fig = size_plot.get_figure()
        fig.savefig(os.sep.join(data_plots_folder + ['sizes_by_result.png']))
        plt.close(fig)
    # Now do the training
    example_ids = [run.uuid for run in runs]
    classes = np.array(classes)
    print("\nTraining data shape: (%d, %d)" % n_examples.shape)
    if train:
        if debug:
            tf.logging.set_verbosity(tf.logging.DEBUG)
        config = tf.ConfigProto(log_device_placement=True)
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        model = svm_trainer.SVMTrainer(n_examples, example_ids, labels,
                                       classes, dataset_name=dataset,
                                       force_gpu=gpu)
        model.train(steps=steps)
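# Usage sketch (not from the source): local_trainer builds the example
# matrix from locally cached runs and trains the SVM-based model. Every
# argument value below is a placeholder assumption.
def _example_local_trainer_usage():
    local_trainer(train=True,
                  estimator='svm',
                  dataset='example-dataset',
                  sample_interval='1min',
                  features_regex='usr|used',
                  class_label='status',
                  visualize=False,
                  steps=300,
                  gpu=False,
                  debug=False)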
def train_model(build_name):
    global estimator
    dataset = estimator
    global model_dir
    with session_scope() as session:
        if not os.path.isfile(os.sep.join([model_dir, 'data', dataset,
                                           'runs.json.gz'])):
            runs = gather_results.get_runs_by_name(None,
                                                   build_name=build_name,
                                                   session=session)
            model_config = {'build_name': build_name}
            gather_results.save_model_config(dataset, model_config,
                                             data_path=model_dir)
            gather_results.save_run_uuids(dataset, runs, data_path=model_dir)
        else:
            runs = gather_results.load_run_uuids(dataset,
                                                 data_path=model_dir)
        normalized_length = 5500
        if estimator == 'svm':
            skips = []
            classes = []
            labels = []
            examples = []
            class_label = 'status'
            features_regex = None
            sample_interval = None
            idx = 0
            # Model configuration. We need to cache sample_interval,
            # features_regex and the normalization parameters for each
            # feature so we can re-use them during prediction.
            model_config = {
                'sample_interval': sample_interval,
                'features_regex': features_regex,
                'normalized_length': normalized_length
            }
            for run in runs:
                results = gather_results.get_subunit_results_for_run(
                    run, '1s', session=None, data_path=model_dir,
                    use_cache=True)
                print('Acquired run %s' % run.uuid)
                # For one run_uuid we must only get one example (result)
                result = results[0]
                if not result:
                    skips.append(run.uuid)
                    continue
                # Setup the numpy matrix and sizes
                if len(examples) == 0:
                    # Adjust normalized_length to the actual re-sampled one
                    examples = np.ndarray(
                        shape=(len(runs),
                               len(result['dstat'].columns) *
                               normalized_length))
                    model_config['num_columns'] = len(
                        result['dstat'].columns)
                    model_config['num_features'] = (
                        len(result['dstat'].columns) * normalized_length)
                # Normalize data
                example = fixed_lenght_example(result, normalized_length)
                # Normalize status
                status = get_class(result, class_label)
                vector, new_labels = unroll_example(
                    example, normalized_length, labels)
                # Only calculate labels for the first example
                if len(labels) == 0:
                    labels = new_labels
                    model_config['labels'] = labels
                # examples is an np ndarray
                examples[idx] = vector.values
                classes.append(status)
                idx += 1
            if len(skips) > 0:
                print('Unable to train model because of missing '
                      'runs %s' % skips)
                safe_runs = [run for run in runs if run.uuid not in skips]
                gather_results.save_run_uuids(dataset, safe_runs,
                                              data_path=model_dir)
                message = ('The model has been updated to exclude '
                           'those runs. Please re-run the training'
                           ' step.')
                abort(make_response(message, 400))

            def run_training():
                # Perform dataset-wise normalization
                # NOTE(andreaf) When we train the model we ignore any saved
                # normalization parameters, since the sample interval and
                # features may be different.
                n_examples, normalization_params = normalize_dataset(
                    examples, labels)
                # We do cache the result to normalize the prediction set.
                model_config['normalization_params'] = normalization_params
                gather_results.save_model_config(dataset, model_config,
                                                 data_path=model_dir)
                # Now do the training
                example_ids = [run.uuid for run in runs]
                outclasses = np.array(classes)
                svm_trainer.SVMTrainer(n_examples, example_ids, labels,
                                       outclasses, dataset_name=dataset,
                                       model_path=model_dir)

            thread = threading.Thread(target=run_training)
            thread.start()
            return "training started", 202
        else:
            def run_nn_training():
                for run in runs:
                    uuid = run.uuid
                    result = gather_results.get_subunit_results_for_run(
                        run, '1s', session=session, use_cache=False,
                        data_path=model_dir)[0]
                    try:
                        features, labels = nn_trainer.normalize_data(result)
                    except TypeError:
                        print('Unable to normalize data in run %s, '
                              'skipping' % uuid)
                        continue
                    nn_trainer.train_model(features, labels,
                                           dataset_name=dataset,
                                           model_path=model_dir)
                print('done')

            thread = threading.Thread(target=run_nn_training)
            thread.start()
            return "training started", 202
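# Wiring sketch (assumption, not from the source): train_model returns a
# (body, status) tuple and uses abort()/make_response(), which suggests it
# is served as a Flask view. The route path below is hypothetical; register
# it against whatever Flask app object the surrounding module defines.
def _example_register_train_route(app):
    @app.route('/train/<build_name>', methods=['POST'])
    def _train(build_name):
        return train_model(build_name)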
def prepare_dataset(dataset, normalized_length, num_dstat_features,
                    data_type, features_regex, sample_interval='1s',
                    class_label='status', visualize=False, data_path=None,
                    target_data_path=None, s3=None):
    """Load, filter and prepare the data for one dataset split.

    Loads the run ids from the dataset configuration.
    Loads the data (dsv + meta) for every run from cache.
    Builds the unrolled examples as a numpy ndarray.
    Builds the classes as a numpy array.
    Saves the data setup to the dataset config.
    Does some visualization (if enabled).
    """
    if visualize:
        data_plots_folder = [
            os.path.dirname(os.path.realpath(__file__)), os.pardir, 'data',
            dataset, 'plots'
        ]
        os.makedirs(os.sep.join(data_plots_folder), exist_ok=True)
    # Load the list of runs and base labels
    runs = gather_results.load_run_uuids(dataset, name=data_type,
                                         data_path=target_data_path, s3=s3)
    # run_uuids are the example_ids
    sizes = []
    # The data for each example.
    examples = examples_ndarray(len(runs), num_dstat_features,
                                normalized_length)
    # The test result for each example
    classes = []
    skips = []
    print("Loading %s data:" % data_type, end='\r', flush=True)
    for count, run in enumerate(runs):
        print("Loading %s data: %d of %d" % (data_type, count + 1,
                                             len(runs)),
              end='\r', flush=True)
        result = gather_results.get_subunit_results_for_run(
            run, sample_interval, data_path=data_path, s3=s3)
        # For one run_uuid we must only get one example (result)
        if not result:
            skips.append(run.uuid)
            continue
        # Apply column filtering
        result = filter_example(result, features_regex)
        # Normalize data
        example = fixed_lenght_example(result, normalized_length)
        vector = unroll_example(example, normalized_length)
        # Normalize status
        status = get_class(result, class_label)
        # examples is an np ndarray
        examples[count] = vector.values
        classes.append(status)
        # Plot some figures
        if visualize:
            # Prepare some more data if we are going to visualize
            sizes.append((result['dstat'].shape[0], status))
            figure_name = sample_interval + "_%s_" + str(count)
            # Plot un-normalized data
            data_plot = result['dstat'].plot()
            fig = data_plot.get_figure()
            fig.savefig(os.sep.join(data_plots_folder +
                                    [figure_name % "downsampled"]))
            plt.close(fig)
            # Plot fixed size data
            fixed_plot = example.plot()
            fig = fixed_plot.get_figure()
            fig.savefig(os.sep.join(data_plots_folder +
                                    [figure_name % "fixedsize"]))
            plt.close(fig)
            # Plot unrolled data
            unrolled_plot = pd.Series(vector).plot()
            fig = unrolled_plot.get_figure()
            fig.savefig(os.sep.join(data_plots_folder +
                                    [figure_name % "unrolled"]))
            plt.close(fig)
    print("Loading %s data: %d done!" % (data_type, len(runs)))
    # Check that everything went well
    if len(skips) > 0:
        print('Unable to train model because of missing runs %s' % skips)
        safe_runs = [run.uuid for run in runs if run.uuid not in skips]
        gather_results.save_run_uuids(dataset, safe_runs)
        print('The model has been updated to exclude those runs.')
        print('Please re-run the training step.')
        sys.exit(1)
    classes = np.array(classes)
    figure_sizes = np.array(sizes)
    example_ids = np.array(runs)
    print("%s set: examples: %s, classes: %s, example IDs: %s" % (
        data_type, str(examples.shape), str(classes.shape),
        str(example_ids.shape)))
    data = {
        'examples': examples,
        'example_ids': example_ids,
        'classes': classes
    }
    return data, figure_sizes