Example 1
def db_trainer(estimator, dataset, build_name, limit, db_uri, evaluate):
    runs = gather_results.get_runs_by_name(db_uri, build_name=build_name)
    model_config = {'build_name': build_name}
    gather_results.save_model_config(dataset, model_config)
    if limit > 0:
        runs = runs[:limit]
    gather_results.save_run_uuids(dataset, runs)
    for run in runs:
        if estimator == 'tf.estimator.DNNClassifier':
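            # The return value is discarded here: calling with
            # use_cache=True populates the local cache for later use.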
            gather_results.get_subunit_results_for_run(run,
                                                       '1s',
                                                       db_uri,
                                                       use_cache=True)
            print('Acquired run %s' % run.uuid)
        else:
            result = gather_results.get_subunit_results_for_run(
                run, '1s', db_uri)[0]
            print('Acquired run %s' % run.uuid)
            try:
                features, labels = nn_trainer.normalize_data(result)
            except TypeError:
                print('Unable to normalize data in run %s, '
                      'skipping' % run.uuid)
                continue
            if not evaluate:
                nn_trainer.train_model(features, labels, dataset_name=dataset)
            else:
                nn_trainer.evaluate_model(features,
                                          labels,
                                          dataset_name=dataset)
Example 2
def run_training():
    # Perform dataset-wise normalization
    # NOTE(andreaf) When we train the model we ignore any saved
    # normalization parameter, since the sample interval and features
    # may be different.
    n_examples, normalization_params = normalize_dataset(
        examples, labels)
    # We do cache the result to normalize the prediction set.
    model_config['normalization_params'] = normalization_params
    gather_results.save_model_config(dataset, model_config,
                                     data_path=model_dir)
    # Now do the training
    example_ids = [run.uuid for run in runs]
    outclasses = np.array(classes)
    svm_trainer.SVMTrainer(n_examples, example_ids, labels,
                           outclasses, dataset_name=dataset,
                           model_path=model_dir)
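run_training closes over examples, labels, classes, runs, model_config, dataset and model_dir from its enclosing function (see Example 4 below) and is started on a background thread so the caller can return immediately. A minimal sketch of that pattern, assuming those names are already populated:

import threading

# Assuming the closed-over variables have been filled in by the
# enclosing function, launch the SVM training asynchronously.
thread = threading.Thread(target=run_training)
thread.start()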
Example 3
def local_trainer(train, estimator, dataset, sample_interval, features_regex,
                  class_label, visualize, steps, gpu, debug):
    # Normalized length before resampling
    normalized_length = 5500
    if sample_interval:
        # Calculate the desired normalized length after resampling
        normalized_length = get_downsampled_example_lenght(
            sample_interval, normalized_length)

    data_plots_folder = [
        os.path.dirname(os.path.realpath(__file__)), os.pardir, 'data',
        dataset, 'plots'
    ]
    os.makedirs(os.sep.join(data_plots_folder), exist_ok=True)
    runs = gather_results.load_run_uuids(dataset)

    # run_uuids are the example_ids
    sizes = []
    # The data for each example. We don't know the shape yet, so we wait
    # until the first result comes in
    examples = []

    # Model configuration. We need to cache sample_interval, features_regex
    # and the normalization parameters for each feature so we can re-use
    # them during prediction.
    model_config = {
        'sample_interval': sample_interval,
        'features_regex': features_regex,
        'normalized_length': normalized_length
    }

    # The test result for each example
    classes = []
    labels = []
    idx = 0
    skips = []
    for run in runs:
        results = gather_results.get_subunit_results_for_run(
            run, sample_interval)
        # For each run_uuid we expect exactly one example (result)
        result = results[0]
        if not result:
            skips.append(run.uuid)
            continue
        df = result['dstat']
        # Filter the dstat columns by the features regex, if one was given
        if features_regex:
            col_regex = re.compile(features_regex)
            result['dstat'] = df[list(filter(col_regex.search, df.columns))]
        # Setup the numpy matrix and sizes
        if len(examples) == 0:
            # Allocate the examples matrix now that we know the column count
            examples = np.ndarray(shape=(len(runs),
                                         len(result['dstat'].columns) *
                                         normalized_length))
            model_config['num_columns'] = len(result['dstat'].columns)
            model_config['num_features'] = (len(result['dstat'].columns) *
                                            normalized_length)
        # Normalize data
        example = fixed_lenght_example(result, normalized_length)
        # Normalize status
        status = get_class(result, class_label)
        vector, new_labels = unroll_example(example, normalized_length, labels)
        # Only calculate labels for the first example
        if len(labels) == 0:
            labels = new_labels
            model_config['labels'] = labels
        print("Normalized example %d of %d" % (runs.index(run) + 1, len(runs)),
              end='\r',
              flush=True)
        # examples is a numpy ndarray; store the unrolled vector at this row
        examples[idx] = vector.values
        classes.append(status)
        if visualize:
            # Prepare some more data if we are going to visualize
            sizes.append((result['dstat'].shape[0], status))
            figure_name = sample_interval + "_%s_" + str(idx)
            # Plot un-normalized data
            data_plot = result['dstat'].plot()
            fig = data_plot.get_figure()
            fig.savefig(
                os.sep.join(data_plots_folder + [figure_name % "downsampled"]))
            plt.close(fig)
            # Plot fixed size data
            fixed_plot = example.plot()
            fig = fixed_plot.get_figure()
            fig.savefig(
                os.sep.join(data_plots_folder + [figure_name % "fixedsize"]))
            plt.close(fig)
            # Plot unrolled data
            unrolled_plot = pd.Series(vector).plot()
            fig = unrolled_plot.get_figure()
            fig.savefig(
                os.sep.join(data_plots_folder + [figure_name % "unrolled"]))
            plt.close(fig)
        idx += 1
    if len(skips) > 0:
        print('Unable to train model because of missing runs %s' % skips)
        safe_runs = [run for run in runs if run.uuid not in skips]
        gather_results.save_run_uuids(dataset, safe_runs)
        print('The model has been updated to exclude those runs.')
        print('Please re-run the training step.')
        sys.exit(1)
    # Perform dataset-wise normalization
    # NOTE(andreaf) When we train the model we ignore any saved normalization
    # parameter, since the sample interval and features may be different.
    n_examples, normalization_params = normalize_dataset(examples, labels)
    # We do cache the result to normalize the prediction set.
    model_config['normalization_params'] = normalization_params
    gather_results.save_model_config(dataset, model_config)
    if visualize:
        for n in range(len(runs)):
            figure_name = sample_interval + "_%s_" + str(n)
            unrolled_norm_plot = pd.Series(n_examples[n]).plot()
            fig = unrolled_norm_plot.get_figure()
            fig.savefig(
                os.sep.join(data_plots_folder + [figure_name % "normalized"]))
            plt.close(fig)

        np_sizes = np.array(sizes)
        df = pd.DataFrame(np_sizes, columns=['size', 'status'])
        size_plot = df.plot.scatter(x='size', y='status')
        fig = size_plot.get_figure()
        fig.savefig(os.sep.join(data_plots_folder + ['sizes_by_result.png']))
        plt.close(fig)

    # Now do the training
    example_ids = [run.uuid for run in runs]
    classes = np.array(classes)
    print("\nTraining data shape: (%d, %d)" % n_examples.shape)
    if train:
        if debug:
            tf.logging.set_verbosity(tf.logging.DEBUG)
        config = tf.ConfigProto(log_device_placement=True)
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        model = svm_trainer.SVMTrainer(n_examples,
                                   example_ids,
                                       labels,
                                       classes,
                                       dataset_name=dataset,
                                       force_gpu=gpu)
        model.train(steps=steps)
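Each run is flattened into a single feature vector: every dstat column is resampled to normalized_length points and the columns are then concatenated, which is why num_features equals num_columns * normalized_length. A small illustrative sketch of that idea (unroll_df below is a hypothetical stand-in, not the project's unroll_example, and the exact column ordering is an assumption):

import pandas as pd

def unroll_df(df):
    # Hypothetical illustration: stack the columns of a fixed-length
    # DataFrame into one flat vector of len(df.columns) * len(df) values.
    return pd.Series(df.values.T.flatten())

fixed = pd.DataFrame({'usr': [0.1, 0.2, 0.3], 'sys': [0.0, 0.1, 0.0]})
print(unroll_df(fixed).shape)  # (6,) == num_columns * normalized_length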
Example 4
def train_model(build_name):
    global estimator
    global model_dir
    dataset = estimator
    with session_scope() as session:
        if not os.path.isfile(os.sep.join([model_dir, 'data', dataset,
                                           'runs.json.gz'])):
            runs = gather_results.get_runs_by_name(None,
                                                   build_name=build_name,
                                                   session=session)
            model_config = {'build_name': build_name}
            gather_results.save_model_config(dataset, model_config,
                                             data_path=model_dir)
            gather_results.save_run_uuids(dataset, runs,
                                          data_path=model_dir)
        else:
            runs = gather_results.load_run_uuids(dataset,
                                                 data_path=model_dir)
        normalized_length = 5500
        if estimator == 'svm':
            skips = []
            classes = []
            labels = []
            examples = []
            class_label = 'status'
            features_regex = None
            sample_interval = None
            idx = 0
            # Model configuration. We need to cache sample_interval,
            # features_regex and the normalization parameters for each
            # feature so we can re-use them during prediction.
            model_config = {
                'sample_interval': sample_interval,
                'features_regex': features_regex,
                'normalized_length': normalized_length
            }
            for run in runs:
                results = gather_results.get_subunit_results_for_run(
                    run, '1s', session=None, data_path=model_dir,
                    use_cache=True)
                print('Acquired run %s' % run.uuid)
                # For each run_uuid we expect exactly one example (result)
                result = results[0]
                if not result:
                    skips.append(run.uuid)
                    continue
                # Setup the numpy matrix and sizes
                if len(examples) == 0:
                    # Allocate the examples matrix now that we know the
                    # number of columns
                    examples = np.ndarray(
                        shape=(
                            len(runs),
                            (len(result['dstat'].columns)
                             * normalized_length)))
                    model_config['num_columns'] = len(
                        result['dstat'].columns)
                    model_config['num_features'] = (len(
                        result['dstat'].columns) * normalized_length)
                # Normalize data
                example = fixed_lenght_example(result, normalized_length)
                # Normalize status
                status = get_class(result, class_label)
                vector, new_labels = unroll_example(
                    example, normalized_length, labels)
                # Only calculate labels for the first example
                if len(labels) == 0:
                    labels = new_labels
                    model_config['labels'] = labels
                # examples is a numpy ndarray; store the unrolled vector
                examples[idx] = vector.values
                classes.append(status)
                idx += 1
            if len(skips) > 0:
                print('Unable to train model because of missing '
                      'runs %s' % skips)
                safe_runs = [
                    run for run in runs if run.uuid not in skips]
                gather_results.save_run_uuids(dataset, safe_runs,
                                              data_path=model_dir)
                message = ('The model has been updated to exclude '
                           'those runs. Please re-run the training'
                           ' step.')
                abort(make_response(message, 400))

            def run_training():
                # Perform dataset-wise normalization
                # NOTE(andreaf) When we train the model we ignore any saved
                # normalization parameter, since the sample interval and
                # features may be different.
                n_examples, normalization_params = normalize_dataset(
                    examples, labels)
                # We do cache the result to normalize the prediction set.
                model_config['normalization_params'] = normalization_params
                gather_results.save_model_config(dataset, model_config,
                                                 data_path=model_dir)
                # Now do the training
                example_ids = [run.uuid for run in runs]
                outclasses = np.array(classes)
                svm_trainer.SVMTrainer(n_examples, example_ids, labels,
                                       outclasses, dataset_name=dataset,
                                       model_path=model_dir)
            thread = threading.Thread(target=run_training)
            thread.start()
            return "training started", 202
        else:
            def run_nn_training():
                for run in runs:
                    uuid = run.uuid
                    result = gather_results.get_subunit_results_for_run(
                        run, '1s', session=session, use_cache=False,
                        data_path=model_dir)[0]
                    try:
                        features, labels = nn_trainer.normalize_data(result)
                    except TypeError:
                        print('Unable to normalize data in run %s, '
                              'skipping' % uuid)
                        continue
                    nn_trainer.train_model(features, labels,
                                           dataset_name=dataset,
                                           model_path=model_dir)
                print('done')
            thread = threading.Thread(target=run_nn_training)
            thread.start()
            return "training started", 202