Example #1
def report_auc_vs_baseline(db, rid, graph=False):
    """Compare a datarun's best-so-far curve against its downloaded baseline."""
    with db_session(db):
        run = db.get_datarun(rid)
        ds = run.dataset
        test = [float(y) for y in get_best_so_far(db, rid)]

    ds_file = os.path.basename(ds.train_path)
    bl_path = download_file_http(BASELINE_URL + ds_file,
                                 local_folder=BASELINE_PATH)

    with open(bl_path) as f:
        baseline = [float(l.strip()) for l in f]

    min_len = min(len(baseline), len(test))
    x = range(min_len)

    test_auc = auc(x, test[:min_len])
    bl_auc = auc(x, baseline[:min_len])
    diff = test_auc - bl_auc

    print('Dataset %s (datarun %d)' % (ds_file, rid))
    print('AUC: test = %.3f, baseline = %.3f (%.3f)' % (test_auc, bl_auc, diff))

    if graph:
        graph_series(100, ds_file, baseline=baseline, test=test)

    return test_auc, bl_auc
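A minimal usage sketch, assuming a Database connection built the same way as in the test examples below; the database path 'atm.db' and datarun ID 42 are placeholder values:

# Hypothetical invocation; 'atm.db' and datarun ID 42 are placeholders.
sql_conf = SQLConfig(database='atm.db')
db = Database(**vars(sql_conf))
test_auc, bl_auc = report_auc_vs_baseline(db, 42, graph=True)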
Example #2
def test_run_per_partition(dataset):
    sql_conf = SQLConfig({'sql_database': DB_PATH})
    db = Database(**sql_conf.to_dict())

    run_conf = RunConfig(
        {
            'dataset_id': dataset.id,
            'methods': ['logreg'],
            'run_per_partition': True
        }
    )

    atm = ATM(sql_conf, None, None)

    run_ids = atm.enter_data(None, run_conf)

    with db_session(db):
        runs = []
        for run_id in run_ids:
            run = db.get_datarun(run_id.id)
            if run is not None:
                runs.append(run)

        assert len(runs) == METHOD_HYPERPARTS['logreg']
        assert all(len(r.hyperpartitions) == 1 for r in runs)
Example #3
def test_enter_data_all(dataset):
    sql_conf = SQLConfig(database=DB_PATH)
    db = Database(**vars(sql_conf))
    run_conf = RunConfig(dataset_id=dataset.id,
                         methods=METHOD_HYPERPARTS.keys())

    run_id = enter_data(sql_conf, run_conf)

    with db_session(db):
        run = db.get_datarun(run_id)
        assert run.dataset.id == dataset.id
        assert len(run.hyperpartitions) == sum(METHOD_HYPERPARTS.values())
Example #4
def test_enter_data_all(dataset):
    sql_conf = SQLConfig({'sql_database': DB_PATH})
    db = Database(**sql_conf.to_dict())
    run_conf = RunConfig({'dataset_id': dataset.id, 'methods': METHOD_HYPERPARTS.keys()})

    atm = ATM(sql_conf, None, None)

    run_id = atm.enter_data(None, run_conf)

    with db_session(db):
        run = db.get_datarun(run_id.id)
        assert run.dataset.id == dataset.id
        assert len(run.hyperpartitions) == sum(METHOD_HYPERPARTS.values())
Example #5
def test_enter_data_by_methods(dataset):
    sql_conf = SQLConfig(database=DB_PATH)
    db = Database(**vars(sql_conf))
    run_conf = RunConfig(dataset_id=dataset.id)

    for method, n_parts in METHOD_HYPERPARTS.items():
        run_conf.methods = [method]
        run_id = enter_data(sql_conf, run_conf)

        assert db.get_datarun(run_id)
        with db_session(db):
            run = db.get_datarun(run_id)
            assert run.dataset.id == dataset.id
            assert len(run.hyperpartitions) == n_parts
Example #6
def test_enter_data_by_methods(dataset):
    sql_conf = SQLConfig({'sql_database': DB_PATH})
    db = Database(**sql_conf.to_dict())
    run_conf = RunConfig({'dataset_id': dataset.id})

    atm = ATM(sql_conf, None, None)

    for method, n_parts in METHOD_HYPERPARTS.items():
        run_conf.methods = [method]
        run_id = atm.enter_data(None, run_conf)

        with db_session(db):
            run = db.get_datarun(run_id.id)
            assert run.dataset.id == dataset.id
            assert len(run.hyperpartitions) == n_parts
Example #7
def report_auc_vs_baseline(db, rids, graph=False):
    """Compare best-so-far curves from several dataruns against baseline trials."""
    if len(rids) == 0:
        return
    rid = rids[0]
    with db_session(db):
        run = db.get_datarun(rid)
        ds = run.dataset
        test = np.array([[float(y) for y in get_best_so_far(db, rid)]
                         for rid in rids])
        test = test.T
        mean_test = np.mean(test, axis=1).tolist()

    ds_file = os.path.basename(ds.train_path)
    bl_path = download_file_http(BASELINE_URL + ds_file,
                                 local_folder=BASELINE_PATH)
    with open(bl_path) as f:
        baseline = np.array([[float(each) for each in l.strip().split('\t')]
                             for l in f])
        mean_baseline = np.mean(baseline, axis=1).tolist()

    min_len = min(baseline.shape[0], test.shape[0])
    x = range(min_len)

    test_aucs = np.array(
        [auc(x, test[:min_len, row]) for row in range(test.shape[1])])
    bl_aucs = np.array(
        [auc(x, baseline[:min_len, row]) for row in range(baseline.shape[1])])
    # get avg, std, min of AUC over trials
    mean_auc_test = np.mean(test_aucs)
    mean_auc_bl = np.mean(bl_aucs)
    std_auc_test = np.std(test_aucs)
    std_auc_bl = np.std(bl_aucs)
    min_auc_test = np.min(test_aucs)
    min_auc_bl = np.min(bl_aucs)
    mean_auc_diff = mean_auc_test - mean_auc_bl
    print('Dataset %s (dataruns %s)' % (ds_file, rids))
    print('Comparing %d trials to baseline generated by %d trials' %
          (len(rids), baseline.shape[1]))
    print('MEAN AUC: test = %.3f, baseline = %.3f (%.3f)' %
          (mean_auc_test, mean_auc_bl, mean_auc_diff))
    print('STD AUC: test = %.3f, baseline = %.3f' % (std_auc_test, std_auc_bl))
    print('MIN AUC: test = %.3f, baseline = %.3f' % (min_auc_test, min_auc_bl))

    if graph:
        graph_series(100, ds_file, baseline=mean_baseline, test=mean_test)

    return mean_auc_test, mean_auc_bl
Example #8
def test_run_per_partition(dataset):
    sql_conf = SQLConfig(database=DB_PATH)
    db = Database(**vars(sql_conf))
    run_conf = RunConfig(dataset_id=dataset.id, methods=['logreg'])

    run_ids = enter_data(sql_conf, run_conf, run_per_partition=True)

    with db_session(db):
        runs = []
        for run_id in run_ids:
            run = db.get_datarun(run_id)
            if run is not None:
                runs.append(run)

        assert len(runs) == METHOD_HYPERPARTS['logreg']
        assert all(len(run.hyperpartitions) == 1 for run in runs)
Example #9
    def save_classifier(self, classifier_id, model, metrics):
        """
        Update a classifier with metrics and model information, and mark it as
        "complete".

        classifier_id: ID of the classifier to save
        model: Model object containing a serializable representation of the
            final model generated by this classifier.
        metrics: Dictionary containing cross-validation and test metrics data
            for the model.
        """
        # whether to save model and metrics data to the filesystem
        if self.save_files:
            # keep a database session open so that the utility functions can
            # access the linked hyperpartitions and dataruns
            with db_session(self.db):
                classifier = self.db.get_classifier(classifier_id)
                model_path = save_model(classifier, self.model_dir, model)
                metric_path = save_metrics(classifier, self.metric_dir,
                                           metrics)

            # if necessary, save model and metrics to Amazon S3 bucket
            if self.cloud_mode:
                try:
                    self.save_classifier_cloud(model_path, metric_path)
                except Exception:
                    msg = traceback.format_exc()
                    _log('Error in save_classifier_cloud()')
                    self.db.mark_classifier_errored(classifier_id,
                                                    error_msg=msg)
        else:
            model_path = None
            metric_path = None

        # update the classifier in the database
        self.db.complete_classifier(classifier_id=classifier_id,
                                    trainable_params=model.trainable_params,
                                    dimensions=model.dimensions,
                                    model_path=model_path,
                                    metric_path=metric_path,
                                    cv_score=model.cv_judgment_metric,
                                    cv_stdev=model.cv_judgment_metric_stdev,
                                    test_score=model.test_judgment_metric)

        # log completion of this classifier
        _log('Saved classifier %d.' % classifier_id)
Example #10
def test_save_classifier(db, datarun, model, metrics):
    worker = Worker(db, datarun, models_dir=MODEL_DIR, metrics_dir=METRIC_DIR)
    hp = db.get_hyperpartitions(datarun_id=worker.datarun.id)[0]
    classifier = worker.db.start_classifier(hyperpartition_id=hp.id,
                                            datarun_id=worker.datarun.id,
                                            host='localhost',
                                            hyperparameter_values=DT_PARAMS)

    worker.db.complete_classifier = Mock()
    worker.save_classifier(classifier.id, model, metrics)
    worker.db.complete_classifier.assert_called()

    with db_session(worker.db):
        clf = db.get_classifier(classifier.id)

        loaded = load_model(clf, MODEL_DIR)
        assert isinstance(loaded, Model)
        assert loaded.method == model.method
        assert loaded.random_state == model.random_state

        assert load_metrics(clf, METRIC_DIR) == metrics
Example #11
def test_save_classifier(db, datarun, model, metrics):
    log_conf = LogConfig(model_dir=MODEL_DIR, metric_dir=METRIC_DIR)
    worker = Worker(db, datarun, log_config=log_conf)
    hp = db.get_hyperpartitions(datarun_id=worker.datarun.id)[0]
    classifier = worker.db.start_classifier(hyperpartition_id=hp.id,
                                            datarun_id=worker.datarun.id,
                                            host='localhost',
                                            hyperparameter_values=DT_PARAMS)

    worker.db.complete_classifier = Mock()
    worker.save_classifier(classifier.id, model, metrics)
    worker.db.complete_classifier.assert_called()

    with db_session(worker.db):
        clf = db.get_classifier(classifier.id)

        loaded = load_model(clf, MODEL_DIR)
        assert isinstance(loaded, Model)
        assert loaded.method == model.method
        assert loaded.random_state == model.random_state

        assert load_metrics(clf, METRIC_DIR) == metrics
Example #12
# NOTE: the listing begins mid-call here; the '--dataruns' flag name is assumed.
parser.add_argument('--dataruns',
                    help='Only train on dataruns with these ids',
                    nargs='+')
parser.add_argument('--time', help='Number of seconds to run worker', type=int)
parser.add_argument(
    '--choose-randomly',
    action='store_true',
    help='Choose dataruns to work on randomly (default = sequential order)')
parser.add_argument('--no-save',
                    dest='save_files',
                    default=True,
                    action='store_const',
                    const=False,
                    help="don't save models and metrics at all")

# parse arguments and load configuration
args = parser.parse_args()

sql_config, _, aws_config, log_config = load_config(**vars(args))

db = Database(**vars(sql_config))

with db_session(db):  # keep a database session open to access the dataruns
    # get all the classifiers in the database
    classifiers = db.get_classifiers()
    # or fetch a single classifier by its ID:
    # classifier = db.get_classifier(classifier_id)
    print("Total: {} classifiers".format(len(classifiers)))
    for classifier in classifiers:
        metrics = load_metrics(classifier, metric_dir="./metrics")
        print(metrics)
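As a follow-up sketch, the serialized model behind any of these classifiers can be reloaded with load_model, mirroring its use in the test examples above; the "./models" directory is a placeholder:

# Hypothetical follow-up (run inside the same db_session): reload one
# classifier's serialized model. './models' is a placeholder directory.
clf = classifiers[0]
model = load_model(clf, './models')
print(model.method, model.random_state)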