Exemple #1
0
def create_dataset(db, run_config, aws_config=None):
    """
    Register a new dataset in the ModelHub database and return it.

    db: initialized Database object
    run_config: RunConfig object describing the dataset to create
    aws_config: optional. AWS credentials for downloading data from S3.

    Returns: whatever db.create_dataset returns for the new entry.
    """
    # metadata is computed from local copies, so fetch the files first
    train_local, test_local = download_data(
        run_config.train_path, run_config.test_path, aws_config)

    # the dataset name is the training file's base name with the
    # "_train.csv" and ".csv" markers removed (in that order)
    name = os.path.basename(train_local)
    for marker in ("_train.csv", ".csv"):
        name = name.replace(marker, "")

    # gather the dataset statistics exposed by MetaData
    meta = MetaData(run_config.class_column, train_local, test_local)

    # the database row keeps the paths from run_config, not the local ones
    fields = dict(
        name=name,
        description=run_config.data_description,
        train_path=run_config.train_path,
        test_path=run_config.test_path,
        class_column=run_config.class_column,
        n_examples=meta.n_examples,
        k_classes=meta.k_classes,
        d_features=meta.d_features,
        majority=meta.majority,
        size_kb=old_div(meta.size, 1000),
    )
    return db.create_dataset(**fields)
Exemple #2
0
    def create_dataset(self):
        """
        Register a new dataset in the ModelHub database and return it.

        Reads the dataset description from self.run_conf, the download
        credentials from self.aws_conf, and writes through self.db.
        """
        conf = self.run_conf

        # metadata is computed from local copies, so fetch the files first
        train_local, test_local = download_data(
            conf.train_path, conf.test_path, self.aws_conf)

        # the dataset name is the training file's base name with the
        # "_train.csv" and ".csv" markers removed (in that order)
        name = os.path.basename(train_local)
        for marker in ("_train.csv", ".csv"):
            name = name.replace(marker, "")

        # gather the dataset statistics exposed by MetaData
        meta = MetaData(conf.class_column, train_local, test_local)

        # the database row keeps the paths from the run config, not the
        # local copies
        fields = dict(
            name=name,
            description=conf.data_description,
            train_path=conf.train_path,
            test_path=conf.test_path,
            class_column=conf.class_column,
            n_examples=meta.n_examples,
            k_classes=meta.k_classes,
            d_features=meta.d_features,
            majority=meta.majority,
            size_kb=old_div(meta.size, 1000),
        )
        return self.db.create_dataset(**fields)
Exemple #3
0
def model(dataset):
    """
    Build a 'dt' Model for the given dataset, run train_test on the
    dataset's training data, and return the Model.

    dataset: object providing train_path and class_column
    """
    # only the first path returned by download_data is needed here
    train_path, _ = download_data(dataset.train_path)

    settings = dict(
        method='dt',
        params=DT_PARAMS,
        judgment_metric='roc_auc',
        class_column=dataset.class_column,
    )
    classifier = Model(**settings)
    classifier.train_test(train_path=train_path)
    return classifier
Exemple #4
0
def model(dataset):
    """
    Return a Model configured with method 'dt' and the 'roc_auc' judgment
    metric, after calling train_test on the dataset's training file.

    dataset: object providing train_path and class_column
    """
    # download_data returns a pair; the second element is not used
    local_train, _ = download_data(dataset.train_path)

    trained = Model(
        method='dt',
        params=DT_PARAMS,
        judgment_metric='roc_auc',
        class_column=dataset.class_column,
    )
    trained.train_test(train_path=local_train)
    return trained
Exemple #5
0
    def test_classifier(self, method, params):
        """
        Given a set of fully-qualified hyperparameters, create and test a
        classifier model.

        method: forwarded to Model as its `method` argument
        params: forwarded to Model as its `params` argument

        Returns: Model object and metrics dictionary (the value returned by
        Model.train_test)
        """
        model = Model(method=method,
                      params=params,
                      judgment_metric=self.datarun.metric,
                      class_column=self.dataset.class_column,
                      verbose_metrics=self.verbose_metrics)

        train_path, test_path = download_data(self.dataset.train_path,
                                              self.dataset.test_path,
                                              self.aws_config)

        metrics = model.train_test(train_path=train_path, test_path=test_path)

        target = self.datarun.score_target

        def metric_string(scored):
            # cross-validated targets are reported as mean +- 2 * stdev;
            # every other target reports the test-set metric
            if 'cv' in target or 'mu_sigma' in target:
                return '%.3f +- %.3f' % (scored.cv_judgment_metric,
                                         2 * scored.cv_judgment_metric_stdev)
            else:
                return '%.3f' % scored.test_judgment_metric

        # Short label for the log line: drop the '_judgment_metric' suffix
        # only when it is really there, instead of blindly slicing off its
        # length (which would mangle a target without the suffix).
        suffix = '_judgment_metric'
        if target.endswith(suffix):
            target_label = target[:-len(suffix)]
        else:
            target_label = target

        # pass the values as logging arguments, like the calls below, so
        # the message is only formatted when the record is emitted
        logger.info('Judgment metric (%s, %s): %s',
                    self.datarun.metric, target_label, metric_string(model))

        old_best = self.db.get_best_classifier(datarun_id=self.datarun.id,
                                               score_target=target)
        if old_best is not None:
            # compare on the same attribute the datarun is scored by
            if getattr(model, target) > getattr(old_best, target):
                logger.info(
                    'New best score! Previous best (classifier %s): %s',
                    old_best.id, metric_string(old_best))
            else:
                logger.info('Best so far (classifier %s): %s', old_best.id,
                            metric_string(old_best))

        return model, metrics
Exemple #6
0
def enter_dataset(db, run_config, aws_config=None):
    """
    Generate a dataset, and update run_config with the dataset ID.

    db: Database object with active connection to ModelHub
    run_config: all attributes necessary to initialize a Datarun, including
        Dataset info
    aws_config: all attributes necessary to connect to an S3 bucket.

    Returns: the generated dataset object
    """
    # print is called with a single parenthesized string so the line is
    # valid on Python 3 and prints the same text on Python 2
    print('downloading data...')
    train_path, test_path = download_data(run_config.train_path,
                                          run_config.test_path, aws_config)
    print('creating dataset...')
    dataset = create_dataset(db, run_config.label_column, train_path, test_path,
                             run_config.data_description)

    # record the new dataset's ID on the run configuration
    run_config.dataset_id = dataset.id

    return dataset