def create_dataset(db, run_config, aws_config=None):
    """
    Create a dataset and add it to the ModelHub database.

    db: initialized Database object
    run_config: RunConfig object describing the dataset to create
    aws_config: optional. AWS credentials for downloading data from S3.

    Returns: the newly-created Dataset object.
    """
    # pull the data down to the local filesystem so metadata can be extracted
    train_local, test_local = download_data(run_config.train_path,
                                            run_config.test_path, aws_config)

    # derive the dataset's name from the basename of its training file
    base_name = os.path.basename(train_local)
    base_name = base_name.replace("_train.csv", "").replace(".csv", "")

    # compute the summary statistics ATM needs about this dataset
    meta = MetaData(run_config.class_column, train_local, test_local)

    # register the dataset in the ModelHub database and return the record
    return db.create_dataset(name=base_name,
                             description=run_config.data_description,
                             train_path=run_config.train_path,
                             test_path=run_config.test_path,
                             class_column=run_config.class_column,
                             n_examples=meta.n_examples,
                             k_classes=meta.k_classes,
                             d_features=meta.d_features,
                             majority=meta.majority,
                             size_kb=old_div(meta.size, 1000))
def create_dataset(self):
    """Create a dataset and add it to the ModelHub database."""
    # fetch the data to the local filesystem so we can extract metadata
    local_paths = download_data(self.run_conf.train_path,
                                self.run_conf.test_path, self.aws_conf)
    train_local, test_local = local_paths

    # the dataset name comes from the training file's basename, with the
    # conventional suffixes stripped off
    name = os.path.basename(train_local)
    name = name.replace("_train.csv", "").replace(".csv", "")

    # compute the summary statistics ATM needs about the data
    meta = MetaData(self.run_conf.class_column, train_local, test_local)

    # persist the new dataset record in the ModelHub database
    return self.db.create_dataset(
        name=name,
        description=self.run_conf.data_description,
        train_path=self.run_conf.train_path,
        test_path=self.run_conf.test_path,
        class_column=self.run_conf.class_column,
        n_examples=meta.n_examples,
        k_classes=meta.k_classes,
        d_features=meta.d_features,
        majority=meta.majority,
        size_kb=old_div(meta.size, 1000))
def model(dataset):
    """Train and return a decision-tree Model for the given dataset."""
    # only the training file is needed; ignore the test path
    train_path, _ = download_data(dataset.train_path)
    dt_model = Model(method='dt', params=DT_PARAMS,
                     judgment_metric='roc_auc',
                     class_column=dataset.class_column)
    dt_model.train_test(train_path=train_path)
    return dt_model
def test_classifier(self, method, params):
    """
    Given a set of fully-qualified hyperparameters, create and test a
    classifier model.
    Returns: Model object and metrics dictionary
    """
    model = Model(method=method, params=params,
                  judgment_metric=self.datarun.metric,
                  class_column=self.dataset.class_column,
                  verbose_metrics=self.verbose_metrics)
    train_path, test_path = download_data(self.dataset.train_path,
                                          self.dataset.test_path,
                                          self.aws_config)
    metrics = model.train_test(train_path=train_path, test_path=test_path)
    target = self.datarun.score_target

    def describe(clf):
        # cross-validated targets report the mean with a 2-sigma interval;
        # otherwise report the plain test-set score
        if 'cv' in target or 'mu_sigma' in target:
            return '%.3f +- %.3f' % (clf.cv_judgment_metric,
                                     2 * clf.cv_judgment_metric_stdev)
        return '%.3f' % clf.test_judgment_metric

    logger.info('Judgment metric (%s, %s): %s' %
                (self.datarun.metric,
                 target[:-len('_judgment_metric')],
                 describe(model)))

    old_best = self.db.get_best_classifier(datarun_id=self.datarun.id,
                                           score_target=target)
    # nothing to compare against on the datarun's first classifier
    if old_best is None:
        return model, metrics

    if getattr(model, target) > getattr(old_best, target):
        logger.info('New best score! Previous best (classifier %s): %s',
                    old_best.id, describe(old_best))
    else:
        logger.info('Best so far (classifier %s): %s',
                    old_best.id, describe(old_best))

    return model, metrics
def enter_dataset(db, run_config, aws_config=None):
    """
    Generate a dataset, and update run_config with the dataset ID.

    db: Database object with active connection to ModelHub
    run_config: all attributes necessary to initialize a Datarun, including
        Dataset info
    aws_config: all attributes necessary to connect to an S3 bucket.

    Returns: the generated dataset object
    """
    # BUGFIX: `print` statements were Python 2 syntax (a SyntaxError on
    # Python 3); use the print() function form, valid on both.
    print('downloading data...')
    # Pre-fetch the data; download_data is also called inside
    # create_dataset, so this is a warm-up/validation step only.
    download_data(run_config.train_path, run_config.test_path, aws_config)

    print('creating dataset...')
    # BUGFIX: create_dataset takes (db, run_config, aws_config) -- the old
    # call passed (db, label_column, train_path, test_path, description),
    # which does not match its signature and raised a TypeError.
    dataset = create_dataset(db, run_config, aws_config)
    run_config.dataset_id = dataset.id
    return dataset