def create_dataset(db, run_config, aws_config=None):
    """
    Create a dataset and add it to the ModelHub database.

    db: initialized Database object
    run_config: RunConfig object describing the dataset to create
    aws_config: optional. AWS credentials for downloading data from S3.
    """
    # download data to the local filesystem to extract metadata
    train_local, test_local = download_data(run_config.train_path,
                                            run_config.test_path, aws_config)

    # create the name of the dataset from the path to the data
    name = os.path.basename(train_local)
    name = name.replace("_train.csv", "").replace(".csv", "")

    # process the data into the form ATM needs and save it to disk
    meta = MetaData(run_config.class_column, train_local, test_local)

    # enter dataset into database
    dataset = db.create_dataset(name=name,
                                description=run_config.data_description,
                                train_path=run_config.train_path,
                                test_path=run_config.test_path,
                                class_column=run_config.class_column,
                                n_examples=meta.n_examples,
                                k_classes=meta.k_classes,
                                d_features=meta.d_features,
                                majority=meta.majority,
                                size_kb=old_div(meta.size, 1000))
    return dataset
def create_dataset(db, label_column, train_path, test_path=None,
                   data_description=None):
    """
    Create a dataset and add it to the ModelHub database.

    db: initialized Database object
    label_column: name of the CSV column representing the label
    train_path: path to raw training data
    test_path: path to raw test data
    data_description: description of the dataset (max 1000 chars)
    """
    # create the name of the dataset from the path to the data
    name = os.path.basename(train_path)
    name = name.replace("_train.csv", "").replace(".csv", "")

    # process the data into the form ATM needs and save it to disk
    meta = MetaData(label_column, train_path, test_path)

    # enter dataset into database
    dataset = db.create_dataset(name=name,
                                description=data_description,
                                train_path=train_path,
                                test_path=test_path,
                                label_column=label_column,
                                n_examples=meta.n_examples,
                                k_classes=meta.k_classes,
                                d_features=meta.d_features,
                                majority=meta.majority,
                                size_kb=meta.size / 1000)
    return dataset
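# --- Usage sketch (illustrative assumptions, not canonical ATM code) ---
# A minimal example of registering a dataset with the standalone
# (db, label_column, train_path, ...) variant of create_dataset() directly
# above. The Database constructor arguments and the file path below are
# hypothetical placeholders; consult the actual Database class for its
# real signature.
db = Database(dialect='sqlite', database='atm.db')  # hypothetical arguments
dataset = create_dataset(db,
                         label_column='class',
                         train_path='data/pollution_train.csv',
                         data_description='air pollution sample data')
print('registered dataset: %s' % dataset.name)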
def create_dataset(self):
    """
    Create a dataset and add it to the ModelHub database.
    """
    # download data to the local filesystem to extract metadata
    train_local, test_local = download_data(self.run_conf.train_path,
                                            self.run_conf.test_path,
                                            self.aws_conf)

    # create the name of the dataset from the path to the data
    name = os.path.basename(train_local)
    name = name.replace("_train.csv", "").replace(".csv", "")

    # process the data into the form ATM needs and save it to disk
    meta = MetaData(self.run_conf.class_column, train_local, test_local)

    # enter dataset into database
    dataset = self.db.create_dataset(
        name=name,
        description=self.run_conf.data_description,
        train_path=self.run_conf.train_path,
        test_path=self.run_conf.test_path,
        class_column=self.run_conf.class_column,
        n_examples=meta.n_examples,
        k_classes=meta.k_classes,
        d_features=meta.d_features,
        majority=meta.majority,
        size_kb=old_div(meta.size, 1000))
    return dataset
def train_test(self, train_path, test_path=None):
    # load train and (maybe) test data
    metadata = MetaData(class_column=self.class_column,
                        train_path=train_path,
                        test_path=test_path)
    self.num_classes = metadata.k_classes
    self.num_features = metadata.d_features

    # if necessary, cast judgment metric into its binary/multiclass equivalent
    if self.num_classes == 2:
        if self.judgment_metric in [Metrics.F1_MICRO, Metrics.F1_MACRO]:
            self.judgment_metric = Metrics.F1
        elif self.judgment_metric in [Metrics.ROC_AUC_MICRO,
                                      Metrics.ROC_AUC_MACRO]:
            self.judgment_metric = Metrics.ROC_AUC
    else:
        if self.judgment_metric == Metrics.F1:
            self.judgment_metric = Metrics.F1_MACRO
        elif self.judgment_metric == Metrics.ROC_AUC:
            self.judgment_metric = Metrics.ROC_AUC_MACRO

    # load training data
    train_data = self.load_data(train_path)

    # if necessary, generate permanent train/test split
    if test_path is not None:
        test_data = self.load_data(test_path)
        all_data = pd.concat([train_data, test_data])
    else:
        all_data = train_data
        train_data, test_data = train_test_split(
            train_data,
            test_size=self.testing_ratio,
            random_state=self.random_state)

    # extract feature matrix and labels from raw data
    self.encoder = DataEncoder(class_column=self.class_column)
    self.encoder.fit(all_data)
    X_train, y_train = self.encoder.transform(train_data)
    X_test, y_test = self.encoder.transform(test_data)

    # create and cross-validate pipeline
    self.make_pipeline()
    cv_scores = self.cross_validate(X_train, y_train)

    # train and test the final model
    self.pipeline.fit(X_train, y_train)
    test_scores = self.test_final_model(X_test, y_test)

    return {'cv': cv_scores, 'test': test_scores}
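# --- Usage sketch (illustrative assumptions, not canonical ATM code) ---
# train_test() assumes its owning class provides class_column,
# judgment_metric, testing_ratio, random_state, load_data(),
# make_pipeline(), cross_validate(), test_final_model(), and a pipeline
# attribute. "Model" and its constructor below are hypothetical stand-ins
# for whatever class defines the method. With no test_path, a one-time
# split is drawn using testing_ratio; with a test_path, that file becomes
# the permanent held-out set.
model = Model(class_column='class',  # hypothetical constructor
              judgment_metric=Metrics.F1,
              testing_ratio=0.3,
              random_state=42)
scores = model.train_test(train_path='data/pollution_train.csv')
print('cross-validation scores:', scores['cv'])
print('held-out test scores:', scores['test'])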