Beispiel #1
0
def create_dataset(db, run_config, aws_config=None):
    """
    Register a new dataset in the ModelHub database and return it.

    The train/test files are first fetched with ``download_data`` so that
    metadata can be extracted from local copies. The paths stored in the
    database are the original ones taken from ``run_config``.

    db: initialized Database object
    run_config: RunConfig object describing the dataset to create
    aws_config: optional. AWS credentials for downloading data from S3.

    Returns the object produced by ``db.create_dataset``.
    """
    register = db.create_dataset

    # fetch local copies of the data files
    local_train, local_test = download_data(
        run_config.train_path, run_config.test_path, aws_config)

    # the dataset name is the training file's basename with every
    # "_train.csv" and then every ".csv" occurrence removed
    dataset_name = os.path.basename(local_train)
    for token in ("_train.csv", ".csv"):
        dataset_name = dataset_name.replace(token, "")

    # metadata fields below are stored alongside the dataset
    stats = MetaData(run_config.class_column, local_train, local_test)

    return register(
        name=dataset_name,
        description=run_config.data_description,
        train_path=run_config.train_path,
        test_path=run_config.test_path,
        class_column=run_config.class_column,
        n_examples=stats.n_examples,
        k_classes=stats.k_classes,
        d_features=stats.d_features,
        majority=stats.majority,
        # size is stored divided by 1000 (presumably bytes -> kB; confirm
        # against MetaData)
        size_kb=old_div(stats.size, 1000),
    )
Beispiel #2
0
def create_dataset(db, label_column, train_path, test_path=None,
                   data_description=None):
    """
    Register a new dataset in the ModelHub database and return it.

    db: initialized Database object
    label_column: name of csv column representing the label
    train_path: path to raw training data
    test_path: path to raw test data (optional)
    data_description: description of the dataset (max 1000 chars)

    Returns the object produced by ``db.create_dataset``.
    """
    register = db.create_dataset

    # the dataset name is the training file's basename with every
    # "_train.csv" and then every ".csv" occurrence removed
    dataset_name = os.path.basename(train_path)
    for token in ("_train.csv", ".csv"):
        dataset_name = dataset_name.replace(token, "")

    # metadata fields below are stored alongside the dataset
    stats = MetaData(label_column, train_path, test_path)

    return register(
        name=dataset_name,
        description=data_description,
        train_path=train_path,
        test_path=test_path,
        label_column=label_column,
        n_examples=stats.n_examples,
        k_classes=stats.k_classes,
        d_features=stats.d_features,
        majority=stats.majority,
        # size is stored divided by 1000 (presumably bytes -> kB; confirm
        # against MetaData)
        size_kb=stats.size / 1000,
    )
Beispiel #3
0
    def create_dataset(self):
        """
        Register a new dataset in the ModelHub database and return it.

        Reads the data locations, class column and description from
        ``self.run_conf``, fetches local copies of the files with
        ``download_data`` (passing ``self.aws_conf``) and stores the
        resulting metadata through ``self.db.create_dataset``.
        """
        register = self.db.create_dataset
        conf = self.run_conf

        # fetch local copies of the data files
        local_train, local_test = download_data(
            conf.train_path, conf.test_path, self.aws_conf)

        # the dataset name is the training file's basename with every
        # "_train.csv" and then every ".csv" occurrence removed
        dataset_name = os.path.basename(local_train)
        for token in ("_train.csv", ".csv"):
            dataset_name = dataset_name.replace(token, "")

        # metadata fields below are stored alongside the dataset
        stats = MetaData(self.run_conf.class_column, local_train, local_test)

        return register(
            name=dataset_name,
            description=self.run_conf.data_description,
            train_path=self.run_conf.train_path,
            test_path=self.run_conf.test_path,
            class_column=self.run_conf.class_column,
            n_examples=stats.n_examples,
            k_classes=stats.k_classes,
            d_features=stats.d_features,
            majority=stats.majority,
            # size is stored divided by 1000 (presumably bytes -> kB;
            # confirm against MetaData)
            size_kb=old_div(stats.size, 1000),
        )
Beispiel #4
0
    def train_test(self, train_path, test_path=None):
        """
        Fit the pipeline on the training data and score it on test data.

        train_path: path to the training data, passed to MetaData and to
            self.load_data
        test_path: optional path to the test data. When None, a test set is
            split off the training data using self.testing_ratio and
            self.random_state.

        Sets self.num_classes, self.num_features and self.encoder, and may
        rewrite self.judgment_metric so that it matches the number of
        classes.

        Returns a dict holding the cross-validation scores under 'cv' and
        the scores of the final model on the test set under 'test'.
        """
        meta = MetaData(class_column=self.class_column,
                        train_path=train_path,
                        test_path=test_path)
        self.num_classes = meta.k_classes
        self.num_features = meta.d_features

        # two-class problems use the plain F1 / ROC AUC metrics; any other
        # class count swaps the plain metrics for their macro variants
        is_binary = self.num_classes == 2
        if is_binary and self.judgment_metric in (Metrics.F1_MICRO,
                                                  Metrics.F1_MACRO):
            self.judgment_metric = Metrics.F1
        elif is_binary and self.judgment_metric in (Metrics.ROC_AUC_MICRO,
                                                    Metrics.ROC_AUC_MACRO):
            self.judgment_metric = Metrics.ROC_AUC
        elif not is_binary and self.judgment_metric == Metrics.F1:
            self.judgment_metric = Metrics.F1_MACRO
        elif not is_binary and self.judgment_metric == Metrics.ROC_AUC:
            self.judgment_metric = Metrics.ROC_AUC_MACRO

        # the training file is always loaded first
        loaded_train = self.load_data(train_path)

        if test_path is None:
            # no separate test file: carve a test set out of the training
            # data, while the encoder still sees every row
            everything = loaded_train
            train_part, test_part = train_test_split(
                loaded_train,
                test_size=self.testing_ratio,
                random_state=self.random_state)
        else:
            train_part = loaded_train
            test_part = self.load_data(test_path)
            everything = pd.concat([train_part, test_part])

        # fit the encoder on all rows, then encode each partition
        self.encoder = DataEncoder(class_column=self.class_column)
        self.encoder.fit(everything)
        features_train, labels_train = self.encoder.transform(train_part)
        features_test, labels_test = self.encoder.transform(test_part)

        # build the pipeline and cross-validate it on the training partition
        self.make_pipeline()
        scores_cv = self.cross_validate(features_train, labels_train)

        # fit on the full training partition, then score on the test set
        self.pipeline.fit(features_train, labels_train)
        scores_test = self.test_final_model(features_test, labels_test)
        return {'cv': scores_cv, 'test': scores_test}