def test_extract_sklearn_features_categorical():
    """Check the categorical feature matrix against a hand-written one.

    Only 'Nationality' and 'Gender' are requested; the frame also holds
    columns 'C' and 'D', which the expected matrix has no columns for.
    Each expected row has a one at the row's Nationality code (columns
    0-3) and a one at 4 + the row's Gender code (columns 4-6).
    """
    nationality_codes = {'USA': 0, 'France': 1, 'Germany': 2, 'Bengal': 3}
    gender_codes = {'M': 0, 'F': 1, 'T': 2}

    frame = pd.DataFrame({
        'Nationality': ['USA', 'USA', 'France', 'Germany', 'Bengal'],
        'Gender': ['M', 'F', 'M', 'M', 'T'],
        'C': ['1', '2', '3', '4', '5'],
        'D': [1, None, 3, 4, 5],
    })

    encoded = sku.extract_sklearn_features_categorical(
        ['Nationality', 'Gender'],
        {'Nationality': nationality_codes, 'Gender': gender_codes},
        frame)

    expected = np.asarray([
        [1, 0, 0, 0, 1, 0, 0],  # USA, M
        [1, 0, 0, 0, 0, 1, 0],  # USA, F
        [0, 1, 0, 0, 1, 0, 0],  # France, M
        [0, 0, 1, 0, 1, 0, 0],  # Germany, M
        [0, 0, 0, 1, 0, 0, 1],  # Bengal, T
    ])
    assert np.array_equal(encoded, expected)
    def train(self, df, targets, conditions):
        """Validate the column specs, preprocess `df` and fit the regression.

        Every entry of `targets` and `conditions` is indexed as
        ``entry[0]`` (column name) and ``entry[1]`` (statistical type
        string, compared case-insensitively).

        :param df: source data passed to `utils.extract_sklearn_dataset`.
        :param targets: exactly one entry, whose type must be 'numerical'.
        :param conditions: one or more entries; those typed 'categorical'
            are handled as categorical, any other type as numerical.
        :raises BLE: wrapping a ValueError when `targets` does not hold
            exactly one numerical column, or when `conditions` is empty.
        """
        # Obtain the targets column. The check demands exactly one target,
        # so the message says so (it previously claimed "at least one").
        if len(targets) != 1:
            raise BLE(
                ValueError('MultipleRegression requires exactly one column '
                           'in targets. Received {}'.format(targets)))
        if targets[0][1].lower() != 'numerical':
            raise BLE(
                ValueError('MultipleRegression can only regress NUMERICAL '
                           'columns. Received {}'.format(targets)))
        self.targets = [targets[0][0]]

        # Obtain the condition columns.
        if len(conditions) < 1:
            raise BLE(
                ValueError(
                    'MultipleRegression requires at least one '
                    'column in conditions. Received {}'.format(conditions)))
        # Split the condition names by type; anything that is not
        # 'categorical' falls into the numerical bucket.
        self.conditions_categorical = []
        self.conditions_numerical = []
        for c in conditions:
            if c[1].lower() == 'categorical':
                self.conditions_categorical.append(c[0])
            else:
                self.conditions_numerical.append(c[0])
        # Numerical names come first, then the categorical ones.
        self.conditions = self.conditions_numerical + \
            self.conditions_categorical

        # Reset the state to empty placeholders before preprocessing, so
        # these attributes exist even if a preprocessing call below raises.
        # The dataset.
        self.dataset = pd.DataFrame()
        # Lookup for categoricals to code.
        self.categories_to_val_map = dict()
        # Training set (regressors and labels)
        self.X_numerical = np.ndarray(0)
        self.X_categorical = np.ndarray(0)
        self.Y = np.ndarray(0)
        # Linear regressors.
        self.mr_partial = LinearRegression()
        self.mr_full = LinearRegression()

        # Preprocess the data.
        self.dataset = utils.extract_sklearn_dataset(self.conditions,
                                                     self.targets, df)
        self.categories_to_val_map = utils.build_categorical_to_value_map(
            self.conditions_categorical, self.dataset)
        self.X_categorical = utils.extract_sklearn_features_categorical(
            self.conditions_categorical, self.categories_to_val_map,
            self.dataset)
        self.X_numerical = utils.extract_sklearn_features_numerical(
            self.conditions_numerical, self.dataset)
        self.Y = utils.extract_sklearn_univariate_target(
            self.targets, self.dataset)
        # Train the multiple regression.
        self._train_mr()
    def train(self, df, targets, conditions):
        """Check the inputs, extract the training arrays and fit the model.

        Entries of `targets` and `conditions` are read as ``entry[0]``
        (column name) and ``entry[1]`` (statistical type string, matched
        case-insensitively).

        :param df: raw data forwarded to `utils.extract_sklearn_dataset`.
        :param targets: a single entry of type 'numerical'.
        :param conditions: at least one entry; 'categorical' entries are
            kept apart from all others, which are treated as numerical.
        :raises BLE: wrapping a ValueError if `targets` is not exactly one
            numerical column, or if `conditions` has no entries.
        """
        # Obtain the targets column. The guard rejects anything other than
        # one target, so the message reads "exactly one" rather than the
        # earlier, misleading "at least one".
        if len(targets) != 1:
            raise BLE(ValueError(
                'MultipleRegression requires exactly one column '
                'in targets. Received {}'.format(targets)))
        if targets[0][1].lower() != 'numerical':
            raise BLE(ValueError(
                'MultipleRegression can only regress NUMERICAL '
                'columns. Received {}'.format(targets)))
        self.targets = [targets[0][0]]

        # Obtain the condition columns.
        if len(conditions) < 1:
            raise BLE(ValueError(
                'MultipleRegression requires at least one '
                'column in conditions. Received {}'.format(conditions)))
        # Partition condition names: 'categorical' versus everything else.
        self.conditions_categorical = []
        self.conditions_numerical = []
        for c in conditions:
            if c[1].lower() == 'categorical':
                self.conditions_categorical.append(c[0])
            else:
                self.conditions_numerical.append(c[0])
        # Combined ordering is numerical names followed by categorical.
        self.conditions = self.conditions_numerical + \
            self.conditions_categorical

        # Start from empty placeholders; they are overwritten below once
        # preprocessing succeeds.
        # The dataset.
        self.dataset = pd.DataFrame()
        # Lookup for categoricals to code.
        self.categories_to_val_map = dict()
        # Training set (regressors and labels)
        self.X_numerical = np.ndarray(0)
        self.X_categorical = np.ndarray(0)
        self.Y = np.ndarray(0)
        # Linear regressors.
        self.mr_partial = LinearRegression()
        self.mr_full = LinearRegression()

        # Preprocess the data.
        self.dataset = utils.extract_sklearn_dataset(
            self.conditions, self.targets, df)
        self.categories_to_val_map = utils.build_categorical_to_value_map(
            self.conditions_categorical, self.dataset)
        self.X_categorical = utils.extract_sklearn_features_categorical(
            self.conditions_categorical, self.categories_to_val_map,
            self.dataset)
        self.X_numerical = utils.extract_sklearn_features_numerical(
            self.conditions_numerical, self.dataset)
        self.Y = utils.extract_sklearn_univariate_target(
            self.targets, self.dataset)
        # Train the multiple regression.
        self._train_mr()
Exemple #4
0
 def train(self, df, targets, conditions):
     """Validate the column specs, preprocess `df` and fit the forests.

     Every entry of `targets` and `conditions` is indexed as
     ``entry[0]`` (column name) and ``entry[1]`` (statistical type
     string, compared case-insensitively).

     :param df: source data passed to `utils.extract_sklearn_dataset`.
     :param targets: exactly one entry, whose type must be 'categorical'.
     :param conditions: one or more entries; those typed 'categorical'
         are handled as categorical, any other type as numerical.
     :raises BLE: wrapping a ValueError when `targets` is not exactly one
         categorical column, or when `conditions` is empty.
     """
     # --- Target column -------------------------------------------------
     if len(targets) != 1:
         message = ('RandomForest requires exactly one column in '
                    'targets. Received {}'.format(targets))
         raise BLE(ValueError(message))
     target = targets[0]
     if target[1].lower() != 'categorical':
         message = ('RandomForest can only classify CATEGORICAL '
                    'columns. Received {}'.format(targets))
         raise BLE(ValueError(message))
     self.targets = [target[0]]

     # --- Condition columns ---------------------------------------------
     if len(conditions) < 1:
         message = ('RandomForest requires at least one column in '
                    'conditions. Received {}'.format(conditions))
         raise BLE(ValueError(message))
     self.conditions_categorical = []
     self.conditions_numerical = []
     for cond in conditions:
         # Anything not typed 'categorical' goes to the numerical bucket.
         is_categorical = cond[1].lower() == 'categorical'
         bucket = (self.conditions_categorical if is_categorical
                   else self.conditions_numerical)
         bucket.append(cond[0])
     # Numerical names first, categorical names after.
     self.conditions = (
         self.conditions_numerical + self.conditions_categorical)

     # --- Empty placeholders, replaced by the preprocessing below -------
     self.dataset = pd.DataFrame()
     self.categories_to_val_map = {}  # categorical value -> code lookup
     self.X_numerical = np.ndarray(0)
     self.X_categorical = np.ndarray(0)
     self.Y = np.ndarray(0)
     self.rf_partial = RandomForestClassifier(n_estimators=100)
     self.rf_full = RandomForestClassifier(n_estimators=100)

     # --- Preprocessing -------------------------------------------------
     self.dataset = utils.extract_sklearn_dataset(
         self.conditions, self.targets, df)
     self.categories_to_val_map = utils.build_categorical_to_value_map(
         self.conditions_categorical, self.dataset)
     self.X_categorical = utils.extract_sklearn_features_categorical(
         self.conditions_categorical,
         self.categories_to_val_map,
         self.dataset)
     self.X_numerical = utils.extract_sklearn_features_numerical(
         self.conditions_numerical, self.dataset)
     self.Y = utils.extract_sklearn_univariate_target(
         self.targets, self.dataset)

     # --- Fit -----------------------------------------------------------
     self._train_rf()
Exemple #5
0
 def train(self, df, targets, conditions):
     """Check the inputs, extract the training arrays and fit the forests.

     Entries of `targets` and `conditions` are read as ``entry[0]``
     (column name) and ``entry[1]`` (statistical type string, matched
     case-insensitively).

     :param df: raw data forwarded to `utils.extract_sklearn_dataset`.
     :param targets: a single entry of type 'categorical'.
     :param conditions: at least one entry; 'categorical' entries are
         kept apart from all others, which are treated as numerical.
     :raises BLE: wrapping a ValueError if `targets` is not exactly one
         categorical column, or if `conditions` has no entries.
     """
     # Target: one column only, and it has to be categorical.
     if len(targets) != 1:
         error = ValueError(
             'RandomForest requires exactly one column in '
             'targets. Received {}'.format(targets))
         raise BLE(error)
     target_spec = targets[0]
     if target_spec[1].lower() != 'categorical':
         error = ValueError(
             'RandomForest can only classify CATEGORICAL '
             'columns. Received {}'.format(targets))
         raise BLE(error)
     self.targets = [target_spec[0]]

     # Conditions: need at least one, then split the names by type.
     if len(conditions) < 1:
         error = ValueError(
             'RandomForest requires at least one column in '
             'conditions. Received {}'.format(conditions))
         raise BLE(error)
     self.conditions_categorical = []
     self.conditions_numerical = []
     for spec in conditions:
         kind = spec[1].lower()
         if kind != 'categorical':
             # Every non-categorical type is treated as numerical.
             self.conditions_numerical.append(spec[0])
         else:
             self.conditions_categorical.append(spec[0])
     # Combined ordering: numerical names, then categorical names.
     self.conditions = (
         self.conditions_numerical + self.conditions_categorical)

     # Reset state to empty placeholders ahead of preprocessing.
     self.dataset = pd.DataFrame()
     # Lookup for categoricals to code.
     self.categories_to_val_map = {}
     # Training set (regressors and labels).
     self.X_numerical = np.ndarray(0)
     self.X_categorical = np.ndarray(0)
     self.Y = np.ndarray(0)
     # Two forests with 100 estimators each.
     self.rf_partial = RandomForestClassifier(n_estimators=100)
     self.rf_full = RandomForestClassifier(n_estimators=100)

     # Preprocess: each step stores its result on the instance.
     self.dataset = utils.extract_sklearn_dataset(
         self.conditions, self.targets, df)
     self.categories_to_val_map = utils.build_categorical_to_value_map(
         self.conditions_categorical, self.dataset)
     self.X_categorical = utils.extract_sklearn_features_categorical(
         self.conditions_categorical, self.categories_to_val_map,
         self.dataset)
     self.X_numerical = utils.extract_sklearn_features_numerical(
         self.conditions_numerical, self.dataset)
     self.Y = utils.extract_sklearn_univariate_target(
         self.targets, self.dataset)

     # Train the random forest.
     self._train_rf()
def test_extract_sklearn_features_categorical():
    """Compare the categorical feature matrix with a hand-built one.

    The frame carries four columns but only 'Nationality' and 'Gender'
    are requested. The expected matrix is the Nationality indicator
    block (4 columns) placed side by side with the Gender indicator
    block (3 columns), one row per record.
    """
    records = {
        'Nationality': ['USA', 'USA', 'France', 'Germany', 'Bengal'],
        'Gender': ['M', 'F', 'M', 'M', 'T'],
        'C': ['1', '2', '3', '4', '5'],
        'D': [1, None, 3, 4, 5],
    }
    dataset = pd.DataFrame(records)

    categories = ['Nationality', 'Gender']
    categories_to_val_map = {
        'Nationality': {'USA': 0, 'France': 1, 'Germany': 2, 'Bengal': 3},
        'Gender': {'M': 0, 'F': 1, 'T': 2},
    }

    matrix = sku.extract_sklearn_features_categorical(
        categories, categories_to_val_map, dataset)

    # Indicator rows follow the codes in categories_to_val_map.
    nationality_block = [
        [1, 0, 0, 0],  # USA
        [1, 0, 0, 0],  # USA
        [0, 1, 0, 0],  # France
        [0, 0, 1, 0],  # Germany
        [0, 0, 0, 1],  # Bengal
    ]
    gender_block = [
        [1, 0, 0],  # M
        [0, 1, 0],  # F
        [1, 0, 0],  # M
        [1, 0, 0],  # M
        [0, 0, 1],  # T
    ]
    expected = np.hstack([nationality_block, gender_block])
    assert np.array_equal(matrix, expected)