def train(self, df, targets, conditions):
        # Obtain the targets column.
        if len(targets) != 1:
            raise BLE(
                ValueError('MultipleRegression requires exactly one column '
                           'in targets. Received {}'.format(targets)))
        if targets[0][1].lower() != 'numerical':
            raise BLE(
                ValueError('MultipleRegression can only regress NUMERICAL '
                           'columns. Received {}'.format(targets)))
        self.targets = [targets[0][0]]

        # Obtain the condition columns.
        if len(conditions) < 1:
            raise BLE(
                ValueError(
                    'MultipleRegression requires at least one '
                    'column in conditions. Received {}'.format(conditions)))
        self.conditions_categorical = []
        self.conditions_numerical = []
        for c in conditions:
            if c[1].lower() == 'categorical':
                self.conditions_categorical.append(c[0])
            else:
                self.conditions_numerical.append(c[0])
        self.conditions = self.conditions_numerical + \
            self.conditions_categorical

        # The dataset.
        self.dataset = pd.DataFrame()
        # Lookup for categoricals to code.
        self.categories_to_val_map = dict()
        # Training set (regressors and labels)
        self.X_numerical = np.ndarray(0)
        self.X_categorical = np.ndarray(0)
        self.Y = np.ndarray(0)
        # Linear regressors.
        self.mr_partial = LinearRegression()
        self.mr_full = LinearRegression()

        # Preprocess the data.
        self.dataset = utils.extract_sklearn_dataset(self.conditions,
                                                     self.targets, df)
        self.categories_to_val_map = utils.build_categorical_to_value_map(
            self.conditions_categorical, self.dataset)
        self.X_categorical = utils.extract_sklearn_features_categorical(
            self.conditions_categorical, self.categories_to_val_map,
            self.dataset)
        self.X_numerical = utils.extract_sklearn_features_numerical(
            self.conditions_numerical, self.dataset)
        self.Y = utils.extract_sklearn_univariate_target(
            self.targets, self.dataset)
        # Train the multiple regression.
        self._train_mr()
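
For orientation, here is a minimal usage sketch. The (column name, stattype) tuple format for targets and conditions is taken from the checks above; the class name MultipleRegression, its zero-argument constructor, and the column names are assumptions made for illustration only.

import pandas as pd

# Hypothetical data: one numerical target, one numerical and one categorical condition.
df = pd.DataFrame({
    'income': [50.0, 62.5, 48.0, 71.2],
    'age':    [34, 45, 29, 52],
    'region': ['north', 'south', 'north', 'east'],
})

model = MultipleRegression()  # assumed constructor; not shown in the snippet above
model.train(
    df,
    targets=[('income', 'NUMERICAL')],  # exactly one numerical target
    conditions=[('age', 'NUMERICAL'), ('region', 'CATEGORICAL')])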
Example #3
def train(self, df, targets, conditions):
    # Obtain the targets column.
    if len(targets) != 1:
        raise BLE(
            ValueError('RandomForest requires exactly one column in '
                       'targets. Received {}'.format(targets)))
    if targets[0][1].lower() != 'categorical':
        raise BLE(
            ValueError('RandomForest can only classify CATEGORICAL '
                       'columns. Received {}'.format(targets)))
    self.targets = [targets[0][0]]
    # Obtain the condition columns.
    if len(conditions) < 1:
        raise BLE(
            ValueError('RandomForest requires at least one column in '
                       'conditions. Received {}'.format(conditions)))
    self.conditions_categorical = []
    self.conditions_numerical = []
    for c in conditions:
        if c[1].lower() == 'categorical':
            self.conditions_categorical.append(c[0])
        else:
            self.conditions_numerical.append(c[0])
    self.conditions = self.conditions_numerical + \
        self.conditions_categorical
    # The dataset.
    self.dataset = pd.DataFrame()
    # Lookup for categoricals to code.
    self.categories_to_val_map = dict()
    # Training set (regressors and labels)
    self.X_numerical = np.ndarray(0)
    self.X_categorical = np.ndarray(0)
    self.Y = np.ndarray(0)
    # Random Forests.
    self.rf_partial = RandomForestClassifier(n_estimators=100)
    self.rf_full = RandomForestClassifier(n_estimators=100)
    # Preprocess the data.
    self.dataset = utils.extract_sklearn_dataset(self.conditions,
                                                 self.targets, df)
    self.categories_to_val_map = utils.build_categorical_to_value_map(
        self.conditions_categorical, self.dataset)
    self.X_categorical = utils.extract_sklearn_features_categorical(
        self.conditions_categorical, self.categories_to_val_map,
        self.dataset)
    self.X_numerical = utils.extract_sklearn_features_numerical(
        self.conditions_numerical, self.dataset)
    self.Y = utils.extract_sklearn_univariate_target(
        self.targets, self.dataset)
    # Train the random forest.
    self._train_rf()
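
Both train methods above delegate their preprocessing to a handful of utils helpers. As a rough sketch of the contract the categorical helpers appear to satisfy (build a per-column map from category labels to integer codes, then emit the coded matrix), something like the following would fit; the bodies below are illustrative assumptions, not the library's actual implementation.

import pandas as pd

def build_categorical_to_value_map(columns, dataset):
    # One dict per categorical column: observed category -> small integer code.
    return {
        col: {cat: code for code, cat in enumerate(sorted(dataset[col].dropna().unique()))}
        for col in columns}

def extract_sklearn_features_categorical(columns, categories_to_val_map, dataset):
    # Replace category labels with their codes and stack the columns into an (n, k) array.
    coded = pd.DataFrame(
        {col: dataset[col].map(categories_to_val_map[col]) for col in columns})
    return coded[columns].values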
def test_extract_sklearn_features_numerical():
    dataset = pd.DataFrame({
        'A': [1, 2, None, 4, 10],
        'B': [0, -2, 3, None, 1],
        'C': ['1', '2', '3', '4', '5'],
        'D': [1, None, 3, 4, 5],
    })
    columns = ['A', 'B']
    matrix = sku.extract_sklearn_features_numerical(columns, dataset)
    # Test correct imputation.
    mean_A = (1 + 2 + 4 + 10) / 4.
    mean_B = (0 - 2 + 3 + 1) / 4.
    expected = np.asarray([[1, 2, mean_A, 4, 10], [0, -2, 3, mean_B, 1]]).T
    assert np.array_equal(matrix, expected)
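
The test pins down the behavior expected of the numerical extractor: missing entries in each selected column are imputed with that column's mean, and the selected columns become the columns of the returned matrix. A sketch consistent with this test might look as follows; it illustrates the imputation rule and is not sku's actual code.

import numpy as np

def extract_sklearn_features_numerical(columns, dataset):
    # Mean-impute each requested column, then stack them as the columns of an (n, k) matrix.
    filled = [dataset[col].astype(float).fillna(dataset[col].mean())
              for col in columns]
    return np.asarray(filled).T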