def train(self, df, targets, conditions):
    """Fit the multiple regression on `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data from which the training set is extracted.
    targets : list of (name, stattype) pairs
        Must contain exactly one column, and it must be NUMERICAL.
    conditions : list of (name, stattype) pairs
        At least one column; CATEGORICAL conditions are integer-coded
        via a value map, all others are treated as numerical.

    Raises
    ------
    BLE (wrapping ValueError)
        If targets or conditions violate the constraints above.
    """
    # Obtain the targets column.
    if len(targets) != 1:
        # BUG FIX: the check demands exactly one target, but the old
        # message said 'at least one' (cf. RandomForest.train, which
        # words the identical check correctly).
        raise BLE(ValueError(
            'MultipleRegression requires exactly one column '
            'in targets. Received {}'.format(targets)))
    if targets[0][1].lower() != 'numerical':
        raise BLE(ValueError(
            'MultipleRegression can only regress NUMERICAL '
            'columns. Received {}'.format(targets)))
    self.targets = [targets[0][0]]
    # Obtain the condition columns.
    if len(conditions) < 1:
        raise BLE(ValueError(
            'MultipleRegression requires at least one '
            'column in conditions. Received {}'.format(conditions)))
    self.conditions_categorical = []
    self.conditions_numerical = []
    for c in conditions:
        if c[1].lower() == 'categorical':
            self.conditions_categorical.append(c[0])
        else:
            self.conditions_numerical.append(c[0])
    # Numerical conditions first, then categorical (feature order
    # downstream relies on this concatenation order).
    self.conditions = self.conditions_numerical + \
        self.conditions_categorical
    # The dataset.
    self.dataset = pd.DataFrame()
    # Lookup for categoricals to code.
    self.categories_to_val_map = dict()
    # Training set (regressors and labels).
    self.X_numerical = np.ndarray(0)
    self.X_categorical = np.ndarray(0)
    self.Y = np.ndarray(0)
    # Linear regressors: one over numerical conditions only (partial),
    # one over all conditions (full).
    self.mr_partial = LinearRegression()
    self.mr_full = LinearRegression()
    # Preprocess the data.
    self.dataset = utils.extract_sklearn_dataset(
        self.conditions, self.targets, df)
    self.categories_to_val_map = utils.build_categorical_to_value_map(
        self.conditions_categorical, self.dataset)
    self.X_categorical = utils.extract_sklearn_features_categorical(
        self.conditions_categorical, self.categories_to_val_map,
        self.dataset)
    self.X_numerical = utils.extract_sklearn_features_numerical(
        self.conditions_numerical, self.dataset)
    self.Y = utils.extract_sklearn_univariate_target(
        self.targets, self.dataset)
    # Train the multiple regression.
    self._train_mr()
def train(self, df, targets, conditions):
    """Fit the multiple regression on `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data from which the training set is extracted.
    targets : list of (name, stattype) pairs
        Must contain exactly one column, and it must be NUMERICAL.
    conditions : list of (name, stattype) pairs
        At least one column; CATEGORICAL conditions are integer-coded
        via a value map, all others are treated as numerical.

    Raises
    ------
    BLE (wrapping ValueError)
        If targets or conditions violate the constraints above.
    """
    # Obtain the targets column.
    if len(targets) != 1:
        # BUG FIX: the check demands exactly one target, but the old
        # message said 'at least one' (cf. RandomForest.train, which
        # words the identical check correctly).
        raise BLE(ValueError(
            'MultipleRegression requires exactly one column '
            'in targets. Received {}'.format(targets)))
    if targets[0][1].lower() != 'numerical':
        raise BLE(ValueError(
            'MultipleRegression can only regress NUMERICAL '
            'columns. Received {}'.format(targets)))
    self.targets = [targets[0][0]]
    # Obtain the condition columns.
    if len(conditions) < 1:
        raise BLE(ValueError(
            'MultipleRegression requires at least one '
            'column in conditions. Received {}'.format(conditions)))
    self.conditions_categorical = []
    self.conditions_numerical = []
    for c in conditions:
        if c[1].lower() == 'categorical':
            self.conditions_categorical.append(c[0])
        else:
            self.conditions_numerical.append(c[0])
    # Numerical conditions first, then categorical (feature order
    # downstream relies on this concatenation order).
    self.conditions = self.conditions_numerical + \
        self.conditions_categorical
    # The dataset.
    self.dataset = pd.DataFrame()
    # Lookup for categoricals to code.
    self.categories_to_val_map = dict()
    # Training set (regressors and labels).
    self.X_numerical = np.ndarray(0)
    self.X_categorical = np.ndarray(0)
    self.Y = np.ndarray(0)
    # Linear regressors: one over numerical conditions only (partial),
    # one over all conditions (full).
    self.mr_partial = LinearRegression()
    self.mr_full = LinearRegression()
    # Preprocess the data.
    self.dataset = utils.extract_sklearn_dataset(
        self.conditions, self.targets, df)
    self.categories_to_val_map = utils.build_categorical_to_value_map(
        self.conditions_categorical, self.dataset)
    self.X_categorical = utils.extract_sklearn_features_categorical(
        self.conditions_categorical, self.categories_to_val_map,
        self.dataset)
    self.X_numerical = utils.extract_sklearn_features_numerical(
        self.conditions_numerical, self.dataset)
    self.Y = utils.extract_sklearn_univariate_target(
        self.targets, self.dataset)
    # Train the multiple regression.
    self._train_mr()
def test_extract_sklearn_dataset():
    """extract_sklearn_dataset keeps only condition/target columns and
    drops rows whose target is null."""
    frame = pd.DataFrame({
        'A': [1.1, 2.1, 3.9, 4.5, 5.1],
        'B': [5.1, 4.1, 3.9, 2.5, 1.1],
        'C': ['1', '2', '3', '4', '5'],
        'D': [1, None, 3, 4, 5],
    })
    conditions = ['A', 'B']
    targets = ['D']
    extracted = sku.extract_sklearn_dataset(conditions, targets, frame)
    # Column 'C' is neither a condition nor a target, so it must be gone.
    assert set(extracted.columns) == set(conditions + targets)
    # Row index 1 has a None target ('D'), so it must have been dropped.
    assert len(extracted) == 4
def test_extract_sklearn_dataset():
    """Check column projection and null-target row filtering."""
    raw = pd.DataFrame({
        'A': [1.1, 2.1, 3.9, 4.5, 5.1],
        'B': [5.1, 4.1, 3.9, 2.5, 1.1],
        'C': ['1', '2', '3', '4', '5'],
        'D': [1, None, 3, 4, 5],
    })
    conditions, targets = ['A', 'B'], ['D']
    result = sku.extract_sklearn_dataset(conditions, targets, raw)
    # Only the requested condition/target columns survive ('C' dropped).
    expected_columns = set(conditions + targets)
    assert set(result.columns) == expected_columns
    # The row with a missing target value (second row) is removed.
    assert len(result) == 4
def train(self, df, targets, conditions):
    """Fit the random forest classifier on `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data from which the training set is extracted.
    targets : list of (name, stattype) pairs
        Must contain exactly one column, and it must be CATEGORICAL.
    conditions : list of (name, stattype) pairs
        At least one column; CATEGORICAL conditions are integer-coded
        via a value map, all others are treated as numerical.

    Raises
    ------
    BLE (wrapping ValueError)
        If targets or conditions violate the constraints above.
    """
    # Validate and record the single categorical target column.
    if len(targets) != 1:
        raise BLE(ValueError(
            'RandomForest requires exactly one column in '
            'targets. Received {}'.format(targets)))
    if targets[0][1].lower() != 'categorical':
        raise BLE(ValueError(
            'RandomForest can only classify CATEGORICAL '
            'columns. Received {}'.format(targets)))
    self.targets = [targets[0][0]]
    # Validate and partition the condition columns by statistical type.
    if len(conditions) < 1:
        raise BLE(ValueError(
            'RandomForest requires at least one column in '
            'conditions. Received {}'.format(conditions)))
    self.conditions_categorical = [
        c[0] for c in conditions if c[1].lower() == 'categorical']
    self.conditions_numerical = [
        c[0] for c in conditions if c[1].lower() != 'categorical']
    # Numerical conditions first, then categorical (feature order
    # downstream relies on this concatenation order).
    self.conditions = (
        self.conditions_numerical + self.conditions_categorical)
    # Placeholders, overwritten by the preprocessing below.
    self.dataset = pd.DataFrame()
    self.categories_to_val_map = dict()
    self.X_numerical = np.ndarray(0)
    self.X_categorical = np.ndarray(0)
    self.Y = np.ndarray(0)
    # Two forests: one over numerical conditions only (partial), one
    # over all conditions (full).
    self.rf_partial = RandomForestClassifier(n_estimators=100)
    self.rf_full = RandomForestClassifier(n_estimators=100)
    # Build the training matrices from the raw frame.
    self.dataset = utils.extract_sklearn_dataset(
        self.conditions, self.targets, df)
    self.categories_to_val_map = utils.build_categorical_to_value_map(
        self.conditions_categorical, self.dataset)
    self.X_categorical = utils.extract_sklearn_features_categorical(
        self.conditions_categorical, self.categories_to_val_map,
        self.dataset)
    self.X_numerical = utils.extract_sklearn_features_numerical(
        self.conditions_numerical, self.dataset)
    self.Y = utils.extract_sklearn_univariate_target(
        self.targets, self.dataset)
    # Fit both forests.
    self._train_rf()
def train(self, df, targets, conditions):
    """Fit the random forest classifier on `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data from which the training set is extracted.
    targets : list of (name, stattype) pairs
        Must contain exactly one column, and it must be CATEGORICAL.
    conditions : list of (name, stattype) pairs
        At least one column; CATEGORICAL conditions are integer-coded
        via a value map, all others are treated as numerical.

    Raises
    ------
    BLE (wrapping ValueError)
        If targets or conditions violate the constraints above.
    """
    # Exactly one target column, and it must be categorical.
    if len(targets) != 1:
        raise BLE(ValueError(
            'RandomForest requires exactly one column in '
            'targets. Received {}'.format(targets)))
    target_name, target_stattype = targets[0][0], targets[0][1]
    if target_stattype.lower() != 'categorical':
        raise BLE(ValueError(
            'RandomForest can only classify CATEGORICAL '
            'columns. Received {}'.format(targets)))
    self.targets = [target_name]
    # At least one condition column is required.
    if len(conditions) < 1:
        raise BLE(ValueError(
            'RandomForest requires at least one column in '
            'conditions. Received {}'.format(conditions)))
    # Partition conditions into categorical vs. everything else,
    # preserving their relative order within each group.
    self.conditions_categorical = []
    self.conditions_numerical = []
    for cond in conditions:
        bucket = (self.conditions_categorical
            if cond[1].lower() == 'categorical'
            else self.conditions_numerical)
        bucket.append(cond[0])
    # Downstream feature extraction depends on numerical-then-
    # categorical ordering.
    self.conditions = (
        self.conditions_numerical + self.conditions_categorical)
    # Initialize state that preprocessing fills in below.
    self.dataset = pd.DataFrame()
    self.categories_to_val_map = dict()
    self.X_numerical = np.ndarray(0)
    self.X_categorical = np.ndarray(0)
    self.Y = np.ndarray(0)
    # Partial forest sees numerical conditions only; full forest sees
    # every condition.
    self.rf_partial = RandomForestClassifier(n_estimators=100)
    self.rf_full = RandomForestClassifier(n_estimators=100)
    # Preprocess the raw frame into training matrices.
    self.dataset = utils.extract_sklearn_dataset(
        self.conditions, self.targets, df)
    self.categories_to_val_map = utils.build_categorical_to_value_map(
        self.conditions_categorical, self.dataset)
    self.X_categorical = utils.extract_sklearn_features_categorical(
        self.conditions_categorical, self.categories_to_val_map,
        self.dataset)
    self.X_numerical = utils.extract_sklearn_features_numerical(
        self.conditions_numerical, self.dataset)
    self.Y = utils.extract_sklearn_univariate_target(
        self.targets, self.dataset)
    # Fit both forests.
    self._train_rf()