def train(self, df, targets, conditions):
    """Validate the column specification, preprocess `df`, and fit the model.

    Args:
        df: Source data, passed unchanged to
            `utils.extract_sklearn_dataset` (presumably a pandas
            DataFrame -- confirm against callers).
        targets: Sequence holding exactly one `(name, type)` pair. The
            type string must equal 'numerical', ignoring case.
        conditions: Non-empty sequence of `(name, type)` pairs. A type
            equal to 'categorical' (ignoring case) marks a categorical
            regressor; every other type is treated as numerical.

    Raises:
        BLE: Built from a ValueError when `targets` does not hold exactly
            one column, when the target type is not 'numerical', or when
            `conditions` is empty.
    """
    # Obtain the targets column.
    if len(targets) != 1:
        # The check demands exactly one target, so the message says so
        # (it used to claim "at least one").
        raise BLE(ValueError(
            'MultipleRegression requires exactly one column '
            'in targets. Received {}'.format(targets)))
    if targets[0][1].lower() != 'numerical':
        raise BLE(ValueError(
            'MultipleRegression can only regress NUMERICAL '
            'columns. Received {}'.format(targets)))
    self.targets = [targets[0][0]]

    # Obtain the condition columns.
    if len(conditions) < 1:
        raise BLE(ValueError(
            'MultipleRegression requires at least one '
            'column in conditions. Received {}'.format(conditions)))
    self.conditions_categorical = []
    self.conditions_numerical = []
    for c in conditions:
        if c[1].lower() == 'categorical':
            self.conditions_categorical.append(c[0])
        else:
            self.conditions_numerical.append(c[0])
    # Numerical columns come first, then categorical ones.
    self.conditions = self.conditions_numerical + self.conditions_categorical

    # Empty placeholders; each one is overwritten by the preprocessing
    # below, and stays in place if a preprocessing step raises first.
    # The dataset.
    self.dataset = pd.DataFrame()
    # Lookup for categoricals to code.
    self.categories_to_val_map = dict()
    # Training set (regressors and labels)
    self.X_numerical = np.ndarray(0)
    self.X_categorical = np.ndarray(0)
    self.Y = np.ndarray(0)
    # Linear regressors.
    self.mr_partial = LinearRegression()
    self.mr_full = LinearRegression()

    # Preprocess the data.
    self.dataset = utils.extract_sklearn_dataset(
        self.conditions, self.targets, df)
    self.categories_to_val_map = utils.build_categorical_to_value_map(
        self.conditions_categorical, self.dataset)
    self.X_categorical = utils.extract_sklearn_features_categorical(
        self.conditions_categorical, self.categories_to_val_map,
        self.dataset)
    self.X_numerical = utils.extract_sklearn_features_numerical(
        self.conditions_numerical, self.dataset)
    self.Y = utils.extract_sklearn_univariate_target(
        self.targets, self.dataset)

    # Train the multiple regression.
    self._train_mr()
def train(self, df, targets, conditions):
    """Check the requested columns, build the training arrays, and fit.

    Args:
        df: Raw data forwarded as-is to `utils.extract_sklearn_dataset`
            (NOTE(review): looks like a pandas DataFrame -- confirm).
        targets: Sequence with exactly one `(name, type)` entry whose type
            string, lower-cased, must be 'numerical'.
        conditions: Sequence with one or more `(name, type)` entries.
            Entries whose lower-cased type is 'categorical' are collected
            as categorical; all remaining entries count as numerical.

    Raises:
        BLE: Constructed around a ValueError if the number of targets is
            not one, if the target type is not 'numerical', or if no
            conditions are given.
    """
    # Obtain the targets column.
    if len(targets) != 1:
        # Message fixed to read "exactly one": that is what the guard
        # enforces, whereas the old text said "at least one".
        raise BLE(ValueError(
            'MultipleRegression requires exactly one column '
            'in targets. Received {}'.format(targets)))
    if targets[0][1].lower() != 'numerical':
        raise BLE(ValueError(
            'MultipleRegression can only regress NUMERICAL '
            'columns. Received {}'.format(targets)))
    self.targets = [targets[0][0]]

    # Obtain the condition columns.
    if len(conditions) < 1:
        raise BLE(ValueError(
            'MultipleRegression requires at least one '
            'column in conditions. Received {}'.format(conditions)))
    self.conditions_categorical = []
    self.conditions_numerical = []
    for c in conditions:
        # Anything not explicitly 'categorical' is handled as numerical.
        if c[1].lower() == 'categorical':
            self.conditions_categorical.append(c[0])
        else:
            self.conditions_numerical.append(c[0])
    # Combined list keeps numerical names ahead of categorical names.
    self.conditions = self.conditions_numerical + self.conditions_categorical

    # The attributes below start out empty and are replaced further down.
    # The dataset.
    self.dataset = pd.DataFrame()
    # Lookup for categoricals to code.
    self.categories_to_val_map = dict()
    # Training set (regressors and labels)
    self.X_numerical = np.ndarray(0)
    self.X_categorical = np.ndarray(0)
    self.Y = np.ndarray(0)
    # Linear regressors.
    self.mr_partial = LinearRegression()
    self.mr_full = LinearRegression()

    # Preprocess the data.
    self.dataset = utils.extract_sklearn_dataset(
        self.conditions, self.targets, df)
    self.categories_to_val_map = utils.build_categorical_to_value_map(
        self.conditions_categorical, self.dataset)
    self.X_categorical = utils.extract_sklearn_features_categorical(
        self.conditions_categorical, self.categories_to_val_map,
        self.dataset)
    self.X_numerical = utils.extract_sklearn_features_numerical(
        self.conditions_numerical, self.dataset)
    self.Y = utils.extract_sklearn_univariate_target(
        self.targets, self.dataset)

    # Train the multiple regression.
    self._train_mr()
def train(self, df, targets, conditions):
    """Validate the column specification, preprocess `df`, and fit the forest.

    Args:
        df: Source data, passed unchanged to
            `utils.extract_sklearn_dataset` (presumably a pandas
            DataFrame -- confirm against callers).
        targets: Sequence holding exactly one `(name, type)` pair. The
            type string must equal 'categorical', ignoring case.
        conditions: Non-empty sequence of `(name, type)` pairs. A type
            equal to 'categorical' (ignoring case) marks a categorical
            feature; every other type is treated as numerical.

    Raises:
        BLE: Built from a ValueError when `targets` does not hold exactly
            one column, when the target type is not 'categorical', or
            when `conditions` is empty.
    """
    # --- Target column: exactly one, and it must be categorical. ---
    if len(targets) != 1:
        message = ('RandomForest requires exactly one column in '
                   'targets. Received {}').format(targets)
        raise BLE(ValueError(message))
    if targets[0][1].lower() != 'categorical':
        message = ('RandomForest can only classify CATEGORICAL '
                   'columns. Received {}').format(targets)
        raise BLE(ValueError(message))
    self.targets = [targets[0][0]]

    # --- Condition columns: at least one, split by declared type. ---
    if len(conditions) == 0:
        message = ('RandomForest requires at least one column in '
                   'conditions. Received {}').format(conditions)
        raise BLE(ValueError(message))
    self.conditions_categorical = []
    self.conditions_numerical = []
    for column in conditions:
        is_categorical = column[1].lower() == 'categorical'
        bucket = (self.conditions_categorical if is_categorical
                  else self.conditions_numerical)
        bucket.append(column[0])
    # Numerical names first, categorical names after.
    self.conditions = self.conditions_numerical + self.conditions_categorical

    # --- Empty placeholders, all replaced by the preprocessing below. ---
    self.dataset = pd.DataFrame()
    # Maps categorical values to their codes.
    self.categories_to_val_map = {}
    # Features and labels used for training.
    self.X_numerical = np.ndarray(0)
    self.X_categorical = np.ndarray(0)
    self.Y = np.ndarray(0)
    # Two separate classifiers, each with 100 estimators.
    self.rf_partial = RandomForestClassifier(n_estimators=100)
    self.rf_full = RandomForestClassifier(n_estimators=100)

    # --- Preprocessing. ---
    self.dataset = utils.extract_sklearn_dataset(
        self.conditions, self.targets, df)
    self.categories_to_val_map = utils.build_categorical_to_value_map(
        self.conditions_categorical, self.dataset)
    self.X_categorical = utils.extract_sklearn_features_categorical(
        self.conditions_categorical,
        self.categories_to_val_map,
        self.dataset)
    self.X_numerical = utils.extract_sklearn_features_numerical(
        self.conditions_numerical, self.dataset)
    self.Y = utils.extract_sklearn_univariate_target(
        self.targets, self.dataset)

    # --- Fit. ---
    self._train_rf()
def train(self, df, targets, conditions):
    """Check the requested columns, build the training arrays, and fit.

    Args:
        df: Raw data forwarded as-is to `utils.extract_sklearn_dataset`
            (NOTE(review): looks like a pandas DataFrame -- confirm).
        targets: Sequence with exactly one `(name, type)` entry whose type
            string, lower-cased, must be 'categorical'.
        conditions: Sequence with one or more `(name, type)` entries.
            Entries whose lower-cased type is 'categorical' are collected
            as categorical; all remaining entries count as numerical.

    Raises:
        BLE: Constructed around a ValueError if the number of targets is
            not one, if the target type is not 'categorical', or if no
            conditions are given.
    """
    def reject(template, received):
        # Single place that formats the message and raises the error.
        raise BLE(ValueError(template.format(received)))

    # Target: one column only, and it has to be categorical.
    if len(targets) != 1:
        reject('RandomForest requires exactly one column in '
               'targets. Received {}', targets)
    if targets[0][1].lower() != 'categorical':
        reject('RandomForest can only classify CATEGORICAL '
               'columns. Received {}', targets)
    self.targets = [targets[0][0]]

    # Conditions: need one or more; sort each name into its type list.
    if len(conditions) < 1:
        reject('RandomForest requires at least one column in '
               'conditions. Received {}', conditions)
    self.conditions_categorical = []
    self.conditions_numerical = []
    for entry in conditions:
        if entry[1].lower() != 'categorical':
            self.conditions_numerical.append(entry[0])
        else:
            self.conditions_categorical.append(entry[0])
    self.conditions = (
        self.conditions_numerical + self.conditions_categorical)

    # Start every piece of fitted state from an empty value.
    self.dataset = pd.DataFrame()           # The dataset.
    self.categories_to_val_map = dict()     # Categorical value -> code.
    self.X_numerical = np.ndarray(0)        # Numerical features.
    self.X_categorical = np.ndarray(0)      # Categorical features.
    self.Y = np.ndarray(0)                  # Labels.
    self.rf_partial = RandomForestClassifier(n_estimators=100)
    self.rf_full = RandomForestClassifier(n_estimators=100)

    # Preprocess the data; each step replaces one placeholder above.
    self.dataset = utils.extract_sklearn_dataset(
        self.conditions, self.targets, df)
    self.categories_to_val_map = utils.build_categorical_to_value_map(
        self.conditions_categorical, self.dataset)
    self.X_categorical = utils.extract_sklearn_features_categorical(
        self.conditions_categorical, self.categories_to_val_map,
        self.dataset)
    self.X_numerical = utils.extract_sklearn_features_numerical(
        self.conditions_numerical, self.dataset)
    self.Y = utils.extract_sklearn_univariate_target(
        self.targets, self.dataset)

    # Train the random forest.
    self._train_rf()
def test_build_categorical_to_value_map():
    """Check the code maps from `sku.build_categorical_to_value_map`.

    Builds a small frame with four columns, asks for maps on two of them,
    and asserts that exactly those two columns get a map, that each map
    has a key for every unique value in its column, and that the codes
    within a map are all distinct.
    """
    dataset = pd.DataFrame({
        'Nationality': ['USA', 'USA', 'France', 'Germany', 'Bengal'],
        'Gender': ['M', 'F', 'M', 'M', 'T'],
        'C': ['1', '2', '3', '4', '5'],
        'D': [1, None, 3, 4, 5],
    })
    columns = ['Nationality', 'Gender']
    categories_to_val_map = sku.build_categorical_to_value_map(
        columns, dataset)
    # Assert all the 'columns' have a codemap.
    assert set(categories_to_val_map.keys()) == set(columns)
    # `.items()` rather than `.iteritems()`: the latter does not exist on
    # Python 3 dicts, while `.items()` works on Python 2 and 3 alike.
    for col, valmap in categories_to_val_map.items():
        # Assert each unique val in the column has codes for all its values.
        unique_vals = set(dataset[col].unique())
        assert unique_vals == set(valmap.keys())
        # Assert that all codes are unique.
        distinct_codes = set(code for _, code in valmap.items())
        assert len(distinct_codes) == len(unique_vals)
def test_build_categorical_to_value_map():
    """Verify `sku.build_categorical_to_value_map` on a tiny frame.

    Only 'Nationality' and 'Gender' are requested; columns 'C' and 'D'
    are present in the frame but must not appear in the result. For each
    requested column the map's keys must equal the column's unique values
    and the assigned codes must not repeat.
    """
    dataset = pd.DataFrame({
        'Nationality': ['USA', 'USA', 'France', 'Germany', 'Bengal'],
        'Gender': ['M', 'F', 'M', 'M', 'T'],
        'C': ['1', '2', '3', '4', '5'],
        'D': [1, None, 3, 4, 5],
    })
    columns = ['Nationality', 'Gender']
    categories_to_val_map = sku.build_categorical_to_value_map(
        columns, dataset)

    # Assert all the 'columns' have a codemap.
    assert set(categories_to_val_map.keys()) == set(columns)

    # Iterate with `.items()`; `.iteritems()` raises AttributeError on
    # Python 3 dicts, whereas `.items()` is available on Python 2 as well.
    for col, valmap in categories_to_val_map.items():
        # Assert each unique val in the column has codes for all its values.
        unique_vals = set(dataset[col].unique())
        assert unique_vals == set(valmap.keys())
        # Assert that all codes are unique.
        assert len(set(code for _, code in valmap.items())) == \
            len(unique_vals)