def test_extract_sklearn_features_categorical():
    """Check the feature matrix produced for two categorical columns.

    A five-row frame and an explicit category-to-code lookup are handed to
    ``sku.extract_sklearn_features_categorical``; the result is compared
    against a hand-written 5x7 indicator matrix whose first four columns
    correspond to the Nationality codes and last three to the Gender codes.
    """
    frame = pd.DataFrame({
        'Nationality': ['USA', 'USA', 'France', 'Germany', 'Bengal'],
        'Gender': ['M', 'F', 'M', 'M', 'T'],
        # 'C' and 'D' are present in the frame but not requested below.
        'C': ['1', '2', '3', '4', '5'],
        'D': [1, None, 3, 4, 5],
    })

    nationality_codes = {'USA': 0, 'France': 1, 'Germany': 2, 'Bengal': 3}
    gender_codes = {'M': 0, 'F': 1, 'T': 2}
    requested = ['Nationality', 'Gender']
    code_lookup = {
        'Nationality': nationality_codes,
        'Gender': gender_codes,
    }

    observed = sku.extract_sklearn_features_categorical(
        requested, code_lookup, frame)

    wanted = np.asarray([
        [1, 0, 0, 0, 1, 0, 0],  # USA, M
        [1, 0, 0, 0, 0, 1, 0],  # USA, F
        [0, 1, 0, 0, 1, 0, 0],  # France, M
        [0, 0, 1, 0, 1, 0, 0],  # Germany, M
        [0, 0, 0, 1, 0, 0, 1],  # Bengal, T
    ])
    assert np.array_equal(observed, wanted)
def train(self, df, targets, conditions):
    """Prepare the training data and fit the multiple regression.

    Parameters
    ----------
    df
        Raw data handed to ``utils.extract_sklearn_dataset`` (presumably a
        pandas DataFrame -- confirm against callers).
    targets
        Sequence of entries indexed as ``entry[0]`` (column name) and
        ``entry[1]`` (statistical type string). Exactly one entry is
        accepted and its type must be ``'numerical'`` (case-insensitive).
    conditions
        Non-empty sequence of entries with the same layout. Entries whose
        type is ``'categorical'`` (case-insensitive) are treated as
        categorical; every other entry is treated as numerical.

    Raises
    ------
    BLE
        Wrapping a ``ValueError`` when ``targets`` does not hold exactly
        one entry, when the target is not numerical, or when
        ``conditions`` is empty.
    """
    # Obtain the targets column.
    if len(targets) != 1:
        # The check is "exactly one", so the message says so as well
        # (it used to claim "at least one").
        raise BLE(ValueError(
            'MultipleRegression requires exactly one column '
            'in targets. Received {}'.format(targets)))
    if targets[0][1].lower() != 'numerical':
        raise BLE(ValueError(
            'MultipleRegression can only regress NUMERICAL '
            'columns. Received {}'.format(targets)))
    self.targets = [targets[0][0]]

    # Obtain the condition columns.
    if len(conditions) < 1:
        raise BLE(ValueError(
            'MultipleRegression requires at least one '
            'column in conditions. Received {}'.format(conditions)))
    self.conditions_categorical = []
    self.conditions_numerical = []
    for c in conditions:
        if c[1].lower() == 'categorical':
            self.conditions_categorical.append(c[0])
        else:
            self.conditions_numerical.append(c[0])
    # Numerical columns come first, categorical columns last.
    self.conditions = (
        self.conditions_numerical + self.conditions_categorical)

    # Reset all derived state to empty placeholders before preprocessing.
    # The dataset.
    self.dataset = pd.DataFrame()
    # Lookup for categoricals to code.
    self.categories_to_val_map = dict()
    # Training set (regressors and labels).
    self.X_numerical = np.ndarray(0)
    self.X_categorical = np.ndarray(0)
    self.Y = np.ndarray(0)
    # Linear regressors.
    self.mr_partial = LinearRegression()
    self.mr_full = LinearRegression()

    # Preprocess the data.
    self.dataset = utils.extract_sklearn_dataset(
        self.conditions, self.targets, df)
    self.categories_to_val_map = utils.build_categorical_to_value_map(
        self.conditions_categorical, self.dataset)
    self.X_categorical = utils.extract_sklearn_features_categorical(
        self.conditions_categorical, self.categories_to_val_map,
        self.dataset)
    self.X_numerical = utils.extract_sklearn_features_numerical(
        self.conditions_numerical, self.dataset)
    self.Y = utils.extract_sklearn_univariate_target(
        self.targets, self.dataset)

    # Train the multiple regression.
    self._train_mr()
def train(self, df, targets, conditions):
    """Validate the inputs, build the training arrays and fit the model.

    ``targets`` and ``conditions`` are sequences of entries read as
    ``entry[0]`` (column name) and ``entry[1]`` (statistical type string).
    ``df`` is forwarded untouched to ``utils.extract_sklearn_dataset``
    (presumably a pandas DataFrame -- confirm against callers).

    Requirements enforced here:

    * ``targets`` holds exactly one entry, typed ``'numerical'``
      (compared case-insensitively);
    * ``conditions`` holds at least one entry. Entries typed
      ``'categorical'`` are handled as categorical, all others as
      numerical.

    Any violation raises ``BLE`` wrapping a ``ValueError``.
    """
    # Obtain the targets column.
    if len(targets) != 1:
        # Message corrected to match the check: exactly one target is
        # required, not "at least one".
        raise BLE(ValueError(
            'MultipleRegression requires exactly one column '
            'in targets. Received {}'.format(targets)))
    if targets[0][1].lower() != 'numerical':
        raise BLE(ValueError(
            'MultipleRegression can only regress NUMERICAL '
            'columns. Received {}'.format(targets)))
    self.targets = [targets[0][0]]

    # Obtain the condition columns.
    if len(conditions) < 1:
        raise BLE(ValueError(
            'MultipleRegression requires at least one '
            'column in conditions. Received {}'.format(conditions)))
    self.conditions_categorical = []
    self.conditions_numerical = []
    for c in conditions:
        if c[1].lower() == 'categorical':
            self.conditions_categorical.append(c[0])
        else:
            self.conditions_numerical.append(c[0])
    # Combined column order: numerical first, then categorical.
    self.conditions = (
        self.conditions_numerical + self.conditions_categorical)

    # Start from empty placeholders for everything derived from the data.
    # The dataset.
    self.dataset = pd.DataFrame()
    # Lookup for categoricals to code.
    self.categories_to_val_map = dict()
    # Training set (regressors and labels).
    self.X_numerical = np.ndarray(0)
    self.X_categorical = np.ndarray(0)
    self.Y = np.ndarray(0)
    # Linear regressors.
    self.mr_partial = LinearRegression()
    self.mr_full = LinearRegression()

    # Preprocess the data.
    self.dataset = utils.extract_sklearn_dataset(
        self.conditions, self.targets, df)
    self.categories_to_val_map = utils.build_categorical_to_value_map(
        self.conditions_categorical, self.dataset)
    self.X_categorical = utils.extract_sklearn_features_categorical(
        self.conditions_categorical, self.categories_to_val_map,
        self.dataset)
    self.X_numerical = utils.extract_sklearn_features_numerical(
        self.conditions_numerical, self.dataset)
    self.Y = utils.extract_sklearn_univariate_target(
        self.targets, self.dataset)

    # Train the multiple regression.
    self._train_mr()
def train(self, df, targets, conditions):
    """Prepare the training data and fit the random forest.

    ``targets`` and ``conditions`` are sequences of entries read as
    ``entry[0]`` (column name) and ``entry[1]`` (statistical type string).
    Exactly one target is accepted and it must be typed ``'categorical'``
    (case-insensitive); at least one condition is required. Conditions
    typed ``'categorical'`` are handled as categorical, all others as
    numerical. ``df`` is forwarded to ``utils.extract_sklearn_dataset``.

    Raises ``BLE`` wrapping a ``ValueError`` when a requirement is not met.
    """
    # --- Target column -------------------------------------------------
    if len(targets) != 1:
        problem = ValueError(
            'RandomForest requires exactly one column in '
            'targets. Received {}'.format(targets))
        raise BLE(problem)
    sole_target = targets[0]
    if sole_target[1].lower() != 'categorical':
        problem = ValueError(
            'RandomForest can only classify CATEGORICAL '
            'columns. Received {}'.format(targets))
        raise BLE(problem)
    self.targets = [sole_target[0]]

    # --- Condition columns ---------------------------------------------
    if len(conditions) == 0:
        problem = ValueError(
            'RandomForest requires at least one column in '
            'conditions. Received {}'.format(conditions))
        raise BLE(problem)
    self.conditions_categorical = categorical = []
    self.conditions_numerical = numerical = []
    for entry in conditions:
        # Anything not explicitly categorical lands in the numerical list.
        bucket = (
            categorical if entry[1].lower() == 'categorical' else numerical)
        bucket.append(entry[0])
    # Numerical columns first, categorical columns last.
    self.conditions = numerical + categorical

    # --- Empty placeholders --------------------------------------------
    self.dataset = pd.DataFrame()          # the dataset
    self.categories_to_val_map = dict()    # categorical value -> code
    self.X_numerical = np.ndarray(0)       # regressors (numerical)
    self.X_categorical = np.ndarray(0)     # regressors (categorical)
    self.Y = np.ndarray(0)                 # labels
    # Two forests of 100 trees each.
    self.rf_partial = RandomForestClassifier(n_estimators=100)
    self.rf_full = RandomForestClassifier(n_estimators=100)

    # --- Preprocessing -------------------------------------------------
    dataset = utils.extract_sklearn_dataset(
        self.conditions, self.targets, df)
    self.dataset = dataset
    code_map = utils.build_categorical_to_value_map(categorical, dataset)
    self.categories_to_val_map = code_map
    self.X_categorical = utils.extract_sklearn_features_categorical(
        categorical, code_map, dataset)
    self.X_numerical = utils.extract_sklearn_features_numerical(
        numerical, dataset)
    self.Y = utils.extract_sklearn_univariate_target(self.targets, dataset)

    # --- Fit -----------------------------------------------------------
    self._train_rf()
def train(self, df, targets, conditions):
    """Validate the inputs, build the training arrays and fit the forest.

    Parameters
    ----------
    df
        Data forwarded to ``utils.extract_sklearn_dataset``.
    targets
        Sequence with exactly one entry; ``entry[0]`` is the column name
        and ``entry[1]`` must be ``'categorical'`` (case-insensitive).
    conditions
        Non-empty sequence of entries with the same layout. Entries typed
        ``'categorical'`` go to the categorical list, all others to the
        numerical list.

    Raises
    ------
    BLE
        Wrapping a ``ValueError`` for a wrong number of targets, a
        non-categorical target, or empty conditions.
    """
    # Obtain the targets column.
    if len(targets) != 1:
        text = ('RandomForest requires exactly one column in '
                'targets. Received {}').format(targets)
        raise BLE(ValueError(text))
    if targets[0][1].lower() != 'categorical':
        text = ('RandomForest can only classify CATEGORICAL '
                'columns. Received {}').format(targets)
        raise BLE(ValueError(text))
    self.targets = [targets[0][0]]

    # Obtain the condition columns.
    if len(conditions) < 1:
        text = ('RandomForest requires at least one column in '
                'conditions. Received {}').format(conditions)
        raise BLE(ValueError(text))
    self.conditions_categorical = []
    self.conditions_numerical = []
    for spec in conditions:
        if spec[1].lower() != 'categorical':
            self.conditions_numerical.append(spec[0])
        else:
            self.conditions_categorical.append(spec[0])
    self.conditions = (
        self.conditions_numerical + self.conditions_categorical)

    # Reset derived state: dataset, categorical lookup, training arrays.
    self.dataset = pd.DataFrame()
    self.categories_to_val_map = dict()
    self.X_numerical = np.ndarray(0)
    self.X_categorical = np.ndarray(0)
    self.Y = np.ndarray(0)

    # Random Forests.
    self.rf_partial = RandomForestClassifier(n_estimators=100)
    self.rf_full = RandomForestClassifier(n_estimators=100)

    # Preprocess the data.
    self.dataset = utils.extract_sklearn_dataset(
        self.conditions,
        self.targets,
        df,
    )
    self.categories_to_val_map = utils.build_categorical_to_value_map(
        self.conditions_categorical,
        self.dataset,
    )
    self.X_categorical = utils.extract_sklearn_features_categorical(
        self.conditions_categorical,
        self.categories_to_val_map,
        self.dataset,
    )
    self.X_numerical = utils.extract_sklearn_features_numerical(
        self.conditions_numerical,
        self.dataset,
    )
    self.Y = utils.extract_sklearn_univariate_target(
        self.targets,
        self.dataset,
    )

    # Train the random forest.
    self._train_rf()
def test_extract_sklearn_features_categorical():
    """Compare the categorical feature matrix with a hand-built expectation.

    The expected matrix is assembled from two per-column indicator blocks
    (four Nationality codes, three Gender codes) joined row by row, and
    must equal what ``sku.extract_sklearn_features_categorical`` returns
    for the frame and lookup defined here.
    """
    data = pd.DataFrame({
        'Nationality': ['USA', 'USA', 'France', 'Germany', 'Bengal'],
        'Gender': ['M', 'F', 'M', 'M', 'T'],
        'C': ['1', '2', '3', '4', '5'],
        'D': [1, None, 3, 4, 5],
    })
    columns = ['Nationality', 'Gender']
    value_codes = {
        'Nationality': {'USA': 0, 'France': 1, 'Germany': 2, 'Bengal': 3},
        'Gender': {'M': 0, 'F': 1, 'T': 2},
    }

    result = sku.extract_sklearn_features_categorical(
        columns, value_codes, data)

    # One indicator row per data row, split by source column.
    nationality_block = [
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 1],
    ]
    gender_block = [
        [1, 0, 0],
        [0, 1, 0],
        [1, 0, 0],
        [1, 0, 0],
        [0, 0, 1],
    ]
    reference = np.asarray([
        left + right
        for left, right in zip(nationality_block, gender_block)
    ])
    assert np.array_equal(result, reference)