def _oversample(X, y, method='SMOTE', strat='not majority'):
    """Oversample (X, y) with the requested imbalanced-learn strategy.

    Parameters
    ----------
    X, y : feature matrix and label vector accepted by imblearn samplers.
    method : str
        One of 'ADASYN', 'SMOTE', 'SMOTENC', 'BORDERSMOTE', 'SVMSMOTE',
        'KMEANSSMOTE', 'RNDM'. If the rarest class has <= 5 samples the
        method is forced to 'RNDM', since the SMOTE family needs enough
        same-class neighbors to interpolate between.
    strat : str
        imblearn ``sampling_strategy`` (default 'not majority').

    Returns
    -------
    X_resampled, y_resampled : the oversampled features and labels.

    Raises
    ------
    ValueError
        If ``method`` is not one of the recognized strategy names.
    """
    # Size of the rarest class. Counter counts every label in one pass
    # instead of re-running y.tolist().count() once per distinct label;
    # default=len(y) preserves the original behavior for an empty y.
    min_samples = min(Counter(y).values(), default=len(y))

    # SMOTE-style interpolation is unreliable for tiny classes, so fall
    # back to plain random oversampling.
    if min_samples <= 5:
        method = 'RNDM'

    samplers = {
        'ADASYN': imbover.ADASYN,
        'SMOTE': imbover.SMOTE,
        'SMOTENC': imbover.SMOTENC,
        'BORDERSMOTE': imbover.BorderlineSMOTE,
        'SVMSMOTE': imbover.SVMSMOTE,
        'KMEANSSMOTE': imbover.KMeansSMOTE,
        'RNDM': imbover.RandomOverSampler,
    }
    try:
        sampler_cls = samplers[method]
    except KeyError:
        # The original if/elif chain left `ios` unbound for unknown names,
        # which surfaced as an opaque UnboundLocalError; fail loudly instead.
        raise ValueError(f"Unknown oversampling method: {method!r}")

    ios = sampler_cls(sampling_strategy=strat, random_state=42)
    X_resampled, y_resampled = ios.fit_resample(X, y)
    return X_resampled, y_resampled
def oversample_smote(training_features, training_labels, is_dataframe=True):
    '''
    Convenience function for oversampling with SMOTE. This generates
    synthetic samples via interpolation. Automatically encodes categorical
    columns if a dataframe is provided with categorical columns properly
    marked (i.e. given the "category" dtype).

    Input: The training features and labels. is_dataframe is for checking
        for categorical columns.
    Output: The oversampled training features and labels
    '''
    from imblearn import over_sampling

    if is_dataframe:
        # Testing if there are any categorical columns
        # Note: These must have the "category" datatype
        categorical_variable_list = training_features.select_dtypes(
            exclude=['number', 'bool_', 'object_']).columns
        if categorical_variable_list.shape[0] > 0:
            categorical_variable_list = list(categorical_variable_list)
            categorical_variable_indexes = training_features.columns.get_indexer(
                categorical_variable_list)
            # SMOTENC interpolates numeric columns but resamples the
            # (positionally indexed) categorical ones.
            smote = over_sampling.SMOTENC(
                categorical_features=categorical_variable_indexes,
                random_state=46,
                n_jobs=-1)
        else:
            smote = over_sampling.SMOTE(random_state=46, n_jobs=-1)
    else:
        smote = over_sampling.SMOTE(random_state=46, n_jobs=-1)

    # Performing oversampling.
    # NOTE: fit_sample() was deprecated in imbalanced-learn 0.4 and removed
    # in 0.8; fit_resample() is the supported, behaviorally identical API.
    training_features_oversampled, training_labels_oversampled = smote.fit_resample(
        training_features, training_labels)

    # Rounding discrete variables for appropriate cutoffs
    # This is because SMOTE-NC only deals with binary categorical variables,
    # not discrete variables
    if is_dataframe:
        discrete_variable_list = training_features.select_dtypes(
            include=['int', 'int32', 'int64']).columns
        if discrete_variable_list.shape[0] > 0:
            discrete_variable_indexes = training_features.columns.get_indexer(
                discrete_variable_list)
            # Recent imbalanced-learn returns a DataFrame when given one;
            # older versions returned a numpy array. Handle both so the
            # positional column rounding below never raises.
            is_df_out = hasattr(training_features_oversampled, 'iloc')
            for discrete_variable_index in discrete_variable_indexes:
                if is_df_out:
                    col = training_features_oversampled.iloc[:, discrete_variable_index]
                    training_features_oversampled.iloc[:, discrete_variable_index] = \
                        np.round(col.astype(float)).astype(int)
                else:
                    training_features_oversampled[:, discrete_variable_index] = np.round(
                        training_features_oversampled[:, discrete_variable_index]
                        .astype(float)).astype(int)

    print('Previous training size:', len(training_labels))
    print('Oversampled training size', len(training_labels_oversampled), '\n')
    print('Previous label mean:', training_labels.astype(int).mean())
    print('Oversampled label mean:', training_labels_oversampled.mean())

    return training_features_oversampled, training_labels_oversampled
def feature_extraction(dataset, onehot_option=False, smote_option=True,
                       y_stratify=False, seed=0, as_category=True):
    '''
    Loads the American Housing Survey 2017 dataset of interest and exports
    Pandas training, validation, and test data

    Inputs:
        onehot_option: False = label encode features, True = one-hot encode features
        smote_option: False = don't use SMOTE, True = use SMOTE
        dataset: 0 = SF data only, 1 = SF + LA data, 2 = SF + SJ data, 3 = All of CA
        y_stratify: if True, stratify both splits on the labels (using `seed`)
        seed: random_state used for the stratified splits and SMOTE
        as_category: if True, label-encoded columns get the "category" dtype

    Outputs:
        X: Dataframe of input features
        y: Dataframe of target variable
        X_encode: Input variable encoding (1 = categorical, 0 = continuous)
        X_train, X_val, X_test: Train, validation and test input dataframes
        y_train, y_val, y_test: Train, validation and test target variable dataframes
        n: counts of the valid DPEVLOC responses 1-3 before filtering
    '''
    # Read dataset based on input
    if dataset == 0:
        df = pd.read_csv('SF_41860_Flat.csv', index_col=0)
    elif dataset == 1:
        df = pd.read_csv('CA_41860_31080_Flat.csv', index_col=0)
    elif dataset == 2:
        df1 = pd.read_csv('SF_41860_Flat.csv', index_col=0)
        df2 = pd.read_csv('SJ_41940_Flat.csv', index_col=0)
        df = pd.concat([df1, df2], ignore_index=True)
    elif dataset == 3:
        df2 = pd.read_csv('SJ_41940_Flat.csv', index_col=0)
        df3 = pd.read_csv('CA_41860_31080_Flat.csv', index_col=0)
        df = pd.concat([df2, df3], ignore_index=True)

    #%% Variable Lists
    # Data/Variable Types
    # Categorial == 1
    # Continuous == 0

    # Topic: Admin -- a OMB13CBSA = '41860'
    vars_admin = ['INTSTATUS', 'SPLITSAMP']
    type_admin = [1, 1]

    # Topic: Occupancy and Tenure
    vars_occ = [
        'TENURE', 'CONDO', 'HOA', 'OWNLOT', 'MGRONSITE', 'VACRESDAYS',
        'VACRNTDAYS'
    ]
    type_occ = [1, 1, 1, 1, 1, 1, 1]

    # Topic: Structural
    vars_struct = [
        'BLD', 'YRBUILT', 'GUTREHB', 'GARAGE', 'WINBARS', 'MHWIDE',
        'UNITSIZE', 'TOTROOMS', 'KITEXCLU', 'BATHEXCLU'
    ]
    type_struct = [1, 0, 1, 1, 1, 1, 1, 0, 1, 1]

    # Topic: Equipment and Appliances
    vars_equip = []

    # Topic: Housing Problems
    vars_probs = []

    # Topic: Demographics
    vars_demo = [
        'HSHLDTYPE', 'SAMEHHLD', 'NUMPEOPLE', 'NUMADULTS', 'NUMELDERS',
        'NUMYNGKIDS', 'NUMOLDKIDS', 'NUMVETS', 'MILHH', 'NUMNONREL',
        'PARTNER', 'MULTIGEN', 'GRANDHH', 'NUMSUBFAM', 'NUMSECFAM', 'DISHH',
        'HHSEX', 'HHAGE', 'HHMAR', 'HHRACE', 'HHRACEAS', 'HHRACEPI',
        'HHSPAN', 'HHCITSHP', 'HHNATVTY', 'HHINUSYR', 'HHMOVE', 'HHGRAD',
        'HHENROLL', 'HHYNGKIDS', 'HHOLDKIDS', 'HHADLTKIDS', 'HHHEAR',
        'HHSEE', 'HHMEMRY', 'HHWALK', 'HHCARE', 'HHERRND'
    ]
    type_demo = [
        1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1,
        1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1
    ]

    # Topic: Income
    vars_income = ['HINCP', 'FINCP', 'FS']
    type_income = [0, 0, 1]

    # Topic: Housing Costs
    vars_costs = [
        'MORTAMT', 'RENT', 'UTILAMT', 'PROTAXAMT', 'INSURAMT', 'HOAAMT',
        'LOTAMT', 'TOTHCAMT', 'HUDSUB', 'RENTCNTRL', 'FIRSTHOME',
        'MARKETVAL', 'TOTBALAMT'
    ]
    type_costs = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0]

    # Topic: Mortgage Details
    vars_mort = []

    # Topic: Home Improvement
    vars_improv = []

    # Topic: Neighborhood Features
    vars_neigh = [
        'SUBDIV', 'NEARBARCL', 'NEARABAND', 'NEARTRASH', 'RATINGHS',
        'RATINGNH', 'NHQSCHOOL', 'NHQPCRIME', 'NHQSCRIME', 'NHQPUBTRN',
        'NHQRISK'
    ]
    type_neigh = [1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1]

    # Topic: Recent Movers
    vars_move = [
        'MOVFORCE', 'MOVWHY', 'RMJOB', 'RMOWNHH', 'RMFAMILY', 'RMCHANGE',
        'RMCOMMUTE', 'RMHOME', 'RMCOSTS', 'RMHOOD', 'RMOTHER'
    ]
    type_move = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

    # Topic: Delinquency
    vars_del = []

    # Topic: Disaster Planning
    vars_dis = [
        'DPGENERT', 'DPSHELTR', 'DPDRFOOD', 'DPEMWATER', 'DPEVSEP',
        'DPEVLOC', 'DPALTCOM', 'DPGETINFO', 'DPEVVEHIC', 'DPEVKIT',
        'DPEVINFO', 'DPEVFIN', 'DPEVACPETS', 'DPFLDINS', 'DPMAJDIS'
    ]
    type_dis = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

    # Topic: Commuting
    vars_comm = []

    # Topic: Eviction
    vars_evict = []

    # Combine variables in a list
    x_vars = np.asarray([
        var for var_list in [
            vars_admin, vars_occ, vars_struct, vars_equip, vars_probs,
            vars_demo, vars_income, vars_costs, vars_mort, vars_improv,
            vars_neigh, vars_move, vars_del, vars_dis, vars_comm, vars_evict
        ] for var in var_list
    ])
    # Parallel categorical/continuous flags (empty topics contribute nothing,
    # so only the non-empty type lists are concatenated here).
    x_vars_encode = np.asarray([
        code for code_list in [
            type_admin, type_occ, type_struct, type_demo, type_income,
            type_costs, type_neigh, type_move, type_dis
        ] for code in code_list
    ])

    #%% Data Cleaning and Filtering
    # Count the number of responses of each kind to DPEVLOC (1-5, -6 or -9)
    # M or -9: Not reported
    # N or -6: Not applicable
    # Number of valid features
    n = Counter(df['DPEVLOC'])
    n = np.asarray([n[f"'{i}'"] for i in range(1, 4)])

    # Filter by valid output only (rows) -- keep just responses 1, 2, and 3
    df = df.loc[df['DPEVLOC'].isin(["'{}'".format(i) for i in range(1, 4)])]

    # Filter by proportion of NA values (cols).
    # BUGFIX: the original used `sum(list(a) or list(b))`, and since a
    # non-empty list is always truthy the "'-9'" comparisons were never
    # counted; the element-wise OR below counts both missing codes.
    props_NA = [
        ((df[var] == "'-6'") | (df[var] == "'-9'")).sum() / len(df[var])
        for var in x_vars
    ]
    vars_remove = [x_vars[i] for i, prop in enumerate(props_NA) if prop > 0.25]

    # Exclude certain variables by choice
    vars_remove.extend(
        ['MORTAMT', 'RENT', 'PROTAXAMT', 'HOAAMT', 'LOTAMT', 'TOTBALAMT'])

    # Remove disaster preparedness variables
    vars_remove.extend(vars_dis)

    # Remove variables that are constant for all observations
    # (i.e. only 1 unique value)
    vars_remove.extend(
        [var for var in x_vars if len(np.unique(df[var])) == 1])

    # Create a binary list of valid id's and a list of valid variables
    idx = [var not in vars_remove for var in x_vars]
    valid_vars = x_vars[idx]

    # Filter inputs (X), outputs (y), and input variable encoding (X_encode)
    X = df[valid_vars]
    X_encode = x_vars_encode[idx]
    y = df['DPEVLOC']

    #%% Split into train/dev/test set
    # Train-val-test ratio = 0.6-0.2-0.2
    if y_stratify:
        X_train, X_test, y_train, y_test = model_selection.train_test_split(
            X, y, test_size=0.2, stratify=y, random_state=seed)
        X_train, X_val, y_train, y_val = model_selection.train_test_split(
            X_train,
            y_train,
            test_size=0.25,
            stratify=y_train,
            random_state=seed)
    else:
        X_train, X_test, y_train, y_test = model_selection.train_test_split(
            X, y, test_size=0.2, random_state=0)
        X_train, X_val, y_train, y_val = model_selection.train_test_split(
            X_train, y_train, test_size=0.25, random_state=0)

    if smote_option:
        # Use SMOTE for continuous features to oversample the non-majority
        # classes; the boolean mask marks which columns are categorical.
        smote = os.SMOTENC(categorical_features=X_encode.astype('bool'),
                           sampling_strategy='not majority',
                           random_state=seed)
        # smote = os.SMOTE(sampling_strategy='not majority')
        # BUGFIX: fit_sample() was removed in imbalanced-learn 0.8;
        # fit_resample() is the supported, behaviorally identical API.
        X_train, y_train = smote.fit_resample(X_train, y_train)

    # Concatenate all three sets into master X and y dataframes
    X = pd.concat([X_train, X_val, X_test], ignore_index=True)
    y = pd.concat([y_train, y_val, y_test], ignore_index=True)

    # Indices to separate out the three sets
    train_sep = X_train.shape[0]
    val_sep = train_sep + X_val.shape[0]
    test_sep = val_sep + X_test.shape[0]

    #%% Encode input and output variables as categorical
    X = X.copy()

    # Transform output variable with the LabelEncoder
    le = preprocessing.LabelEncoder()
    le.fit(np.unique(y))
    y = le.transform(y)

    X_remove = []
    # Loop through each input variable and encode categorical ones
    # (i.e. X_encode == 1)
    for i, val in enumerate(X_encode):
        col = valid_vars[i]
        Xi = X.loc[:, col]
        if val == 1:
            if onehot_option:
                # Encode categorical variables as One Hot encoder
                OneHot = pd.get_dummies(Xi, prefix=col)
                # Only one-hot columns with a useful number of levels;
                # otherwise fall back to label encoding below.
                if 2 < OneHot.shape[1] <= 20:
                    X = pd.concat([X, OneHot], axis=1)
                    X_encode = np.append(X_encode,
                                         np.repeat(0, OneHot.shape[1]))
                    X = X.drop(col, axis=1)
                    X_remove.append(i)
                else:
                    Xi = X.loc[:, col]
                    le.fit(np.unique(Xi))
                    X.loc[:, col] = le.transform(Xi)
                    if as_category:
                        X[col] = X[col].astype('category')
            else:
                # Encode categorical variables as Label encoder
                Xi = X.loc[:, col]
                le.fit(np.unique(Xi))
                X.loc[:, col] = le.transform(Xi)
                if as_category:
                    X[col] = X[col].astype('category')
                # X = X.drop(col, axis=1)

        # **Optional**
        # Encoding of missing values in non-categorical variables
        # If a missing value is present in the variable (i.e. -6 or -9),
        # a separate index variable is created to represent missing values,
        # while -6 and -9 are replaced with 0 in the continuous variable.
        elif val == 0:
            if any(Xi < 0):
                le.fit([0, 1])
                # Renamed comprehension variable to avoid shadowing the
                # enclosing loop index `i`.
                X[col + '_MISSING'] = le.transform([v < 0 for v in X[col]])
                X[col] = X[col].clip(lower=0)
                valid_vars = np.append(valid_vars, [col + '_MISSING'])
                X_encode = np.append(X_encode, val)

    # Drop encoding flags for columns that were replaced by one-hot dummies
    X_encode = np.asarray(
        [code for j, code in enumerate(X_encode) if j not in X_remove])

    # After encoding, split back into train, val, and test sets
    X_train, y_train = X[0:train_sep], y[0:train_sep]
    X_val, y_val = X[train_sep:val_sep], y[train_sep:val_sep]
    X_test, y_test = X[val_sep:test_sep], y[val_sep:test_sep]

    return X, y, X_encode, X_train, y_train, X_val, y_val, X_test, y_test, n
valid_vars = x_vars[idx] # Filter inputs (X), outputs (y), and input variable encoding (X_encode) X = df[valid_vars] X_encode = x_vars_encode[idx] y = df['DPEVLOC'] #%% Split into train/dev/test set # Train-val-test ratio = 0.6-0.2-0.2 X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=0) X_train, X_val, y_train, y_val = model_selection.train_test_split(X_train, y_train, test_size=0.25, random_state=0) # Use SMOTE for continuous features to oversample the non-majority classes smote = os.SMOTENC(categorical_features = X_encode.astype('bool'), sampling_strategy='not majority', random_state=0) # smote = os.SMOTE(sampling_strategy='not majority') X_train, y_train = smote.fit_sample(X_train, y_train) # Concatenate all three sets into master X and y dataframes X = pd.concat([X_train, X_val, X_test], ignore_index= True) y = pd.concat([y_train, y_val, y_test], ignore_index= True) # Indices to separate out the three sets train_sep = X_train.shape[0] val_sep = train_sep + X_val.shape[0] test_sep = val_sep + X_test.shape[0] #%% Encode input and output variables as categorical