def split_dfo(dfo, train_pct=.7, randomer=None, stratify=None, drop_cols=None, splain=local_settings.splain, **kwargs):
    '''
    split_dfo(dfo, train_pct=.7, randomer=None, stratify=None, drop_cols=None,
              splain=local_settings.splain, **kwargs)
    RETURNS: the same dfo object with split settings and train/test data attached

    Records the split settings on dfo (randomer, stratify, train_pct,
    drop_cols), builds a DataFrame from dfo.df, passes it through
    remove_cols() with drop_cols, and splits it with split_my_data_whole().

    Attributes set on dfo:
        randomer, stratify, train_pct, drop_cols
        train, test              results of split_my_data_whole()
        train_index, test_index  the .index of each result

    stratify falls back to dfo.y_column when it is None.

    NOTE(review): train_pct is stored on dfo but is not passed on to
    split_my_data_whole() here -- confirm whether the split size is meant to
    be controlled by it. **kwargs is accepted but not used.
    '''
    # Default the stratification column to the target column.
    if stratify is None:
        stratify = dfo.y_column

    dfo.randomer = randomer
    dfo.stratify = stratify
    dfo.train_pct = train_pct
    dfo.drop_cols = drop_cols

    working_df = remove_cols(df=pd.DataFrame(dfo.df), cols=drop_cols)

    train_part, test_part = split_my_data_whole(
        df=working_df,
        target_column=dfo.y_column,
        stratify=dfo.stratify,
        random_state=dfo.randomer,
    )
    dfo.train = train_part
    dfo.test = test_part

    dfo.train_index = dfo.train.index
    frame_splain(dfo.train, 'DFO Train', splain=splain)

    dfo.test_index = dfo.test.index
    frame_splain(dfo.test, 'DFO Test', splain=splain)

    return dfo
def df_join_xy(X, y):
    '''
    df_join_xy(X, y)
    RETURNS: the result of X.join(y)

    Puts a feature frame and a target frame back together, so any pairing
    of X and y (train or test, scaled or unscaled) can be recombined.
    The joined result is passed to frame_splain() with the title 'join df'
    before being returned.
    '''
    combined = X.join(y)
    frame_splain(combined, 'join df')
    return combined
def rename_fields(dataframe):
    '''
    rename_fields(dataframe)
    RETURNS: renamed_df

    Renames the columns of dataframe using the module-level _global_renames
    (old name -> new name). Only entries whose old name is actually a column
    of dataframe are applied. The input dataframe is not modified;
    DataFrame.rename() returns a new frame, which is passed to
    frame_splain() with the title 'renamed df' and then returned.
    '''
    columns = dataframe.columns.tolist()

    # Iterating a dict directly yields keys only, so unpacking "old, new"
    # from it fails (or silently splits 2-character keys). Use .items() for
    # a dict; an iterable of (old, new) pairs is still accepted as before.
    if isinstance(_global_renames, dict):
        rename_pairs = _global_renames.items()
    else:
        rename_pairs = _global_renames

    renames = {old: new for old, new in rename_pairs if old in columns}
    renamed_df = dataframe.rename(columns=renames)
    frame_splain(renamed_df, title='renamed df')
    return renamed_df
def check_df(dataframe, *args, splain=local_settings.splain, **kwargs):
    '''
    check_df(dataframe, splain=local_settings.splain, **kwargs)
    RETURNS: dataframe (the same object that was passed in)

    Fills null values in dataframe with np.nan IN PLACE, then hands the
    frame to frame_splain() along with splain and any extra keyword
    arguments. Per the original notes, frame_splain() produces a report on
    the dataframe when splain is true.

    Extra positional arguments (*args) are accepted but ignored.
    '''
    report_options = {'splain': splain, **kwargs}

    # inplace=True: the caller's dataframe is modified, not a copy.
    dataframe.fillna(value=np.nan, inplace=True)
    frame_splain(dataframe, **report_options)
    return dataframe
def xy_df(dataframe, y_column):
    '''
    xy_df(dataframe, y_column)
    RETURNS: X_df, y_df

    Pass in one dataframe of observed data and the name of the target
    column. Returns a dataframe of all columns except the target column(s)
    and a dataframe of just the target column(s).

    y_column may be a single column label or a list of labels; a list
    separates more than one column. Any non-list value (including a tuple)
    is treated as one label.

    Both results are passed to frame_splain() (titles 'X' and 'y') before
    being returned. The input dataframe is not modified.
    '''
    # A list must be passed to drop() as-is: wrapping it in another list
    # produces a nested list that drop() cannot match against column labels.
    target_cols = y_column if isinstance(y_column, list) else [y_column]

    X_df = dataframe.drop(target_cols, axis=1)
    frame_splain(X_df, title='X')

    # Selecting with a single label gives a Series and with a list gives a
    # DataFrame; the constructor normalizes both to a DataFrame.
    y_df = pd.DataFrame(dataframe[y_column])
    frame_splain(y_df, title='y')

    return X_df, y_df
def wrangle_zillow(db='zillow', sql='zillow_sql', sql_string=False):
    '''
    wrangle_zillow(db='zillow', sql='zillow_sql', sql_string=False)
    RETURNS: result_df

    Pass the database name ('zillow' by default) and either:
      - the name of a preset query in sql with sql_string=False (default);
        the name is resolved through get_sql(), or
      - a literal SQL statement in sql with sql_string=True.

    Runs the statement with pd.read_sql() and returns the results as a
    dataframe, after passing it to frame_splain(topx=5, maxcols=10).

    *** Requires user, password, and host from env.py ***
    '''
    zillow_url = get_db_url(user=user, password=password, host=host, database=db)

    # Resolve the preset named by the caller. This used to be hard-coded to
    # 'zillow_sql', which silently ignored any other preset name passed in.
    use_sql = sql if sql_string else get_sql(sql=sql)

    result_df = pd.read_sql(use_sql, zillow_url)
    frame_splain(result_df, topx=5, maxcols=10)
    return result_df
def edit_prep_df(dataframe):
    '''
    edit_prep_df(dataframe)
    RETURNS: prepped_df

    Reduces dataframe to the basic columns for the MVP objective:
    bathrooms ('nbr_bthrms'), bedrooms ('nbr_bedrms') and square footage
    ('finished_sqft') as features, plus the target variable
    'taxable_value'. The selection is passed to frame_splain() with the
    title 'prepped df' before being returned.
    '''
    feature_fields = ['nbr_bthrms', 'nbr_bedrms', 'finished_sqft']
    target_field = 'taxable_value'

    selection = dataframe[feature_fields + [target_field]]
    frame_splain(selection, title='prepped df')
    return selection
def scale_dfo(dfo, scaler_fn=standard_scaler, splain=local_settings.splain, **kwargs):
    '''
    scale_dfo(dfo, scaler_fn=standard_scaler, **kwargs)
    RETURNS: the same dfo object with scaler and X/y attributes attached

    scaler_fn must be a function (or None to skip scaling). It is called as
    scaler_fn(train=dfo.train, test=dfo.test) and must return a
    (scaler, train_scaled, test_scaled) triple.

    A constant 'dummy_val' column (value 1) is added to train and test --
    and to their scaled versions when scaling is done -- to allow for later
    feature selection testing. The column is added to dfo.train / dfo.test
    only after scaler_fn has been called on them.

    Attributes set on dfo:
        scaler_fn, scaler (None when scaler_fn is None)
        X_train, y_train, X_test, y_test
      and, only when scaler_fn is not None:
        train_scaled, test_scaled
        X_train_scaled, y_train_scaled, X_test_scaled, y_test_scaled

    **kwargs is accepted but not used.
    '''
    use_scaler = scaler_fn is not None
    dfo.scaler_fn = scaler_fn

    if use_scaler:
        dfo.scaler, dfo.train_scaled, dfo.test_scaled = scaler_fn(train=dfo.train, test=dfo.test)
        for scaled_frame in (dfo.train_scaled, dfo.test_scaled):
            scaled_frame['dummy_val'] = 1
    else:
        dfo.scaler = None

    # Added after scaling so the scaler never sees the dummy column.
    for raw_frame in (dfo.train, dfo.test):
        raw_frame['dummy_val'] = 1

    dfo.X_train, dfo.y_train = xy_df(dataframe=dfo.train, y_column=dfo.y_column)
    dfo.X_test, dfo.y_test = xy_df(dataframe=dfo.test, y_column=dfo.y_column)

    unscaled_reports = (
        (dfo.X_train, 'X_Train'),
        (dfo.y_train, 'y_Train'),
        (dfo.X_test, 'X_Test'),
        (dfo.y_test, 'Y_Test'),
    )
    for frame, title in unscaled_reports:
        frame_splain(frame, title, splain=splain)

    if not use_scaler:
        return dfo

    dfo.X_train_scaled, dfo.y_train_scaled = xy_df(dataframe=dfo.train_scaled, y_column=dfo.y_column)
    dfo.X_test_scaled, dfo.y_test_scaled = xy_df(dataframe=dfo.test_scaled, y_column=dfo.y_column)

    scaled_reports = (
        (dfo.X_train_scaled, 'X_Train_scaled'),
        (dfo.y_train_scaled, 'y_Train_scaled'),
        (dfo.X_test_scaled, 'X_Test_scaled'),
        (dfo.y_test_scaled, 'Y_Test_scaled'),
    )
    for frame, title in scaled_reports:
        frame_splain(frame, title, splain=splain)

    return dfo