def clean_zillow(cached=True): '''This function acquires and prepares the zillow data from a local csv, default. Passing cached=False acquires fresh data from sql and writes to csv.''' # use my aquire function to read data into a df from a csv file df = get_zillow_data() # drop duplicates df.drop_duplicates(inplace=True) # drop duplicate columns and remove columns with more than 50% nulls df = df.drop(columns=[ 'unitcnt', 'propertylandusedesc', 'heatingorsystemdesc', 'propertyzoningdesc', 'heatingorsystemtypeid', 'taxdelinquencyflag', 'taxdelinquencyyear', 'yardbuildingsqft17', 'finishedsquarefeet50', 'finishedfloor1squarefeet', 'fireplacecnt', 'threequarterbathnbr', 'pooltypeid7', 'poolcnt', 'numberofstories', 'airconditioningdesc', 'garagetotalsqft', 'garagecarcnt', 'regionidneighborhood', 'hashottuborspa', 'pooltypeid2', 'poolsizesum', 'pooltypeid10', 'typeconstructiontypeid', 'typeconstructiondesc', 'architecturalstyledesc', 'finishedsquarefeet6', 'fireplaceflag', 'yardbuildingsqft26', 'finishedsquarefeet13', 'storytypeid', 'storydesc', 'basementsqft', 'finishedsquarefeet15', 'buildingclassdesc', 'architecturalstyletypeid', 'airconditioningtypeid', 'buildingclasstypeid', 'buildingqualitytypeid', 'decktypeid', 'architecturalstyletypeid.1', 'airconditioningtypeid.1', 'heatingorsystemtypeid.1', 'propertylandusetypeid.1', 'buildingclasstypeid.1', 'storytypeid.1', 'typeconstructiontypeid.1', 'id.1', 'Unnamed: 0', 'calculatedbathnbr', 'fips', 'latitude', 'longitude', 'regionidcounty', 'roomcnt', 'yearbuilt', 'assessmentyear', 'propertycountylandusecode', 'propertylandusetypeid', 'parcelid.2', 'parcelid.1' ]) #removing columns df.replace(',', '', regex=True, inplace=True) #handling nan's df = df.fillna(df.mean()) return df
def zillow_clean(): df = a.get_zillow_data() df = handle_missing_values(df, prop_required_column=.5, prop_required_row=.75) cols_to_remove = [ 'buildingqualitytypeid', 'heatingorsystemtypeid', 'propertyzoningdesc', 'censustractandblock' ] df = remove_columns(df, cols_to_remove) columns = [ 'calculatedbathnbr', 'fullbathcnt', 'regionidcity', 'regionidzip', 'unitcnt', 'yearbuilt' ] strategy = 'most_frequent' df = imputer_9000(df, columns, strategy) columns = [ 'calculatedbathnbr', 'calculatedfinishedsquarefeet', 'finishedsquarefeet12', 'lotsizesquarefeet', 'structuretaxvaluedollarcnt', 'taxvaluedollarcnt', 'taxamount', 'landtaxvaluedollarcnt' ] strategy = 'most_frequent' df = imputer_9000(df, columns, strategy) return df
def prep_zillow_data(): df = acquire.get_zillow_data() df = df.drop(columns = {'airconditioningtypeid', 'architecturalstyletypeid', 'basementsqft', 'taxdelinquencyyear', 'buildingclasstypeid', 'decktypeid', 'finishedfloor1squarefeet', 'finishedsquarefeet13', 'finishedsquarefeet15', 'finishedsquarefeet50', 'finishedsquarefeet6', 'fireplacecnt', 'hashottuborspa', 'poolsizesum', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7', 'storytypeid', 'threequarterbathnbr', 'typeconstructiontypeid', 'yardbuildingsqft17', 'yardbuildingsqft26', 'numberofstories', 'fireplaceflag', 'taxdelinquencyflag', 'buildingqualitytypeid', 'garagecarcnt', 'garagetotalsqft', 'poolcnt', 'regionidneighborhood', 'propertyzoningdesc', 'propertycountylandusecode', 'id', 'parcelid', 'transactiondate', 'finishedsquarefeet12', 'censustractandblock', 'logerror'}) df['propertylandusetypeid'] = df['propertylandusetypeid'].astype(float) df['calculatedfinishedsquarefeet'] = df['calculatedfinishedsquarefeet'].fillna((df['calculatedfinishedsquarefeet'].mean())) df['calculatedbathnbr'] = df['calculatedbathnbr'].fillna((df['calculatedbathnbr'].mean())) df['fullbathcnt'] = df['fullbathcnt'].fillna((df['fullbathcnt'].mean())) df['heatingorsystemtypeid'] = df['heatingorsystemtypeid'].fillna((df['heatingorsystemtypeid'].mean())) df['lotsizesquarefeet'] = df['lotsizesquarefeet'].fillna((df['lotsizesquarefeet'].mean())) df['regionidcity'] = df['regionidcity'].fillna((df['regionidcity'].mean())) df['regionidzip'] = df['regionidzip'].fillna((df['regionidzip'].mean())) df['unitcnt'] = df['unitcnt'].fillna((df['unitcnt'].mean())) df['yearbuilt'] = df['yearbuilt'].fillna((df['yearbuilt'].mean())) df['calculatedfinishedsquarefeet'] = df.calculatedfinishedsquarefeet[(np.abs(stats.zscore(df.calculatedfinishedsquarefeet)) < 3)] df = df.dropna() return df
def wrangle_zillow_data(): df = acquire.get_zillow_data() df = prepare.zillow_single_unit(df) df = prepare.remove_columns(df,['calculatedbathnbr','finishedsquarefeet12',\ 'fullbathcnt','propertycountylandusecode','unitcnt','structuretaxvaluedollarcnt',\ 'landtaxvaluedollarcnt','assessmentyear','propertyzoningdesc']) df = prepare.handle_missing_values(df) df.dropna(inplace=True) return df
def wrangle_zillow_data(): df = acquire.get_zillow_data() df = prepare.zillow_single_unit(df) df = prepare.remove_columns(df, [ 'finishedsquarefeet12', 'fullbathcnt', 'unitcnt', 'structuretaxvaluedollarcnt', 'landtaxvaluedollarcnt', 'assessmentyear', 'propertyzoningdesc' ]) df = prepare.handle_missing_values(df) return df
def final_prep(): """ Accepts DF. Performs all changes outlined in prep phase and returns scaled and unscaled versions of train, validate, and test samples (6 in total). """ # Acquiring data df = get_zillow_data() # Preparing data with changes outlined in prepare section of notebook drop_missing_columns(df) drop_selected_columns(df) df.dropna(inplace=True) df = d_type_convert(df) df = zillow_dummy(df) df = column_sort_rename(df) train, validate, test = split_data(df) train_scaled, validate_scaled, test_scaled = data_scaler( train, validate, test) # returning DFs return train, validate, test, train_scaled, validate_scaled, test_scaled
def clean_zillow(): #get the zillow data df = get_zillow_data() #handle the nan's df['bedroomcnt'].fillna(df['bedroomcnt'].mode()[0], inplace=True) df['bathroomcnt'].fillna(df['bathroomcnt'].mode()[0], inplace=True) df['calculatedfinishedsquarefeet'].fillna( df['calculatedfinishedsquarefeet'].mode()[0], inplace=True) df['fips'].fillna(df['fips'].mode()[0], inplace=True) df['taxamount'].fillna(df['taxamount'].mode()[0], inplace=True) df['taxvaluedollarcnt'].fillna(df['taxvaluedollarcnt'].mode()[0], inplace=True) df['lotsizesquarefeet'].fillna(df['lotsizesquarefeet'].mode()[0], inplace=True) # make new column for county names rating = [] for row in df['fips']: if row == 6037.0: rating.append('los_angeles_county') elif row == 6059.0: rating.append('orange_county') elif row == 6111.0: rating.append('ventura_county') else: rating.append('no_county') df['county'] = rating #rename columns to make it easier to call later df = df.rename( columns={ "bedroomcnt": "bedroom_count", "bathroomcnt": "bathroom_count", "lotsizesquarefeet": "lot_size", "calculatedfinishedsquarefeet": "square_feet", "taxamount": "tax_amount", "taxvaluedollarcnt": "tax_value" }) # drop unneeded columns df = df.drop(columns=["Unnamed: 0", "parcelid", 'latitude', 'longitude']) # return the clean dataframe return df
def prep_zillow_data(): df = acquire.get_zillow_data() df = drop_nulls(df, prop_required_column=.70, prop_required_row=.70) df['finishedsquarefeet12'] = df['finishedsquarefeet12'].fillna( (df['finishedsquarefeet12'].mean())) df['lotsizesquarefeet'] = df['lotsizesquarefeet'].fillna( (df['lotsizesquarefeet'].mean())) df['regionidcity'] = df['regionidcity'].fillna((df['regionidcity'].mean())) df['fullbathcnt'] = df['fullbathcnt'].fillna((df['fullbathcnt'].mean())) df['calculatedbathnbr'] = df['calculatedbathnbr'].fillna( (df['calculatedbathnbr'].mean())) df['yearbuilt'] = df['yearbuilt'].fillna((df['yearbuilt'].mean())) df['censustractandblock'] = df['censustractandblock'].fillna( (df['censustractandblock'].mean())) df['calculatedfinishedsquarefeet'] = df[ 'calculatedfinishedsquarefeet'].fillna( (df['calculatedfinishedsquarefeet'].mean())) df['structuretaxvaluedollarcnt'] = df['structuretaxvaluedollarcnt'].fillna( (df['structuretaxvaluedollarcnt'].mean())) df['regionidzip'] = df['regionidzip'].fillna((df['regionidzip'].mean())) df = df.dropna() df = df.drop(columns=['pid', 'id']) county_dummies = pd.get_dummies(df['fips']) df = pd.concat((df, county_dummies), axis=1) df = df.rename( columns={ 6059.0: 'is_orange_county', 6037.0: 'is_la_county', 6111.0: 'is_ventura_county' }) df = df[df.bedroomcnt < 8] df = df[df.bedroomcnt > 0] df = df[df.bathroomcnt > 0] df = df[df.bathroomcnt < 6] df.to_csv('prep_zillow.csv') return df
def prep_zillow_data(target): """ function takes in name of targeted variable column (taxvaluedollarcnt), acquires zillow house data, prepares it via removing null values, renaming columns, scaling data, and returning data split into all of the data frames needed to progress through project notebook. """ # acquire zillow data df = get_zillow_data() # filter columns down to the target variable (taxvaluedollarcnt) and the final four columns from prep df = df[[ 'calculatedfinishedsquarefeet', 'bedroomcnt', 'bathroomcnt', 'lotsizesquarefeet', 'taxvaluedollarcnt' ]] # drop na values df.dropna(inplace=True) # dropping extreme outliers df.drop(df[df['lotsizesquarefeet'] > 3000000].index, inplace=True) df.drop(df[df['calculatedfinishedsquarefeet'] > 35000].index, inplace=True) df.drop(df[df['taxvaluedollarcnt'] > 20000000].index, inplace=True) # splitting data train_validate, test = train_test_split(df, test_size=.2, random_state=123) train, validate = train_test_split(train_validate, test_size=.3, random_state=123) # specifying which columns to keep in outputted dataframe # x = features | y = target variable X_train = train.drop(columns=[target]) y_train = train[[target]] X_validate = validate.drop(columns=[target]) y_validate = validate[[target]] X_test = test.drop(columns=[target]) y_test = test[[target]] # importing scaler scaler = sklearn.preprocessing.MinMaxScaler() # fitting scaler scaler.fit(X_train) # scaling data in dataframes X_train_scaled = pd.DataFrame(scaler.transform(X_train)) X_validate_scaled = pd.DataFrame(scaler.transform(X_validate)) X_test_scaled = pd.DataFrame(scaler.transform(X_test)) # renaming data frame columns train.rename(columns={ 'taxvaluedollarcnt': 'tax_dollar_value', 'calculatedfinishedsquarefeet': 'unit_sq_feet', 'bedroomcnt': 'bedroom_count', 'bathroomcnt': 'bathroom_count', 'lotsizesquarefeet': 'lot_size_sq_feet' }, inplace=True) X_train_scaled.rename(columns={ 0: 'unit_sq_feet', 1: 'bedroom_count', 2: 'bathroom_count', 3: 'lot_size_sq_feet' }, inplace=True) X_validate_scaled.rename(columns={ 0: 'unit_sq_feet', 1: 'bedroom_count', 2: 'bathroom_count', 3: 'lot_size_sq_feet' }, inplace=True) X_test_scaled.rename(columns={ 0: 'unit_sq_feet', 1: 'bedroom_count', 2: 'bathroom_count', 3: 'lot_size_sq_feet' }, inplace=True) X_train.rename(columns={ 'calculatedfinishedsquarefeet': 'unit_sq_feet', 'bedroomcnt': 'bedroom_count', 'bathroomcnt': 'bathroom_count', 'lotsizesquarefeet': 'lot_size_sq_feet' }, inplace=True) X_validate.rename(columns={ 'calculatedfinishedsquarefeet': 'unit_sq_feet', 'bedroomcnt': 'bedroom_count', 'bathroomcnt': 'bathroom_count', 'lotsizesquarefeet': 'lot_size_sq_feet' }, inplace=True) X_test.rename(columns={ 'calculatedfinishedsquarefeet': 'unit_sq_feet', 'bedroomcnt': 'bedroom_count', 'bathroomcnt': 'bathroom_count', 'lotsizesquarefeet': 'lot_size_sq_feet' }, inplace=True) y_train.rename(columns={'taxvaluedollarcnt': 'tax_dollar_value'}, inplace=True) y_validate.rename(columns={'taxvaluedollarcnt': 'tax_dollar_value'}, inplace=True) y_test.rename(columns={'taxvaluedollarcnt': 'tax_dollar_value'}, inplace=True) # returning data frames return train, y_train, y_validate, y_test, X_train_scaled, X_train, X_validate, X_test, X_validate_scaled, X_test_scaled
import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import numpy as np import env from sklearn.model_selection import train_test_split from sklearn.impute import SimpleImputer from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler from acquire import get_zillow_data # In[109]: df = get_zillow_data() df.head() # In[110]: # Let's figure out how much data is missing where def nulls_by_col(df): num_missing = df.isnull().sum() rows = df.shape[0] pct_missing = num_missing / rows cols_missing = pd.DataFrame({ 'number_missing_rows': num_missing, 'percent_rows_missing': pct_missing }) return cols_missing
def prep_zillow_data(): df = acquire.get_zillow_data() df = missing_rows(df) df = missing_cols(df) df = drop_nulls(df, prop_required_column=.70, prop_required_row=.70) return df
def prepare_zillow(): ''' Signature: prepare_zillow() -> pandas.core.frame.DataFrame Docstring: Prepare the zillow dataset for data EDA Return DataFrame of zillow dataset ''' df = get_zillow_data() df.storydesc.fillna(0, inplace=True) df.storydesc.replace('Basement', 1, inplace=True) df['has_basement'] = df.storydesc df.hashottuborspa.fillna(0, inplace=True) df['has_hottub_or_spa'] = df.hashottuborspa df.poolcnt.fillna(0, inplace=True) df['has_pool'] = df.poolcnt df.poolsizesum.fillna(0, inplace=True) df['pool_area_sqft'] = df.poolsizesum df['has_patio'] = df.yardbuildingsqft17.notnull().astype(np.int) df['patio_area_sqft'] = df.yardbuildingsqft17.fillna(0) df['has_shed'] = df.yardbuildingsqft26.notnull().astype(np.int) df['basement_area_sqft'] = df.yardbuildingsqft26.fillna(0) features_to_drop = [ 'decktypeid', 'buildingclasstypeid', 'buildingqualitytypeid', 'finishedsquarefeet6', 'finishedsquarefeet13', 'finishedsquarefeet15', 'buildingclassdesc', 'pooltypeid2', 'pooltypeid7', 'pooltypeid10', 'regionidcity', 'regionidcounty', 'regionidzip', 'typeconstructiontypeid', 'typeconstructiondesc', 'architecturalstyletypeid', 'architecturalstyledesc', 'storytypeid', 'storydesc', 'hashottuborspa', 'poolcnt', 'poolsizesum', 'yardbuildingsqft17', 'yardbuildingsqft26', 'taxdelinquencyyear', 'taxdelinquencyflag', 'finishedsquarefeet50', 'finishedfloor1squarefeet', 'censustractandblock', 'rawcensustractandblock', 'propertylandusetypeid', 'id', 'assessmentyear', 'finishedsquarefeet12', 'bathroomcnt', 'fullbathcnt', 'basementsqft', 'threequarterbathnbr', 'propertylandusedesc', 'propertycountylandusecode', 'regionidcounty', ] df.drop(columns=features_to_drop, inplace=True) df.rename(columns={ 'lotsizesquarefeet': 'lot_size_sqft', 'calculatedfinishedsquarefeet': 'livable_area_sqft', 'structuretaxvaluedollarcnt': 'structure_tax', 'taxvaluedollarcnt': 'taxable_value', 'landtaxvaluedollarcnt': 'land_tax', 'taxamount': 'property_tax', 'lasttransactiondate': 'date_sold', 'roomcnt': 'num_of_rooms', 'yearbuilt': 'year_built', 'bedroomcnt': 'num_of_bedrooms', 'calculatedbathnbr': 'num_of_restrooms' }, inplace=True) df = df[[ 'parcelid', 'num_of_bedrooms', 'num_of_restrooms', 'livable_area_sqft', 'lot_size_sqft', 'year_built', 'has_basement', 'has_hottub_or_spa', 'has_pool', 'pool_area_sqft', 'has_patio', 'patio_area_sqft', 'has_shed', 'basement_area_sqft', 'property_tax', 'structure_tax', 'land_tax', 'taxable_value', 'date_sold', 'fips', 'latitude', 'longitude', 'logerror' ]] df.fips = df.fips.astype(np.int) df.has_hottub_or_spa = df.has_hottub_or_spa.astype(np.int) df.has_pool = df.has_pool.astype(np.int) df.num_of_bedrooms = df.num_of_bedrooms.replace(0, np.nan) df.latitude = df.latitude / 1_000_000 df.longitude = df.longitude / 1_000_000 return df
# Wrangling import pandas as pd import numpy as np # Exploring import scipy.stats as stats import matplotlib.pyplot as plt import seaborn as sns # default pandas decimal number display format. pd.options.display.float_format = '{:20,.2f}'.format import acquire df = acquire.get_zillow_data() def nulls_by_col(df): # Look at the number missing. num_missing = df.isnull().sum() # number of rows rows = df.shape[0] # percent_missing pct_missing = num_missing / rows # Create dataframe of column name, num_rows missing, and pct of rows missing. cols_missing = pd.DataFrame({ 'num_rows': num_missing,