def wrangle():
    """Acquire the zillow data and return a cleaned DataFrame.

    Keeps the most recent 2017 transaction per parcel, restricts to
    single-family residences, drops sparse/duplicate columns and any
    remaining null rows, and indexes the result by ``parcelid``.

    NOTE(review): relies on module-level ``query`` and ``db`` plus the
    ``acquire`` and ``prep`` modules, none of which are visible in this
    chunk -- confirm they are defined/imported elsewhere in the file.
    """
    # get zillow data
    df = acquire.get_data(query, db)
    # keep only the most recent transaction date per parcel:
    # sort newest-first, then drop_duplicates keeps the first (latest) row
    df = df.sort_values("transactiondate", ascending=False).drop_duplicates("parcelid")
    # keep only 2017 values (transactiondate is handled as a string here)
    df = df [df.transactiondate.str.startswith("2017")]
    # remove all the duplicate id columns
    df.drop(columns = ["typeconstructiontypeid","storytypeid", "propertylandusetypeid", "heatingorsystemtypeid", "buildingclasstypeid","architecturalstyletypeid","airconditioningtypeid","id"], inplace=True)
    # keep single family homes and remove unit counts greater than 1
    # NOTE(review): this only excludes unitcnt == 2 and == 3; any other
    # multi-unit value (e.g. 4) or NaN unitcnt survives -- confirm intended.
    df = df [df.propertylandusedesc == "Single Family Residential"]
    df = df [(df.unitcnt != 2) & (df.unitcnt != 3)]
    # drop columns/rows that fail the 50% non-null threshold
    # NOTE(review): the original comment said "99% null values" but the
    # thresholds passed are .5/.5; the return value is ignored, so this
    # presumably mutates df in place -- verify prep.handle_missing_values.
    prep.handle_missing_values(df, .5, .5)
    # remove the following columns with not enough info or duplicate info
    df.drop(columns = ["finishedsquarefeet12","buildingqualitytypeid", "fullbathcnt", "propertyzoningdesc", "unitcnt", "heatingorsystemdesc","assessmentyear","regionidcounty", "rawcensustractandblock", "calculatedbathnbr", "propertycountylandusecode"], inplace=True)
    # remove remaining rows with blanks
    df.dropna(inplace=True)
    # set index as parcelid
    df.set_index("parcelid", inplace=True)
    return df
def main():
    """Main entry point: acquire the raw data, prep it, and preview it."""
    prepped, prepped_alt = prep_data(ac.get_data())
    print('Done and doner.')
    # show the first ten rows as a quick sanity check
    print(prepped.head(10))
def wrangle_data():
    """Return a fully prepared zillow DataFrame. Takes no arguments."""
    # acquire, then run the column/imputation/feature steps as one chain
    prepared = create_new_features(impute_median(drop_columns(get_data())))
    # keep only rows with at least one bathroom and one bedroom
    prepared = prepared[(prepared.bathroomcnt > 0) & (prepared.bedroomcnt > 0)]
    return handle_outliers(prepared)
def acquire_and_prep_data():
    """Acquire the zillow data and return a de-duplicated, typed DataFrame.

    Drops duplicate rows and all nulls, removes the fips/roomcnt columns,
    casts the count/size/value columns to int, renames columns for
    readability, rescales the coordinates, and keeps only homes with at
    least one bedroom.
    """
    zillow = acquire.get_data()
    zillow = zillow.drop_duplicates()
    zillow = zillow.dropna()
    zillow = zillow.drop(columns=['fips', 'roomcnt'])
    # dropna() above guarantees no NaNs remain, so the int casts are safe
    int_cols = ['bedroomcnt', 'calculatedfinishedsquarefeet', 'fullbathcnt',
                'yearbuilt', 'taxvaluedollarcnt']
    zillow[int_cols] = zillow[int_cols].astype('int')
    zillow = zillow.rename(columns={'calculatedfinishedsquarefeet': 'squarefeet',
                                    'Name': 'County'})
    # rescale coordinates -- assumes they are stored as millionths of a
    # degree (zillow fixed-point convention): TODO confirm against source
    zillow.latitude = zillow.latitude / 1000000
    zillow.longitude = zillow.longitude / 1000000
    # keep homes with at least one bedroom.
    # BUG FIX: the original applied this identical filter twice in a row;
    # once suffices. NOTE(review): one occurrence may have been intended
    # as a bathroomcnt filter -- confirm with the author.
    zillow = zillow[zillow.bedroomcnt > 0]
    return zillow
from acquire import get_data
from prep import prep_data

# Get the raw data from .csv or MySQL query
raw = get_data()
# Remove nulls
df = prep_data(raw)

# Milestones before Friday:
# 2. Scale
# 3. Super basic Model

# Quick structural summaries of the prepped frame.
# NOTE(review): in a plain script these expressions (and the
# value_counts() calls below) are evaluated and their results discarded;
# they only display anything in a REPL/notebook -- wrap in print() if
# this is meant to run as a script.
df.info()
df.describe()

# First pass for outlier detection:
# Do the value counts and distribution make sense?
# Is there anything way out of line here?
df.bedrooms.value_counts() # encode as discrete
df.bathrooms.value_counts() # encode as discrete
df.sqft.value_counts() # can bin or scale
df.taxvalue.value_counts() # scale this (also our target variable)
def get_mall():
    """Return every row of the mall customers table as a DataFrame."""
    query = "SELECT * FROM customers"
    database = "mall_customers"
    return acquire.get_data(query, database)
import acquire
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

# NOTE(review): module-level acquisition runs on import -- consider
# moving behind if __name__ == "__main__".
df = acquire.get_data()


def get_single_unit(df):
    """Filter a zillow frame down to single-unit residential properties.

    Removes duplicate column names, then keeps only rows whose
    propertylandusetypeid is one of 260, 261, 262, or 279, and returns
    the resulting new DataFrame.
    """
    deduped = df.loc[:, ~df.columns.duplicated()]
    single_unit_ids = [260, 261, 262, 279]
    return deduped[deduped.propertylandusetypeid.isin(single_unit_ids)]


def handle_missing_values(df, column_prop, row_prop):
    """Drop sparse columns and rows from df, in place, and return it.

    column_prop -- minimum proportion of non-NA values a column must
                   have (relative to the row count) to be kept
    row_prop    -- minimum proportion of non-NA values a row must have
                   (relative to the column count) to be kept

    Columns are pruned first, so the row threshold is computed against
    the already-reduced column count.
    """
    col_thresh = int(round(column_prop * len(df), 0))
    df.dropna(axis=1, thresh=col_thresh, inplace=True)
    row_thresh = int(round(row_prop * len(df.columns), 0))
    df.dropna(axis=0, thresh=row_thresh, inplace=True)
    return df