def get_iris_data():
    url = env.get_db_url('iris_db')
    query = '''
    SELECT *
    FROM measurements
    JOIN species USING (species_id)
    '''
    return pd.read_sql(query, url)
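# Every snippet in this file depends on a get_db_url helper (sometimes called
# as env.get_db_url). A minimal sketch of that helper, assuming env.py stores
# host, user, and password credentials; the credential names and the
# mysql+pymysql driver are assumptions, not shown in this code:
def get_db_url(db_name):
    # assumed convention: credentials live in a local, git-ignored env.py
    from env import host, user, password
    return f'mysql+pymysql://{user}:{password}@{host}/{db_name}'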
def get_zillow_data():
    query = '''
    SELECT *
    FROM predictions_2017
    LEFT JOIN properties_2017 USING (parcelid)
    LEFT JOIN airconditioningtype USING (airconditioningtypeid)
    LEFT JOIN architecturalstyletype USING (architecturalstyletypeid)
    LEFT JOIN buildingclasstype USING (buildingclasstypeid)
    LEFT JOIN heatingorsystemtype USING (heatingorsystemtypeid)
    LEFT JOIN propertylandusetype USING (propertylandusetypeid)
    LEFT JOIN storytype USING (storytypeid)
    LEFT JOIN typeconstructiontype USING (typeconstructiontypeid)
    WHERE latitude IS NOT NULL
      AND longitude IS NOT NULL
    '''
    df = pd.read_sql(query, get_db_url('zillow'))
    # keep only the most recent transaction per parcel
    df = df.sort_values(by='transactiondate').drop_duplicates(subset='parcelid', keep='last')
    df.drop('id', axis=1, inplace=True)
    return df
def get_titanic_data():
    sql = """
    SELECT *
    FROM passengers
    """
    url = get_db_url('titanic_db')
    return pd.read_sql(sql, url)
def get_data_from_mysql():
    # contract_type_id = 3 filters to two-year contracts
    query = """
    SELECT customer_id, monthly_charges, tenure, total_charges
    FROM customers
    JOIN contract_types USING (contract_type_id)
    WHERE contract_type_id = 3;
    """
    df = pd.read_sql(query, get_db_url("telco_churn"))
    return df
def get_iris_data():
    sql = """
    SELECT *
    FROM measurements
    JOIN species USING (species_id)
    """
    url = get_db_url('iris_db')
    return pd.read_sql(sql, url)
def get_data_from_sql():
    sql = """
    SELECT customer_id, monthly_charges, tenure, total_charges
    FROM customers
    WHERE contract_type_id = 3
    """
    url = get_db_url('telco_churn')
    tc_df = pd.read_sql(sql, url)
    return tc_df
def get_iris_data():
    query = '''
    SELECT *
    FROM measurements
    JOIN species USING (species_id)
    '''
    df = pd.read_sql(query, get_db_url('iris_db'))
    return df
def wrangle_telco():
    url = get_db_url('telco_churn')
    query = '''
    SELECT customer_id, monthly_charges, tenure, total_charges
    FROM customers
    WHERE contract_type_id = 3
    '''
    df = pd.read_sql(query, url)
    # total_charges comes back as a string: strip whitespace, treat empty
    # strings as 0, and cast to float
    df.total_charges = df.total_charges.str.strip()
    df.total_charges = df.total_charges.replace('', 0).astype(float)
    return df
def wrangle_telco():
    query = """
    SELECT customer_id, tenure, monthly_charges, total_charges
    FROM customers
    WHERE contract_type_id = 3;
    """
    url = get_db_url("telco_churn")
    telco = pd.read_sql(query, url)
    # unlike the variant above, drop rows with blank total_charges rather
    # than zero-filling them
    telco.total_charges = telco.total_charges.str.strip()
    telco = telco.replace("", np.nan)
    telco = telco.dropna()
    telco.total_charges = telco.total_charges.astype("float")
    return telco
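# A quick usage sketch for either wrangle_telco variant above: after cleaning,
# total_charges should be numeric with no missing values (the function and
# column names come from the snippets themselves; nothing else is assumed):
telco = wrangle_telco()
print(telco.dtypes)                       # total_charges should be float64
print(telco.total_charges.isna().sum())   # expect 0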
def get_zillow_data():
    url = env.get_db_url('zillow')
    query = '''
    SELECT *
    FROM predictions_2017
    LEFT JOIN properties_2017 USING (parcelid)
    LEFT JOIN airconditioningtype USING (airconditioningtypeid)
    LEFT JOIN architecturalstyletype USING (architecturalstyletypeid)
    LEFT JOIN buildingclasstype USING (buildingclasstypeid)
    LEFT JOIN heatingorsystemtype USING (heatingorsystemtypeid)
    LEFT JOIN propertylandusetype USING (propertylandusetypeid)
    LEFT JOIN storytype USING (storytypeid)
    LEFT JOIN typeconstructiontype USING (typeconstructiontypeid)
    WHERE latitude IS NOT NULL
      AND longitude IS NOT NULL
    '''
    df = pd.read_sql(query, url)
    # keep only each parcel's most recent transaction: find the max
    # transactiondate per parcelid, then merge back on both keys so rows
    # stay aligned (a plain positional index join would pair unrelated rows)
    new_dates = df.groupby(by='parcelid').transactiondate.max().reset_index()
    df = new_dates.merge(df, on=['parcelid', 'transactiondate'], how='left')
    df.drop(columns='id', inplace=True)
    return df
def get_titanic_data(cached=False):
    '''
    This function returns the titanic database as a pandas dataframe.
    If cached is True and the csv file exists in the directory, the function
    will read that file into a df and return it. Otherwise, the function will
    read the database into a dataframe, cache it as a csv file, and return
    the dataframe.
    '''
    # if the cached parameter is false, or the csv file is not on disk,
    # read from the database into a dataframe
    if cached == False or os.path.isfile('titanic_df.csv') == False:
        query = '''
        SELECT *
        FROM passengers;
        '''
        titanic_df = pd.read_sql(query, get_db_url('titanic_db'))
        # also cache the data we read from the db to a file on disk
        titanic_df.to_csv('titanic_df.csv')
    else:
        # the cached parameter was true and a file exists on disk;
        # read that into a df instead of going to the database
        titanic_df = pd.read_csv('titanic_df.csv', index_col=0)
    # return our dataframe regardless of its origin
    return titanic_df
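# Usage sketch for the caching pattern above: the first call hits the database
# and writes titanic_df.csv; a later call with cached=True reads the local csv
# (this assumes write access to the working directory):
df = get_titanic_data()             # queries MySQL, caches to csv
df = get_titanic_data(cached=True)  # reads titanic_df.csv from disk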
def get_iris_data(cached=False):
    '''
    This function will return the iris db as a pandas df. If cached is True
    and the file exists in the directory, the function will read that file
    into a pandas df and return it. Otherwise, the function will read data
    from the codeup db into a df, cache it, and return it to the caller.
    '''
    # read from the codeup db into a df if the cached parameter is false
    # or the file is not on disk
    if cached == False or os.path.isfile('iris_df.csv') == False:
        query = '''
        SELECT *
        FROM measurements
        JOIN species USING (species_id);
        '''
        iris_df = pd.read_sql(query, get_db_url('iris_db'))
        # cache it as a csv file
        iris_df.to_csv('iris_df.csv')
    else:
        # the cached parameter is True and the file exists on disk;
        # read the file into a pandas df
        iris_df = pd.read_csv('iris_df.csv', index_col=0)
    # return the iris df regardless of origin
    return iris_df
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import split_scale
import wrangle
import env
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler

url = env.get_db_url('zillow')

def prep_predictions():
    df = pd.read_sql("""
    SELECT *
    FROM predictions_2017
    """, url)
    df['transactiondate'] = pd.to_datetime(df['transactiondate'])
    # keep only rows whose transactiondate matches the max per parcelid
    df = df[df.groupby('parcelid')['transactiondate'].transform('max') == df['transactiondate']]
    return df
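# A small illustration of the transform('max') filter used in prep_predictions,
# with made-up toy data (purely to show the technique):
toy = pd.DataFrame({
    'parcelid': [1, 1, 2],
    'transactiondate': pd.to_datetime(['2017-01-01', '2017-06-01', '2017-03-01']),
})
# transform('max') broadcasts each group's max back to every row, so the
# comparison keeps only each parcel's latest transaction
latest = toy[toy.groupby('parcelid')['transactiondate'].transform('max') == toy['transactiondate']]
# latest keeps the 2017-06-01 row for parcel 1 and the single row for parcel 2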
# In[84]:
# wrangle data
import warnings
warnings.filterwarnings('ignore')

from env import get_db_url
import pandas as pd
import numpy as np

# ### 1) Acquire customer_id, monthly_charges, tenure, and total_charges
# from the telco_churn database for all customers with a 2 year contract.

# In[176]:
url = get_db_url('telco_churn')
df = pd.read_sql('''
    SELECT customer_id, monthly_charges, tenure, total_charges
    FROM customers
    WHERE contract_type_id = 3
    ''', url)

# In[177]:
df.head(5)

# In[178]:
df.shape
def wrangle_grades():
    grades = pd.read_csv("student_grades.csv")
    # treat whitespace-only cells as missing
    grades.replace(r'^\s*$', np.nan, regex=True, inplace=True)
    df = grades.dropna().astype('int')
    df.drop(columns="student_id", inplace=True)
    return df

telco_query = """
SELECT c.customer_id, c.monthly_charges, c.tenure, c.total_charges
FROM customers AS c
JOIN contract_types AS ct USING(contract_type_id)
WHERE ct.contract_type = 'Two year';
"""

telco_url = get_db_url("telco_churn")

def wrangle_telco():
    """
    This function does the following:
    1. Queries data from the telco_churn database into a pandas DataFrame
    2. Cleans the total_charges feature
    3. Replaces any empty strings with np.nan
    4. Removes any rows with missing values
    5. Reassigns the total_charges feature as a float
    6. Returns a new pandas DataFrame
    """
    customers = pd.read_sql(telco_query, telco_url)
    customers.total_charges = customers.total_charges.str.strip()
    customers = customers.replace("", np.nan)
    # drop rows with missing values, cast total_charges to float, and return
    customers = customers.dropna()
    customers.total_charges = customers.total_charges.astype(float)
    return customers
import pandas as pd
import numpy as np
import split_scale
import wrangle
import env
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler

url2 = env.get_db_url('titanic_db')

def prep_titanic():
    df = pd.read_sql("""
    SELECT *
    FROM passengers
    """, url2)
    # deck is mostly missing, so drop it, then standardize missing values
    df.drop(columns=['deck'], inplace=True)
    df.fillna(np.nan, inplace=True)
    # impute the most frequent value for embarked, then apply it and return
    imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    imp_mode.fit(df[['embarked']])
    df[['embarked']] = imp_mode.transform(df[['embarked']])
    return df
def get_titanic_data():
    url = env.get_db_url('titanic_db')
    query = '''
    SELECT *
    FROM passengers
    '''
    return pd.read_sql(query, url)
def get_titanic_data():
    query = 'SELECT * FROM passengers'
    df = pd.read_sql(query, get_db_url('titanic_db'))
    return df
import pandas as pd
import numpy as np
import split_scale
import wrangle
import env
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler

url = env.get_db_url('telco_churn')

def clean_telco(df):
    # blank total_charges strings become NaN; drop them and cast to float
    df.total_charges = df.total_charges.replace(r'^\s*$', np.nan, regex=True)
    df = df[df.total_charges.isna() == False]
    df['total_charges'] = df['total_charges'].astype(float)
    # collapse yes/no and related columns into booleans
    df['churn'] = df.churn == 'Yes'
    df['senior_citizen'] = df.senior_citizen == 1
    df['is_male'] = df.gender == 'Male'
    df['paperless_billing'] = df.paperless_billing == 'Yes'
    df['family'] = (df.partner == 'Yes') | (df.dependents == 'Yes')
    df['phone'] = (df.phone_service == 'Yes') | (df.multiple_lines == 'Yes')
    df['streaming'] = (df.streaming_tv == 'Yes') | (df.streaming_movies == 'Yes')
    return df
def get_zillow_data(): url = get_db_url("zillow") sql = """ SELECT Z.parcelid, Z.basementsqft, Z.bathroomcnt, Z.bedroomcnt, Z.calculatedbathnbr, Z.finishedfloor1squarefeet, Z.calculatedfinishedsquarefeet, Z.finishedsquarefeet12, Z.finishedsquarefeet13, Z.finishedsquarefeet15, Z.finishedsquarefeet50, Z.finishedsquarefeet6, Z.fips, Z.fireplacecnt, Z.fullbathcnt, Z.garagecarcnt, Z.garagetotalsqft, Z.hashottuborspa, Z.latitude, Z.longitude, Z.lotsizesquarefeet, Z.poolcnt, Z.poolsizesum, Z.propertycountylandusecode, Z.propertyzoningdesc, Z.regionidcity, Z.regionidcounty, Z.regionidneighborhood, Z.regionidzip, Z.roomcnt, Z.threequarterbathnbr, Z.unitcnt, Z.yardbuildingsqft17, Z.yardbuildingsqft26, Z.yearbuilt, Z.numberofstories, Z.fireplaceflag, Z.structuretaxvaluedollarcnt, Z.taxvaluedollarcnt, Z.assessmentyear, Z.landtaxvaluedollarcnt, Z.taxamount, Z.taxdelinquencyflag, Z.taxdelinquencyyear, Z.censustractandblock, unique_properties.logerror, unique_properties.transactiondate, plt.propertylandusedesc, st.storydesc, ct.typeconstructiondesc, act.airconditioningdesc, bct.buildingclassdesc, hst.heatingorsystemdesc FROM (SELECT p17.parcelid, logerror, transactiondate FROM predictions_2017 AS p17 JOIN (SELECT predictions_2017.parcelid, MAX(transactiondate) AS max_trans_date FROM predictions_2017 GROUP BY predictions_2017.parcelid) AS pred_agg ON (p17.parcelid=pred_agg.parcelid) AND (pred_agg.max_trans_date=p17.transactiondate)) AS unique_properties LEFT JOIN properties_2017 AS Z ON (Z.parcelid=unique_properties.parcelid) LEFT JOIN propertylandusetype AS plt ON (Z.propertylandusetypeid=plt.propertylandusetypeid) LEFT JOIN storytype AS st ON (Z.storytypeid=st.storytypeid) LEFT JOIN typeconstructiontype AS ct ON (Z.typeconstructiontypeid=ct.typeconstructiontypeid) LEFT JOIN airconditioningtype AS act ON (Z.airconditioningtypeid=act.airconditioningtypeid) LEFT JOIN architecturalstyletype AS ast ON (Z.architecturalstyletypeid=ast.architecturalstyletypeid) LEFT JOIN buildingclasstype AS bct ON (Z.buildingclasstypeid=bct.buildingclasstypeid) LEFT JOIN heatingorsystemtype AS hst ON (Z.heatingorsystemtypeid=hst.heatingorsystemtypeid) WHERE Z.latitude IS NOT NULL AND Z.longitude IS NOT NULL """ df = pd.read_sql(sql, url) return df
def clean_telco_data():
    # pull data
    query = '''
    SELECT *
    FROM customers AS cust
    JOIN internet_service_types AS net
      ON cust.internet_service_type_id = net.internet_service_type_id
    JOIN contract_types AS cont
      ON cust.contract_type_id = cont.contract_type_id
    JOIN payment_types AS pmt USING (payment_type_id);
    '''
    churn_df = pd.read_sql(query, get_db_url('telco_churn'))
    # drop duplicate columns produced by the joins
    churn_df = churn_df.loc[:, ~churn_df.columns.duplicated()]
    # drop duplicate rows
    churn_df = churn_df.drop_duplicates()
    # drop redundant id columns
    churn_df = (churn_df.drop('contract_type_id', axis=1)
                        .drop('internet_service_type_id', axis=1)
                        .drop('payment_type_id', axis=1))
    # collapse 'No internet service' and 'No phone service' to plain 'No'
    churn_df.replace('No internet service', 'No', inplace=True)
    churn_df.replace('No phone service', 'No', inplace=True)
    # blank strings become NaN; drop them and cast total_charges to float
    churn_df.replace(r'^\s*$', np.nan, regex=True, inplace=True)
    churn_df = churn_df.dropna(axis=0)
    churn_df.total_charges = churn_df.total_charges.astype(float)
    # get features and target
    target = 'churn'
    features = churn_df.columns.tolist()
    features.remove(target)
    features.remove('customer_id')
    # change churn and senior_citizen columns to boolean
    churn_df['churn'] = LabelEncoder().fit_transform(churn_df['churn']).astype(bool)
    churn_df.senior_citizen = churn_df.senior_citizen.astype(bool)
    # create new e-check column
    churn_df['e_check'] = churn_df.payment_type == 'Electronic check'
    # remove total_charges from the features
    features.remove('total_charges')
    # remove columns with little effect on tenure
    features.remove('gender')
    features.remove('phone_service')
    features.remove('payment_type')
    features.remove('contract_type')
    features.remove('internet_service_type')
    features.remove('multiple_lines')
    # encode yes/no columns as booleans
    for i in features:
        if sorted(churn_df[i].unique().tolist()) == ['No', 'Yes']:
            churn_df[i] = churn_df[i] == 'Yes'
    # one-hot encode contract type and internet service type (default join
    # is on the index, which is what we want here)
    churn_df = (churn_df.join(pd.get_dummies(churn_df.contract_type))
                        .join(pd.get_dummies(churn_df.internet_service_type)))
    # add the dummy columns to features
    new_features = pd.get_dummies(churn_df.contract_type).columns.tolist()
    new_features += pd.get_dummies(churn_df.internet_service_type).columns.tolist()
    features += new_features
    # split data
    train, test = split_scale.split_my_data(churn_df, stratify=churn_df.churn)
    return train, test, features, target
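# Usage sketch for the pipeline above (split_scale.split_my_data is a
# project-local helper; its train/test return shape is assumed from how
# clean_telco_data uses it):
train, test, features, target = clean_telco_data()
X_train, y_train = train[features], train[target]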
def get_zillow(sql):
    url = get_db_url('zillow')
    zillow_df = pd.read_sql(sql, url, index_col='id')
    return zillow_df
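# Usage sketch: unlike the other helpers, get_zillow takes the SQL as a
# parameter, so the caller chooses the query. The query below is illustrative
# only, and index_col='id' assumes the result set includes an `id` column:
df = get_zillow("SELECT * FROM properties_2017 LIMIT 100")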