def prep_titanic():
    '''
    Load the titanic data with get_titanic_data() and return ONE cleaned df
    (this version does not split into train/validate/test).

    Steps: rows with a null `embarked` are removed, `sex` and `embarked` are
    dummy-encoded (first level dropped), the id/text/deck columns are dropped,
    and null ages are replaced with the mean age of the remaining rows.
    '''
    raw = get_titanic_data()

    # keep only the rows that have an embarked value
    kept = raw[raw.embarked.notnull()]

    # dummy-encode sex & embarked and attach the new columns
    encoded = pd.get_dummies(kept[['sex', 'embarked']], drop_first=True)
    df = pd.concat([kept, encoded], axis=1)

    # drop deck together with the id column and the now-encoded text columns
    unwanted = [
        'passenger_id', 'deck', 'sex', 'embarked', 'class', 'embark_town'
    ]
    df = df.drop(columns=unwanted)

    # mean-impute age; the imputer is fit and applied on the same column
    mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    df[['age']] = mean_imputer.fit(df[['age']]).transform(df[['age']])

    return df
def prep_titanic(cached=True):
    '''
    Acquire the titanic data (forwarding `cached` to get_titanic_data) and
    return it as prepped train, validate, and test dfs.
    '''
    source = get_titanic_data(cached)

    # rows without an embarked value are discarded
    has_port = source.embarked.notnull()
    source = source[has_port]

    # dummy-encode embarked (first level dropped), append the columns,
    # and remove the deck column
    port_dummies = pd.get_dummies(source.embarked, drop_first=True)
    df = pd.concat([source, port_dummies], axis=1).drop(columns='deck')

    # split first, then hand the three pieces to impute_mean_age
    train, validate, test = titanic_split(df)
    train, validate, test = impute_mean_age(train, validate, test)

    return train, validate, test
def prep_titanic_data(cached=True):
    '''
    Acquire the titanic data (passing `cached` through to get_titanic_data),
    clean and encode it, and return train, validate, and test dfs.
    '''
    acquired = get_titanic_data(cached)

    # only rows with a known embarked value are kept
    acquired = acquired[acquired.embarked.notnull()]

    # sex & embarked become dummy columns (first level of each dropped)
    dummies = pd.get_dummies(acquired[['sex', 'embarked']], drop_first=True)

    # append the dummies, then remove deck, passenger_id and the text columns
    redundant = [
        'deck', 'sex', 'embarked', 'class', 'embark_town', 'passenger_id'
    ]
    df = pd.concat([acquired, dummies], axis=1).drop(columns=redundant)

    # split, then pass the three splits through impute_mean_age
    train, validate, test = titanic_split(df)
    train, validate, test = impute_mean_age(train, validate, test)

    return train, validate, test
def prep_titanic(df=None):
    """
    prep_titanic accepts the titanic dataset and returns a transformed titanic
    dataset for exploratory analysis.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Untransformed titanic data. When omitted, the data is loaded with
        get_titanic_data() at call time.

    Returns
    -------
    pandas.DataFrame
        A new frame without the `deck` column and without object-dtype
        columns, with drop-first dummy columns for `embark_town`, `class`
        and `sex` appended. The frame passed in is not modified.
    """
    # Load lazily. A `df=get_titanic_data()` default would be evaluated once,
    # when the function is defined, and that single frame would then be shared
    # (and altered) by every call that relies on the default.
    if df is None:
        df = get_titanic_data()

    # NOTE: rows with a missing `embarked` value are NOT dropped here.

    # Throw the deck overboard because there are too many missing values.
    # Non-inplace so the caller's frame keeps its column and repeat calls work.
    df = df.drop(columns=['deck'])

    # Create dummy variables (first level dropped) before the text columns
    # are filtered out below.
    encoded_embarked = pd.get_dummies(df.embark_town, drop_first=True)
    encoded_class = pd.get_dummies(df['class'], drop_first=True)
    encoded_sex = pd.get_dummies(df.sex, drop_first=True)

    # Keep only the non-object columns; age and fare are left unscaled.
    df = df.select_dtypes(exclude='O')

    # Add the encoded columns to the dataframe.
    df = pd.concat([df, encoded_embarked, encoded_class, encoded_sex], axis=1)

    return df
Example #5
0
def prep_titanic_data(splain=local_settings.splain, **kwargs):
    '''
    prep_titanic_data(splain=local_settings.splain, **kwargs)
    RETURNS: df, encoder, scaler

    Loads the titanic data with get_titanic_data(splain=splain) and prepares it:

    1. Drops the deck, embarked and passenger_id columns.
    2. Passes embark_town through simpute() (presumably fills its missing
       values -- confirm against simpute).
    3. Passes embark_town through encode_col(), which also returns the encoder.
    4. Min-max scales the age and fare columns in place.

    Extra keyword arguments are accepted but not used.
    '''
    frame = get_titanic_data(splain=splain)
    frame.drop(columns=['deck', 'embarked','passenger_id'], inplace=True)

    frame = simpute(df=frame, column='embark_town', splain=splain)
    frame, encoder = encode_col(df=frame, col='embark_town')

    # fit the scaler on age/fare, then overwrite both columns with scaled values
    numeric_cols = ['age','fare']
    scaler = MinMaxScaler()
    frame[numeric_cols] = scaler.fit(frame[numeric_cols]).transform(frame[numeric_cols])

    return frame, encoder, scaler
def prep_titanic_data(cached=True):
    '''
    Takes the titanic data, does data prep, and returns
    train, validate, and test data splits (in that order).

    `cached` is forwarded unchanged to get_titanic_data.
    '''

    # use my acquire function to read data into a df from a csv file
    df = get_titanic_data(cached)

    # drop rows where embarked/embark town are null values
    df = df[~df.embarked.isnull()]

    # encode embarked and sex using dummy columns
    titanic_dummies = pd.get_dummies(df[['sex', 'embarked']], drop_first=True)

    # join dummy columns back to df
    df = pd.concat([df, titanic_dummies], axis=1)

    # drop deck, passenger_id and the text columns that are now encoded
    df = df.drop(columns=[
        'deck', 'sex', 'embarked', 'class', 'embark_town', 'passenger_id'
    ])

    # split data into train, validate, test dfs
    train, validate, test = titanic_split(df)

    # hand the splits to impute_mean_age (age imputation, per its name)
    train, validate, test = impute_mean_age(train, validate, test)

    return train, validate, test


#################### Scale Any Data Set ##################
# Defined at module level: nested after the `return` above it was unreachable
# and could never be called.
def add_scaled_columns(train, validate, test, scaler, columns_to_scale):
    '''
    Fit `scaler` on the train split only, then return train, validate, and
    test with one extra `<column>_scaled` column per entry of
    `columns_to_scale`. The original columns are kept and the inputs are
    not modified.

    `scaler` must provide fit(X) and transform(X).
    '''
    new_column_names = [c + '_scaled' for c in columns_to_scale]

    # fit on train only, so validate/test are scaled with train's parameters
    scaler.fit(train[columns_to_scale])

    def _with_scaled(split):
        # reuse the split's index so the scaled columns align row-for-row
        scaled = pd.DataFrame(scaler.transform(split[columns_to_scale]),
                              columns=new_column_names,
                              index=split.index)
        return pd.concat([split, scaled], axis=1)

    return _with_scaled(train), _with_scaled(validate), _with_scaled(test)
def prep_titanic():
    '''
    Load the titanic data via acquire, remove rows with a null `embarked`
    or `embark_town`, drop four columns, and return the result of
    split_data() unpacked as (train, test, validate).
    '''
    raw = acquire.get_titanic_data()

    # two filtering passes: embarked first, then embark_town
    raw = raw[raw.embarked.notnull()]
    raw = raw[raw.embark_town.notnull()]

    trimmed = raw.drop(
        columns=['passenger_id', 'pclass', 'embark_town', 'deck'])

    train, test, validate = split_data(trimmed)
    return train, test, validate
Example #8
0
def titanic_prep(cached=True):
    '''
    Load the titanic data via acquire and return prepped train, validate,
    and test dfs.

    NOTE: `cached` is accepted but not used; the data is always requested
    with acquire.get_titanic_data() and no arguments.
    '''
    source = acquire.get_titanic_data()

    # discard rows that have no embarked value
    source = source[source.embarked.notnull()]

    # dummy-encode embarked (first level dropped), append, then drop deck
    port_dummies = pd.get_dummies(source.embarked, drop_first=True)
    df = pd.concat([source, port_dummies], axis=1).drop(columns='deck')

    # split, then pass the three splits through impute_mean_age
    train, validate, test = titanic_split(df)
    train, validate, test = impute_mean_age(train, validate, test)

    return train, validate, test
Example #9
0
def prep_titanic_exercise():
    '''
    Load the titanic data via acquire and return a single prepared df:
    rows with a null `embark_town` removed, `deck` dropped, one dummy column
    per `embarked` value appended, and an `impute_age` column holding `age`
    with nulls replaced by the mean age. The original `age` column is kept.
    '''
    titanic = acquire.get_titanic_data()
    titanic = titanic[~titanic.embark_town.isnull()]

    # DataFrame.drop returns a new frame; the result has to be kept,
    # otherwise `deck` silently stays in the data.
    titanic = titanic.drop(columns=['deck'])

    # no drop_first here: every embarked level gets its own column
    titanic_dummies = pd.get_dummies(titanic['embarked'])
    titanic = pd.concat([titanic, titanic_dummies], axis=1)

    # mean-impute age into a new column
    imputer = SimpleImputer(strategy='mean')
    imputer = imputer.fit(titanic[['age']])
    titanic['impute_age'] = imputer.transform(titanic[['age']])
    return titanic
Example #10
0
def prep_titanic():
    '''
    Load the titanic data via acquire, clean and encode it, and return
    train, validate, and test dfs.
    '''
    acquired = acquire.get_titanic_data()

    # rows lacking an embarked value are removed
    acquired = acquired[acquired.embarked.notnull()]

    # embarked & sex become dummy columns (first level of each dropped)
    dummies = pd.get_dummies(acquired[['embarked', 'sex']], drop_first=True)

    # append the dummies, then remove deck, the id and the raw text columns
    redundant = ['deck', 'passenger_id', 'sex', 'embarked', 'embark_town', 'class']
    df = pd.concat([acquired, dummies], axis=1).drop(columns=redundant)

    # split, then pass the three splits through impute_mean_age
    train, validate, test = titanic_split(df)
    train, validate, test = impute_mean_age(train, validate, test)

    return train, validate, test
Example #11
0
def prep_titanic():
    '''
    Load the titanic data via acquire and return (train, test).

    deck, class and embark_town are dropped, null `embarked` values become
    'S' and the column is cast with astype("|S"). The data is split 80/20
    (random_state=123) and both splits are then passed through
    encode_embarked, scale_age_and_fare and fillna_age, in that order.
    '''
    df = acquire.get_titanic_data().drop(
        columns=['deck', 'class','embark_town'])

    # fill missing ports with 'S', then cast the column to a bytes-string dtype
    df.embarked = df.embarked.fillna('S').astype("|S")

    train, test = sklearn.model_selection.train_test_split(
        df, train_size=.8, random_state=123)

    # each helper takes both splits and returns both splits
    train, test = encode_embarked(train, test)
    train, test = scale_age_and_fare(train, test)
    train, test = fillna_age(train, test)

    return train, test
def prep_titanic():
    '''
    Load the titanic data via acquire and return (train, test, int_encoder).

    passenger_id becomes the index, null `embarked` / `embark_town` values
    are filled with 'Other', and `deck` is dropped. The data is split 70/30,
    stratified on `survived` (random_state=123). On the splits:

    - a most-frequent imputer is fit on train and applied to both splits
      for `embarked` and `embark_town`,
    - `embarked` is label-encoded into a new `embarked_encoded` column
      (encoder fit on train only),
    - `age` and `fare` are min-max scaled (scaler fit on train only).

    int_encoder is the fitted LabelEncoder.
    '''
    # Acquire titanic dataset
    df_titanic = acquire.get_titanic_data()

    # Make the passenger_id the index of the dataset
    df_titanic.set_index('passenger_id', inplace=True)

    # Fill null values with the placeholder 'Other'. Assigning the filled
    # column back (rather than an inplace fillna on `df.col`) makes sure the
    # frame itself is updated, including under pandas Copy-on-Write.
    df_titanic['embark_town'] = df_titanic['embark_town'].fillna('Other')
    df_titanic['embarked'] = df_titanic['embarked'].fillna('Other')

    # Deck column had 688 null values out of 891 rows.
    # Because the majority of values are empty we do not have enough
    # information to go off of, so 'deck' is dropped from this analysis.
    df_titanic.drop(columns=['deck'], inplace=True)

    # Split dataframe into train, test
    train, test = train_test_split(df_titanic,
                                   test_size=.3,
                                   random_state=123,
                                   stratify=df_titanic.survived)

    # Independent copies, so the column assignments below write to the
    # splits themselves and not to views of df_titanic.
    train = train.copy()
    test = test.copy()

    # Most-frequent imputation for embarked / embark_town: fit on train,
    # apply to both splits. The train result is now stored (it used to be
    # discarded). After the 'Other' fill above no NaN is left in these
    # columns, so this only matters if that fill is removed.
    port_cols = ['embarked', 'embark_town']
    imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    train[port_cols] = imp_mode.fit_transform(train[port_cols])
    test[port_cols] = imp_mode.transform(test[port_cols])

    # Change categorical values in 'embarked' to numerical values.
    # LabelEncoder works on 1-D input, so pass the Series, not a 1-col frame.
    int_encoder = LabelEncoder()
    int_encoder.fit(train['embarked'])
    train['embarked_encoded'] = int_encoder.transform(train['embarked'])
    test['embarked_encoded'] = int_encoder.transform(test['embarked'])

    # Scale age and fare using MinMaxScaler (fit on train only)
    scaler = MinMaxScaler()
    train[['age', 'fare']] = scaler.fit_transform(train[['age', 'fare']])
    test[['age', 'fare']] = scaler.transform(test[['age', 'fare']])

    return train, test, int_encoder
Example #13
0
def prep_titanic():
    '''
    Load the titanic data via acquire and return one prepared df.

    Null `embark_town` values become 'Other', null `embarked` values become
    'Unknown', `deck` is dropped, `embarked` is replaced by its label-encoded
    integer codes, and `age` and `fare` are each min-max scaled.
    '''
    df = acquire.get_titanic_data()

    # Assign the filled columns back instead of calling fillna(inplace=True)
    # on `df.col`: the chained inplace form acts on an intermediate Series
    # and does not update df under pandas Copy-on-Write.
    df['embark_town'] = df['embark_town'].fillna('Other')
    df['embarked'] = df['embarked'].fillna('Unknown')
    df.drop(columns=['deck'], inplace=True)

    # overwrite embarked with integer labels
    encoder = LabelEncoder()
    df.embarked = encoder.fit_transform(df.embarked)

    # age and fare get their own scaler each
    scaler = MinMaxScaler()
    df.age = scaler.fit_transform(df[['age']])

    scaler = MinMaxScaler()
    df.fare = scaler.fit_transform(df[['fare']])

    return df
Example #14
0
def prep_titanic():
    '''
    Load the titanic data via acquire, remove rows with a null `embarked`
    or `embark_town`, dummy-encode `embark_town` and `sex` (first level
    dropped), drop five columns, and return the result of split_data()
    unpacked as (train, test, validate).
    '''
    raw = acquire.get_titanic_data()

    # two filtering passes: embarked first, then embark_town
    raw = raw[raw.embarked.notnull()]
    raw = raw[raw.embark_town.notnull()]

    # separate dummy frames for the town and for sex
    town_dummies = pd.get_dummies(raw[['embark_town']], drop_first = True)
    sex_dummies = pd.get_dummies(raw[['sex']], drop_first = True)
    combined = pd.concat([raw, town_dummies, sex_dummies], axis = 1)

    # embark_town itself stays in the frame; only these five are removed
    trimmed = combined.drop(
        columns = ['passenger_id','pclass', 'embarked', 'deck', 'sex'])

    train, test, validate = split_data(trimmed)
    return train, test, validate
def prep_titanic():
    '''
    Load the titanic data via acquire and return one prepared df.

    Null `embark_town` values become 'Other', null `embarked` values become
    'Unknown', `deck` is dropped, `embarked` is replaced by its label-encoded
    integer codes, and `fare` and `age` are min-max scaled.
    '''
    df_titanic = acquire.get_titanic_data()

    # Assign the filled columns back instead of calling fillna(inplace=True)
    # on `df.col`, which acts on an intermediate Series and does not update
    # the frame under pandas Copy-on-Write.
    df_titanic['embark_town'] = df_titanic['embark_town'].fillna('Other')
    df_titanic['embarked'] = df_titanic['embarked'].fillna('Unknown')
    df_titanic.drop('deck', inplace=True, axis=1)

    # overwrite embarked with integer labels
    lab_enc = LabelEncoder()
    lab_enc.fit(df_titanic.embarked)
    df_titanic.embarked = lab_enc.transform(df_titanic.embarked)

    # The scaler output has two columns (fare, age), so it must be written
    # back to both; assigning it to `fare` alone fails and leaves age unscaled.
    scaled_cols = ['fare', 'age']
    scaler = MinMaxScaler()
    scaler.fit(df_titanic[scaled_cols])
    df_titanic[scaled_cols] = scaler.transform(df_titanic[scaled_cols])
    return df_titanic


#USE df.nunique()<5 instead of this temp list
# def pick_viable_categories(df):
#     discretes = df.select_dtypes(include='object')
#     temp = []
#     for column in discretes:
#         columnSeriesObj = discretes[column]
#         if len(columnSeriesObj.unique()) < 4:
#             temp.append(columnSeriesObj.name)
#     return temp

# def plot_viable_categories(target, df):
#     x = pick_viable_categories(df)
#     _, ax = plt.subplots(nrows=1, ncols=len(x), figsize=(16,5))
#     average_rate = df.target.mean()
#     for i, feature in enumerate(x):
#         sns.barplot(feature, target, data=df_titanic, ax=ax[i], alpha=.5)
#         ax[i].set_ylabel('average_rate')
#         ax[i].axhline(average_rate, ls='--', color='grey')

# def pick_viable_regressors():
#     regressors = df_titanic.select_dtypes(include=['float64','int64'])
#     temp = []
#     for column in regressors:
#         columnSeriesObj = regressors[column]
#         temp.append(columnSeriesObj.name)
#     return temp
import pandas as pd

from acquire import get_titanic_data
from prepare import prep_titanic_data


def set_features(df, target, *features):
    """
    Split a dataframe into a feature frame X and a target frame y.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the target column.
    target : str
        Name of the target column.
    *features : str
        Names of the feature columns. When none are given, the default set
        ['pclass', 'age', 'fare', 'sibsp', 'parch'] is used.

    Returns
    -------
    (X, y) : tuple of pandas.DataFrame
        X holds the feature columns; y is a one-column frame with `target`.
    """
    # Honour the caller's feature names; fall back to the historical
    # hard-coded set so existing no-feature calls keep working.
    columns = list(features) if features else [
        'pclass', 'age', 'fare', 'sibsp', 'parch'
    ]
    X = df[columns]
    y = df[[target]]
    return X, y


# Acquire the raw data and run it through the prepare step
df = prep_titanic_data(get_titanic_data())

# Target column and feature columns, then the X / y frames built from them
target = 'survived'
features = ['pclass', 'age', 'fare', 'sibsp', 'parch']
X, y = set_features(df, target, *features)
Example #17
0
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

from acquire import get_titanic_data
from prepare import prepare_titanic_data

# Load the titanic data and pass it through the prepare step.
df = prepare_titanic_data(get_titanic_data())

# Feature frame: five columns selected by name.
X = df[['pclass','age','fare','sibsp','parch']]
# Target kept as a one-column DataFrame (double brackets), not a Series.
y = df[['survived']]

# Hold out 30% of the rows as the test set, with a fixed random_state.
# NOTE(review): train_test_split is not imported in the visible lines -- confirm it is in scope.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

# Notebook-style peek at the training features (no effect when run as a script).
X_train.head()

# Create the logistic regression object
# class_weight={1:2} gives class 1 twice the weight.
logit = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver='saga')

# Fit the model to the training data
logit.fit(X_train, y_train)

# Show the fitted coefficients.
print('Coefficient: \n', logit.coef_)
Example #18
0
    df = acquire.get_iris_data()
    df = df.drop(columns=['species_id', 'species_id.1']).rename(
        columns={'species_name': 'species'})
    species_dummies = pd.get_dummies(df.species, drop_first=True)
    df = pd.concat([df, species_dummies], axis=1)
    return df


# In[9]:

# Run the iris prep function and look at three random rows.
prepped = iris_prep()
prepped.sample(3)

# In[26]:

# Load the titanic data and preview it.
titanic = acquire.get_titanic_data()
titanic.head()

# In[27]:

## handling nulls
# Select the rows where embark_town / embarked are null (display only;
# neither result is stored).
titanic[titanic.embark_town.isnull()]
titanic[titanic.embarked.isnull()]

# In[28]:

# Keep only the rows that have an embarked value, then print the frame summary.
titanic = titanic[~titanic.embarked.isnull()]
titanic.info()

# In[29]:
def encode_species_col(iris_df):
    """
    Return a copy of `iris_df` with an added `species_encode` column holding
    the label-encoded values of the `species` column.
    """
    from sklearn.preprocessing import LabelEncoder

    # fit on the species column, then encode that same column
    species_codes = LabelEncoder().fit(iris_df.species).transform(iris_df.species)
    return iris_df.assign(species_encode=species_codes)

def prep_iris(iris_df):
    """
    Run `iris_df` through drop_columns, rename_columns and
    encode_species_col, in that order, and return the result.
    """
    trimmed = iris_df.pipe(drop_columns)
    renamed = trimmed.pipe(rename_columns)
    return renamed.pipe(encode_species_col)

# 2. Titanic Data
# Use the function you defined in acquire.py to load the titanic data set.
from acquire import get_titanic_data
titanic_df = get_titanic_data()
# print(titanic_df)
# Write the code to perform the operations below. (Do this yourself, don't copy from the curriculum.)

# a. Handle the missing values in the embark_town and embarked columns.
# Nulls in both columns are replaced with the placeholder 'Unknown'. The
# filled column is assigned back: fillna(inplace=True) on `df.col` acts on an
# intermediate Series and does not update the frame under pandas Copy-on-Write.
# print(titanic_df['embark_town'].unique())
titanic_df['embark_town'] = titanic_df['embark_town'].fillna(value='Unknown')
# print(titanic_df)
# print(titanic_df['embarked'].unique())
titanic_df['embarked'] = titanic_df['embarked'].fillna(value='Unknown')


# b. Remove the deck column.
titanic_df = titanic_df.drop(['deck'], axis=1)
# print(titanic_df)
Example #20
0
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import acquire as a
import prepare as p

# Acquire the titanic data and pass it straight through the prepare step.
df = p.prep_titanic_data(a.get_titanic_data())

def loopy_graphs(df, target):
    """
    Show one bar plot of `target` per low-cardinality or object column.

    A column is plotted when its dtype is object or it has fewer than five
    distinct values. Each plot gets its own figure via plt.show().

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    target : str
        Column plotted on the y axis of every bar plot.
    """
    features = list(df.columns[(df.dtypes == object) | (df.nunique() < 5)])

    for feature in features:
        # x / y are passed by keyword: seaborn 0.12+ no longer accepts them
        # positionally.
        sns.barplot(x=feature, y=target, data=df, alpha=.6)
        plt.show()

def plot_violin(features, target, df):
    for descrete in df[features].select_dtypes([object,int]).columns.tolist():
        if df[descrete].nunique() <= 5: