Ejemplo n.º 1
0
def clean_telco(cached=False):
    df = get_telco_data()
    # use my aquire function to read data into a df from a csv file
    # drop duplicates
    df.drop_duplicates(inplace=True)
    # drop and rename columns
    df = df.drop(columns=[
        'payment_type_id', 'contract_type_id.1', 'contract_type_id',
        'internet_service_type_id'
    ])
    df['years_tenure'] = df.tenure / 12
    df['has_streaming'] = df["streaming_tv" or "streaming_movies"] == 'Yes'
    df['is_family'] = df["partner" or "dependents"] == 'Yes'
    df['has_phones'] = df['phone_service' or 'multiple_lines'] == 'Yes'
    df['has_security_features'] = df['online_security'
                                     or 'online_backup'] == 'Yes'
    df['years_tenure'] = df.tenure / 12
    # create dummy columns for churn
    telco_dummies = pd.get_dummies(df.churn, drop_first=True)
    # add dummy columns to df
    df = pd.concat([df, telco_dummies], axis=1)
    # rename dummy columns
    df = df.rename(columns={'Yes': 'is_churn'})

    return df
Ejemplo n.º 2
0
def prep_telco(cached=True):

    df = get_telco_data(cached)
    df = df.drop(columns=[
        'customer_id', 'online_security', 'online_backup', 'device_protection',
        'tech_support', 'internet_service_type_id', 'contract_type_id',
        'payment_type_id', 'partner', 'dependents', 'multiple_lines',
        'streaming_tv', 'streaming_movies', 'total_charges'
    ])
    df_dummies = pd.get_dummies(df.contract_type)
    df = pd.concat([df, df_dummies], axis=1)
    df = df.drop(columns='contract_type')
    df_dummies = pd.get_dummies(df.internet_service_type)
    df = pd.concat([df, df_dummies], axis=1)
    df = df.drop(columns='internet_service_type')
    df_dummies = pd.get_dummies(df.payment_type)
    df = pd.concat([df, df_dummies], axis=1)
    df = df.drop(columns='payment_type')
    df.loc[df['phone_service'] == 'No', 'phone_service'] = 0
    df.loc[df['phone_service'] == 'Yes', 'phone_service'] = 1
    df.loc[df['paperless_billing'] == 'No', 'paperless_billing'] = 0
    df.loc[df['paperless_billing'] == 'Yes', 'paperless_billing'] = 1
    df.loc[df['churn'] == 'No', 'churn'] = 0
    df.loc[df['churn'] == 'Yes', 'churn'] = 1
    df_dummies = pd.get_dummies(df.gender)
    df = pd.concat([df, df_dummies], axis=1)
    df = df.drop(columns='gender')
    df['phone_service'] = df.phone_service.astype('int')
    df['paperless_billing'] = df.paperless_billing.astype('int')
    df['churn'] = df.churn.astype('int')

    return df
Ejemplo n.º 3
0
def clean_telco():
    '''
    clean_telco will take in the telco_churn data as df, replace "No service" string with "No", convert categorical variable into dummy/indicator variables, drop columns with duplicate  information as well as statistically invalid columns, normalize the column titles, rename some titles for  legibility, convert total_charges to dtype='float64' and fill null values in total_charges with 0 since the customers tenure was 0 and have yet to pay their first bill. Finally,this function converts 'Yes' to 1 and 'No' to 0.
    '''
    df = get_telco_data()
    df.replace('No phone service', 'No', inplace=True)
    df.replace('No internet service', 'No', inplace=True)
    dummy_df = pd.get_dummies(df[['gender', 'contract_type', 'payment_type']],
                              drop_first=True)
    dummy2_df = pd.get_dummies(df[['internet_service_type']], drop_first=False)
    dropcols = [
        'gender', 'payment_type_id', 'internet_service_type_id',
        'contract_type_id', 'internet_service_type_none'
    ]
    df = pd.concat([df, dummy_df, dummy2_df], axis=1)
    df.columns = df.columns.str.lower().str.replace(' ', '_').str.replace(
        '(', '').str.replace(')', '')
    df.drop(columns=dropcols, inplace=True)
    df.columns = [
        'customer_id', 'senior_citizen', 'partner', 'dependents', 'tenure',
        'phone_service', 'multiple_lines', 'online_security', 'online_backup',
        'device_protection', 'tech_support', 'streaming_tv',
        'streaming_movies', 'paperless_billing', 'monthly_charges',
        'total_charges', 'churn', 'contract_type', 'internet_service_type',
        'payment_type', 'gender_male', 'one_year_contract',
        'two_year_contract', 'credit_card_payment', 'e_check_payment',
        'mailed_check_payment', 'dsl_internet', 'fiber_optic_internet'
    ]
    df.total_charges = pd.to_numeric(df.total_charges,
                                     errors='coerce').astype('float64')
    df.total_charges = df.total_charges.fillna(value=0)
    df = df.replace("Yes", 1).replace("No", 0)
    return df
Ejemplo n.º 4
0
def wrangle_telco():
    df = get_telco_data()
    df['total_charges'] = df.total_charges.where((df.tenure != 0), 0)
    df['total_charges'] = df.total_charges.astype(float)
    train_validate, test = train_test_split(df, test_size=.2, random_state=123)
    train, validate = train_test_split(train_validate,
                                       test_size=.3,
                                       random_state=123)
    return train, validate, test
Ejemplo n.º 5
0
def telco_data_prep(data_split=False):
    '''
    Load telco_data as a pandas DataFrame
    Clean numeric types and creates dummy variables
    '''

    # Call function to load the telco dataset
    df = get_telco_data()

    # Clean and cast total_charges column from type object to float
    df.total_charges = df.total_charges.str.strip()

    # New customers that have recently signed up have value of ''. '' == 0 tenure and 0 total charges
    df.total_charges = df.total_charges.replace(
        '', 0)  # Replace empty strings with 0, np.nan iteration 2
    df.total_charges = df.total_charges.astype(
        "float")  # Cast the entire column from type string to float

    # Drop the observations where customers have not paid their first month with telco
    df.dropna(inplace=True)

    # Replace target variable strings('Yes'/'No') with int's(1/0)
    df.churn = np.where(df.churn == 'Yes', 1, 0)

    # Clean tenure columns
    df.rename(columns={'tenure': 'tenure_in_months'}, inplace=True)
    df['tenure_in_years'] = round(df.tenure_in_months / 12, 2)

    # Collect the column name where values are categorical/strings/objects
    encoded_columns = df.nunique()[df.nunique() <= 4].index.to_list(
    )  # Columns with 4 or less unique values
    encoded_columns.remove(
        'churn'
    )  # remove churn from this list, we've already converted the values to binary outcomes
    encoded_attributes = pd.get_dummies(
        df[encoded_columns], drop_first=True
    )  # with the remaining object columns, create dummy variables.

    df = pd.concat(
        [df, encoded_attributes],
        axis=1)  # concat the original dataframe with the encoded attributes.

    # encoded_columns.append('customer_id')  # append categorical columns with customer_id
    df.drop(
        columns=encoded_columns,
        inplace=True)  # drop all columns that are represented by pd.dummies
    churn = df[['churn']]

    df.drop(columns='churn', inplace=True)
    df = pd.concat([df, churn], axis=1)

    return df
def wrangle_telco():
    '''
    wrangle_telco will read in our telco data for two year contract customers via a sql query, clean the data down to 
    monthly charges, total charges, tenure, and customer id,
    replace whitespace values in total charges with zeros where appropriate
    and then split the data
    
    return: train, validate, and test sets of telco data
    '''
    df = clean_telco(acquire.get_telco_data())
    return split_data(df)
    
    
Ejemplo n.º 7
0
def wrangle_telco():
    df = acquire.get_telco_data()
    df.tenure.replace(0, 1, inplace=True)
    df.total_charges = df.total_charges.str.strip()
    df.total_charges.replace('', df.monthly_charges, inplace=True)
    df.total_charges = df.total_charges.astype(float)
    df['automatic_payment'] = ((df['payment_type_id'] == 3) |
                               (df['payment_type_id'] == 4))
    scaler = MinMaxScaler()
    df['monthly_charges_scaled'] = scaler.fit_transform(
        df['monthly_charges'].values.reshape(-1, 1))
    df['tenure_3_or_less'] = df['tenure'] <= 3
    return df
Ejemplo n.º 8
0
def wrangle_telco():
    '''
    This function acquires/loads `telco_churn` data from a SQL database using SQL and Pandas. Cleans the acquired
    data and returns a dataframe to be used in EDA and Modeling.
    '''

    df = get_telco_data()
    df['total_charges'] = df.total_charges.str.strip()
    df.total_charges.replace("", 0, inplace=True)
    df['total_charges'] = df.total_charges.astype('float')
    df = df[['customer_id', 'tenure', 'monthly_charges', 'total_charges']]

    train, validate, test = split_data(df)
    return train, validate, test
Ejemplo n.º 9
0
def wrangle_telco():
    '''
    Transforms data brought in from SQL to handle nulls in tenure
    Turn the total_charges column to a float
    Add calculated fields
    '''
    df = acquire.get_telco_data()
    df.tenure.replace(0, 1, inplace=True)
    df.total_charges = df.total_charges.str.strip()
    df.total_charges.replace('', df.monthly_charges, inplace=True)
    df.total_charges = df.total_charges.astype(float)
    df['automatic_payment'] = ((df['payment_type_id'] == 3) | (df['payment_type_id'] == 4))
    scaler = MinMaxScaler()
    df['monthly_charges_scaled'] = scaler.fit_transform(df['monthly_charges'].values.reshape(-1,1))
    df['tenure_3_or_less'] = df['tenure']<=3
    return df
Ejemplo n.º 10
0
def prep_telco_data():
    # Use acquire file to import telco data
    df = get_telco_data()
    # Drop duplicates if there are any
    df.drop_duplicates(inplace=True)
    #
    payment_type_cols = [1, 2, 3, 4]
    df['is_automatic_payment'] = df.payment_type_id.replace(
        payment_type_cols, [0, 0, 1, 1])
    no_internet = [
        'streaming_movies', 'tech_support', 'streaming_tv', 'online_security',
        'online_backup', 'device_protection'
    ]
    df[no_internet] = df[no_internet].replace({
        'No internet service': 0,
        'No': 1,
        'Yes': 2
    })
    binary_cols = [
        'churn', 'partner', 'dependents', 'phone_service', 'multiple_lines',
        'paperless_billing'
    ]
    df[binary_cols] = df[binary_cols].replace('Yes', 1).replace('No', 0)
    df.rename(columns={'gender': 'male'}, inplace=True)
    df['male'] = df['male'].replace('Male', 1).replace('Female', 0)
    df['total_charges'] = df.total_charges.where((df.tenure != 0), 0)
    df = df.astype({'total_charges': 'float64'})
    df['phone_lines'] = df['multiple_lines'].replace({
        'No phone service': 0,
        'No': 1,
        'Yes': 2
    })
    df.drop(columns='multiple_lines')
    df.rename(columns={'tenure': 'monthly_tenure'}, inplace=True)
    df['yearly_tenure'] = round(df.monthly_tenure / 12, 2)
    df['part_depend'] = df['partner'] + df['dependents']
    df = df.drop(df[((df['monthly_tenure'].sort_values() == 0))].index)
    internet_types = [1, 2, 3]
    df['fiber'] = df['internet_service_type_id'].replace(
        internet_types, [0, 1, 0])
    train_validate, test = train_test_split(df, test_size=.2, random_state=123)
    train, validate = train_test_split(train_validate,
                                       test_size=.3,
                                       random_state=123)
    return train, validate, test
Ejemplo n.º 11
0
def run_model():
    # Get the Data
    df = acquire.get_telco_data()

    # Prepare the Data
    df = prepare.drop_columns(df)
    df = prepare.fix_dtypes(df)

    # Add Features
    df = features.create_features(df)

    # Encode DataFrame
    df = encode.encode_df(df)

    # Select features to be used in the model
    cols = ['contract_type', 
            'tenure',
            'monthly_charges',
            'payment_type',
            'has_internet']

    X = df[cols]
    y = df.churn
    
    # Create and fit the model
    forest = RandomForestClassifier(n_estimators=100, 
                                      max_depth=9,
                                      random_state=123).fit(X, y)

    # Create a DataFrame to hold predictions
    results = pd.DataFrame(
        {'Costumer_ID': df.customer_id,
         'Model_Predictions': forest.predict(X),
         'Model_Probabilities': forest.predict_proba(X)[:,1]
        })

    # Generate csv
    results.to_csv('model_results.csv')

    return results
def model_and_prediction():
    # acquire data
    df = ac.get_telco_data()
    #  prepare data
    encoder, train, validate, test = pr.prep_telco(df)
    scaler, train, validate, test = pr.min_max_scaler(train, validate, test)
    # select desired features
    X = ['tenure', 'monthly_charges', 'contract_type_id', 'senior_citizen']
    y = ['churn']
    X_train = train[X]
    y_train = train[y]
    X_test = test[X]
    y_test = test[y]
    #scale the whole dataset
    df[["tenure", "monthly_charges"
        ]] = scaler.transform(df[["tenure", "monthly_charges"]])
    # create a model trained from train dataset
    rf = RandomForestClassifier(bootstrap=True,
                                class_weight=None,
                                criterion='gini',
                                min_samples_leaf=3,
                                n_estimators=100,
                                max_depth=20,
                                random_state=123)
    rf.fit(X_train, y_train)
    # Make prediction of each customer
    df['predicted'] = rf.predict(
        df[['tenure', 'monthly_charges', 'contract_type_id',
            'senior_citizen']])
    #Show prediction of each customer
    df["probability"] = rf.predict_proba(
        df[['tenure', 'monthly_charges', 'contract_type_id',
            'senior_citizen']])[:, 1]
    # encode int to string
    df['predicted'] = df.predicted.apply(lambda x: 'Yes' if x == 1 else 'No')
    # Select columns for csv file
    df = df.rename(columns={'churn': 'actual'})
    df = df[['customer_id', 'actual', 'predicted', "probability"]]
    # write dataframe to a csv file
    df.to_csv('prediction.csv')
Ejemplo n.º 13
0
def telco_data_prep():
    df = get_telco_data()
    df.total_charges = df.total_charges.str.strip()
    df.total_charges = df.total_charges.replace('', 0) 
    df.churn.replace(['Yes', 'No'], [1,0], inplace = True)
    return df
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

from acquire import get_telco_data


# In[93]:


telco=get_telco_data()


# In[91]:


#Getting just a look at what the data frame looks like by using .head()
telco.head()


# In[94]:


# Describing the numerical data
telco.describe()
Ejemplo n.º 15
0
                                                random_state=123)
    train, validate = train_test_split(train_and_validate,
                                       test_size=.15,
                                       random_state=123)
    return train, test, validate


# # Telco Churn Exercise Problem

# Throughout the exercises for Regression in Python lessons, you will use the following example scenario: As a customer analyst, I want to know who has spent the most money with us over their lifetime. I have monthly charges and tenure, so I think I will be able to use those two attributes as features to estimate total_charges. I need to do this within an average of $5.00 per customer.

# - The first step will be to acquire and prep the data.

# In[20]:

df = acquire.get_telco_data()

# In[21]:

df.shape

# In[22]:

df.head()

# In[23]:

df.dtypes

# In[24]:
Ejemplo n.º 16
0
def prep_telco():
    '''
    No arguments taken. Returns a df of the telco customer churn data in the Codeup SQL database.
    '''
    # acquire the data from csv (or sql if no csv exists) and assign it as a DataFrame
    df = get_telco_data()

    # set the index to be customer_id
    df = df.set_index('customer_id')

    # cut down on the complexity of data by combining the variables for add-on packages
    # into one variable, num_add_ons, that adds up the number of add-on services each customer has

    # map strings into 1s and 0s, 1 meaning that the customer has that add-on
    df.online_security = df.online_security.map({
        'Yes': 1,
        'No': 0,
        'No internet service': 0
    })
    df.online_backup = df.online_backup.map({
        'Yes': 1,
        'No': 0,
        'No internet service': 0
    })
    df.device_protection = df.device_protection.map({
        'Yes': 1,
        'No': 0,
        'No internet service': 0
    })
    df.tech_support = df.tech_support.map({
        'Yes': 1,
        'No': 0,
        'No internet service': 0
    })
    df.streaming_tv = df.streaming_tv.map({
        'Yes': 1,
        'No': 0,
        'No internet service': 0
    })
    df.streaming_movies = df.streaming_movies.map({
        'Yes': 1,
        'No': 0,
        'No internet service': 0
    })

    # add and put into new column, num_add_ons (number of add-on services)
    df['num_add_ons'] = (df.online_security + df.online_backup +
                         df.device_protection + df.tech_support +
                         df.streaming_tv + df.streaming_movies)

    # drop the add-on columns we don't need anymore.
    df = df.drop(columns=[
        'online_security', 'online_backup', 'device_protection',
        'tech_support', 'streaming_tv', 'streaming_movies'
    ])

    # encode phone_service and gender
    df.phone_service = df.phone_service.map({'Yes': 1, 'No': 0})
    # make new column is_male and encode
    df['is_male'] = df.gender.map({'Male': 1, 'Female': 0})
    # Now drop gender column
    df = df.drop(columns=['gender'])

    # encode partner, dependents, and churn
    df.partner = df.partner.map({'Yes': 1, 'No': 0})
    df.dependents = df.dependents.map({'Yes': 1, 'No': 0})
    df.churn = df.churn.map({'Yes': 1, 'No': 0})

    # Right now the contract types are map from 1 to 3, so I'm going to re-map it starting from 0.
    # 0 is Month-to-Month
    # 1 is Yearly
    # 2 is Two Year
    df.contract_type_id = df.contract_type_id.map({1: 0, 2: 1, 3: 2})

    # Going to re-map the internet_type_id column in the same way.
    # 0 is No Internet Service
    # 1 is DSL
    # 2 is Fiber Optic
    df.internet_service_type_id = df.internet_service_type_id.map({
        3: 0,
        1: 1,
        2: 2
    })

    # Now to add a column called tenure_yrs that represents tenure in years
    df['tenure_yrs'] = round((df.tenure / 12), 2)

    # Rename columns to shorter names for readability and ease of use:
    # contract_type_id --> contract_type
    # phone_service --> phone
    # internet_service_type_id --> internet_type
    # senior_citizen --> senior
    # dependents --> depend
    df = df.rename(
        columns={
            "contract_type_id": "contract_type",
            "phone_service": "phone",
            "internet_service_type_id": "internet_type",
            "senior_citizen": "senior",
            "dependents": "depend"
        })

    # convert total_charges to a float and drop the rows with null total_charges
    df['total_charges'] = pd.to_numeric(df['total_charges'], errors='coerce')
    df = df[~df.total_charges.isnull()]

    # Data is now tidy and ready to split into train, validate, and test.

    return df
Ejemplo n.º 17
0
def prep_telco_data(cached=True):
    # use my acquire function to read data into a df from a csv file
    df = pd.DataFrame(get_telco_data())
    #set index as customer_id
    df = df.set_index('customer_id')
    #rows full of Nan values
    rows_to_drop = ([
        6895, 6686, 6615, 6605, 6569, 6143, 2132, 2048, 2029, 1949, 1878
    ])
    #dropping the Nan values
    df.drop(df.index[rows_to_drop], inplace=True)
    #changing data type of total charges to float
    df.total_charges = pd.to_numeric(df.total_charges)
    # change device_protection into numeric values
    df.device_protection = df.device_protection.apply(online_checker)
    # change tech_suppport into numeric values
    df.tech_support = df.tech_support.apply(online_checker)
    # change online security into numeric values
    df.online_security = df.online_security.apply(online_checker)
    #change online backup into numeric values
    df.online_backup = df.online_backup.apply(online_checker)
    #change streaming tv into numeric values
    df.streaming_tv = df.streaming_tv.apply(online_checker)
    #change streaming movies into numeric values
    df.streaming_movies = df.streaming_movies.apply(online_checker)
    #change multiple lines into numeric values
    df.multiple_lines = df.multiple_lines.apply(online_checker)
    #create a new column with tenure by the year
    df['tenure_by_year'] = df.tenure / 12
    #change multiple_lines into a numeric value
    df.multiple_lines = df.multiple_lines.apply(online_checker)
    #create dummy values for gender, churn, and paperless_billing
    telco_dummies = pd.get_dummies(df[[
        'gender', 'partner', 'dependents', 'phone_service', 'churn',
        'paperless_billing'
    ]],
                                   drop_first=True)
    #create dummy values for internet
    internet_dummie = pd.get_dummies(df['internet_service_type_id'])
    #rename the dummy columns
    internet_dummie = internet_dummie.rename(columns={
        1: 'DSL',
        2: 'Fiber',
        3: 'No_internet'
    })
    #create dummy variable for payment
    payment_dummie = pd.get_dummies(df['payment_type_id'])
    #rename the dummy columns
    payment_dummie = payment_dummie.rename(columns={
        1: 'echeck',
        2: 'mailed_check',
        3: 'bank_transfer',
        4: 'credit_card'
    })
    #create dummy variable for contract type
    contract_dummie = pd.get_dummies(df['contract_type_id'])
    #rename dummy columns
    contract_dummie = contract_dummie.rename(columns={
        1: 'Month2month',
        2: '1year',
        3: '2year'
    })
    #add dummy values into the main dataframe
    df = pd.concat(
        [df, telco_dummies, payment_dummie, contract_dummie, internet_dummie],
        axis=1)
    #list duplicate columns
    col_to_drop = [
        'payment_type_id', 'gender', 'partner', 'dependents', 'phone_service',
        'churn', 'paperless_billing', 'contract_type', 'internet_service_type',
        'payment_type'
    ]
    #drop duplicate columns
    df = df.drop(columns=col_to_drop)
    #split data into train, validate and test subsets
    train, validate, test = telco_split(df)
    return train, validate, test
Ejemplo n.º 18
0
def prep_telco_churn_data(splain=local_settings.splain, **kwargs):
    '''
    prep_telco_churn_data(splain=local_settings.splain, **kwargs)
    RETURNS: df, encoder, scaler
    
    
    '''
    df = get_telco_data(splain=splain)
    df = df.set_index('customer_id')
    category_cols=[
        'partner_deps_id',
        'partner_deps',
        'gender',
        'contract_type_id',
        'internet_service_type_id',
        'internet_service_type',
        'contract_type',
        'payment_type_id',
        'payment_type',
        'phone_service_type',
        'phone_service_id'
    ]
    retype_cols(df=df, cols=category_cols, to_dtype='category')
    boolean_cols  = [
        'is_male',
        'is_female',
        'senior_citizen',
        'partner',
        'dependents',
        'family',
        'thru_first_month',
        'thru_first_quarter',
        'thru_first_half',
        'thru_first_year',
        'thru_first_term',
        'phone_service',
        'multiple_lines',
        'internet_service',
        'has_dsl',
        'has_fiber',
        'online_security',
        'online_backup',
        'online_security_backup',
        'device_protection',
        'tech_support',
        'streaming_tv',
        'streaming_movies',
        'streaming_services',
        'streaming_dsl',
        'streaming_fiber',
        'on_contract',
        'paperless_billing',
        'manual_mtm',
        'auto_pay'
    ]
    retype_cols(df=df, cols=boolean_cols, to_dtype='bool')
    string_cols = [
        'partner_deps',
        'gender',
        'internet_service_type',
        'contract_type',
        'payment_type',
        'phone_service_type',
    ]   
    #df_dtypes = pd.DataFrame(df.dtypes).rename(columns={0:'dtype'}).reset_index()
    dfo = set_dfo(dfo_df=df, y_column='churn', splain=True, string_cols=string_cols)
    return dfo #encoder, scaler