def final_plot():
    """Produce the pair plot and the categorical/continuous plot for telco data.

    Loads the wrangled telco data indexed by ``customer_id``, passes it to
    ``plot_variable_pairs``, then passes the result of ``months_to_years``
    to ``plot_categorical_and_continous_vars``.

    Returns:
        tuple: ``(plot_pairs, plot_category)``, i.e. whatever the two
        plotting helpers return, in that order.
    """
    telco = wrangle.wrangle_telco().set_index("customer_id")

    # Same call order as before: pair plot first, then the tenure conversion.
    pair_result = plot_variable_pairs(telco)
    category_result = plot_categorical_and_continous_vars(months_to_years(telco))

    return pair_result, category_result
def scale_telco_data():
    """Wrangle, split and min-max scale the telco data.

    The data returned by ``wrangle_telco`` is split with ``telco_split``;
    the three splits, a fresh ``MinMaxScaler`` and the list of columns to
    scale are then handed to ``add_scaled_columns``.

    Returns:
        tuple: ``(train, validate, test)`` as returned by
        ``add_scaled_columns``.
    """
    train_df, validate_df, test_df = telco_split(wrangle_telco())

    scaled_splits = add_scaled_columns(
        train_df,
        validate_df,
        test_df,
        sklearn.preprocessing.MinMaxScaler(),
        ['monthly_charges', 'tenure', 'total_charges'],
    )
    train_df, validate_df, test_df = scaled_splits

    return train_df, validate_df, test_df
def scale_wrangle_telco(cached=True):
    '''
    Acquire the wrangle_telco data, split it into train, validate and test,
    min-max scale the numeric columns, and append the scaled columns
    (suffixed with '_scaled') to each split.

    cached is forwarded unchanged to wrangle_telco.
    Returns the tuple (train, validate, test).
    '''
    train, validate, test = telco_split(wrangle_telco(cached))

    columns_to_scale = ['monthly_charges', 'tenure', 'total_charges']
    scaled_names = [name + '_scaled' for name in columns_to_scale]

    # The scaler is fitted on the train split only and then reused
    # to transform all three splits.
    scaler = sklearn.preprocessing.MinMaxScaler()
    scaler.fit(train[columns_to_scale])

    def with_scaled_columns(split):
        # transform() returns a bare array, so the split's own index is
        # re-attached before concatenating column-wise.
        scaled = pd.DataFrame(scaler.transform(split[columns_to_scale]),
                              columns=scaled_names,
                              index=split.index)
        return pd.concat([split, scaled], axis=1)

    train = with_scaled_columns(train)
    validate = with_scaled_columns(validate)
    test = with_scaled_columns(test)

    return train, validate, test
Esempio n. 4
0
def scale_telco(df):
    '''
    Scale_telco wrangles the telco dataframe via wrangle.wrangle_telco(),
    splits it into three data sets (train, validate, test), and scales
    the data using SKLEARN's Min Max Scaler (fitted on train only).

    NOTE: the df argument is ignored; it is immediately replaced by a fresh
    wrangle.wrangle_telco() result. The parameter is kept so existing
    callers keep working.

    Returns three DataFrames: train_scaled, validate_scaled, test_scaled.
    Each keeps the column labels of train and the row index of the split
    it was built from.
    '''
    df = wrangle.wrangle_telco()

    train, validate, test = wrangle.train_validate_test_split(df)

    scaler = sklearn.preprocessing.MinMaxScaler()
    scaler.fit(train)

    def _scaled_frame(split):
        # transform() returns a plain array; without index= the rows would be
        # renumbered 0..n-1 and could no longer be matched to the source data.
        return pd.DataFrame(scaler.transform(split),
                            columns=train.columns,
                            index=split.index)

    train_scaled = _scaled_frame(train)
    validate_scaled = _scaled_frame(validate)
    test_scaled = _scaled_frame(test)

    return train_scaled, validate_scaled, test_scaled
Esempio n. 5
0
def prepare_for_split():
    """Return the feature frame and target series from the wrangled telco data.

    Returns:
        tuple: ``(X, y)`` where ``X`` holds the ``monthly_charges`` and
        ``tenure`` columns and ``y`` is the ``total_charges`` column.
    """
    telco = wrangle.wrangle_telco()
    target = telco["total_charges"]
    features = telco[["monthly_charges", "tenure"]]
    return features, target
# Create a file, explore.py, that contains the following functions for exploring your variables (features & target).
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

import env
import wrangle
import split_scale 

# Load the wrangled telco data at import time.
df = wrangle.wrangle_telco()
# Notebook-style preview: the result is computed and discarded in a script.
df.head()

# NOTE(review): split_my_data_whole is neither defined nor imported in the
# visible code - presumably it lives in split_scale; confirm and qualify it.
train, test = split_my_data_whole(df)
# The three expressions below only build values and throw them away
# (leftover notebook inspection cells).
train.head(), test.head()
type(train)
type(test)

#1. Write a function, plot_variable_pairs(dataframe) that plots all of the pairwise relationships along with the regression line for each pair.
def plot_variable_pairs(df):
    """Draw a joint plot of monthly_charges vs. tenure with a regression line.

    The statements that followed the ``def`` were not indented, which made
    the module unparseable; they now form the function body.

    NOTE(review): the body still reads ``train``, ``test`` and
    ``standard_scaler`` from module scope and does not use ``df`` - confirm
    whether the plot should be drawn from ``df`` instead.

    Returns:
        The object returned by ``sns.jointplot``.
    """
    scaled_train, scaled_test = standard_scaler(train, test)

    # x/y are passed by keyword: recent seaborn releases no longer accept
    # them positionally, while older releases accept both forms.
    df_plt = sns.jointplot(x='monthly_charges', y='tenure', data=train, kind='reg')
    return df_plt

Esempio n. 7
0
import warnings

warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from wrangle import wrangle_telco
import env
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler

df = wrangle_telco()  # TODO: move this call into the split function
# Target: total_charges kept as a one-column DataFrame.
y = df[['total_charges']]
# Features: every column except the identifier and the target.
X = df.drop(columns=['customer_id', 'total_charges'])


def split_my_data(df, train_pct=.80, random_state=123):
    """Split ``df`` into a train part and a test part.

    Args:
        df: data passed straight to ``train_test_split``.
        train_pct: forwarded as ``train_size``.
        random_state: forwarded as ``random_state`` so the split is
            repeatable.

    Returns:
        tuple: ``(train, test)``.
    """
    split_options = {'train_size': train_pct, 'random_state': random_state}
    train_part, test_part = train_test_split(df, **split_options)
    return train_part, test_part


def standard_scaler(train, test):
    """Fit a StandardScaler on ``train`` and scale both ``train`` and ``test``.

    The scaled arrays are wrapped back into DataFrames that carry the
    original column labels and the original index values of each input.

    Previously the function built these objects but never returned them,
    so every caller received ``None``.

    Returns:
        tuple: ``(scaler, train_scaled, test_scaled)``.
    """
    scaler = StandardScaler(copy=True, with_mean=True,
                            with_std=True).fit(train)
    train_scaled = pd.DataFrame(scaler.transform(train),
                                columns=train.columns.values).set_index(
                                    [train.index.values])
    test_scaled = pd.DataFrame(scaler.transform(test),
                               columns=test.columns.values).set_index(
                                   [test.index.values])
    return scaler, train_scaled, test_scaled
Esempio n. 8
0
import seaborn as sns
# set_style must be called; assigning to it replaces the seaborn function
# with a string and leaves the plotting theme unchanged.
sns.set_style("whitegrid")
import statsmodels.api as sm
import wrangle
import split_scale
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectKBest, f_regression
from statsmodels.formula.api import ols
import warnings
from sklearn.feature_selection import RFE
warnings.filterwarnings("ignore")

data = wrangle.wrangle_telco()
# Both frames are indexed by customer_id; X holds everything except the
# target, y holds only the target as a one-column DataFrame.
X = data.set_index('customer_id').drop(columns='total_charges')
y = data.set_index('customer_id')[['total_charges']]
# X and y are split by two separate calls to the same splitter.
y_train, y_test = split_scale.split_my_data(y)
X_train, X_test = split_scale.split_my_data(X)



# 1.) Write a function, select_kbest_freg() that takes X_train, y_train and k as input (X_train and y_train should not be scaled!) and returns a list of the top k features.

def select_kbest_freg_unscaled(X_train, y_train, k):
    """Select the top ``k`` features of ``X_train`` using ``f_regression``.

    Args:
        X_train: feature DataFrame the selector is fitted on.
        y_train: target passed to the selector.
        k: number of features to keep.

    Returns:
        tuple: ``((count_as_str, 'selected features'), feature_names,
        scores)`` where ``feature_names`` is the list of selected column
        names and ``scores`` is the selector's ``scores_`` attribute.
    """
    selector = SelectKBest(f_regression, k=k)
    selector.fit(X_train, y_train)

    chosen = X_train.columns[selector.get_support()].tolist()
    summary = (str(len(chosen)), 'selected features')
    return summary, chosen, selector.scores_
Esempio n. 9
0
#### Feature Engineering for telco_churn data

import pandas as pd

from wrangle import wrangle_telco
from split_scale import split_my_data
import features

### SelectKBest - Top Features of Unscaled Data

## Step 1. Load the data and pick the feature / target columns
telco_df = wrangle_telco()
telco_df.head()
telco_X = telco_df.loc[:, ["monthly_charges", "tenure"]]
telco_y = telco_df.loc[:, "total_charges"]

## Step 2. Split X and y into train and test parts (four objects in total)
(telco_X_train, telco_X_test,
 telco_y_train, telco_y_test) = split_my_data(telco_X, telco_y, 0.80)

## Step 3. Run features.selectkbest_optimal_features on the training data
f_features = features.selectkbest_optimal_features(
    telco_X_train, telco_y_train, 2)
Esempio n. 10
0
#Create split_scale.py that will contain the functions that follow.
#Each scaler function should create the object, fit and transform both train and test.
#They should return the scaler, train dataframe scaled, test dataframe scaled.
# Be sure your indices represent the original indices from train/test, as those represent the indices from the original dataframe.
# Be sure to set a random state where applicable for reproducibility!

from wrangle import wrangle_telco
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler

# For this project we are wrangling data from the telco-churn database.
# wrangle_telco() is called once at import time and its result is kept
# in the module-level name `customers`.
customers = wrangle_telco()

# Module-level training fraction.
# NOTE(review): presumably meant to be passed to split_my_data as train_pct
# when isolating the X and y variables - confirm against callers.
train_pct = .8


def pull_X_y(train, test, y):
    """Separate the target column ``y`` from the train and test frames.

    Args:
        train: training DataFrame containing column ``y``.
        test: test DataFrame containing column ``y``.
        y: label of the target column.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)``; the ``X`` frames have
        the target column dropped, the ``y`` frames are one-column
        DataFrames holding only the target.
    """
    def separate(frame):
        # Double brackets keep the target as a DataFrame, not a Series.
        return frame.drop(columns=y), frame[[y]]

    X_train, y_train = separate(train)
    X_test, y_test = separate(test)
    return X_train, y_train, X_test, y_test


# Function used to split the data. It produces 4 new datasets: X (train and test) and y (train and test).
def split_my_data(X, y, train_pct):
    X_train, X_test, y_train, y_test = train_test_split(X,
Esempio n. 11
0
# I need to do this within an average of $5.00 per customer.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
warnings.filterwarnings("ignore")

import env
import wrangle as w
import split_scale as ss

df = w.wrangle_telco()

# Bare expression left over from a notebook cell; it has no effect here.
df

# Features and target are both kept as DataFrames.
x = df.loc[:, ['tenure', 'monthly_charges']]
y = df.loc[:, ['total_charges']]

x_train, x_test, y_train, y_test = ss.split_my_data(x, y, train_pct=.8)

# 1. Write a function, select_kbest_freg_unscaled() that takes X_train, y_train and k as input
# (X_train and y_train should not be scaled!) and returns a list of the top k features.

from sklearn.feature_selection import SelectKBest, f_regression

# NOTE(review): presumably the number of top features to select (the k
# described in the exercise comment above) - confirm where it is used.
k = 1
import pandas as pd
import numpy as np
import seaborn as sns
import split_scale as ss
from wrangle import wrangle_telco
import matplotlib.pyplot as plt

def plot_variable_pairs(df):
    """Show a pair grid of ``df``.

    Histograms are drawn on the diagonal and ``sns.regplot`` on every
    off-diagonal cell; the figure is then displayed with ``plt.show()``.
    Nothing is returned.
    """
    pair_grid = sns.PairGrid(df)
    pair_grid.map_diag(plt.hist)       # diagonal cells
    pair_grid.map_offdiag(sns.regplot)  # all remaining cells
    plt.show()

def months_to_years(tenure_months, df):
    """Add a ``tenure_years`` column to ``df`` and return ``df``.

    Args:
        tenure_months: values floor-divided by 12 to obtain the new column.
        df: DataFrame that receives the column; it is modified in place.

    Returns:
        The same ``df`` object, now carrying ``tenure_years``.
    """
    whole_years = tenure_months // 12
    df['tenure_years'] = whole_years
    return df

def plot_categorical_and_continuous_vars(categorical_var, continuous_var, df):
    """Draw a bar plot, a box plot and a pie chart for one variable pair.

    The original body consisted of the placeholder words "bar plot",
    "box plot" and "pie chart", which is not valid Python; the three
    plots named there are implemented here.

    Args:
        categorical_var: name of the categorical column in ``df``.
        continuous_var: name of the continuous column in ``df``.
        df: DataFrame holding both columns.

    The bar plot and the box plot show ``continuous_var`` grouped by
    ``categorical_var``; the pie chart shows the share of rows in each
    category. The figure is displayed with ``plt.show()``; nothing is
    returned.
    """
    fig, (bar_ax, box_ax, pie_ax) = plt.subplots(1, 3, figsize=(18, 5))

    sns.barplot(x=categorical_var, y=continuous_var, data=df, ax=bar_ax)
    sns.boxplot(x=categorical_var, y=continuous_var, data=df, ax=box_ax)

    # The pie chart only needs the category frequencies.
    df[categorical_var].value_counts().plot.pie(ax=pie_ax, autopct='%1.1f%%')
    pie_ax.set_ylabel('')

    fig.tight_layout()
    plt.show()

if __name__ == '__main__':
    # `seed` was used without ever being defined (NameError); 123 matches
    # the random_state used by the other split helpers in this file.
    seed = 123

    telco = wrangle_telco()
    telco.set_index([telco.customer_id], inplace=True)
    train_telco, test_telco = ss.split_my_data(telco, .7, seed)

    plot_variable_pairs(telco)

    # Adds the tenure_years column to telco in place.
    months_to_years(telco['tenure'], telco)

    # The plotting function takes three arguments; it was called with none.
    # NOTE(review): tenure_years / total_charges are the assumed pair -
    # confirm the intended categorical and continuous columns.
    plot_categorical_and_continuous_vars('tenure_years', 'total_charges',
                                         telco)
def get_X_y():
    """Return features and target from the wrangled telco data.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the data without the
        ``customer_id`` and ``total_charges`` columns and ``y`` is the
        ``total_charges`` column.
    """
    telco = wrangle.wrangle_telco()
    target = telco["total_charges"]
    features = telco.drop(columns=['customer_id', 'total_charges'])
    return features, target
Esempio n. 14
0
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import wrangle
import env

from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# Wrangled telco data, indexed by customer_id.
df = wrangle.wrangle_telco().set_index("customer_id")
# Feature columns and the target, both kept as DataFrames.
X = df[["tenure", "monthly_charges"]]
y = df[["total_charges"]]

# split dataframe into train(train_percent: 80%) & test(20%)
def split_my_data(df):
    """Split ``df`` into train (80%) and test (20%) with a fixed random state.

    Returns:
        tuple: ``(train, test)``.
    """
    train_part, test_part = train_test_split(
        df,
        train_size=0.8,
        random_state=123,
    )
    return train_part, test_part
# split_my_data(df)

# standard
def perform_standard_scaler(train, test):
    """Fit a StandardScaler on ``train`` and scale ``train`` and ``test``.

    Returns:
        tuple: ``(scaler, train_scaled, test_scaled)``; both scaled objects
        are DataFrames with the column labels and index values of the
        frame they were built from.
    """
    scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
    scaler.fit(train)

    def rebuild(frame):
        # transform() yields a bare array, so labels are re-attached.
        scaled = pd.DataFrame(scaler.transform(frame),
                              columns=frame.columns.values)
        return scaled.set_index([frame.index.values])

    train_scaled = rebuild(train)
    test_scaled = rebuild(test)
    return scaler, train_scaled, test_scaled

Esempio n. 15
0
def prepare_telco_for_split():
    """Return the wrangled telco data without its ``customer_id`` column.

    The column is dropped in place on the object returned by
    ``wrangle.wrangle_telco()``, and that same object is returned.
    """
    telco = wrangle.wrangle_telco()
    telco.drop("customer_id", axis=1, inplace=True)
    return telco
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from env import user, password, host
import wrangle
import split_scale
from statsmodels.formula.api import ols
from math import sqrt
from sklearn.feature_selection import SelectKBest


# Our scenario continues:
# As a customer analyst, I want to know who has spent the most money with us over their 
# lifetime. I have monthly charges and tenure, so I think I will be able to use those two 
# attributes as features to estimate total_charges. I need to do this within an average 
# of $5.00 per customer.
# Load the wrangled telco data once. The earlier version called
# wrangle.wrangle_telco() and discarded the result, then called two names
# that are not in scope here (get_db_url and a bare wrangle_telco); only
# `import wrangle` is available, so the qualified call is used.
telco = wrangle.wrangle_telco()
telco

# Notebook-style inspection; only info() produces output in a script.
telco.head()
telco.describe()
telco.info()
telco.dtypes
telco.columns.values


# 80/20 split with a fixed random state, then drop the identifier column
# from both parts.
train, test = train_test_split(telco, train_size=0.80, random_state=123)
train = train.drop(columns='customer_id')
test = test.drop(columns='customer_id')
train.head()