Esempio n. 1
0
def analysis_of_variance():
    """Run one-way ANOVA on car prices between pairs of makes and print it.

    Why? To check whether the mean of a numeric variable (here ``price``)
    differs between the groups of a categorical variable (here ``make``).

    What do we get from ANOVA?
    F-test score: variation between sample group means divided
        by variation within sample group
    P-value: probability of seeing an F-score at least this large if the
        group means were in fact equal

    Notes:
    - Small F implies poor correlation between the variable
        categories and the target variable.
    - Large F implies strong correlation

    Prints two results: honda vs. subaru and honda vs. jaguar.
    """
    df = util.create_df()

    # Group by the bare column name rather than a one-element list: with a
    # list grouper, newer pandas expects a tuple key in get_group(), so the
    # scalar lookups below would warn or fail.
    prices_by_make = df[['make', 'price']].groupby('make')['price']
    honda_prices = prices_by_make.get_group('honda')

    anova_results_1 = stats.f_oneway(
        honda_prices,
        prices_by_make.get_group('subaru'))
    anova_results_2 = stats.f_oneway(
        honda_prices,
        prices_by_make.get_group('jaguar'))
    print(anova_results_1)
    print(anova_results_2)
Esempio n. 2
0
def descriptive_statistics():
    """Print summary statistics and show a box plot and a scatter plot.

    Steps:
    - print ``describe()`` for the whole frame and the category counts
      of ``drive-wheels``
    - box plot of ``price`` per ``drive-wheels`` category (helps spot outliers)
    - scatter plot of ``engine-size`` (x-axis) against ``price`` (y-axis)
    """
    df = util.create_df()

    # Summary statistics, excluding NaN values. Printed so the result is
    # visible when run as a script instead of being silently discarded.
    print(df.describe())
    # Summarize categorical data
    print(df['drive-wheels'].value_counts())

    # Helps spot outliers in a data set
    sns.boxplot(x='drive-wheels', y='price', data=df)
    plt.show()

    # Clear the current figure so it does not interfere with our new plot
    plt.clf()

    # Scatter plot shows the relationship between two variables
    # PIV variables on x-axis
    # TDV variables on y-axis
    # The data must follow the axis labels below: engine size on x, price on y.
    x = df['engine-size']
    y = df['price']
    plt.scatter(x, y)
    plt.ylim(bottom=0)
    plt.xlim(left=0)
    plt.title('Scatterplot of Engine Size vs Price')
    plt.xlabel('Engine Size')
    plt.ylabel('Price')
    plt.show()
Esempio n. 3
0
def group_by():
    """Show the mean price per (drive-wheels, body-style) pair as a heatmap.

    Group By
        Splits the data into subsets according to the categories of one or
        more variables; here the mean ``price`` is computed for every
        combination of ``drive-wheels`` and ``body-style``.
    """
    frame = util.create_df()

    group_keys = ['drive-wheels', 'body-style']
    subset = frame[group_keys + ['price']]
    mean_prices = subset.groupby(group_keys, as_index=False).mean()

    # Pivot table & heatmap: one grouping variable is laid out along the
    # rows (drive-wheels) and the other along the columns (body-style).
    price_grid = mean_prices.pivot(index='drive-wheels', columns='body-style')

    plt.pcolor(price_grid, cmap='RdBu')
    plt.colorbar()
    plt.show()
Esempio n. 4
0
def correlation_simple():
    """Show regression plots of three variables against ``price``.

    Correlation is a statistical metric for measuring to what extent two
    variables are interdependent.

    Examples:
        Lung cancer -> Smoking
        Rain -> Umbrella

    Correlation does not imply causation: two variables moving together
    does not tell us that one of them causes the other.

    Three figures are shown one after another: ``engine-size``,
    ``highway-mpg`` and ``peak-rpm``, each plotted against ``price``.
    """
    df = util.create_df()

    # x and y are always passed by keyword: seaborn 0.12+ no longer accepts
    # them positionally.

    # Positive Linear Relationship
    sns.regplot(x='engine-size', y='price', data=df)
    plt.ylim(0, )
    plt.show()

    plt.clf()

    # Negative Linear Relationship
    sns.regplot(x='highway-mpg', y='price', data=df)
    plt.ylim(0, )
    plt.show()

    plt.clf()

    # Weak Linear Relationship
    sns.regplot(x='peak-rpm', y='price', data=df)
    plt.ylim(0, )
    plt.show()
Esempio n. 5
0
def correlation_statistics():
    """Print the Pearson correlation between ``horsepower`` and ``price``.

    Pearson correlation
        - Correlation Coefficient. Explanation:
            - Close to +1: Large positive relationship
            - Close to -1: Large negative relationship
            - Close to 0: No relationship

        - P Value. Strength of result certainty:
            - <0.001: Strong certainty
            - <0.05: Moderate certainty
            - <0.1: Weak certainty
            - >0.1: No certainty

    Notes:
        https://en.wikipedia.org/wiki/Correlation_and_dependence
        - We can say there's a strong correlation when:
            1. Correlation Coefficient is close to 1 or -1
            2. P value is less than 0.001
        - A NaN coefficient means the input contained NaN values; rows with
          a missing ``horsepower`` or ``price`` are therefore dropped before
          the coefficient is computed.
    """
    df = util.create_df()

    df['horsepower'] = df['horsepower'].astype(float)

    # pearsonr does not skip missing values: a single NaN in either column
    # would make the coefficient NaN, so keep only complete pairs.
    complete_pairs = df[['horsepower', 'price']].dropna()
    pearson_coef, p_value = stats.pearsonr(complete_pairs['horsepower'],
                                           complete_pairs['price'])
    print('Coef: {} | P value: {}'.format(pearson_coef, p_value))
Esempio n. 6
0
def binning():
    """Group ``horsepower`` into three equal-width bins and plot the counts.

    Binning converts a numeric variable into a categorical one by grouping
    a set of numerical values into a set of bins.

    Sometimes this can improve the accuracy of the data.

    The binned labels are stored in a new ``horsepower-binned`` column of
    the local frame and a bar chart of the bin sizes is shown.
    """
    df = util.create_df()
    # NOTE(review): the return value is discarded, so this presumably fills
    # the NaN values in place -- confirm against util.replace_nan_with_mean.
    util.replace_nan_with_mean(df['horsepower'], 'float')
    df["horsepower"] = df["horsepower"].astype(int)

    horsepower = df['horsepower']

    # Return evenly spaced numbers over a specified interval - 4 dividers (3 bins)
    bins = np.linspace(horsepower.min(), horsepower.max(), 4)

    # Set names for our groups
    group_names = ['Low', 'Medium', 'High']

    # Bin values into discrete intervals
    # It runs through every value and applies the label depending on their ranges
    df['horsepower-binned'] = pd.cut(
        horsepower,
        bins,
        labels=group_names,
        include_lowest=True
    )

    # value_counts() orders its result by frequency, not by category, so
    # realign it with group_names; otherwise a bar can be drawn under the
    # wrong label.
    bin_counts = df['horsepower-binned'].value_counts().reindex(group_names)

    # Visualize it
    fig = plt.figure(figsize=(12, 14))
    plt.bar(group_names, bin_counts)
    fig.suptitle('Horsepower Bins', fontsize=18)
    plt.xlabel('Horsepower', fontsize=18)
    plt.ylabel('count', fontsize=16)
    # Display the figure, as every other plotting function in this file does
    plt.show()
Esempio n. 7
0
def dummies():
    """Replace the categorical ``fuel`` column with indicator columns.

    Turning categorical variables into quantitative variables:
    add one dummy variable per unique category and assign 0 or 1 to it.

    e.g.
    fuel | column, type: object
    --- Entries
    gas     0
    diesel  1

    One-hot encoding is done with ``pandas.get_dummies()``.

    Indicator variable
        An indicator (or dummy) variable is a numerical variable used to
        label categories. They are called 'dummies' because the numbers
        themselves don't have inherent meaning.

    The encoded frame is local to this function; nothing is returned.
    """
    frame = util.create_df()
    indicators = pd.get_dummies(frame['fuel'])

    # Append the indicator columns to the frame ...
    frame = pd.concat([frame, indicators], axis=1)

    # ... and discard the original "fuel" column they were derived from
    frame = frame.drop(columns='fuel')
Esempio n. 8
0
def multiple_linear_regression():
    """Fit a multiple linear regression for ``price`` and plot its fit.

    Uses 2+ PIVs to make 1 prediction (TDV): ``horsepower``,
    ``curb-weight``, ``engine-size`` and ``highway-mpg`` predict ``price``.

    The density of the actual prices (red) is drawn together with the
    density of the fitted prices (blue) in a single figure.
    """
    df = util.create_df()

    x = df[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']]
    y = df['price']

    lmr = linear_model.LinearRegression()

    lmr.fit(x, y)

    y_hat = lmr.predict(x)

    width, height = 12, 10
    plt.figure(figsize=(width, height))

    # sns.distplot is deprecated; kdeplot draws the same density curve that
    # distplot(hist=False) produced.
    ax1 = sns.kdeplot(y, color="r", label="Actual Value")
    sns.kdeplot(y_hat, color="b", label="Fitted Values", ax=ax1)
    # The curve labels only become visible once a legend is requested
    plt.legend()

    plt.title('Actual vs Fitted Values for Price')
    plt.xlabel('Price (in dollars)')
    plt.ylabel('Proportion of Cars')

    plt.show()
Esempio n. 9
0
def polynomial_regression():
    """Print a cubic fit and the degree-2 polynomial feature expansion.

    Polynomial regression is a special case of the general linear
    regression model. It is useful for describing 'curvilinear'
    relationships, which are obtained by squaring or adding higher-order
    terms of the predictor variables.

    The model can be:
    - Quadratic (2nd order)
    - Cubic (3rd order)
    - Higher order (4th order +)

    The degree of the regression can make a big difference if you pick
    the right value.
    """
    import numpy as np
    from sklearn.preprocessing import PolynomialFeatures

    frame = util.create_df()

    horsepower = frame['horsepower']
    curb_weight = frame['curb-weight']

    # Third-order polynomial of curb-weight as a function of horsepower
    cubic = np.poly1d(np.polyfit(horsepower, curb_weight, 3))
    print(cubic)

    # Second-order expansion of both columns, without the constant term
    expander = PolynomialFeatures(degree=2, include_bias=False)
    expanded = expander.fit_transform(frame[['horsepower', 'curb-weight']])

    print(expanded)
Esempio n. 10
0
def model_evaluation():
    """Cross-validate a simple linear model and show diagnostic plots.

    Model evaluation tells us how our model performs in the real world.
    Difference with in-sample evaluation:
        - In-sample tells us how well our model fits the data already given to train it
        - Problem: It does not tell us how well the trained model can be used to predict new data
        - Solution: Split data in sets:
            - Training data: used to fit the model
            - Testing data: held out to estimate out-of-sample performance

    Example:
        - Train 70% of the data
        - Test 30% of the data

    There exists a generalization error that involves the percentages of data used for training
    and testing. To overcome this issue, we use cross validation.

    Cross Validation
        - Most common out-of-sample (testing) evaluation metric
        - More effective use of data (each observation is used for both training and testing)

    Prints the mean and standard deviation of the 3-fold scores and the
    first five cross-validated predictions, then shows two regression plots
    and one residual plot.
    """
    from sklearn.model_selection import cross_val_score, cross_val_predict
    from sklearn.linear_model import LinearRegression

    df = util.create_df()

    x_data = df[['highway-mpg']]
    y_data = df['price']

    lr = LinearRegression()
    lr.fit(x_data, y_data)
    # cv specifies how many folds to use
    scores = cross_val_score(lr, x_data, y_data, cv=3)
    print(f'Mean: {scores.mean()}. Standard deviation: {scores.std()}')
    # Out-of-fold prediction for every row; printed so the result is used
    predicted_scores = cross_val_predict(lr, x_data, y_data, cv=3)
    print(f'Predicted: {predicted_scores[0:5]}')

    # Visualize the model
    width, height = 12, 10
    plt.figure(figsize=(width, height))
    sns.regplot(x='highway-mpg', y='price', data=df)
    plt.ylim(0, )
    plt.show()

    plt.clf()

    plt.figure(figsize=(width, height))
    sns.regplot(x="peak-rpm", y="price", data=df)
    plt.ylim(0, )
    plt.show()

    plt.clf()

    plt.figure(figsize=(width, height))
    # x and y must be keywords: seaborn 0.12+ rejects them as positionals
    sns.residplot(x='highway-mpg', y='price', data=df)
    plt.show()
Esempio n. 11
0
def ridge_regression():
    """Compare a linear regression with a Ridge regression on a held-out split.

    Ridge regression prevents over-fitting, which is ALSO a big problem when
    you have multiple independent variables or features.

    If the estimated polynomial coefficients have a very large magnitude, we can use
    Ridge regression to control it with an 'alpha' parameter.

    Alpha is a parameter we select before fitting or training a model.

    As alpha increases, the other parameters get smaller.
    It must be selected carefully - if alpha is too large, the parameters
    shrink towards 0 and the model under-fits.

    With alpha = 0 there is no penalty at all, so over-fitting is not
    controlled.

    In order to select alpha, use cross validation.

    Prints the first predictions of the linear model on both splits, draws
    a distribution plot for the training split, then prints the first four
    Ridge predictions next to the first four actual test prices.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import Ridge, LinearRegression

    df = util.create_df()

    x_data = df.drop('price', axis=1)
    y_data = df['price']
    x_train, x_test, y_train, y_test = train_test_split(x_data,
                                                        y_data,
                                                        test_size=0.15,
                                                        random_state=1)

    features = ['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']

    lr = LinearRegression()
    lr.fit(x_train[features], y_train)

    yhat_train = lr.predict(x_train[features])
    print(f'Train: {yhat_train[0:5]}')

    yhat_test = lr.predict(x_test[features])
    print(f'Test: {yhat_test[0:5]}')

    title = 'Distribution Plot of Predicted Value Using Training Data vs Training Data Distribution'
    DistributionPlot(y_train, yhat_train, "Actual Values (Train)",
                     "Predicted Values (Train)", title)

    rm = Ridge(alpha=0.1)
    # NOTE(review): fitted on every non-price column, which assumes they are
    # all numeric -- confirm against util.create_df.
    rm.fit(x_train, y_train)
    y_hat = rm.predict(x_test)

    print('predicted:', y_hat[0:4])
    # Actual prices come from y_test; y_hat is a NumPy array with no .values
    print('test set :', y_test[0:4].values)
Esempio n. 12
0
def simple_linear_regression():
    """Fit ``price`` on ``highway-mpg`` and evaluate on the last 20 rows.

    Simple linear regression uses 1 PIV to make 1 prediction (TDV).

    FORMULA: y = b0 + (b1 * x)
    b0: intercept
    b1: slope

    Prints the coefficient, the mean squared error and the coefficient of
    determination, then plots the test points with the fitted line.
    """
    from sklearn.metrics import mean_squared_error, r2_score

    frame = util.create_df()

    # PIV (predictor) and TDV (target)
    features = frame[['highway-mpg']]
    target = frame['price']

    # The final 20 rows are held out for testing; the rest is for training
    holdout = 20
    features_train, features_test = features[:-holdout], features[-holdout:]
    target_train, target_test = target[:-holdout], target[-holdout:]

    # Train the model, then predict the held-out rows
    model = linear_model.LinearRegression()
    model.fit(features_train, target_train)
    predictions = model.predict(features_test)

    # Slope of the fitted line
    print('Coefficients: \n', model.coef_)
    # Average squared difference between actual and predicted prices
    mse = mean_squared_error(target_test, predictions)
    print('Mean squared error: %.2f' % mse)
    # 1 is perfect prediction
    r_squared = r2_score(target_test, predictions)
    print('Coefficient of determination: %.2f' % r_squared)

    figure_size = (12, 10)
    plt.figure(figsize=figure_size)
    plt.scatter(features_test, target_test, color='black')
    plt.plot(features_test, predictions, color='blue', linewidth=3)
    plt.ylim(0,)
    plt.title('SLR model for predicting price')
    plt.xlabel('Miles Per Gallon')
    plt.ylabel('Price')
    plt.show()
Esempio n. 13
0
def calculate_mean_squared_error():
    """Print the in-sample mean squared error of a multiple linear regression.

    The model predicts ``price`` from ``horsepower``, ``curb-weight``,
    ``engine-size`` and ``highway-mpg`` and is scored on the same rows it
    was fitted on.

    The larger the MSE, the less accurate the prediction.
    """
    from sklearn.metrics import mean_squared_error

    frame = util.create_df()

    predictors = frame[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']]
    actual_price = frame['price']

    model = linear_model.LinearRegression()
    model.fit(predictors, actual_price)

    fitted_price = model.predict(predictors)
    print(mean_squared_error(actual_price, fitted_price))
"""
DATA NORMALIZATION

- Simple Feature Scaling
- Min-Max
- Z-score
"""
from src import util

# Module-level frame, created once when this module is imported.
# normalization() below overwrites its 'length' column in place.
df = util.create_df()


def normalization():
    """Rescale the ``length`` column of the module-level ``df`` in place.

    Overview of the techniques:
    1) Simple Feature Scaling
        xNew = xOld / xMax
    2) Min-Max
        xNew = (xOld - xMin) / (xMax - xMin)
    3) Z-score
        xNew = (xOld - m) / sd
        m: Average -> mean()
        sd: Standard Deviation -> std()

    The steps run back to back on the same column, so Min-Max is applied
    to the output of Simple Feature Scaling. Only the first two techniques
    have code here.
    """
    # Simple Feature Scaling
    length = df['length']
    df['length'] = length / length.max()

    # Min-Max (works on the already scaled column)
    scaled = df['length']
    lowest = scaled.min()
    highest = scaled.max()
    df['length'] = (scaled - lowest) / (highest - lowest)

    # Z-score: described in the docstring only; no code for it here
Esempio n. 15
0
def fitting():
    """Illustrate under- and over-fitting by scoring several polynomial orders.

    Over/Under-fitting for polynomial regression
    How to pick the best polynomial order

    Under-fitting: the model is too simple to fit the data
    Over-fitting: the model is so flexible that it fits the noise in the
        training data instead of the underlying relationship

    The training error decreases with the order of the polynomial, BUT
    the test error is a better means of estimating the error of a polynomial.

    Steps: split the data 85/15, fit a four-feature linear regression and
    draw its training distribution plot, then fit polynomial models of
    order 1-4 on ``horsepower`` and plot their R^2 on the test split.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LinearRegression
    # Imported here so the block does not rely on a file-level import
    from sklearn.preprocessing import PolynomialFeatures

    df = util.create_df()

    x_data = df.drop('price', axis=1)
    y_data = df['price']
    x_train, x_test, y_train, y_test = train_test_split(x_data,
                                                        y_data,
                                                        test_size=0.15,
                                                        random_state=1)

    print("number of test samples :", x_test.shape[0])
    print("number of training samples:", x_train.shape[0])

    features = ['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']

    lr = LinearRegression()
    lr.fit(x_train[features], y_train)

    yhat_train = lr.predict(x_train[features])
    print(f'Train: {yhat_train[0:5]}')

    yhat_test = lr.predict(x_test[features])
    print(f'Test: {yhat_test[0:5]}')

    title = 'Distribution Plot of Predicted Value Using Training Data vs Training Data Distribution'
    DistributionPlot(y_train, yhat_train, "Actual Values (Train)",
                     "Predicted Values (Train)", title)

    rsqu_test = []
    order = [1, 2, 3, 4]

    lr = LinearRegression()

    # Determine which polynomial degree gives the best r^2 value
    for n in order:
        pr = PolynomialFeatures(degree=n)
        x_train_pr = pr.fit_transform(x_train[['horsepower']])
        # Only transform the test split: a transformer is fitted on the
        # training data and then reused, never refitted on test data.
        x_test_pr = pr.transform(x_test[['horsepower']])
        lr.fit(x_train_pr, y_train)
        rsqu_test.append(lr.score(x_test_pr, y_test))

    plt.plot(order, rsqu_test)
    plt.xlabel('order')
    plt.ylabel('R^2')
    plt.title('R^2 Using Test Data')
    plt.text(3, 0.75, 'Maximum R^2 ')
    plt.show()
Esempio n. 16
0
def model_evaluation_using_visualization():
    """Show a regression, a residual and a distribution plot for ``price``.

    Regression plot
        - Gives us a good estimate of:
            1) The relationship between 2 variables
            2) The strength of the correlation (r2)
            3) The direction of the relationship (Positive/Negative)

        - Combination of:
            1) Scatter plot: Every point represents a different y
            2) Fitted linear regression line

    Residual Plot
        - Represents the error between the actual and the predicted values
        - y axis: residuals
        - x axis: TDV / Fitted values
        - A residual is the difference between the TDV and the predicted value
        - We expect the residuals to have zero mean and to be distributed
            evenly around the x axis with similar variance.
        - If there is NO curvature, a linear plot (function) might be more appropriate.
        - If there is a curvature, our LINEAR ASSUMPTION is incorrect, and it suggests
            a non-linear function
        - If the variance of the residuals increases with x, our MODEL is incorrect.

    Distribution Plot
        - Counts the predicted vs. actual values
        - Very useful for visualizing models with more than one PIV

        Example
            - Given a data set of y values: 1, 2, 3
            - Count and plot the number of predicted values and TDVs
                that are approximately equal to 1, 2 and 3
    """
    import seaborn as sns

    df = util.create_df()

    # Regression plot
    sns.regplot(x='highway-mpg', y='price', data=df)
    plt.ylim(0, )
    plt.show()

    plt.clf()

    # Residual plot
    # x and y must be keywords: seaborn 0.12+ rejects them as positionals.
    # No lower y-limit here: residuals scatter around zero, so clipping the
    # axis at 0 would hide every negative residual.
    sns.residplot(x='highway-mpg', y='price', data=df)
    plt.show()

    plt.clf()

    # Distribution plot
    x = df[['highway-mpg']]
    y = df['price']

    lmr = linear_model.LinearRegression()

    lmr.fit(x, y)

    y_hat = lmr.predict(x)

    # sns.distplot is deprecated; kdeplot draws the same density curve that
    # distplot(hist=False) produced.
    ax1 = sns.kdeplot(y, color='r', label='Actual value')
    sns.kdeplot(y_hat, color='b', label='Fitted values', ax=ax1)
    # The curve labels only become visible once a legend is requested
    plt.legend()
    plt.show()