Example #1
def calculate_binary_test_accuracy(latent_z, model, indicators, design_matrix,
                                   response):
    """Calculates win/loss classification accuracy of the margin model by
    comparing the sign of predicted margins to the sign of actual margins."""
    # Insert the latent team ratings into the design matrix
    input_matrix = de.replace_design_latent(design_matrix=design_matrix,
                                            indicators=indicators,
                                            z=latent_z).copy()
    # A predicted margin > 0 is a predicted home win; score sign agreement
    predictions = model.predict(X=input_matrix)
    accuracy = np.mean(np.equal(predictions > 0, response > 0))
    return accuracy
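Since both predictions and response hold margins here, the comparison needs no explicit classification threshold: a game is scored correct whenever the predicted and actual margins share a sign. A minimal self-contained check of that line, using made-up margins (the array values are illustrative only):

import numpy as np

# Hypothetical predicted and actual home margins of victory
predicted_margins = np.array([3.5, -1.2, 7.0, -0.5])
actual_margins = np.array([1.0, -4.0, -2.0, -1.0])

# Correct whenever the predicted and actual margins share a sign
accuracy = np.mean(np.equal(predicted_margins > 0, actual_margins > 0))
print(accuracy)  # 0.75: three of the four signs agree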
Example #2
def elo_test_accuracy(latent_z, indicators, design_matrix, response):
    """Calculates win/loss classification accuracy of the Elo model by
    thresholding the predicted home win probability at 0.5."""
    # Insert the latent Elo ratings into the design matrix
    input_matrix = de.replace_design_latent(design_matrix=design_matrix,
                                            indicators=indicators,
                                            z=latent_z).copy()
    # Convert each (away, home) rating pair into a home win probability
    proba_predictions = input_matrix.apply(
        lambda row: elo_calculate_home_win_prob(row["AwayRating"],
                                                row["HomeRating"]),
        axis=1)
    # Predict a home win whenever the probability is at least 0.5
    predictions = np.array(proba_predictions >= 0.5, dtype=int).reshape(-1, 1)
    accuracy = np.mean(np.equal(predictions, response))
    return accuracy
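The helper elo_calculate_home_win_prob is not shown in this snippet. A plausible stand-in, assuming the standard Elo win-expectancy curve with the conventional 400-point scale (the scale constant, and the absence of a home-advantage offset, are assumptions rather than details from the source):

def elo_calculate_home_win_prob(away_rating, home_rating, scale=400.0):
    # Standard Elo win expectancy: logistic in the rating difference
    return 1.0 / (1.0 + 10.0 ** ((away_rating - home_rating) / scale))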
Example #3
def calculate_test_accuracy(latent_z, model, indicators, design_matrix,
                            response):
    """Calculates the model score (R^2 for regression models), the mean
    absolute error with its standard deviation, and per-game squared errors."""
    # Insert the latent team ratings into the design matrix
    input_matrix = de.replace_design_latent(design_matrix=design_matrix,
                                            indicators=indicators,
                                            z=latent_z).copy()
    predictions = model.predict(X=input_matrix)
    squared_errors = (predictions - response)**2
    absolute_errors = np.abs(predictions - response)
    mae = np.mean(absolute_errors)
    mae_std = np.std(absolute_errors)
    accuracy = model.score(X=input_matrix, y=response)
    # Flatten squared errors from (n, 1) to (n,) for concatenation with other folds
    return accuracy, mae, mae_std, squared_errors[:, 0]
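The flattening in the return statement matters because NumPy refuses to concatenate a 1-D accumulator with an (n, 1) column. A quick self-contained illustration of the cross-fold accumulation pattern the comment refers to:

import numpy as np

all_squared_errors = np.array([])  # empty 1-D accumulator across folds
fold_errors = (np.array([[2.0], [1.0]]) - np.array([[1.0], [3.0]]))**2  # shape (2, 1)

# Flattening to shape (2,) lets the arrays concatenate cleanly; passing
# fold_errors directly would raise a dimension-mismatch ValueError
all_squared_errors = np.concatenate([all_squared_errors, fold_errors[:, 0]])
print(all_squared_errors)  # [1. 4.]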
Example #4
def fit_margin_model(x,
                     y,
                     indicators,
                     p_means,
                     p_vars,
                     weights,
                     MAP=False,
                     show=False,
                     tol=1e-03,
                     max_iter=1):
    """
    Performs the Expectation-Maximization Algorithm for the Margin Model (point differential linear regression)
        Expectation = Calculate the optimal latent variables for each team given the linear model parameters
        Maximization = Calculate the optimal linear regression parameters given the team latent variables
        Result is the final team latent variables and model that can be used to predict future results
        
    :param x: data matrix (n x d) with entries for team ratings (can default to 0s here)
    :param y: vector of margin of victories for the games (n x 1)
    :param indicators: matrix with team identifiers as entries (n x 2) matrix
    :param p_means: vector of prior means of the latent team variables (z x 1)
    :param p_vars: vector of prior variances of the latent team variables (z x 1)
    :param MAP: boolean determining if MLE (False, default) should be used or the MAP estimate
    :param show: boolean determining whether additional information should be printed to the console
    :param tol: tolerance for convergence
    :param max_iter: maximum amount of iterations before terminating the algorithm
    :return: final latent variables (z x 1), final accuracy of the model (single float), and linear model object
    """

    # Initializing linear model parameters
    lm = LinearRegression()
    lm.intercept_ = np.array([0.5])
    # Parameter vector order: away coef, home coef, away rest coef, home rest coef
    # TODO: set this in a smarter way (variable length for compatibility with other datasets)
    lm.coef_ = np.array([[-1.0, 1.0, -0.5, 0.5]])
    # Truncate the default coefficients if the design matrix has fewer columns
    if x.shape[1] < len(lm.coef_[0]):
        lm.coef_ = np.array([lm.coef_[0][:(x.shape[1])]])

    # The last element of the parameter vector is the model variance
    param_vector = np.append(arr=lm.coef_, values=np.var(y)).reshape((-1, 1))

    change = tol + 1
    iterations = 0
    new_z, new_acc = np.copy(p_means), 0
    old_x = x.copy()

    if weights is None:
        weights = np.ones(len(y))

    # Alternate between E and M steps until the algorithm converges or the maximum number of iterations is reached
    while change > tol and iterations < max_iter:
        start_acc = lm.score(X=old_x, y=y)
        if show:
            print(lm.intercept_)
            print(lm.coef_)
            print("EM Iteration %d start accuracy %.5f" %
                  (iterations, start_acc))
        old_z = np.copy(new_z)
        # Expectation step - solving for the optimal latent team variables given the linear model parameters (param_vector)
        new_z, z_std = gd.latent_margin_optimization(response=y,
                                                     design_matrix=old_x,
                                                     param_vector=param_vector,
                                                     indicators=indicators,
                                                     intercept=lm.intercept_,
                                                     weights=weights,
                                                     z=new_z,
                                                     prior_means=p_means,
                                                     prior_vars=p_vars,
                                                     MAP=MAP,
                                                     show=show)
        change = np.linalg.norm(new_z - old_z)
        new_x = de.replace_design_latent(design_matrix=old_x,
                                         indicators=indicators,
                                         z=new_z)

        # Internal checks to make sure gradient descent step improved model
        finish_acc = lm.score(X=new_x, y=y)
        if show:
            print("EM Iteration %d after gradient descent accuracy %.5f" %
                  (iterations, finish_acc))
        if finish_acc < start_acc:
            print(
                "ERROR: EXPECTATION OPTIMIZATION DECREASED MODEL ACCURACY (from %0.5f to %0.5f)"
                % (start_acc, finish_acc))
        elif show:
            print("Expectation Step Accuracy Improvement: %.5f" %
                  (finish_acc - start_acc))

        # Maximization step - solving for the linear model parameters given the latent team variables (z)
        lm.fit(X=new_x, y=y, sample_weight=weights)

        # OUTPUT FOR R ANALYSIS, DELETE LATER
        # merged = new_x.copy()
        # merged["MarginResponse"] = y
        # merged.to_csv("2008NBADataOutput.csv", index=False)

        if show:
            print(lm.intercept_)
            print(lm.coef_)
        param_vector = np.append(arr=lm.coef_, values=np.std(y)).reshape(
            (-1, 1))
        new_acc = lm.score(X=new_x, y=y)
        acc_change = new_acc - finish_acc
        if show:
            print("Maximization Step Accuracy Improvement: %.5f" % acc_change)
        iterations += 1
        old_x = new_x.copy()

    if iterations >= max_iter:
        print(
            "WARNING: MAXIMUM ITERATIONS (%d) OF EM ALGORITHM REACHED, CHANGE: %.5f"
            % (iterations, change))
    elif show:
        print("Finished EM with %d iterations" % iterations)
    return new_z, new_acc, lm, z_std
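de.replace_design_latent is used throughout these examples but never shown. A minimal sketch of what it plausibly does, assuming the design matrix carries "AwayRating" and "HomeRating" columns (as Example #2 suggests), that indicators holds integer (away, home) index pairs, and that z is a (K x 1) column of team ratings; the column names and layout here are assumptions, not the source implementation:

import numpy as np

def replace_design_latent(design_matrix, indicators, z):
    # Overwrite the rating columns with the current latent team values
    updated = design_matrix.copy()
    indicators = np.asarray(indicators)
    updated["AwayRating"] = z[indicators[:, 0], 0]
    updated["HomeRating"] = z[indicators[:, 1], 0]
    return updated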
Example #5
def latent_margin_optimization(response,
                               design_matrix,
                               param_vector,
                               intercept,
                               indicators,
                               weights,
                               z,
                               prior_means,
                               prior_vars,
                               home_points=" Home Points",
                               away_points=" Away Points",
                               a_cols=None,
                               h_cols=None,
                               joint=False,
                               MAP=False,
                               show=False,
                               newton_update=False,
                               gamma=6.0,
                               tol=1e-01,
                               max_iter=100):
    """
    Function for performing numerical optimization on latent variables of the margin model
    (Finding the latent variable vector that minimizes the log-likelihood of the margin model given fixed model parameters)
    :param response: The home margins of victory (N x 1 vector)
    :param design_matrix: The transformed pandas DataFrame for predictions (N x d matrix)
    :param param_vector: The coefficients to be multiplied by each matrix row ((d+1) x 1 vector) with last element being model variance
        NOTE THIS SHOULD BE A DICTIONARY WITH KEYS FOR "Away" AND "Home" FOR THE RESPECTIVE MODELS IN THE JOINT SETTING
    :param intercept: Same as param_vector, intercept of the linear model needed for predictions, same dictionary warning as param_vector
    :param indicators: Index numbers of away, home pairs for each example (N x 2 matrix)
    :param weights: Training example weights to give/remove emphasis to specific games
    :param z: The latent variables as a vector (K x 1 vector)
    :param prior_means: The means of the latent variable prior distributions (K x 1 vector)
    :param prior_vars: The variances of the latent variable prior distributions (K x 1 vector)
    :param MAP: Boolean whether to return the MAP estimate (=True) or the MLE (=False), default is MLE
    :param gamma: step size for gradient descent
    :param tol: minimum change allowed for termination of gradient descent
    :param max_iter: maximum amount of iterations allowed in gradient descent before termination
    :return: z: the latent variable vector that minimizes the log-likelihood of the margin model given fixed model parameters (param_vector)
    """
    z_change = tol + 1
    iterations = 0
    start = datetime.datetime.now()
    gradient_stds = np.zeros(len(z))  # Default return value in case no iterations run
    p_z_change = 0
    # Run until the latent variables stop changing or the maximum number of iterations is reached
    while z_change > tol and iterations < max_iter:
        iterations += 1
        prev_z = np.copy(z)
        # Calculate gradient of data under margin model with latent variables
        if not joint:
            z_gradient, z_second_gradient, gradient_stds = margin_model_derivative_z(
                response,
                design_matrix,
                param_vector,
                intercept,
                indicators,
                weights,
                z=prev_z,
                prior_means=prior_means,
                prior_vars=prior_vars,
                MAP=MAP)
        else:  # Is joint
            z_gradient, z_second_gradient, gradient_stds = joint_model_derivative_z(
                response,
                design_matrix,
                a_cols,
                h_cols,
                param_vector["Away"],
                param_vector["Home"],
                intercept["Away"],
                intercept["Home"],
                indicators,
                weights,
                z=prev_z,
                prior_means=prior_means,
                prior_vars=prior_vars,
                MAP=MAP,
                home_points=home_points,
                away_points=away_points)

        # Take a gradient (or Newton-Raphson) step and calculate the change in the latent variable vector
        if newton_update:
            z -= z_gradient / z_second_gradient
        else:
            z += gamma * np.array(z_gradient).reshape(-1, 1)

        z_change = np.linalg.norm(z - prev_z)
        # Momentum adjustment: halve the step size when the change grows or shrinks too slowly
        if z_change > p_z_change or (p_z_change - z_change) < 1:
            gamma = gamma / 2.
        p_z_change = z_change

        if not joint:
            design_matrix = replace_design_latent(design_matrix=design_matrix,
                                                  indicators=indicators,
                                                  z=z)
        else:  # Is joint
            design_matrix = replace_design_joint_latent(
                joint_design_matrix=design_matrix, indicators=indicators, jz=z)

        if show:
            print(
                "Expectation Optimization Iteration: %d Latent Change: %.8f" %
                (iterations, z_change))

    if iterations == max_iter:
        print(
            "Maximum iterations (%d) reached for termination of expectation optimization"
            % max_iter)

    finish = datetime.datetime.now()
    time_taken = (finish - start).total_seconds() / 60.
    if show:
        print("Time taken (minutes): %.5f" % time_taken)

    return z, gradient_stds
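margin_model_derivative_z is likewise not reproduced here. For intuition, the sketch below shows the gradient's shape under the implied Gaussian margin model, y ~ N(intercept + x·beta, sigma^2), where each latent rating enters a game's prediction through its away or home coefficient; the MAP branch adds the Gaussian prior term. This is an illustrative stand-in under those assumptions, not the source implementation (it omits the second derivative and the gradient standard deviations the real function returns):

import numpy as np

def margin_gradient_sketch(residuals, indicators, away_coef, home_coef,
                           sigma2, z, prior_means, prior_vars, weights,
                           MAP=False):
    # residuals: (n, 1) actual minus predicted margins; indicators: (n, 2)
    # away/home team indices; z, prior_means, prior_vars: (K, 1); weights: (n,)
    grad = np.zeros_like(z, dtype=float)
    contrib = weights * residuals[:, 0] / sigma2
    # Each game contributes coef * weight * (y - pred) / sigma^2 to its two teams
    np.add.at(grad[:, 0], indicators[:, 0], away_coef * contrib)
    np.add.at(grad[:, 0], indicators[:, 1], home_coef * contrib)
    if MAP:
        # MAP adds the gradient of the Gaussian prior log-density on z
        grad -= (z - prior_means) / prior_vars
    return grad

With this shape, the z += gamma * np.array(z_gradient).reshape(-1, 1) update above is plain gradient ascent on the log-likelihood (or on the log-posterior when MAP=True).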
Example #6
iterations = 0
while change > tol:
    start_acc = lm.score(X=old_x, y=y)
    # Expectation step: optimize the latent team variables given the fixed model parameters
    new_z = gd.latent_margin_gradient_descent(response=y,
                                              design_matrix=old_x,
                                              param_vector=param_vector,
                                              indicators=indicators,
                                              weights=np.ones(len(y)).reshape(
                                                  (-1, 1)),
                                              z=new_z,
                                              prior_means=p_means,
                                              prior_vars=p_vars,
                                              MAP=False,
                                              show=False)
    new_x = de.replace_design_latent(design_matrix=old_x,
                                     indicators=indicators,
                                     z=new_z)
    # Internal check to make sure gradient descent is improving the model
    finish_r2 = lm.score(X=new_x, y=y)
    # Maximization step: refit the linear model given the updated latent variables
    lm.fit(X=new_x, y=y)
    param_vector = np.append(arr=lm.coef_, values=np.std(y)).reshape((-1, 1))
    new_acc = lm.score(X=new_x, y=y)
    old_x = np.copy(new_x)
    change = new_acc - start_acc
    iterations += 1
print("Finished MLE with %d iterations" % iterations)
final_acc_MLE = new_acc
final_z_MLE = np.copy(new_z)

print("-------------------------------")