Exemple #1
0
def weighted_average(models=weights, oob=True):
    """Blend model predictions with per-model weights and save the result.

    Args:
        models: Iterable of ``(model_name, weight)`` pairs.
        oob: Forwarded to ``common.load_preds`` and ``common.save_preds``.

    Returns:
        The weighted predictions summed per index value.
    """
    scaled = []
    for name, weight in models:
        model_preds = common.load_preds(name, oob)
        # Report each model's individual score before it is blended.
        print(name, common.rmse(model_preds, common.y_all))
        scaled.append(model_preds * weight)

    stacked = pd.concat(scaled)
    # Rows that share an index value are the same sample seen by different models.
    ensemble = stacked.groupby(stacked.index).sum()
    print("ensemble", common.rmse(common.y_all, ensemble))

    common.save_preds("ensemble_weighted", ensemble, oob=oob)
    return ensemble
Exemple #2
0
def weighted_average(models=weights, oob=True):
    """Compute and persist a weighted ensemble of stored predictions.

    Each entry of ``models`` is a ``(name, weight)`` pair. The predictions
    loaded for ``name`` are multiplied by ``weight``, all scaled predictions
    are concatenated, and rows sharing an index value are summed.

    Args:
        models: Iterable of ``(name, weight)`` pairs.
        oob: Passed through to ``common.load_preds`` / ``common.save_preds``.

    Returns:
        The summed, weighted predictions.
    """
    weighted_parts = []
    for model_name, share in models:
        loaded = common.load_preds(model_name, oob)
        print(model_name, common.rmse(loaded, common.y_all))
        weighted_parts.append(loaded * share)

    combined = pd.concat(weighted_parts)
    blended = combined.groupby(combined.index).sum()

    print("ensemble", common.rmse(common.y_all, blended))
    common.save_preds("ensemble_weighted", blended, oob=oob)
    return blended
def altering_minimize(X, us, vs, lamda):
    """
    Fill missing values in X by matrix factorization using alternating
    minimization.

    X is approximated by ``us @ vs.T``. Each sweep first updates ``us`` with
    ``vs`` held fixed, then updates ``vs`` with the new ``us`` held fixed.
    After every half-step the RMSE and the loss are printed. Sweeps repeat
    until the RMSE improves by less than 0.001 over a full sweep.

    Args:
        X: Ratings with missing values
        us: User factors
        vs: Movie factors
        lamda: Regularization weight passed to the update steps and ``loss``

    Returns:
        The final ``(us, vs)`` pair.
    """
    rmse_fill = np.inf
    new_us, new_vs = us, vs

    print(common.rmse(us @ vs.T, X), loss(X, us, vs, lamda))
    while True:
        rmse_old = rmse_fill

        # Half-step 1: refresh the user factors with the item factors fixed.
        new_us = update_us(X, new_vs, new_us, lamda)
        rmse_fill = common.rmse(new_us @ new_vs.T, X)
        print(rmse_fill, loss(X, new_us, new_vs, lamda))

        # Half-step 2: refresh the item factors with the new user factors fixed.
        new_vs = update_vs(X, new_vs, new_us, lamda)
        rmse_fill = common.rmse(new_us @ new_vs.T, X)
        print(rmse_fill, loss(X, new_us, new_vs, lamda))

        print()
        # Phrased as "not (improvement >= tolerance)" so that a NaN RMSE
        # (every comparison with NaN is False) stops the loop instead of
        # spinning forever.
        if not (rmse_old - rmse_fill >= 0.001):
            break

    return new_us, new_vs
def main():
    """Sweep k for the kNN model, record the RMSE per k, and plot RMSE against k.

    For every k the predictions are saved and visualised through the dataset
    object; the final figure is written under ``data.figPath``.
    """
    ## Parameters ##
    data = cmn.dataset(xSet="dist_days_time_dayOfWeek")
    data.load()
    futureMask = makeFutureMask(data.tScope, data.tTest)

    # Try many values of k
    vals = np.ceil(2 ** (np.arange(15) / 1.5))
    # The np.float alias has been removed from NumPy; builtin float is what it pointed to.
    rmse = np.zeros(shape=(len(vals)), dtype=float)
    for i, k in enumerate(vals):
        timer = cmn.timer()
        yHat = manyNearestNeighborsVector(data.xScope, data.yScope, data.xTest, k, futureMask)
        # Single-argument print(...) behaves the same on Python 2 and Python 3.
        print("k = {}\tRuntime = {:.2f}".format(k, timer.dur()))
        rmse[i] = cmn.rmse(data.yTest, yHat)
        print("\tRMSE = {:.2f}".format(rmse[i]))

        model_name = "{}NN".format(k)
        data.saveYHat(yHat, model=model_name)

        # Visualize and save the images for the model
        data.visualize(yHat, model_name)

    # Plot the historical RMSE
    clf()
    plot(vals, rmse)
    xlabel("Number of nearest points, k in kNN")
    ylabel("Root Mean Squared Error (seconds)")
    title("kNN Model, RMSE for different ks")
    savefig("{}/{}_{}_k-rmse.png".format(data.figPath, data.serviceName, data.routeName))
Exemple #5
0
def test_em():
    """Fit EM from a fresh initialisation and print its score and the fill RMSE.

    Uses the module-level ``X``, ``K``, ``seed`` and ``X_gold``. Prints the
    third value returned by ``em.run`` and then the RMSE between the filled
    matrix and ``X_gold``.
    """
    fitted, _, c = em.run(X, *common.init(X, K, seed))

    filled = em.fill_matrix(X, fitted)
    print(c)
    print(common.rmse(filled, X_gold))
Exemple #6
0
def best_weights(models):
    """Search for ensemble weights that minimise the RMSE against ``common.y_all``.

    The predictions of every model are loaded (and their individual RMSE
    printed). SLSQP is then run from five random starting points, with the
    weights bounded to [0, 1] and constrained to sum to 1. The score and
    weights found by each run are printed; nothing is returned.

    Args:
        models: Iterable of model names understood by ``common.load_preds``.
    """
    predictions = []
    for name in models:
        loaded = common.load_preds(name)
        predictions.append(loaded)
        print(name, common.rmse(common.y_all, loaded))

    def objective_fun(weights):
        # RMSE of the weighted sum of all model predictions.
        blended = sum(w * p for w, p in zip(weights, predictions))
        return common.rmse(common.y_all, blended)

    # Constraints and a different solver as suggested by user 16universe:
    # https://kaggle2.blob.core.windows.net/forum-message-attachments/75655/2393/otto%20model%20weights.pdf?sv=2012-02-12&se=2015-05-03T21%3A22%3A17Z&sr=b&sp=r&sig=rkeA7EJC%2BiQ%2FJ%2BcMpcA4lYQLFh6ubNqs2XAkGtFsAv0%3D
    sum_to_one = {'type': 'eq', 'fun': lambda w: 1 - sum(w)}
    # Every weight is bound between 0 and 1.
    unit_bounds = [(0, 1)] * len(predictions)

    for _ in range(5):
        start = np.random.random(len(models))
        start /= start.sum()

        result = minimize(
            objective_fun,
            start,
            method='SLSQP',
            bounds=unit_bounds,
            constraints=sum_to_one,
        )

        print('score: {best_score}'.format(best_score=result['fun']))
        print('weights: {weights}'.format(weights=result['x']))
def main():
    """Sweep rho for the kernel regression model and plot RMSE against rho.

    For every rho the predictions are saved and visualised through the
    dataset object; the final figure is written under ``data.figPath``.
    """
    ## Parameters ##
    data = cmn.dataset(xSet="dist_days_time_dayOfWeek")
    data.load(Nparts=10)
    futureMask = makeFutureMask(data.tScope, data.tTest)

    # Try many values of rho (powers of two)
    vals = np.ceil(2 ** np.arange(10))
    # The np.float alias has been removed from NumPy; builtin float is what it pointed to.
    rmse = np.zeros(shape=(len(vals)), dtype=float)
    for i, rho in enumerate(vals):
        timer = cmn.timer()
        yHat = regress(data.xScope, data.yScope, data.xTest, rho, futureMask)
        # Single-argument print(...) behaves the same on Python 2 and Python 3.
        print("rho = {}\tRuntime = {:.2f}".format(rho, timer.dur()))
        rmse[i] = cmn.rmse(data.yTest, yHat)
        print("\tRMSE = {:.2f}".format(rmse[i]))

        model_name = "kernel_{}rho".format(rho)
        data.saveYHat(yHat, model=model_name)

        # Visualize and save the images for the model
        data.visualize(yHat, model_name)

    # Plot the historical RMSE
    clf()
    plot(vals, rmse)
    xlabel("rho Paramater")
    ylabel("Root Mean Squared Error (seconds)")
    title("Kernel Regression Model, RMSE for different rhos")
    savefig("{}/{}_{}_kernel_rho-rmse.png".format(data.figPath, data.serviceName, data.routeName))
Exemple #8
0
def average(models, oob=True):
    """Average the stored predictions of several models and save the result.

    Args:
        models: Iterable of model names understood by ``common.load_preds``.
        oob: Forwarded to ``common.load_preds`` and ``common.save_preds``.

    Returns:
        The mean prediction per index value.
    """
    dfs = [common.load_preds(m, oob) for m in models]
    df = pd.concat(dfs)
    # Rows that share an index value are the same sample seen by different models.
    df = df.groupby(df.index).mean()
    # Forward ``oob`` so the ensemble is saved under the same flag the inputs
    # were loaded with, as weighted_average does.
    common.save_preds("ensemble_average", df, oob=oob)
    print("ensemble", common.rmse(common.y_all, df))
    return df
Exemple #9
0
def average(models, oob=True):
    """Build an unweighted ensemble by averaging stored model predictions.

    Args:
        models: Iterable of model names understood by ``common.load_preds``.
        oob: Forwarded to ``common.load_preds`` and ``common.save_preds``.

    Returns:
        The mean prediction per index value.
    """
    stacked = pd.concat([common.load_preds(m, oob) for m in models])
    df = stacked.groupby(stacked.index).mean()
    # Pass ``oob`` on when saving; previously it was only used for loading, so
    # the saved ensemble ignored the caller's choice.
    common.save_preds("ensemble_average", df, oob=oob)
    print("ensemble", common.rmse(common.y_all, df))
    return df
Exemple #10
0
def best_weights(models):
    """Print ensemble weights found by constrained optimisation.

    Loads the predictions of each model, prints each model's RMSE against
    ``common.y_all``, then runs SLSQP five times from random normalised
    starting weights. Weights are bounded to [0, 1] and must sum to 1.
    Each run prints its best score and weights; nothing is returned.

    Args:
        models: Iterable of model names understood by ``common.load_preds``.
    """
    predictions = []
    for model_name in models:
        model_preds = common.load_preds(model_name)
        predictions.append(model_preds)
        print(model_name, common.rmse(common.y_all, model_preds))

    def objective_fun(weights):
        # Score of the weighted combination of all loaded predictions.
        combined = 0
        for share, model_preds in zip(weights, predictions):
            combined = combined + share * model_preds
        return common.rmse(common.y_all, combined)

    n_models = len(models)
    for _attempt in range(5):
        # Random starting point, normalised so the weights sum to 1.
        x0 = np.random.random(n_models)
        x0 /= x0.sum()

        # Constraints and a different solver as suggested by user 16universe:
        # https://kaggle2.blob.core.windows.net/forum-message-attachments/75655/2393/otto%20model%20weights.pdf?sv=2012-02-12&se=2015-05-03T21%3A22%3A17Z&sr=b&sp=r&sig=rkeA7EJC%2BiQ%2FJ%2BcMpcA4lYQLFh6ubNqs2XAkGtFsAv0%3D
        equality = {'type': 'eq', 'fun': lambda w: 1 - sum(w)}
        # Weights are bound between 0 and 1.
        limits = [(0, 1)] * len(predictions)

        outcome = minimize(objective_fun, x0, method='SLSQP',
                           bounds=limits, constraints=equality)

        print('score: {best_score}'.format(best_score=outcome['fun']))
        print('weights: {weights}'.format(weights=outcome['x']))
def main():
    """Build a prediction file from smoothed trajectories and report its error.

    Steps, as driven by the command-line arguments:
      1. Load the pickled dict of smoothed trajectories and post-process it
         with ``learnfilter.addsmooth``.
      2. Compute, per aircraft, the time intervals to keep: either the full
         time span (no model / no coverage given) or the intervals selected
         by ``compute_dicot_from_model``.
      3. Build the predictions, print the point count and RMSE statistics of
         the haversine distance, and optionally write the merged CSV.
    """
    parser = ArgumentParser()
    add_args(parser)
    args = parser.parse_args()
    # NOTE(review): pickle.load can execute arbitrary code; only feed it trusted files.
    # The context manager closes the handle, which the bare open() call never did.
    with open(args.inputfile, 'rb') as f:
        dsmoothedTraj = pickle.load(f)
    learnfilter.addsmooth(dsmoothedTraj)
    fdf, edf, m = common.loadall(args.pbname)
    if args.model == '' or args.coverage is None:
        # compute valid time intervals for each aircraft, without filtering
        dicot = {
            aircraft: [(np.min(smo.trajff.timeAtServer), np.max(smo.trajff.timeAtServer))]
            for aircraft, smo in dsmoothedTraj.items()
        }
    else:
        # compute valid time intervals for each aircraft with filtering using the model
        with open(args.model, 'rb') as f:
            model = pickle.load(f)
        vdf = pd.read_csv("./Data/{0}_result/{0}_result.csv".format(args.pbname))
        tokeep = ceil(vdf.shape[0] * args.coverage)
        print("# of points to keep", tokeep)
        # Only trajectories that still contain points are handed to the model.
        nonempty = {k: smo for (k, smo) in dsmoothedTraj.items() if smo.trajff.shape[0] > 0}
        dicot = compute_dicot_from_model(edf, nonempty, model, tokeep, args.min_continuous_to_keep)
    print("compute prediction")
    pred = build_predictionfile(edf, dsmoothedTraj, dicot)
    print("compute distance")
    d = common.haversine_distance(pred.loc[:, latn].values, pred.loc[:, lonn].values,
                                  pred.latitude.values, pred.longitude.values)
    print(d.shape[0], common.rmse(d), common.rmse90(d))
    if args.outputfile != '':
        print("writing prediction file")
        pred = pred.drop(columns=["longitude", "latitude"])
        df = merge_with_result(pred, args.pbname)
        # "longitude==longitude" is False only for NaN, so this counts rows with a prediction.
        print("actual coverage", df.query("longitude==longitude").shape[0] / df.shape[0])
        df.to_csv(args.outputfile, float_format="%.12f", index=False)
def run_matrix_completion():
    """Fit EM with K=12 and seed=1 on ``X``, fill it, and print the RMSE.

    The filled matrix is compared against the matrix loaded from
    'netflix_complete.txt'.
    """
    n_components, rng_seed = 12, 1
    fitted, _, _ = em.run(X, *common.init(X, n_components, rng_seed))
    completed = em.fill_matrix(X, fitted)
    reference = np.loadtxt('netflix_complete.txt')
    print("RMSE:", common.rmse(reference, completed))
Exemple #13
0
def test_k12():
    """Run EM with K=12 for seeds 0-4 and score the best run.

    The run whose last returned value is largest is selected; its mixture is
    used to fill ``X``.

    Returns:
        The RMSE between the filled matrix and ``X_gold``.
    """
    runs = []
    for seed in range(5):
        print(seed)
        runs.append(em.run(X, *common.init(X, 12, seed)))

    # Each run is a 3-tuple; pick the one with the largest final element.
    best_mixture, _, _ = max(runs, key=lambda run: run[-1])
    return common.rmse(em.fill_matrix(X, best_mixture), X_gold)
def main():
    """Run the "onTime" baseline once, report runtime and RMSE, and save the results.

    The previous version referenced ``vals``, ``i`` and ``k``, none of which
    are defined in this function (leftovers from a parameter-sweep variant),
    so it raised NameError. A single run needs only a scalar RMSE and a fixed
    model name.
    """
    ## Parameters ##
    data = cmn.dataset(xSet="dist_days_time_dayOfWeek")
    data.load(Nparts=10)
    futureMask = makeFutureMask(data.tScope, data.tTest)

    timer = cmn.timer()
    yHat = regress(data.xScope, data.yScope, data.xTest, 1, futureMask)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("onTime\tRuntime = {:.2f}".format(timer.dur()))
    rmse = cmn.rmse(data.yTest, yHat)
    print("\tRMSE = {:.2f}".format(rmse))
    data.saveYHat(yHat, model="onTime")

    # Visualize and save the images for the model
    data.visualize(yHat, "onTime")
def main():
    """Run the vectorized kNN model with k = 10, report runtime and RMSE, and save the results."""
    ## Parameters ##
    data = cmn.dataset(xSet="dist_days_time_dayOfWeek")
    data.load()
    k = 10
    futureMask = makeFutureMask(data.tScope, data.tTest)

    timer = cmn.timer()
    yHat = manyNearestNeighborsVector(data.xScope, data.yScope, data.xTest, k, futureMask)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Vectorized Runtime = {:.2f}".format(timer.dur()))

    print("RMSE = {:.2f}".format(cmn.rmse(data.yTest, yHat)))

    model_name = "{}NN".format(k)
    data.saveYHat(yHat, model=model_name)

    # Visualize and save the images for the model
    data.visualize(yHat, model_name)
Exemple #16
0
#         gaussian, post, new_ll = kmeans.run(X, gaussian, post)
#         common.plot(X, gaussian, post, "K-means: number of classes{}, random seed {}".format(k, i))
#
# for k in range(1, 5, 1):
#     for i in range(1):
#         gaussian, post = common.init(X, k, seed=i)
#         gaussian, post, new_ll = naive_em.run(X, gaussian, post)
#         common.plot(X, gaussian, post, "EM: number of classes{}, random seed {}".format(k, i))

# Ratings matrix to be completed, and the complete matrix used for scoring.
X = np.loadtxt("netflix_incomplete.txt")
X_gold = np.loadtxt('netflix_complete.txt')
# for k in [1, 12]:
#     for i in range(5):
#         gaussian, post = common.init(X, k, seed=i)
#         gaussian, post, new_ll = em.run(X, gaussian, post)
#         print("EM: number of classes {}, random seed {}:".format(k, i))
#         print(new_ll)

# Fit EM with 12 components from seed 1, fill X, and print the RMSE.
gaussian, post, new_ll = em.run(X, *common.init(X, 12, seed=1))
X_pred = em.fill_matrix(X, gaussian)
print(common.rmse(X_gold, X_pred))

# for k in range(1, 5, 1):
#     for i in range(5):
#         gaussian, post = common.init(X, k, seed=i)
#         gaussian, post, new_ll = naive_em.run(X, gaussian, post)
#         print("BIC = {} for K = {} and seed = {}".format(common.bic(X, gaussian, new_ll), k, i))
#
#
Exemple #17
0
 def objective_fun(weights):
     """Return the RMSE of the weighted sum of ``predictions`` against ``common.y_all``."""
     blended = sum(w * p for w, p in zip(weights, predictions))
     return common.rmse(common.y_all, blended)
Exemple #18
0
 def objective_fun(weights):
     """Score a weight vector: RMSE between ``common.y_all`` and the weighted predictions."""
     combined = 0
     for share, model_preds in zip(weights, predictions):
         combined = combined + share * model_preds
     return common.rmse(common.y_all, combined)
def _sweep_model(label, regress, params, name_fmt, data, futureMask):
    """Run ``regress`` once per parameter, print a table row, save and visualise each result.

    Returns the array of RMSE values, one per entry of ``params``.
    """
    # The np.float alias has been removed from NumPy; builtin float is what it pointed to.
    rmses = np.zeros(shape=(len(params)), dtype=float)
    row = label + "\t{: 5.0f}\t{:.2f}\t{:.2f}"
    for i, param in enumerate(params):
        timer = cmn.timer()
        yHat = regress(data.xScope, data.yScope, data.xTest, param, futureMask)
        runtime = timer.dur()
        rmses[i] = cmn.rmse(data.yTest, yHat)
        print(row.format(param, runtime, rmses[i]))

        model_name = name_fmt.format(param)
        data.saveYHat(yHat, model=model_name)
        # Visualize and save the images for the model
        data.visualize(yHat, model_name)
    return rmses


def _plot_rmse(params, rmses, x_label, plot_title, path):
    """Plot RMSE against the swept parameter and save the figure to ``path``."""
    clf()
    plot(params, rmses)
    xlabel(x_label)
    ylabel("Root Mean Squared Error (seconds)")
    title(plot_title)
    savefig(path)


def main():
    """Compare the onTime, kNN and kernel models on one dataset.

    Prints a tab-separated table (model, parameter, runtime, RMSE), saves and
    visualises every set of predictions, and writes one RMSE-vs-parameter
    figure each for kNN and kernel regression.
    """
    ## Parameters ##
    xSet = "dist_days_time_dayOfWeek"
    data = cmn.dataset(xSet=xSet)
    data.load(Nparts=10, N_points=4000)
    futureMask = makeFutureMask(data.tScope, data.tTest, futureTime=-30 * 60)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Model\t{}\t{}\t{}".format("Param", "RunTime", "RMSE"))

    # Predict on time
    timer = cmn.timer()
    yHat = onTime.regress(data.xScope, data.yScope, data.xTest, 1, futureMask)
    time_onTime = timer.dur()
    rmse_onTime = cmn.rmse(data.yTest, yHat)
    print("onTime\t\t{:.2f}\t{:.2f}".format(time_onTime, rmse_onTime))
    data.saveYHat(yHat, model="onTime")

    # Visualize and save the images for the model
    data.visualize(yHat, "onTime")

    ## kNN: try many values of k
    ks = np.ceil(2 ** (np.arange(15) / 1.5))
    rmse_kNN = _sweep_model("kNN", kNN.regress, ks, "{}NN", data, futureMask)
    _plot_rmse(
        ks, rmse_kNN,
        "Number of nearest points, k in kNN",
        "kNN Model, RMSE for different ks",
        "{}/{}_{}_k-rmse.png".format(data.figPath, data.serviceName, data.routeName),
    )

    ## Kernel: try many values of rho
    rhos = np.ceil(10 ** (np.arange(12) / 2.0))
    rmse_kernel = _sweep_model("kernel", kernel.regress, rhos, "kernel_{}rho", data, futureMask)
    _plot_rmse(
        rhos, rmse_kernel,
        "rho Paramater",
        "Kernel Regression Model, RMSE for different rhos",
        "{}/{}_{}_kernel_rho-rmse.png".format(data.figPath, data.serviceName, data.routeName),
    )
Exemple #20
0
def main():
    """Filter and smooth each aircraft's trajectory and optionally pickle the result.

    For every aircraft with more than ``MIN_REQUIRED_NB`` points:
      * points rejected by ``filter_error`` (multilateration error) are dropped;
      * ``filter_speedlimit`` then selects the points satisfying the speed limit;
      * a ``SmoothedTraj`` is fitted on what remains and stored per aircraft.
    When a problem name is given, each smoothed trajectory is also compared
    with ``comparewithtrue`` and RMSE statistics are printed.
    """
    parser = ArgumentParser()
    add_args(parser)
    args = parser.parse_args()
    # NOTE(review): pickle.load can execute arbitrary code; only feed it trusted files.
    # The context manager closes the handle, which the bare open() call never did.
    with open(args.inputfile, 'rb') as f:
        ds = pickle.load(f)
    aircrafts = ds.aircraft.unique()
    if args.pbname != '':
        fdf, edf, m = common.loadall(args.pbname)
        vdftrue = None if args.ts else pd.read_csv(
            "./Data/{}_result/{}_result.csv".format(args.pbname, args.pbname))
    d = {}
    ld = []
    for aircraft in aircrafts:
        print("aircraft", aircraft)
        traj = ds.query("aircraft==" + str(aircraft)).reset_index(drop=True)
        if traj.shape[0] <= MIN_REQUIRED_NB:
            continue

        # discard points with a large multilateration error
        filterror = filter_error(traj.error.values, args.thr_error)
        trajf = traj.loc[filterror]
        if trajf.shape[0] <= MIN_REQUIRED_NB:
            continue

        # keep the longest sequence satisfying speed constraints
        filtspeed = filter_speedlimit(trajf.nnpredlatitude.values,
                                      trajf.nnpredlongitude.values,
                                      trajf.timeAtServer.values, 0.,
                                      args.speed_limit)
        trajff = trajf.loc[filtspeed]
        # NOTE(review): drawtrue and dsmoothraw are computed but never read.
        drawtrue = common.haversine_distance(
            trajff.latitude, trajff.longitude,
            trajff.nnpredlatitude.values,
            trajff.nnpredlongitude.values)
        smoothedtraj = SmoothedTraj(trajff, args.smooth)
        t = trajff.timeAtServer.values
        slat, slon = smoothedtraj.predict(t)
        dsmoothraw = common.haversine_distance(
            slat, slon, trajff.nnpredlatitude.values,
            trajff.nnpredlongitude.values)
        tmin = np.min(t)
        tmax = np.max(t)
        if args.pbname != '':
            # Restrict the evaluation points to the time span covered by the
            # smoothed trajectory.
            traje = (edf.query("aircraft==" + str(aircraft))
                     .query(str(tmin) + "<=timeAtServer")
                     .query("timeAtServer<=" + str(tmax))
                     .reset_index(drop=True))
            dsmoothtrue = comparewithtrue(traje, smoothedtraj, vdftrue)
            ld.append(dsmoothtrue)
            print(common.rmse(ld[-1]), common.rmse90(ld[-1]),
                  common.rmse50(ld[-1]))
        print(traj.shape, trajff.shape)
        d[aircraft] = smoothedtraj

    if len(ld) > 0:
        dsmoothtrue = np.concatenate(ld)
        print(dsmoothtrue.shape[0], common.rmse(dsmoothtrue),
              common.rmse90(dsmoothtrue))
        # Statistics restricted to the smallest 60% (plus one) of the distances.
        e = np.sort(dsmoothtrue,
                    axis=None)[:int(dsmoothtrue.shape[0] * 0.6) + 1]
        print(e.shape[0], common.rmse(e), common.rmse90(e))
    if args.outputfile != '':
        # save dict[aircraft]=SmoothedTraj
        with open(args.outputfile, 'wb') as f:
            pickle.dump(d, f)
Exemple #21
0
    for seed in seeds:
        mixture, post = common.init(X, K=K, seed=seed)
        mixture, post, log_likelihood = em.run(X, mixture, post)
        print(K, seed, log_likelihood)

# =============================================================================
# Completing missing entries
# =============================================================================

# Small test case: fit EM with K=4, seed=0, fill the matrix and score it.
X = np.loadtxt("test_incomplete.txt")
X_gold = np.loadtxt("test_complete.txt")

mixture, post, log_likelihood = em.run(X, *common.init(X, K=4, seed=0))
X_pred = em.fill_matrix(X, mixture)
RMSE = common.rmse(X_gold, X_pred)
print(X_pred, RMSE)

# =============================================================================
# Comparing with gold targets
# =============================================================================

# Netflix data: same procedure with K=12, seed=1; only the RMSE is printed.
X = np.loadtxt("netflix_incomplete.txt")
X_gold = np.loadtxt('netflix_complete.txt')

mixture, post, log_likelihood = em.run(X, *common.init(X, K=12, seed=1))
X_pred = em.fill_matrix(X, mixture)
RMSE = common.rmse(X_gold, X_pred)
print(RMSE)
    return np.array(new_vs)


def loss(X, us, vs, l):
    """Return the regularized squared-error loss of the factorization ``us @ vs.T``.

    Entries of ``X`` equal to 0 are treated as missing: the reconstruction is
    zeroed there, so those entries contribute nothing to the error term.

    Args:
        X: Matrix being factorized (0 marks a missing entry).
        us: Left factor matrix.
        vs: Right factor matrix.
        l: Regularization weight.

    Returns:
        Half the summed squared error on observed entries, plus ``l / 2``
        times the sum of squared entries of ``us`` and ``vs``.
    """
    observed = X != 0
    residual = X - (us @ vs.T) * observed
    squared_error = np.sum(residual ** 2) / 2
    penalty = (np.sum(us ** 2) + np.sum(vs ** 2)) * l / 2
    return squared_error + penalty


if __name__ == "__main__":
    # Factorize X for ranks 1..10 from random integer-valued starts and
    # collect the RMSE of each completed matrix against X_gold.
    test_X = X
    num_u, num_i = test_X.shape
    is_missing = test_X == 0
    # us, s, vs = svds(test_X)
    # us, vs = altering_minimize(test_X, us, vs, 1)
    rmse = []
    for k in range(1, 11):
        us = np.random.randint(1, 6, (num_u, k)).astype('float')
        vs = np.random.randint(1, 6, (num_i, k)).astype('float')
        for l in (0,):
            us, vs = altering_minimize(test_X, us, vs, l)
            x_pre_raw = us @ vs.T
            # Keep observed entries as they are; use the reconstruction only
            # where X was missing.
            x_pred = x_pre_raw * is_missing + (test_X * (~is_missing))
            rmse.append(common.rmse(x_pred, X_gold))

    print(rmse)
Exemple #23
0
        em_mix, em_post, em_ll = naive_em.run(X, init_mix, init_post)
        if k_cost < k_best_cost:
            k_best_mix, k_best_post, k_best_cost = k_mix, k_post, k_cost
        if em_ll > em_best_ll:
            em_best_mix, em_best_post, em_best_ll = em_mix, em_post, em_ll
    BICs[i] = common.bic(X, em_best_mix, em_best_ll)
    common.plot(X, k_best_mix, k_best_post, "K-means K={}".format(K))
    common.plot(X, em_best_mix, em_best_post, "EM K={}".format(K))

# Summarise the BIC sweep: all scores, the best score, and the K that achieved it.
print("BICs: ", BICs)
print("Best BIC: ", np.max(BICs))
print("Best K: ", Ks[np.argmax(BICs)])

X = np.loadtxt("netflix_incomplete.txt")

K = 12
seeds = [0, 1, 2, 3, 4]

# Keep the EM run with the highest log-likelihood across the seeds.
em_best_mix, em_best_post, em_best_ll = None, None, -np.inf
for seed in seeds:
    init_mix, init_post = common.init(X, K, seed)
    em_mix, em_post, em_ll = em.run(X, init_mix, init_post)
    if em_ll > em_best_ll:
        em_best_mix, em_best_post, em_best_ll = em_mix, em_post, em_ll
print("K = {}, LL = {}".format(K, em_best_ll))

X_fill_pred = em.fill_matrix(X, em_best_mix)
# The incomplete matrix above is plain text read with np.loadtxt; read the
# complete matrix the same way. np.load expects a binary .npy/.npz file and
# "netflix_complete" had no extension.
# NOTE(review): confirm the reference file is netflix_complete.txt.
X_fill = np.loadtxt("netflix_complete.txt")

print("X_filled Error:", common.rmse(X_fill_pred, X_fill))
# Posterior probabilities, one slot per seed
posts = [0] * 5

# RMS error, one slot per cluster count in K
rmse = [0., 0.]

start_time = time.perf_counter()

for k in range(len(K)):
    # Run EM once per seed (0-4) for this cluster count
    for i in range(5):
        mixtures[i], posts[i], log_lh[i] = em.run(X, *common.init(X, K[k], i))

    # Report the best log likelihood over the seeds
    print("=============== Clusters:", K[k], "======================")
    print("Highest log likelihood using EM is:", np.max(log_lh))

    # Remember which seed was best for this cluster count
    best_seed[k] = np.argmax(log_lh)

    # Fill the matrix with the best seed's mixture and score it
    X_pred = em.fill_matrix(X, mixtures[best_seed[k]])
    rmse[k] = common.rmse(X_gold, X_pred)

print("===================================================")
print("RMS Error for K = 12 is: {:.4f}".format(rmse[1]))
end_time = time.perf_counter()
print("Time taken for this run: {:.4f} seconds".format(end_time - start_time))
Exemple #25
0
# Print the naive-EM log likelihood computed earlier in the script.
print("naive EM log likelihood : " + str(naive_em_estimate))

print("############## Some Tests ######################")
# Fit naive EM on toy_X for K = 1..4, always from seed 0, keeping each
# result under its own name so the BICs can be compared below.
initialMixture, initialPost = common.init(toy_X, 1, 0)
mixtureEM1, postEM1, ll1 = naive_em.run(toy_X, initialMixture, initialPost)

initialMixture, initialPost = common.init(toy_X, 2, 0)
mixtureEM2, postEM2, ll2 = naive_em.run(toy_X, initialMixture, initialPost)

initialMixture, initialPost = common.init(toy_X, 3, 0)
mixtureEM3, postEM3, ll3 = naive_em.run(toy_X, initialMixture, initialPost)

initialMixture, initialPost = common.init(toy_X, 4, 0)
mixtureEM4, postEM4, ll4 = naive_em.run(toy_X, initialMixture, initialPost)

# BIC of each fitted mixture, one line per K.
print("BIC K1 : " + str(common.bic(toy_X, mixtureEM1, ll1)))
print("BIC K2 : " + str(common.bic(toy_X, mixtureEM2, ll2)))
print("BIC K3 : " + str(common.bic(toy_X, mixtureEM3, ll3)))
print("BIC K4 : " + str(common.bic(toy_X, mixtureEM4, ll4)))

# Netflix data: run the seed test with second argument 1 (the call with 12 is disabled).
X_netflix = np.loadtxt("netflix_incomplete.txt")
test_em_seeds(X_netflix, 1)
#test_em_seeds(X_netflix, 12)

# Matrix completion: init with arguments (12, 1), run EM, fill the
# missing entries, and score against the complete matrix.
X_gold = np.loadtxt('netflix_complete.txt')
mixture4, post4 = common.init(X_netflix, 12, 1)
mixture, post, cost4 = em.run(X_netflix, mixture4, post4)
X_pred = em.fill_matrix(X_netflix, mixture)

rmse_result = common.rmse(X_gold, X_pred)
print("RMSE between prediction and GOLD is : " + str(rmse_result))
Exemple #26
0
def run_matrix_completion():
    """Complete ``X`` with an EM-fitted mixture and print the RMSE.

    EM is initialised via ``common.init(X, 12, 1)``. The filled matrix is
    compared with the contents of 'netflix_complete.txt'.
    """
    initial_mixture, initial_post = common.init(X, 12, 1)
    fitted_mixture, _, _ = em.run(X, initial_mixture, initial_post)
    filled = em.fill_matrix(X, fitted_mixture)
    gold = np.loadtxt('netflix_complete.txt')
    print("root mean squared error:", common.rmse(gold, filled))
def _weighted_knn_predict(X, Y, T, Xp, Tp, w, k):
    """Predict one target as the mean of the k nearest earlier training targets.

    Only training rows whose time in ``T`` is below ``Tp - 30 * 60`` are
    candidates (presumably a 30-minute horizon in seconds; confirm the unit
    of T). Distance is the squared Euclidean norm of the feature difference
    scaled element-wise by ``w``. Returns 0 when fewer than ``k`` candidates
    exist.
    """
    # Limit training data to the past
    Tpast = T < (Tp - (30 * 60))
    Xpast = X[Tpast, :]
    Ypast = Y[Tpast]

    # Broadcasting replaces the explicit np.tile of Xp and w over the rows.
    diff = (Xpast - Xp) * w
    # np.sum, because the builtin sum does not accept an axis argument.
    dist = np.sum(diff ** 2, axis=1)

    if len(dist) < k:
        return 0
    return np.mean(Ypast[dist.argsort()[:k]])


def _test_rmse(X, Y, T, Xt, Yt, Tt, w, k):
    """Return the RMSE of the weighted-kNN predictions over every test point."""
    N_test = Xt.shape[0]
    Ypreds = np.zeros((N_test, 1))
    for i in range(N_test):
        Ypreds[i] = _weighted_knn_predict(X, Y, T, Xt[i, :], Tt[i], w, k)
    return cmn.rmse(Yt, Ypreds)


def main():
    """Learn per-feature kNN distance weights on the validation set and log the test RMSE.

    Starting from all-zero weights, every validation point triggers one
    update ``w <- w - eta * (x * error + w)``, where ``error`` is the kNN
    prediction error for that point. The test RMSE is printed and written to
    a results file before the first pass and after each of the ``N_passes``
    passes over the (shuffled) validation set.
    """
    ## Parameters ##
    xSet = "allfeats_normalized"  # or manyfeats...
    k = 100
    eta = 0.00001
    N_passes = 50

    # Load the Data
    data = cmn.dataset(xSet=xSet)
    data.load(Nparts=10, validation=True)

    X, Xv, Xt = data.xScope, data.xVal, data.xTest
    Y, Yv, Yt = data.yScope, data.yVal, data.yTest
    T, Tv, Tt = data.tScope, data.tVal, data.tTest
    (N_train, N_feat) = X.shape
    N_val = Xv.shape[0]

    filename = "{}/{}_{}_weightvalidation_allfeats_Ntrain{}.txt".format(
        data.resPath, data.serviceName, data.routeName, N_train)

    # One row of weights per update step; row i holds the weights used at step i.
    N_iterations = N_val * N_passes
    ws = np.zeros(shape=(N_iterations, N_feat))

    report = "eta {}\tpass {}\tw={}\trmse = {:.2f}\n"

    # The context manager closes the results file even if a pass fails.
    with open(filename, 'w') as f:
        # Preliminary test with the initial weights
        w = ws[0, :].copy()
        w.shape = (1, N_feat)
        rmse = _test_rmse(X, Y, T, Xt, Yt, Tt, w, k)
        line = report.format(eta, 0, w[0, :], rmse)
        # Print without the trailing newline; the file gets the full line.
        print(line[:-1])
        f.write(line)

        for i_pass in range(N_passes):
            # Improve the weights, visiting the validation points in random order
            randOrder = np.random.permutation(range(N_val))
            for i in range(N_val * i_pass, N_val * (i_pass + 1)):
                w = ws[i, :].copy()
                w.shape = (1, N_feat)
                i_point = randOrder[i % N_val]
                Xp = Xv[i_point, :].copy()
                Xp.shape = (1, N_feat)

                Ypred = _weighted_knn_predict(X, Y, T, Xp, Tv[i_point], w, k)
                Yerror = Yv[i_point] - Ypred

                # Recalculate weights
                w_delta = eta * (Xp * Yerror + w)
                if i < (N_iterations - 1):
                    ws[i + 1] = w - w_delta

            # Test.
            # NOTE(review): as before, this uses the weights read at the last
            # validation step of the pass, not the row written after it.
            rmse = _test_rmse(X, Y, T, Xt, Yt, Tt, w, k)
            line = report.format(eta, i_pass + 1, w[0, :], rmse)
            print(line[:-1])
            f.write(line)
Exemple #28
0
# Dump the parameters of the current mixture (means, variances, mixing weights).
print('Mu:\n' + str(mixture.mu))
print('Var: ' + str(mixture.var))
print('P: ' + str(mixture.p))
print()

# One E-step: em.estep returns the posterior and a log-likelihood value.
print("After first E-step:")
post, ll = em.estep(X, mixture)
print('post:\n' + str(post))
print('LL:' + str(ll))
print()

# One M-step from that posterior: the result unpacks into (mu, var, p).
print("After first M-step:")
mu, var, p = em.mstep(X, post, mixture)
print('Mu:\n' + str(mu))
print('Var: ' + str(var))
print('P: ' + str(p))
print()

# Full EM run, started from the original mixture and the posterior above.
print("After a run")
(mu, var, p), post, ll = em.run(X, mixture, post)
print('Mu:\n' + str(mu))
print('Var: ' + str(var))
print('P: ' + str(p))
print('post:\n' + str(post))
print('LL: ' + str(ll))
# Fill X with the fitted mixture and compare against X_gold.
X_pred = em.fill_matrix(X, common.GaussianMixture(mu, var, p))
error = common.rmse(X_gold, X_pred)
print("X_gold:\n" + str(X_gold))
print("X_pred:\n" + str(X_pred))
print("RMSE: " + str(error))
Exemple #29
0
import common

# Small test case: incomplete matrix and its complete counterpart.
X = np.loadtxt("test_incomplete.txt")
X_gold = np.loadtxt("test_complete.txt")

K = 4
n, d = X.shape
seed = 0

# TODO: Your code here

# Run EM from the (K, seed) initialisation.
mix_conv, post_conv, log_lh_conv = em.run(X, *common.init(X, K, seed))

# Fill X using the resulting mixture and score it against X_gold.
X_predict = em.fill_matrix(X, mix_conv)

rmse = common.rmse(X_gold, X_predict)

#%% Begin: Comparison of EM for matrix completion with K = 1 and 12
import time

# From here on X and X_gold refer to the Netflix data, and K becomes a list.
X = np.loadtxt("netflix_incomplete.txt")
X_gold = np.loadtxt("netflix_complete.txt")

K = [1, 12]  # Clusters to try

log_lh = [0, 0, 0, 0, 0]  # Log likelihoods for different seeds

# Best seed for cluster based on highest log likelihoods
best_seed = [0, 0]

# Mixtures for best seeds
Exemple #30
0
import numpy as np
import em
import common

# Incomplete matrix to be filled, and the matrix it is scored against.
X = np.loadtxt("netflix_incomplete.txt")
X_gold = np.loadtxt("netflix_complete.txt")

K = 12

# Run EM once per seed 0..4 and keep the three values em.run returns,
# split into parallel lists indexed by seed.
runs = [em.run(X, *common.init(X, K, s)) for s in range(5)]
mixtures = [r[0] for r in runs]
posts = [r[1] for r in runs]
log_lh = [r[2] for r in runs]

# The seed whose log_lh entry is largest provides the mixture used to fill X.
best_seed = np.argmax(log_lh)
Y = em.fill_matrix(X, mixtures[best_seed])
rmse = common.rmse(X_gold, Y)
print("RMSE for K = 12: {:.4f}".format(rmse))
def main():
    """Pick k for the nearest-neighbour predictor on a validation split.

    Loads the "traj" feature file for one route, splits its rows at random
    into train (first half), validation (third quarter) and test (remainder),
    and for each candidate k predicts, for every validation row, the value 10
    columns after a randomly drawn column ``t`` from the first ``t`` columns.
    The k with the lowest validation RMSE is then applied to the test rows.

    Side effects: prints progress, writes one ``*_val.txt`` file per k and one
    ``*_test.txt`` file under ``resPath``, and saves a k-vs-RMSE figure under
    ``figPath``.

    The body is written so that it runs under both Python 2 and Python 3
    (single-argument ``print(...)``, ``//`` for the split size, builtin
    ``float`` instead of the removed ``np.float`` alias).
    """
    ## Parameters ##
    dataPath = "/projects/onebusaway/BakerNiedMLProject/data/routefeatures"
    resPath = "/projects/onebusaway/BakerNiedMLProject/data/modelPredictions"
    figPath = "/projects/onebusaway/BakerNiedMLProject/figures/predictions"
    serviceName = "intercitytransit"
    routeName = "route13"
    xSet = "traj"
    x = np.loadtxt("{}/{}_{}_{}.txt".format(dataPath, serviceName, routeName, xSet), dtype=float)

    # Try many values of k
    # NOTE(review): np.ceil returns floats, so k is passed on (and formatted
    # into file names) as e.g. 1.0 -- kept as-is to preserve output names.
    vals = np.ceil(2 ** (np.arange(15) / 1.5))
    rmse = np.zeros(shape=(len(vals)), dtype=float)
    minK = 0
    minRMSE = 0

    # Random row split: 2/4 train, 1/4 validation, remainder test.
    sel = np.random.permutation(range(len(x)))
    split = len(x) // 4  # floor division so the value is usable as a slice bound
    xTrain = x[sel[:split * 2]]
    xVal = x[sel[split * 2:3 * split]]
    xTest = x[sel[3 * split:]]
    yTest = np.zeros(len(xVal))
    yHat = np.zeros(len(xVal))

    # Normalise with the mean / std taken over the whole matrix, then apply
    # the same row split to the normalised copy.
    theMean = x.mean()
    theStdDev = x.std()
    data_norm = (x - theMean) / theStdDev
    xTrainNorm = data_norm[sel[:split * 2]]
    xValNorm = data_norm[sel[split * 2:3 * split]]
    xTestNorm = data_norm[sel[3 * split:]]

    # Exclusive upper bound for the random column t (target is column t + 10).
    maxT = x.shape[1] - 15

    for i, k in enumerate(vals):
        model = "{}NN".format(k)
        timer = cmn.timer()

        print(xTrain.shape)
        print(xTest.shape)
        print(xVal.shape)
        for j in range(len(xVal)):
            t = np.random.randint(10, maxT)
            yTest[j] = xVal[j][t + 10]
            # Store the prediction for validation row j.  (The slot used to be
            # indexed by the outer k index i, which overwrote a single entry
            # and left the rest of yHat at zero.)
            yHat[j] = manyNearestNeighborsVector(xTrainNorm[:, :t], xTrain[:, t + 10], xValNorm[j, :t].reshape(1, t), k, weights=np.ones(t))
        print("k = {}\tRuntime = {:.2f}".format(k, timer.dur()))
        rmse[i] = cmn.rmse(yTest, yHat)
        if i == 0 or rmse[i] < minRMSE:
            minRMSE = rmse[i]
            minK = k
        print("\tRMSE = {:.2f}".format(rmse[i]))
        np.savetxt("{}/{}_{}_{}_{}_val.txt".format(resPath, serviceName, routeName, model, xSet), cmn.cmb(xVal, yTest, yHat))

    # Evaluate the selected k on the held-out test rows.
    k = minK
    model = "{}NN".format(k)
    yTest = np.zeros(len(xTest))
    yHat = np.zeros(len(xTest))
    for i in range(len(xTest)):
        t = np.random.randint(10, maxT)
        yTest[i] = xTest[i][t + 10]
        # Use the normalised features here as well, so the test evaluation
        # matches the setting in which k was selected; targets stay in the
        # original units, as in the validation loop.
        yHat[i] = manyNearestNeighborsVector(xTrainNorm[:, :t], xTrain[:, t + 10], xTestNorm[i, :t].reshape(1, t), k, weights=np.ones(t))
    np.savetxt("{}/{}_{}_{}_{}_test.txt".format(resPath, serviceName, routeName, model, xSet), cmn.cmb(xTest, yTest, yHat))

    # Plot the historical RMSE
    clf()
    plot(vals, rmse)
    xlabel("Number of nearest points, k in kNN")
    ylabel("Root Mean Squared Error (seconds)")
    title("kNN Model, RMSE for different ks")
    savefig("{}/{}_{}_k-rmse.png".format(figPath, serviceName, routeName))
def main():
    """Compare the onTime, kernel and kNN predictors on one data set.

    Loads the ``dist_days_time_dayOfWeek_normalized`` feature set through
    ``cmn.dataset``, computes a weighted squared distance between every
    scope (training) row and every test row, and then evaluates in turn:
    ``onTime`` once, ``kernel`` for 20 values of rho and ``kNN`` for 18
    values of k.  For each attempt the model label, runtime and RMSE against
    ``data.yTest`` are printed and appended as a tab-separated line to a
    results file under ``data.resPath``.

    The body runs under both Python 2 and Python 3.
    """
    ## Parameters ##
    xSet = "dist_days_time_dayOfWeek_normalized"

    # Load the Data
    data = cmn.dataset(xSet=xSet)
    data.load(Nparts=10)

    # Targets and sizes
    Y = data.yScope
    (N_train, N_feat) = data.xScope.shape
    N_test = data.xTest.shape[0]

    # Precompute the future mask
    futureMask = makeFutureMask(data.tScope, data.tTest, futureTime=-30 * 60)

    # Compute the distance matrix once (every model below uses the same one).
    # Shapes (N_train, N_feat, 1), (1, N_feat, N_test) and (1, N_feat, 1)
    # broadcast to (N_train, N_feat, N_test), giving the same values as
    # explicitly tiling each operand without materialising three full copies.
    # The timer now covers the whole distance computation.
    weights = np.array([3, 0.5, 2, 1])

    timer = cmn.timer()
    trainCol = np.reshape(data.xScope, (N_train, N_feat, 1))
    testRow = np.transpose(np.reshape(data.xTest, (N_test, N_feat, 1)), (2, 1, 0))
    featWeights = weights.reshape(1, N_feat, 1)
    dist = np.sum(((trainCol - testRow) ** 2) * featWeights, axis=1)
    print("Dist runtime: {:.2f}".format(timer.dur()))

    # Make a list of parameters to test
    rhos = 2 ** (np.arange(-10, 10) / 2.0)
    ks = np.ceil(2 ** (np.arange(18) / 1.5))

    # Allocate data containers; attempts are ordered onTime, kernels, kNNs.
    N_onTime = 1
    N_kernel = len(rhos)
    N_kNN = len(ks)
    N_attempts = N_onTime + N_kernel + N_kNN
    rmses = np.zeros(shape=(N_attempts), dtype=float)
    times = np.zeros(shape=(N_attempts), dtype=float)
    models = [""] * N_attempts

    ## Try out some models with rhos and ks
    filename = "{}/{}_{}_tests_Ntrain{}.txt".format(data.resPath, data.serviceName, data.routeName, N_train)
    # The context manager closes the results file even if a model raises.
    with open(filename, 'w') as f:
        line = "Model\t{}\t{}\t{}\n".format("Param", "RunTime", "RMSE")
        print(line[:-1])  # the line already ends in "\n"; drop it for print
        f.write(line)

        for i in range(N_attempts):
            timer = cmn.timer()

            if i < N_onTime:
                yHat = onTime(dist.shape[1])
                models[i] = "onTime\t"
            elif i < (N_kernel + N_onTime):
                rho = rhos[i - N_onTime]
                yHat = kernel(dist, futureMask, Y, rho)
                models[i] = "kernel\t{: 3.2f}".format(rho)
            else:
                k = ks[i - N_onTime - N_kernel]
                yHat = kNN(dist, futureMask, Y, k)
                models[i] = "kNN\t{: 5.0f}".format(k)

            times[i] = timer.dur()
            rmses[i] = cmn.rmse(data.yTest, yHat)
            line = "{}\t{:.2f}\t{:.2f}\n".format(models[i], times[i], rmses[i])
            print(line[:-1])
            f.write(line)
Exemple #33
0
        print('K=', k, 'seed=', seed, 'logloss=', LL)

    best_seed = np.argmax(logloss)
    logloss = logloss[best_seed]
    mixture = mixtures[best_seed]
    post = posts[best_seed]

    current_bic = common.bic(X, mixture, logloss)
    bic[j] = current_bic

    print(f'K={k}', f'Best seed={best_seed}', f'logloss={logloss}', f'BIC={current_bic}')

# Select the entry of K whose BIC value (filled in by the loop above) is
# largest, and report it.
best_K_ix = np.argmax(bic)
best_K = K[best_K_ix]
best_bic = bic[best_K_ix]
print(f"Best K={best_K}", f"BIC={best_bic}")

# -----------------------------------
# EM Algorithm for Matrix Completion
# -----------------------------------

X_gold = np.loadtxt('netflix_complete.txt')

# NOTE(review): `mixture` is whatever the last iteration of the loop above
# assigned, i.e. the best-seed mixture for the last K tried -- not
# necessarily the mixture for best_K. Confirm this is intended.
X_pred = em.fill_matrix(X, mixture)

# Error of the filled matrix relative to X_gold.
rmse = common.rmse(X_gold, X_pred)

print(f"RMSE= {rmse}")

# Dump the input matrix and the filled matrix for inspection.
print(X)
print(X_pred)
Exemple #34
0
import em
import common

# Small test matrices (X, X_gold) and the Netflix matrices used by the sweep.
X = np.loadtxt("test_incomplete.txt")
X_gold = np.loadtxt("test_complete.txt")
X_gold_netflix = np.loadtxt("netflix_complete.txt")
X_netflix = np.loadtxt("netflix_incomplete.txt")

K = 12
n, d = X.shape
seed = list(range(5))


# TODO: Your code here
# For every seed: initialise, run EM on the Netflix matrix, fill it, and
# print the seed, the third value returned by em.run, and the RMSE.
for trial_seed in seed:
    print(trial_seed)
    init_model = common.init(X_netflix, K, trial_seed)
    mixture, post, cost = em.run(X_netflix, *init_model[:2])
    X_pred = em.fill_matrix(X_netflix, mixture)
    rmse = common.rmse(X_gold_netflix, X_pred)
    print(cost)
    print(rmse)

# K= 4
# n,d = X.shape
# seed =0
# init_model = common.init(X, K, seed)
# mixture, post, cost = em.run(X, init_model[0], init_model[1])
# # print(mixture)
# X_pred = em.fill_matrix(X,mixture)
# print(X_pred)