def weighted_average(models=weights, oob=True):
    """Blend per-model predictions into a weighted-sum ensemble.

    Args:
        models: Iterable of ``(model_name, weight)`` pairs.
        oob: Forwarded to ``common.load_preds`` and ``common.save_preds``.

    Returns:
        The predictions of every model scaled by its weight and summed over
        rows that share an index label.
    """
    scaled = []
    for name, share in models:
        model_preds = common.load_preds(name, oob)
        # Report each member's own score before it is blended.
        print(name, common.rmse(model_preds, common.y_all))
        scaled.append(model_preds * share)

    stacked = pd.concat(scaled)
    blended = stacked.groupby(stacked.index).sum()
    print("ensemble", common.rmse(common.y_all, blended))
    common.save_preds("ensemble_weighted", blended, oob=oob)
    return blended
def weighted_average(models=weights, oob=True):
    """Blend per-model predictions into a weighted-sum ensemble.

    Args:
        models: Iterable of ``(model_name, weight)`` pairs.
        oob: Forwarded to ``common.load_preds`` and ``common.save_preds``.

    Returns:
        The predictions of every model scaled by its weight and summed over
        rows that share an index label.
    """
    scaled = []
    for name, share in models:
        model_preds = common.load_preds(name, oob)
        # Report each member's own score before it is blended.
        print(name, common.rmse(model_preds, common.y_all))
        scaled.append(model_preds * share)

    stacked = pd.concat(scaled)
    blended = stacked.groupby(stacked.index).sum()
    print("ensemble", common.rmse(common.y_all, blended))
    common.save_preds("ensemble_weighted", blended, oob=oob)
    return blended
def altering_minimize(X, us, vs, lamda, tol=0.001, max_iter=None):
    """Factorize ``X`` as ``us @ vs.T`` by alternating minimization.

    Each round first updates ``us`` with ``vs`` held fixed, then updates
    ``vs`` with the new ``us`` held fixed, printing the RMSE and the
    regularized loss after each half-step.

    Args:
        X: Ratings matrix with missing values.
        us: Initial user factors.
        vs: Initial movie factors.
        lamda: Regularization strength passed to ``update_us``,
            ``update_vs`` and ``loss``.
        tol: Stop once a full round improves the RMSE by less than this
            amount (default matches the previously hard-coded 0.001).
        max_iter: Optional cap on the number of rounds; ``None`` keeps the
            original "run until converged" behaviour.

    Returns:
        Tuple ``(us, vs)`` holding the factors from the last round.
    """
    new_us = us
    new_vs = vs
    rmse_fill = np.inf
    print(common.rmse(us @ vs.T, X), loss(X, us, vs, lamda))

    n_rounds = 0
    while max_iter is None or n_rounds < max_iter:
        n_rounds += 1
        rmse_old = rmse_fill

        # Half-step 1: refresh the user factors against the current vs.
        new_us = update_us(X, new_vs, new_us, lamda)
        rmse_fill = common.rmse(new_us @ new_vs.T, X)
        print(rmse_fill, loss(X, new_us, new_vs, lamda))

        # Half-step 2: refresh the movie factors against the new us.
        new_vs = update_vs(X, new_vs, new_us, lamda)
        rmse_fill = common.rmse(new_us @ new_vs.T, X)
        print(rmse_fill, loss(X, new_us, new_vs, lamda))
        print()

        # Written as "not (improvement >= tol)" so that a NaN improvement
        # (diverged factors, inf - inf) also ends the loop instead of
        # spinning forever as "nan < tol" would.
        if not (rmse_old - rmse_fill >= tol):
            break
    return new_us, new_vs
def main(): ## Parameters ## data = cmn.dataset(xSet = "dist_days_time_dayOfWeek") data.load()#N_points = 1000) futureMask = makeFutureMask(data.tScope, data.tTest) # Try many values of k vals = np.ceil(2 ** (np.arange(15) / 1.5)) rmse = np.zeros(shape=(len(vals)), dtype=np.float) for i in range(len(vals)): k = vals[i] timer = cmn.timer() yHat = manyNearestNeighborsVector(data.xScope, data.yScope, data.xTest, k, futureMask) print "k = {}\tRuntime = {:.2f}".format(k, timer.dur()) rmse[i] = cmn.rmse(data.yTest, yHat) print "\tRMSE = {:.2f}".format(rmse[i]) data.saveYHat(yHat, model = "{}NN".format(k)) # Visualize and save the images for the model data.visualize(yHat, "{}NN".format(k)) # Plot the historical RMSE clf() plot(vals, rmse) xlabel("Number of nearest points, k in kNN") ylabel("Root Mean Squared Error (seconds)") title("kNN Model, RMSE for different ks") savefig("{}/{}_{}_k-rmse.png".format(data.figPath, data.serviceName, data.routeName))
def test_em():
    """Run EM once from ``common.init`` and print its third result and the RMSE.

    Reads the module-level ``X``, ``K``, ``seed`` and ``X_gold``.
    """
    start_mixture, start_post = common.init(X, K, seed)
    fitted_mixture, _, run_value = em.run(X, start_mixture, start_post)
    filled = em.fill_matrix(X, fitted_mixture)
    print(run_value)
    print(common.rmse(filled, X_gold))
def best_weights(models):
    """Search for ensemble weights that minimize the blended RMSE.

    Loads the predictions of every model, then runs five SLSQP restarts from
    random starting points with weights bounded to [0, 1] and constrained to
    sum to 1. The score and weights of every restart are printed.

    Args:
        models: Sequence of model names understood by ``common.load_preds``.

    Returns:
        The weight vector of the restart with the lowest objective value.
        (Earlier versions only printed the results and returned ``None``.)
    """
    predictions = []
    for m in models:
        preds = common.load_preds(m)
        predictions.append(preds)
        print(m, common.rmse(common.y_all, preds))

    def objective_fun(weights):
        """RMSE of the weighted sum of the loaded predictions."""
        final_prediction = 0
        for weight, prediction in zip(weights, predictions):
            final_prediction += weight * prediction
        return common.rmse(common.y_all, final_prediction)

    # adding constraints and a different solver as suggested by user 16universe
    # https://kaggle2.blob.core.windows.net/forum-message-attachments/75655/2393/otto%20model%20weights.pdf?sv=2012-02-12&se=2015-05-03T21%3A22%3A17Z&sr=b&sp=r&sig=rkeA7EJC%2BiQ%2FJ%2BcMpcA4lYQLFh6ubNqs2XAkGtFsAv0%3D
    # Both are identical for every restart, so build them once.
    cons = ({'type': 'eq', 'fun': lambda w: 1 - sum(w)})
    # our weights are bound between 0 and 1
    bounds = [(0, 1)] * len(predictions)

    best = None
    for _ in range(5):
        starting_values = np.random.random(len(models))
        starting_values /= starting_values.sum()

        res = minimize(objective_fun, starting_values, method='SLSQP',
                       bounds=bounds, constraints=cons)

        print('score: {best_score}'.format(best_score=res['fun']))
        print('weights: {weights}'.format(weights=res['x']))

        # Keep the restart with the lowest RMSE so callers can use it.
        if best is None or res['fun'] < best['fun']:
            best = res
    return best['x']
def main(): ## Parameters ## data = cmn.dataset(xSet = "dist_days_time_dayOfWeek") data.load(Nparts = 10)#N_points = 1000) futureMask = makeFutureMask(data.tScope, data.tTest) # Try many values of k vals = np.ceil(2 ** np.arange(10)) rmse = np.zeros(shape=(len(vals)), dtype=np.float) for i in range(len(vals)): k = vals[i] timer = cmn.timer() yHat = regress(data.xScope, data.yScope, data.xTest, k, futureMask) print "rho = {}\tRuntime = {:.2f}".format(k, timer.dur()) rmse[i] = cmn.rmse(data.yTest, yHat) print "\tRMSE = {:.2f}".format(rmse[i]) data.saveYHat(yHat, model = "kernel_{}rho".format(k)) # Visualize and save the images for the model data.visualize(yHat, "kernel_{}rho".format(k)) # Plot the historical RMSE clf() plot(vals, rmse) xlabel("rho Paramater") ylabel("Root Mean Squared Error (seconds)") title("Kernel Regression Model, RMSE for different rhos") savefig("{}/{}_{}_kernel_rho-rmse.png".format(data.figPath, data.serviceName, data.routeName))
def average(models, oob=True):
    """Blend per-model predictions into an unweighted mean ensemble.

    Args:
        models: Iterable of model names understood by ``common.load_preds``.
        oob: Forwarded to ``common.load_preds`` and ``common.save_preds``.

    Returns:
        The mean of all models' predictions over rows that share an index
        label.
    """
    dfs = [common.load_preds(m, oob) for m in models]
    df = pd.concat(dfs)
    df = df.groupby(df.index).mean()
    # Forward ``oob`` so the ensemble is saved under the same setting the
    # inputs were loaded with, as the weighted ensemble already does;
    # previously the saver's default was always used.
    common.save_preds("ensemble_average", df, oob=oob)
    print("ensemble", common.rmse(common.y_all, df))
    return df
def average(models, oob=True):
    """Blend per-model predictions into an unweighted mean ensemble.

    Args:
        models: Iterable of model names understood by ``common.load_preds``.
        oob: Forwarded to ``common.load_preds`` and ``common.save_preds``.

    Returns:
        The mean of all models' predictions over rows that share an index
        label.
    """
    dfs = [common.load_preds(m, oob) for m in models]
    df = pd.concat(dfs)
    df = df.groupby(df.index).mean()
    # Forward ``oob`` so the ensemble is saved under the same setting the
    # inputs were loaded with, as the weighted ensemble already does;
    # previously the saver's default was always used.
    common.save_preds("ensemble_average", df, oob=oob)
    print("ensemble", common.rmse(common.y_all, df))
    return df
def best_weights(models):
    """Search for ensemble weights that minimize the blended RMSE.

    Loads the predictions of every model, then runs five SLSQP restarts from
    random starting points with weights bounded to [0, 1] and constrained to
    sum to 1. The score and weights of every restart are printed.

    Args:
        models: Sequence of model names understood by ``common.load_preds``.

    Returns:
        The weight vector of the restart with the lowest objective value.
        (Earlier versions only printed the results and returned ``None``.)
    """
    predictions = []
    for m in models:
        preds = common.load_preds(m)
        predictions.append(preds)
        print(m, common.rmse(common.y_all, preds))

    def objective_fun(weights):
        """RMSE of the weighted sum of the loaded predictions."""
        final_prediction = 0
        for weight, prediction in zip(weights, predictions):
            final_prediction += weight * prediction
        return common.rmse(common.y_all, final_prediction)

    # adding constraints and a different solver as suggested by user 16universe
    # https://kaggle2.blob.core.windows.net/forum-message-attachments/75655/2393/otto%20model%20weights.pdf?sv=2012-02-12&se=2015-05-03T21%3A22%3A17Z&sr=b&sp=r&sig=rkeA7EJC%2BiQ%2FJ%2BcMpcA4lYQLFh6ubNqs2XAkGtFsAv0%3D
    # Both are identical for every restart, so build them once.
    cons = ({'type': 'eq', 'fun': lambda w: 1 - sum(w)})
    # our weights are bound between 0 and 1
    bounds = [(0, 1)] * len(predictions)

    best = None
    for _ in range(5):
        starting_values = np.random.random(len(models))
        starting_values /= starting_values.sum()

        res = minimize(objective_fun, starting_values, method='SLSQP',
                       bounds=bounds, constraints=cons)

        print('score: {best_score}'.format(best_score=res['fun']))
        print('weights: {weights}'.format(weights=res['x']))

        # Keep the restart with the lowest RMSE so callers can use it.
        if best is None or res['fun'] < best['fun']:
            best = res
    return best['x']
def main():
    """Build aircraft position predictions from smoothed trajectories.

    Parses the command line, loads the pickled per-aircraft smoothed
    trajectories, determines for each aircraft the time intervals in which
    predictions are kept (either the full observed span, or intervals
    selected with a pickled model and a target coverage), prints distance
    error statistics and optionally writes the prediction CSV.
    """
    parser = ArgumentParser()
    add_args(parser)
    args = parser.parse_args()

    # NOTE(review): pickle.load can execute arbitrary code; only point
    # --inputfile / --model at files produced by this project.
    # The handle is closed deterministically instead of being left to the
    # garbage collector as with pickle.load(open(...)).
    with open(args.inputfile, 'rb') as f:
        dsmoothedTraj = pickle.load(f)
    learnfilter.addsmooth(dsmoothedTraj)
    fdf, edf, m = common.loadall(args.pbname)

    if args.model == '' or args.coverage is None:
        # compute valid time intervals for each aircraft, without filtering
        dicot = {aircraft: [(np.min(smo.trajff.timeAtServer), np.max(smo.trajff.timeAtServer))]
                 for aircraft, smo in dsmoothedTraj.items()}
    else:
        # compute valid time intervals for each aircraft with filtering using the model
        with open(args.model, 'rb') as f:
            model = pickle.load(f)
        vdf = pd.read_csv("./Data/{0}_result/{0}_result.csv".format(args.pbname))
        tokeep = ceil(vdf.shape[0] * args.coverage)
        print("# of points to keep", tokeep)
        # Only aircraft with a non-empty filtered trajectory are considered.
        dicot = compute_dicot_from_model(
            edf,
            {k: smo for (k, smo) in dsmoothedTraj.items() if smo.trajff.shape[0] > 0},
            model, tokeep, args.min_continuous_to_keep)

    print("compute prediction")
    pred = build_predictionfile(edf, dsmoothedTraj, dicot)

    print("compute distance")
    d = common.haversine_distance(pred.loc[:, latn].values, pred.loc[:, lonn].values,
                                  pred.latitude.values, pred.longitude.values)
    print(d.shape[0], common.rmse(d), common.rmse90(d))

    if args.outputfile != '':
        print("writing prediction file")
        pred = pred.drop(columns=["longitude", "latitude"])
        df = merge_with_result(pred, args.pbname)
        # "longitude==longitude" is False for NaN, so this counts filled rows.
        print("actual coverage", df.query("longitude==longitude").shape[0] / df.shape[0])
        df.to_csv(args.outputfile, float_format="%.12f", index=False)
def run_matrix_completion():
    """Fit a 12-component mixture (seed 1) with EM and print the completion RMSE.

    Reads the module-level ``X``; the reference matrix comes from
    ``netflix_complete.txt``.
    """
    n_components, rng_seed = 12, 1
    start_mixture, start_post = common.init(X, n_components, rng_seed)
    fitted_mixture, _, _ = em.run(X, start_mixture, start_post)
    completed = em.fill_matrix(X, fitted_mixture)
    reference = np.loadtxt('netflix_complete.txt')
    print("RMSE:", common.rmse(reference, completed))
def test_k12():
    """Run EM with 12 components for seeds 0-4 and score the best run.

    The run whose last returned element is largest is used to fill the
    module-level ``X``.

    Returns:
        ``common.rmse`` between the filled matrix and ``X_gold``.
    """
    runs = []
    for seed in range(5):
        print(seed)
        start_mixture, start_post = common.init(X, 12, seed)
        runs.append(em.run(X, start_mixture, start_post))

    best_mixture, _, _ = max(runs, key=lambda run: run[-1])
    filled = em.fill_matrix(X, best_mixture)
    return common.rmse(filled, X_gold)
def main(): ## Parameters ## data = cmn.dataset(xSet = "dist_days_time_dayOfWeek") data.load(Nparts = 10)#N_points = 1000) futureMask = makeFutureMask(data.tScope, data.tTest) rmse = np.zeros(shape=(len(vals)), dtype=np.float) timer = cmn.timer() yHat = regress(data.xScope, data.yScope, data.xTest, 1, futureMask) print "onTime\tRuntime = {:.2f}".format(timer.dur()) rmse[i] = cmn.rmse(data.yTest, yHat) print "\tRMSE = {:.2f}".format(rmse[i]) data.saveYHat(yHat, model = "onTime".format(k)) # Visualize and save the images for the model data.visualize(yHat, "onTime".format(k))
def main(): ## Parameters ## data = cmn.dataset(xSet = "dist_days_time_dayOfWeek") data.load()#N_points = 1000) k = 10 futureMask = makeFutureMask(data.tScope, data.tTest) timer = cmn.timer() yHat = manyNearestNeighborsVector(data.xScope, data.yScope, data.xTest, k, futureMask) print "Vectorized Runtime = {:.2f}".format(timer.dur()) print "RMSE = {:.2f}".format(cmn.rmse(data.yTest, yHat)) data.saveYHat(yHat, model = "{}NN".format(k)) #timer.reset() #yHat2 = manyNearestNeighbors(data.xTrain, data.yTrain, data.xTest, k) #print "Iterative Runtime = {:.2f}".format(timer.dur()) #print "RMSE = {}".format(cmn.rmse(data.yTest, yHat)) # Visualize and save the images for the model data.visualize(yHat, "{}NN".format(k))
# gaussian, post, new_ll = kmeans.run(X, gaussian, post) # common.plot(X, gaussian, post, "K-means: number of classes{}, random seed {}".format(k, i)) # # for k in range(1, 5, 1): # for i in range(1): # gaussian, post = common.init(X, k, seed=i) # gaussian, post, new_ll = naive_em.run(X, gaussian, post) # common.plot(X, gaussian, post, "EM: number of classes{}, random seed {}".format(k, i)) X = np.loadtxt("netflix_incomplete.txt") X_gold = np.loadtxt('netflix_complete.txt') # for k in [1, 12]: # for i in range(5): # gaussian, post = common.init(X, k, seed=i) # gaussian, post, new_ll = em.run(X, gaussian, post) # print("EM: number of classes {}, random seed {}:".format(k, i)) # print(new_ll) gaussian, post = common.init(X, 12, seed=1) gaussian, post, new_ll = em.run(X, gaussian, post) X_pred = em.fill_matrix(X, gaussian) print(common.rmse(X_gold, X_pred)) # for k in range(1, 5, 1): # for i in range(5): # gaussian, post = common.init(X, k, seed=i) # gaussian, post, new_ll = naive_em.run(X, gaussian, post) # print("BIC = {} for K = {} and seed = {}".format(common.bic(X, gaussian, new_ll), k, i)) # #
def objective_fun(weights):
    """Return the RMSE of the weighted sum of ``predictions`` vs ``common.y_all``.

    ``predictions`` is taken from the enclosing scope; weights and
    predictions are paired positionally.
    """
    blended = sum(w * p for w, p in zip(weights, predictions))
    return common.rmse(common.y_all, blended)
def objective_fun(weights):
    """Return the RMSE of the weighted sum of ``predictions`` vs ``common.y_all``.

    ``predictions`` is taken from the enclosing scope; weights and
    predictions are paired positionally.
    """
    blended = sum(w * p for w, p in zip(weights, predictions))
    return common.rmse(common.y_all, blended)
def main(): ## Parameters ## xSet = "dist_days_time_dayOfWeek" data = cmn.dataset(xSet = xSet) data.load(Nparts = 10, N_points = 4000) futureMask = makeFutureMask(data.tScope, data.tTest, futureTime = -30 * 60) print "Model\t{}\t{}\t{}".format("Param", "RunTime", "RMSE") # Predict on time timer = cmn.timer() yHat = onTime.regress(data.xScope, data.yScope, data.xTest, 1, futureMask) time_onTime = timer.dur() rmse_onTime = cmn.rmse(data.yTest, yHat) print "onTime\t\t{:.2f}\t{:.2f}".format(time_onTime, rmse_onTime) data.saveYHat(yHat, model = "onTime") # Visualize and save the images for the model data.visualize(yHat, "onTime") ## kNN # Try many values of k ks = np.ceil(2 ** (np.arange(15) / 1.5)) rmse_kNN = np.zeros(shape=(len(ks)), dtype=np.float) time_kNN = np.zeros(shape=(len(ks)), dtype=np.float) for i in range(len(ks)): k = ks[i] timer = cmn.timer() yHat = kNN.regress(data.xScope, data.yScope, data.xTest, k, futureMask) time_kNN[i] = timer.dur() rmse_kNN[i] = cmn.rmse(data.yTest, yHat) print "kNN\t{: 5.0f}\t{:.2f}\t{:.2f}".format(k, time_kNN[i], rmse_kNN[i]) data.saveYHat(yHat, model = "{}NN".format(k)) # Visualize and save the images for the model data.visualize(yHat, "{}NN".format(k)) # Plot the historical RMSE clf() plot(ks, rmse_kNN) xlabel("Number of nearest points, k in kNN") ylabel("Root Mean Squared Error (seconds)") title("kNN Model, RMSE for different ks") savefig("{}/{}_{}_k-rmse.png".format(data.figPath, data.serviceName, data.routeName)) ## Kernel # Try many values of k rhos = np.ceil(10 ** (np.arange(12) / 2.0)) rmse_kernel = np.zeros(shape=(len(rhos)), dtype=np.float) time_kernel = np.zeros(shape=(len(rhos)), dtype=np.float) for i in range(len(rhos)): rho = rhos[i] timer = cmn.timer() yHat = kernel.regress(data.xScope, data.yScope, data.xTest, rho, futureMask) time_kernel[i] = timer.dur() rmse_kernel[i] = cmn.rmse(data.yTest, yHat) print "kernel\t{: 5.0f}\t{:.2f}\t{:.2f}".format(rho, time_kernel[i], rmse_kernel[i]) data.saveYHat(yHat, model = 
"kernel_{}rho".format(rho)) # Visualize and save the images for the model data.visualize(yHat, "kernel_{}rho".format(rho)) # Plot the historical RMSE clf() plot(rhos, rmse_kernel) xlabel("rho Paramater") ylabel("Root Mean Squared Error (seconds)") title("Kernel Regression Model, RMSE for different rhos") savefig("{}/{}_{}_kernel_rho-rmse.png".format(data.figPath, data.serviceName, data.routeName))
def main():
    """Filter and smooth each aircraft's predicted trajectory.

    For every aircraft with enough points, points with a large error are
    discarded, the longest run satisfying the speed limit is kept, and a
    ``SmoothedTraj`` is fitted. When a problem name is given, the smoothed
    trajectory is compared with the reference and error statistics are
    printed. The resulting ``{aircraft: SmoothedTraj}`` dict is optionally
    pickled to ``args.outputfile``.
    """
    parser = ArgumentParser()
    add_args(parser)
    args = parser.parse_args()

    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # produced by this project. The handle is now closed deterministically
    # instead of being left open by pickle.load(open(...)).
    with open(args.inputfile, 'rb') as f:
        ds = pickle.load(f)
    aircrafts = ds.aircraft.unique()

    # edf / vdftrue are only defined (and only used) when a problem name is given.
    if args.pbname != '':
        fdf, edf, m = common.loadall(args.pbname)
        vdftrue = None if args.ts else pd.read_csv(
            "./Data/{}_result/{}_result.csv".format(args.pbname, args.pbname))

    d = {}
    ld = []
    for aircraft in aircrafts:
        print("aircraft", aircraft)
        traj = ds.query("aircraft==" + str(aircraft)).reset_index(drop=True)
        if traj.shape[0] > MIN_REQUIRED_NB:
            error = traj.error.values
            # discard points with a large multilateration error
            filterror = filter_error(error, args.thr_error)
            trajf = traj.loc[filterror]
            # keep the longest sequence satisfying speed constraints
            if trajf.shape[0] > MIN_REQUIRED_NB:
                filtspeed = filter_speedlimit(trajf.nnpredlatitude.values,
                                              trajf.nnpredlongitude.values,
                                              trajf.timeAtServer.values, 0.,
                                              args.speed_limit)
                trajff = trajf.loc[filtspeed]
                # Distance between recorded and raw predicted positions
                # (computed but not reported; kept as in the original).
                drawtrue = common.haversine_distance(
                    trajff.latitude, trajff.longitude,
                    trajff.nnpredlatitude.values,
                    trajff.nnpredlongitude.values)
                smoothedtraj = SmoothedTraj(trajff, args.smooth)
                t = trajff.timeAtServer.values
                slat, slon = smoothedtraj.predict(t)
                # Distance between smoothed and raw predicted positions
                # (also unused further down).
                dsmoothraw = common.haversine_distance(
                    slat, slon, trajff.nnpredlatitude.values,
                    trajff.nnpredlongitude.values)
                tmin = np.min(t)
                tmax = np.max(t)
                if args.pbname != '':
                    # Reference rows for this aircraft inside the smoothed time span
                    traje = edf.query("aircraft==" + str(aircraft)).query(
                        str(tmin) + "<=timeAtServer").query(
                            "timeAtServer<=" + str(tmax)).reset_index(drop=True)
                    dsmoothtrue = comparewithtrue(traje, smoothedtraj, vdftrue)  #[300:-300]
                    ld.append(dsmoothtrue)
                    print(common.rmse(ld[-1]), common.rmse90(ld[-1]), common.rmse50(ld[-1]))
                print(traj.shape, trajff.shape)
                d[aircraft] = smoothedtraj

    if len(ld) > 0:
        dsmoothtrue = np.concatenate(ld)
        print(dsmoothtrue.shape[0], common.rmse(dsmoothtrue), common.rmse90(dsmoothtrue))
        # Same statistics restricted to the smallest 60% of the errors
        e = np.sort(dsmoothtrue, axis=None)[:int(dsmoothtrue.shape[0] * 0.6) + 1]
        print(e.shape[0], common.rmse(e), common.rmse90(e))

    if args.outputfile != '':
        # save dict[aircraft]=SmoothedTraj
        with open(args.outputfile, 'wb') as f:
            pickle.dump(d, f)
# NOTE(review): X, K and seeds are defined above this excerpt.
# Run EM from every seed and print K, the seed and the third value returned
# by em.run for each run.
for seed in seeds:
    mixture, post = common.init(X, K=K, seed=seed)
    mixture, post, log_likelihood = em.run(X, mixture, post)
    print(K, seed, log_likelihood)

# =============================================================================
# Completing missing entries
# =============================================================================
# Small test matrices: fit K=4 from seed 0, fill X and compare with the
# complete version.
X = np.loadtxt("test_incomplete.txt")
X_gold = np.loadtxt("test_complete.txt")
mixture, post = common.init(X, K=4, seed=0)
mixture, post, log_likelihood = em.run(X, mixture, post)
X_pred = em.fill_matrix(X, mixture)
RMSE = common.rmse(X_gold, X_pred)
print(X_pred, RMSE)

# =============================================================================
# Comparing with gold targets
# =============================================================================
# Same procedure on the netflix matrices with K=12 and seed 1.
X = np.loadtxt("netflix_incomplete.txt")
X_gold = np.loadtxt('netflix_complete.txt')
mixture, post = common.init(X, K=12, seed=1)
mixture, post, log_likelihood = em.run(X, mixture, post)
X_pred = em.fill_matrix(X, mixture)
RMSE = common.rmse(X_gold, X_pred)
print(RMSE)
# NOTE(review): the next statement is the tail of a function whose header
# precedes this excerpt.
    return np.array(new_vs)


def loss(X, us, vs, l):
    """Return the regularized squared-error objective of the factorization.

    Entries of ``X`` equal to 0 are treated as missing: the reconstruction
    ``us @ vs.T`` is zeroed there, so those positions contribute nothing.

    Args:
        X: Ratings matrix, with 0 marking a missing entry.
        us: Left factor.
        vs: Right factor (used as ``vs.T``).
        l: Weight of the squared-norm penalty.

    Returns:
        Half the sum of squared residuals plus ``l / 2`` times the sum of
        the squared entries of ``us`` and ``vs``.
    """
    not_missing = X != 0
    se = np.sum((X - (us @ vs.T) * not_missing)**2)/2
    regularization = np.sum(us ** 2) + np.sum(vs**2)
    regularization = regularization*l/2
    return se + regularization


if __name__ == "__main__":
    # NOTE(review): X and X_gold are expected to be defined at module level
    # above this excerpt.
    test_X = X
    num_u,num_i = test_X.shape
    is_missing = test_X == 0
    # us, s, vs = svds(test_X)
    # us, vs = altering_minimize(test_X, us, vs, 1)
    rmse = []
    # Try factor ranks 1..10, each from random integer factors in [1, 5].
    for k in [1,2,3,4,5,6,7,8,9,10]:
        us = np.random.randint(1, 6, (num_u, k)).astype('float')
        vs = np.random.randint(1, 6, (num_i, k)).astype('float')
        for l in [0]:
            us, vs = altering_minimize(test_X, us, vs, l)
            x_pre_raw = (us @ vs.T)
            # Keep observed entries and use the reconstruction only where
            # X was missing.
            x_pred = x_pre_raw * is_missing + (test_X *(~is_missing))
            rmse.append(common.rmse(x_pred,X_gold))
    print(rmse)
em_mix, em_post, em_ll = naive_em.run(X, init_mix, init_post) if k_cost < k_best_cost: k_best_mix, k_best_post, k_best_cost = k_mix, k_post, k_cost if em_ll > em_best_ll: em_best_mix, em_best_post, em_best_ll = em_mix, em_post, em_ll BICs[i] = common.bic(X, em_best_mix, em_best_ll) common.plot(X, k_best_mix, k_best_post, "K-means K={}".format(K)) common.plot(X, em_best_mix, em_best_post, "EM K={}".format(K)) print("BICs: ", BICs) print("Best BIC: ", np.max(BICs)) print("Best K: ", Ks[np.argmax(BICs)]) X = np.loadtxt("netflix_incomplete.txt") K = 12 seeds = [0, 1, 2, 3, 4] em_best_mix, em_best_post, em_best_ll = None, None, -np.inf for seed in seeds: init_mix, init_post = common.init(X, K, seed) em_mix, em_post, em_ll = em.run(X, init_mix, init_post) if em_ll > em_best_ll: em_best_mix, em_best_post, em_best_ll = em_mix, em_post, em_ll print("K = {}, LL = {}".format(K, em_best_ll)) X_fill_pred = em.fill_matrix(X, em_best_mix) X_fill = np.load("netflix_complete") print("X_filled Error:", common.rmse(X_fill_pred, X_fill))
# Posterior probs. for best seeds posts = [0, 0, 0, 0, 0] # RMS Error for clusters rmse = [0., 0.] start_time = time.perf_counter() for k in range(len(K)): for i in range(5): # Run EM mixtures[i], posts[i], log_lh[i] = \ em.run(X, *common.init(X, K[k], i)) # Print lowest cost print("=============== Clusters:", K[k], "======================") print("Highest log likelihood using EM is:", np.max(log_lh)) # # Save best seed for plotting best_seed[k] = np.argmax(log_lh) # # # Use the best mixture to fill prediction matrix X_pred = em.fill_matrix(X, mixtures[best_seed[k]]) rmse[k] = common.rmse(X_gold, X_pred) print("===================================================") print("RMS Error for K = 12 is: {:.4f}".format(rmse[1])) end_time = time.perf_counter() print("Time taken for this run: {:.4f} seconds".format(end_time - start_time))
# NOTE(review): naive_em_estimate, toy_X and test_em_seeds are defined above
# this excerpt.
print("naive EM log likelihood : " + str(naive_em_estimate))

print("############## Some Tests ######################")
# Fit naive EM on the toy data for K = 1..4, always from seed 0.
initialMixture, initialPost = common.init(toy_X, 1, 0)
mixtureEM1, postEM1, ll1 = naive_em.run(toy_X, initialMixture, initialPost)

initialMixture, initialPost = common.init(toy_X, 2, 0)
mixtureEM2, postEM2, ll2 = naive_em.run(toy_X, initialMixture, initialPost)

initialMixture, initialPost = common.init(toy_X, 3, 0)
mixtureEM3, postEM3, ll3 = naive_em.run(toy_X, initialMixture, initialPost)

initialMixture, initialPost = common.init(toy_X, 4, 0)
mixtureEM4, postEM4, ll4 = naive_em.run(toy_X, initialMixture, initialPost)

# BIC of each fitted mixture
print("BIC K1 : " + str(common.bic(toy_X, mixtureEM1, ll1)))
print("BIC K2 : " + str(common.bic(toy_X, mixtureEM2, ll2)))
print("BIC K3 : " + str(common.bic(toy_X, mixtureEM3, ll3)))
print("BIC K4 : " + str(common.bic(toy_X, mixtureEM4, ll4)))

X_netflix = np.loadtxt("netflix_incomplete.txt")

test_em_seeds(X_netflix, 1)
#test_em_seeds(X_netflix, 12)

# Matrix completion with K = 12, seed 1, scored against the complete matrix.
X_gold = np.loadtxt('netflix_complete.txt')
mixture4, post4 = common.init(X_netflix, 12, 1)
mixture, post, cost4 = em.run(X_netflix, mixture4, post4)
X_pred = em.fill_matrix(X_netflix, mixture)
rmse_result = common.rmse(X_gold, X_pred)
print("RMSE between prediction and GOLD is : " + str(rmse_result))
def run_matrix_completion():
    """Fit a 12-component mixture (seed 1) with EM and print the completion RMSE.

    Reads the module-level ``X``; the reference matrix comes from
    ``netflix_complete.txt``.
    """
    start_mixture, start_post = common.init(X, 12, 1)
    fitted_mixture, _, _ = em.run(X, start_mixture, start_post)
    completed = em.fill_matrix(X, fitted_mixture)
    reference = np.loadtxt('netflix_complete.txt')
    print("root mean squared error:", common.rmse(reference, completed))
def main():
    """Learn per-feature distance weights for a kNN predictor and log the RMSE.

    A weight vector is refined with one update per validation point over
    ``N_passes`` shuffled passes of the validation set. Before the first
    pass and after every pass, all test points are predicted with the
    current weights and one line (eta, pass number, weights, test RMSE) is
    printed and appended to a results file.
    """
    ## Parameters ##
    xSet = "allfeats_normalized" # or manyfeats...
    k = 100          # number of neighbours averaged for a prediction
    eta = 0.00001    # step size of the weight update
    N_passes = 50    # passes over the validation set

    # Load the Data
    data = cmn.dataset(xSet = xSet)
    data.load(Nparts = 10, validation = True)#, N_points = 4000)

    # Get xs and sizes
    X = data.xScope
    Xv = data.xVal
    Xt = data.xTest
    Y = data.yScope
    Yv = data.yVal
    Yt = data.yTest
    T = data.tScope
    Tv = data.tVal
    Tt = data.tTest
    (N_train, N_feat) = data.xScope.shape
    N_val = data.xVal.shape[0]
    N_test = data.xTest.shape[0]

    # Start a file to record
    filename = "{}/{}_{}_weightvalidation_allfeats_Ntrain{}.txt".format(data.resPath, data.serviceName, data.routeName, N_train)
    f = open(filename, 'w')

    # Precompute the future mask
    # NOTE(review): futureMask is not used below; the past-only filtering is
    # redone per point through Tpast.
    futureMask = makeFutureMask(T, Tt, futureTime = -30 * 60)

    # Compute the distance matrix (since they all use the same)
    #traintile = np.tile(np.reshape(data.xScope, (N_train, N_feat, 1)), (1, 1, N_test));
    #testtile = np.tile(np.transpose(np.reshape(data.xTest, (N_test, N_feat, 1)), (2, 1, 0)), (N_train, 1, 1));
    #difftile = ((traintile - testtile) ** 2)

    # One row of weights per update step
    N_iterations = N_val * N_passes
    ws = np.zeros(shape=(N_iterations, N_feat))
    #ws[0, 0] = 1
    # NOTE(review): this array is overwritten by a scalar below.
    rmse = np.zeros(shape=(N_iterations, 1))

    # Preliminary Test
    # Score the initial weights (row 0 of ws) on the test set.
    Ypreds = np.zeros((N_test, 1))
    w = ws[0, :].copy()
    w.shape = (1, N_feat)
    for i in range(N_test):
        i_point = i % N_test
        Xp = Xt[i_point, :].copy()
        Xp.shape = (1, N_feat)
        Yp = Yt[i_point]
        Tp = Tt[i_point]

        # Limit training data to the past
        # (points stamped more than 30 * 60 before the query point)
        Tpast = T < (Tp - (30 * 60))
        Xpast = X[Tpast, :]
        Ypast = Y[Tpast]
        N_past = Xpast.shape[0]

        # Find the distance
        # (per-feature differences scaled by w, then squared and summed;
        # NOTE(review): `sum(..., axis=1)` requires a numpy-style sum to be
        # in scope, presumably from a star import)
        diff = Xpast - np.tile(Xp, (N_past, 1))
        diff *= np.tile(w, (N_past, 1))
        dist = sum(diff ** 2, axis = 1)

        # Predict Y using kNN
        # (falls back to 0 when fewer than k past points exist)
        if(len(dist) < k):
            Ypred = 0
        else:
            Ypred = np.mean(Ypast[dist.argsort()[:k]])
        Ypreds[i] = Ypred
        Yerror = Yp - Ypred
        #str = "t = {}\tx={:.2f}, {:.2f}, {:.2f}, {:.2f}\ty={:.2f}\typred={:.2f}\n".format(i, Xp[0, 0], Xp[0, 1], Xp[0, 2], Xp[0, 3], Yp, Ypred)
        #print str[:(len(str) - 1)]
        #f.write(str)
    rmse = cmn.rmse(Yt, Ypreds)
    # NOTE(review): the local name `str` shadows the builtin for the rest of
    # this function.
    str = "eta {}\tpass {}\tw={}\trmse = {:.2f}\n".format(eta, 0, w[0, :], rmse)
    #str = "eta {}\tpass {}\tw={:.2f}, {:.2f}, {:.2f}, {:.2f}\trmse = {:.2f}\n\n".format(eta, 0, w[0, 0], w[0, 1], w[0, 2], w[0, 3], rmse)
    print str[:(len(str) - 1)]
    f.write(str)

    for i_pass in range(N_passes):
        # Improve the weights
        # (visit the validation points in a fresh random order each pass)
        randOrder = np.random.permutation(range(N_val))
        for i in range(N_val * i_pass, N_val * (i_pass + 1)):
            w = ws[i, :].copy()
            w.shape = (1, N_feat)
            i_point = randOrder[i % N_val]
            Xp = Xv[i_point, :].copy()
            Xp.shape = (1, N_feat)
            Yp = Yv[i_point]
            Tp = Tv[i_point]

            # Limit training data to the past
            Tpast = T < (Tp - (30 * 60))
            Xpast = X[Tpast, :]
            Ypast = Y[Tpast]
            N_past = Xpast.shape[0]

            # Find the distance
            diff = Xpast - np.tile(Xp, (N_past, 1))
            diff *= np.tile(w, (N_past, 1))
            dist = sum(diff ** 2, axis = 1)

            # Predict Y using kNN
            if(len(dist) < k):
                Ypred = 0
            else:
                Ypred = np.mean(Ypast[dist.argsort()[:k]])
            Yerror = Yp - Ypred
            #str = "i = {}\tw={:.2f}, {:.2f}, {:.2f}, {:.2f}\tx={:.2f}, {:.2f}, {:.2f}, {:.2f}\ty={:.2f}\typred={:.2f}\n".format(i, w[0, 0], w[0, 1], w[0, 2], w[0, 3], Xp[0, 0], Xp[0, 1], Xp[0, 2], Xp[0, 3], Yp, Ypred)
            #print str[:(len(str) - 1)]
            #f.write(str)

            # Recalculate weights
            # (the updated weights are stored in the next row of ws; the
            # very last step has no next row and is dropped)
            w_delta = eta * (Xp * Yerror + w)
            if i < (N_iterations - 1):
                ws[i + 1] = w - w_delta
        #str = "\nFinished validating\tw={:.2f}, {:.2f}, {:.2f}, {:.2f}\n\n".format(w[0, 0], w[0, 1], w[0, 2], w[0, 3])
        #print str[:(len(str) - 1)]
        #f.write(str)

        # Test
        # NOTE(review): `w` here is the value read at the start of the last
        # validation step of this pass, i.e. before that step's update.
        Ypreds = np.zeros((N_test, 1))
        for i in range(N_test):
            #w.shape = (1, N_feat)
            i_point = i % N_test
            Xp = Xt[i_point, :].copy()
            Xp.shape = (1, N_feat)
            Yp = Yt[i_point]
            Tp = Tt[i_point]

            # Limit training data to the past
            Tpast = T < (Tp - (30 * 60))
            Xpast = X[Tpast, :]
            Ypast = Y[Tpast]
            N_past = Xpast.shape[0]

            # Find the distance
            diff = Xpast - np.tile(Xp, (N_past, 1))
            diff *= np.tile(w, (N_past, 1))
            dist = sum(diff ** 2, axis = 1)

            # Predict Y using kNN
            if(len(dist) < k):
                Ypred = 0
            else:
                Ypred = np.mean(Ypast[dist.argsort()[:k]])
            Ypreds[i] = Ypred
            Yerror = Yp - Ypred
            #str = "t = {}\tx={:.2f}, {:.2f}, {:.2f}, {:.2f}\ty={:.2f}\typred={:.2f}\n".format(i, Xp[0, 0], Xp[0, 1], Xp[0, 2], Xp[0, 3], Yp, Ypred)
            #print str[:(len(str) - 1)]
            #f.write(str)
        rmse = cmn.rmse(Yt, Ypreds)
        str = "eta {}\tpass {}\tw={}\trmse = {:.2f}\n".format(eta, i_pass + 1, w[0, :], rmse)
        #wstr = "{:.2f} ".format(w[0, :])
        #str = "eta {}\tpass {}\tw={}\trmse = {:.2f}\n\n".format(eta, i_pass + 1, wstr, rmse)
        print str[:(len(str) - 1)]
        f.write(str)
    f.close()
# NOTE(review): X, X_gold and mixture are defined above this excerpt.
# Show the initial mixture parameters.
print('Mu:\n' + str(mixture.mu))
print('Var: ' + str(mixture.var))
print('P: ' + str(mixture.p))
print()

# One E-step from the initial mixture.
print("After first E-step:")
post, ll = em.estep(X, mixture)
print('post:\n' + str(post))
print('LL:' + str(ll))
print()

# One M-step using the posteriors just computed.
print("After first M-step:")
mu, var, p = em.mstep(X, post, mixture)
print('Mu:\n' + str(mu))
print('Var: ' + str(var))
print('P: ' + str(p))
print()

# Full run; note that it starts from the original `mixture`, not from the
# parameters of the single M-step above.
print("After a run")
(mu, var, p), post, ll = em.run(X, mixture, post)
print('Mu:\n' + str(mu))
print('Var: ' + str(var))
print('P: ' + str(p))
print('post:\n' + str(post))
print('LL: ' + str(ll))

# Fill X with the fitted mixture and compare with the reference matrix.
X_pred = em.fill_matrix(X, common.GaussianMixture(mu, var, p))
error = common.rmse(X_gold, X_pred)
print("X_gold:\n" + str(X_gold))
print("X_pred:\n" + str(X_pred))
print("RMSE: " + str(error))
import common

# NOTE(review): np and em are imported above this excerpt.
# Small test matrices: incomplete input and its complete reference.
X = np.loadtxt("test_incomplete.txt")
X_gold = np.loadtxt("test_complete.txt")

K = 4
n, d = X.shape
seed = 0

# TODO: Your code here
# Run EM from the common.init start, fill X and score the result.
mix_conv, post_conv, log_lh_conv = em.run(X, *common.init(X, K, seed))
X_predict = em.fill_matrix(X, mix_conv)
rmse = common.rmse(X_gold, X_predict)

#%% Begin: Comparison of EM for matrix completion with K = 1 and 12
import time

X = np.loadtxt("netflix_incomplete.txt")
X_gold = np.loadtxt("netflix_complete.txt")

K = [1, 12]  # Clusters to try

log_lh = [0, 0, 0, 0, 0]  # Log likelihoods for different seeds

# Best seed for cluster based on highest log likelihoods
best_seed = [0, 0]

# Mixtures for best seeds
import numpy as np
import em
import common

# Incomplete ratings matrix and the complete reference it is scored against.
X = np.loadtxt("netflix_incomplete.txt")
X_gold = np.loadtxt("netflix_complete.txt")

K = 12

# Per-seed results, filled in seed order (0..4).
mixtures, posts, log_lh = [], [], []

# Test all seeds
for i in range(5):
    seed_mixture, seed_post, seed_ll = em.run(X, *common.init(X, K, i))
    mixtures.append(seed_mixture)
    posts.append(seed_post)
    log_lh.append(seed_ll)

# Fill the matrix with the mixture of the highest-likelihood seed.
best_seed = np.argmax(log_lh)
Y = em.fill_matrix(X, mixtures[best_seed])
rmse = common.rmse(X_gold, Y)
print("RMSE for K = 12: {:.4f}".format(rmse))
def main(): ## Parameters ## #data = cmn.dataset(xSet = "traj") #data.load()#N_points = 1000) #futureMask = makeFutureMask(data.tScope, data.tTest) dataPath = "/projects/onebusaway/BakerNiedMLProject/data/routefeatures" resPath = "/projects/onebusaway/BakerNiedMLProject/data/modelPredictions" figPath = "/projects/onebusaway/BakerNiedMLProject/figures/predictions" serviceName = "intercitytransit" routeName = "route13" xSet = "traj" ySet = "dev" x = np.loadtxt("{}/{}_{}_{}.txt".format(dataPath, serviceName, routeName, xSet), dtype=np.float) # Try many values of k vals = np.ceil(2 ** (np.arange(15) / 1.5)) rmse = np.zeros(shape=(len(vals)), dtype=np.float) minK = 0 minRMSE = 0 sel = np.random.permutation(range(len(x))); split = len(x)/4; xTrain = x[sel[:split*2]]; xVal = x[sel[split*2:3*split]]; xTest = x[sel[3*split:]]; yTest = np.zeros(len(xVal)); yHat = np.zeros(len(xVal)); data_norm = np.empty(shape = x.shape) theMean = x[:,:].mean() theStdDev = x[:,:].std() data_norm = (x - theMean)/ theStdDev xTrainNorm = data_norm[sel[:split*2]]; xValNorm = data_norm[sel[split*2:3*split]]; xTestNorm = data_norm[sel[3*split:]]; for i in range(len(vals)): k = vals[i] model = "{}NN".format(k); timer = cmn.timer() print xTrain.shape; print xTest.shape; print xVal.shape; for j in range(len(xVal)): v = len(xVal[0])-15 t = np.random.randint(10,v); yTest[j] = xVal[j][t+10]; #print xTrain[:,:t].shape; #print xTrain[:,t+10].shape; #print xVal[j,:t].shape; #print t; yHat[i] = manyNearestNeighborsVector(xTrainNorm[:,:t], xTrain[:,t+10], xValNorm[j,:t].reshape(1,t), k, weights=np.ones(t)) print "k = {}\tRuntime = {:.2f}".format(k, timer.dur()) rmse[i] = cmn.rmse(yTest, yHat) if i == 0 or rmse[i]<minRMSE: minRMSE = rmse[i] minK = vals[i] print "\tRMSE = {:.2f}".format(rmse[i]) np.savetxt("{}/{}_{}_{}_{}_val.txt".format(resPath, serviceName, routeName, model, xSet), cmn.cmb(xVal, yTest, yHat)) k = minK model = "{}NN".format(k); yTest = np.zeros(len(xTest)); yHat = np.zeros(len(xTest)); for i 
in range(len(xTest)): v = len(xVal[0])-15 t = np.random.randint(10,v); yTest[i] = xTest[i][t+10]; yHat[i] = manyNearestNeighborsVector(xTrain[:,:t], xTrain[:,t+10], xTest[i,:t].reshape(1,t), k, weights=np.ones(t)) # Visualize and save the images for the model #data.visualize(yHat, "{}NN".format(k)) np.savetxt("{}/{}_{}_{}_{}_test.txt".format(resPath, serviceName, routeName, model, xSet), cmn.cmb(xTest, yTest, yHat)) # Plot the historical RMSE clf() plot(vals, rmse) xlabel("Number of nearest points, k in kNN") ylabel("Root Mean Squared Error (seconds)") title("kNN Model, RMSE for different ks") savefig("{}/{}_{}_k-rmse.png".format(figPath, serviceName, routeName))
def main(): ## Parameters ## xSet = "dist_days_time_dayOfWeek_normalized" # Load the Data data = cmn.dataset(xSet = xSet) data.load(Nparts = 10)#, N_points = 12000) # Set X, Y, xTest and get sizes Y = data.yScope; (N_train, N_feat) = data.xScope.shape N_test = data.xTest.shape[0] # Precompute the future mask futureMask = makeFutureMask(data.tScope, data.tTest, futureTime = -30 * 60) # Compute the distance matrix (since they all use the same) weights = np.array([3, 0.5, 2, 1]) timer = cmn.timer() traintile = np.tile(np.reshape(data.xScope, (N_train, N_feat, 1)), (1, 1, N_test)); testtile = np.tile(np.transpose(np.reshape(data.xTest, (N_test, N_feat, 1)), (2, 1, 0)), (N_train, 1, 1)); weightstile = np.tile(weights.reshape(1, N_feat, 1), (N_train, 1, N_test)) print "Dist runtime: {:.2f}".format(timer.dur()) dist = np.sum(((traintile - testtile) ** 2) * weightstile, axis=1) ## Try out some models with rhos and ks filename = "{}/{}_{}_tests_Ntrain{}.txt".format(data.resPath, data.serviceName, data.routeName, N_train) f = open(filename, 'w') str = "Model\t{}\t{}\t{}\n".format("Param", "RunTime", "RMSE") print str[:len(str)-1] f.write(str) # Make a list of parameters to test rhos = 2 ** (np.arange(-10, 10) / 2.0) ks = np.ceil( 2 ** (np.arange(18) / 1.5)) # Allocate data containers N_onTime = 1 N_kernel = len(rhos) N_kNN = len(ks) N_attempts = N_onTime + N_kernel + N_kNN attempts = range(0, N_attempts) rmses = np.zeros(shape=(N_attempts), dtype=np.float) times = np.zeros(shape=(N_attempts), dtype=np.float) models = [""] * N_attempts for i in attempts: timer = cmn.timer() if(i < N_onTime): yHat = onTime(dist.shape[1]) models[i] = "onTime\t" elif(i < (N_kernel + N_onTime)): rho = rhos[i - N_onTime] yHat = kernel(dist, futureMask, Y, rho) models[i] = "kernel\t{: 3.2f}".format(rho) elif(i < (N_kNN + N_kernel + N_onTime)): k = ks[i - N_onTime - N_kernel] yHat = kNN(dist, futureMask, Y, k) models[i] = "kNN\t{: 5.0f}".format(k) times[i] = timer.dur() rmses[i] = 
cmn.rmse(data.yTest, yHat) str = "{}\t{:.2f}\t{:.2f}\n".format(models[i], times[i], rmses[i]) print str[:len(str)-1] f.write(str) f.close()
# NOTE(review): the statements down to the f-string print continue loops
# (over K and over seeds) whose headers precede this excerpt; their
# indentation is reconstructed.
        print('K=', k, 'seed=', seed, 'logloss=', LL)

    # Keep the seed with the largest value in `logloss` for this K.
    best_seed = np.argmax(logloss)
    logloss = logloss[best_seed]
    mixture = mixtures[best_seed]
    post = posts[best_seed]

    current_bic = common.bic(X, mixture, logloss)
    bic[j] = current_bic

    print(f'K={k}', f'Best seed={best_seed}', f'logloss={logloss}', f'BIC={current_bic}')

# Pick the K whose BIC is largest.
best_K_ix = np.argmax(bic)
best_K = K[best_K_ix]
best_bic = bic[best_K_ix]
print(f"Best K={best_K}", f"BIC={best_bic}")

# -----------------------------------
# EM Algorithm for Matrix Completion
# -----------------------------------

X_gold = np.loadtxt('netflix_complete.txt')
# NOTE(review): `mixture` is whatever the last loop iteration left behind,
# i.e. the best seed of the last K tried, not necessarily of best_K.
X_pred = em.fill_matrix(X, mixture)
rmse = common.rmse(X_gold, X_pred)
print(f"RMSE= {rmse}")
print(X)
print(X_pred)
import em
import common

# NOTE(review): np is expected to be imported above this excerpt.
# Small test matrices and the netflix matrices (incomplete input plus its
# complete reference each).
X = np.loadtxt("test_incomplete.txt")
X_gold = np.loadtxt("test_complete.txt")
X_gold_netflix = np.loadtxt("netflix_complete.txt")
X_netflix =np.loadtxt("netflix_incomplete.txt")

K = 12
n, d = X.shape
seed = [0,1,2,3,4]

# TODO: Your code here
# For every seed: run EM on the netflix data, fill the matrix and print the
# third value returned by em.run followed by the RMSE against the reference.
# NOTE(review): the extent of the loop body is reconstructed; confirm that
# the fill/score/print statements belong inside the loop.
for i in range(len(seed)):
    print(seed[i])
    init_model = common.init(X_netflix, K, seed[i])
    mixture, post, cost = em.run(X_netflix, init_model[0], init_model[1])
    X_pred = em.fill_matrix(X_netflix, mixture)
    rmse = common.rmse(X_gold_netflix,X_pred)
    print(cost)
    print(rmse)

# K= 4
# n,d = X.shape
# seed =0
# init_model = common.init(X, K, seed)
# mixture, post, cost = em.run(X, init_model[0], init_model[1])
# # print(mixture)
# X_pred = em.fill_matrix(X,mixture)
# print(X_pred)