def predict_by_user(): users = ur.readPanelData("/Users/zhanzhao/Dropbox (MIT)/TfL/Data/sampleData.csv") start = 11688 end = 12419 user_acc = [] user_freq = [] # dowR = np.zeros(7) # dowW = np.zeros(7) # hodR = np.zeros(24) # hodW = np.zeros(24) # weekR = np.zeros(105) # weekW = np.zeros(105) days = range(start, end, 7) total_R = 0 total_W = 0 # train_R = 0 # train_W = 0 for i, user in enumerate(users): user_R = 0 user_W = 0 sum_train = 0 count_train = 0 for t in days: # day1 = max(start, (t-7*training_period)) # day2 = t # day3 = t + 7 train, test = splitDataset(user.tripList, t) if len(train) > 0 and len(test) > 0: train_X, train_Y = data_prep(train) test_X, test_Y = data_prep(test) if len(np.unique(train_Y)) > 1: new_train_X, new_test_X = feature_extraction(train_X, test_X) # clf = nb.MultinomialNB() clf = lm.LogisticRegression() # clf = tree.DecisionTreeClassifier() # clf = svm.LinearSVC() clf.fit(new_train_X, train_Y) pred_Y = clf.predict(new_test_X) else: pred_Y = [train_Y[0]] * len(test_Y) r, w = predict_eval(test_Y, pred_Y) user_R += r user_W += w count_train += 1 sum_train += len(train_Y) # new_train_X, tX = feature_extraction(train_X, train_X) # if len(np.unique(train_Y)) > 1: # pred_train_Y = clf.predict(tX) # else: # pred_train_Y = [train_Y[0]] * len(train_Y) # if len(pred_train_Y) != len(train_Y): # print len(train), new_train_X.shape, len(pred_train_Y), len(train_Y) # tr, tw = predict_eval(train_Y, pred_train_Y) # train_R += tr # train_W += tw # predictability.append((user.id, len(train), r, w)) # dow_r, dow_w, hod_r, hod_w = predict_eval_by_time(test_X, test_Y, pred_Y) # dowR = dowR + dow_r # dowW = dowW + dow_w # hodR = hodR + hod_r # hodW = hodW + hod_w # weekR[(t-11688)/7] += r # weekW[(t-11688)/7] += w total_R += user_R total_W += user_W if user_R + user_W > 0: accuracy = user_R * 1.0 / (user_R + user_W) user_acc.append(accuracy) avg_train_size = sum_train * 1.0/count_train user_freq.append(avg_train_size) else: accuracy = -9999 # if train_R + 
train_W > 0: # train_acc = train_R*1.0/(train_R+train_W) # else: # train_acc = -9999 print i, (user_R+user_W), accuracy # bar_plot(hodR, hodW) # bar_plot(dowR, dowW) # bar_plot(weekR, weekW) # dist_plot(user_acc) scatter_plot(user_freq, user_acc) print np.corrcoef(user_freq, user_acc) print total_R * 1.0 / (total_R + total_W)
# NOTE(review): the lines below are the tail of a plotting function whose
# `def` header (presumably the plot4 called from __main__) lies outside this
# chunk -- left untouched; branch indentation reconstructed to match usage.
            ax[row, col].set_ylim([0, 150])
            ax[row, col].hist(pp_list[i], bins=range(30))
        else:
            ax[row, col].set_ylim([0, 100])
            # Fractional-valued series: 21 bin edges spanning [0, 1].
            ax[row, col].hist(pp_list[i], bins=[j/20.0 for j in range(21)])
        # Annotate each subplot title with the series median.
        median = ' (median = {0:.2f})'.format(np.median(pp_list[i]))
        ax[row, col].set_title(names[i] + median)
    plt.show()


def plot6(pp_list):
    """Plot six histograms in a 2x3 grid, one per trip category.

    pp_list -- sequence of six numeric sequences ordered to match `names`
    (In, Out, First In, First Out, Stop, Overall).  Values are binned into
    integer bins 0..29.
    """
    names = ['In', 'Out', 'First In', 'First Out', 'Stop', 'Overall']
    f, ax = plt.subplots(2, 3)
    for i in xrange(len(names)):
        # Fill the grid column-by-column: i=0,1 -> col 0; i=2,3 -> col 1; ...
        row = i % 2
        col = int(i/2)
        pp = pp_list[i]
        ax[row, col].hist(pp, bins=range(30))
        ax[row, col].set_title(names[i])
    plt.show()


if __name__ == '__main__':
    # Run relative to this script's directory so the ../Data paths resolve.
    curr_dir = os.path.dirname(os.path.realpath(__file__))
    os.chdir(curr_dir)
    V = load_vocabulary("../Data/all_stations.csv", n=4)
    users = ur.readPanelData("../Data/sampleData_2013.csv")
    prior = construct_priors(users, V)
    # Four-gram perplexity per user; plotted via plot4 (defined elsewhere).
    pp = user_fourgram_pp(users, V, prior)
    plot4(pp)
def predict_over_time(): users = ur.readPanelData("/Users/zhanzhao/Dropbox (MIT)/TfL/Data/sampleData.csv") start = 11688 end = 12419 # predictability = [] dowR = np.zeros(7) dowW = np.zeros(7) # hodR = np.zeros(24) # hodW = np.zeros(24) # weekR = np.zeros(105) # weekW = np.zeros(105) days = range(start, end, 7) total_R = 0 total_W = 0 # train_R = 0 # train_W = 0 for t in days: for user in users: # day1 = max(start, (t-7*training_period)) # day2 = t # day3 = t + 7 train, test = splitDataset(user.tripList, t) if len(train) > 0 and len(test) > 0: train_X, train_Y = data_prep(train) test_X, test_Y = data_prep(test) if len(np.unique(train_Y)) > 1: new_train_X, new_test_X = feature_extraction(train_X, test_X) # clf = nb.MultinomialNB() clf = lm.LogisticRegression() # clf = tree.DecisionTreeClassifier() # clf = svm.LinearSVC() clf.fit(new_train_X, train_Y) pred_Y = clf.predict(new_test_X) else: pred_Y = [train_Y[0]] * len(test_Y) r, w = predict_eval(test_Y, pred_Y) total_R += r total_W += w # new_train_X, tX = feature_extraction(train_X, train_X) # if len(np.unique(train_Y)) > 1: # pred_train_Y = clf.predict(tX) # else: # pred_train_Y = [train_Y[0]] * len(train_Y) # if len(pred_train_Y) != len(train_Y): # print len(train), new_train_X.shape, len(pred_train_Y), len(train_Y) # tr, tw = predict_eval(train_Y, pred_train_Y) # train_R += tr # train_W += tw # predictability.append((user.id, len(train), r, w)) dow_r, dow_w, hod_r, hod_w = predict_eval_by_time(test_X, test_Y, pred_Y) dowR = dowR + dow_r dowW = dowW + dow_w # hodR = hodR + hod_r # hodW = hodW + hod_w # weekR[(t-11688)/7] += r # weekW[(t-11688)/7] += w if total_R + total_W > 0: accuracy = total_R*1.0/(total_R+total_W) else: accuracy = -9999 # if train_R + train_W > 0: # train_acc = train_R*1.0/(train_R+train_W) # else: # train_acc = -9999 print (t-start)/7, (total_R+total_W), accuracy # bar_plot(hodR, hodW) # bar_plot(dowR, dowW) # bar_plot(weekR, weekW) for i in xrange(len(dowR)): print i, dowR[i], dowW[i]