def predict_by_user():
	users = ur.readPanelData("/Users/zhanzhao/Dropbox (MIT)/TfL/Data/sampleData.csv")
	start = 11688
	end = 12419
	user_acc = []
	user_freq = []
	# dowR = np.zeros(7)
	# dowW = np.zeros(7)
	# hodR = np.zeros(24)
	# hodW = np.zeros(24)
	# weekR = np.zeros(105)
	# weekW = np.zeros(105)
	days = range(start, end, 7)
	total_R = 0
	total_W = 0
	# train_R = 0
	# train_W = 0
	for i, user in enumerate(users):
		user_R = 0
		user_W = 0
		sum_train = 0
		count_train = 0
		for t in days:
			# day1 = max(start, (t-7*training_period))
			# day2 = t
			# day3 = t + 7
			train, test = splitDataset(user.tripList, t)
			if len(train) > 0 and len(test) > 0:
				train_X, train_Y = data_prep(train)
				test_X, test_Y = data_prep(test)
				if len(np.unique(train_Y)) > 1:
					new_train_X, new_test_X = feature_extraction(train_X, test_X)
					# clf = nb.MultinomialNB()
					clf = lm.LogisticRegression()
					# clf = tree.DecisionTreeClassifier()
					# clf = svm.LinearSVC()
					clf.fit(new_train_X, train_Y)
					pred_Y = clf.predict(new_test_X)
				else:
					pred_Y = [train_Y[0]] * len(test_Y)
				r, w = predict_eval(test_Y, pred_Y)
				user_R += r
				user_W += w

				count_train += 1
				sum_train += len(train_Y)

				# new_train_X, tX = feature_extraction(train_X, train_X)
				# if len(np.unique(train_Y)) > 1:
				# 	pred_train_Y = clf.predict(tX)
				# else:
				# 	pred_train_Y = [train_Y[0]] * len(train_Y)
				# if len(pred_train_Y) != len(train_Y):
				# 	print len(train), new_train_X.shape, len(pred_train_Y), len(train_Y)
				# tr, tw = predict_eval(train_Y, pred_train_Y)
				# train_R += tr
				# train_W += tw

				# predictability.append((user.id, len(train), r, w))

				# dow_r, dow_w, hod_r, hod_w = predict_eval_by_time(test_X, test_Y, pred_Y)
				# dowR = dowR + dow_r
				# dowW = dowW + dow_w
				# hodR = hodR + hod_r
				# hodW = hodW + hod_w

				# weekR[(t-11688)/7] += r
				# weekW[(t-11688)/7] += w
		total_R += user_R
		total_W += user_W
		if user_R + user_W > 0:
			accuracy = user_R * 1.0 / (user_R + user_W)
			user_acc.append(accuracy)
			avg_train_size = sum_train * 1.0/count_train
			user_freq.append(avg_train_size)
		else:
			accuracy = -9999
		# if train_R + train_W > 0:
		# 	train_acc = train_R*1.0/(train_R+train_W)
		# else:
		# 	train_acc = -9999
		print i, (user_R+user_W), accuracy
	# bar_plot(hodR, hodW)
	# bar_plot(dowR, dowW)
	# bar_plot(weekR, weekW)
	# dist_plot(user_acc)
	scatter_plot(user_freq, user_acc)
	print np.corrcoef(user_freq, user_acc)
	print total_R * 1.0 / (total_R + total_W)
            ax[row, col].set_ylim([0, 150])
            ax[row, col].hist(pp_list[i], bins=range(30))
        else:
            ax[row, col].set_ylim([0, 100])
            ax[row, col].hist(pp_list[i], bins=[j/20.0 for j in range(21)])
        median = ' (median = {0:.2f})'.format(np.median(pp_list[i]))
        ax[row, col].set_title(names[i] + median)
    plt.show()


def plot6(pp_list):
    """Draw six histograms (one per prediction category) in a 2x3 grid.

    pp_list is indexed in the same order as the category labels below;
    each entry is a sequence of values histogrammed over bins 0..29.
    """
    names = ['In', 'Out', 'First In', 'First Out', 'Stop', 'Overall']
    f, ax = plt.subplots(2, 3)
    for idx, label in enumerate(names):
        # Fill the grid column-major: 0->(0,0), 1->(1,0), 2->(0,1), ...
        row, col = idx % 2, idx // 2
        ax[row, col].hist(pp_list[idx], bins=range(30))
        ax[row, col].set_title(label)
    plt.show()


if __name__ == '__main__':
    # Run from the script's own directory so the relative ../Data paths resolve
    # regardless of the caller's working directory.
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    # Build the station vocabulary and score each user's four-gram perplexity.
    V = load_vocabulary("../Data/all_stations.csv", n=4)
    users = ur.readPanelData("../Data/sampleData_2013.csv")
    prior = construct_priors(users, V)
    pp = user_fourgram_pp(users, V, prior)
    plot4(pp)
def predict_over_time():
	users = ur.readPanelData("/Users/zhanzhao/Dropbox (MIT)/TfL/Data/sampleData.csv")
	start = 11688
	end = 12419
	# predictability = []
	dowR = np.zeros(7)
	dowW = np.zeros(7)
	# hodR = np.zeros(24)
	# hodW = np.zeros(24)
	# weekR = np.zeros(105)
	# weekW = np.zeros(105)
	days = range(start, end, 7)
	total_R = 0
	total_W = 0
	# train_R = 0
	# train_W = 0
	for t in days:
		for user in users:
			# day1 = max(start, (t-7*training_period))
			# day2 = t
			# day3 = t + 7
			train, test = splitDataset(user.tripList, t)
			if len(train) > 0 and len(test) > 0:
				train_X, train_Y = data_prep(train)
				test_X, test_Y = data_prep(test)
				if len(np.unique(train_Y)) > 1:
					new_train_X, new_test_X = feature_extraction(train_X, test_X)
					# clf = nb.MultinomialNB()
					clf = lm.LogisticRegression()
					# clf = tree.DecisionTreeClassifier()
					# clf = svm.LinearSVC()
					clf.fit(new_train_X, train_Y)
					pred_Y = clf.predict(new_test_X)
				else:
					pred_Y = [train_Y[0]] * len(test_Y)
				r, w = predict_eval(test_Y, pred_Y)
				total_R += r
				total_W += w

				# new_train_X, tX = feature_extraction(train_X, train_X)
				# if len(np.unique(train_Y)) > 1:
				# 	pred_train_Y = clf.predict(tX)
				# else:
				# 	pred_train_Y = [train_Y[0]] * len(train_Y)
				# if len(pred_train_Y) != len(train_Y):
				# 	print len(train), new_train_X.shape, len(pred_train_Y), len(train_Y)
				# tr, tw = predict_eval(train_Y, pred_train_Y)
				# train_R += tr
				# train_W += tw

				# predictability.append((user.id, len(train), r, w))

				dow_r, dow_w, hod_r, hod_w = predict_eval_by_time(test_X, test_Y, pred_Y)
				dowR = dowR + dow_r
				dowW = dowW + dow_w
				# hodR = hodR + hod_r
				# hodW = hodW + hod_w

				# weekR[(t-11688)/7] += r
				# weekW[(t-11688)/7] += w

		if total_R + total_W > 0:
			accuracy = total_R*1.0/(total_R+total_W)
		else:
			accuracy = -9999
		# if train_R + train_W > 0:
		# 	train_acc = train_R*1.0/(train_R+train_W)
		# else:
		# 	train_acc = -9999
		print (t-start)/7, (total_R+total_W), accuracy
	# bar_plot(hodR, hodW)
	# bar_plot(dowR, dowW)
	# bar_plot(weekR, weekW)
	for i in xrange(len(dowR)):
		print i, dowR[i], dowW[i]