# predict_persistence.py
import util
import cPickle
import pandas
import math
import numpy as np
import sklearn.metrics
import sklearn.linear_model
import scipy.stats
from common import loadData, getCourseStartAndEndDates, getDummiesFixedSet, getRelevantUsers
from predict_certification import loadPersonCourseData, loadPersonCourseDayData, loadPrecourseSurveyData, \
        convertTimes, convertYoB, computeCourseDates, computeDaysSinceLastEvent, \
        trainMLR, START_DATES, MIN_EXAMPLES, WEEK, PREDICTION_DATES_1_0
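# Default L2 regularization strength for the MLR model. trainAll() and optimize()
# set this global before calling runExperiments(); the module-level default below
# is an addition (not in the original) so runExperiments() also works on its own.
MLR_REG = 1.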
# Converts each column of the specified matrix into percentiles (over the values
# in that column).
def percentilize (X):
    for i in range(X.shape[1]):
        X[:,i] = scipy.stats.rankdata(X[:,i])/float(X.shape[0])
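# For example (values rounded): a column [5., 1., 3.] has ranks [3, 1, 2], so
# percentilize maps it in place to [1.0, 0.33, 0.67].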
def runExperiments (allCourseData, withPrecourseSurvey = False):
    allAucs = {}
    allUsernamesAndPredictions = {}
    allAucsCert = {}
    for courseId in set(allCourseData.keys()).intersection(START_DATES.keys()):  # For each course
        print courseId
        allAucs[courseId] = []
        allUsernamesAndPredictions[courseId] = []
        allAucsCert[courseId] = []
        for i, weekData in enumerate(allCourseData[courseId]):
            # Each weekData tuple corresponds to one cutoff date Tc (see prepareAllData)
            (trainX, trainY, trainYcert, testX, testY, testYcert, usernames) = weekData
            if not withPrecourseSurvey:
                # Remove the precourse-survey indicator. Per getXandY, it is the
                # *second-to-last* column (the last is numDaysSinceLastEvent), so
                # delete that column rather than slicing off the last one.
                trainX = np.delete(trainX, trainX.shape[1] - 2, axis=1)
                testX = np.delete(testX, testX.shape[1] - 2, axis=1)
            if (len(set(testY)) < 2) or (len(set(testYcert)) < 2):
                print "Skipping..."
                continue
            _, auc, (_, testYhat) = trainMLR(trainX, trainY, testX, testY, MLR_REG)
            print "{}: {}".format(courseId, auc)
            # Evaluate the same persistence-trained model against certification labels
            _, aucCert, _ = trainMLR(trainX, trainY, testX, testYcert, MLR_REG)
            #print "To predict week {}: {}".format(i+3, auc)
            allAucs[courseId].append(auc)
            allUsernamesAndPredictions[courseId].append((usernames, testYhat))
            allAucsCert[courseId].append(aucCert)
    return allAucs, allUsernamesAndPredictions, allAucsCert
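# Trains MLR models for every course and cutoff date at a fixed regularization
# strength, and pickles the (AUCs, predictions, certification AUCs) results.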
def trainAll (allCourseData, withPrecourseSurvey):
    global MLR_REG
    MLR_REG = 1.
    results = runExperiments(allCourseData, withPrecourseSurvey)
    cPickle.dump(results, open("results_prong2.pkl", "wb"))
    return results
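# Grid-searches the MLR regularization strength over 10^-5 .. 10^5, re-running
# all experiments for each value and reporting the value with the best mean AUC.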
def optimize (allCourseData):
    global MLR_REG
    MLR_REG_SET = 10. ** np.arange(-5, +6).astype(np.float32)
    bestAuc = -1
    bestParamValue = None
    for paramValue in MLR_REG_SET:
        MLR_REG = float(paramValue)
        allAucs, _, _ = runExperiments(allCourseData)
        avgAuc = np.mean(np.hstack(allAucs.values()))
        print allAucs
        print "Mean AUC: {}".format(avgAuc)
        if avgAuc > bestAuc:
            bestAuc = avgAuc
            bestParamValue = paramValue
    print "Best AUC: {} for regularization strength {}".format(bestAuc, bestParamValue)
#def convertToQuantiles (X):
#    X = np.array(X, dtype=np.float32)
#    N = X.shape[0]
#    Xquantiles = np.zeros_like(X, dtype=np.float32)
#    for i in range(X.shape[1]):
#        col = X[:,i]
#        colSorted = np.tile(np.atleast_2d(np.sort(col, axis=0)), (N, 1))
#        colRep = np.tile(np.atleast_2d(col).T, (1, N))
#        # Line below: for each observation (element of col), find the smallest index
#        # in the *sorted* column that is >= that observation. Then normalize.
#        Xquantiles[:,i] = np.argmax(colRep <= colSorted, axis=1) / float(N)
#    return Xquantiles
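# Builds the design matrix X and target vectors for one course and one cutoff
# date Tc: y[i] indicates whether user i did anything in the week just before
# Tc, and yCert[i] whether that user eventually certified.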
def getXandY (pc, pcd, survey, usernames, T0, Tc, normalize):
    # TARGET VALUES
    # The target value for each user is whether or not the user did *anything*
    # during the week just prior to Tc.
    idxs = np.nonzero((pcd.date >= Tc - WEEK) & (pcd.date < Tc))[0]
    lastWeekPcd = pcd.iloc[idxs]
    grouping = lastWeekPcd.groupby('username')
    # Sum each user's sum_dt over that week; a user "persisted" if the total is
    # positive. Index into the aggregate's own index so the usernames line up
    # (the key order of grouping.groups is not guaranteed to match it).
    sumDt = grouping.sum_dt.sum()
    usersWhoPersisted = set(sumDt.index[np.nonzero(sumDt.values > 0)[0]])

    # FEATURE EXTRACTION
    # Restrict analysis to days between T0 and Tc-WEEK
    idxs = np.nonzero((pcd.date >= T0) & (pcd.date < Tc - WEEK))[0]
    pcd = pcd.iloc[idxs]
    # Create dummy variables
    pcUsernames = pc.username
    usernamesToCertifiedMap = { pcUsernames.iloc[i]:pc.certified.iloc[i] for i in range(len(pcUsernames)) }
    DEMOGRAPHIC_FIELDS = [ 'continent', 'YoB', 'LoE', 'gender' ]
    pc = pc[DEMOGRAPHIC_FIELDS]
    pc.YoB = convertYoB(pc.YoB)
    pc = getDummiesFixedSet(pc)
    #pc = pandas.get_dummies(pc, columns = [ 'continent', 'LoE', 'gender', 'YoB' ], dummy_na = True)
    # For efficiency, figure out which rows of the person-course and person-course-day
    # datasets belong to which users.
    usernamesToPcIdxsMap = dict(zip(pcUsernames, range(len(pc))))
    usernamesToCompletedSurveyMap = dict(zip(survey.username, survey.prs_ResponseID.notnull()))
    usernamesToPcdIdxsMap = {}
    for i in range(pcd.shape[0]):
        username = pcd.username.iloc[i]
        usernamesToPcdIdxsMap.setdefault(username, [])
        usernamesToPcdIdxsMap[username].append(i)
    ### Only analyze users who appear in the person-course-day dataset
    ##usernames = list(set(usernames).intersection(usernamesToPcdIdxsMap.keys()))
    # Extract features for all users and put them into the design matrix X
    pcdDates = pcd.date
    pcd = pcd.drop([ 'username', 'course_id', 'date', 'last_event' ], axis=1)
    nevents = pcd.nevents
    # Convert NaNs in person-course-day dataset to 0
    pcd = pcd.fillna(value=0)
    pcd = pcd.as_matrix()
    if normalize:
        pcd = pcd.astype(np.float32)
        #quantify.quantify(pcd.shape[0], pcd.shape[1], pcd)
        percentilize(pcd)
    # "+ 2": whether the user completed the precourse survey, and numDaysSinceLastEvent
    NUM_FEATURES = pcd.shape[1] + len(pc.columns) + 2
    X = np.zeros((len(usernames), NUM_FEATURES))
    y = np.zeros(len(usernames))
    yCert = np.zeros(len(usernames))
    for i, username in enumerate(usernames):
        if username in usernamesToPcdIdxsMap:
            idxs = np.array(usernamesToPcdIdxsMap[username])
            # Sum this user's person-course-day rows and put the totals into the
            # first column range of the design matrix X.
            X[i,0:pcd.shape[1]] = np.sum(pcd[idxs,:], axis=0)
        else:
            idxs = []
            X[i,0:pcd.shape[1]] = np.zeros(pcd.shape[1])
        # Now append the demographic features
        demographics = pc.iloc[usernamesToPcIdxsMap[username]]
        X[i,pcd.shape[1]:pcd.shape[1]+len(demographics)] = demographics
        # Last 2 features
        usernamesToCompletedSurveyMap.setdefault(username, False)
        completedSurvey = usernamesToCompletedSurveyMap[username]
        X[i,NUM_FEATURES-2] = completedSurvey
        numDaysSinceLastEvent = computeDaysSinceLastEvent(nevents, pcdDates, T0, Tc, idxs)
        X[i,NUM_FEATURES-1] = numDaysSinceLastEvent
        y[i] = username in usersWhoPersisted
        yCert[i] = usernamesToCertifiedMap[username]
    return X, y, yCert
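# Builds a train/test split over time: the training set is labeled using the
# week ending at Tc - WEEK, and the test set using the week ending at Tc.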
def extractFeaturesAndTargets (somePc, somePcd, someSurvey, usernames, T0, Tc, normalize):
    # Get features and target values
    trainX, trainY, trainYcert = getXandY(somePc, somePcd, someSurvey, usernames, T0, Tc - 1*WEEK, normalize)
    testX, testY, testYcert = getXandY(somePc, somePcd, someSurvey, usernames, T0, Tc, normalize)
    if len(np.nonzero(trainY == 0)[0]) < MIN_EXAMPLES or len(np.nonzero(trainY == 1)[0]) < MIN_EXAMPLES:
        raise ValueError("Train: Too few examples or all one class")
    if len(np.nonzero(testY == 0)[0]) < MIN_EXAMPLES or len(np.nonzero(testY == 1)[0]) < MIN_EXAMPLES:
        raise ValueError("Test: Too few examples or all one class")
    return trainX, trainY, trainYcert, testX, testY, testYcert, usernames
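# Loads each course's person-course, person-course-day, and survey data, then
# extracts features and targets for every weekly cutoff date in the course run.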
def prepareAllData (startDates, endDates, normalize):
print "Preparing data..."
allCourseData = {}
for courseId in set(startDates.keys()).intersection(START_DATES.keys()): # For each course
# Load data for this course
print "Loading {}...".format(courseId)
try:
somePc, someSurvey, somePcd = loadData(courseId)
T0, Tc = computeCourseDates(courseId, startDates)
allCourseData[courseId] = []
print "...done"
# We need at least 3 weeks' worth of data to both train and test the model.
# We use the first 2 weeks' data to train a model (labels are determined by week 2, and
# features are extracted from week 1). But then to *evaluate* that model, we need
# another (3rd) week.
Tcutoffs = np.arange(T0 + 3*WEEK, Tc, WEEK)
print courseId, Tcutoffs
for Tcutoff in Tcutoffs:
# The users that we train/test on must have entered the course by the end of the
# *first* week of the last 3 weeks in the time range. Hence, we subtract 2 weeks.
usernames = getRelevantUsers(somePc, Tcutoff - 2*WEEK)
allData = extractFeaturesAndTargets(somePc, somePcd, someSurvey, usernames, T0, Tcutoff, normalize)
allCourseData[courseId].append(allData)
except (IOError, ValueError):
print "Skipping"
continue
print "...done"
return allCourseData
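# Entry point. The globals() checks cache startDates and allCourseData so the
# script can be re-run from an interactive session without reloading the data.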
if __name__ == "__main__":
    NORMALIZE = True
    if 'startDates' not in globals():
        startDates, endDates = getCourseStartAndEndDates()
    if 'allCourseData' not in globals():
        allCourseData = prepareAllData(startDates, endDates, NORMALIZE)
    #optimize(allCourseData)
    results = trainAll(allCourseData, True)
    print results