def getXandY(pc, pcd, survey, usernames, T0, Tc, demographicsOnly):
    """Build the design matrix and certification labels for the given users.

    Args:
        pc: person-course DataFrame; the columns read here are `username`,
            `certified`, `continent`, `YoB`, `LoE` and `gender`.
        pcd: person-course-day DataFrame; the columns read here are
            `username`, `course_id`, `date`, `last_event`, `sum_dt` and
            `nevents`.
        survey: DataFrame with `username` and `prs_ResponseID` columns; a
            non-null `prs_ResponseID` counts as a completed survey.
        usernames: sequence of usernames to extract rows for.
        T0: lower bound (inclusive) on `pcd.date`.
        Tc: upper bound (exclusive) on `pcd.date`.
        demographicsOnly: if true, every non-demographic column of X is
            zeroed out before returning.

    Returns:
        A tuple `(X, Xheur, y, sumDt)` restricted to the users whose feature
        row sums to a finite value:
        X -- per-user features: column-wise sums of the person-course-day
             rows, then the demographic dummies, then the survey-completed
             flag, then the value returned by computeDaysSinceLastEvent.
        Xheur -- the negated computeDaysSinceLastEvent value.
        y -- the user's `certified` value.
        sumDt -- the user's summed `sum_dt` over the window.

    Raises:
        KeyError: if a requested username does not appear in `pc.username`.
    """
    # Restrict analysis to days between T0 and Tc
    inWindow = np.asarray((pcd.date >= T0) & (pcd.date < Tc))
    pcd = pcd.iloc[np.nonzero(inWindow)[0]]

    # Per-user lookups must be built before pc is reduced to the demographic columns
    pcUsernames = pc.username
    usernamesToCertifiedMap = dict(zip(pcUsernames, pc.certified))

    # Create dummy variables
    DEMOGRAPHIC_FIELDS = ['continent', 'YoB', 'LoE', 'gender']
    # Explicit copy so that assigning YoB below does not raise SettingWithCopyWarning
    pc = pc[DEMOGRAPHIC_FIELDS].copy()
    pc['YoB'] = convertYoB(pc.YoB)
    pc = getDummiesFixedSet(pc)

    # For efficiency, figure out which rows of the person-course and person-course-day
    # datasets belong to which users
    usernamesToPcIdxsMap = dict(zip(pcUsernames, range(len(pc))))
    usernamesToCompletedSurveyMap = dict(zip(survey.username, survey.prs_ResponseID.notnull()))
    usernamesToPcdIdxsMap = {}
    for rowIdx, username in enumerate(pcd.username.values):
        usernamesToPcdIdxsMap.setdefault(username, []).append(rowIdx)

    # Extract features for all users and put them into the design matrix X
    pcdDates = pcd.date
    pcd = pcd.drop(['username', 'course_id', 'date', 'last_event'], axis=1)
    # Convert NaNs in person-course-day dataset to 0
    pcd = pcd.fillna(value=0)
    nevents = pcd.nevents
    # Plain ndarray view of the features: row selection by position is cheap and
    # np.sum does not silently skip non-finite entries
    pcdValues = pcd.values
    numPcdFeatures = len(pcd.columns)

    NUM_DAYS = 1
    # "+ 2" -- completion of precourse survey; and numDaysSinceLastEvent
    NUM_FEATURES = NUM_DAYS * numPcdFeatures + len(pc.columns) + 2
    X = np.zeros((len(usernames), NUM_FEATURES))
    Xheur = np.zeros(len(usernames))
    y = np.zeros(len(usernames))
    sumDts = np.zeros((len(usernames), NUM_DAYS))  # Keep track of sum_dt as a special feature
    goodIdxs = []
    for i, username in enumerate(usernames):
        if username in usernamesToPcdIdxsMap:
            idxs = np.array(usernamesToPcdIdxsMap[username])
            # Sum this user's person-course-day rows column-wise into the
            # leading columns of the design matrix
            X[i, 0:numPcdFeatures] = np.sum(pcdValues[idxs], axis=0)
            sumDts[i] = np.sum(pcd.sum_dt.iloc[idxs])
        else:
            # No activity in the window: X[i] and sumDts[i] keep their zero initialisation
            idxs = []

        # Now append the demographic features
        demographics = pc.iloc[usernamesToPcIdxsMap[username]]
        X[i, NUM_DAYS * numPcdFeatures:NUM_FEATURES - 2] = demographics

        # Last 2 features
        X[i, NUM_FEATURES - 2] = usernamesToCompletedSurveyMap.get(username, False)
        numDaysSinceLastEvent = computeDaysSinceLastEvent(nevents, pcdDates, T0, Tc, idxs)
        X[i, NUM_FEATURES - 1] = numDaysSinceLastEvent
        # "*-1" -- so that fewer days since last action means higher prob. of certification
        Xheur[i] = numDaysSinceLastEvent * -1

        y[i] = usernamesToCertifiedMap[username]
        # Only keep users whose feature row contains no NaN/inf
        if np.isfinite(np.sum(X[i, :])):
            goodIdxs.append(i)

    if demographicsOnly:
        # Zero out the non-demographics information
        X[:, 0:NUM_DAYS * numPcdFeatures] = 0
        X[:, NUM_FEATURES - 2:] = 0
    return X[goodIdxs, :], Xheur[goodIdxs], y[goodIdxs], np.sum(sumDts[goodIdxs, :], axis=1)
def getXandY(pc, pcd, survey, usernames, T0, Tc, normalize):
    """Build the design matrix plus persistence and certification labels.

    Features are extracted from person-course-day rows dated in
    [T0, Tc - WEEK); the persistence label is derived from the rows dated in
    [Tc - WEEK, Tc).

    Args:
        pc: person-course DataFrame; the columns read here are `username`,
            `certified`, `continent`, `YoB`, `LoE` and `gender`.
        pcd: person-course-day DataFrame; the columns read here are
            `username`, `course_id`, `date`, `last_event`, `sum_dt` and
            `nevents`.
        survey: DataFrame with `username` and `prs_ResponseID` columns; a
            non-null `prs_ResponseID` counts as a completed survey.
        usernames: sequence of usernames to extract rows for.
        T0: lower bound (inclusive) on `pcd.date` for feature extraction.
        Tc: upper bound (exclusive) on `pcd.date` for the label week.
        normalize: if true, the person-course-day feature matrix is cast to
            float32 and passed through percentilize() before aggregation.

    Returns:
        A tuple `(X, y, yCert)` with one row/entry per requested username:
        X -- per-user features: column-wise sums of the person-course-day
             rows, then the demographic dummies, then the survey-completed
             flag, then the value returned by computeDaysSinceLastEvent.
        y -- 1 if the user's `sum_dt` summed over the last week is > 0, else 0.
        yCert -- the user's `certified` value.

    Raises:
        KeyError: if a requested username does not appear in `pc.username`.
    """
    # TARGET VALUES
    # The target value for each user consists of whether or not the user
    # did *anything* during the week just prior to Tc
    inLastWeek = np.asarray((pcd.date >= Tc - WEEK) & (pcd.date < Tc))
    lastWeekPcd = pcd.iloc[np.nonzero(inLastWeek)[0]]
    lastWeekSumDt = lastWeekPcd.groupby('username').sum_dt.sum()
    # Take the usernames from the aggregated result's own index so each name
    # stays paired with its sum; the key order of `groupby(...).groups` is not
    # guaranteed to match the order of the aggregated Series.
    usersWhoPersisted = set(lastWeekSumDt.index[np.asarray(lastWeekSumDt > 0)])

    # FEATURE EXTRACTION
    # Restrict analysis to days between T0 and Tc-WEEK
    inWindow = np.asarray((pcd.date >= T0) & (pcd.date < Tc - WEEK))
    pcd = pcd.iloc[np.nonzero(inWindow)[0]]

    # Per-user lookups must be built before pc is reduced to the demographic columns
    pcUsernames = pc.username
    usernamesToCertifiedMap = dict(zip(pcUsernames, pc.certified))

    # Create dummy variables
    DEMOGRAPHIC_FIELDS = ['continent', 'YoB', 'LoE', 'gender']
    # Explicit copy so that assigning YoB below does not raise SettingWithCopyWarning
    pc = pc[DEMOGRAPHIC_FIELDS].copy()
    pc['YoB'] = convertYoB(pc.YoB)
    pc = getDummiesFixedSet(pc)

    # For efficiency, figure out which rows of the person-course and person-course-day
    # datasets belong to which users
    usernamesToPcIdxsMap = dict(zip(pcUsernames, range(len(pc))))
    usernamesToCompletedSurveyMap = dict(zip(survey.username, survey.prs_ResponseID.notnull()))
    usernamesToPcdIdxsMap = {}
    for rowIdx, username in enumerate(pcd.username.values):
        usernamesToPcdIdxsMap.setdefault(username, []).append(rowIdx)

    # Extract features for all users and put them into the design matrix X
    pcdDates = pcd.date
    pcd = pcd.drop(['username', 'course_id', 'date', 'last_event'], axis=1)
    # Captured before fillna, so any NaNs in nevents are preserved in this Series
    nevents = pcd.nevents
    # Convert NaNs in person-course-day dataset to 0
    pcd = pcd.fillna(value=0)
    # From here on pcd is a plain ndarray (rows x features)
    pcd = pcd.values
    if normalize:
        pcd = pcd.astype(np.float32)
        # Return value is unused -- presumably percentilize rewrites pcd in place; TODO confirm
        percentilize(pcd)
    numPcdFeatures = pcd.shape[1]

    # "+ 2" -- encode whether or not user completed precourse survey; and numDaysSinceLastEvent
    NUM_FEATURES = numPcdFeatures + len(pc.columns) + 2
    X = np.zeros((len(usernames), NUM_FEATURES))
    y = np.zeros(len(usernames))
    yCert = np.zeros(len(usernames))
    for i, username in enumerate(usernames):
        if username in usernamesToPcdIdxsMap:
            idxs = np.array(usernamesToPcdIdxsMap[username])
            # Sum this user's person-course-day rows column-wise into the
            # leading columns of the design matrix
            X[i, 0:numPcdFeatures] = np.sum(pcd[idxs, :], axis=0)
        else:
            # No activity in the window: X[i] keeps its zero initialisation
            idxs = []

        # Now append the demographic features
        demographics = pc.iloc[usernamesToPcIdxsMap[username]]
        X[i, numPcdFeatures:numPcdFeatures + len(demographics)] = demographics

        # Last 2 features
        X[i, NUM_FEATURES - 2] = usernamesToCompletedSurveyMap.get(username, False)
        numDaysSinceLastEvent = computeDaysSinceLastEvent(nevents, pcdDates, T0, Tc, idxs)
        X[i, NUM_FEATURES - 1] = numDaysSinceLastEvent

        y[i] = username in usersWhoPersisted
        yCert[i] = usernamesToCertifiedMap[username]
    return X, y, yCert