def runExperiments (allCourseData, withPrecourseSurvey = False):
    """Fit and score a model on every per-cutoff data set of every course.

    Only courses that appear both in ``allCourseData`` and in the
    module-level ``START_DATES`` are processed.  Each entry of
    ``allCourseData[courseId]`` must unpack into the 7-tuple
    ``(trainX, trainY, trainYcert, testX, testY, testYcert, usernames)``.

    Args:
        allCourseData: dict mapping a course id to a list of such 7-tuples.
        withPrecourseSurvey: when False (the default), the last column of
            both feature matrices is removed before training; the original
            comment describes that column as "whether student submitted
            precourse survey or not".

    Returns:
        A 3-tuple of dicts keyed by course id:
        * the second value returned by ``trainMLR`` when scored against
          ``testY`` (named ``auc`` here), one per retained entry;
        * ``(usernames, testYhat)`` pairs, one per retained entry;
        * the second value returned by ``trainMLR`` when scored against
          ``testYcert``, one per retained entry.
        Entries whose test labels contain fewer than two distinct values
        are skipped and contribute nothing to any of the three lists.
    """
    aucsByCourse = {}
    predictionsByCourse = {}
    certAucsByCourse = {}

    sharedCourses = set(allCourseData.keys()).intersection(START_DATES.keys())
    for courseId in sharedCourses:
        print(courseId)
        # Every processed course gets an entry, even if all of its data
        # sets end up being skipped below.
        courseAucs = aucsByCourse[courseId] = []
        coursePredictions = predictionsByCourse[courseId] = []
        courseCertAucs = certAucsByCourse[courseId] = []

        for weekData in allCourseData[courseId]:
            # NOTE(review): trainYcert is unpacked but never used; both
            # models below are trained on trainY -- confirm this is intended.
            (trainX, trainY, trainYcert,
             testX, testY, testYcert, usernames) = weekData

            if not withPrecourseSurvey:
                # Drop the trailing feature column from both matrices.
                trainX = trainX[:, :-1]
                testX = testX[:, :-1]

            # Both test label vectors need at least two distinct values,
            # otherwise this data set is not evaluated at all.
            if not (len(set(testY)) >= 2 and len(set(testYcert)) >= 2):
                print("Skipping...")
                continue

            fitResult = trainMLR(trainX, trainY, testX, testY, 1.0)
            _, auc, (_, testYhat) = fitResult
            print("{}: {}".format(courseId, auc))

            # Same training data, scored against the certification labels.
            certResult = trainMLR(trainX, trainY, testX, testYcert, 1.0)
            _, aucCert, _ = certResult

            courseAucs.append(auc)
            coursePredictions.append((usernames, testYhat))
            courseCertAucs.append(aucCert)

    return aucsByCourse, predictionsByCourse, certAucsByCourse
def prepareAllData (startDates, endDates, normalize):
    """Load every known course and build one data set per weekly cutoff.

    Only courses present both in ``startDates`` and in the module-level
    ``START_DATES`` are considered.  For each of them the raw data is
    loaded with ``loadData``, the course dates come from
    ``computeCourseDates``, and ``extractFeaturesAndTargets`` is called once
    for every cutoff in ``np.arange(T0 + 3*WEEK, Tc, WEEK)``.

    Args:
        startDates: dict keyed by course id; also forwarded to
            ``computeCourseDates``.
        endDates: accepted for interface compatibility; not used here.
        normalize: forwarded unchanged to ``extractFeaturesAndTargets``.

    Returns:
        dict mapping a course id to the list of values returned by
        ``extractFeaturesAndTargets``, one per cutoff, in cutoff order.

    Note:
        ``IOError`` and ``ValueError`` raised while handling a course are
        caught and reported as "Skipping".  If the failure happens after the
        course dates were computed, the course keeps whatever (possibly
        empty) list had been accumulated up to that point.
    """
    print("Preparing data...")
    dataByCourse = {}

    candidateCourses = set(startDates.keys()).intersection(START_DATES.keys())
    for courseId in candidateCourses:
        print("Loading {}...".format(courseId))
        try:
            pc, survey, pcd = loadData(courseId)
            courseStart, courseEnd = computeCourseDates(courseId, startDates)
            perCutoffData = dataByCourse[courseId] = []
            print("...done")

            # At least 3 weeks of data are needed to both train and test:
            # the first 2 weeks train a model (features from week 1, labels
            # determined by week 2), and a 3rd week is needed to evaluate it.
            # Hence the first cutoff is 3 weeks after the course start.
            cutoffs = np.arange(courseStart + 3*WEEK, courseEnd, WEEK)
            print("{} {}".format(courseId, cutoffs))

            for cutoff in cutoffs:
                # Users must have entered the course by the end of the
                # *first* of the last 3 weeks in the range, i.e. 2 weeks
                # before the cutoff.
                eligibleUsers = getRelevantUsers(pc, cutoff - 2*WEEK)
                perCutoffData.append(
                    extractFeaturesAndTargets(
                        pc, pcd, survey, eligibleUsers,
                        courseStart, cutoff, normalize,
                    )
                )
        except (IOError, ValueError):
            # Best effort: report the problem and move on to the next course.
            print("Skipping")

    print("...done")
    return dataByCourse