def getXandY(pc, pcd, survey, usernames, T0, Tc, demographicsOnly):
	"""Build the design matrix and certification labels for the given users.

	Features are extracted from person-course-day rows whose date lies in
	[T0, Tc). Each row of X is laid out as:
	  * the per-user column sums of the person-course-day features,
	  * the demographic columns produced by getDummiesFixedSet,
	  * whether the user has a non-null prs_ResponseID in the survey,
	  * the value returned by computeDaysSinceLastEvent for the user.

	NOTE(review): a later definition of getXandY in this source takes
	``normalize`` as its last parameter; if both live in the same module,
	that one shadows this one -- confirm which is intended.

	Args:
		pc: person-course DataFrame; the columns read here are username,
			certified, continent, YoB, LoE and gender.
		pcd: person-course-day DataFrame; the columns read here are username,
			course_id, date, last_event, sum_dt and nevents.
		survey: DataFrame with columns username and prs_ResponseID.
		usernames: iterable of usernames to build rows for, in output order.
		T0: inclusive lower bound on pcd.date.
		Tc: exclusive upper bound on pcd.date.
		demographicsOnly: if true, every non-demographic column of X is
			zeroed before returning.

	Returns:
		Tuple (X, Xheur, y, sumDt), each restricted to the users whose feature
		row sums to a finite number:
		  X     -- design matrix, one row per kept user.
		  Xheur -- negated days-since-last-event (fewer days => higher score).
		  y     -- the user's ``certified`` value from pc.
		  sumDt -- the user's total sum_dt over the analysed window.

	Raises:
		KeyError: if a username is not present in pc.username.
	"""
	# Restrict analysis to days between T0 (inclusive) and Tc (exclusive)
	inWindow = ((pcd.date >= T0) & (pcd.date < Tc)).values
	pcd = pcd.iloc[np.flatnonzero(inWindow)]

	# Remember per-user label before pc is reduced to the demographic columns
	pcUsernames = pc.username
	usernamesToCertifiedMap = dict(zip(pcUsernames, pc.certified))

	# Create dummy variables. Work on an explicit copy so that the YoB
	# assignment below does not write to a slice of the caller's frame.
	DEMOGRAPHIC_FIELDS = [ 'continent', 'YoB', 'LoE', 'gender' ]
	pc = pc[DEMOGRAPHIC_FIELDS].copy()
	pc['YoB'] = convertYoB(pc['YoB'])
	pc = getDummiesFixedSet(pc)

	# For efficiency, figure out which rows of the person-course and person-course-day
	# datasets belong to which users
	usernamesToPcIdxsMap = dict(zip(pcUsernames, range(len(pc))))
	usernamesToCompletedSurveyMap = dict(zip(survey.username, survey.prs_ResponseID.notnull()))
	usernamesToPcdIdxsMap = {}
	for rowIdx, username in enumerate(pcd.username):
		usernamesToPcdIdxsMap.setdefault(username, []).append(rowIdx)

	# Extract features for all users and put them into the design matrix X
	pcdDates = pcd.date
	pcd = pcd.drop([ 'username', 'course_id', 'date', 'last_event' ], axis=1)

	# Convert NaNs in person-course-day dataset to 0
	pcd = pcd.fillna(value=0)

	# Pull the raw arrays out once instead of once per user.
	# (.values replaces DataFrame.as_matrix(), which newer pandas no longer has.)
	pcdValues = pcd.values
	sumDtValues = pcd.sum_dt.values
	nevents = pcd.nevents
	demographicValues = pc.values
	numPcdCols = len(pcd.columns)

	NUM_DAYS = 1
	NUM_FEATURES = NUM_DAYS * numPcdCols + len(pc.columns) + 2  # "+ 2" -- completion of precourse survey; and numDaysSinceLastEvent
	X = np.zeros((len(usernames), NUM_FEATURES))
	Xheur = np.zeros(len(usernames))
	y = np.zeros(len(usernames))
	sumDts = np.zeros((len(usernames), NUM_DAYS))  # Keep track of sum_dt as a special feature
	goodIdxs = []
	for i, username in enumerate(usernames):
		if username in usernamesToPcdIdxsMap:
			idxs = np.array(usernamesToPcdIdxsMap[username])
			# Sum this user's person-course-day rows into the leading columns of X
			X[i,0:numPcdCols] = np.sum(pcdValues[idxs], axis=0)
			sumDts[i] = np.sum(sumDtValues[idxs])
		else:
			# No activity in the window: X and sumDts are already zero for this user
			idxs = []

		# Now append the demographic features
		X[i,NUM_DAYS * numPcdCols:NUM_FEATURES-2] = demographicValues[usernamesToPcIdxsMap[username]]

		# Last 2 features; users absent from the survey count as "not completed"
		X[i,NUM_FEATURES-2] = usernamesToCompletedSurveyMap.get(username, False)
		numDaysSinceLastEvent = computeDaysSinceLastEvent(nevents, pcdDates, T0, Tc, idxs)
		X[i,NUM_FEATURES-1] = numDaysSinceLastEvent

		Xheur[i] = numDaysSinceLastEvent * -1  # "*-1" -- so that fewer days since last action means higher prob. of certification
		y[i] = usernamesToCertifiedMap[username]

		# Only keep users whose feature row contains no NaN/inf
		if np.isfinite(np.sum(X[i,:])):
			goodIdxs.append(i)

	if demographicsOnly:
		# Zero out the non-demographics information
		X[:,0:NUM_DAYS*numPcdCols] = 0
		X[:,NUM_FEATURES-2:] = 0
	return X[goodIdxs,:], Xheur[goodIdxs], y[goodIdxs], np.sum(sumDts[goodIdxs,:], axis=1)
def getXandY(pc, pcd, survey, usernames, T0, Tc, normalize):
	"""Build the design matrix, persistence labels and certification labels.

	The persistence target for a user is whether their total sum_dt over the
	person-course-day rows dated in [Tc - WEEK, Tc) is positive. Features are
	extracted from rows dated in [T0, Tc - WEEK). Each row of X is laid out as:
	  * the per-user column sums of the person-course-day features,
	  * the demographic columns produced by getDummiesFixedSet,
	  * whether the user has a non-null prs_ResponseID in the survey,
	  * the value returned by computeDaysSinceLastEvent for the user.

	Args:
		pc: person-course DataFrame; the columns read here are username,
			certified, continent, YoB, LoE and gender.
		pcd: person-course-day DataFrame; the columns read here are username,
			course_id, date, last_event, sum_dt and nevents.
		survey: DataFrame with columns username and prs_ResponseID.
		usernames: iterable of usernames to build rows for, in output order.
		T0: inclusive lower bound on pcd.date for feature extraction.
		Tc: exclusive upper bound of the target week.
		normalize: if true, the person-course-day feature matrix is cast to
			float32 and passed to percentilize before the per-user sums.

	Returns:
		Tuple (X, y, yCert): the design matrix, the persistence label per
		user, and the user's ``certified`` value from pc.

	Raises:
		KeyError: if a username is not present in pc.username.
	"""
	# TARGET VALUES
	# The target value for each user consists of whether or not the user
	# did *anything* during the week just prior to Tc
	inLastWeek = ((pcd.date >= Tc - WEEK) & (pcd.date < Tc)).values
	lastWeekPcd = pcd.iloc[np.flatnonzero(inLastWeek)]
	# Take the usernames from the aggregated result's own index so that each
	# total stays paired with its user (the order of groups.keys() is not
	# guaranteed to match the order of the aggregated Series).
	sumDtPerUser = lastWeekPcd.groupby('username')['sum_dt'].sum()
	usersWhoPersisted = set(sumDtPerUser.index[(sumDtPerUser > 0).values])

	# FEATURE EXTRACTION
	# Restrict analysis to days between T0 and Tc-WEEK
	inWindow = ((pcd.date >= T0) & (pcd.date < Tc - WEEK)).values
	pcd = pcd.iloc[np.flatnonzero(inWindow)]

	# Remember per-user label before pc is reduced to the demographic columns
	pcUsernames = pc.username
	usernamesToCertifiedMap = dict(zip(pcUsernames, pc.certified))

	# Create dummy variables. Work on an explicit copy so that the YoB
	# assignment below does not write to a slice of the caller's frame.
	DEMOGRAPHIC_FIELDS = [ 'continent', 'YoB', 'LoE', 'gender' ]
	pc = pc[DEMOGRAPHIC_FIELDS].copy()
	pc['YoB'] = convertYoB(pc['YoB'])
	pc = getDummiesFixedSet(pc)

	# For efficiency, figure out which rows of the person-course and person-course-day
	# datasets belong to which users
	usernamesToPcIdxsMap = dict(zip(pcUsernames, range(len(pc))))
	usernamesToCompletedSurveyMap = dict(zip(survey.username, survey.prs_ResponseID.notnull()))
	usernamesToPcdIdxsMap = {}
	for rowIdx, username in enumerate(pcd.username):
		usernamesToPcdIdxsMap.setdefault(username, []).append(rowIdx)

	# Extract features for all users and put them into the design matrix X
	pcdDates = pcd.date
	pcd = pcd.drop([ 'username', 'course_id', 'date', 'last_event' ], axis=1)
	nevents = pcd.nevents  # captured before the NaN -> 0 fill below

	# Convert NaNs in person-course-day dataset to 0
	pcd = pcd.fillna(value=0)
	# .values replaces DataFrame.as_matrix(), which newer pandas no longer has
	pcd = pcd.values
	if normalize:
		pcd = pcd.astype(np.float32)
		percentilize(pcd)  # presumably rewrites pcd in place (return value is unused) -- TODO confirm

	demographicValues = pc.values
	numPcdCols = pcd.shape[1]
	NUM_FEATURES = numPcdCols + len(pc.columns) + 2  # "+ 2" -- encode whether or not user completed precourse survey; and numDaysSinceLastEvent
	X = np.zeros((len(usernames), NUM_FEATURES))
	y = np.zeros(len(usernames))
	yCert = np.zeros(len(usernames))
	for i, username in enumerate(usernames):
		if username in usernamesToPcdIdxsMap:
			idxs = np.array(usernamesToPcdIdxsMap[username])
			# Sum this user's person-course-day rows into the leading columns of X
			X[i,0:numPcdCols] = np.sum(pcd[idxs,:], axis=0)
		else:
			# No activity in the window: the row of X is already zero
			idxs = []

		# Now append the demographic features
		X[i,numPcdCols:NUM_FEATURES-2] = demographicValues[usernamesToPcIdxsMap[username]]

		# Last 2 features; users absent from the survey count as "not completed"
		X[i,NUM_FEATURES-2] = usernamesToCompletedSurveyMap.get(username, False)
		# NOTE(review): features stop at Tc - WEEK, but Tc is passed here -- confirm intended
		numDaysSinceLastEvent = computeDaysSinceLastEvent(nevents, pcdDates, T0, Tc, idxs)
		X[i,NUM_FEATURES-1] = numDaysSinceLastEvent

		y[i] = username in usersWhoPersisted
		yCert[i] = usernamesToCertifiedMap[username]
	return X, y, yCert