Example #1
def processScrapedToOpenClosed(printProgress = False):
	# Summarize each scraped course-data snapshot into open/closed statistics,
	# resuming from (and appending to) any results already pickled on disk.
	def statusOut():
		# Progress line: percent complete, current file, alignment spacer, then ETA.
		out = str(pctComplete) + '% complete: ' + \
			'File ' + str(numFilesProcessed + 1) + '/' + str(numFilesTotal) + \
			' (' + fileTime + '.' + courseDataExt + ')' + \
			alignRightSpacer + \
			eta
		return out
	filesToAnalyze = fileUtils.getAllFiles(dataDir = courseDataDir, dataExt = courseDataExt, latestFirst = False)
	# Resume from any existing output: load the previous results and drop files
	# that have already been analyzed from the to-do list.
	try:
		with open(openClosedRawFileLoc, 'r') as existingDataFile:
			existingData = cPickle.load(existingDataFile)
			fileNamesToAnalyze = [fileUtils.getFileNameFromPath(fileName) for fileName in filesToAnalyze]
			fileNamesToAnalyze = list(set(fileNamesToAnalyze).difference(set(existingData)))
			filesToAnalyze = [courseDataDir + '/' + fileName + '.' + courseDataExt for fileName in fileNamesToAnalyze]
			allData = existingData
	except IOError:
		# No previous output file yet; start with an empty result set.
		allData = {}
	numFilesProcessed = 0
	numFilesTotal = len(filesToAnalyze)
	pluralText = 'files'
	if numFilesTotal == 1:
		pluralText = 'file'
	if printProgress:
		print 'Processing open/closed data:', numFilesTotal, pluralText, 'to analyze.'
	if numFilesTotal > 0:
		startTime = time.clock()
		alignRightSpacer = ''
		for fileToAnalyze in filesToAnalyze:
			fileTime = fileUtils.getFileNameFromPath(fileToAnalyze)
			timePassed = time.clock() - startTime
			numFilesLeft = numFilesTotal - numFilesProcessed
			pctComplete = numFilesProcessed * 100 / numFilesTotal
			if numFilesProcessed == 0:
				# No timing data yet, so skip the ETA for the first file.
				eta = ''
			else:
				# Estimate the time remaining from the average time per file so far.
				etaTime = (timePassed / numFilesProcessed) * numFilesLeft
				etaTimePretty = time.strftime('%H:%M:%S', time.gmtime(etaTime))
				eta = ' (ETA: ' + etaTimePretty + ')'
			try:
				consoleWidth = int(consoleSize()[0])
			except ValueError:
				# Fall back to a standard 80-column width if the console size can't be read.
				consoleWidth = 80
			# Grow or shrink the spacer until the status line exactly fills the console,
			# pushing the ETA flush against the right edge.
			while len(statusOut()) != consoleWidth:
				if len(statusOut()) < consoleWidth:
					alignRightSpacer += ' '
				else:
					if len(alignRightSpacer) == 0:
						break
					alignRightSpacer = alignRightSpacer[:-1]
			if printProgress:
				dynPrint(statusOut())
			courseDataRaw = fileUtils.unpickle(fileToAnalyze)
			courseDataProc = getOpenClosedStats(courseDataRaw)
			allData[fileTime] = courseDataProc
			numFilesProcessed += 1
		# Persist the combined (existing + newly processed) results.
		with open(openClosedRawFileLoc, 'w') as dataOut:
			cPickle.dump(allData, dataOut)
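The character-at-a-time padding above can also be expressed as a single computed gap. Below is a minimal sketch of the same right-alignment idea; statusLine and fileLabel are illustrative names, not part of the original module.

def statusLine(pctComplete, fileLabel, eta, consoleWidth = 80):
	# Left part: percent complete plus the file currently being processed.
	left = str(pctComplete) + '% complete: File ' + fileLabel
	# Pad the gap so the ETA lands flush against the right edge of the console.
	gap = max(consoleWidth - len(left) - len(eta), 0)
	return left + ' ' * gap + eta

print statusLine(40, '5/12 (1403120000.dat)', ' (ETA: 00:01:30)')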
Example #2
def processScrapedToSubjectSeats(printProgress = False):
	# Aggregate per-subject seat statistics from each scraped snapshot,
	# resuming from any results already pickled on disk.
	subjDict = fileUtils.unpickle(subjListFileLoc)
	subjList = sorted(subjDict.keys())
	filesToAnalyze = fileUtils.getAllFiles(dataDir = courseDataDir, dataExt = courseDataExt, latestFirst = False)
	try:
		# Resume from any existing output; '_filesProcessed' tracks which
		# snapshots have already been folded into the results.
		with open(subjectSeatsFileLoc, 'r') as existingDataFile:
			allData = cPickle.load(existingDataFile)
	except IOError:
		allData = {'_filesProcessed': set()}
	numFilesProcessed = 0
	numFilesTotal = len(filesToAnalyze)
	pluralText = 'files'
	if numFilesTotal == 1:
		pluralText = 'file'
	if numFilesTotal > 0:
		# Only use snapshots that pass the open/closed sanity check and still exist on disk.
		openClosedData = fileUtils.unpickle(openClosedRawFileLoc)
		saneOpenClosedFileKeys = sanityCheck(openClosedData)
		saneOpenClosedFileKeysSet = set(saneOpenClosedFileKeys)
		saneFileKeys = saneOpenClosedFileKeysSet.intersection(set(fileUtils.getAllFiles()))
		# Skip anything already recorded in the existing results.
		existingFileKeys = allData['_filesProcessed']
		unprocessedFileKeysSet = set(saneFileKeys).difference(existingFileKeys)
		if printProgress:
			print len(unprocessedFileKeysSet), 'unprocessed files'
		unprocessedFileKeys = sorted(list(unprocessedFileKeysSet))
		if len(unprocessedFileKeys) > 0:
			for fileTime in unprocessedFileKeys:
				fileLoc = courseDataDir + '/' + fileTime + '.' + courseDataExt
				if printProgress:
					print fileLoc
				try:
					subjDataRaw = fileUtils.unpickle(fileLoc)
				except IOError:
					continue # with the next file if the one being opened doesn't exist
				subjDataProc = getSubjSeatStats(subjList, subjDataRaw)
				allData[fileTime] = subjDataProc
				allData['_filesProcessed'].add(fileTime)
				numFilesProcessed += 1
			with open(subjectSeatsFileLoc, 'w') as dataOut:
				cPickle.dump(allData, dataOut)
		if printProgress:
			print 'Done.'
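Both examples follow the same resume pattern: load any existing pickled results, diff against what is on disk, process only the new items, then re-pickle everything. The stripped-down sketch below shows that pattern in isolation; processNewFiles and the process callable are illustrative, not part of the original module.

import cPickle

def processNewFiles(resultsPath, candidateKeys, process):
	# Load previously saved results, or start fresh if the pickle doesn't exist yet.
	try:
		with open(resultsPath, 'r') as f:
			results = cPickle.load(f)
	except IOError:
		results = {}
	# Only keys that aren't already in the saved results get processed.
	for key in sorted(set(candidateKeys).difference(set(results))):
		results[key] = process(key)
	# Re-pickle the combined results so the next run can skip these keys too.
	with open(resultsPath, 'w') as f:
		cPickle.dump(results, f)
	return results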
Example #3
# Pull file locations and chart parameters from the shared configuration.
cfg = loadConfig()
statsOutputDir = cfg['dataLoc']['statsDir']
statsExt = cfg['dataLoc']['statsFiles']['statsExt']
openClosedProcessedFileName = cfg['dataLoc']['statsFiles']['openClosedData']['processed'] + '.' + statsExt
openClosedProcessedFileLoc = statsOutputDir + '/' + openClosedProcessedFileName
jsonFullOutLoc = cfg['dataLoc']['json']['seatsAllSubjs']
jsonDividedOutLoc = cfg['dataLoc']['json']['seatsBySubj']
firstUsefulTime = cfg['stockChart']['firstUsefulTime']
dataColumn = cfg['stockChart']['column']
dataMult = cfg['stockChart']['dataMult']
subjectDataLoc = cfg['dataLoc']['subjList']
subjectSeatsFileLoc = statsOutputDir + '/' + 'subjectSeats.dat'
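# For the lookups above to succeed, loadConfig() has to return a nesting along
# these lines; exampleCfg and its values are placeholders, not the real config.
exampleCfg = {
	'dataLoc': {
		'statsDir': 'stats',
		'subjList': 'stats/subjList.dat',
		'statsFiles': {
			'statsExt': 'dat',
			'openClosedData': {'processed': 'openClosedProcessed'},
		},
		'json': {
			'seatsAllSubjs': 'json/seatsAll.json',
			'seatsBySubj': 'json/seatsBySubj.json',
		},
	},
	'stockChart': {'firstUsefulTime': 0, 'column': 'open', 'dataMult': 1},
}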

if __name__ == '__main__':
	diffData = fileUtils.unpickle(openClosedProcessedFileLoc)
	subjSeatsData = fileUtils.unpickle(subjectSeatsFileLoc)
	del subjSeatsData['_filesProcessed'] # drop the bookkeeping key so only timestamp keys remain

	# print subjSeatsData

	subjectDict = fileUtils.unpickle(subjectDataLoc)
	subjects = sorted(subjectDict.keys())

	jsonFullData = []
	jsonSubjData = []

	def getUsefulSortedKeys(unorderedDict):
		sortedKeys = sorted(unorderedDict.keys())
		def isUsefulData(key):
			key = int(key)