def processScrapedToOpenClosed(printProgress = False):
    """Fold every not-yet-processed scraped course file into the open/closed pickle.

    Lists the scraped files in courseDataDir (oldest first), skips the ones
    whose file name is already a key of the dict pickled at
    openClosedRawFileLoc, runs getOpenClosedStats() on each remaining file,
    stores the result under that file name, and finally writes the merged
    dict back to openClosedRawFileLoc. Nothing is written when there is no
    new file to process.

    Arguments:
    printProgress -- when True, print a one-line summary and, before each
                     file, a status line (percent done, file counter and a
                     right-aligned ETA) through dynPrint(). When False no
                     status text is built and consoleSize() is never called.

    Returns None.
    """
    def getConsoleWidth():
        # Fall back to 80 columns when the reported width is not int()-able.
        try:
            return int(consoleSize()[0])
        except ValueError:
            return 80

    def statusOut(fileTime, numDone, timePassed):
        # Build "<pct>% complete: File i/n (<name>.<ext>)" with the ETA pushed
        # to the right edge of the console.
        pctComplete = numDone * 100 // numFilesTotal
        progress = str(pctComplete) + '% complete: ' + \
                   'File ' + str(numDone + 1) + '/' + str(numFilesTotal) + \
                   ' (' + fileTime + '.' + courseDataExt + ')'
        if numDone == 0:
            # No file has finished yet, so there is no rate to extrapolate.
            eta = ''
        else:
            etaTime = (timePassed / numDone) * (numFilesTotal - numDone)
            eta = ' (ETA: ' + time.strftime('%H:%M:%S', time.gmtime(etaTime)) + ')'
        # Pad so the whole line is exactly as wide as the console; a line that
        # is already too wide simply gets no padding.
        padWidth = max(getConsoleWidth() - len(progress) - len(eta), 0)
        return progress + ' ' * padWidth + eta

    filesToAnalyze = fileUtils.getAllFiles(dataDir = courseDataDir,
                                           dataExt = courseDataExt,
                                           latestFirst = False)

    # NOTE(review): the pickle is read and written in text mode, which only
    # suits the default ASCII pickle protocol. Kept as-is so files written by
    # earlier runs stay readable - confirm before switching to 'rb'/'wb'.
    try:
        with open(openClosedRawFileLoc, 'r') as existingDataFile:
            allData = cPickle.load(existingDataFile)
    except IOError:
        # No earlier output to extend: start from scratch.
        allData = {}

    # Drop files that already have an entry. Filtering the listed paths
    # directly keeps the oldest-first order requested above and avoids
    # rebuilding each path from its file name.
    filesToAnalyze = [filePath for filePath in filesToAnalyze
                      if fileUtils.getFileNameFromPath(filePath) not in allData]

    numFilesTotal = len(filesToAnalyze)
    pluralText = 'file' if numFilesTotal == 1 else 'files'
    if printProgress:
        print('Processing open/closed data: ' + str(numFilesTotal) + ' ' +
              pluralText + ' to analyze.')

    if numFilesTotal == 0:
        return

    # Wall-clock time: time.clock() counts processor time on Unix, which
    # under-reports elapsed time while the loop waits on disk reads.
    startTime = time.time()
    for numFilesProcessed, fileToAnalyze in enumerate(filesToAnalyze):
        fileTime = fileUtils.getFileNameFromPath(fileToAnalyze)
        if printProgress:
            dynPrint(statusOut(fileTime, numFilesProcessed, time.time() - startTime))
        courseDataRaw = fileUtils.unpickle(fileToAnalyze)
        allData[fileTime] = getOpenClosedStats(courseDataRaw)

    with open(openClosedRawFileLoc, 'w') as dataOut:
        cPickle.dump(allData, dataOut)
# NOTE(review): this run of statements uses numFilesTotal, allData, subjList
# and numFilesProcessed without defining them, so it looks like the tail of a
# function (presumably processScrapedToSubjectSeats) whose `def` line is not
# in view. The source lost its line breaks; the nesting below is
# reconstructed from the statements themselves - confirm against the
# original layout before relying on it.
if numFilesTotal > 0:
    # Reload the open/closed data and pass it through sanityCheck();
    # presumably the result is the collection of keys considered usable.
    openClosedData = fileUtils.unpickle(openClosedRawFileLoc)
    saneOpenClosedFileKeys = sanityCheck(openClosedData)
    saneOpenClosedFileKeysSet = set(saneOpenClosedFileKeys)
    # Keep only the keys that also appear in fileUtils.getAllFiles().
    # NOTE(review): getAllFiles() is called with its defaults here - confirm
    # it yields values comparable to these keys.
    saneFileKeys = saneOpenClosedFileKeysSet.intersection(set(fileUtils.getAllFiles()))
    # Keys recorded under '_filesProcessed' are skipped.
    existingFileKeys = allData['_filesProcessed']
    unprocessedFileKeysSet = set(saneFileKeys).difference(existingFileKeys)
    print len(unprocessedFileKeysSet), 'unprocessed files'
    # Sorted so the files are handled in a stable, ascending key order.
    unprocessedFileKeys = sorted(list(unprocessedFileKeysSet))
    if len(unprocessedFileKeys) > 0:
        for fileTime in unprocessedFileKeys:
            fileLoc = courseDataDir + '/' + fileTime + '.' + courseDataExt
            print fileLoc
            try:
                subjDataRaw = fileUtils.unpickle(fileLoc)
            except IOError:
                continue # with the next file if the one being opened doesn't exist
            subjDataProc = getSubjSeatStats(subjList, subjDataRaw)
            allData[fileTime] = subjDataProc
            # Record the key so it is excluded from the unprocessed set above
            # on a later run.
            allData['_filesProcessed'].add(fileTime)
            numFilesProcessed += 1
        # Write the merged result, including '_filesProcessed', back to disk.
        with open(subjectSeatsFileLoc, 'w') as dataOut:
            cPickle.dump(allData, dataOut)
print 'Done.'

# Script entry point: run the three processing stages in order.
if __name__ == '__main__':
    processScrapedToOpenClosed(printProgress = True)
    dynPrint('Done. Adding diff data to open/closed data...\n')
    processOpenClosedToDiff()
    print 'Done. Processing raw data to section data...'
    processScrapedToSubjectSeats(printProgress = True)