def _openClosedStatusLine(pctComplete, fileNumber, numFilesTotal, fileLabel, eta, width):
    """Build one progress line for processScrapedToOpenClosed.

    The line is '<pct>% complete: File <n>/<total> (<fileLabel>)' followed by
    the ETA text. Spaces are inserted between the two parts so the whole line
    is exactly `width` characters; if the text is already wider than `width`,
    no padding is added and the line is returned unpadded.
    """
    head = (str(pctComplete) + '% complete: ' +
            'File ' + str(fileNumber) + '/' + str(numFilesTotal) +
            ' (' + fileLabel + ')')
    padding = ' ' * max(0, width - len(head) - len(eta))
    return head + padding + eta


def processScrapedToOpenClosed(printProgress = False):
    """Compute open/closed stats for every course data file not yet processed.

    Previously computed results are loaded from openClosedRawFileLoc (when
    that file can be opened); only files whose name is not already a key in
    that data are unpickled and passed to getOpenClosedStats. The merged
    results, keyed by file name, are then pickled back to
    openClosedRawFileLoc, even if there was nothing new to process.

    printProgress -- when True, print a summary line first and then a status
                     line (percentage, file counter, ETA) per file via
                     dynPrint. When False nothing is printed and the console
                     size is not queried.
    """
    filesToAnalyze = fileUtils.getAllFiles(dataDir = courseDataDir,
                                           dataExt = courseDataExt,
                                           latestFirst = False)

    try:
        # NOTE(review): text mode is kept on purpose. cPickle.dump below is
        # called without a protocol argument, and existing data files were
        # written the same way; confirm before switching to 'rb'/'wb'.
        with open(openClosedRawFileLoc, 'r') as existingDataFile:
            existingData = cPickle.load(existingDataFile)
    except IOError:
        # No readable results file yet: start from scratch with every file.
        allData = {}
    else:
        alreadyProcessed = set(existingData)
        pendingNames = []
        queuedNames = set()
        # Filter in place of a set difference so the order returned by
        # getAllFiles(latestFirst = False) is kept instead of being scrambled.
        for filePath in filesToAnalyze:
            fileName = fileUtils.getFileNameFromPath(filePath)
            if fileName in alreadyProcessed or fileName in queuedNames:
                continue
            queuedNames.add(fileName)
            pendingNames.append(fileName)
        filesToAnalyze = [courseDataDir + '/' + fileName + '.' + courseDataExt
                          for fileName in pendingNames]
        allData = existingData

    numFilesTotal = len(filesToAnalyze)

    if printProgress:
        pluralText = 'file' if numFilesTotal == 1 else 'files'
        print(' '.join(['Processing open/closed data:', str(numFilesTotal),
                        pluralText, 'to analyze.']))

    # Wall-clock time: time.clock() is CPU time on Unix, which makes the ETA
    # meaningless for work dominated by reading files from disk.
    startTime = time.time()

    for numFilesProcessed, fileToAnalyze in enumerate(filesToAnalyze):
        fileTime = fileUtils.getFileNameFromPath(fileToAnalyze)

        if printProgress:
            pctComplete = numFilesProcessed * 100 // numFilesTotal

            if numFilesProcessed == 0:
                # Nothing finished yet, so there is no rate to extrapolate.
                eta = ''
            else:
                timePassed = time.time() - startTime
                numFilesLeft = numFilesTotal - numFilesProcessed
                etaTime = (timePassed / numFilesProcessed) * numFilesLeft
                etaTimePretty = time.strftime('%H:%M:%S', time.gmtime(etaTime))
                eta = ' (ETA: ' + etaTimePretty + ')'

            # Queried per file, as before, so a changed console width is
            # picked up on the next status line.
            try:
                consoleWidth = int(consoleSize()[0])
            except ValueError:
                consoleWidth = 80

            dynPrint(_openClosedStatusLine(pctComplete,
                                           numFilesProcessed + 1,
                                           numFilesTotal,
                                           fileTime + '.' + courseDataExt,
                                           eta,
                                           consoleWidth))

        courseDataRaw = fileUtils.unpickle(fileToAnalyze)
        allData[fileTime] = getOpenClosedStats(courseDataRaw)

    with open(openClosedRawFileLoc, 'w') as dataOut:
        cPickle.dump(allData, dataOut)
def processScrapedToSubjectSeats(printProgress = False):
    """Compute per-subject seat stats for course data files not yet processed.

    Existing results are loaded from subjectSeatsFileLoc (when that file can
    be opened). The set stored under the '_filesProcessed' key records which
    file keys have already been handled. Candidate keys are those returned by
    sanityCheck() on the open/closed data that also appear in
    fileUtils.getAllFiles(); each unprocessed one is unpickled from
    courseDataDir, passed to getSubjSeatStats together with the sorted subject
    list, and stored under its key. The results are always written back to
    subjectSeatsFileLoc at the end.

    printProgress -- accepted for interface parity with the other processing
                     functions.
                     NOTE(review): this flag is not consulted; the progress
                     output below is printed unconditionally, as it always
                     has been. Confirm whether it should be gated.
    """
    subjDict = fileUtils.unpickle(subjListFileLoc)
    subjList = sorted(subjDict.keys())

    filesToAnalyze = fileUtils.getAllFiles(dataDir = courseDataDir,
                                           dataExt = courseDataExt,
                                           latestFirst = False)

    try:
        with open(subjectSeatsFileLoc, 'r') as existingDataFile:
            allData = cPickle.load(existingDataFile)
    except IOError:
        allData = {}

    # Guarantees the bookkeeping set exists even if the loaded data lacks it,
    # instead of failing later with a KeyError.
    processedKeys = allData.setdefault('_filesProcessed', set())

    if len(filesToAnalyze) > 0:
        openClosedData = fileUtils.unpickle(openClosedRawFileLoc)
        saneFileKeys = set(sanityCheck(openClosedData)).intersection(
            set(fileUtils.getAllFiles()))
        unprocessedFileKeys = saneFileKeys.difference(processedKeys)

        print(' '.join([str(len(unprocessedFileKeys)), 'unprocessed files']))

        for fileTime in sorted(unprocessedFileKeys):
            fileLoc = courseDataDir + '/' + fileTime + '.' + courseDataExt
            print(fileLoc)

            try:
                subjDataRaw = fileUtils.unpickle(fileLoc)
            except IOError:
                # The file for this key does not exist (or cannot be read):
                # move on to the next one and leave the key unprocessed.
                continue

            allData[fileTime] = getSubjSeatStats(subjList, subjDataRaw)
            processedKeys.add(fileTime)

    with open(subjectSeatsFileLoc, 'w') as dataOut:
        cPickle.dump(allData, dataOut)

    print('Done.')
cfg = loadConfig() statsOutputDir = cfg['dataLoc']['statsDir'] statsExt = cfg['dataLoc']['statsFiles']['statsExt'] openClosedProcessedFileName = cfg['dataLoc']['statsFiles']['openClosedData']['processed'] + '.' + statsExt openClosedProcessedFileLoc = statsOutputDir + '/' + openClosedProcessedFileName jsonFullOutLoc = cfg['dataLoc']['json']['seatsAllSubjs'] jsonDividedOutLoc = cfg['dataLoc']['json']['seatsBySubj'] firstUsefulTime = cfg['stockChart']['firstUsefulTime'] dataColumn = cfg['stockChart']['column'] dataMult = cfg['stockChart']['dataMult'] subjectDataLoc = cfg['dataLoc']['subjList'] subjectSeatsFileLoc = statsOutputDir + '/' + 'subjectSeats.dat' if __name__ == '__main__': diffData = fileUtils.unpickle(openClosedProcessedFileLoc) subjSeatsData = fileUtils.unpickle(subjectSeatsFileLoc) del subjSeatsData['_filesProcessed'] # print subjSeatsData subjectDict = fileUtils.unpickle(subjectDataLoc) subjects = sorted(subjectDict.keys()) jsonFullData = [] jsonSubjData = [] def getUsefulSortedKeys(unorderedDict): sortedKeys = sorted(unorderedDict.keys()) def isUsefulData(key): key = int(key)