コード例 #1
0
    def readFile(self, url):
        try:
            f = open(url, 'r')
        except:
            print('File not found')
            newPath = input('Enter new path > ');
            return self.readFile(newPath) #TODO: this doesn't work for entirely unknown reasons

        newdate = re.compile('\s*([0-9]{1,2}-[0-9]{1,2}-[0-9]{2})\s*')
        currentDateStr = None
        currentDateObj = None
        numWords = 0
        namesFound = set()
        totalWordNum = 0

        currentDayEntry = '' #holds all the lines for the current day, so we can compute a hash of the day later on
        
        line = f.readline()
        while (line != ''):
            if self.prefs.GUESS_NAMES:
                self.guessNames(line)
            #check a line to see if it's a date, therefore a new day
            dateFound = newdate.match(line)
            if dateFound != None: #it's a new date, so wrapup the previous date and set up to move onto the next one
                if namesFound != None:
                    self.addRelatedNames(namesFound)
                    namesFound = set()
                    self.dayEntryHashTable[currentDateObj] = hashlib.md5(currentDayEntry.encode()) #TODO: deal with first date

                if numWords > 0:
                    self.wordCountOfEntriesDict[currentDateObj] = numWords #should be here, since we want it triggered at the end
                totalWordNum += numWords
                numWords = 0
                currentDateStr = dateFound.group(0)
                currentDateStr = Helper.formatDateStringIntoCleanedString(currentDateStr)
                currentDateObj = Helper.makeDateObject(currentDateStr)

                if currentDateObj > self.mostRecentDate: #found a higher date than what we've seen so far
                    self.mostRecentDate = currentDateObj
                if currentDateObj < self.firstDate: #found a lower date than what we have now
                    self.firstDate = currentDateObj
                line = line[len(currentDateStr):] #remove date from line, so it's not a word

            if currentDateStr != None:
                (wordsFound, namesFoundThisLine) = self.addLine(line, currentDateObj)
                for name in namesFoundThisLine:
                    namesFound.add(name)
                numWords += wordsFound
            line = f.readline()
            currentDayEntry += line #add line to the day's entry

        #need to capture the last date for the entry length
        self.wordCountOfEntriesDict[currentDateObj] = numWords 
        self.totalNumberOfWords = totalWordNum + numWords #need to get words from last line
        f.close()