def getInitialPSList(dumpFile):
    """Build the starting set of public suffixes found in a dump.

    Each line of ``dumpFile`` is decoded with ``processDump.getRecord`` and
    the URL's domain is extracted with ``processDump.parseURL``.  Domains
    that ``processDump.isIp`` flags are ignored.  The remaining domains are
    passed through ``suffixParser.get_public_suffix``; a result is kept only
    when it consists of more than one dot-separated label.

    Args:
        dumpFile: iterable yielding raw dump lines (each line must decode to
            a six-field record).

    Returns:
        A ``set`` of the multi-label values returned by
        ``suffixParser.get_public_suffix``.
    """
    min_labels = 1
    suffixes = set()

    for raw_line in dumpFile:
        # Only the URL is needed; the other fields are unpacked so that a
        # malformed record still fails loudly.
        (address, _referrer, _method, _status,
         _request_cookie, _response_cookie) = processDump.getRecord(raw_line)
        host, _mime, _query, _params = processDump.parseURL(address)

        # Raw IP hosts carry no suffix information.
        if processDump.isIp(host):
            continue

        public_suffix = suffixParser.get_public_suffix(host)
        label_count = len(public_suffix.split('.'))
        if label_count <= min_labels:
            continue
        suffixes.add(public_suffix)

    return suffixes
def getNextLevel(dumpFile, PSList):
    """Extend known suffixes by one label using the domains in a dump.

    For every non-IP domain in ``dumpFile``, each position that follows a
    dot is examined: when the remainder of the domain from that position is
    a member of ``PSList``, the label immediately before it is prepended
    and the resulting string is recorded as a candidate.

    Args:
        dumpFile: iterable yielding raw dump lines (each line must decode to
            a six-field record).
        PSList: container of suffix strings, tested with ``in``.

    Returns:
        A ``set`` of candidate strings of the form ``"<label>.<suffix>"``
        where ``<suffix>`` is in ``PSList``.
    """
    candidates = set()

    for raw_line in dumpFile:
        # Only the URL is needed; the other fields are unpacked so that a
        # malformed record still fails loudly.
        (address, _referrer, _method, _status,
         _request_cookie, _response_cookie) = processDump.getRecord(raw_line)
        host, _mime, _query, _params = processDump.parseURL(address)

        # Raw IP hosts carry no suffix information.
        if processDump.isIp(host):
            continue

        labels = host.split('.')
        # The last label has nothing after it, so stop one short.
        for start in range(len(labels) - 1):
            tail = '.'.join(labels[start + 1:])
            if tail in PSList:
                # Label at `start` followed by the matched suffix.
                candidates.add('.'.join(labels[start:]))

    return candidates