def getPrefixesOfAlexaSuffixList(PSList, dumpFile):
    """Build a prefix counter from a dump of ``<digits>,<domain>`` lines.

    Every line of ``dumpFile`` is matched against ``\\d+,(.*?)$``; lines that
    do not match are skipped.  Domains that ``processDump.isIp`` reports as IP
    addresses are skipped as well.  Each remaining domain is handed to
    ``buildPrefixCounter`` together with ``PSList`` and the counter dict.

    Args:
        PSList: collection of suffixes, forwarded untouched to
            ``buildPrefixCounter``.
        dumpFile: iterable of text lines.

    Returns:
        The dict passed to ``buildPrefixCounter`` (presumably populated in
        place by that helper -- its definition is not visible from here).
    """
    counts = {}
    ranked_line = re.compile(r'\d+,(.*?)$')
    for entry in dumpFile:
        found = ranked_line.match(entry)
        if found is None:
            # Not a "<digits>,<domain>" line.
            continue
        host = found.group(1)
        if not processDump.isIp(host):
            buildPrefixCounter(host, PSList, counts)
    return counts
def getPrefixesOfSuffixList(PSList, dumpFile):
    """Build a mapping of suffix => prefixes of that suffix from a dump.

    Each line of ``dumpFile`` is decoded with ``processDump.getRecord`` into a
    six-field record whose first field is the URL.  The URL's domain is
    obtained with ``getDomain``; domains that ``processDump.isIp`` flags as IP
    addresses are ignored.  All other domains are passed to
    ``buildPrefixCounter`` along with ``PSList`` and the counter dict.

    Args:
        PSList: collection of suffixes, forwarded untouched to
            ``buildPrefixCounter``.
        dumpFile: iterable of dump lines understood by
            ``processDump.getRecord``.

    Returns:
        The dict passed to ``buildPrefixCounter`` (presumably populated in
        place by that helper -- its definition is not visible from here).
    """
    counts = {}
    for entry in dumpFile:
        # Only the URL is needed; the full unpack is kept so a record with an
        # unexpected number of fields still raises.
        (url, _referrer, _method, _status,
         _request_cookie, _response_cookie) = processDump.getRecord(entry)
        host = getDomain(url)
        if processDump.isIp(host):
            # IP addresses have no domain suffix to analyse.
            continue
        buildPrefixCounter(host, PSList, counts)
    return counts
def getInitialPSList(dumpFile):
    """Collect the multi-label public suffixes seen in a request dump.

    Each line of ``dumpFile`` is decoded with ``processDump.getRecord`` into a
    six-field record; the URL (first field) is split by
    ``processDump.parseURL`` and its domain component is used.  Domains that
    ``processDump.isIp`` flags as IP addresses are skipped.  The remaining
    domains are reduced with ``suffixParser.get_public_suffix`` and the result
    is kept only when it consists of more than one dot-separated label.

    NOTE(review): ``suffixParser`` is not bound inside this function, so it is
    presumably a module-level object -- confirm it exists, since
    ``getInitialPSListFromAlexa`` creates its own instance locally.

    Args:
        dumpFile: iterable of dump lines understood by
            ``processDump.getRecord``.

    Returns:
        A set of the retained suffix strings.
    """
    min_labels = 1
    suffixes = set()
    for entry in dumpFile:
        (url, _referrer, _method, _status,
         _request_cookie, _response_cookie) = processDump.getRecord(entry)
        host, _mime, _query, _params = processDump.parseURL(url)
        if processDump.isIp(host):
            # IP addresses have no public suffix.
            continue
        public_suffix = suffixParser.get_public_suffix(host)
        label_count = len(public_suffix.split('.'))
        if label_count > min_labels:
            suffixes.add(public_suffix)
    return suffixes
def getInitialPSListFromAlexa(dumpFile):
    """Collect the multi-label public suffixes seen in a ``<digits>,<domain>`` dump.

    A ``publicsuffix.PublicSuffixList`` is created once per call.  Every line
    of ``dumpFile`` is matched against ``\\d+,(.*?)$``; non-matching lines and
    domains that ``processDump.isIp`` flags as IP addresses are skipped.  Each
    remaining domain is reduced with ``get_public_suffix`` and the result is
    kept only when it consists of more than one dot-separated label.

    Args:
        dumpFile: iterable of text lines.

    Returns:
        A set of the retained suffix strings.
    """
    min_labels = 1
    suffixes = set()
    suffix_list = publicsuffix.PublicSuffixList()
    ranked_line = re.compile(r'\d+,(.*?)$')
    for entry in dumpFile:
        found = ranked_line.match(entry)
        if found is None:
            # Not a "<digits>,<domain>" line.
            continue
        host = found.group(1)
        if processDump.isIp(host):
            continue
        public_suffix = suffix_list.get_public_suffix(host)
        label_count = len(public_suffix.split('.'))
        if label_count > min_labels:
            suffixes.add(public_suffix)
    return suffixes
def getNextLevelAlexa(dumpFile, PSList):
    """Find one-label extensions of known suffixes in a ``<digits>,<domain>`` dump.

    Every line of ``dumpFile`` is matched against ``\\d+,(.*?)$``; non-matching
    lines and domains that ``processDump.isIp`` flags as IP addresses are
    skipped.  For each remaining domain, every dot is examined: when the text
    after the dot is a member of ``PSList``, the single label immediately
    before that dot is joined to the suffix and recorded as a candidate.

    Args:
        dumpFile: iterable of text lines.
        PSList: container of suffix strings supporting ``in``.

    Returns:
        A set of candidate strings of the form ``<label>.<suffix>``.
    """
    candidatePSList = set()
    for line in dumpFile:
        match = re.match(r'\d+,(.*?)$', line)
        if not match:
            continue
        urlDomain = match.group(1)
        if processDump.isIp(urlDomain):
            continue

        # Walk the dots left to right; previousDotIndex marks the start of the
        # label that sits directly in front of the current dot.
        previousDotIndex = -1
        currentDotIndex = urlDomain.find('.')
        while currentDotIndex != -1:
            suffix = urlDomain[currentDotIndex + 1:]
            if suffix in PSList:
                prefix = urlDomain[previousDotIndex + 1:currentDotIndex]
                candidatePSList.add('.'.join([prefix, suffix]))
            previousDotIndex = currentDotIndex
            currentDotIndex = urlDomain.find('.', currentDotIndex + 1)

    # Previously the function ended without a return statement, so callers
    # always received None instead of the candidates collected above.
    return candidatePSList
def getNextLevel(dumpFile, PSList):
    """Find one-label extensions of known suffixes in a request dump.

    Each line of ``dumpFile`` is decoded with ``processDump.getRecord`` into a
    six-field record; the URL (first field) is split by
    ``processDump.parseURL`` and its domain component is used.  Domains that
    ``processDump.isIp`` flags as IP addresses are skipped.  For every
    remaining domain, each trailing run of labels that is a member of
    ``PSList`` is extended by the single label in front of it and recorded.

    Args:
        dumpFile: iterable of dump lines understood by
            ``processDump.getRecord``.
        PSList: container of suffix strings supporting ``in``.

    Returns:
        A set of candidate strings of the form ``<label>.<suffix>``.
    """
    found = set()
    for entry in dumpFile:
        (url, _referrer, _method, _status,
         _request_cookie, _response_cookie) = processDump.getRecord(entry)
        host, _mime, _query, _params = processDump.parseURL(url)
        if processDump.isIp(host):
            # IP addresses have no domain suffix to extend.
            continue
        labels = host.split('.')
        # Position i corresponds to the i-th dot: labels[i:] is the text after
        # it and labels[i - 1] is the label directly before it.
        for position in range(1, len(labels)):
            tail = '.'.join(labels[position:])
            if tail in PSList:
                found.add('.'.join([labels[position - 1], tail]))
    return found