コード例 #1
0
def getPrefixesOfAlexaSuffixList(PSList, dumpFile):
    """Build a prefix counter from a dump of ``<digits>,<domain>`` lines.

    Lines that do not start with one or more digits followed by a comma are
    skipped, as are domains that ``processDump.isIp`` reports as IP
    addresses. Every remaining domain is handed to ``buildPrefixCounter``.

    Args:
        PSList: suffix collection, forwarded unchanged to
            ``buildPrefixCounter``.
        dumpFile: iterable of text lines (the function name suggests an
            Alexa ranking dump -- confirm against the caller).

    Returns:
        The dict passed to ``buildPrefixCounter`` for every accepted domain.
        Presumably that helper fills it in place; its return value is
        ignored here.
    """
    counts = {}
    for entry in dumpFile:
        parsed = re.match(r'\d+,(.*?)$', entry)
        if parsed is None:
            # Not a "<digits>,<domain>" record.
            continue
        host = parsed.group(1)
        if not processDump.isIp(host):
            buildPrefixCounter(host, PSList, counts)
    return counts
コード例 #2
0
def getPrefixesOfSuffixList(PSList, dumpFile):
    """Build a prefix counter from the URLs found in a request dump.

    Each line is parsed with ``processDump.getRecord`` into a six-field
    record; only the URL field is used. The URL's domain (via ``getDomain``)
    is passed to ``buildPrefixCounter`` unless ``processDump.isIp`` flags it
    as an IP address.

    Args:
        PSList: suffix collection, forwarded unchanged to
            ``buildPrefixCounter``.
        dumpFile: iterable of dump lines.

    Returns:
        The dict handed to ``buildPrefixCounter``; per the original comment
        it maps a suffix to the prefixes seen for that suffix.
    """
    counts = {}
    for raw in dumpFile:
        # Unpack all six fields so a malformed record still fails loudly.
        (url, _referrer, _method, _status,
         _request_cookie, _response_cookie) = processDump.getRecord(raw)

        host = getDomain(url)
        # IP addresses carry no suffix information.
        if not processDump.isIp(host):
            buildPrefixCounter(host, PSList, counts)
    return counts
コード例 #3
0
def getInitialPSList(dumpFile):
    """Collect multi-label public suffixes seen in a request dump.

    For every dump line the URL is parsed, IP hosts are skipped, and the
    host is reduced with ``suffixParser.get_public_suffix``. Results with
    more than one dot-separated label are kept.

    NOTE(review): ``suffixParser`` is not created in this function, so it
    must be supplied by the enclosing module -- confirm it is defined.

    Args:
        dumpFile: iterable of dump lines understood by
            ``processDump.getRecord``.

    Returns:
        A set of the retained ``get_public_suffix`` results.
    """
    label_threshold = 1
    suffixes = set()
    for raw in dumpFile:
        # Unpack all six fields so a malformed record still fails loudly.
        (url, _referrer, _method, _status,
         _request_cookie, _response_cookie) = processDump.getRecord(raw)
        host, _mime, _query, _params = processDump.parseURL(url)

        # IP addresses have no public suffix.
        if processDump.isIp(host):
            continue

        public_suffix = suffixParser.get_public_suffix(host)
        label_count = len(public_suffix.split('.'))
        if label_count > label_threshold:
            suffixes.add(public_suffix)
    return suffixes
コード例 #4
0
def getInitialPSListFromAlexa(dumpFile):
    """Collect multi-label public suffixes from ``<digits>,<domain>`` lines.

    Lines that do not start with digits followed by a comma are ignored, as
    are domains that ``processDump.isIp`` reports as IP addresses. Each
    remaining domain is reduced with
    ``publicsuffix.PublicSuffixList().get_public_suffix``; results with more
    than one dot-separated label are kept.

    Args:
        dumpFile: iterable of text lines (the function name suggests an
            Alexa ranking dump -- confirm against the caller).

    Returns:
        A set of the retained ``get_public_suffix`` results.
    """
    label_threshold = 1
    parser = publicsuffix.PublicSuffixList()
    suffixes = set()
    for entry in dumpFile:
        parsed = re.match(r'\d+,(.*?)$', entry)
        if parsed is None:
            # Not a "<digits>,<domain>" record.
            continue
        host = parsed.group(1)
        if processDump.isIp(host):
            continue

        public_suffix = parser.get_public_suffix(host)
        label_count = len(public_suffix.split('.'))
        if label_count > label_threshold:
            suffixes.add(public_suffix)
    return suffixes
コード例 #5
0
def getNextLevelAlexa(dumpFile, PSList):
    """Find one-label extensions of known suffixes in ``<digits>,<domain>`` lines.

    For every dot in a domain, the text after that dot is treated as a
    suffix. When the suffix is in ``PSList``, the single label immediately
    before the dot is joined to it and recorded as a candidate. For example,
    with ``PSList == {"com"}`` the domain ``a.b.com`` yields ``b.com``.

    Lines that do not start with digits followed by a comma are ignored, as
    are domains that ``processDump.isIp`` reports as IP addresses.

    Args:
        dumpFile: iterable of text lines.
        PSList: container of known suffixes, used for membership tests.

    Returns:
        A set of ``"<label>.<suffix>"`` candidate strings.
    """
    candidatePSList = set()
    for line in dumpFile:
        match = re.match(r'\d+,(.*?)$', line)
        if not match:
            continue
        urlDomain = match.group(1)
        if processDump.isIp(urlDomain):
            continue

        # Walk the dots left to right; previousDotIndex marks where the
        # label in front of the current dot begins.
        currentDotIndex = urlDomain.find('.')
        previousDotIndex = -1
        while currentDotIndex != -1:
            suffix = urlDomain[currentDotIndex + 1:]
            if suffix in PSList:
                prefix = urlDomain[previousDotIndex + 1:currentDotIndex]
                candidatePSList.add('.'.join([prefix, suffix]))
            previousDotIndex = currentDotIndex
            currentDotIndex = urlDomain.find('.', currentDotIndex + 1)
    # The original fell off the end and returned None.
    return candidatePSList
コード例 #6
0
def getNextLevel(dumpFile, PSList):
    """Find one-label extensions of known suffixes among dumped request URLs.

    Each dump line is parsed with ``processDump.getRecord`` and its URL with
    ``processDump.parseURL``. For every dot in the resulting domain, the text
    after that dot is treated as a suffix. When the suffix is in ``PSList``,
    the single label immediately before the dot is joined to it and recorded
    as a candidate. For example, with ``PSList == {"com"}`` the domain
    ``a.b.com`` yields ``b.com``.

    Domains that ``processDump.isIp`` reports as IP addresses are skipped.

    Args:
        dumpFile: iterable of dump lines.
        PSList: container of known suffixes, used for membership tests.

    Returns:
        A set of ``"<label>.<suffix>"`` candidate strings.
    """
    candidatePSList = set()
    for line in dumpFile:
        record = processDump.getRecord(line)
        url, referrer, method, status, \
            request_cookie, response_cookie = record

        urlDomain, urlMime, urlQuery, urlParams = \
            processDump.parseURL(url)
        # Don't process IP addresses.
        if processDump.isIp(urlDomain):
            continue

        # Walk the dots left to right; previousDotIndex marks where the
        # label in front of the current dot begins.
        currentDotIndex = urlDomain.find('.')
        previousDotIndex = -1
        while currentDotIndex != -1:
            suffix = urlDomain[currentDotIndex + 1:]
            if suffix in PSList:
                prefix = urlDomain[previousDotIndex + 1:currentDotIndex]
                candidatePSList.add('.'.join([prefix, suffix]))
            previousDotIndex = currentDotIndex
            currentDotIndex = urlDomain.find('.', currentDotIndex + 1)
    return candidatePSList