Example #1
import httplib
import logging
import os
import Queue
import re
import string
import threading
import time
import urllib
import urllib2
from datetime import timedelta

from bs4 import BeautifulSoup

# tcpdump(), ping(), extract(), the htmlFile handle and the mode flags used
# below are defined elsewhere in the original module; hedged sketches of the
# missing helpers follow their call sites.


def google_scrape(query, interface):
	address = "http://www.google.com/search?hl=en&output=search&q=%s" % urllib.quote_plus(query)
	request = urllib2.Request(address, None, {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11'})
	# capture the transaction with tcpdump on a background thread; the worker
	# reports (returncode, source ip, tcp entries) through a queue, which also
	# gives us Google's ip for this transaction

	q = Queue.Queue()
	thread = threading.Thread(target=tcpdump, args=(5, q, interface))
	thread.start()
	
	page = ''
	elapsed = -1
	try:
		# time the round trip of the search request
		start_time = time.time()
		page = urllib2.urlopen(request).read()
		end_time = time.time()
		print int(round(start_time * 1000)), int(round(end_time * 1000))
		elapsed = end_time - start_time
	except httplib.BadStatusLine:
		logging.debug("bad status line for request: %s", address)

	print len(page)
	logging.debug("time elapsed for google query: %f",  elapsed)

	# give the capture thread time to finish; a second join with no timeout
	# blocks until it exits
	thread.join(10)
	if thread.isAlive():
		logging.debug("waiting for tcpdump thread to finish")
		thread.join()
	returncode, ip_src, tcpEntries = None, None, None
	try:
		returncode, ip_src, tcpEntries = q.get(timeout=1)
	except Queue.Empty:
		logging.debug("queue is empty for query: %s", query)
		
	# write the page out to a file for debugging
	if htmlFile:
		htmlFile.write(page)

	# parse the page for Google's self-reported query time, which appears
	# inside a <nobr> tag as e.g. u'(0.11 seconds) '
	soup = BeautifulSoup(page)
	googleTime = None
	try:
		pattern = r'\(([0-9]+\.[0-9]+)'
		webpage_time = re.search(pattern, soup.nobr.text)
		googleTime = timedelta(seconds=float(webpage_time.group(1)))
		logging.debug("%s %f %f", query, elapsed, float(webpage_time.group(1)))
	except AttributeError:
		logging.debug("no google stats found on the page for query: %s", query)
		
	queryTime = timedelta(seconds=elapsed)
	
	if ip_src is not None:
		logging.debug("find ip source, now pinging")
		returncode, pingTime = ping(ip_src)
		pingTimeDelta = map(lambda t: timedelta(seconds=t), pingTime)
		return queryTime, googleTime, ip_src, pingTimeDelta, tcpEntries
	return queryTime, googleTime, ip_src, None, tcpEntries
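The example never defines tcpdump() or ping(). From the call sites alone — tcpdump(5, q, interface) puts a (returncode, ip_src, tcpEntries) tuple on the queue, and ping(host) returns (returncode, list of round-trip times in seconds) — a minimal sketch might look like the following; the subprocess commands, flags, and regexes are assumptions, not the original implementation:

import subprocess

def tcpdump(duration, q, interface):
	# assumed helper: capture port-80 traffic on `interface` for `duration`
	# seconds, then report (returncode, server ip, raw capture lines)
	proc = subprocess.Popen(
		['timeout', str(duration), 'tcpdump', '-i', interface, '-n', '-l', 'tcp port 80'],
		stdout=subprocess.PIPE, stderr=subprocess.PIPE)
	out, _ = proc.communicate()
	# the first packet coming *from* port 80 reveals the server's address
	m = re.search(r'IP (\d+\.\d+\.\d+\.\d+)\.80 >', out)
	ip_src = m.group(1) if m else None
	q.put((proc.returncode, ip_src, out.splitlines()))

def ping(host, count=4):
	# assumed helper: return (returncode, list of RTTs in seconds)
	proc = subprocess.Popen(['ping', '-c', str(count), host],
	                        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
	out, _ = proc.communicate()
	times = [float(t) / 1000.0 for t in re.findall(r'time=(\d+\.?\d*) ms', out)]
	return proc.returncode, times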
if extract_db:
  # grab the first .db file in the working directory and extract it
  files = [f for f in os.listdir('./') if os.path.isfile('./' + f) and f.endswith('.db')]
  print files[0]
  extract(files[0])
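extract() is also defined elsewhere. Assuming the .db files are SQLite databases (an assumption, not stated in the example), a hypothetical placeholder that just dumps every table could be:

import sqlite3

def extract(db_path):
	# hypothetical sketch: print every row of every table in the SQLite file
	conn = sqlite3.connect(db_path)
	cur = conn.cursor()
	for (name,) in cur.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall():
		print name
		for row in conn.execute("SELECT * FROM %s" % name):
			print row
	conn.close()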

if parse_ping:
  # parse a ping log: the first line is metadata, each reply line carries an
  # RTT, and the last 4 lines are the summary block, which is dropped
  f = open('ping_log', 'r')
  meta = f.readline()

  def rtt(line):
    # pull the round-trip time out of a reply line; -1 for lost packets
    m = re.search(r'time=(\d+\.\d+) ms', line)
    return float(m.group(1)) if m else -1

  pings = map(rtt, f.readlines())[:-4]
  f.close()
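For reference, a standard ping reply line looks like '64 bytes from 8.8.8.8: icmp_seq=1 ttl=53 time=23.4 ms' (illustrative, not from the original log); rtt() maps it to 23.4, and any line without a time= field maps to -1.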


if ping_urllist:
  # take the first field of every non-empty line in the url list and ping it
  f = open('../mobile/urllist.txt', 'r')
  traceroute_dsts = [line.split(' ')[0] for line in filter(lambda x: x != '', map(string.strip, f.readlines()))]
  f.close()

  for dst in traceroute_dsts:
    # strip the 7-character 'http://' prefix to get the bare hostname
    print dst[7:]
    print ping(dst[7:])
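The dst[7:] slice assumes every entry begins with the literal 7-character prefix 'http://'; an entry such as 'http://example.com annotation' (hypothetical — the real urllist.txt is not shown) would be pinged as 'example.com'.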

if file_download:
  # download test files of increasing size and print the elapsed wall time
  file_sizes = ['1KB', '10KB', '100KB', '1MB']
  addresses = map(lambda x: "http://chuchufan.info/files/testfile_" + x, file_sizes)
  for address in addresses:
    request = urllib2.Request(address, None)
    start_time = time.time()
    urllib2.urlopen(request).read()
    elapsed = time.time() - start_time
    print elapsed
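These one-shot timings fold DNS lookup, connection setup, and transfer into a single number and can be noisy. A hypothetical variant that averages over several runs (the repeat count is arbitrary):

def timed_download(address, runs=3):
	# average the wall-clock download time over `runs` attempts
	total = 0.0
	for _ in range(runs):
		start = time.time()
		urllib2.urlopen(urllib2.Request(address, None)).read()
		total += time.time() - start
	return total / runs

print timed_download("http://chuchufan.info/files/testfile_1KB")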