import httplib
import logging
import os
import re
import string
import threading
import time
import urllib
import urllib2
import Queue
from datetime import timedelta

from BeautifulSoup import BeautifulSoup


def google_scrape(query, interface):
    address = "http://www.google.com/search?hl=en&output=search&q=%s" % urllib.quote_plus(query)
    request = urllib2.Request(address, None, {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) '
                      'AppleWebKit/536.11 (KHTML, like Gecko) '
                      'Chrome/20.0.1132.57 Safari/536.11'})

    # Run tcpdump in a background thread so we can recover Google's IP
    # for this transaction.
    q = Queue.Queue()
    thread = threading.Thread(target=tcpdump, args=(5, q, interface))
    thread.start()

    # Time the HTTP round trip.
    elapsed = -1
    page = None
    try:
        start_time = time.time()
        page = urllib2.urlopen(request).read()
        end_time = time.time()
        print int(round(start_time * 1000)), int(round(end_time * 1000))
        elapsed = end_time - start_time
    except httplib.BadStatusLine:
        # page may never have been assigned here, so log the request instead.
        logging.debug("bad status line for request: %s", address)
    logging.debug("time elapsed for google query: %f", elapsed)

    thread.join(10)
    if thread.isAlive():
        logging.debug("tcpdump thread still running, waiting for it")
        thread.join()

    returncode, ip_src, tcpEntries = None, None, None
    try:
        returncode, ip_src, tcpEntries = q.get(timeout=1)
    except Queue.Empty:
        logging.debug("queue is empty for query: %s", query)

    # Optionally dump the raw page to file for debugging.
    if htmlFile:
        htmlFile.write(page)

    # Parse Google's self-reported query time, e.g. u'(0.11 seconds)'.
    soup = BeautifulSoup(page)
    googleTime = None
    try:
        pattern = r'\(([0-9]+\.[0-9]+)'
        webpage_time = re.search(pattern, soup.nobr.text)
        googleTime = timedelta(seconds=float(webpage_time.group(1)))
        logging.debug("%s %f %f", query, elapsed, float(webpage_time.group(1)))
    except Exception:
        logging.debug("exception caught when parsing, no google stats for query: %s", query)

    queryTime = timedelta(seconds=elapsed)
    if ip_src is not None:
        logging.debug("found ip source, now pinging")
        returncode, pingTime = ping(ip_src)
        pingTimeDelta = map(lambda t: timedelta(seconds=t), pingTime)
        return queryTime, googleTime, ip_src, pingTimeDelta, tcpEntries
    return queryTime, googleTime, ip_src, None, tcpEntries
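# google_scrape depends on two helpers, tcpdump(duration, q, interface)
# and ping(host), that are not part of this excerpt. Below is a minimal
# sketch of what they might look like, assuming tcpdump pushes a
# (returncode, ip_src, tcpEntries) tuple onto the queue and ping returns
# (returncode, list of RTTs in seconds); the capture heuristic and
# parsing details are illustrative, not the original implementation.
import subprocess


def tcpdump(duration, q, interface):
    # Capture TCP traffic on `interface` for `duration` seconds, then
    # hand back the raw lines plus a guess at the server's IP.
    proc = subprocess.Popen(['tcpdump', '-i', interface, '-n', 'tcp'],
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    time.sleep(duration)
    proc.terminate()
    out, _ = proc.communicate()
    tcpEntries = out.splitlines()
    # Heuristic (assumption): the server is the source of packets coming
    # from port 80, e.g. "... IP 74.125.x.x.80 > 10.0.0.2.54321: ...".
    m = re.search(r'IP (\d+\.\d+\.\d+\.\d+)\.80 >', out)
    ip_src = m.group(1) if m else None
    q.put((proc.returncode, ip_src, tcpEntries))


def ping(host, count=4):
    # Return (returncode, [rtt_in_seconds, ...]) from `ping -c count host`;
    # RTTs are converted from ping's milliseconds to seconds so they fit
    # timedelta(seconds=...) in google_scrape.
    proc = subprocess.Popen(['ping', '-c', str(count), host],
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, _ = proc.communicate()
    times = [float(t) / 1000.0 for t in re.findall(r'time=(\d+\.?\d*) ms', out)]
    return proc.returncode, times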
if extract_db:
    files = [f for f in os.listdir('./')
             if os.path.isfile('./' + f) and f.endswith('.db')]
    print files[0]
    extract(files[0])

if parse_ping:
    # Parse a ping log: pull the RTT out of each "time=... ms" line
    # (-1 where the line did not match) and drop ping's four-line
    # summary at the end. The RTTs are fractional, so parse as float.
    f = open('ping_log', 'r')
    meta = f.readline()  # skip the header line
    rtt = re.compile(r'time=(\d+\.\d+) ms')
    pings = [float(m.group(1)) if m else -1
             for m in map(rtt.search, f.readlines())][:-4]

if ping_urllist:
    f = open('../mobile/urllist.txt', 'r')
    traceroute_dsts = [line.split(' ')[0]
                       for line in map(string.strip, f.readlines())
                       if line != '']
    for dst in traceroute_dsts:
        # dst[7:] strips the leading "http://" so we ping the bare host.
        print dst[7:]
        print ping(dst[7:])

if file_download:
    file_size = ['1KB', '10KB', '100KB', '1MB']
    addresses = map(lambda x: "http://chuchufan.info/files/testfile_" + x, file_size)
    for address in addresses:
        # Time a full download of each test file.
        request = urllib2.Request(address, None)
        start_time = time.time()
        urllib2.urlopen(request).read()
        elapsed = time.time() - start_time
        print elapsed
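# Hypothetical driver (not in the original script) showing how the
# pieces above might be wired together; the flag variables such as
# extract_db and the htmlFile debug handle are assumed to be defined
# near the top of the full file, so the values below are placeholders.
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    htmlFile = None  # set to an open file handle to dump raw HTML
    queryTime, googleTime, ip_src, pingTimes, tcpEntries = \
        google_scrape('network measurement', 'eth0')
    print 'total query time:', queryTime
    print 'google-reported time:', googleTime
    print 'server ip:', ip_src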