def pre_hash_alexa(): alexa_dict = {} alexa_stringio = StringIO() print "Downloading %s ..." % alexa_url alexa_stringio.write(urllib.urlopen(alexa_url).read()) alexa_zip = ZipFile(alexa_stringio) print "Hashing + caching Alexa top 1,000,000 domain hashes, this may take about a minute..." alexa_domains = map(lambda x: x.split(',', 1)[1].strip().split('/', 1)[0], alexa_zip.read(alexa_file).split('\n')[:-1]) for host in alexa_domains: hashed_hostname = hash_host(host) alexa_dict[hashed_hostname] = host cPickle.dump(alexa_dict, open('top-1m_hashed.pickle', 'wb'), protocol=cPickle.HIGHEST_PROTOCOL) # Take advantage of the fact that python will return this by reference, so you can use it immediately without # reading it back from disk. return alexa_dict
# NOTE(review): this fragment opens mid-`try` — the matching "try:" (and the
# creation of `cursor` and the temp-file path `temp`) precede this chunk; the
# first six statements below are the tail of that try body.
    result = cursor.fetchall()
    # Each fetched row's first column is a URL string from browser history.
    url_list = map(lambda x: x[0], result)
    # Keep only URLs that look well-formed: a scheme separator (':') and at
    # least "scheme://host" (two or more '/').
    url_list = filter(lambda x: x.count('/') > 1 and x.count(':') > 0, url_list)
    # "scheme://host:port/path" -> "host:port" (index 2 after splitting on '/'),
    # then strip any ":port"; finally drop empty hostnames.
    history_domains = map(lambda x: x.split('/')[2], url_list)
    history_domains = map(lambda x: x.split(':')[0], history_domains)
    history_domains = filter(lambda x: len(x) > 0, history_domains)
except Exception, ex:
    raise
finally:
    # Always remove the temporary copy of the history database, even on error.
    os.unlink(temp)
# Build a dictionary of hashed_host: hostname so we can easily lookup hosts based on their hash (from history)
for host in history_domains:
    hashed_hostname = hash_host(host)
    host_dict[hashed_hostname] = host
# Add hashes for Alexa top 1m sites to the dictionary
if not os.path.isfile(alexa_file_pickle):
    # Perhaps our first time running, generate them and load them.
    host_dict.update(pre_hash_alexa())
else:
    # We already have the Alexa hashes, yay! Load them (much quicker than generating them)
    host_dict.update(cPickle.load(open(alexa_file_pickle, 'rb')))
# Create the Chrome STS Object that will hold all the STS entries from disk, and ones we add/delete
csts = None
try:
    csts = ChromeSTS(autocommit=True)
# NOTE(review): the handler body for this except falls outside the visible
# chunk — left exactly as found.
except Exception, ex: