Esempio n. 1
0
def pre_hash_alexa():
    """Download the Alexa list, hash every listed domain and cache the result.

    Fetches the zip archive at ``alexa_url``, reads its ``alexa_file`` member
    and, for each line, takes the text after the first comma (with anything
    from the first ``/`` onwards removed) as the hostname.  Each hostname is
    passed through ``hash_host`` and stored in a dictionary keyed by that
    hash.  The dictionary is then pickled to ``top-1m_hashed.pickle`` in the
    current working directory.

    Errors from the download, the zip archive or the pickle write are not
    caught here and propagate to the caller.

    Returns:
        dict: mapping of ``hash_host(hostname)`` to ``hostname``.
    """
    # A parenthesised single-argument print behaves identically under the
    # Python 2 print statement used by the rest of this file.
    print("Downloading %s ..." % alexa_url)
    response = urllib.urlopen(alexa_url)
    try:
        alexa_stringio = StringIO()
        alexa_stringio.write(response.read())
    finally:
        # Release the connection even if the read fails part-way through.
        response.close()

    alexa_zip = ZipFile(alexa_stringio)
    print("Hashing + caching Alexa top 1,000,000 domain hashes, this may take about a minute...")
    try:
        alexa_listing = alexa_zip.read(alexa_file)
    finally:
        alexa_zip.close()

    alexa_dict = {}
    # splitlines() keeps the final entry even when the listing has no trailing
    # newline (slicing off the last element of split('\n') silently lost it).
    for line in alexa_listing.splitlines():
        fields = line.split(',', 1)
        if len(fields) < 2:
            # Blank or malformed line: skip it rather than abort the whole run.
            continue
        host = fields[1].strip().split('/', 1)[0]
        alexa_dict[hash_host(host)] = host

    # NOTE(review): the cache is looked up elsewhere via alexa_file_pickle;
    # confirm that name refers to this same path.
    with open('top-1m_hashed.pickle', 'wb') as pickle_file:
        cPickle.dump(alexa_dict, pickle_file, protocol=cPickle.HIGHEST_PROTOCOL)

    # The caller receives the very same dict object, so it can be used
    # immediately without reading the pickle back from disk.
    return alexa_dict
Esempio n. 2
0
        result = cursor.fetchall()
        url_list = map(lambda x: x[0], result)
        url_list = filter(lambda x: x.count('/') > 1 and x.count(':') > 0, url_list)
        history_domains = map(lambda x: x.split('/')[2], url_list)
        history_domains = map(lambda x: x.split(':')[0], history_domains)
        history_domains = filter(lambda x: len(x) > 0, history_domains)

    except Exception, ex:
        raise

    finally:
        os.unlink(temp)

    # Build a dictionary of hashed_host: hostname so we can easily lookup hosts based on their hash (from history)
    for host in history_domains:
        hashed_hostname = hash_host(host)
        host_dict[hashed_hostname] = host

    # Add hashes for Alexa top 1m sites to the dictionary
    if not os.path.isfile(alexa_file_pickle):
        # Perhaps our first time running, generate them and load them.
        host_dict.update(pre_hash_alexa())
    else:
        # We already have the Alexa hashes, yay! Load them (much quicker than generating them)
        host_dict.update(cPickle.load(open(alexa_file_pickle, 'rb')))

    # Create the Chrome STS Object that will hold all the STS entries from disk, and ones we add/delete
    csts = None
    try:
        csts = ChromeSTS(autocommit=True)
    except Exception, ex: