Example #1
    def parse(dburl, history):

        logger.debug('Received %d historical events to parse for url: %s' %
                     (len(history), dburl.final_url))
        for redirect in history:

            logger.debug('Saving %d redirect to %s' %
                         (redirect.status_code, redirect.url))
            with db.execution_context() as ctx:
                db_history = HistoryDB.create(url=dburl,
                                              code=redirect.status_code,
                                              history_url=redirect.url)

            for header in redirect.headers:

                logger.debug('Saving redirection headers %s for url: %s' %
                             (header, redirect.url))
                with db.execution_context() as ctx:
                    HistoryHeaderDB.create(url=dburl,
                                           history=db_history,
                                           name=header,
                                           value=unicode(
                                               redirect.headers[header],
                                               errors='ignore'))

        return
Example #2
def progress(url):

    with db.execution_context() as ctx:
        p = Url.select().count()
        Printer('Progress: [%.4f%% - %d/1000000] Last processed url: %s' %
                ((p * 100.0 / 1000000), p, (url[:75] + '..') if len(url) > 75 else url))

    return
Example #3
    def parse(dburl, headers):

        logger.debug('Received %d headers to parse for url: %s' % (len(headers), dburl.final_url))
        for header in headers:
            logger.debug('Saving header %s as %s' % (header, headers[header]))
            with db.execution_context() as ctx:
                HeaderDB.create(
                    url=dburl,
                    name=header,
                    value=unicode(headers[header], errors='ignore')
                )

        return
Example #4
def process_url(rank_domain):

    # as this will be run in a multiprocessing env, let's
    # attempt to catch any exceptions and print them
    # so that the pool exception later is more
    # useful
    try:

        ranking = rank_domain[0]
        site_url = 'http://' + rank_domain[1]

        # Do a db lookup to check if we have possibly already processed
        # this url. If we haven't, a request will be made to it
        try:

            Url.get(Url.rank == ranking)
            logger.warning('#%s: %s has already been processed. Skipping' %
                           (ranking, site_url))
            return

        except Url.DoesNotExist as e:

            # New url it seems, make a request!
            logger.info('Processing url #%d/1000000 which is: %s' %
                        (ranking, site_url))
            try:

                session = requests.Session()
                headers = {'User-Agent': user_agent}
                logger.debug(
                    'Headers and session setup for request to: %s. Making request'
                    % site_url)
                r = session.get(site_url, headers=headers, timeout=10)

            except Exception as e:

                logger.warning('Failed to retrieve %s with error %s' %
                               (site_url, e))
                with db.execution_context() as ctx:
                    Url.create(rank=ranking,
                               domain=rank_domain[1],
                               final_url=None,
                               response_code=0,
                               is_ok=False,
                               content=None)
                return
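
The comment in the snippet above notes that process_url runs in a multiprocessing environment. Below is a minimal driver sketch for that setup; it is not part of the original source, and the CSV file name, column layout and worker count are assumptions.

# Minimal driver sketch (assumption): feed (rank, domain) tuples to
# process_url through a multiprocessing pool.
import csv
from multiprocessing import Pool

def run(csv_path='top-1m.csv', workers=8):
    # hypothetical CSV with one "rank,domain" pair per line
    with open(csv_path) as f:
        rank_domains = [(int(rank), domain) for rank, domain in csv.reader(f)]

    pool = Pool(processes=workers)
    try:
        pool.map(process_url, rank_domains)
    finally:
        pool.close()
        pool.join()

if __name__ == '__main__':
    run()
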
Example #5
    def setup():
        '''
            Setup Database Tables

            The primary purpose of this method is to prepare the tables
            needed in the database.
        '''

        # create the tables if they do not exist
        with db.execution_context() as ctx:

            logger.debug('Connected to database: %s' % db.database)
            db.create_tables([
                Url, Header, Cookie, History, HistoryHeader, Comment,
                Certificate
            ], True)
            logger.debug('Tables synced')

        return
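
All of these snippets rely on a shared peewee database handle (`db`) and model classes that are not shown on this page. The following is a minimal sketch of what that setup might look like, assuming peewee 2.x (which provides `db.execution_context()`); the database file name and field choices are assumptions, with the field names taken from the `Url.create()` calls in the other examples.

# Sketch (assumption): peewee 2.x style database and the Url model the other
# snippets create rows against. Everything not shown in the examples is a guess.
from peewee import (SqliteDatabase, Model, IntegerField, CharField,
                    BooleanField, TextField)

db = SqliteDatabase('crawl.db')  # hypothetical database file

class BaseModel(Model):
    class Meta:
        database = db

class Url(BaseModel):
    rank = IntegerField(unique=True)
    domain = CharField()
    final_url = CharField(null=True)
    response_code = IntegerField()
    is_ok = BooleanField()
    content = TextField(null=True)
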
Example #6
    def parse(dburl, cookie_values, all_cookies):

        for cookie in all_cookies:

            try:

                logger.debug('Saving cookie %s as %s' %
                             (cookie, cookie_values[cookie]))
                with db.execution_context() as ctx:
                    CookieDB.create(url=dburl,
                                    name=cookie,
                                    value=unicode(cookie_values[cookie],
                                                  errors='ignore'))

            except KeyError as e:
                logger.warning('Failed to read cookie %s with error %s' %
                               (cookie, e))

            except requests.cookies.CookieConflictError as e:
                logger.warning('Duplicate cookie %s exists' % cookie)
Example #7
            except Exception as e:

                logger.warning('Failed to retrieve %s with error %s' %
                               (site_url, e))
                with db.execution_context() as ctx:
                    Url.create(rank=ranking,
                               domain=rank_domain[1],
                               final_url=None,
                               response_code=0,
                               is_ok=False,
                               content=None)
                return

        # The request was made and it most probably didn't have
        # any problems. Save it!
        with db.execution_context() as ctx:
            # print r.url, type(r.text)
            data_url = Url.create(
                rank=ranking,
                domain=rank_domain[1],
                final_url=r.url,
                response_code=r.status_code,
                is_ok=(r.status_code == requests.codes.ok),
                # Tired of fighting the unicode errors. Convert to ASCII for now.
                content=r.text.encode('ascii', 'ignore'))

        # Move on to the Parsers of the Requests response
        Header.parse(data_url, r.headers)
        Cookie.parse(data_url, session.cookies, session.cookies.get_dict())
        History.parse(data_url, r.history)
        Comment.parse(data_url, r.text)
Example #8
class Certificate():
    ''' Parse a Certificate from a URL '''
    @staticmethod
    def parse(dburl, url):

        logger.debug('Received url %s to attempt certificate parsing' % url)

        # Parse the URL with the aim of getting the final
        # domain out of it.
        o = urlparse(url)

        # ParseResult(scheme='http', netloc='www.google.co.za', path='', params='', query='', fragment='')
        if o.netloc == '':
            return

        # prepare the SSL context and socket for the TLS handshake
        context = SSL.Context(SSL.TLSv1_METHOD)  # Use TLS Method
        context.set_options(SSL.OP_NO_SSLv2)  # Don't accept SSLv2
        sock = socket()
        sock.settimeout(10)
        ssl_sock = SSL.Connection(context, sock)

        try:

            # for now, just try the most common tcp/443 port
            ssl_sock.connect((o.netloc, 443))

            # The timeout breaks this because of the non-blocking mode
            # and who knows what else. Let's try to implement
            # a patch:
            # https://bitbucket.org/sardarnl/apns-client/pull-request/1/issue-5-retry-on-wantreaderror/diff
            tries = 0
            while True:
                try:

                    ssl_sock.do_handshake()
                    break

                except SSL.WantReadError:

                    tries += 1
                    if tries >= 5:
                        raise

                    time.sleep(0.2)

            # get the peer certificate (an X509 object)
            cert = ssl_sock.get_peer_certificate()

        except Exception as e:

            logger.warning('Failed to connect to %s to get the certificate' %
                           url)
            return

        # Write the information from the certificate to the DB
        try:
            with db.execution_context() as ctx:
                save = CertificateDB.create(
                    url=dburl,
                    issuer_components=unicode(
                        json.dumps(dict(cert.get_issuer().get_components())),
                        errors='ignore'),
                    public_key_type=cert.get_pubkey().type(),
                    serial_number=cert.get_serial_number(),
                    signature_algorithm=cert.get_signature_algorithm(),
                    subject_common_name=cert.get_subject().commonName,
                    subject_components=unicode(
                        json.dumps(dict(cert.get_subject().get_components())),
                        errors='ignore'),
                    version=cert.get_version(),
                    expired=cert.has_expired(),
                    public_key_bits=cert.get_pubkey().bits())
                logger.debug(
                    'Saved certificate information with subject components: %s'
                    % save.subject_components)
        except Exception as e:
            logger.error(
                'Was not able to save the certificate information for %s. Error: %s'
                % (url, e))
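
As the comment in the handshake loop above notes, setting a socket timeout puts the connection into timeout (non-blocking style) mode, so pyOpenSSL can raise WantReadError before the handshake completes. The same retry idea as a small standalone helper, a sketch only; the attempt count and delay mirror the values used above.

# Sketch (assumption): the handshake-retry idea from above as a reusable helper.
import time
from OpenSSL import SSL

def handshake_with_retry(ssl_sock, attempts=5, delay=0.2):
    ''' Retry do_handshake() while pyOpenSSL raises WantReadError. '''
    for attempt in range(attempts):
        try:
            ssl_sock.do_handshake()
            return
        except SSL.WantReadError:
            if attempt == attempts - 1:
                raise
            time.sleep(delay)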