def parse(dburl, history):
    logger.debug('Received %d historical events to parse for url: %s' % (
        len(history), dburl.final_url))

    for redirect in history:
        logger.debug('Saving redirect with status %d to %s' % (
            redirect.status_code, redirect.url))
        with db.execution_context() as ctx:
            db_history = HistoryDB.create(
                url=dburl, code=redirect.status_code, history_url=redirect.url)

        for header in redirect.headers:
            logger.debug('Saving redirection header %s for url: %s' % (
                header, redirect.url))
            with db.execution_context() as ctx:
                HistoryHeaderDB.create(
                    url=dburl, history=db_history, name=header,
                    value=unicode(redirect.headers[header], errors='ignore'))

    return
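# For reference (an illustrative sketch, not part of the crawler): requests
# exposes redirects via response.history, a list of Response objects ordered
# oldest hop first, which is exactly what parse() above walks. The URL below
# is a placeholder:
#
#   import requests
#   r = requests.get('http://example.com', allow_redirects=True)
#   for hop in r.history:
#       print hop.status_code, hop.url, hop.headers.get('Location')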
def progress(url):
    with db.execution_context() as ctx:
        p = Url.select().count()

    Printer('Progress: [%.4f%% - %d/1000000] Last processed url: %s' %
            ((p * 100.0 / 1000000), p,
             (url[:75] + '..') if len(url) > 75 else url))
    return
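# 'Printer' is a console helper defined elsewhere in the project. A minimal
# sketch of what such a single-line status printer could look like (an
# assumption about its behaviour, not the project's actual implementation):
#
#   import sys
#
#   def Printer(text):
#       # Rewrite the current terminal line in place.
#       sys.stdout.write('\r' + text)
#       sys.stdout.flush()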
def parse(dburl, headers):
    logger.debug('Received %d headers to parse for url: %s' % (
        len(headers), dburl.final_url))

    for header in headers:
        logger.debug('Saving header %s as %s' % (header, headers[header]))
        with db.execution_context() as ctx:
            HeaderDB.create(
                url=dburl,
                name=header,
                value=unicode(headers[header], errors='ignore'))

    return
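# For context (illustrative only): r.headers on a requests Response is a
# case-insensitive dict, so parse() above iterates the header names and looks
# the values back up. A tiny example with placeholder data:
#
#   from requests.structures import CaseInsensitiveDict
#   headers = CaseInsensitiveDict({'Server': 'nginx', 'Content-Type': 'text/html'})
#   for name in headers:
#       print '%s -> %s' % (name, headers[name])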
def process_url(rank_domain):
    # As this will be run in a multiprocessing env, attempt to catch any
    # exceptions and print them so that the pool exception later is more
    # useful.
    try:
        ranking = rank_domain[0]
        site_url = 'http://' + rank_domain[1]

        # Do a db lookup to check if we have possibly already processed
        # this url. If we haven't, a request will be made to it.
        try:
            Url.get(Url.rank == ranking)
            logger.warning('#%s: %s has already been processed. Skipping' % (
                ranking, site_url))
            return
        except Url.DoesNotExist:
            # New url it seems, make a request!
            logger.info('Processing url #%d/1000000 which is: %s' % (
                ranking, site_url))

            try:
                session = requests.Session()
                headers = {'User-Agent': user_agent}
                logger.debug(
                    'Headers and session setup for request to: %s. Making request' % site_url)
                r = session.get(site_url, headers=headers, timeout=10)
            except Exception as e:
                logger.warning('Failed to retrieve %s with error %s' % (site_url, e))
                with db.execution_context() as ctx:
                    Url.create(rank=ranking, domain=rank_domain[1],
                               final_url=None, response_code=0,
                               is_ok=False, content=None)
                return
def setup():
    '''
        Setup Database Tables

        The primary purpose of this method is to prepare
        the tables needed in the database.
    '''

    # Create the tables if they do not exist.
    with db.execution_context() as ctx:
        logger.debug('Connected to database: %s' % db.database)
        db.create_tables([
            Url, Header, Cookie, History,
            HistoryHeader, Comment, Certificate
        ], safe=True)
        logger.debug('Tables synced')

    return
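# The models handed to create_tables() are peewee Model subclasses defined
# elsewhere in the project. As a rough sketch of the shape the Url model
# appears to have (field names inferred from the Url.create() calls in
# process_url(); the real definitions may differ):
#
#   from peewee import Model, IntegerField, CharField, BooleanField, TextField
#
#   class Url(Model):
#       rank = IntegerField()
#       domain = CharField()
#       final_url = TextField(null=True)
#       response_code = IntegerField()
#       is_ok = BooleanField()
#       content = TextField(null=True)
#
#       class Meta:
#           database = db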
def parse(dburl, cookie_values, all_cookies):
    for cookie in all_cookies:
        try:
            logger.debug('Saving cookie %s as %s' % (cookie, cookie_values[cookie]))
            with db.execution_context() as ctx:
                CookieDB.create(url=dburl, name=cookie,
                                value=unicode(cookie_values[cookie], errors='ignore'))
        except KeyError as e:
            logger.warning('Failed to read cookie %s with error %s' % (cookie, e))
        except requests.cookies.CookieConflictError:
            logger.warning('Duplicate cookie %s exists' % cookie)
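# For context (placeholder URL, illustrative only): session.cookies is the
# session's RequestsCookieJar, and session.cookies.get_dict() flattens it
# into a plain name -> value dict, which is what 'cookie_values' above is:
#
#   import requests
#   s = requests.Session()
#   s.get('http://example.com')
#   print s.cookies.get_dict()  # e.g. {'sessionid': 'abc123'}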
            # The request was made and it most probably didn't have
            # any problems. Save it!
            with db.execution_context() as ctx:
                data_url = Url.create(
                    rank=ranking,
                    domain=rank_domain[1],
                    final_url=r.url,
                    response_code=r.status_code,
                    is_ok=True if r.status_code == requests.codes.ok else False,
                    # Tired of fighting the unicode errors. Convert to ASCII for now.
                    content=r.text.encode('ascii', 'ignore'))

            # Move on to the parsers of the requests response.
            Header.parse(data_url, r.headers)
            Cookie.parse(data_url, session.cookies, session.cookies.get_dict())
            History.parse(data_url, r.history)
            Comment.parse(data_url, r.text)

    except Exception as e:
        # Catch anything unexpected and print it here so that the exception
        # the multiprocessing pool raises later is more useful.
        print 'Error processing %s: %s' % (rank_domain, e)
        raise
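# A sketch of how process_url() might be driven from a multiprocessing pool.
# The CSV file name, its 'rank,domain' row layout and the pool size are all
# assumptions here, not necessarily the project's actual driver:
#
#   import csv
#   from multiprocessing import Pool
#
#   if __name__ == '__main__':
#       setup()
#       with open('top-1m.csv') as f:
#           rank_domains = [(int(rank), domain) for rank, domain in csv.reader(f)]
#       Pool(processes=8).map(process_url, rank_domains)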
class Certificate():
    '''
        Parse a Certificate from a URL
    '''

    @staticmethod
    def parse(dburl, url):
        logger.debug('Received url %s to attempt certificate parsing' % url)

        # Parse the URL with the aim of getting the final domain out of it, e.g:
        # ParseResult(scheme='http', netloc='www.google.co.za', path='',
        #             params='', query='', fragment='')
        o = urlparse(url)
        if o.netloc == '':
            return

        # Prepare for some SSL talking.
        context = SSL.Context(SSL.TLSv1_METHOD)  # Use TLS Method
        context.set_options(SSL.OP_NO_SSLv2)     # Don't accept SSLv2

        sock = socket()
        sock.settimeout(10)
        ssl_sock = SSL.Connection(context, sock)

        try:
            # For now, just try the most common tcp/443 port.
            ssl_sock.connect((o.netloc, 443))

            # The socket timeout puts the connection in non-blocking mode,
            # which breaks do_handshake(), and who knows what else. Retry on
            # WantReadError, as per this patch:
            # https://bitbucket.org/sardarnl/apns-client/pull-request/1/issue-5-retry-on-wantreaderror/diff
            tries = 0
            while True:
                try:
                    ssl_sock.do_handshake()
                    break
                except SSL.WantReadError:
                    tries += 1
                    if tries >= 5:
                        raise
                    time.sleep(0.2)

            # Get the peer certificate (an OpenSSL.crypto.X509 object).
            cert = ssl_sock.get_peer_certificate()
        except Exception as e:
            logger.warning('Failed to connect to %s to get the certificate' % url)
            return

        # Write the information from the certificate to the DB.
        try:
            with db.execution_context() as ctx:
                save = CertificateDB.create(
                    url=dburl,
                    issuer_components=unicode(json.dumps(
                        dict(cert.get_issuer().get_components())), errors='ignore'),
                    public_key_type=cert.get_pubkey().type(),
                    serial_number=cert.get_serial_number(),
                    signature_algorithm=cert.get_signature_algorithm(),
                    subject_common_name=cert.get_subject().commonName,
                    subject_components=unicode(json.dumps(
                        dict(cert.get_subject().get_components())), errors='ignore'),
                    version=cert.get_version(),
                    expired=cert.has_expired(),
                    public_key_bits=cert.get_pubkey().bits())

            logger.debug(
                'Saved certificate information with subject components: %s' %
                save.subject_components)
        except Exception as e:
            logger.error(
                'Was not able to save the certificate information for %s. Error: %s' % (url, e))
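# Where Certificate.parse() gets called is not shown in this excerpt. One
# plausible wiring (an assumption) is alongside the other parsers at the end
# of process_url():
#
#   Certificate.parse(data_url, r.url)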