def http_error_301(self, req, fp, code, msg, headers):
    result = urllib2.HTTPRedirectHandler.http_error_301(
        self, req, fp, code, msg, headers)
    result.status = code
    logger.debug('Redirect URL (301): %s' % result.url)
    return result
def log_doc(self, doc):
    txt = 'Document saved:\n'
    for key, item in doc.items():
        if key not in ('summary', 'document'):
            txt += '  %-12s: %s\n' % (key, item)
    logger.debug(txt[:-1])
    logger.warn('Got *new* document: %(doc_type)s %(number)s %(date_st)s' % doc)
def process_digesto(self, doc_obj):
    '''
    Gets more information from the digesto system
    Extracts the document html text from the digesto system
    '''
    # Do we have a digesto entry? If not, return
    if not self.doc.data['digesto']:
        logger.debug(msg_doc('No digesto:', self.doc))
        return
    # Check for digesto text
    document_text = self.check_digesto(doc_obj)
    # If it does not exist, or we have a forced update, read the html
    if not document_text:
        logger.debug(msg_doc('New digesto:', self.doc))
        document_text = DocumentText()
    elif document_text and self.options['update_digesto']:
        logger.debug(msg_doc('Update digesto:', self.doc))
    else:
        logger.debug(msg_doc('Already have digesto:', self.doc))
        return
    # Get the digesto text
    text = self.get_digesto()
    if not text:
        logger.debug(msg_doc('No digesto text:', self.doc))
        return
    # Save the text
    self.save_digesto(document_text, doc_obj, text)
def save_doc(self):
    # Check for document duplication
    doc_obj = self.check_duplicate()
    if self.mode == UPDATE:
        if not self.options["update"]:
            logger.debug(msg_doc("IGNORING duplicated document:", self.doc))
            raise DREDuplicateError("Not going to process this doc.")
        else:
            logger.warn(msg_doc("UPDATE mode:", self.doc))
            logger.debug("doc_obj: %s" % doc_obj)
    else:
        logger.warn(msg_doc("NEW mode:", self.doc))
    # Save metadata
    if self.mode == NEW or (self.mode == UPDATE and self.options["update_metadata"]):
        logger.debug(msg_doc("Metadata:", self.doc))
        self.save_metadata(doc_obj)
        self.check_forgetme(doc_obj)
    # Save digesto
    if self.mode == NEW or (self.mode == UPDATE and self.options["update_digesto"]):
        self.process_digesto(doc_obj)
    # Update inforce
    if self.mode == NEW or (self.mode == UPDATE and self.options["update_inforce"]):
        logger.debug(msg_doc("Update inforce:", self.doc))
        self.update_inforce(doc_obj)
    # Save PDF
    if self.mode == NEW or (self.mode == UPDATE and self.options["save_pdf"]):
        logger.debug(msg_doc("Get PDF:", self.doc))
        self.save_pdf(doc_obj)
def get_digesto(self, doc):
    document = doc['document']
    doc_id = doc['digesto']
    # Checks if the document already has the digesto text
    try:
        document_text = DocumentText.objects.get(document=document)
    except ObjectDoesNotExist:
        logger.warn('Getting digesto text: %(doc_type)s %(number)s %(date_st)s' % doc)
    else:
        return
    # Gets the DIGESTO system integral text
    soup = read_soup(digesto_url % doc_id)
    # Parse the text
    # <li class="formatedTextoWithLinks">
    try:
        text = soup.find('li', {'class': 'formatedTextoWithLinks'}).renderContents()
        text = text.replace('<span>Texto</span>', '')
    except AttributeError:
        # No digesto text, abort
        logger.debug('No digesto text.')
        return
    # Save the text to the database
    document_text = DocumentText()
    document_text.document = document
    document_text.text_url = digesto_url % doc_id
    document_text.text = text
    document_text.save()
def check_duplicate(self, doc):
    # For dates before the site change we should try to verify
    # the document duplication by other means (since the 'claint' changed
    # on the new site)
    if doc['date'] < datetime.datetime(2014, 9, 19):
        # Does the current doc_type have synonyms?
        doc_types = [doc['doc_type'].lower()]
        for sn in synonyms:
            if doc['doc_type'].lower() in sn:
                doc_types = sn
        # Create a query for the synonyms:
        dt_qs = Q(doc_type__iexact=doc_types[0])
        for dt in doc_types[1:]:
            dt_qs = dt_qs | Q(doc_type__iexact=dt)
        dl = Document.objects.filter(date__exact=doc['date']
                ).filter(dt_qs
                ).filter(number__iexact=doc['number']
                ).filter(series__exact=doc['series'])
        if len(dl) > 1:
            # We have a number of documents that, for a given date, have
            # duplicates with the same number and type. The dates can be
            # listed with:
            #   select
            #       count(*), date, doc_type, number
            #   from
            #       dreapp_document
            #   where
            #       date < '2014-9-18'
            #   group by
            #       date, doc_type, number
            #   having
            #       count(*) > 1;
            logger.error('Duplicate document in the database: %(doc_type)s %(number)s %(date_st)s' % doc)
            raise DREScraperError('More than one doc with the same number and type.')
        if len(dl) == 1:
            doc['document'] = dl[0]
            raise DREDuplicateError('Duplicate document')
    # For other dates we simply use the db integrity checks to spot a
    # duplicate
    document = doc['document']
    try:
        sid = transaction.savepoint()
        document.save()
        transaction.savepoint_commit(sid)
        logger.debug('ID: %d http://dre.tretas.org/dre/%d/' % (document.id, document.id))
    except IntegrityError:
        # Duplicated document
        transaction.savepoint_rollback(sid)
        doc['document'] = Document.objects.get(claint=doc['id'])
        raise DREDuplicateError('Duplicate document')
def parse_pdf(doc):
    # Public tenders:
    if (doc.doc_type.lower() == u'Anúncio de Procedimento'.lower() or
            doc.doc_type.lower() == u'Aviso de prorrogação de prazo'.lower() or
            doc.doc_type.lower() == u'Declaração de retificação de anúncio'.lower() or
            doc.doc_type.lower() == u'Anúncio de concurso urgente'.lower()):
        logger.debug('CACHEPDF Tender text extract from pdf for doc id=%d' % doc.id)
        return ParseTenderPdf(doc).run()
    # Generic documents:
    logger.debug('CACHEPDF Generic text extract from pdf for doc id=%d' % doc.id)
    return ParseGenericPdf(doc).run()
def http_error_302(self, req, fp, code, msg, headers):
    result = urllib2.HTTPRedirectHandler.http_error_302(
        self, req, fp, code, msg, headers)
    result.status = code
    logger.debug('Redirect URL (302): %s' % result.url)
    ### DRE ugly hack: the dre.pt site, instead of issuing an http error code,
    ### redirects to an error page. Here we catch the error and raise an
    ### exception.
    if 'Paginas/Erro.aspx' in result.url:
        raise DREError('Error condition on the site')
    return result
def save_file(filename, url):
    k = 1
    while True:
        try:
            url, data_blob, cookies = fetch_url(url)
            break
        except urllib2.HTTPError:
            logger.error('Could not read PDF: %s DOC: %s' % (url, filename))
            k += 1
            if k == MAX_ATTEMPTS:
                raise DREError('Couldn\'t get the PDF: %s' % url)
            logger.debug('Sleeping 2 secs...')
            time.sleep(2)
    # The "with" block closes the file; no explicit close() needed
    with open(filename, 'wb') as f:
        f.write(data_blob)
def read_index(self):
    dr_id_number = self.data['dr_id_number']
    page = 1
    doc_list = []
    sufix = ''
    while True:
        logger.debug('JOURNAL: Read journal page')
        soup = read_soup(JOURNAL_URL % (dr_id_number, page, sufix))
        doc_page = self.get_document_list(soup)
        for doc in doc_page:
            try:
                yield DREReadDoc(doc, self)
            except DREParseError:
                pass
        if not doc_page:
            logger.debug('JOURNAL: Empty page')
            if not sufix and self.data['series'] == 2:
                page = 0
                sufix = '?at=c'
            else:
                break
        page += 1
def save_doc_list(self):
    if not self.doc_list:
        logger.debug('Couldn\'t get documents for %s' % self.date.isoformat())
    for doc in self.doc_list:
        logger.debug('*** Processing document: %(doc_type)s %(number)s %(date_st)s' % doc)
        try:
            self.save_doc(doc)
        except DREDuplicateError:
            logger.debug('We have this document: %(doc_type)s %(number)s %(date_st)s' % doc)
            # Duplicated document: even if the document is duplicated we
            # check for the "digesto" text since sometimes this is created
            # long after the original date of the document.
            if doc['digesto']:
                # Check the "digesto" integral text
                self.get_digesto(doc)
                # Check if the document is in force
                self.get_in_force_status(doc)
            # In the new dre.pt the doc's pdf url has changed. Because of
            # this, even in duplicated documents we update the pdf url.
            if doc['url']:
                self.update_pdf(doc)
            continue
        except DREScraperError:
            continue
        # Get the "digesto" integral text
        if doc['digesto']:
            self.get_digesto(doc)
        # Check if the document is in force
        if doc['digesto']:
            self.get_in_force_status(doc)
        # Get the pdf version
        if doc['url']:
            self.save_pdf(doc)
        self.create_cache(doc)
        self.log_doc(doc)
        time.sleep(1)
def fetch_url(url, data=None, cj=None):
    # Treat url: quote paths containing the ordinal character (u'\xba')
    url_object = list(urlparse.urlsplit(url))
    if u'\xba' in url_object[2]:
        url_object[2] = url_object[2].encode('utf-8')
        url_object[2] = urllib.quote(url_object[2])
        url = urlparse.urlunsplit(url_object)
    # Get the payload
    repeat = 1
    while repeat:
        try:
            logger.debug('Getting: %s' % url)
            request = urllib2.Request(url, data)
            request.add_header('Accept-Encoding', 'gzip; q=1.0, identity; q=0.5')
            request.add_header('User-agent', 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; chromeframe/11.0.696.57)')
            if not cj:
                cj = cookielib.LWPCookieJar()
            opener = urllib2.build_opener(SmartRedirectHandler(),
                                          urllib2.HTTPCookieProcessor(cj))
            resource = opener.open(request)
            is_gzip = resource.headers.get('Content-Encoding') == 'gzip'
            payload = resource.read()
            url = resource.url
            resource.close()
            if is_gzip:
                try:
                    compressedstream = StringIO.StringIO(payload)
                    gzipper = gzip.GzipFile(fileobj=compressedstream)
                    payload = gzipper.read()
                except IOError:
                    pass
            repeat = False
        except (socket.timeout, socket.error) as e:
            repeat += 1
            if repeat > MAXREPEAT:
                logger.critical('Socket timeout! Aborting')
                raise
            logger.debug('Socket timeout! Sleeping for 5 minutes')
            time.sleep(300)
        except (urllib2.URLError, urllib2.HTTPError) as e:
            msg = str(e)
            repeat += 1
            if repeat > MAXREPEAT:
                logger.critical('HTTP Error! Aborting. Error repeated %d times: %s' % (MAXREPEAT, msg))
                raise DREError('Error condition on the site')
            if 'Error 400' in str(msg) or 'Error 404' in str(msg):
                logger.critical('HTTP Error 40x - URL: %s' % url)
                raise
            if 'Error 503' in str(msg):
                logger.critical('HTTP Error 503 - cache problem going to try again in 10 seconds.')
                time.sleep(10)
                continue
            logger.warn('HTTP Error! Sleeping for 5 minutes: %s' % msg)
            time.sleep(300)
    t = random.randint(1, 5)
    logger.debug('Sleeping %ds' % t)
    time.sleep(t)
    return url, payload, cj
def save_doc(self):
    # Check for document duplication
    doc_obj = self.check_duplicate()
    if self.mode == UPDATE:
        if not self.options['update']:
            logger.debug(msg_doc('IGNORING duplicated document:', self.doc))
            raise DREDuplicateError('Not going to process this doc.')
        else:
            logger.warn(msg_doc('UPDATE mode:', self.doc))
            logger.debug('doc_obj: %s' % doc_obj)
    else:
        logger.warn(msg_doc('NEW mode:', self.doc))
    # Save metadata
    if self.mode == NEW or (self.mode == UPDATE and self.options['update_metadata']):
        logger.debug(msg_doc('Metadata:', self.doc))
        self.save_metadata(doc_obj)
        self.check_forgetme(doc_obj)
    # Save digesto
    if self.mode == NEW or (self.mode == UPDATE and self.options['update_digesto']):
        self.process_digesto(doc_obj)
    # Update cache
    if self.mode == NEW or (self.mode == UPDATE and self.options['update_cache']):
        logger.debug(msg_doc('Cache:', self.doc))
        self.update_cache(doc_obj)
    # Update inforce
    if self.mode == NEW or (self.mode == UPDATE and self.options['update_inforce']):
        logger.debug(msg_doc('Update inforce:', self.doc))
        self.update_inforce(doc_obj)
    # Save PDF
    if self.mode == NEW or (self.mode == UPDATE and self.options['save_pdf']):
        logger.debug(msg_doc('Get PDF:', self.doc))
        self.save_pdf(doc_obj)
def update_pdf(self, doc):
    if doc['url'] and doc['document'].dre_pdf != doc['url']:
        doc['document'].dre_pdf = doc['url']
        doc['document'].save()
        logger.debug('PDF\'s url updated: %(doc_type)s %(number)s %(date_st)s' % doc)
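# ---------------------------------------------------------------------------
# Note: the functions above are excerpts from several modules of the scraper,
# shown without their module headers. Below is a minimal sketch of the
# standard-library and Django imports they rely on (Python 2); the exact
# module layout is an assumption. Project-local names such as logger,
# read_soup, msg_doc, SmartRedirectHandler, DREError, DREDuplicateError,
# DREScraperError, DREParseError, DREReadDoc, Document, DocumentText,
# ParseTenderPdf, ParseGenericPdf, synonyms, digesto_url, JOURNAL_URL,
# NEW, UPDATE, MAX_ATTEMPTS and MAXREPEAT come from elsewhere in the project.
#
#   import datetime
#   import gzip
#   import random
#   import socket
#   import time
#   import urllib
#   import urllib2
#   import urlparse
#   import cookielib
#   import StringIO
#
#   from django.db import IntegrityError, transaction
#   from django.db.models import Q
#   from django.core.exceptions import ObjectDoesNotExist
# ---------------------------------------------------------------------------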