def log_doc(self, doc):
    # Log the saved document's metadata, one "key: value" pair per line,
    # skipping the bulky 'summary' and 'document' entries.
    txt = 'Document saved:\n'
    for key, item in doc.items():
        if key not in ('summary', 'document'):
            txt += ' %-12s: %s\n' % (key, item)
    logger.debug(txt[:-1])  # drop the trailing newline
    logger.warn('Got *new* document: %(doc_type)s %(number)s %(date_st)s' % doc)
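# Minimal usage sketch (not from the original source). The 'doc' dictionary
# is assumed to carry at least the keys consumed by the format string above
# ('doc_type', 'number', 'date_st'), plus the 'summary' and 'document'
# entries that log_doc deliberately skips. All values are hypothetical.
#
#     doc = {
#         'doc_type': 'Portaria',
#         'number': '123/2013',
#         'date_st': '2013-05-17',
#         'summary': '...',          # skipped in the debug listing
#         'document': document_obj,  # skipped in the debug listing
#     }
#     self.log_doc(doc)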
def get_digesto(self, doc):
    document = doc['document']
    doc_id = doc['digesto']
    # Check if the document already has the digesto text
    try:
        document_text = DocumentText.objects.get(document=document)
    except ObjectDoesNotExist:
        logger.warn('Getting digesto text: %(doc_type)s %(number)s %(date_st)s' % doc)
    else:
        return
    # Get the DIGESTO system integral text
    soup = read_soup(digesto_url % doc_id)
    # Parse the text:
    # <li class="formatedTextoWithLinks">
    try:
        text = soup.find('li', {'class': 'formatedTextoWithLinks'}).renderContents()
        text = text.replace('<span>Texto</span>', '')
    except AttributeError:
        # No digesto text, abort
        logger.debug('No digesto text.')
        return
    # Save the text to the database
    document_text = DocumentText()
    document_text.document = document
    document_text.text_url = digesto_url % doc_id
    document_text.text = text
    document_text.save()
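# Standalone sketch of the parsing step above, assuming the BeautifulSoup 3
# API that renderContents() suggests. The HTML fragment is made up; the real
# page comes from read_soup(digesto_url % doc_id).
#
#     from BeautifulSoup import BeautifulSoup
#     html = ('<li class="formatedTextoWithLinks">'
#             '<span>Texto</span><p>Artigo 1.</p></li>')
#     soup = BeautifulSoup(html)
#     text = soup.find('li', {'class': 'formatedTextoWithLinks'}).renderContents()
#     text = text.replace('<span>Texto</span>', '')
#     # text == '<p>Artigo 1.</p>'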
def save_doc(self):
    # Check for document duplication
    doc_obj = self.check_duplicate()
    if self.mode == UPDATE:
        if not self.options['update']:
            logger.debug(msg_doc('IGNORING duplicated document:', self.doc))
            raise DREDuplicateError('Not going to process this doc.')
        else:
            logger.warn(msg_doc('UPDATE mode:', self.doc))
            logger.debug('doc_obj: %s' % doc_obj)
    else:
        logger.warn(msg_doc('NEW mode:', self.doc))
    # Save metadata
    if self.mode == NEW or (self.mode == UPDATE and self.options['update_metadata']):
        logger.debug(msg_doc('Metadata:', self.doc))
        self.save_metadata(doc_obj)
        self.check_forgetme(doc_obj)
    # Save digesto
    if self.mode == NEW or (self.mode == UPDATE and self.options['update_digesto']):
        self.process_digesto(doc_obj)
    # Update cache
    if self.mode == NEW or (self.mode == UPDATE and self.options['update_cache']):
        logger.debug(msg_doc('Cache:', self.doc))
        self.update_cache(doc_obj)
    # Update inforce
    if self.mode == NEW or (self.mode == UPDATE and self.options['update_inforce']):
        logger.debug(msg_doc('Update inforce:', self.doc))
        self.update_inforce(doc_obj)
    # Save PDF
    if self.mode == NEW or (self.mode == UPDATE and self.options['save_pdf']):
        logger.debug(msg_doc('Get PDF:', self.doc))
        self.save_pdf(doc_obj)
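# Hedged sketch of the 'options' flags consulted above. The key names are
# taken from the code; the values shown are hypothetical defaults. In NEW
# mode every step runs unconditionally; in UPDATE mode each step runs only
# if its flag is set, and nothing runs at all unless 'update' is true.
#
#     self.options = {
#         'update': True,           # process duplicated documents at all
#         'update_metadata': True,
#         'update_digesto': False,
#         'update_cache': True,
#         'update_inforce': True,
#         'save_pdf': False,
#     }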
# Standard-library imports used by fetch_url (Python 2). The names logger,
# SmartRedirectHandler, MAXREPEAT and DREError are defined elsewhere in
# this module.
import cookielib
import gzip
import random
import socket
import StringIO
import time
import urllib
import urllib2
import urlparse

def fetch_url(url, data=None, cj=None):
    # Normalize the URL: the path may contain the masculine ordinal
    # indicator (u'\xba'), which must be UTF-8 encoded before quoting.
    url_object = list(urlparse.urlsplit(url))
    if u'\xba' in url_object[2]:
        url_object[2] = url_object[2].encode('utf-8')
    url_object[2] = urllib.quote(url_object[2])
    url = urlparse.urlunsplit(url_object)
    # Get the payload, retrying on transient network errors
    repeat = 1
    while repeat:
        try:
            logger.debug('Getting: %s' % url)
            request = urllib2.Request(url, data)
            request.add_header('Accept-Encoding', 'gzip; q=1.0, identity; q=0.5')
            request.add_header('User-agent',
                'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; '
                'Trident/5.0; chromeframe/11.0.696.57)')
            if not cj:
                cj = cookielib.LWPCookieJar()
            opener = urllib2.build_opener(SmartRedirectHandler(),
                                          urllib2.HTTPCookieProcessor(cj))
            resource = opener.open(request)
            is_gzip = resource.headers.get('Content-Encoding') == 'gzip'
            payload = resource.read()
            url = resource.url
            resource.close()
            if is_gzip:
                try:
                    compressedstream = StringIO.StringIO(payload)
                    gzipper = gzip.GzipFile(fileobj=compressedstream)
                    payload = gzipper.read()
                except IOError:
                    pass
            repeat = False
        except (socket.timeout, socket.error):
            repeat += 1
            if repeat > MAXREPEAT:
                logger.critical('Socket timeout! Aborting')
                raise
            logger.debug('Socket timeout! Sleeping for 5 minutes')
            time.sleep(300)
        except (urllib2.URLError, urllib2.HTTPError) as e:
            msg = str(e)
            repeat += 1
            if repeat > MAXREPEAT:
                logger.critical('HTTP Error! Aborting. Error repeated %d times: %s' %
                                (MAXREPEAT, msg))
                raise DREError('Error condition on the site')
            if 'Error 400' in msg or 'Error 404' in msg:
                logger.critical('HTTP Error 40x - URL: %s' % url)
                raise
            if 'Error 503' in msg:
                logger.critical('HTTP Error 503 - cache problem, going to try again in 10 seconds.')
                time.sleep(10)
                continue
            logger.warn('HTTP Error! Sleeping for 5 minutes: %s' % msg)
            time.sleep(300)
        # Be polite to the server: short random pause between requests
        t = random.randint(1, 5)
        logger.debug('Sleeping %ds' % t)
        time.sleep(t)
    return url, payload, cj
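# Usage sketch for fetch_url (not part of the original module). It returns
# the final URL after redirects, the decompressed payload, and the cookie
# jar; passing the jar back in lets later requests share the same session
# cookies. The URL below is only illustrative.
#
#     final_url, payload, cj = fetch_url('http://dre.pt/example')
#     final_url, payload, cj = fetch_url('http://dre.pt/other', cj=cj)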