Example 1
 def log_doc(self, doc):
     # Build a multi-line listing of the document's fields, skipping
     # the bulky 'summary' and 'document' entries
     txt = 'Document saved:\n'
     for key, item in doc.items():
         if key not in ('summary', 'document'):
             txt += '   %-12s: %s\n' % (key, item)
     # Strip the trailing newline before logging
     logger.debug(txt[:-1])
     logger.warn('Got *new* document: %(doc_type)s %(number)s %(date_st)s' % doc)
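
The last line formats the log message with the doc dictionary itself, so doc must carry doc_type, number and date_st keys. A minimal sketch of the expected input (the values below are hypothetical):

    doc = {
        'doc_type': 'Decreto-Lei',   # hypothetical sample values
        'number': '123/2013',
        'date_st': '2013-07-01',
        'summary': '...',            # filtered out of the debug listing
        'document': None,            # filtered out of the debug listing
    }
    # 'Got *new* document: %(doc_type)s %(number)s %(date_st)s' % doc
    # -> 'Got *new* document: Decreto-Lei 123/2013 2013-07-01'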
Example 2
 def save_doc(self):
     # Check for document duplication
     doc_obj = self.check_duplicate()
     if self.mode == UPDATE:
         if not self.options["update"]:
             logger.debug(msg_doc("IGNORING duplicated document:", self.doc))
             raise DREDuplicateError("Not going to process this doc.")
         else:
             logger.warn(msg_doc("UPDATE mode:", self.doc))
             logger.debug("doc_obj: %s" % doc_obj)
     else:
         logger.warn(msg_doc("NEW mode:", self.doc))
     # Save metadata
     if self.mode == NEW or (self.mode == UPDATE and self.options["update_metadata"]):
         logger.debug(msg_doc("Metadata:", self.doc))
         self.save_metadata(doc_obj)
         self.check_forgetme(doc_obj)
     # Save digesto
     if self.mode == NEW or (self.mode == UPDATE and self.options["update_digesto"]):
         self.process_digesto(doc_obj)
     # Update inforce
     if self.mode == NEW or (self.mode == UPDATE and self.options["update_inforce"]):
         logger.debug(msg_doc("Update inforce:", self.doc))
         self.update_inforce(doc_obj)
     # Save PDF
     if self.mode == NEW or (self.mode == UPDATE and self.options["save_pdf"]):
         logger.debug(msg_doc("Get PDF:", self.doc))
         self.save_pdf(doc_obj)
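
Every step above repeats the same gate: run always for NEW documents, and for UPDATE only when the matching option is enabled. A possible refactor, not part of the original code, hoists that predicate into a small helper:

    def _should_run(self, option):
        # Hypothetical helper: a step runs for new documents, or for
        # updates when its option is switched on
        return self.mode == NEW or (self.mode == UPDATE and self.options[option])

    # e.g. inside save_doc():
    #     if self._should_run("update_metadata"):
    #         ...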
Example 3
    def get_digesto(self, doc):
        document = doc['document']
        doc_id = doc['digesto']

        # Check whether the document already has the digesto text
        try:
            DocumentText.objects.get(document=document)
        except ObjectDoesNotExist:
            logger.warn('Getting digesto text: %(doc_type)s %(number)s %(date_st)s' % doc)
        else:
            # Text already stored, nothing to do
            return

        # Get the full text from the DIGESTO system
        soup = read_soup(digesto_url % doc_id)

        # Parse the text:
        # <li class="formatedTextoWithLinks">
        try:
            text = soup.find('li', {'class': 'formatedTextoWithLinks'}).renderContents()
            text = text.replace('<span>Texto</span>', '')
        except AttributeError:
            # No digesto text, abort
            logger.debug('No digesto text.')
            return

        # Save the text to the database
        document_text = DocumentText()
        document_text.document = document
        document_text.text_url = digesto_url % doc_id
        document_text.text = text
        document_text.save()
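
The try/except/else above is an inverted lookup: the else branch runs only when DocumentText.objects.get() succeeds, i.e. the text is already stored, so the method returns early. The same guard is often written with Django's exists(); an equivalent sketch, assuming the same models:

    if DocumentText.objects.filter(document=document).exists():
        return  # digesto text already stored
    logger.warn('Getting digesto text: %(doc_type)s %(number)s %(date_st)s' % doc)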
Example 4
 def save_doc(self):
     # Check for document duplication
     doc_obj = self.check_duplicate()
     if self.mode == UPDATE:
         if not self.options['update']:
             logger.debug(msg_doc('IGNORING duplicated document:',
                 self.doc))
             raise DREDuplicateError('Not going to process this doc.')
         else:
             logger.warn(msg_doc('UPDATE mode:', self.doc))
             logger.debug('doc_obj: %s' % doc_obj)
     else:
         logger.warn(msg_doc('NEW mode:', self.doc))
     # Save metadata
     if self.mode==NEW or (self.mode==UPDATE and
                           self.options['update_metadata']):
         logger.debug(msg_doc('Metadata:', self.doc))
         self.save_metadata(doc_obj)
         self.check_forgetme(doc_obj)
     # Save digesto
     if self.mode==NEW or (self.mode==UPDATE and
                           self.options['update_digesto']):
         self.process_digesto(doc_obj)
     # Update cache
     if self.mode==NEW or (self.mode==UPDATE and
                           self.options['update_cache']):
         logger.debug(msg_doc('Cache:', self.doc))
         self.update_cache(doc_obj)
     # Update inforce
     if self.mode==NEW or (self.mode==UPDATE and
                           self.options['update_inforce']):
         logger.debug(msg_doc('Update inforce:', self.doc))
         self.update_inforce(doc_obj)
     # Save PDF
     if self.mode==NEW or (self.mode==UPDATE and
                           self.options['save_pdf']):
         logger.debug(msg_doc('Get PDF:', self.doc))
         self.save_pdf(doc_obj)
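
save_doc() signals a skipped duplicate by raising DREDuplicateError, so callers are expected to treat it as a no-op rather than a failure. A hypothetical call site (saver stands in for whatever object provides save_doc):

    try:
        saver.save_doc()
    except DREDuplicateError:
        pass  # duplicate document and updates disabled: nothing to do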
Example 5
import cookielib
import gzip
import random
import socket
import StringIO
import time
import urllib
import urllib2
import urlparse

# logger, MAXREPEAT, SmartRedirectHandler and DREError are module-level
# names defined elsewhere in the project this example is taken from.

def fetch_url(url, data=None, cj=None):
    # Quote the URL's path component; it may carry the ordinal
    # character u'\xba', hence the explicit UTF-8 encode first
    url_object = list(urlparse.urlsplit(url))
    if u'\xba' in url_object[2]:
        url_object[2] = url_object[2].encode('utf-8')
    url_object[2] = urllib.quote(url_object[2])
    url = urlparse.urlunsplit(url_object)

    # Get the payload, retrying on transient errors
    repeat = 1
    while repeat:
        try:
            logger.debug('Getting: %s' % url)
            request = urllib2.Request(url, data)
            request.add_header('Accept-Encoding', 'gzip; q=1.0, identity; q=0.5')
            request.add_header('User-agent', 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; chromeframe/11.0.696.57)')
            if not cj:
                cj = cookielib.LWPCookieJar()
            opener = urllib2.build_opener(SmartRedirectHandler(),
                                          urllib2.HTTPCookieProcessor(cj))
            resource = opener.open(request)
            is_gzip = resource.headers.get('Content-Encoding') == 'gzip'
            payload = resource.read()
            url = resource.url  # final URL after any redirects
            resource.close()

            if is_gzip:
                try:
                    compressedstream = StringIO.StringIO(payload)
                    gzipper = gzip.GzipFile(fileobj=compressedstream)
                    payload = gzipper.read()
                except IOError:
                    # Payload was not actually gzipped; keep it as is
                    pass

            repeat = False
        except (socket.timeout, socket.error):
            repeat += 1
            if repeat > MAXREPEAT:
                logger.critical('Socket timeout! Aborting')
                raise
            logger.debug('Socket timeout! Sleeping for 5 minutes')
            time.sleep(300)
        except urllib2.URLError as e:
            # urllib2.HTTPError is a subclass of URLError, so this
            # handler catches both
            msg = str(e)
            repeat += 1
            if repeat > MAXREPEAT:
                logger.critical('HTTP Error! Aborting. Error repeated %d times: %s' % (MAXREPEAT, msg))
                raise DREError('Error condition on the site')
            if 'Error 400' in msg or 'Error 404' in msg:
                logger.critical('HTTP Error 40x - URL: %s' % url)
                raise
            if 'Error 503' in msg:
                logger.critical('HTTP Error 503 - cache problem, going to try again in 10 seconds.')
                time.sleep(10)
                continue

            logger.warn('HTTP Error! Sleeping for 5 minutes: %s' % msg)
            time.sleep(300)

    # Be gentle on the server: pause a random 1-5 seconds between requests
    t = random.randint(1, 5)
    logger.debug('Sleeping %ds' % t)
    time.sleep(t)
    return url, payload, cj
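
fetch_url() returns the final URL after any redirects, the raw payload bytes, and the cookie jar; passing the jar back in keeps a session across calls. A minimal usage sketch (the URLs are placeholders):

    url, payload, cj = fetch_url('http://example.com/')
    html = payload.decode('utf-8')  # payload is a raw byte string
    url, payload, cj = fetch_url('http://example.com/next', cj=cj)  # reuse the session cookies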