def dont_run__parse_government_bill_pdf(self):
     """Compare GovProposalParser output for the sample bill with a pickled reference.

     Parses ``GOV_BILL_TEST_FILE`` and asserts that the UTF-8 encoded result
     of ``to_unicode(True)`` equals the object unpickled from
     ``GOV_BILL_CORRECT_OUTPUT``.  Both fixture paths must exist.
     """
     # NOTE(review): an earlier comment said this test should be skipped when
     # poppler is missing; no such check exists here -- only the fixture
     # files are verified.
     self.assertTrue(os.path.exists(GOV_BILL_TEST_FILE), 'missing %s (cwd = %s)' % (GOV_BILL_TEST_FILE, os.getcwd()))
     self.assertTrue(os.path.exists(GOV_BILL_CORRECT_OUTPUT), 'missing %s (cwd = %s)' % (GOV_BILL_CORRECT_OUTPUT, os.getcwd()))
     prop = GovProposalParser(GOV_BILL_TEST_FILE)
     # Use a context manager so the fixture handle is closed even when
     # unpickling fails.  The open mode is kept as 'r' to stay consistent
     # with however the reference file was written.
     with open(GOV_BILL_CORRECT_OUTPUT, 'r') as expected_file:
         expected_result = cPickle.load(expected_file)
     self.assertEqual(prop.to_unicode(True).encode('utf-8'), expected_result)
 def dont_run__parse_government_bill_pdf(self):
     """Check the parsed sample government bill against the stored result.

     ``GOV_BILL_TEST_FILE`` is fed to ``GovProposalParser``; the UTF-8
     encoding of ``to_unicode(True)`` must equal the object unpickled from
     ``GOV_BILL_CORRECT_OUTPUT``.  The test fails early when either fixture
     file is missing.
     """
     # NOTE(review): the previous comment claimed the test is ignored when
     # poppler is unavailable, but nothing here checks for poppler.
     for fixture_path in (GOV_BILL_TEST_FILE, GOV_BILL_CORRECT_OUTPUT):
         self.assertTrue(
             os.path.exists(fixture_path),
             'missing %s (cwd = %s)' % (fixture_path, os.getcwd()))
     prop = GovProposalParser(GOV_BILL_TEST_FILE)
     # The handle used to be left open; ``with`` closes it even if
     # unpickling raises.  Mode 'r' is unchanged from the original.
     with open(GOV_BILL_CORRECT_OUTPUT, 'r') as pickled_output:
         expected_result = cPickle.load(pickled_output)
     self.assertEqual(
         prop.to_unicode(True).encode('utf-8'), expected_result)
Example #3
0
 def parse_pdf(self, pdf_url):
     """Parse the government proposal PDF found at ``pdf_url``.

     A stored copy is reused when a ``Link`` with this url has a
     ``LinkedFile`` with a non-empty file name whose file still exists on
     disk.  Otherwise the PDF is downloaded and saved into a new
     ``LinkedFile``.

     Returns a one-element list containing a dict with the keys
     ``'title'``, ``'date'`` and ``'link_file'``, or ``None`` when
     ``GovProposalParser`` raises (the exception is logged).
     """
     matching_links = Link.objects.filter(url=pdf_url)
     existing_count = matching_links.count()
     if existing_count > 1:
         logger.warning(
             "found two objects with the url %s. Taking the first",
             pdf_url)

     filename = None
     if existing_count > 0:
         link = matching_links.first()
         files = [
             f for f in link.linkedfile_set.order_by('last_updated')
             if f.link_file.name != ''
         ]
         if files:
             link_file = files[0]
             filename = link_file.link_file.path
             logger.debug('trying reusing %s from %s', pdf_url, filename)
             if not os.path.exists(filename):
                 # for some reason the file can't be found, we'll just d/l
                 # it again
                 filename = None
                 logger.debug('not reusing because file not found')

     if not filename:
         logger.debug('getting %s', pdf_url)
         # urllib2 responses are not context managers, so close explicitly
         # to avoid leaking the connection when read() fails.
         response = urllib2.urlopen(pdf_url)
         try:
             contents = response.read()
         finally:
             response.close()
         link_file = LinkedFile()
         saved_filename = os.path.basename(urlparse(pdf_url).path)
         link_file.link_file.save(saved_filename, ContentFile(contents))
         filename = link_file.link_file.path

     try:
         prop = GovProposalParser(filename)
     except Exception:
         # Pass the url as a logging argument: the old
         # '...%s'.format(pdf_url) never substituted it, because '%s' is not
         # a str.format placeholder.
         logger.exception('Gov proposal exception %s', pdf_url)
         return None
     # TODO: check if parsing handles more than 1 prop in a booklet
     x = {
         'title': prop.get_title(),
         'date': prop.get_date(),
         # 'bill':prop,
         'link_file': link_file
     }
     return [x]
Example #4
0
 def parse_pdf(self, pdf_url):
     """Fetch one PDF url and extract its government proposal metadata.

     If a ``Link`` with this url exists and has a ``LinkedFile`` with a
     non-empty file name whose file is present on disk, that file is
     parsed.  Otherwise the url is downloaded and stored in a new
     ``LinkedFile`` first.

     Returns ``[{'title': ..., 'date': ..., 'link_file': ...}]`` on success
     and ``None`` when ``GovProposalParser`` raises; the failure is logged
     together with the url.
     """
     url_matches = Link.objects.filter(url=pdf_url)
     existing_count = url_matches.count()
     if existing_count > 1:
         logger.warning("found two objects with the url %s. Taking the first", pdf_url)

     filename = None
     if existing_count > 0:
         link = url_matches.first()
         files = [f for f in link.linkedfile_set.order_by('last_updated') if f.link_file.name != '']
         if files:
             link_file = files[0]
             cached_path = link_file.link_file.path
             logger.debug('trying reusing %s from %s', pdf_url, cached_path)
             if os.path.exists(cached_path):
                 filename = cached_path
             else:
                 # for some reason the file can't be found, we'll just d/l
                 # it again
                 logger.debug('not reusing because file not found')

     if not filename:
         logger.debug('getting %s', pdf_url)
         # Close the HTTP response explicitly; it was previously left open.
         response = urllib2.urlopen(pdf_url)
         try:
             contents = response.read()
         finally:
             response.close()
         link_file = LinkedFile()
         saved_filename = os.path.basename(urlparse(pdf_url).path)
         link_file.link_file.save(saved_filename, ContentFile(contents))
         filename = link_file.link_file.path

     try:
         prop = GovProposalParser(filename)
     except Exception:
         # '%s' is a logging placeholder, not a str.format one, so the url
         # must be passed as an argument; .format() left a literal '%s' in
         # the log.
         logger.exception('Gov proposal exception %s', pdf_url)
         return None
     # TODO: check if parsing handles more than 1 prop in a booklet
     x = {'title': prop.get_title(),
          'date': prop.get_date(),
          # 'bill':prop,
          'link_file': link_file}
     return [x]