def dont_run__parse_government_bill_pdf(self):
    """Parse a known government-bill PDF and compare against pickled output.

    Prefixed with ``dont_run__`` so the test runner skips it: it needs
    poppler installed — if that is missing, the test is simply ignored.
    """
    self.assertTrue(os.path.exists(GOV_BILL_TEST_FILE),
                    'missing %s (cwd = %s)' % (GOV_BILL_TEST_FILE,
                                               os.getcwd()))
    self.assertTrue(os.path.exists(GOV_BILL_CORRECT_OUTPUT))
    prop = GovProposalParser(GOV_BILL_TEST_FILE)
    # FIX: open the pickle in binary mode (pickle data is bytes) and use a
    # context manager so the file handle is always closed — the original
    # leaked the handle from open(..., 'r').
    with open(GOV_BILL_CORRECT_OUTPUT, 'rb') as pickled:
        expected_result = cPickle.load(pickled)
    self.assertEqual(prop.to_unicode(True).encode('utf-8'), expected_result)
def dont_run__parse_government_bill_pdf(self):
    """Parse a known government-bill PDF and compare against pickled output.

    Prefixed with ``dont_run__`` so the test runner skips it: it needs
    poppler installed — if that is missing, the test is simply ignored.
    """
    self.assertTrue(
        os.path.exists(GOV_BILL_TEST_FILE),
        'missing %s (cwd = %s)' % (GOV_BILL_TEST_FILE, os.getcwd()))
    self.assertTrue(os.path.exists(GOV_BILL_CORRECT_OUTPUT))
    prop = GovProposalParser(GOV_BILL_TEST_FILE)
    # FIX: open the pickle in binary mode (pickle data is bytes) and use a
    # context manager so the file handle is always closed — the original
    # leaked the handle from open(..., 'r').
    with open(GOV_BILL_CORRECT_OUTPUT, 'rb') as pickled:
        expected_result = cPickle.load(pickled)
    self.assertEqual(
        prop.to_unicode(True).encode('utf-8'),
        expected_result)
def parse_pdf(self, pdf_url):
    """Grab a single pdf url, using cache via LinkedFile.

    Looks for an existing Link/LinkedFile pair for *pdf_url*; if a cached
    file exists on disk it is reused, otherwise the PDF is downloaded and
    stored. Returns a one-element list with a dict of the parsed proposal
    (``title``, ``date``, ``link_file``), or ``None`` if parsing fails.
    """
    existing_count = Link.objects.filter(url=pdf_url).count()
    if existing_count >= 1:
        if existing_count > 1:
            # logger.warn is a deprecated alias; warning() is the real API.
            logger.warning(
                "found two objects with the url %s. Taking the first",
                pdf_url)
        link = Link.objects.filter(url=pdf_url).first()
    filename = None
    if existing_count > 0:
        # Only files that were actually saved (non-empty name) count.
        files = [
            f for f in link.linkedfile_set.order_by('last_updated')
            if f.link_file.name != ''
        ]
        if len(files) > 0:
            link_file = files[0]
            filename = link_file.link_file.path
            logger.debug('trying reusing %s from %s', pdf_url, filename)
            if not os.path.exists(filename):
                # for some reason the file can't be found, we'll just d/l
                # it again
                filename = None
                logger.debug('not reusing because file not found')
    if not filename:
        logger.debug('getting %s', pdf_url)
        # FIX: close the HTTP response instead of leaking it.
        response = urllib2.urlopen(pdf_url)
        try:
            contents = response.read()
        finally:
            response.close()
        link_file = LinkedFile()
        saved_filename = os.path.basename(urlparse(pdf_url).path)
        link_file.link_file.save(saved_filename, ContentFile(contents))
        filename = link_file.link_file.path
    try:
        prop = GovProposalParser(filename)
    except Exception:
        # BUG FIX: the original used '...%s'.format(pdf_url), mixing
        # %-style with str.format(), so the URL was never interpolated.
        # Use logging's lazy %-args instead.
        logger.exception('Gov proposal exception %s', pdf_url)
        return None

    # TODO: check if parsing handles more than 1 prop in a booklet
    x = {
        'title': prop.get_title(),
        'date': prop.get_date(),
        # 'bill':prop,
        'link_file': link_file
    }
    return [x]
def parse_pdf(self, pdf_url):
    """Grab a single pdf url, using cache via LinkedFile.

    Reuses a previously downloaded file for *pdf_url* when one exists on
    disk; otherwise downloads and stores the PDF. Returns a one-element
    list containing a dict (``title``, ``date``, ``link_file``), or
    ``None`` when the PDF cannot be parsed.
    """
    existing_count = Link.objects.filter(url=pdf_url).count()
    if existing_count >= 1:
        if existing_count > 1:
            # logger.warn is a deprecated alias; warning() is the real API.
            logger.warning(
                "found two objects with the url %s. Taking the first",
                pdf_url)
        link = Link.objects.filter(url=pdf_url).first()
    filename = None
    if existing_count > 0:
        # Only files that were actually saved (non-empty name) count.
        files = [f for f in link.linkedfile_set.order_by('last_updated')
                 if f.link_file.name != '']
        if len(files) > 0:
            link_file = files[0]
            filename = link_file.link_file.path
            logger.debug('trying reusing %s from %s', pdf_url, filename)
            if not os.path.exists(filename):
                # for some reason the file can't be found, we'll just d/l
                # it again
                filename = None
                logger.debug('not reusing because file not found')
    if not filename:
        logger.debug('getting %s', pdf_url)
        # FIX: close the HTTP response instead of leaking it.
        response = urllib2.urlopen(pdf_url)
        try:
            contents = response.read()
        finally:
            response.close()
        link_file = LinkedFile()
        saved_filename = os.path.basename(urlparse(pdf_url).path)
        link_file.link_file.save(saved_filename, ContentFile(contents))
        filename = link_file.link_file.path
    try:
        prop = GovProposalParser(filename)
    except Exception:
        # BUG FIX: the original used '...%s'.format(pdf_url), mixing
        # %-style with str.format(), so the URL was never interpolated.
        # Use logging's lazy %-args instead.
        logger.exception('Gov proposal exception %s', pdf_url)
        return None

    # TODO: check if parsing handles more than 1 prop in a booklet
    x = {'title': prop.get_title(),
         'date': prop.get_date(),
         # 'bill':prop,
         'link_file': link_file}
    return [x]