def parse_pdf(self, pdf_url):
    """Fetch a single PDF URL, reusing a previously downloaded copy
    cached via LinkedFile when one exists.

    Args:
        pdf_url: URL of the PDF to fetch (also the Link cache key).

    Returns:
        A one-element list of dicts with keys 'title', 'date' and
        'bill' (a GovProposalParser over the local file).
    """
    existing_count = Link.objects.filter(url=pdf_url).count()
    if existing_count < 1:
        link = Link(url=pdf_url)
        link.save()
    else:
        if existing_count > 1:
            # Duplicate Link rows for one URL indicate a data problem.
            # Use the logger (was a bare `print`) and report the real count
            # (the old message always said "two").
            logger.warning(
                "you have %d objects with the url %s. Taking the first",
                existing_count, pdf_url)
        # Plain indexing instead of .iterator().next(): simpler, same row.
        link = Link.objects.filter(url=pdf_url)[0]
    filename = None
    if link.linkedfile_set.count() > 0:
        # Reuse the oldest stored file that actually has content on disk.
        files = [f for f in link.linkedfile_set.order_by("last_updated")
                 if f.link_file.name != ""]
        if files:
            filename = files[0].link_file.path
            # Log only when a cached file was actually found (the old code
            # logged "reusing ... None" when every stored file was empty).
            logger.debug("reusing %s from %s" % (pdf_url, filename))
    if not filename:
        logger.debug("getting %s" % pdf_url)
        contents = urllib2.urlopen(pdf_url).read()
        link_file = LinkedFile(link=link)
        saved_filename = os.path.basename(urlparse(pdf_url).path)
        link_file.link_file.save(saved_filename, ContentFile(contents))
        filename = link_file.link_file.path
    prop = GovProposalParser(filename)
    # TODO: check if parsing handles more than 1 prop in a booklet
    return [{"title": prop.get_title(),
             "date": prop.get_date(),
             "bill": prop}]
def parse_pdf(self, pdf_url):
    """Fetch a single PDF URL, using the LinkedFile-backed cache.

    NOTE(review): this is a duplicate definition of parse_pdf — it
    shadows the earlier one in this file; one of the two should be
    removed.

    Args:
        pdf_url: URL of the PDF to fetch (also the Link cache key).

    Returns:
        A one-element list of dicts: 'title', 'date' and 'bill'
        (a GovProposalParser over the local file).
    """
    matches = Link.objects.filter(url=pdf_url)
    count = matches.count()
    if count < 1:
        link = Link(url=pdf_url)
        link.save()
    else:
        if count > 1:
            # Duplicates mean bad data; warn through the logger (was a
            # bare `print`) with the actual duplicate count.
            logger.warning(
                "you have %d objects with the url %s. Taking the first",
                count, pdf_url)
        link = matches[0]
    filename = None
    if link.linkedfile_set.count() > 0:
        # Prefer the oldest non-empty stored file.
        stored = [f for f in link.linkedfile_set.order_by('last_updated')
                  if f.link_file.name != '']
        if stored:
            filename = stored[0].link_file.path
            logger.debug('reusing %s from %s' % (pdf_url, filename))
    if not filename:
        # Cache miss: download and persist through the LinkedFile storage.
        logger.debug('getting %s' % pdf_url)
        contents = urllib2.urlopen(pdf_url).read()
        link_file = LinkedFile(link=link)
        saved_filename = os.path.basename(urlparse(pdf_url).path)
        link_file.link_file.save(saved_filename, ContentFile(contents))
        filename = link_file.link_file.path
    prop = GovProposalParser(filename)
    # TODO: check if parsing handles more than 1 prop in a booklet
    return [{'title': prop.get_title(),
             'date': prop.get_date(),
             'bill': prop}]
def create_or_update_single_bill(self, proposal, pdf_link, link_file):
    """Create or update the GovProposal / Bill / Link records for one
    parsed proposal.

    Args:
        proposal: dict with keys 'date', 'law', 'correction', 'comment',
            'booklet' and 'link' (as produced by the proposal parser).
        pdf_link: URL of the proposal PDF, used as the Link title/url.
        link_file: LinkedFile to attach to the created Link when it has
            no link yet.
    """
    # Skip proposals with no date, or older than the cutoff. Parentheses
    # make the original precedence explicit:
    # `not date or (CUTOFF_DATE and date < CUTOFF_DATE)`.
    if not proposal['date'] or (CUTOFF_DATE and proposal['date'] < CUTOFF_DATE):
        return
    law_name = proposal['law']
    (law, created) = Law.objects.get_or_create(title=law_name)
    # get_or_create already persists a newly created row, so the old
    # `if created: law.save()` was redundant and has been dropped.
    if law.merged_into:
        # Follow the merge chain one step so the bill attaches to the
        # surviving Law record.
        law = law.merged_into
    title = u''
    if proposal['correction']:
        title += proposal['correction']
    if proposal['comment']:
        title += ' ' + proposal['comment']
    if len(title) <= 1:
        # Fallback title (Hebrew for "new law") when neither correction
        # nor comment produced anything meaningful.
        title = u'חוק חדש'
    (gp, created) = GovProposal.objects.get_or_create(
        booklet_number=proposal['booklet'],
        knesset_id=18,
        source_url=proposal['link'],
        title=title,
        law=law,
        date=proposal['date'])
    if created:
        # Already saved by get_or_create; only the log line is needed.
        logger.debug("created GovProposal id = %d" % gp.id)
    bill_params = dict(law=law, title=title, stage='3',
                       stage_date=proposal['date'])
    similar_bills = Bill.objects.filter(**bill_params).order_by('id')
    if len(similar_bills) >= 1:
        b = similar_bills[0]
        if len(similar_bills) > 1:
            # Diagnostic only: list which duplicate we picked and which
            # we skipped. Hoisted under the >1 check so a single match
            # logs nothing extra (same output as before in that case,
            # since the loop body only emits the "being used" line).
            logger.debug("multiple bills detected")
            for bill in similar_bills:
                if bill.id == b.id:
                    logger.debug("bill being used now - %d" % bill.id)
                else:
                    logger.debug("bill with same fields - %d" % bill.id)
    else:
        b = Bill(**bill_params)
        b.save()
    gp.bill = b
    gp.save()
    if link_file.link is None:
        # First time we see this file: create the generic Link pointing
        # at the GovProposal and attach the file to it.
        link = Link(title=pdf_link, url=pdf_link,
                    content_type=ContentType.objects.get_for_model(gp),
                    object_pk=str(gp.id))
        link.save()
        link_file.link = link
        link_file.save()
    logger.debug("check updated %s" % b.get_absolute_url())