Ejemplo n.º 1
0
    def parse_pdf(self, pdf_url):
        """Fetch and parse a single proposal PDF, caching it via LinkedFile.

        The Link row for ``pdf_url`` is looked up (and created when missing).
        If that Link already has a LinkedFile with a non-empty file name, the
        stored file is reused; otherwise the PDF is downloaded and stored as a
        new LinkedFile attached to the Link.

        Returns a one-element list holding a dict with the keys ``title``,
        ``date`` and ``bill`` (``bill`` is the GovProposalParser instance).
        """
        # Evaluate the lookup once; ordering by pk makes "the first" well
        # defined when duplicate rows exist for the same url.
        links = list(Link.objects.filter(url=pdf_url).order_by("pk"))
        if not links:
            link = Link(url=pdf_url)
            link.save()
        else:
            if len(links) > 1:
                logger.warning(
                    "you have %d objects with the url %s. Taking the first",
                    len(links), pdf_url)
            link = links[0]

        # Reuse the first stored file (ordered by last_updated) that actually
        # has a file name attached.
        filename = None
        for cached in link.linkedfile_set.order_by("last_updated"):
            if cached.link_file.name != "":
                filename = cached.link_file.path
                logger.debug("reusing %s from %s", pdf_url, filename)
                break

        if not filename:
            logger.debug("getting %s", pdf_url)
            response = urllib2.urlopen(pdf_url)
            try:
                contents = response.read()
            finally:
                # Release the connection even when the read fails.
                response.close()
            link_file = LinkedFile(link=link)
            saved_filename = os.path.basename(urlparse(pdf_url).path)
            link_file.link_file.save(saved_filename, ContentFile(contents))
            filename = link_file.link_file.path
        prop = GovProposalParser(filename)

        # TODO: check if parsing handles more than 1 prop in a booklet
        return [{"title": prop.get_title(), "date": prop.get_date(), "bill": prop}]
Ejemplo n.º 2
0
 def parse_pdf(self, pdf_url):
     """Fetch and parse a single proposal PDF, caching it via LinkedFile.

     The Link row for ``pdf_url`` is looked up (and created when missing).
     If that Link already has a LinkedFile with a non-empty file name, the
     stored file is reused; otherwise the PDF is downloaded and stored as a
     new LinkedFile attached to the Link.

     Returns a one-element list holding a dict with the keys ``title``,
     ``date`` and ``bill`` (``bill`` is the GovProposalParser instance).
     """
     # Evaluate the lookup once; ordering by pk makes "the first" well
     # defined when duplicate rows exist for the same url.
     links = list(Link.objects.filter(url=pdf_url).order_by('pk'))
     if not links:
         link = Link(url=pdf_url)
         link.save()
     else:
         if len(links) > 1:
             logger.warning(
                 'you have %d objects with the url %s. Taking the first',
                 len(links), pdf_url)
         link = links[0]

     # Reuse the first stored file (ordered by last_updated) that actually
     # has a file name attached.
     filename = None
     for cached in link.linkedfile_set.order_by('last_updated'):
         if cached.link_file.name != '':
             filename = cached.link_file.path
             logger.debug('reusing %s from %s', pdf_url, filename)
             break

     if not filename:
         logger.debug('getting %s', pdf_url)
         response = urllib2.urlopen(pdf_url)
         try:
             contents = response.read()
         finally:
             # Release the connection even when the read fails.
             response.close()
         link_file = LinkedFile(link=link)
         saved_filename = os.path.basename(urlparse(pdf_url).path)
         link_file.link_file.save(saved_filename, ContentFile(contents))
         filename = link_file.link_file.path
     prop = GovProposalParser(filename)

     # TODO: check if parsing handles more than 1 prop in a booklet
     return [{'title': prop.get_title(), 'date': prop.get_date(), 'bill': prop}]
Ejemplo n.º 3
0
    def create_or_update_single_bill(self, proposal, pdf_link, link_file):
        """Create or update the GovProposal and Bill rows for one proposal.

        ``proposal`` is read by key: ``date``, ``law``, ``correction``,
        ``comment``, ``booklet`` and ``link``. Proposals with no date, or
        dated before CUTOFF_DATE (when CUTOFF_DATE is set), are skipped.

        A Bill matching the law, title, stage and date is reused when one
        exists (the lowest id wins); otherwise a new one is created. The
        GovProposal is then pointed at that Bill. When ``link_file`` has no
        link yet, a Link for ``pdf_link`` targeting the GovProposal is created
        and attached to it.

        Returns None.
        """
        proposal_date = proposal['date']
        if not proposal_date or (CUTOFF_DATE and proposal_date < CUTOFF_DATE):
            return

        # get_or_create() already persists a newly created row, so no
        # follow-up save() is needed.
        law, _ = Law.objects.get_or_create(title=proposal['law'])
        if law.merged_into:
            law = law.merged_into

        # NOTE(review): a comment without a correction yields a title with a
        # leading space; kept as is because the title is a lookup key below.
        title = u''
        if proposal['correction']:
            title += proposal['correction']
        if proposal['comment']:
            title += ' ' + proposal['comment']
        if len(title) <= 1:
            title = u'חוק חדש'

        gp, created = GovProposal.objects.get_or_create(
            booklet_number=proposal['booklet'], knesset_id=18,
            source_url=proposal['link'],
            title=title, law=law, date=proposal_date)
        if created:
            logger.debug("created GovProposal id = %d", gp.id)

        bill_params = dict(law=law, title=title, stage='3', stage_date=proposal_date)
        similar_bills = list(Bill.objects.filter(**bill_params).order_by('id'))
        if similar_bills:
            b = similar_bills[0]
            if len(similar_bills) > 1:
                logger.debug("multiple bills detected")
                for bill in similar_bills:
                    if bill.id == b.id:
                        logger.debug("bill being used now   - %d", bill.id)
                    else:
                        logger.debug("bill with same fields - %d", bill.id)
        else:
            b = Bill(**bill_params)
            b.save()

        gp.bill = b
        gp.save()

        if link_file.link is None:
            link = Link(title=pdf_link, url=pdf_link,
                        content_type=ContentType.objects.get_for_model(gp),
                        object_pk=str(gp.id))
            link.save()
            link_file.link = link
            link_file.save()
            logger.debug("check updated %s", b.get_absolute_url())