def parse_pdf(self, pdf_url):
    """Parse the proposal PDF at ``pdf_url``, caching the download via LinkedFile.

    Looks up a ``Link`` whose url equals ``pdf_url`` and walks its
    ``LinkedFile`` records ordered by ``last_updated``; the first record
    whose stored file still exists on disk is reused. When there is no
    usable cached copy the PDF is downloaded and stored in a new
    ``LinkedFile`` (this method does not attach that record to a ``Link``).

    Args:
        pdf_url: URL of the PDF to parse.

    Returns:
        A one-element list holding a dict with the keys ``'title'``,
        ``'date'`` and ``'link_file'`` (the ``LinkedFile`` the parsed PDF
        was read from).

    Raises:
        Nothing is caught here: errors from ``urllib2.urlopen`` and from
        ``GovProposalParser`` propagate to the caller.
    """
    from contextlib import closing

    # A single bounded query replaces count() followed by a second filter():
    # two rows are enough to pick the first match and to notice duplicates,
    # and the row cannot disappear between the count and the fetch.
    links = list(Link.objects.filter(url=pdf_url)[:2])
    if len(links) > 1:
        print("WARNING: you have two objects with the url %s. Taking the first" % pdf_url)

    link_file = None
    filename = None
    if links:
        for cached in links[0].linkedfile_set.order_by('last_updated'):
            if not cached.link_file.name:
                # nothing was ever stored on this record (empty or missing
                # name), so asking for its path would fail
                continue
            cached_path = cached.link_file.path
            if os.path.exists(cached_path):
                link_file = cached
                filename = cached_path
                logger.debug('reusing %s from %s' % (pdf_url, filename))
                break
            # for some reason the file can't be found; try the next cached
            # copy and, failing that, download it again below
            logger.debug('not reusing because file not found')

    if not filename:
        logger.debug('getting %s' % pdf_url)
        # closing() guarantees the response is closed even if read() raises
        with closing(urllib2.urlopen(pdf_url)) as response:
            contents = response.read()
        link_file = LinkedFile()
        saved_filename = os.path.basename(urlparse(pdf_url).path)
        link_file.link_file.save(saved_filename, ContentFile(contents))
        filename = link_file.link_file.path
    prop = GovProposalParser(filename)

    # TODO: check if parsing handles more than 1 prop in a booklet
    return [{'title': prop.get_title(),
             'date': prop.get_date(),
             'link_file': link_file}]
Beispiel #2
0
    def parse_pdf(self, pdf_url):
        """Parse the proposal PDF at ``pdf_url``, caching the download via LinkedFile.

        Looks up a ``Link`` whose url equals ``pdf_url`` and walks its
        ``LinkedFile`` records ordered by ``last_updated``; the first record
        whose stored file still exists on disk is reused. When there is no
        usable cached copy the PDF is downloaded and stored in a new
        ``LinkedFile`` (this method does not attach that record to a
        ``Link``).

        Args:
            pdf_url: URL of the PDF to parse.

        Returns:
            A one-element list holding a dict with the keys ``'title'``,
            ``'date'`` and ``'link_file'`` (the ``LinkedFile`` the parsed
            PDF was read from).

        Raises:
            Nothing is caught here: errors from ``urllib2.urlopen`` and from
            ``GovProposalParser`` propagate to the caller.
        """
        from contextlib import closing

        # One bounded query instead of count() plus a second filter(): two
        # rows are enough to take the first match and to detect duplicates.
        links = list(Link.objects.filter(url=pdf_url)[:2])
        if len(links) > 1:
            print("WARNING: you have two objects with the url %s. Taking the first" % pdf_url)

        link_file = None
        filename = None
        if links:
            for cached in links[0].linkedfile_set.order_by('last_updated'):
                if not cached.link_file.name:
                    # record without a stored file; its path is unusable
                    continue
                cached_path = cached.link_file.path
                # A database record does not prove the file is still on
                # disk, so verify before handing the path to the parser.
                if os.path.exists(cached_path):
                    link_file = cached
                    filename = cached_path
                    logger.debug('reusing %s from %s' % (pdf_url, filename))
                    break
                logger.debug('not reusing because file not found')

        if not filename:
            logger.debug('getting %s' % pdf_url)
            # closing() guarantees the response is closed even if read()
            # raises
            with closing(urllib2.urlopen(pdf_url)) as response:
                contents = response.read()
            link_file = LinkedFile()
            saved_filename = os.path.basename(urlparse(pdf_url).path)
            link_file.link_file.save(saved_filename, ContentFile(contents))
            filename = link_file.link_file.path
        prop = GovProposalParser(filename)

        # TODO: check if parsing handles more than 1 prop in a booklet
        return [{'title': prop.get_title(),
                 'date': prop.get_date(),
                 'link_file': link_file}]
Beispiel #3
0
    def parse_pdf(self, pdf_url):
        """Parse the proposal PDF at ``pdf_url``, caching the download via LinkedFile.

        Finds the ``Link`` whose url equals ``pdf_url`` (creating and saving
        one when none exists) and walks its ``LinkedFile`` records ordered
        by ``last_updated``; the first record whose stored file still exists
        on disk is reused. Otherwise the PDF is downloaded and stored in a
        new ``LinkedFile`` attached to that ``Link``.

        Args:
            pdf_url: URL of the PDF to parse.

        Returns:
            A one-element list holding a dict with the keys ``"title"``,
            ``"date"`` and ``"bill"`` (the ``GovProposalParser`` instance).

        Raises:
            Nothing is caught here: errors from ``urllib2.urlopen`` and from
            ``GovProposalParser`` propagate to the caller.
        """
        from contextlib import closing

        # One bounded query instead of count() plus a second filter(): two
        # rows are enough to take the first match and to detect duplicates.
        links = list(Link.objects.filter(url=pdf_url)[:2])
        if links:
            if len(links) > 1:
                print("WARNING: you have two objects with the url %s. Taking the first" % pdf_url)
            link = links[0]
        else:
            link = Link(url=pdf_url)
            link.save()

        filename = None
        # Scan every cached record rather than only the first one: new
        # downloads are attached to this same Link, so a stale record that
        # sorts first would otherwise force a fresh download on every call.
        for cached in link.linkedfile_set.order_by("last_updated"):
            if not cached.link_file.name:
                # record without a stored file; its path is unusable
                continue
            cached_path = cached.link_file.path
            if os.path.exists(cached_path):
                filename = cached_path
                logger.debug("reusing %s from %s" % (pdf_url, filename))
                break
            logger.debug("not reusing %s because file not found" % cached_path)

        if not filename:
            logger.debug("getting %s" % pdf_url)
            # closing() guarantees the response is closed even if read()
            # raises
            with closing(urllib2.urlopen(pdf_url)) as response:
                contents = response.read()
            link_file = LinkedFile(link=link)
            saved_filename = os.path.basename(urlparse(pdf_url).path)
            link_file.link_file.save(saved_filename, ContentFile(contents))
            filename = link_file.link_file.path
        prop = GovProposalParser(filename)

        # TODO: check if parsing handles more than 1 prop in a booklet
        return [{"title": prop.get_title(), "date": prop.get_date(), "bill": prop}]
Beispiel #4
0
 def parse_pdf(self, pdf_url):
     """Fetch the PDF at ``pdf_url`` (or reuse a cached copy) and parse it.

     A ``Link`` with the same url is looked up; if it has a ``LinkedFile``
     with a non-empty file name whose path exists on disk, that file is
     reused. Otherwise the PDF is downloaded and stored in a new
     ``LinkedFile``. The resulting path is handed to ``GovProposalParser``.

     Returns ``None`` when constructing the parser raises; the exception is
     logged at info level.

     NOTE(review): the visible body ends right after the ``except`` clause,
     so the success path also falls off the end and returns ``None``. This
     looks like a truncated snippet -- confirm against the full source.
     """
     # Number of Link rows with this url; decides whether a cache lookup
     # is attempted at all.
     existing_count = Link.objects.filter(url=pdf_url).count()
     if existing_count >= 1:
         if existing_count > 1:
             logger.warn(
                 "found two objects with the url %s. Taking the first" %
                 pdf_url)
         # Second query: take the first matching row (Python 2 iterator
         # protocol).
         link = Link.objects.filter(url=pdf_url).iterator().next()
     filename = None
     # `link` is only bound when existing_count >= 1, which is exactly the
     # condition tested here.
     if existing_count > 0:
         # Cached records ordered by last_updated, skipping those whose
         # file name is the empty string.
         files = [
             f for f in link.linkedfile_set.order_by('last_updated')
             if f.link_file.name != ''
         ]
         if len(files) > 0:
             # Only the first candidate is considered.
             link_file = files[0]
             filename = link_file.link_file.path
             logger.debug('reusing %s from %s' % (pdf_url, filename))
             if not os.path.exists(filename):
                 # for some reason the file can't be found, we'll just d/l
                 # it again
                 filename = None
                 logger.debug('not reusing because file not found')
     if not filename:
         # No usable cached copy: download the PDF and store it.
         logger.debug('getting %s' % pdf_url)
         # NOTE(review): the response object is never explicitly closed.
         contents = urllib2.urlopen(pdf_url).read()
         # The new LinkedFile is created without a Link in this method.
         link_file = LinkedFile()
         # Store under the last path component of the url.
         saved_filename = os.path.basename(urlparse(pdf_url).path)
         link_file.link_file.save(saved_filename, ContentFile(contents))
         filename = link_file.link_file.path
     try:
         prop = GovProposalParser(filename)
     except Exception, e:
         # Best effort: any parser construction failure is logged and
         # reported to the caller as None.
         logger.info(e)
         return None
Beispiel #5
0
def show_one(pdf_filename, show_details=False):
    """Print the UTF-8 encoded text form of the proposal parsed from a PDF.

    ``pdf_filename`` is handed to ``GovProposalParser``; ``show_details``
    is passed straight through to the parser's ``to_unicode``.
    """
    parser = GovProposalParser(pdf_filename)
    rendered = parser.to_unicode(show_details)
    print(rendered.encode('utf-8'))
Beispiel #6
0
def show_one(pdf_filename, show_details=False):
    """Parse ``pdf_filename`` and print its text representation as UTF-8.

    ``show_details`` is forwarded unchanged to ``to_unicode`` on the
    ``GovProposalParser`` built from the file.
    """
    as_text = GovProposalParser(pdf_filename).to_unicode(show_details)
    encoded = as_text.encode('utf-8')
    print(encoded)