def parse_pdf(self, pdf_url):
    """Grab a single pdf url, using a cached copy via LinkedFile when possible.

    Returns a one-element list of dicts with keys 'title', 'date' and
    'link_file' describing the parsed proposal.
    """
    existing_count = Link.objects.filter(url=pdf_url).count()
    if existing_count >= 1:
        if existing_count > 1:
            # Duplicate Link rows for the same url; warn via the logger
            # (was a bare print statement) and take the first.
            logger.warning(
                "you have %d objects with the url %s. Taking the first",
                existing_count, pdf_url)
        link = Link.objects.filter(url=pdf_url).iterator().next()
    else:
        # BUG FIX: previously no Link row was created for a brand-new url,
        # so the downloaded LinkedFile was orphaned and the cache below
        # could never hit on later calls.
        link = Link(url=pdf_url)
        link.save()
    filename = None
    files = [f for f in link.linkedfile_set.order_by('last_updated')
             if f.link_file.name != '']
    if len(files) > 0:
        link_file = files[0]
        filename = link_file.link_file.path
        logger.debug('reusing %s from %s' % (pdf_url, filename))
        if not os.path.exists(filename):
            # for some reason the file can't be found, we'll just d/l
            # it again
            filename = None
            logger.debug('not reusing because file not found')
    if not filename:
        logger.debug('getting %s' % pdf_url)
        contents = urllib2.urlopen(pdf_url).read()
        # BUG FIX: tie the cached file to its Link row so subsequent calls
        # find it through link.linkedfile_set.
        link_file = LinkedFile(link=link)
        saved_filename = os.path.basename(urlparse(pdf_url).path)
        link_file.link_file.save(saved_filename, ContentFile(contents))
        filename = link_file.link_file.path
    prop = GovProposalParser(filename)
    # TODO: check if parsing handles more than 1 prop in a booklet
    return [{'title': prop.get_title(),
             'date': prop.get_date(),
             'link_file': link_file}]
def parse_pdf(self, pdf_url):
    """Grab a single pdf url, using a cached copy via LinkedFile when possible.

    Returns a one-element list of dicts with keys 'title', 'date' and
    'link_file' describing the parsed proposal.
    """
    existing_count = Link.objects.filter(url=pdf_url).count()
    if existing_count >= 1:
        if existing_count > 1:
            # Duplicate Link rows for the same url; warn via the logger
            # (was a bare print statement) and take the first.
            logger.warning(
                "you have %d objects with the url %s. Taking the first",
                existing_count, pdf_url)
        link = Link.objects.filter(url=pdf_url).iterator().next()
    else:
        # BUG FIX: previously no Link row was created for a brand-new url,
        # so the downloaded LinkedFile was orphaned and the cache below
        # could never hit on later calls.
        link = Link(url=pdf_url)
        link.save()
    filename = None
    files = [f for f in link.linkedfile_set.order_by('last_updated')
             if f.link_file.name != '']
    if len(files) > 0:
        link_file = files[0]
        filename = link_file.link_file.path
        logger.debug('reusing %s from %s' % (pdf_url, filename))
        if not os.path.exists(filename):
            # BUG FIX: the cached path can point at a file that has been
            # removed from disk; fall through and download it again
            # instead of handing a dead path to the parser.
            filename = None
            logger.debug('not reusing because file not found')
    if not filename:
        logger.debug('getting %s' % pdf_url)
        contents = urllib2.urlopen(pdf_url).read()
        # BUG FIX: tie the cached file to its Link row so subsequent calls
        # find it through link.linkedfile_set.
        link_file = LinkedFile(link=link)
        saved_filename = os.path.basename(urlparse(pdf_url).path)
        link_file.link_file.save(saved_filename, ContentFile(contents))
        filename = link_file.link_file.path
    prop = GovProposalParser(filename)
    # TODO: check if parsing handles more than 1 prop in a booklet
    return [{'title': prop.get_title(),
             'date': prop.get_date(),
             'link_file': link_file}]
def parse_pdf(self, pdf_url):
    """Grab a single pdf url, using a cached copy via LinkedFile when possible.

    Returns a one-element list of dicts with keys 'title', 'date' and
    'bill' (the GovProposalParser instance) describing the parsed proposal.
    """
    existing_count = Link.objects.filter(url=pdf_url).count()
    if existing_count < 1:
        # First time we see this url: create its Link row so the file we
        # download below can be found again on later calls.
        link = Link(url=pdf_url)
        link.save()
    else:
        if existing_count > 1:
            # Duplicate Link rows for the same url; warn via the logger
            # (was a bare print statement) and take the first.
            logger.warning(
                "you have two objects with the url %s. Taking the first",
                pdf_url)
        link = Link.objects.filter(url=pdf_url).iterator().next()
    filename = None
    if link.linkedfile_set.count() > 0:
        files = [f for f in link.linkedfile_set.order_by("last_updated")
                 if f.link_file.name != ""]
        if len(files) > 0:
            filename = files[0].link_file.path
            logger.debug("reusing %s from %s" % (pdf_url, filename))
            if not os.path.exists(filename):
                # BUG FIX: the cached path can point at a file that has
                # been removed from disk; fall through and download it
                # again instead of handing a dead path to the parser.
                filename = None
                logger.debug("not reusing because file not found")
    if not filename:
        logger.debug("getting %s" % pdf_url)
        contents = urllib2.urlopen(pdf_url).read()
        link_file = LinkedFile(link=link)
        saved_filename = os.path.basename(urlparse(pdf_url).path)
        link_file.link_file.save(saved_filename, ContentFile(contents))
        filename = link_file.link_file.path
    prop = GovProposalParser(filename)
    # TODO: check if parsing handles more than 1 prop in a booklet
    return [{"title": prop.get_title(), "date": prop.get_date(), "bill": prop}]
def parse_pdf(self, pdf_url): """ Grab a single pdf url, using cache via LinkedFile """ existing_count = Link.objects.filter(url=pdf_url).count() if existing_count >= 1: if existing_count > 1: logger.warn( "found two objects with the url %s. Taking the first" % pdf_url) link = Link.objects.filter(url=pdf_url).iterator().next() filename = None if existing_count > 0: files = [ f for f in link.linkedfile_set.order_by('last_updated') if f.link_file.name != '' ] if len(files) > 0: link_file = files[0] filename = link_file.link_file.path logger.debug('reusing %s from %s' % (pdf_url, filename)) if not os.path.exists(filename): # for some reason the file can't be found, we'll just d/l # it again filename = None logger.debug('not reusing because file not found') if not filename: logger.debug('getting %s' % pdf_url) contents = urllib2.urlopen(pdf_url).read() link_file = LinkedFile() saved_filename = os.path.basename(urlparse(pdf_url).path) link_file.link_file.save(saved_filename, ContentFile(contents)) filename = link_file.link_file.path try: prop = GovProposalParser(filename) except Exception, e: logger.info(e) return None
def show_one(pdf_filename, show_details=False): prop = GovProposalParser(pdf_filename) print prop.to_unicode(show_details).encode('utf-8')