def process_pdf(parser, pdfurl, errors):
    """Fetch the PDF at *pdfurl* and pass it to ``parser.preprocess``.

    A ``ValueError`` raised while fetching or preprocessing is appended
    to *errors* instead of propagating.
    """
    # NOTE(review): presumably stops the run when the CPU quota is used
    # up -- confirm against postlistelib.exit_if_no_cpu_left.
    postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
    try:
        # The downloaded content is passed straight through so that no
        # local name keeps it alive after preprocessing.
        parser.preprocess(pdfurl, scraperwiki.scrape(pdfurl))
    except ValueError as err:
        errors.append(err)
def process_pdf(parser, pdfurl, errors):
    """Fetch the PDF at *pdfurl* and pass it to ``parser.preprocess``.

    On ``ValueError`` a short notice is printed and the exception is
    appended to *errors* instead of propagating.
    """
    # NOTE(review): presumably stops the run when the CPU quota is used
    # up -- confirm against postlistelib.exit_if_no_cpu_left.
    postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
    try:
        # The downloaded content is passed straight through so that no
        # local name keeps it alive after preprocessing.
        parser.preprocess(pdfurl, scraperwiki.scrape(pdfurl))
    except ValueError as err:
        # Some PDFs cannot be parsed; the cause still needs investigation.
        print("PDF format problem")
        errors.append(err)
def process_pdf(parser, pdfurl, errors):
    """Fetch the PDF at *pdfurl* and pass it to ``parser.preprocess``.

    Args:
        parser: object providing ``preprocess(pdfurl, pdfcontent)``.
        pdfurl: address of the PDF to fetch.
        errors: list that collects recorded exceptions.  ``None`` is
            accepted, in which case a throwaway list is used.

    An ``IndexError`` raised while fetching or preprocessing is appended
    to *errors*.  ``ValueError`` is not handled here and propagates to
    the caller.
    """
    # Previously the parameter was unconditionally rebound to a new list,
    # so nothing recorded below ever reached the caller.  Only fall back
    # to a private list when the caller did not supply one.
    if errors is None:
        errors = []
    # NOTE(review): presumably invokes no_cpu_left(errors) and stops the
    # run when the CPU quota is used up -- confirm against postlistelib.
    postlistelib.exit_if_no_cpu_left(0, callback=no_cpu_left, arg=errors)
    try:
        pdfcontent = lazycache.lazycache(pdfurl)
        parser.preprocess(pdfurl, pdfcontent)
    except IndexError as e:
        errors.append(e)
def process_pdf(parser, pdfurl, errors):
    """Fetch and preprocess the PDF at *pdfurl* unless already scraped.

    Nothing happens when ``parser.is_already_scraped(pdfurl)`` is true.
    Otherwise the PDF is fetched and handed to ``parser.preprocess``; a
    ``ValueError`` is printed and appended to *errors* instead of
    propagating.
    """
    already_done = parser.is_already_scraped(pdfurl)
    if already_done:
        return
    # NOTE(review): presumably stops the run when the CPU quota is used
    # up -- confirm against postlistelib.exit_if_no_cpu_left.
    postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
    try:
        # The downloaded content is passed straight through so that no
        # local name keeps it alive after preprocessing.
        parser.preprocess(pdfurl, scraperwiki.scrape(pdfurl))
    except ValueError as err:
        print(err)
        errors.append(err)
def fetch_and_preprocess(parser, pdfurl):
    """Fetch *pdfurl* via ``postlistelib.fetch_url_harder`` and preprocess it.

    The fetched content is passed to ``parser.preprocess`` together with
    the URL.  Exceptions from either call propagate to the caller.
    """
    # No local reference to the content is kept once preprocessing is done.
    parser.preprocess(pdfurl, postlistelib.fetch_url_harder(pdfurl))