def get_and_process_pdf(self, data, session, row_pk, index=False):
    """Download a free opinion PDF, attach it to its RECAPDocument and
    extract its text.

    :param self: The bound task; used to retry and to clear the callbacks of
    the running chain when the download cannot succeed.
    :param data: A dict with the keys 'result', 'rd_pk' and 'pacer_court_id',
    or None, in which case nothing is done.
    :param session: A PACER session object, handed to FreeOpinionReport.
    :param row_pk: The pk of the PACERFreeDocumentRow whose error_msg is set
    when the download fails for good.
    :param index: Passed through to RECAPDocument.save().
    :return: None if there was nothing to do or the download was aborted,
    otherwise a dict with the keys 'result' and 'rd_pk'.
    """
    if data is None:
        return
    result = data['result']
    rd = RECAPDocument.objects.get(pk=data['rd_pk'])
    report = FreeOpinionReport(data['pacer_court_id'], session)
    try:
        r = report.download_pdf(result.pacer_case_id, result.pacer_doc_id)
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        logger.warning("Unable to get PDF for %s", result)
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in (HTTP_500_INTERNAL_SERVER_ERROR,
                                        HTTP_504_GATEWAY_TIMEOUT):
            logger.warning("Ran into HTTPError: %s. Retrying.",
                           exc.response.status_code)
            # The exception has to be passed by keyword: the first positional
            # parameter of retry() is the task's replacement args, not exc.
            raise self.retry(exc=exc)
        msg = "Ran into unknown HTTPError. %s. Aborting." % \
            exc.response.status_code
        logger.error(msg)
        PACERFreeDocumentRow.objects.filter(pk=row_pk).update(error_msg=msg)
        # Drop the callbacks so the rest of the chain does not run.
        self.request.callbacks = None
        return

    if r is None:
        msg = "Unable to get PDF for %s at %s with doc id %s" % \
            (result, result.court_id, result.pacer_doc_id)
        logger.error(msg)
        PACERFreeDocumentRow.objects.filter(pk=row_pk).update(error_msg=msg)
        # Drop the callbacks so the rest of the chain does not run.
        self.request.callbacks = None
        return

    file_name = get_document_filename(
        result.court.pk,
        result.pacer_case_id,
        result.document_number,
        0,  # Attachment number is zero for all free opinions.
    )
    cf = ContentFile(r.content)
    rd.filepath_local.save(file_name, cf, save=False)
    rd.is_available = True  # We've got the PDF.

    # request.content is sometimes a str, sometimes unicode, so
    # force it all to be bytes, pleasing hashlib.
    rd.sha1 = hashlib.sha1(force_bytes(r.content)).hexdigest()
    rd.is_free_on_pacer = True
    rd.page_count = get_page_count(rd.filepath_local.path, 'pdf')

    # Save and extract, skipping OCR.
    rd.save(do_extraction=False, index=index)
    extract_recap_pdf(rd.pk, skip_ocr=True, check_if_needed=False)
    return {'result': result, 'rd_pk': rd.pk}
def get_and_save_free_document_report(self, court_id, start, end, session):
    """Fetch the PACER free document report and store each listed row.

    :param self: The Celery task.
    :param court_id: A pacer court id.
    :param start: a date object representing the first day to get results.
    :param end: a date object representing the last day to get results.
    :param session: A PACER Session object
    :return: PACERFreeDocumentLog.SCRAPE_SUCCESSFUL once the rows have been
    processed, or PACERFreeDocumentLog.SCRAPE_FAILED if querying or parsing
    still fails on the final retry.
    """
    free_report = FreeOpinionReport(court_id, session)

    # Step 1: run the query, retrying on network trouble.
    try:
        raw_responses = free_report.query(start, end, sort='case_number')
    except (ConnectionError, ChunkedEncodingError, ReadTimeoutError,
            ReadTimeout, ConnectTimeout) as exc:
        warning = ("Unable to get free document report results from %s "
                   "(%s to %s). Trying again." % (court_id, start, end))
        logger.warning(warning)
        if self.request.retries != self.max_retries:
            raise self.retry(exc=exc, countdown=10)
        return PACERFreeDocumentLog.SCRAPE_FAILED

    # Step 2: parse what came back.
    try:
        parsed_rows = free_report.parse(raw_responses)
    except (IndexError, HTTPError) as exc:
        # IndexError: When the page isn't downloaded properly.
        # HTTPError: raise_for_status in parse hit bad status.
        if self.request.retries != self.max_retries:
            raise self.retry(exc=exc, countdown=10)
        return PACERFreeDocumentLog.SCRAPE_FAILED

    # Step 3: copy every parsed row into the database, field by field.
    copied_fields = (
        'court_id',
        'pacer_case_id',
        'docket_number',
        'case_name',
        'date_filed',
        'pacer_doc_id',
        'document_number',
        'description',
        'nature_of_suit',
        'cause',
    )
    for parsed_row in parsed_rows:
        try:
            PACERFreeDocumentRow.objects.create(
                **{name: getattr(parsed_row, name) for name in copied_fields}
            )
        except IntegrityError:
            # Duplicate for whatever reason.
            pass

    return PACERFreeDocumentLog.SCRAPE_SUCCESSFUL
def get_free_document_report(self, court_id, start, end, session):
    """Get structured results from the PACER free document report.

    :param self: The bound task; used to schedule retries.
    :param court_id: The court id handed to FreeOpinionReport.
    :param start: The start of the range to query.
    :param end: The end of the range to query.
    :param session: The PACER session handed to FreeOpinionReport.
    :return: Whatever report.parse() returns for the query responses.
    """
    report = FreeOpinionReport(court_id, session)
    try:
        responses = report.query(start, end, sort='case_number')
    except (ConnectionError, ChunkedEncodingError, ReadTimeoutError,
            ReadTimeout, ConnectTimeout) as exc:
        # ReadTimeout is included so that a read timeout is retried here
        # just like the other transient network failures.
        logger.warning("Unable to get free document report results from %s "
                       "(%s to %s). Trying again." % (court_id, start, end))
        raise self.retry(exc=exc, countdown=5)

    try:
        return report.parse(responses)
    except IndexError as exc:
        # Happens when the page isn't downloaded properly, ugh.
        raise self.retry(exc=exc, countdown=15)
def setUpClass(cls):
    """Load the court list and valid dates, and build one report per court.

    Sets cls.courts, cls.valid_dates and cls.reports; the latter maps each
    court id to a FreeOpinionReport sharing a single PACER session.
    """
    # Start with a session without credentials and swap in an authenticated
    # one only when both credentials are configured.
    pacer_session = PacerSession()
    if PACER_USERNAME and PACER_PASSWORD:
        # CAND chosen at random
        pacer_session = PacerSession(
            username=PACER_USERNAME,
            password=PACER_PASSWORD,
        )

    courts_path = os.path.join(JURISCRAPER_ROOT, 'pacer/courts.json')
    with open(courts_path) as courts_file:
        cls.courts = get_courts_from_json(json.load(courts_file))

    dates_path = os.path.join(TESTS_ROOT,
                              'fixtures/valid_free_opinion_dates.json')
    with open(dates_path) as dates_file:
        cls.valid_dates = json.load(dates_file)

    cls.reports = {}
    for court_entry in cls.courts:
        court_link = court_entry['court_link']
        court_key = get_court_id_from_url(court_link)
        cls.reports[court_key] = FreeOpinionReport(court_key, pacer_session)
def setUp(self):
    """Load the court list and valid dates, and build one report per court.

    Sets self.courts, self.valid_dates and self.reports; the latter maps each
    court id to a FreeOpinionReport sharing a single PACER session.
    """
    # Start with a bare session; when credentials are defined, replace it
    # with the configured session and log in.
    pacer_session = PacerSession()
    if pacer_credentials_are_defined():
        # CAND chosen at random
        pacer_session = get_pacer_session()
        pacer_session.login()

    courts_path = os.path.join(JURISCRAPER_ROOT, "pacer/courts.json")
    with open(courts_path) as courts_file:
        self.courts = get_courts_from_json(json.load(courts_file))

    dates_path = os.path.join(
        TESTS_ROOT_EXAMPLES_PACER, "dates/valid_free_opinion_dates.json"
    )
    with open(dates_path) as dates_file:
        self.valid_dates = json.load(dates_file)

    self.reports = {}
    for court_entry in self.courts:
        court_link = court_entry["court_link"]
        court_key = get_court_id_from_url(court_link)
        self.reports[court_key] = FreeOpinionReport(court_key, pacer_session)