Exemple #1
0
    def download_pdfs(self):
        """Download the first (presumably complaint) PDF to the downloads dir.

        Searches the "fjc" table of db/master.json for rows whose JSON field
        is "True" and whose PDF field is "False". For each such row the PDF
        is requested through FreeOpinionReport, written to
        downloads/pdf/pacer_complaint_<PACER_CASE_ID>.pdf, and the row is
        then flagged with PDF == "True".

        Rows for which no response comes back are skipped and left unflagged,
        so they are still selected by the same search on a later run.

        :return: None
        """
        q = Query()
        # Context manager so the underlying storage is closed when we finish
        # (or when an exception escapes the loop).
        with TinyDB("db/master.json") as db:
            fjc_table = db.table("fjc")
            pending = fjc_table.search((q.JSON == "True") & (q.PDF == "False"))
            for row in pending:
                case_id = row["PACER_CASE_ID"]
                logging.info("Collecting PDF #%s, in %s", case_id, row["TITLE"])
                report = FreeOpinionReport(row["COURT"], self.s)
                r = report.download_pdf(case_id, row["pacer_doc_id"])
                if r is None:
                    # NOTE(review): download_pdf presumably returns None when
                    # no PDF could be fetched; do not crash or flag the row.
                    logging.warning(
                        "No PDF returned for #%s, in %s", case_id, row["TITLE"]
                    )
                    continue

                # r.content is raw bytes, so the file must be opened in
                # binary mode; text mode fails on Python 3.
                with open(
                    "downloads/pdf/pacer_complaint_%s.pdf" % case_id, "wb"
                ) as file:
                    file.write(r.content)

                fjc_table.update(
                    {"PDF": "True"}, doc_ids=[row.doc_id],
                )
                logging.info("Collected PDF #%s, in %s", case_id, row["TITLE"])
Exemple #2
0
def get_and_process_pdf(self, data, session, row_pk, index=False):
    """Download a free-opinion PDF and attach it to its RECAPDocument.

    ``self`` is the bound task instance (``self.retry`` and ``self.request``
    are used below), presumably a Celery task declared with ``bind=True``.

    :param data: Dict with the keys 'result', 'rd_pk' and 'pacer_court_id',
    or None, in which case nothing is done.
    :param session: Session object handed to FreeOpinionReport.
    :param row_pk: PK of the PACERFreeDocumentRow whose error_msg is set when
    the download fails permanently.
    :param index: Passed through to RECAPDocument.save().
    :return: A dict with 'result' and 'rd_pk' on success. None when data is
    None or when the PDF could not be obtained; in the latter case the task's
    callbacks are cleared as well.
    """
    if data is None:
        return
    result = data['result']
    rd = RECAPDocument.objects.get(pk=data['rd_pk'])
    report = FreeOpinionReport(data['pacer_court_id'], session)
    try:
        r = report.download_pdf(result.pacer_case_id, result.pacer_doc_id)
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        logger.warning("Unable to get PDF for %s", result)
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [
                HTTP_500_INTERNAL_SERVER_ERROR, HTTP_504_GATEWAY_TIMEOUT
        ]:
            logger.warning("Ran into HTTPError: %s. Retrying.",
                           exc.response.status_code)
            # Must be passed by keyword: the first positional parameter of
            # retry() is the task's args, not the exception.
            raise self.retry(exc=exc)
        else:
            msg = "Ran into unknown HTTPError. %s. Aborting." % \
                  exc.response.status_code
            logger.error(msg)
            PACERFreeDocumentRow.objects.filter(pk=row_pk).update(
                error_msg=msg)
            # Stop the rest of the chain from running on a failed download.
            self.request.callbacks = None
            return

    if r is None:
        msg = "Unable to get PDF for %s at %s with doc id %s" % \
              (result, result.court_id, result.pacer_doc_id)
        logger.error(msg)
        PACERFreeDocumentRow.objects.filter(pk=row_pk).update(error_msg=msg)
        self.request.callbacks = None
        return

    file_name = get_document_filename(
        result.court.pk,
        result.pacer_case_id,
        result.document_number,
        0,  # Attachment number is zero for all free opinions.
    )
    cf = ContentFile(r.content)
    # save=False: the model itself is saved once, further down.
    rd.filepath_local.save(file_name, cf, save=False)
    rd.is_available = True  # We've got the PDF.

    # request.content is sometimes a str, sometimes unicode, so
    # force it all to be bytes, pleasing hashlib.
    rd.sha1 = hashlib.sha1(force_bytes(r.content)).hexdigest()
    rd.is_free_on_pacer = True
    rd.page_count = get_page_count(rd.filepath_local.path, 'pdf')

    # Save and extract, skipping OCR.
    rd.save(do_extraction=False, index=index)
    extract_recap_pdf(rd.pk, skip_ocr=True, check_if_needed=False)
    return {'result': result, 'rd_pk': rd.pk}
Exemple #3
0
def get_and_process_pdf(self, data, session, row_pk, index=False):
    """Fetch a free-opinion PDF, store it on the RECAPDocument and extract it.

    ``self`` is the bound task instance (``self.retry`` and ``self.request``
    are used below), presumably a Celery task declared with ``bind=True``.

    :param data: Dict with the keys 'result', 'rd_pk' and 'pacer_court_id',
    or None, in which case nothing is done.
    :param session: Session object handed to FreeOpinionReport.
    :param row_pk: PK of the PACERFreeDocumentRow whose error_msg is set when
    the download fails permanently.
    :param index: Passed through to RECAPDocument.save().
    :return: A dict with 'result' and 'rd_pk' on success. None when data is
    None or when the PDF could not be obtained; in the latter case the task's
    callbacks are cleared as well.
    """
    if data is None:
        return
    result = data['result']
    rd = RECAPDocument.objects.get(pk=data['rd_pk'])
    report = FreeOpinionReport(data['pacer_court_id'], session)
    try:
        r = report.download_pdf(result.pacer_case_id, result.pacer_doc_id)
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        logger.warning("Unable to get PDF for %s", result)
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [HTTP_500_INTERNAL_SERVER_ERROR,
                                        HTTP_504_GATEWAY_TIMEOUT]:
            logger.warning("Ran into HTTPError: %s. Retrying.",
                           exc.response.status_code)
            # Must be passed by keyword: the first positional parameter of
            # retry() is the task's args, not the exception.
            raise self.retry(exc=exc)
        else:
            msg = "Ran into unknown HTTPError. %s. Aborting." % \
                  exc.response.status_code
            logger.error(msg)
            PACERFreeDocumentRow.objects.filter(pk=row_pk).update(error_msg=msg)
            # Stop the rest of the chain from running on a failed download.
            self.request.callbacks = None
            return

    if r is None:
        msg = "Unable to get PDF for %s at %s with doc id %s" % \
              (result, result.court_id, result.pacer_doc_id)
        logger.error(msg)
        PACERFreeDocumentRow.objects.filter(pk=row_pk).update(error_msg=msg)
        self.request.callbacks = None
        return

    file_name = get_document_filename(
        result.court.pk,
        result.pacer_case_id,
        result.document_number,
        0,  # Attachment number is zero for all free opinions.
    )
    cf = ContentFile(r.content)
    # save=False: the model itself is saved once, further down.
    rd.filepath_local.save(file_name, cf, save=False)
    rd.is_available = True  # We've got the PDF.

    # request.content is sometimes a str, sometimes unicode, so
    # force it all to be bytes, pleasing hashlib.
    rd.sha1 = hashlib.sha1(force_bytes(r.content)).hexdigest()
    rd.is_free_on_pacer = True
    rd.page_count = get_page_count(rd.filepath_local.path, 'pdf')

    # Save and extract, skipping OCR.
    rd.save(do_extraction=False, index=index)
    extract_recap_pdf(rd.pk, skip_ocr=True, check_if_needed=False)
    return {'result': result, 'rd_pk': rd.pk}
Exemple #4
0
def download_pacer_pdf_by_rd(self, rd_pk, pacer_case_id, pacer_doc_id,
                             cookies):
    """Using a RECAPDocument object ID, download its PDF from PACER.

    ``self`` is the bound task instance (``self.retry``, ``self.request`` and
    ``self.max_retries`` are used below), presumably a Celery task declared
    with ``bind=True``.

    Retryable failures (HTTP 500/504 and other requests exceptions) are
    retried until ``self.max_retries`` is reached; after that, and on any
    other HTTP error, the task's callbacks are cleared and None is returned.

    :param rd_pk: The PK of the RECAPDocument to download
    :param pacer_case_id: The internal PACER case ID number
    :param pacer_doc_id: The internal PACER document ID to download
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a
    logged-in PACER user.
    :return: requests.Response object usually containing a PDF, or None if that
    wasn't possible.
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    pacer_court_id = map_cl_to_pacer_id(rd.docket_entry.docket.court_id)
    s = PacerSession(cookies=cookies)
    report = FreeOpinionReport(pacer_court_id, s)
    try:
        r = report.download_pdf(pacer_case_id, pacer_doc_id)
    except HTTPError as exc:
        if exc.response.status_code in [
                HTTP_500_INTERNAL_SERVER_ERROR, HTTP_504_GATEWAY_TIMEOUT
        ]:
            logger.warning(
                "Ran into HTTPError while getting PDF: %s. "
                "Retrying.", exc.response.status_code)
            if self.request.retries == self.max_retries:
                # Out of retries: stop the rest of the chain and give up.
                self.request.callbacks = None
                return
            # Must be passed by keyword: the first positional parameter of
            # retry() is the task's args, not the exception.
            raise self.retry(exc=exc)
        else:
            logger.error(
                "Ran into unknown HTTPError while getting PDF: %s. "
                "Aborting.", exc.response.status_code)
            self.request.callbacks = None
            return
    except requests.RequestException as exc:
        logger.warning("Unable to get PDF for %s in %s", pacer_doc_id,
                       pacer_case_id)
        if self.request.retries == self.max_retries:
            # Out of retries: stop the rest of the chain and give up.
            self.request.callbacks = None
            return
        raise self.retry(exc=exc)
    return r