def get_and_process_pdf(self, data, session, row_pk, index=False):
    """Download the PDF for a free PACER document and attach it to its
    RECAPDocument.

    Bound Celery task: ``self`` is the task instance, used for ``retry()``
    and for clearing ``self.request.callbacks`` to abort the rest of a
    task chain on unrecoverable errors.

    :param data: Dict with keys 'result' (the free-document row object),
        'rd_pk' (PK of the RECAPDocument) and 'pacer_court_id', or None to
        no-op (e.g. when an upstream task aborted).
    :param session: A logged-in PACER session for FreeOpinionReport.
    :param row_pk: PK of the PACERFreeDocumentRow to record errors on.
    :param index: Whether to index the RECAPDocument when saving.
    :return: {'result': ..., 'rd_pk': ...} on success; None on abort.
    """
    if data is None:
        # Upstream task produced nothing; nothing to do.
        return
    result = data['result']
    rd = RECAPDocument.objects.get(pk=data['rd_pk'])
    report = FreeOpinionReport(data['pacer_court_id'], session)
    try:
        r = report.download_pdf(result.pacer_case_id, result.pacer_doc_id)
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        # Transient network trouble: let Celery reschedule this task.
        logger.warning("Unable to get PDF for %s" % result)
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [HTTP_500_INTERNAL_SERVER_ERROR,
                                        HTTP_504_GATEWAY_TIMEOUT]:
            # PACER-side hiccup; worth retrying.
            logger.warning("Ran into HTTPError: %s. Retrying." %
                           exc.response.status_code)
            raise self.retry(exc)
        else:
            # Unknown HTTP failure: record it on the row and abort the
            # remainder of the chain.
            msg = "Ran into unknown HTTPError. %s. Aborting." % \
                  exc.response.status_code
            logger.error(msg)
            PACERFreeDocumentRow.objects.filter(pk=row_pk).update(
                error_msg=msg)
            self.request.callbacks = None
            return

    if r is None:
        # Download "succeeded" but produced no response (e.g. sealed or
        # unavailable document). Record and abort the chain.
        msg = "Unable to get PDF for %s at %s with doc id %s" % \
              (result, result.court_id, result.pacer_doc_id)
        logger.error(msg)
        PACERFreeDocumentRow.objects.filter(pk=row_pk).update(error_msg=msg)
        self.request.callbacks = None
        return

    file_name = get_document_filename(
        result.court.pk,
        result.pacer_case_id,
        result.document_number,
        0,  # Attachment number is zero for all free opinions.
    )
    cf = ContentFile(r.content)
    # save=False: defer the DB write until all fields below are set.
    rd.filepath_local.save(file_name, cf, save=False)
    rd.is_available = True  # We've got the PDF.

    # request.content is sometimes a str, sometimes unicode, so
    # force it all to be bytes, pleasing hashlib.
    rd.sha1 = hashlib.sha1(force_bytes(r.content)).hexdigest()
    rd.is_free_on_pacer = True
    rd.page_count = get_page_count(rd.filepath_local.path, 'pdf')

    # Save and extract, skipping OCR.
    rd.save(do_extraction=False, index=index)
    extract_recap_pdf(rd.pk, skip_ocr=True, check_if_needed=False)
    return {'result': result, 'rd_pk': rd.pk}
def make_recap_document(self, doc_node, docket_entry, entry_number,
                        attachment_number, document_type, debug):
    """Do nothing for items that don't start with zero. For ones that do,
    find the stripped version, fix it, download the correct item, extract
    it and finally save it to Solr.

    :param doc_node: Parsed document node (unused in this code path —
        presumably kept for interface compatibility; verify against callers).
    :param docket_entry: The DocketEntry the document belongs to.
    :param entry_number: The zero-padded document number as a string.
    :param attachment_number: The attachment number, or a falsy value for
        main documents.
    :param document_type: The document type (unused here; see note above).
    :param debug: When truthy, log what would happen but skip downloads
        and saves.
    :return: The updated RECAPDocument, or None if skipped/failed.
    """
    if not entry_number.startswith('0'):
        # Only touch things where the new value leads with a zero.
        return None
    else:
        logger.info(" Doing docket_entry: %s, document_number, "
                    "%s and attachment number: %s" %
                    (docket_entry, entry_number, attachment_number))

    # The broken rows were stored with the leading zero(s) stripped, i.e.
    # as the integer form of the number.
    old_entry_number = int(entry_number)

    try:
        rd = RECAPDocument.objects.get(
            docket_entry=docket_entry,
            document_number=old_entry_number,
            # Falsy attachment numbers are stored as NULL.
            attachment_number=attachment_number or None,
        )
        logger.info(" Found item.")
    except RECAPDocument.DoesNotExist:
        logger.info(" Failed to find item.")
        return None

    # Restore the zero-padded document number.
    rd.document_number = entry_number
    if rd.is_available:
        new_ia = get_ia_document_url_from_path(self.path, entry_number,
                                               attachment_number)
        logger.info(" Updating IA URL from %s to %s" %
                    (rd.filepath_ia, new_ia))
        rd.filepath_ia = new_ia

        if not os.path.isfile(rd.filepath_local.path):
            # Set the value correctly and get the file from IA if we don't
            # already have it.
            new_local_path = os.path.join(
                'recap',
                get_local_document_url_from_path(self.path, entry_number,
                                                 attachment_number),
            )
            logger.info(" Updating local path from %s to %s" %
                        (rd.filepath_local, new_local_path))
            rd.filepath_local = new_local_path
            filename = rd.filepath_ia.rsplit('/', 1)[-1]
            logger.info(" Downloading item with filename %s" % filename)
            if not debug:
                download_recap_item(rd.filepath_ia, filename)
        else:
            logger.info(" File already on disk. Punting.")

        if rd.page_count is None:
            logger.info(" Getting page count.")
            extension = rd.filepath_local.path.split('.')[-1]
            rd.page_count = get_page_count(rd.filepath_local.path, extension)
    else:
        logger.info(" Item not available in RECAP. Punting.")
        return None

    if not debug:
        try:
            extract_recap_pdf(rd.pk, check_if_needed=False)
            rd.save(do_extraction=False, index=True)
            logger.info(" Item saved at https://www.courtlistener.com%s" %
                        rd.get_absolute_url())
        except IntegrityError:
            # The corrected number collides with an existing row.
            logger.info(" Integrity error while saving.")
            return None
    else:
        logger.info(" No save requested in debug mode.")

    return rd
def process_recap_pdf(self, pk):
    """Process an uploaded PDF from the RECAP API endpoint and save it to
    the database.

    Bound Celery task: ``self`` is the task instance, used for ``retry()``.

    :param pk: The PK of the processing queue item you want to work on.
    :return: A RECAPDocument object that was created or updated, or None on
        failure.
    """
    # NOTE(fix): the original had a second, stacked string literal after the
    # docstring ("Save a RECAP PDF to the database.") — a dead no-op
    # statement, now merged into the docstring above.
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)

    if pq.attachment_number is None:
        document_type = RECAPDocument.PACER_DOCUMENT
    else:
        document_type = RECAPDocument.ATTACHMENT

    logger.info("Processing RECAP item (debug is: %s): %s " % (pq.debug, pq))
    try:
        if pq.pacer_case_id:
            rd = RECAPDocument.objects.get(
                docket_entry__docket__pacer_case_id=pq.pacer_case_id,
                pacer_doc_id=pq.pacer_doc_id,
            )
        else:
            # Sometimes we don't have the case ID from PACER. Try to make
            # this work anyway.
            rd = RECAPDocument.objects.get(
                pacer_doc_id=pq.pacer_doc_id,
            )
    except (RECAPDocument.DoesNotExist, RECAPDocument.MultipleObjectsReturned):
        try:
            d = Docket.objects.get(pacer_case_id=pq.pacer_case_id,
                                   court_id=pq.court_id)
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully the
            # docket will be in place soon (it could be in a different
            # upload task that hasn't yet been processed).
            logger.warning("Unable to find docket for processing queue '%s'. "
                           "Retrying if max_retries is not exceeded." % pq)
            error_message = "Unable to find docket for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                mark_pq_status(pq, error_message, pq.PROCESSING_FAILED)
                return None
            else:
                mark_pq_status(pq, error_message, pq.QUEUED_FOR_RETRY)
                raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            return None
        else:
            # Got the Docket, attempt to get/create the DocketEntry, and
            # then create the RECAPDocument
            try:
                de = DocketEntry.objects.get(docket=d,
                                             entry_number=pq.document_number)
            except DocketEntry.DoesNotExist as exc:
                logger.warning("Unable to find docket entry for processing "
                               "queue '%s'. Retrying if max_retries is not "
                               "exceeded." % pq)
                pq.error_message = "Unable to find docket entry for item."
                if (self.request.retries == self.max_retries) or pq.debug:
                    pq.status = pq.PROCESSING_FAILED
                    pq.save()
                    return None
                else:
                    pq.status = pq.QUEUED_FOR_RETRY
                    pq.save()
                    raise self.retry(exc=exc)
            else:
                # If we're here, we've got the docket and docket entry, but
                # were unable to find the document by pacer_doc_id. This
                # happens when pacer_doc_id is missing, for example. ∴, try
                # to get the document from the docket entry.
                try:
                    rd = RECAPDocument.objects.get(
                        docket_entry=de,
                        document_number=pq.document_number,
                        attachment_number=pq.attachment_number,
                        document_type=document_type,
                    )
                except (RECAPDocument.DoesNotExist,
                        RECAPDocument.MultipleObjectsReturned):
                    # Unable to find it. Make a new item.
                    rd = RECAPDocument(
                        docket_entry=de,
                        pacer_doc_id=pq.pacer_doc_id,
                        date_upload=timezone.now(),
                        document_type=document_type,
                    )

    rd.document_number = pq.document_number
    rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    content = pq.filepath_local.read()
    new_sha1 = hashlib.sha1(content).hexdigest()
    existing_document = all([
        rd.sha1 == new_sha1,
        rd.is_available,
        rd.filepath_local and os.path.isfile(rd.filepath_local.path)
    ])
    if not existing_document:
        # Different sha1, it wasn't available, or it's missing from disk.
        # Move the new file over from the processing queue storage.
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        if not pq.debug:
            rd.filepath_local.save(file_name, cf, save=False)

            # Do page count and extraction
            extension = rd.filepath_local.path.split('.')[-1]
            rd.page_count = get_page_count(rd.filepath_local.path, extension)
            rd.ocr_status = None

    rd.is_available = True
    rd.sha1 = new_sha1

    if not pq.debug:
        try:
            rd.save()
        except IntegrityError:
            # Another task beat us to this unique_together combination.
            msg = "Duplicate key on unique_together constraint"
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            rd.filepath_local.delete(save=False)
            return None

    if not existing_document and not pq.debug:
        extract_recap_pdf(rd.pk)
        add_or_update_recap_document([rd.pk], force_commit=False)

    mark_pq_successful(pq, d_id=rd.docket_entry.docket_id,
                       de_id=rd.docket_entry_id, rd_id=rd.pk)
    return rd
def get_pacer_doc_by_rd_and_description(self, rd_pk, description_re, cookies,
                                        fallback_to_main_doc=False, tag=None):
    """Using a RECAPDocument object ID and a description of a document, get
    the document from PACER.

    This function was originally meant to get civil cover sheets, but can be
    repurposed as needed.

    :param rd_pk: The PK of a RECAPDocument object to use as a source.
    :param description_re: A compiled regular expression to search against
    the description provided by the attachment page.
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies of
    a logged-in PACER user.
    :param fallback_to_main_doc: Should we grab the main doc if none of the
    attachments match the regex?
    :param tag: A tag name to apply to any downloaded content.
    :return: None
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    att_report = get_attachment_page_by_rd(self, rd_pk, cookies)

    att_found = None
    for attachment in att_report.data.get('attachments', []):
        if description_re.search(attachment['description']):
            att_found = attachment.copy()
            document_type = RECAPDocument.ATTACHMENT
            break

    if not att_found:
        if fallback_to_main_doc:
            logger.info("Falling back to main document for pacer_doc_id: %s"
                        % rd.pacer_doc_id)
            att_found = att_report.data
            document_type = RECAPDocument.PACER_DOCUMENT
        else:
            msg = "Aborting. Did not find civil cover sheet for %s." % rd
            logger.error(msg)
            # Stop the rest of the task chain.
            self.request.callbacks = None
            return

    if not att_found.get('pacer_doc_id'):
        # Fix: logger.warn is a deprecated alias; use logger.warning.
        logger.warning("No pacer_doc_id for document (is it sealed?)")
        self.request.callbacks = None
        return

    # Try to find the attachment already in the collection
    rd, _ = RECAPDocument.objects.get_or_create(
        docket_entry=rd.docket_entry,
        attachment_number=att_found.get('attachment_number'),
        document_number=rd.document_number,
        pacer_doc_id=att_found['pacer_doc_id'],
        document_type=document_type,
        defaults={
            'date_upload': now(),
        },
    )
    # Replace the description if we have description data.
    # Else fallback on old.
    rd.description = att_found.get('description', '') or rd.description
    if tag is not None:
        tag, _ = Tag.objects.get_or_create(name=tag)
        tag.tag_object(rd)

    if rd.is_available:
        # Great. Call it a day.
        rd.save()
        return

    pacer_case_id = rd.docket_entry.docket.pacer_case_id
    r = download_pacer_pdf_by_rd(rd.pk, pacer_case_id,
                                 att_found['pacer_doc_id'], cookies)
    court_id = rd.docket_entry.docket.court_id
    success, msg = update_rd_metadata(
        self, rd_pk, r, court_id, pacer_case_id, rd.pacer_doc_id,
        rd.document_number, rd.attachment_number)
    if success is False:
        return

    # Skip OCR for now. It'll happen in a second step.
    extract_recap_pdf(rd.pk, skip_ocr=True)
    add_or_update_recap_document([rd.pk])
def get_pacer_doc_by_rd_and_description(self, rd_pk, description_re, session,
                                        fallback_to_main_doc=False, tag=None):
    """Using a RECAPDocument object ID and a description of a document, get
    the document from PACER.

    This function was originally meant to get civil cover sheets, but can be
    repurposed as needed.

    :param rd_pk: The PK of a RECAPDocument object to use as a source.
    :param description_re: A compiled regular expression to search against
    the description provided by the attachment page.
    :param session: The PACER session object to use.
    :param fallback_to_main_doc: Should we grab the main doc if none of the
    attachments match the regex?
    :param tag: A tag name to apply to any downloaded content.
    :return: None
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    if not rd.pacer_doc_id:
        # Some docket entries are just text/don't have a pacer_doc_id.
        self.request.callbacks = None
        return

    d = rd.docket_entry.docket
    pacer_court_id = map_cl_to_pacer_id(d.court_id)
    att_report = AttachmentPage(pacer_court_id, session)
    try:
        att_report.query(rd.pacer_doc_id)
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        # Transient network trouble; let Celery reschedule.
        logger.warning("Unable to get PDF for %s" % rd)
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [
            HTTP_500_INTERNAL_SERVER_ERROR,
            HTTP_504_GATEWAY_TIMEOUT
        ]:
            logger.warning("Ran into HTTPError: %s. Retrying." %
                           exc.response.status_code)
            raise self.retry(exc)
        else:
            msg = "Ran into unknown HTTPError. %s. Aborting." % \
                  exc.response.status_code
            logger.error(msg)
            self.request.callbacks = None
            return

    att_found = None
    for attachment in att_report.data.get('attachments', []):
        if description_re.search(attachment['description']):
            att_found = attachment.copy()
            document_type = RECAPDocument.ATTACHMENT
            break

    if not att_found:
        if fallback_to_main_doc:
            logger.info("Falling back to main document for pacer_doc_id: %s"
                        % rd.pacer_doc_id)
            att_found = att_report.data
            document_type = RECAPDocument.PACER_DOCUMENT
        else:
            msg = "Aborting. Did not find civil cover sheet for %s." % rd
            logger.error(msg)
            self.request.callbacks = None
            return

    if not att_found.get('pacer_doc_id'):
        # Fix: logger.warn is a deprecated alias; use logger.warning.
        logger.warning("No pacer_doc_id for document (is it sealed?)")
        self.request.callbacks = None
        return

    # Try to find the attachment already in the collection
    rd, _ = RECAPDocument.objects.get_or_create(
        docket_entry=rd.docket_entry,
        attachment_number=att_found.get('attachment_number'),
        document_number=rd.document_number,
        pacer_doc_id=att_found['pacer_doc_id'],
        document_type=document_type,
        defaults={
            'date_upload': now(),
        },
    )
    # Replace the description if we have description data. Else fallback on old.
    rd.description = att_found.get('description', '') or rd.description
    if tag is not None:
        tag, _ = Tag.objects.get_or_create(name=tag)
        rd.tags.add(tag)

    if rd.is_available:
        # Great. Call it a day.
        rd.save(do_extraction=False, index=False)
        return

    # Not available. Go get it.
    try:
        pacer_case_id = rd.docket_entry.docket.pacer_case_id
        r = att_report.download_pdf(pacer_case_id, att_found['pacer_doc_id'])
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        logger.warning("Unable to get PDF for %s" % att_found['pacer_doc_id'])
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [
            HTTP_500_INTERNAL_SERVER_ERROR,
            HTTP_504_GATEWAY_TIMEOUT
        ]:
            logger.warning("Ran into HTTPError: %s. Retrying." %
                           exc.response.status_code)
            raise self.retry(exc)
        else:
            msg = "Ran into unknown HTTPError. %s. Aborting." % \
                  exc.response.status_code
            logger.error(msg)
            self.request.callbacks = None
            return

    if r is None:
        msg = "Unable to get PDF for %s at PACER court '%s' with doc id %s" % \
              (rd, pacer_court_id, rd.pacer_doc_id)
        logger.error(msg)
        self.request.callbacks = None
        return

    file_name = get_document_filename(
        d.court_id,
        pacer_case_id,
        rd.document_number,
        rd.attachment_number,
    )
    cf = ContentFile(r.content)
    rd.filepath_local.save(file_name, cf, save=False)
    rd.is_available = True  # We've got the PDF.

    # request.content is sometimes a str, sometimes unicode, force it all to
    # be bytes, pleasing hashlib.
    rd.sha1 = hashlib.sha1(force_bytes(r.content)).hexdigest()
    rd.page_count = get_page_count(rd.filepath_local.path, 'pdf')

    # Save, extract, then save to Solr. Skip OCR for now. Don't do these
    # async.
    rd.save(do_extraction=False, index=False)
    extract_recap_pdf(rd.pk, skip_ocr=True)
    add_or_update_recap_document([rd.pk])
def process_recap_pdf(self, pk):
    """Save a RECAP PDF to the database.

    Bound Celery task. Looks up the ProcessingQueue item, finds or creates
    the matching RECAPDocument, and moves the uploaded file into place.

    :param pk: The PK of the ProcessingQueue item to process.
    :return: The RECAPDocument created/updated, or None on failure.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    pq.status = pq.PROCESSING_IN_PROGRESS
    pq.save()
    logger.info("Processing RECAP item: %s" % pq)
    try:
        # Common case: the document already exists, matched by its PACER
        # doc ID within the right case.
        rd = RECAPDocument.objects.get(
            docket_entry__docket__pacer_case_id=pq.pacer_case_id,
            pacer_doc_id=pq.pacer_doc_id,
        )
    except RECAPDocument.DoesNotExist:
        try:
            d = Docket.objects.get(pacer_case_id=pq.pacer_case_id,
                                   court_id=pq.court_id)
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully the docket
            # will be in place soon (it could be in a different upload task
            # that hasn't yet been processed).
            logger.warning("Unable to find docket for processing queue '%s'. "
                           "Retrying if max_retries is not exceeded." % pq)
            pq.error_message = "Unable to find docket for item."
            if self.request.retries == self.max_retries:
                pq.status = pq.PROCESSING_FAILED
            else:
                pq.status = pq.QUEUED_FOR_RETRY
            pq.save()
            # NOTE: Celery's retry() raises rather than rescheduling once
            # retries are exhausted, so the FAILED status above sticks.
            raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            logger.error(msg)
            pq.error_message = msg
            pq.status = pq.PROCESSING_FAILED
            pq.save()
            return None
        else:
            # Got the Docket, attempt to get/create the DocketEntry, and then
            # create the RECAPDocument
            try:
                de = DocketEntry.objects.get(docket=d,
                                             entry_number=pq.document_number)
            except DocketEntry.DoesNotExist as exc:
                logger.warning("Unable to find docket entry for processing "
                               "queue '%s'. Retrying if max_retries is not "
                               "exceeded." % pq)
                pq.error_message = "Unable to find docket entry for item."
                if self.request.retries == self.max_retries:
                    pq.status = pq.PROCESSING_FAILED
                else:
                    pq.status = pq.QUEUED_FOR_RETRY
                pq.save()
                raise self.retry(exc=exc)

            # All objects accounted for. Make some data.
            rd = RECAPDocument(
                docket_entry=de,
                pacer_doc_id=pq.pacer_doc_id,
                date_upload=timezone.now(),
            )
            if pq.attachment_number is None:
                rd.document_type = RECAPDocument.PACER_DOCUMENT
            else:
                rd.document_type = RECAPDocument.ATTACHMENT

    rd.document_number = pq.document_number
    rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    content = pq.filepath_local.read()
    new_sha1 = hashlib.sha1(content).hexdigest()
    if all([rd.sha1 == new_sha1,
            rd.is_available,
            rd.filepath_local and os.path.isfile(rd.filepath_local.path)]):
        # All good. Press on.
        new_document = False
    else:
        # Different sha1, it wasn't available, or it's missing from disk. Move
        # the new file over from the processing queue storage.
        new_document = True
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        # save=False defers the DB write until rd.save() below.
        rd.filepath_local.save(file_name, cf, save=False)
        rd.is_available = True
        rd.sha1 = new_sha1

        # Do page count and extraction
        extension = rd.filepath_local.path.split('.')[-1]
        rd.page_count = get_page_count(rd.filepath_local.path, extension)
        rd.ocr_status = None

    # Ditch the original file
    pq.filepath_local.delete(save=False)
    pq.error_message = ''  # Clear out errors b/c successful
    pq.status = pq.PROCESSING_SUCCESSFUL
    pq.save()

    rd.save()
    if new_document:
        extract_recap_pdf(rd.pk)
        add_or_update_recap_document([rd.pk], force_commit=False)
    return rd
def process_recap_pdf(self, pk):
    """Process an uploaded PDF from the RECAP API endpoint and save it to
    the database.

    Bound Celery task: ``self`` is the task instance, used for ``retry()``.

    :param pk: The PK of the processing queue item you want to work on.
    :return: A RECAPDocument object that was created or updated, or None on
        failure.
    """
    # NOTE(fix): the original had a second, stacked string literal after the
    # docstring ("Save a RECAP PDF to the database.") — a dead no-op
    # statement, now merged into the docstring above.
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)

    if pq.attachment_number is None:
        document_type = RECAPDocument.PACER_DOCUMENT
    else:
        document_type = RECAPDocument.ATTACHMENT

    logger.info("Processing RECAP item (debug is: %s): %s " % (pq.debug, pq))
    try:
        if pq.pacer_case_id:
            rd = RECAPDocument.objects.get(
                docket_entry__docket__pacer_case_id=pq.pacer_case_id,
                pacer_doc_id=pq.pacer_doc_id,
            )
        else:
            # Sometimes we don't have the case ID from PACER. Try to make
            # this work anyway.
            rd = RECAPDocument.objects.get(pacer_doc_id=pq.pacer_doc_id)
    except (RECAPDocument.DoesNotExist, RECAPDocument.MultipleObjectsReturned):
        try:
            d = Docket.objects.get(pacer_case_id=pq.pacer_case_id,
                                   court_id=pq.court_id)
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully
            # the docket will be in place soon (it could be in a
            # different upload task that hasn't yet been processed).
            logger.warning("Unable to find docket for processing queue '%s'. "
                           "Retrying if max_retries is not exceeded." % pq)
            error_message = "Unable to find docket for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                mark_pq_status(pq, error_message, PROCESSING_STATUS.FAILED)
                return None
            else:
                mark_pq_status(pq, error_message,
                               PROCESSING_STATUS.QUEUED_FOR_RETRY)
                raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None

        # Got the Docket, attempt to get/create the DocketEntry, and then
        # create the RECAPDocument
        try:
            de = DocketEntry.objects.get(docket=d,
                                         entry_number=pq.document_number)
        except DocketEntry.DoesNotExist as exc:
            logger.warning("Unable to find docket entry for processing "
                           "queue '%s'." % pq)
            msg = "Unable to find docket entry for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
                return None
            else:
                mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
                raise self.retry(exc=exc)
        else:
            # If we're here, we've got the docket and docket
            # entry, but were unable to find the document by
            # pacer_doc_id. This happens when pacer_doc_id is
            # missing, for example. ∴, try to get the document
            # from the docket entry.
            try:
                rd = RECAPDocument.objects.get(
                    docket_entry=de,
                    document_number=pq.document_number,
                    attachment_number=pq.attachment_number,
                    document_type=document_type,
                )
            except (
                RECAPDocument.DoesNotExist,
                RECAPDocument.MultipleObjectsReturned,
            ):
                # Unable to find it. Make a new item.
                rd = RECAPDocument(
                    docket_entry=de,
                    pacer_doc_id=pq.pacer_doc_id,
                    document_type=document_type,
                )

    rd.document_number = pq.document_number
    rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    try:
        content = pq.filepath_local.read()
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (exc.errno,
                                                       exc.strerror)
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    new_sha1 = sha1(content)
    existing_document = all([
        rd.sha1 == new_sha1,
        rd.is_available,
        rd.filepath_local,
    ])
    if not existing_document:
        # Different sha1, it wasn't available, or it's missing from disk.
        # Move the new file over from the processing queue storage.
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        if not pq.debug:
            rd.filepath_local.save(file_name, cf, save=False)

            # Do page count and extraction
            extension = rd.filepath_local.name.split(".")[-1]
            # Write the content to a real temp file so the page counter can
            # read it from disk (the storage backend may be remote).
            with NamedTemporaryFile(
                prefix="rd_page_count_",
                suffix=f".{extension}",
                buffering=0,
            ) as tmp:
                tmp.write(content)
                rd.page_count = get_page_count(tmp.name, extension)
            rd.file_size = rd.filepath_local.size
        rd.ocr_status = None

    rd.is_available = True
    rd.sha1 = new_sha1
    rd.date_upload = now()

    if not pq.debug:
        try:
            rd.save()
        except (IntegrityError, ValidationError):
            # Another task beat us to this unique_together combination.
            msg = "Duplicate key on unique_together constraint"
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            rd.filepath_local.delete(save=False)
            return None

    if not existing_document and not pq.debug:
        extract_recap_pdf(rd.pk)
        add_items_to_solr([rd.pk], "search.RECAPDocument")

    mark_pq_successful(
        pq,
        d_id=rd.docket_entry.docket_id,
        de_id=rd.docket_entry_id,
        rd_id=rd.pk,
    )
    mark_ia_upload_needed(rd.docket_entry.docket, save_docket=True)
    return rd
def make_recap_document(self, doc_node, docket_entry, entry_number,
                        attachment_number, document_type, debug):
    """Repair a RECAPDocument whose document number lost its leading zero.

    Entry numbers that do not begin with '0' are ignored. For the rest,
    look up the stripped (integer) form, restore the zero-padded number,
    fix the IA and local file paths, fetch the file when missing, extract
    its text and index it in Solr.
    """
    if not entry_number.startswith('0'):
        # Only touch things where the new value leads with a zero.
        return None

    logger.info(" Doing docket_entry: %s, document_number, "
                "%s and attachment number: %s" %
                (docket_entry, entry_number, attachment_number))

    # The broken rows were saved with the integer form of the number.
    stripped_number = int(entry_number)
    try:
        rd = RECAPDocument.objects.get(
            docket_entry=docket_entry,
            document_number=stripped_number,
            attachment_number=attachment_number or None,
        )
    except RECAPDocument.DoesNotExist:
        logger.info(" Failed to find item.")
        return None
    logger.info(" Found item.")

    rd.document_number = entry_number
    if not rd.is_available:
        logger.info(" Item not available in RECAP. Punting.")
        return None

    ia_url = get_ia_document_url_from_path(
        self.path, entry_number, attachment_number)
    logger.info(" Updating IA URL from %s to %s" %
                (rd.filepath_ia, ia_url))
    rd.filepath_ia = ia_url

    if os.path.isfile(rd.filepath_local.path):
        logger.info(" File already on disk. Punting.")
    else:
        # Set the value correctly and get the file from IA if we don't
        # already have it.
        local_path = os.path.join(
            'recap',
            get_local_document_url_from_path(self.path, entry_number,
                                             attachment_number),
        )
        logger.info(" Updating local path from %s to %s" %
                    (rd.filepath_local, local_path))
        rd.filepath_local = local_path
        ia_filename = rd.filepath_ia.rsplit('/', 1)[-1]
        logger.info(" Downloading item with filename %s" % ia_filename)
        if not debug:
            download_recap_item(rd.filepath_ia, ia_filename)

    if rd.page_count is None:
        logger.info(" Getting page count.")
        ext = rd.filepath_local.path.split('.')[-1]
        rd.page_count = get_page_count(rd.filepath_local.path, ext)

    if debug:
        logger.info(" No save requested in debug mode.")
        return rd

    try:
        extract_recap_pdf(rd.pk, check_if_needed=False)
        rd.save(do_extraction=False, index=True)
        logger.info(" Item saved at https://www.courtlistener.com%s" %
                    rd.get_absolute_url())
    except IntegrityError:
        logger.info(" Integrity error while saving.")
        return None
    return rd
def process_recap_pdf(self, pk):
    """Save a RECAP PDF to the database.

    Bound Celery task that processes one ProcessingQueue item: locate (or
    build) the RECAPDocument the upload belongs to, then store the file.

    :param pk: The PK of the ProcessingQueue item to process.
    :return: The RECAPDocument created/updated, or None on failure.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    pq.status = pq.PROCESSING_IN_PROGRESS
    pq.save()
    logger.info("Processing RECAP item: %s" % pq)
    try:
        rd = RECAPDocument.objects.get(
            docket_entry__docket__pacer_case_id=pq.pacer_case_id,
            pacer_doc_id=pq.pacer_doc_id,
        )
    except RECAPDocument.DoesNotExist:
        try:
            d = Docket.objects.get(pacer_case_id=pq.pacer_case_id,
                                   court_id=pq.court_id)
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully the docket
            # will be in place soon (it could be in a different upload task
            # that hasn't yet been processed).
            logger.warning("Unable to find docket for processing queue '%s'. "
                           "Retrying if max_retries is not exceeded." % pq)
            pq.error_message = "Unable to find docket for item."
            if self.request.retries == self.max_retries:
                # Final attempt: record the failure before retry() raises.
                pq.status = pq.PROCESSING_FAILED
            else:
                pq.status = pq.QUEUED_FOR_RETRY
            pq.save()
            raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            logger.error(msg)
            pq.error_message = msg
            pq.status = pq.PROCESSING_FAILED
            pq.save()
            return None
        else:
            # Got the Docket, attempt to get/create the DocketEntry, and then
            # create the RECAPDocument
            try:
                de = DocketEntry.objects.get(docket=d,
                                             entry_number=pq.document_number)
            except DocketEntry.DoesNotExist as exc:
                logger.warning("Unable to find docket entry for processing "
                               "queue '%s'. Retrying if max_retries is not "
                               "exceeded." % pq)
                pq.error_message = "Unable to find docket entry for item."
                if self.request.retries == self.max_retries:
                    pq.status = pq.PROCESSING_FAILED
                else:
                    pq.status = pq.QUEUED_FOR_RETRY
                pq.save()
                raise self.retry(exc=exc)

            # All objects accounted for. Make some data.
            rd = RECAPDocument(
                docket_entry=de,
                pacer_doc_id=pq.pacer_doc_id,
                date_upload=timezone.now(),
            )
            if pq.attachment_number is None:
                rd.document_type = RECAPDocument.PACER_DOCUMENT
            else:
                rd.document_type = RECAPDocument.ATTACHMENT

    rd.document_number = pq.document_number
    rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    content = pq.filepath_local.read()
    new_sha1 = hashlib.sha1(content).hexdigest()
    if all([
        rd.sha1 == new_sha1,
        rd.is_available,
        rd.filepath_local and os.path.isfile(rd.filepath_local.path)
    ]):
        # All good. Press on.
        new_document = False
    else:
        # Different sha1, it wasn't available, or it's missing from disk. Move
        # the new file over from the processing queue storage.
        new_document = True
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        # save=False: the DB row is written once, by rd.save() below.
        rd.filepath_local.save(file_name, cf, save=False)
        rd.is_available = True
        rd.sha1 = new_sha1

        # Do page count and extraction
        extension = rd.filepath_local.path.split('.')[-1]
        rd.page_count = get_page_count(rd.filepath_local.path, extension)
        rd.ocr_status = None

    # Ditch the original file
    pq.filepath_local.delete(save=False)
    pq.error_message = ''  # Clear out errors b/c successful
    pq.status = pq.PROCESSING_SUCCESSFUL
    pq.save()

    rd.save()
    if new_document:
        extract_recap_pdf(rd.pk)
        add_or_update_recap_document([rd.pk], force_commit=False)
    return rd