Example #1
    def test_filesize_conversions(self):
        """Can we convert human filesizes to bytes?"""
        qa_pairs = [
            ('58 kb', 59392),
            ('117 kb', 119808),
            ('117kb', 119808),
            ('1 byte', 1),
            ('117 bytes', 117),
            ('117  bytes', 117),
            ('  117 bytes  ', 117),
            ('117b', 117),
            ('117bytes', 117),
            ('1 kilobyte', 1024),
            ('117 kilobytes', 119808),
            ('0.7 mb', 734003),
            ('1mb', 1048576),
            ('5.2 mb', 5452595),
        ]
        for qa in qa_pairs:
            print("Converting '%s' to bytes..." % qa[0], end='')
            self.assertEqual(convert_size_to_bytes(qa[0]), qa[1])
            print('✓')
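The test above pins down the conversion contract: binary (1024-based) units, short and long unit names, fractional values, and tolerance for extra whitespace. A minimal sketch of a convert_size_to_bytes that satisfies these cases might look like the following (hypothetical; the project's real implementation may differ):

import re

def convert_size_to_bytes(size_str):
    """Convert a human-readable size such as '5.2 mb' to bytes.

    A hypothetical sketch reverse-engineered from the test cases above.
    """
    multipliers = {
        "b": 1, "byte": 1, "bytes": 1,
        "kb": 1024, "kilobyte": 1024, "kilobytes": 1024,
        "mb": 1024 ** 2, "megabyte": 1024 ** 2, "megabytes": 1024 ** 2,
        "gb": 1024 ** 3, "gigabyte": 1024 ** 3, "gigabytes": 1024 ** 3,
    }
    match = re.match(r"([\d.]+)\s*([a-z]+)$", size_str.strip().lower())
    if match is None:
        raise ValueError("Unable to parse size: %s" % size_str)
    number, unit = match.groups()
    if unit not in multipliers:
        raise ValueError("Unknown unit: %s" % unit)
    # int() truncates, matching '0.7 mb' -> 734003 and '5.2 mb' -> 5452595.
    return int(float(number) * multipliers[unit])

Raising ValueError on unparseable input matches how the callers below wrap the function in a try/except ValueError.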
Example #2
def process_recap_zip(self, pk):
    """Process a zip uploaded from a PACER district court

    The general process is to use our existing infrastructure. We open the zip,
    identify the documents inside, and then associate them with the rest of our
    collection.

    :param self: A celery task object
    :param pk: The PK of the ProcessingQueue object to process
    :return: A dict with the IDs of the new PQs that were created, one per
    PDF enqueued, and the celery tasks processing them.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)

    logger.info("Processing RECAP zip (debug is: %s): %s", pq.debug, pq)
    with ZipFile(pq.filepath_local.path, "r") as archive:
        # Security: Check for zip bombs.
        max_file_size = convert_size_to_bytes("200MB")
        for zip_info in archive.infolist():
            if zip_info.file_size < max_file_size:
                continue
            mark_pq_status(
                pq,
                "Zip too large; possible zip bomb. File in zip named %s "
                "would be %s bytes expanded."
                % (zip_info.filename, zip_info.file_size),
                PROCESSING_STATUS.INVALID_CONTENT,
            )
            return {"new_pqs": [], "tasks": []}

        # For each document in the zip, create a new PQ
        new_pqs = []
        tasks = []
        for file_name in archive.namelist():
            file_content = archive.read(file_name)
            f = SimpleUploadedFile(file_name, file_content)

            file_name = file_name.split(".pdf")[0]
            if "-" in file_name:
                doc_num, att_num = file_name.split("-")
                if att_num == "main":
                    att_num = None
            else:
                doc_num = file_name
                att_num = None

            if att_num:
                # An attachment, ∴ nuke the pacer_doc_id value, since it
                # corresponds to the main doc only.
                pacer_doc_id = ""
            else:
                pacer_doc_id = pq.pacer_doc_id

            # Create a new PQ and enqueue it for processing
            new_pq = ProcessingQueue.objects.create(
                court=pq.court,
                uploader=pq.uploader,
                pacer_case_id=pq.pacer_case_id,
                pacer_doc_id=pacer_doc_id,
                document_number=doc_num,
                attachment_number=att_num,
                filepath_local=f,
                status=PROCESSING_STATUS.ENQUEUED,
                upload_type=UPLOAD_TYPE.PDF,
                debug=pq.debug,
            )
            new_pqs.append(new_pq.pk)
            tasks.append(process_recap_pdf.delay(new_pq.pk))

        # At the end, mark the pq as successful and return the PQ
        mark_pq_status(
            pq,
            "Successfully created ProcessingQueue objects: %s"
            % oxford_join(new_pqs),
            PROCESSING_STATUS.SUCCESSFUL,
        )

        # Returning the tasks allows tests to wait() for the PDFs to complete
        # before checking assertions.
        return {
            "new_pqs": new_pqs,
            "tasks": tasks,
        }
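The filename parsing in the loop above assumes a hyphenated naming convention inside the zip. A standalone sketch of that logic, with hypothetical member names, shows how each shape is interpreted:

def parse_member_name(file_name):
    # Same parsing as the loop above, extracted for illustration.
    file_name = file_name.split(".pdf")[0]
    if "-" in file_name:
        doc_num, att_num = file_name.split("-")
        if att_num == "main":
            att_num = None
    else:
        doc_num = file_name
        att_num = None
    return doc_num, att_num

assert parse_member_name("250.pdf") == ("250", None)       # main document
assert parse_member_name("250-main.pdf") == ("250", None)  # explicit main doc
assert parse_member_name("250-3.pdf") == ("250", "3")      # attachment 3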
Example #3
def process_recap_attachment(self, pk, tag_names=None):
    """Process an uploaded attachment page from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work on
    :param tag_names: A list of tag names to add to all items created or
    modified in this function.
    :return: None
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    att_page = AttachmentPage(map_cl_to_pacer_id(pq.court_id))
    with open(pq.filepath_local.path, "rb") as f:
        text = f.read().decode("utf-8")
    att_page._parse_text(text)
    att_data = att_page.data
    logger.info("Parsing completed for item %s" % pq)

    if att_data == {}:
        # Bad attachment page.
        msg = "Not a valid attachment page upload."
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return None

    if pq.pacer_case_id in ["undefined", "null"]:
        # Bad data from the client. Fix it with parsed data.
        pq.pacer_case_id = att_data.get("pacer_case_id")
        pq.save()

    # Merge the contents of the data into CL.
    try:
        params = {
            "pacer_doc_id": att_data["pacer_doc_id"],
            "docket_entry__docket__court": pq.court,
        }
        if pq.pacer_case_id:
            params["docket_entry__docket__pacer_case_id"] = pq.pacer_case_id
        main_rd = RECAPDocument.objects.get(**params)
    except RECAPDocument.MultipleObjectsReturned:
        # Unclear how to proceed and we don't want to associate this data with
        # the wrong case. We must punt.
        msg = (
            "Too many documents found when attempting to associate "
            "attachment data"
        )
        mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
        return None
    except RECAPDocument.DoesNotExist as exc:
        msg = "Could not find docket to associate with attachment metadata"
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    # We got the right item. Update/create all the attachments for
    # the docket entry.
    de = main_rd.docket_entry
    if att_data["document_number"] is None:
        # Bankruptcy attachment page. Use the document number from the Main doc
        att_data["document_number"] = main_rd.document_number

    rds_created = []
    if not pq.debug:
        # Save the old HTML to the docket entry.
        pacer_file = PacerHtmlFiles(
            content_object=de, upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE
        )
        pacer_file.filepath.save(
            "attachment_page.html",  # Irrelevant b/c UUIDFileSystemStorage
            ContentFile(text),
        )

        # Create/update the attachment items.
        tags = []
        if tag_names:
            for tag_name in tag_names:
                tag, _ = Tag.objects.get_or_create(name=tag_name)
                tags.append(tag)
        for attachment in att_data["attachments"]:
            if all(
                [
                    attachment["attachment_number"],
                    # Missing on sealed items.
                    attachment.get("pacer_doc_id", False),
                    # Missing on some restricted docs (see Juriscraper)
                    attachment["page_count"] is not None,
                    attachment["description"],
                ]
            ):
                rd, created = RECAPDocument.objects.update_or_create(
                    docket_entry=de,
                    document_number=att_data["document_number"],
                    attachment_number=attachment["attachment_number"],
                    document_type=RECAPDocument.ATTACHMENT,
                )
                if created:
                    rds_created.append(rd)
                needs_save = False
                for field in ["description", "pacer_doc_id"]:
                    if attachment[field]:
                        setattr(rd, field, attachment[field])
                        needs_save = True

                # Only set page_count and file_size if they're blank, in case
                # we got the real value by measuring.
                if rd.page_count is None:
                    rd.page_count = attachment["page_count"]
                    needs_save = True
                if rd.file_size is None and attachment["file_size_str"]:
                    try:
                        rd.file_size = convert_size_to_bytes(
                            attachment["file_size_str"]
                        )
                        needs_save = True
                    except ValueError:
                        pass

                if needs_save:
                    rd.save()
                if tags:
                    for tag in tags:
                        tag.tag_object(rd)

                # Do *not* do this async — that can cause race conditions.
                add_items_to_solr([rd.pk], "search.RECAPDocument")

    mark_pq_successful(pq, d_id=de.docket_id, de_id=de.pk)
    process_orphan_documents(
        rds_created, pq.court_id, main_rd.docket_entry.docket.date_filed
    )
    changed = mark_ia_upload_needed(de.docket)
    if changed:
        de.docket.save()
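The DoesNotExist branch above uses celery's bound-task retry idiom: mark the item failed only once retries are exhausted, otherwise requeue it. A self-contained sketch of that pattern, with hypothetical names (app, find_record, RecordNotFound) standing in for the project's real objects:

from celery import Celery

app = Celery("tasks", broker="memory://")  # hypothetical broker, for illustration

class RecordNotFound(Exception):
    """Stand-in for RECAPDocument.DoesNotExist."""

def find_record(pk):
    # Hypothetical lookup; the real task queries the Django ORM.
    raise RecordNotFound(pk)

@app.task(bind=True, max_retries=5)
def lookup_with_retry(self, pk):
    try:
        return find_record(pk)
    except RecordNotFound as exc:
        if self.request.retries == self.max_retries:
            return None  # permanent failure (FAILED in the original)
        # Transient failure (QUEUED_FOR_RETRY); celery re-enqueues the task.
        raise self.retry(exc=exc)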
Example #4
def merge_attachment_page_data(
    court: Court,
    pacer_case_id: int,
    pacer_doc_id: str,
    document_number: int,
    text: str,
    attachment_dicts: List[Dict[str, Union[int, str]]],
    debug: bool = False,
) -> Tuple[List[RECAPDocument], DocketEntry]:
    """Merge attachment page data into the docket

    :param court: The court object we're working with
    :param pacer_case_id: A PACER case ID
    :param pacer_doc_id: A PACER document ID
    :param document_number: The docket entry number
    :param text: The text of the attachment page
    :param attachment_dicts: A list of Juriscraper-parsed dicts for each
    attachment.
    :param debug: Whether to do saves during this process.
    :return: A list of RECAPDocuments modified or created during the process,
    and the DocketEntry object associated with the RECAPDocuments
    :raises: RECAPDocument.MultipleObjectsReturned, RECAPDocument.DoesNotExist
    """
    try:
        params = {
            "pacer_doc_id": pacer_doc_id,
            "docket_entry__docket__court": court,
        }
        if pacer_case_id:
            params["docket_entry__docket__pacer_case_id"] = pacer_case_id
        main_rd = RECAPDocument.objects.get(**params)
    except RECAPDocument.MultipleObjectsReturned as exc:
        # Unclear how to proceed and we don't want to associate this data with
        # the wrong case. We must punt.
        raise exc
    except RECAPDocument.DoesNotExist as exc:
        # Can't find the docket to associate with the attachment metadata
        # It may be possible to go look for orphaned documents at this stage
        # and to then add them here, as we do when adding dockets. This need is
        # particularly acute for those that get free look emails and then go to
        # the attachment page.
        raise exc

    # We got the right item. Update/create all the attachments for
    # the docket entry.
    de = main_rd.docket_entry
    if document_number is None:
        # Bankruptcy attachment page. Use the document number from the Main doc
        document_number = main_rd.document_number

    if debug:
        return [], de

    # Save the old HTML to the docket entry.
    pacer_file = PacerHtmlFiles(
        content_object=de, upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE
    )
    pacer_file.filepath.save(
        "attachment_page.html",  # Irrelevant b/c UUIDFileSystemStorage
        ContentFile(text),
    )

    # Create/update the attachment items.
    rds_created = []
    rds_affected = []
    for attachment in attachment_dicts:
        sanity_checks = [
            attachment["attachment_number"],
            # Missing on sealed items.
            attachment.get("pacer_doc_id", False),
            # Missing on some restricted docs (see Juriscraper)
            attachment["page_count"] is not None,
            attachment["description"],
        ]
        if not all(sanity_checks):
            continue

        rd, created = RECAPDocument.objects.update_or_create(
            docket_entry=de,
            document_number=document_number,
            attachment_number=attachment["attachment_number"],
            document_type=RECAPDocument.ATTACHMENT,
        )
        if created:
            rds_created.append(rd)
        rds_affected.append(rd)

        for field in ["description", "pacer_doc_id"]:
            if attachment[field]:
                setattr(rd, field, attachment[field])

        # Only set page_count and file_size if they're blank, in case
        # we got the real value by measuring.
        if rd.page_count is None:
            rd.page_count = attachment["page_count"]
        if rd.file_size is None and attachment["file_size_str"]:
            try:
                rd.file_size = convert_size_to_bytes(
                    attachment["file_size_str"]
                )
            except ValueError:
                pass
        rd.save()

        # Do *not* do this async — that can cause race conditions.
        add_items_to_solr([rd.pk], "search.RECAPDocument")

    mark_ia_upload_needed(de.docket, save_docket=True)
    process_orphan_documents(
        rds_created, court.pk, main_rd.docket_entry.docket.date_filed
    )
    return rds_affected, de
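For reference, a hypothetical invocation of merge_attachment_page_data; the attachment dicts carry exactly the keys the function reads, and all values here are illustrative only:

attachment_dicts = [
    {
        "attachment_number": 1,
        "pacer_doc_id": "04505578698",  # hypothetical PACER document ID
        "page_count": 2,
        "description": "Exhibit A",
        "file_size_str": "0.7 mb",  # parsed via convert_size_to_bytes
    },
]
rds_affected, de = merge_attachment_page_data(
    court=court,                 # a Court instance
    pacer_case_id=12345,         # hypothetical case ID
    pacer_doc_id="04505578697",  # ID of the entry's main document
    document_number=20,
    text=attachment_page_html,   # raw HTML of the attachment page
    attachment_dicts=attachment_dicts,
    debug=False,
)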