Example #1
def update_docket_metadata(d, docket_data):
    """Update the Docket object with the data from Juriscraper.

    Works on either docket history report or docket report (appellate
    or district) results.
    """
    d = update_case_names(d, docket_data['case_name'])
    mark_ia_upload_needed(d)
    d.docket_number = docket_data['docket_number'] or d.docket_number
    d.date_filed = docket_data['date_filed'] or d.date_filed
    d.date_last_filing = docket_data.get(
        'date_last_filing') or d.date_last_filing
    d.date_terminated = docket_data.get('date_terminated') or d.date_terminated
    d.cause = docket_data.get('cause') or d.cause
    d.nature_of_suit = docket_data.get('nature_of_suit') or d.nature_of_suit
    d.jury_demand = docket_data.get('jury_demand') or d.jury_demand
    d.jurisdiction_type = docket_data.get(
        'jurisdiction') or d.jurisdiction_type
    d.mdl_status = docket_data.get('mdl_status') or d.mdl_status
    judges = get_candidate_judges(docket_data.get('assigned_to_str'),
                                  d.court_id, docket_data['date_filed'])
    if judges is not None and len(judges) == 1:
        d.assigned_to = judges[0]
    d.assigned_to_str = docket_data.get('assigned_to_str') or ''
    judges = get_candidate_judges(docket_data.get('referred_to_str'),
                                  d.court_id, docket_data['date_filed'])
    if judges is not None and len(judges) == 1:
        d.referred_to = judges[0]
    d.referred_to_str = docket_data.get('referred_to_str') or ''
    d.blocked, d.date_blocked = get_blocked_status(d)

    return d
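
Every example on this page calls mark_ia_upload_needed, but none shows its body. Below is a hypothetical sketch inferred purely from the call sites: newer callers pass a save_docket flag, two callers use the boolean return value to decide whether to save, and Example #2 resets an ia_date_first_changed field after a successful upload. The real helper may well differ.

from django.utils.timezone import now

def mark_ia_upload_needed(d, save_docket=False):
    """Hypothetical sketch: flag a docket for re-upload to the Internet Archive."""
    changed = False
    if d.ia_date_first_changed is None:
        # Example #2 nulls this field after a successful upload, so it seems
        # to record when the docket first diverged from the archived copy.
        d.ia_date_first_changed = now()
        changed = True
    # `ia_needs_upload` is an assumed field name, not confirmed by the source.
    if not getattr(d, "ia_needs_upload", False):
        d.ia_needs_upload = True
        changed = True
    if changed and save_docket:
        d.save()
    return changed
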
Example #2
def upload_recap_json(self, pk):
    """Make a JSON object for a RECAP docket and upload it to IA"""
    # This is a pretty highly optimized query that uses only 13 hits to the DB
    # when generating a docket JSON rendering, regardless of how many related
    # objects the docket has such as docket entries, parties, etc.
    ds = Docket.objects.filter(pk=pk).select_related(
        'originating_court_information',
    ).prefetch_related(
        'panel',
        'parties__attorneys__roles',
        'parties__party_types__criminal_complaints',
        'parties__party_types__criminal_counts',
        # Django appears to have a bug where you can't defer a field on a
        # queryset where you prefetch the values. If you try to, it crashes.
        # We should be able to just do the prefetch below like the ones above
        # and then do the defer statement at the end, but that throws an error.
        Prefetch(
            'docket_entries__recap_documents',
            queryset=RECAPDocument.objects.all().defer('plain_text')
        )
    )
    d = ds[0]
    renderer = JSONRenderer()
    json_str = renderer.render(
        IADocketSerializer(d).data,
        accepted_media_type='application/json; indent=2',
    )

    file_name = get_docket_filename(d.court_id, d.pacer_case_id, 'json')
    bucket_name = get_bucket_name(d.court_id, d.pacer_case_id)
    responses = upload_to_ia(
        self,
        identifier=bucket_name,
        files={file_name: StringIO(json_str)},
        title=best_case_name(d),
        collection=settings.IA_COLLECTIONS,
        court_id=d.court_id,
        source_url='https://www.courtlistener.com%s' % d.get_absolute_url(),
        media_type='texts',
        description="This item represents a case in PACER, the U.S. "
                    "Government's website for federal case data. This "
                    "information is uploaded quarterly. To see our most "
                    "recent version please use the source url parameter, "
                    "linked below. To see the canonical source for this data, "
                    "please consult PACER directly.",
    )
    if responses is None:
        increment_failure_count(d)
        return

    if all(r.ok for r in responses):
        d.ia_upload_failure_count = None
        d.ia_date_first_changed = None
        d.filepath_ia_json = "https://archive.org/download/%s/%s" % (
            bucket_name, file_name)
        mark_ia_upload_needed(d)
        d.save()
    else:
        increment_failure_count(d)
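
Like mark_ia_upload_needed, the increment_failure_count helper is never shown on this page. Here is a hypothetical sketch consistent with the ia_upload_failure_count field that the success branch above resets to None; the real helper may also cap retries or log the failure.

def increment_failure_count(d):
    # Hypothetical sketch: failures presumably accumulate in this counter
    # until an upload succeeds, at which point the code above nulls it out.
    d.ia_upload_failure_count = (d.ia_upload_failure_count or 0) + 1
    d.save()
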
Example #3
def update_docket_metadata(d: Docket, docket_data: Dict[str, Any]) -> Docket:
    """Update the Docket object with the data from Juriscraper.

    Works on either docket history report or docket report (appellate
    or district) results.
    """
    d = update_case_names(d, docket_data["case_name"])
    mark_ia_upload_needed(d, save_docket=False)
    d.docket_number = docket_data["docket_number"] or d.docket_number
    d.date_filed = docket_data.get("date_filed") or d.date_filed
    d.date_last_filing = (
        docket_data.get("date_last_filing") or d.date_last_filing
    )
    d.date_terminated = docket_data.get("date_terminated") or d.date_terminated
    d.cause = docket_data.get("cause") or d.cause
    d.nature_of_suit = docket_data.get("nature_of_suit") or d.nature_of_suit
    d.jury_demand = docket_data.get("jury_demand") or d.jury_demand
    d.jurisdiction_type = (
        docket_data.get("jurisdiction") or d.jurisdiction_type
    )
    d.mdl_status = docket_data.get("mdl_status") or d.mdl_status
    judges = get_candidate_judges(
        docket_data.get("assigned_to_str"),
        d.court_id,
        docket_data.get("date_filed"),
    )
    if judges is not None and len(judges) == 1:
        d.assigned_to = judges[0]
    d.assigned_to_str = docket_data.get("assigned_to_str") or ""
    judges = get_candidate_judges(
        docket_data.get("referred_to_str"),
        d.court_id,
        docket_data.get("date_filed"),
    )
    if judges is not None and len(judges) == 1:
        d.referred_to = judges[0]
    d.referred_to_str = docket_data.get("referred_to_str") or ""
    d.blocked, d.date_blocked = get_blocked_status(d)

    return d
Example #4
def update_rd_metadata(self, rd_pk, response, court_id, pacer_case_id,
                       pacer_doc_id, document_number, attachment_number):
    """After querying PACER and downloading a document, save it to the DB.

    :param rd_pk: The primary key of the RECAPDocument to work on
    :param response: A requests.Response object containing the PDF data.
    :param court_id: A CourtListener court ID to use for file names.
    :param pacer_case_id: The pacer_case_id to use in error logs.
    :param pacer_doc_id: The pacer_doc_id to use in error logs.
    :param document_number: The docket entry number for use in file names.
    :param attachment_number: The attachment number (if applicable) for use in
    file names.
    :return: A two-tuple of a boolean indicating success and a corresponding
    error/success message string.
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    if response is None:
        msg = "Unable to get PDF for RECAP Document '%s' " \
              "at '%s' with doc id '%s'" % (rd_pk, court_id, pacer_doc_id)
        logger.error(msg)
        self.request.callbacks = None
        return False, msg

    file_name = get_document_filename(court_id, pacer_case_id, document_number,
                                      attachment_number)
    cf = ContentFile(response.content)
    rd.filepath_local.save(file_name, cf, save=False)
    rd.file_size = rd.filepath_local.size
    rd.is_available = True  # We've got the PDF.

    # request.content is sometimes a str, sometimes unicode, so
    # force it all to be bytes, pleasing hashlib.
    rd.sha1 = hashlib.sha1(force_bytes(response.content)).hexdigest()
    rd.page_count = get_page_count(rd.filepath_local.path, 'pdf')

    # Save and extract, skipping OCR.
    rd.save()

    # Make sure we mark the docket as needing upload
    changed = mark_ia_upload_needed(rd.docket_entry.docket)
    if changed:
        rd.docket_entry.docket.save()

    return True, 'Saved item successfully'
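
The hashlib comment above is worth isolating: in Python 3, hashlib.sha1() raises TypeError on str input, so content of uncertain type is coerced with Django's force_bytes first. A minimal standalone illustration of the same idiom:

import hashlib

from django.utils.encoding import force_bytes

def sha1_hexdigest(content):
    # force_bytes leaves bytes untouched and UTF-8-encodes str, so the digest
    # is stable regardless of which type the HTTP layer handed back.
    return hashlib.sha1(force_bytes(content)).hexdigest()

assert sha1_hexdigest("abc") == sha1_hexdigest(b"abc")
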
Example #5
def process_recap_pdf(self, pk):
    """Process an uploaded PDF from the RECAP API endpoint.

    :param pk: The PK of the processing queue item you want to work on.
    :return: A RECAPDocument object that was created or updated.
    """
    """Save a RECAP PDF to the database."""
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)

    if pq.attachment_number is None:
        document_type = RECAPDocument.PACER_DOCUMENT
    else:
        document_type = RECAPDocument.ATTACHMENT

    logger.info("Processing RECAP item (debug is: %s): %s " % (pq.debug, pq))
    try:
        if pq.pacer_case_id:
            rd = RECAPDocument.objects.get(
                docket_entry__docket__pacer_case_id=pq.pacer_case_id,
                pacer_doc_id=pq.pacer_doc_id,
            )
        else:
            # Sometimes we don't have the case ID from PACER. Try to make this
            # work anyway.
            rd = RECAPDocument.objects.get(pacer_doc_id=pq.pacer_doc_id)
    except (RECAPDocument.DoesNotExist, RECAPDocument.MultipleObjectsReturned):
        try:
            d = Docket.objects.get(
                pacer_case_id=pq.pacer_case_id, court_id=pq.court_id
            )
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully
            # the docket will be in place soon (it could be in a
            # different upload task that hasn't yet been processed).
            logger.warning(
                "Unable to find docket for processing queue '%s'. "
                "Retrying if max_retries is not exceeded." % pq
            )
            error_message = "Unable to find docket for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                mark_pq_status(pq, error_message, PROCESSING_STATUS.FAILED)
                return None
            else:
                mark_pq_status(
                    pq, error_message, PROCESSING_STATUS.QUEUED_FOR_RETRY
                )
                raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None

        # Got the Docket, attempt to get/create the DocketEntry, and then
        # create the RECAPDocument
        try:
            de = DocketEntry.objects.get(
                docket=d, entry_number=pq.document_number
            )
        except DocketEntry.DoesNotExist as exc:
            logger.warning(
                "Unable to find docket entry for processing "
                "queue '%s'." % pq
            )
            msg = "Unable to find docket entry for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
                return None
            else:
                mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
                raise self.retry(exc=exc)
        else:
            # If we're here, we've got the docket and docket
            # entry, but were unable to find the document by
            # pacer_doc_id. This happens when pacer_doc_id is
            # missing, for example. Therefore, try to get the document
            # from the docket entry.
            try:
                rd = RECAPDocument.objects.get(
                    docket_entry=de,
                    document_number=pq.document_number,
                    attachment_number=pq.attachment_number,
                    document_type=document_type,
                )
            except (
                RECAPDocument.DoesNotExist,
                RECAPDocument.MultipleObjectsReturned,
            ):
                # Unable to find it. Make a new item.
                rd = RECAPDocument(
                    docket_entry=de,
                    pacer_doc_id=pq.pacer_doc_id,
                    document_type=document_type,
                )

    rd.document_number = pq.document_number
    rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    try:
        content = pq.filepath_local.read()
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (exc.errno, exc.strerror)
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    new_sha1 = sha1(content)
    existing_document = all(
        [
            rd.sha1 == new_sha1,
            rd.is_available,
            rd.filepath_local and os.path.isfile(rd.filepath_local.path),
        ]
    )
    if not existing_document:
        # Different sha1, it wasn't available, or it's missing from disk. Move
        # the new file over from the processing queue storage.
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        if not pq.debug:
            rd.filepath_local.save(file_name, cf, save=False)

            # Do page count and extraction
            extension = rd.filepath_local.path.split(".")[-1]
            rd.page_count = get_page_count(rd.filepath_local.path, extension)
            rd.file_size = rd.filepath_local.size

        rd.ocr_status = None
        rd.is_available = True
        rd.sha1 = new_sha1
        rd.date_upload = now()

    if not pq.debug:
        try:
            rd.save()
        except (IntegrityError, ValidationError):
            msg = "Duplicate key on unique_together constraint"
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            rd.filepath_local.delete(save=False)
            return None

    if not existing_document and not pq.debug:
        extract_recap_pdf(rd.pk)
        add_items_to_solr([rd.pk], "search.RECAPDocument")

    mark_pq_successful(
        pq,
        d_id=rd.docket_entry.docket_id,
        de_id=rd.docket_entry_id,
        rd_id=rd.pk,
    )
    mark_ia_upload_needed(rd.docket_entry.docket, save_docket=True)
    return rd
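
The retry-or-fail branch appears three times in the function above: once max_retries is exhausted (or in debug mode) the queue item is marked failed, otherwise it is marked queued and the task re-raises via self.retry. A minimal sketch of that control flow as a standalone bound Celery task, with hypothetical model and helper names:

from celery import shared_task

@shared_task(bind=True, max_retries=5)
def process_item(self, pk):
    try:
        item = QueueItem.objects.get(pk=pk)  # QueueItem is hypothetical
    except QueueItem.DoesNotExist as exc:
        if self.request.retries == self.max_retries:
            mark_failed(pk)  # hypothetical helper; give up permanently
            return None
        # Otherwise re-enqueue; Celery re-runs the task with retries + 1.
        raise self.retry(exc=exc, countdown=30)
    return handle(item)  # hypothetical helper
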
Example #6
def process_recap_attachment(self, pk, tag_names=None):
    """Process an uploaded attachment page from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work on
    :param tag_names: A list of tag names to add to all items created or
    modified in this function.
    :return: None
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    att_page = AttachmentPage(map_cl_to_pacer_id(pq.court_id))
    with open(pq.filepath_local.path, "rb") as f:
        text = f.read().decode("utf-8")
    att_page._parse_text(text)
    att_data = att_page.data
    logger.info("Parsing completed for item %s" % pq)

    if att_data == {}:
        # Bad attachment page.
        msg = "Not a valid attachment page upload."
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return None

    if pq.pacer_case_id in ["undefined", "null"]:
        # Bad data from the client. Fix it with parsed data.
        pq.pacer_case_id = att_data.get("pacer_case_id")
        pq.save()

    # Merge the contents of the data into CL.
    try:
        params = {
            "pacer_doc_id": att_data["pacer_doc_id"],
            "docket_entry__docket__court": pq.court,
        }
        if pq.pacer_case_id:
            params["docket_entry__docket__pacer_case_id"] = pq.pacer_case_id
        main_rd = RECAPDocument.objects.get(**params)
    except RECAPDocument.MultipleObjectsReturned:
        # Unclear how to proceed and we don't want to associate this data with
        # the wrong case. We must punt.
        msg = (
            "Too many documents found when attempting to associate "
            "attachment data"
        )
        mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
        return None
    except RECAPDocument.DoesNotExist as exc:
        msg = "Could not find docket to associate with attachment metadata"
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    # We got the right item. Update/create all the attachments for
    # the docket entry.
    de = main_rd.docket_entry
    if att_data["document_number"] is None:
        # Bankruptcy attachment page. Use the document number from the main doc.
        att_data["document_number"] = main_rd.document_number

    rds_created = []
    if not pq.debug:
        # Save the old HTML to the docket entry.
        pacer_file = PacerHtmlFiles(
            content_object=de, upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE
        )
        pacer_file.filepath.save(
            "attachment_page.html",  # Irrelevant b/c UUIDFileSystemStorage
            ContentFile(text),
        )

        # Create/update the attachment items.
        tags = []
        if tag_names:
            for tag_name in tag_names:
                tag, _ = Tag.objects.get_or_create(name=tag_name)
                tags.append(tag)
        for attachment in att_data["attachments"]:
            if all(
                [
                    attachment["attachment_number"],
                    # Missing on sealed items.
                    attachment.get("pacer_doc_id", False),
                    # Missing on some restricted docs (see Juriscraper)
                    attachment["page_count"] is not None,
                    attachment["description"],
                ]
            ):
                rd, created = RECAPDocument.objects.update_or_create(
                    docket_entry=de,
                    document_number=att_data["document_number"],
                    attachment_number=attachment["attachment_number"],
                    document_type=RECAPDocument.ATTACHMENT,
                )
                if created:
                    rds_created.append(rd)
                needs_save = False
                for field in ["description", "pacer_doc_id"]:
                    if attachment[field]:
                        setattr(rd, field, attachment[field])
                        needs_save = True

                # Only set page_count and file_size if they're blank, in case
                # we got the real value by measuring.
                if rd.page_count is None:
                    rd.page_count = attachment["page_count"]
                if rd.file_size is None and attachment["file_size_str"]:
                    try:
                        rd.file_size = convert_size_to_bytes(
                            attachment["file_size_str"]
                        )
                    except ValueError:
                        pass

                if needs_save:
                    rd.save()
                if tags:
                    for tag in tags:
                        tag.tag_object(rd)

                # Do *not* do this async — that can cause race conditions.
                add_items_to_solr([rd.pk], "search.RECAPDocument")

    mark_pq_successful(pq, d_id=de.docket_id, de_id=de.pk)
    process_orphan_documents(
        rds_created, pq.court_id, main_rd.docket_entry.docket.date_filed
    )
    changed = mark_ia_upload_needed(de.docket)
    if changed:
        de.docket.save()
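
Note the two-phase update in the loop above: update_or_create is called with identity fields only (no defaults kwarg), so it behaves as a get-or-create, and the parsed values are then copied over one field at a time only when they are truthy. That way a sparse attachment page never blanks out data already on the row. Condensed, with `parsed` standing in for one attachment dict:

rd, created = RECAPDocument.objects.update_or_create(
    docket_entry=de,
    document_number=att_data["document_number"],
    attachment_number=parsed["attachment_number"],
    document_type=RECAPDocument.ATTACHMENT,
)
for field in ("description", "pacer_doc_id"):
    if parsed.get(field):  # only overwrite with real values
        setattr(rd, field, parsed[field])
rd.save()
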
Example #7
def process_free_opinion_result(self, row_pk, cnt):
    """Process a single result from the free opinion report"""
    result = PACERFreeDocumentRow.objects.get(pk=row_pk)
    result.court = Court.objects.get(pk=map_pacer_to_cl_id(result.court_id))
    result.case_name = harmonize(result.case_name)
    result.case_name_short = cnt.make_case_name_short(result.case_name)
    row_copy = copy.copy(result)
    # If we don't do this, the doc's date_filed becomes the docket's
    # date_filed. Bad.
    delattr(row_copy, 'date_filed')
    # If we don't do this, we get the PACER court id and it crashes
    delattr(row_copy, 'court_id')
    # If we don't do this, the id of result tries to smash that of the docket.
    delattr(row_copy, 'id')
    try:
        with transaction.atomic():
            docket = lookup_and_save(row_copy)
            if not docket:
                msg = "Unable to create docket for %s" % result
                logger.error(msg)
                result.error_msg = msg
                result.save()
                self.request.callbacks = None
                return
            docket.blocked, docket.date_blocked = get_blocked_status(docket)
            mark_ia_upload_needed(docket)
            docket.save()

            de, de_created = DocketEntry.objects.update_or_create(
                docket=docket,
                entry_number=result.document_number,
                defaults={
                    'date_filed': result.date_filed,
                    'description': result.description,
                })
            rd, rd_created = RECAPDocument.objects.update_or_create(
                docket_entry=de,
                document_number=result.document_number,
                attachment_number=None,
                defaults={
                    'pacer_doc_id': result.pacer_doc_id,
                    'document_type': RECAPDocument.PACER_DOCUMENT,
                    'is_free_on_pacer': True,
                })
    except IntegrityError as e:
        msg = "Raised IntegrityError: %s" % e
        logger.error(msg)
        if self.request.retries == self.max_retries:
            result.error_msg = msg
            result.save()
            return
        raise self.retry(exc=e)
    except DatabaseError as e:
        msg = "Unable to complete database transaction:\n%s" % e
        logger.error(msg)
        result.error_msg = msg
        result.save()
        self.request.callbacks = None
        return

    if not rd_created and rd.is_available:
        # The item already exists and is available. Fantastic, mark it as free,
        # and call it a day.
        rd.is_free_on_pacer = True
        rd.save()
        result.delete()
        self.request.callbacks = None
        return

    return {
        'result': result,
        'rd_pk': rd.pk,
        'pacer_court_id': result.court_id
    }
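
The error handling above splits database failures in two: IntegrityError (typically a concurrent worker inserting the same docket or document) is retried, while any other DatabaseError aborts the chain and records the message. The skeleton of that split, as it sits inside the bound task (max_retries check elided; self and result come from the task above):

from django.db import DatabaseError, IntegrityError, transaction

try:
    with transaction.atomic():
        ...  # all writes succeed or roll back together
except IntegrityError as e:
    # Likely a race with another worker; a retry usually resolves it.
    raise self.retry(exc=e)
except DatabaseError as e:
    result.error_msg = str(e)  # permanent failure: record it and stop
    result.save()
    self.request.callbacks = None
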
Example #8
def merge_attachment_page_data(
    court: Court,
    pacer_case_id: int,
    pacer_doc_id: int,
    document_number: int,
    text: str,
    attachment_dicts: List[Dict[str, Union[int, str]]],
    debug: bool = False,
) -> Tuple[List[RECAPDocument], DocketEntry]:
    """Merge attachment page data into the docket

    :param court: The court object we're working with
    :param pacer_case_id: A PACER case ID
    :param pacer_doc_id: A PACER document ID
    :param document_number: The docket entry number
    :param text: The text of the attachment page
    :param attachment_dicts: A list of Juriscraper-parsed dicts for each
    attachment.
    :param debug: If True, skip saving changes during this process.
    :return: A list of RECAPDocuments modified or created during the process,
    and the DocketEntry object associated with the RECAPDocuments
    :raises: RECAPDocument.MultipleObjectsReturned, RECAPDocument.DoesNotExist
    """
    try:
        params = {
            "pacer_doc_id": pacer_doc_id,
            "docket_entry__docket__court": court,
        }
        if pacer_case_id:
            params["docket_entry__docket__pacer_case_id"] = pacer_case_id
        main_rd = RECAPDocument.objects.get(**params)
    except RECAPDocument.MultipleObjectsReturned as exc:
        # Unclear how to proceed and we don't want to associate this data with
        # the wrong case. We must punt.
        raise exc
    except RECAPDocument.DoesNotExist as exc:
        # Can't find the docket to associate with the attachment metadata
        # It may be possible to go look for orphaned documents at this stage
        # and to then add them here, as we do when adding dockets. This need is
        # particularly acute for those that get free look emails and then go to
        # the attachment page.
        raise exc

    # We got the right item. Update/create all the attachments for
    # the docket entry.
    de = main_rd.docket_entry
    if document_number is None:
        # Bankruptcy attachment page. Use the document number from the main doc.
        document_number = main_rd.document_number

    if debug:
        return [], de

    # Save the old HTML to the docket entry.
    pacer_file = PacerHtmlFiles(
        content_object=de, upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE
    )
    pacer_file.filepath.save(
        "attachment_page.html",  # Irrelevant b/c UUIDFileSystemStorage
        ContentFile(text),
    )

    # Create/update the attachment items.
    rds_created = []
    rds_affected = []
    for attachment in attachment_dicts:
        sanity_checks = [
            attachment["attachment_number"],
            # Missing on sealed items.
            attachment.get("pacer_doc_id", False),
            # Missing on some restricted docs (see Juriscraper)
            attachment["page_count"] is not None,
            attachment["description"],
        ]
        if not all(sanity_checks):
            continue

        rd, created = RECAPDocument.objects.update_or_create(
            docket_entry=de,
            document_number=document_number,
            attachment_number=attachment["attachment_number"],
            document_type=RECAPDocument.ATTACHMENT,
        )
        if created:
            rds_created.append(rd)
        rds_affected.append(rd)

        for field in ["description", "pacer_doc_id"]:
            if attachment[field]:
                setattr(rd, field, attachment[field])

        # Only set page_count and file_size if they're blank, in case
        # we got the real value by measuring.
        if rd.page_count is None:
            rd.page_count = attachment["page_count"]
        if rd.file_size is None and attachment["file_size_str"]:
            try:
                rd.file_size = convert_size_to_bytes(
                    attachment["file_size_str"]
                )
            except ValueError:
                pass
        rd.save()

        # Do *not* do this async — that can cause race conditions.
        add_items_to_solr([rd.pk], "search.RECAPDocument")

    mark_ia_upload_needed(de.docket, save_docket=True)
    process_orphan_documents(
        rds_created, court.pk, main_rd.docket_entry.docket.date_filed
    )
    return rds_affected, de
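
A hedged usage sketch for the function above; the identifiers are placeholders, and att_data is assumed to be a Juriscraper attachment-page parse like the one built in Example #6:

rds_affected, de = merge_attachment_page_data(
    court=Court.objects.get(pk="dcd"),  # placeholder court ID
    pacer_case_id=12345,                # placeholder PACER identifiers
    pacer_doc_id=67890,
    document_number=1,
    text=attachment_page_html,          # the raw HTML that was parsed
    attachment_dicts=att_data["attachments"],
    debug=False,
)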