Example 1
0
def upload_pdf_to_ia(self, rd_pk):
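    """Upload a RECAPDocument's PDF to the Internet Archive.

    Builds the IA bucket and file names from the docket, uploads the local
    file, and either records the archive.org URL on the RECAPDocument or
    increments its upload failure count.
    """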
    rd = RECAPDocument.objects.get(pk=rd_pk)
    d = rd.docket_entry.docket
    file_name = get_document_filename(
        d.court_id,
        d.pacer_case_id,
        rd.document_number,
        rd.attachment_number or 0,
    )
    bucket_name = get_bucket_name(d.court_id, d.pacer_case_id)
    responses = upload_to_ia(
        self,
        identifier=bucket_name,
        files=rd.filepath_local.path,
        title=best_case_name(d),
        collection=settings.IA_COLLECTIONS,
        court_id=d.court_id,
        source_url='https://www.courtlistener.com%s' % rd.get_absolute_url(),
        media_type='texts',
        description="This item represents a case in PACER, the U.S. "
        "Government's website for federal case data. If you wish "
        "to see the entire case, please consult PACER directly.",
    )
    if responses is None:
        increment_failure_count(rd)
        return

    if all(r.ok for r in responses):
        rd.ia_upload_failure_count = None
        rd.filepath_ia = "https://archive.org/download/%s/%s" % (bucket_name,
                                                                 file_name)
        rd.save()
    else:
        increment_failure_count(rd)
Example 2
0
def upload_free_opinion_to_ia(self, rd_pk):
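    """Upload a free opinion's PDF to the Internet Archive.

    Retries when IA is overloaded, returns malformed XML, or the request
    fails transiently; on success, stores the archive.org download URL on
    the RECAPDocument.
    """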
    rd = RECAPDocument.objects.get(pk=rd_pk)
    d = rd.docket_entry.docket
    file_name = get_document_filename(
        d.court_id,
        d.pacer_case_id,
        rd.document_number,
        0,  # Attachment number is zero for all free opinions.
    )
    bucket_name = get_bucket_name(d.court_id, d.pacer_case_id)
    try:
        responses = upload_to_ia(
            identifier=bucket_name,
            files=rd.filepath_local.path,
            metadata={
                'title': best_case_name(d),
                'collection': settings.IA_COLLECTIONS,
                'contributor': '<a href="https://free.law">Free Law Project</a>',
                'court': d.court_id,
                'language': 'eng',
                'mediatype': 'texts',
                'description': "This item represents a case in PACER, "
                               "the U.S. Government's website for "
                               "federal case data. If you wish to see "
                               "the entire case, please consult PACER "
                               "directly.",
                'licenseurl': 'https://www.usa.gov/government-works',
            },
        )
    except (OverloadedException, ExpatError) as exc:
        # Overloaded: IA wants us to slow down.
        # ExpatError: The syntax of the XML file that's supposed to be returned
        #             by IA is bad (or something).
        if self.request.retries == self.max_retries:
            # Give up for now. It'll get done next time cron is run.
            return
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [
            HTTP_403_FORBIDDEN,    # Can't access bucket, typically.
            HTTP_400_BAD_REQUEST,  # Corrupt PDF, typically.
        ]:
            return [exc.response]
        if self.request.retries == self.max_retries:
            # This exception is also raised when the endpoint is overloaded, but
            # doesn't get caught by the OverloadedException handler above due to
            # multiple processes running at the same time. Just give up for now.
            return
        raise self.retry(exc=exc)
    except (requests.Timeout, requests.RequestException) as exc:
        logger.warning("Timeout or unknown RequestException. Unable to upload "
                       "to IA. Trying again if retries not exceeded: %s" % rd)
        if self.request.retries == self.max_retries:
            # Give up for now. It'll get done next time cron is run.
            return
        raise self.retry(exc=exc)
    if all(r.ok for r in responses):
        rd.filepath_ia = "https://archive.org/download/%s/%s" % (
            bucket_name, file_name)
        rd.save(do_extraction=False, index=False)
Example 3
0
def upload_free_opinion_to_ia(self, rd_pk):
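    """Upload a free opinion's PDF to the Internet Archive.

    Retries when IA is overloaded, returns malformed XML, or the request
    fails transiently; on success, stores the archive.org download URL on
    the RECAPDocument.
    """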
    rd = RECAPDocument.objects.get(pk=rd_pk)
    d = rd.docket_entry.docket
    file_name = get_document_filename(
        d.court_id,
        d.pacer_case_id,
        rd.document_number,
        0,  # Attachment number is zero for all free opinions.
    )
    bucket_name = get_bucket_name(d.court_id, d.pacer_case_id)
    try:
        responses = upload_to_ia(
            identifier=bucket_name,
            files=rd.filepath_local.path,
            metadata={
                'title': best_case_name(d),
                'collection': settings.IA_COLLECTIONS,
                'contributor': '<a href="https://free.law">Free Law Project</a>',
                'court': d.court_id,
                'language': 'eng',
                'mediatype': 'texts',
                'description': "This item represents a case in PACER, "
                               "the U.S. Government's website for "
                               "federal case data. If you wish to see "
                               "the entire case, please consult PACER "
                               "directly.",
                'licenseurl': 'https://www.usa.gov/government-works',
            },
        )
    except (OverloadedException, ExpatError) as exc:
        # Overloaded: IA wants us to slow down.
        # ExpatError: The syntax of the XML file that's supposed to be returned
        #             by IA is bad (or something).
        if self.request.retries == self.max_retries:
            # Give up for now. It'll get done next time cron is run.
            return
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [
            HTTP_403_FORBIDDEN,    # Can't access bucket, typically.
            HTTP_400_BAD_REQUEST,  # Corrupt PDF, typically.
        ]:
            return [exc.response]
        if self.request.retries == self.max_retries:
            # This exception is also raised when the endpoint is overloaded, but
            # doesn't get caught by the OverloadedException handler above due to
            # multiple processes running at the same time. Just give up for now.
            return
        raise self.retry(exc=exc)
    except (requests.Timeout, requests.RequestException) as exc:
        logger.warning("Timeout or unknown RequestException. Unable to upload "
                       "to IA. Trying again if retries not exceeded: %s" % rd)
        if self.request.retries == self.max_retries:
            # Give up for now. It'll get done next time cron is run.
            return
        raise self.retry(exc=exc)
    if all(r.ok for r in responses):
        rd.filepath_ia = "https://archive.org/download/%s/%s" % (
            bucket_name, file_name)
        rd.save(do_extraction=False, index=False)
Example 4
0
def get_and_process_pdf(self, data, session, row_pk, index=False):
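    """Download a free opinion PDF from PACER and store it on its RECAPDocument.

    Retries transient network errors, records an error message on the
    PACERFreeDocumentRow when the download cannot be completed, and otherwise
    saves the file, its SHA-1 hash, and its page count before queueing text
    extraction with OCR skipped.
    """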
    if data is None:
        return
    result = data['result']
    rd = RECAPDocument.objects.get(pk=data['rd_pk'])
    report = FreeOpinionReport(data['pacer_court_id'], session)
    try:
        r = report.download_pdf(result.pacer_case_id, result.pacer_doc_id)
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        logger.warning("Unable to get PDF for %s" % result)
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [
                HTTP_500_INTERNAL_SERVER_ERROR, HTTP_504_GATEWAY_TIMEOUT
        ]:
            logger.warning("Ran into HTTPError: %s. Retrying." %
                           exc.response.status_code)
            raise self.retry(exc=exc)
        else:
            msg = "Ran into unknown HTTPError. %s. Aborting." % \
                  exc.response.status_code
            logger.error(msg)
            PACERFreeDocumentRow.objects.filter(pk=row_pk).update(
                error_msg=msg)
            self.request.callbacks = None
            return

    if r is None:
        msg = "Unable to get PDF for %s at %s with doc id %s" % \
              (result, result.court_id, result.pacer_doc_id)
        logger.error(msg)
        PACERFreeDocumentRow.objects.filter(pk=row_pk).update(error_msg=msg)
        self.request.callbacks = None
        return

    file_name = get_document_filename(
        result.court.pk,
        result.pacer_case_id,
        result.document_number,
        0,  # Attachment number is zero for all free opinions.
    )
    cf = ContentFile(r.content)
    rd.filepath_local.save(file_name, cf, save=False)
    rd.is_available = True  # We've got the PDF.

    # request.content is sometimes a str, sometimes unicode, so
    # force it all to be bytes, pleasing hashlib.
    rd.sha1 = hashlib.sha1(force_bytes(r.content)).hexdigest()
    rd.is_free_on_pacer = True
    rd.page_count = get_page_count(rd.filepath_local.path, 'pdf')

    # Save and extract, skipping OCR.
    rd.save(do_extraction=False, index=index)
    extract_recap_pdf(rd.pk, skip_ocr=True, check_if_needed=False)
    return {'result': result, 'rd_pk': rd.pk}
Example 5
0
def get_and_process_pdf(self, data, session, row_pk, index=False):
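    """Download a free opinion PDF from PACER and store it on its RECAPDocument.

    Retries transient network errors, records an error message on the
    PACERFreeDocumentRow when the download cannot be completed, and otherwise
    saves the file, its SHA-1 hash, and its page count before queueing text
    extraction with OCR skipped.
    """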
    if data is None:
        return
    result = data['result']
    rd = RECAPDocument.objects.get(pk=data['rd_pk'])
    report = FreeOpinionReport(data['pacer_court_id'], session)
    try:
        r = report.download_pdf(result.pacer_case_id, result.pacer_doc_id)
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        logger.warning("Unable to get PDF for %s" % result)
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [HTTP_500_INTERNAL_SERVER_ERROR,
                                        HTTP_504_GATEWAY_TIMEOUT]:
            logger.warning("Ran into HTTPError: %s. Retrying." %
                           exc.response.status_code)
            raise self.retry(exc=exc)
        else:
            msg = "Ran into unknown HTTPError. %s. Aborting." % \
                  exc.response.status_code
            logger.error(msg)
            PACERFreeDocumentRow.objects.filter(pk=row_pk).update(error_msg=msg)
            self.request.callbacks = None
            return

    if r is None:
        msg = "Unable to get PDF for %s at %s with doc id %s" % \
              (result, result.court_id, result.pacer_doc_id)
        logger.error(msg)
        PACERFreeDocumentRow.objects.filter(pk=row_pk).update(error_msg=msg)
        self.request.callbacks = None
        return

    file_name = get_document_filename(
        result.court.pk,
        result.pacer_case_id,
        result.document_number,
        0,  # Attachment number is zero for all free opinions.
    )
    cf = ContentFile(r.content)
    rd.filepath_local.save(file_name, cf, save=False)
    rd.is_available = True  # We've got the PDF.

    # request.content is sometimes a str, sometimes unicode, so
    # force it all to be bytes, pleasing hashlib.
    rd.sha1 = hashlib.sha1(force_bytes(r.content)).hexdigest()
    rd.is_free_on_pacer = True
    rd.page_count = get_page_count(rd.filepath_local.path, 'pdf')

    # Save and extract, skipping OCR.
    rd.save(do_extraction=False, index=index)
    extract_recap_pdf(rd.pk, skip_ocr=True, check_if_needed=False)
    return {'result': result, 'rd_pk': rd.pk}
Example 6
def get_and_merge_items(items, log):
    """Get the items returned from the RECAP server and merge them into CL.

    Items is a list of dicts like so, sorted by court, case number, document
    number and attachment number:

    [{'attachment_number': '0',
      'document_number': '1',
      'case_number': '186759',
      'court_id': 'almb',
      'is_available': '0'},
      ...
    ]

    Note that all values are strings. The idea is to iterate over all of these
    dicts, grabbing the docket, and adding any items that have is_available = 1.
    """
    update_log_status(log, RECAPLog.GETTING_AND_MERGING_ITEMS)
    tasks = []
    for prev, item, nxt in previous_and_next(items):
        if prev is None or item['case_number'] != prev['case_number']:
            # New case. Get the next docket before getting any PDFs.
            url = get_docketxml_url(item['court_id'], item['case_number'])
            logger.info("New docket found at: %s" % url)
            filename = get_docket_filename(item['court_id'],
                                           item['case_number'])
            tasks.append(download_recap_item.si(url, filename, clobber=True))

        # Get the document
        filename = get_document_filename(item['court_id'], item['case_number'],
                                         item['document_number'],
                                         item['attachment_number'])
        location = os.path.join(settings.MEDIA_ROOT, 'recap', filename)
        if not os.path.isfile(location) and int(item['is_available']):
            # We don't have it yet, and it's available to get. Get it!
            url = get_pdf_url(item['court_id'], item['case_number'], filename)
            tasks.append(download_recap_item.si(url, filename))

        if nxt is None or item['case_number'] != nxt['case_number']:
            # Last item in the case. Send for processing.
            if len(tasks) > 0:
                logger.info("Sending %s tasks for processing." % len(tasks))
                filename = get_docket_filename(item['court_id'],
                                               item['case_number'])
                chord(tasks)(chain(
                    parse_recap_docket.si(filename, debug=False),
                    extract_recap_pdf.s().set(priority=5),
                    add_or_update_recap_document.s(coalesce_docket=True),
                ))
                tasks = []
    logger.info("Finished queueing new cases.")
Example 7
def get_and_merge_items(items, log):
    """Get the items returned from the RECAP server and merge them into CL.

    Items is a list of dicts like so, sorted by court, case number, document
    number and attachment number:

    [{'attachment_number': '0',
      'document_number': '1',
      'case_number': '186759',
      'court_id': 'almb',
      'is_available': '0'},
      ...
    ]

    Note that all values are strings. The idea is to iterate over all of these
    dicts, grabbing the docket, and adding any items that have is_available = 1.
    """
    update_log_status(log, RECAPLog.GETTING_AND_MERGING_ITEMS)
    tasks = []
    for prev, item, nxt in previous_and_next(items):
        if prev is None or item['case_number'] != prev['case_number']:
            # New case. Get the next docket before getting any PDFs.
            url = get_docketxml_url(item['court_id'], item['case_number'])
            logger.info("New docket found at: %s" % url)
            filename = get_docket_filename(item['court_id'], item['case_number'])
            tasks.append(download_recap_item.si(url, filename, clobber=True))

        # Get the document
        filename = get_document_filename(item['court_id'], item['case_number'],
                                         item['document_number'],
                                         item['attachment_number'])
        location = os.path.join(settings.MEDIA_ROOT, 'recap', filename)
        if not os.path.isfile(location) and int(item['is_available']):
            # We don't have it yet, and it's available to get. Get it!
            url = get_pdf_url(item['court_id'], item['case_number'], filename)
            tasks.append(download_recap_item.si(url, filename))

        if nxt is None or item['case_number'] != nxt['case_number']:
            # Last item in the case. Send for processing.
            if len(tasks) > 0:
                logger.info("Sending %s tasks for processing." % len(tasks))
                filename = get_docket_filename(item['court_id'],
                                               item['case_number'])
                chord(tasks)(chain(
                    parse_recap_docket.si(filename, debug=False),
                    extract_recap_pdf.s().set(priority=5),
                    add_or_update_recap_document.s(coalesce_docket=True),
                ))
                tasks = []
    logger.info("Finished queueing new cases.")
Example 8
0
def update_rd_metadata(self, rd_pk, response, court_id, pacer_case_id,
                       pacer_doc_id, document_number, attachment_number):
    """After querying PACER and downloading a document, save it to the DB.

    :param rd_pk: The primary key of the RECAPDocument to work on
    :param response: A requests.Response object containing the PDF data.
    :param court_id: A CourtListener court ID to use for file names.
    :param pacer_case_id: The pacer_case_id to use in error logs.
    :param pacer_doc_id: The pacer_doc_id to use in error logs.
    :param document_number: The docket entry number for use in file names.
    :param attachment_number: The attachment number (if applicable) for use in
    file names.
    :return: A two-tuple of a boolean indicating success and a corresponding
    error/success message string.
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    if response is None:
        msg = "Unable to get PDF for RECAP Document '%s' " \
              "at '%s' with doc id '%s'" % (rd_pk, court_id, pacer_doc_id)
        logger.error(msg)
        self.request.callbacks = None
        return False, msg

    file_name = get_document_filename(court_id, pacer_case_id, document_number,
                                      attachment_number)
    cf = ContentFile(response.content)
    rd.filepath_local.save(file_name, cf, save=False)
    rd.file_size = rd.filepath_local.size
    rd.is_available = True  # We've got the PDF.

    # request.content is sometimes a str, sometimes unicode, so
    # force it all to be bytes, pleasing hashlib.
    rd.sha1 = hashlib.sha1(force_bytes(response.content)).hexdigest()
    rd.page_count = get_page_count(rd.filepath_local.path, 'pdf')

    # Save the updated document.
    rd.save()

    # Make sure we mark the docket as needing upload
    changed = mark_ia_upload_needed(rd.docket_entry.docket)
    if changed:
        rd.docket_entry.docket.save()

    return True, 'Saved item successfully'
Example 9
def make_download_tasks(data, line_count, start_line):
    """For every item in the CSV, send it to Celery for processing"""
    previous_casenum = None
    subtasks = []
    completed = 0
    for index, item in data.iterrows():
        if completed < start_line - 1:
            # Skip ahead if start_line is provided.
            completed += 1
            continue

        if item['casenum'] != previous_casenum:
            # New case, get the docket before getting the pdf
            logger.info("New docket found with casenum: %s" % item['casenum'])
            previous_casenum = item['casenum']
            filename = get_docket_filename(item['court'], item['casenum'])
            url = get_docketxml_url(item['court'], item['casenum'])
            subtasks.append(download_recap_item.subtask((url, filename)))

        # Get the document
        filename = get_document_filename(item['court'], item['casenum'],
                                         item['docnum'], item['subdocnum'])
        url = get_pdf_url(item['court'], item['casenum'], filename)
        subtasks.append(download_recap_item.subtask((url, filename)))

        # Every n items or on the last item, send the subtasks to Celery.
        last_item = (line_count == completed + 1)
        if (len(subtasks) >= 1000) or last_item:
            msg = ("Sent %s subtasks to celery. We have processed %s "
                   "rows so far." % (len(subtasks), completed + 1))
            logger.info(msg)
            print(msg)
            job = TaskSet(tasks=subtasks)
            job.apply_async().join()
            subtasks = []

        completed += 1
Example 10
0
def make_download_tasks(data, line_count, start_line):
    """For every item in the CSV, send it to Celery for processing"""
    previous_casenum = None
    subtasks = []
    completed = 0
    for index, item in data.iterrows():
        if completed < start_line - 1:
            # Skip ahead if start_line is provided.
            completed += 1
            continue

        last_item = (line_count == completed + 1)
        if item['casenum'] != previous_casenum:
            # New case, get the docket before getting the pdf
            logger.info("New docket found with casenum: %s" % item['casenum'])
            previous_casenum = item['casenum']
            filename = get_docket_filename(item['court'], item['casenum'])
            url = get_docketxml_url(item['court'], item['casenum'])
            subtasks.append(download_recap_item.subtask((url, filename)))

        # Get the document
        filename = get_document_filename(item['court'], item['casenum'],
                                         item['docnum'], item['subdocnum'])
        url = get_pdf_url(item['court'], item['casenum'], filename)
        subtasks.append(download_recap_item.subtask((url, filename)))

        # Every n items, or on the last item, send the subtasks to Celery.
        if (len(subtasks) >= 1000) or last_item:
            msg = ("Sent %s subtasks to celery. We have processed %s "
                   "rows so far." % (len(subtasks), completed + 1))
            logger.info(msg)
            print(msg)
            job = TaskSet(tasks=subtasks)
            job.apply_async().join()
            subtasks = []

        completed += 1
Example 11
0
def process_recap_pdf(self, pk):
    """Process an uploaded PDF from the RECAP API endpoint.

    :param pk: The PK of the processing queue item you want to work on.
    :return: A RECAPDocument object that was created or updated.
    """
    """Save a RECAP PDF to the database."""
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)

    if pq.attachment_number is None:
        document_type = RECAPDocument.PACER_DOCUMENT
    else:
        document_type = RECAPDocument.ATTACHMENT

    logger.info("Processing RECAP item (debug is: %s): %s " % (pq.debug, pq))
    try:
        if pq.pacer_case_id:
            rd = RECAPDocument.objects.get(
                docket_entry__docket__pacer_case_id=pq.pacer_case_id,
                pacer_doc_id=pq.pacer_doc_id,
            )
        else:
            # Sometimes we don't have the case ID from PACER. Try to make this
            # work anyway.
            rd = RECAPDocument.objects.get(pacer_doc_id=pq.pacer_doc_id,)
    except (RECAPDocument.DoesNotExist, RECAPDocument.MultipleObjectsReturned):
        try:
            d = Docket.objects.get(
                pacer_case_id=pq.pacer_case_id, court_id=pq.court_id
            )
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully
            # the docket will be in place soon (it could be in a
            # different upload task that hasn't yet been processed).
            logger.warning(
                "Unable to find docket for processing queue '%s'. "
                "Retrying if max_retries is not exceeded." % pq
            )
            error_message = "Unable to find docket for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                mark_pq_status(pq, error_message, PROCESSING_STATUS.FAILED)
                return None
            else:
                mark_pq_status(
                    pq, error_message, PROCESSING_STATUS.QUEUED_FOR_RETRY
                )
                raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None

        # Got the Docket, attempt to get/create the DocketEntry, and then
        # create the RECAPDocument
        try:
            de = DocketEntry.objects.get(
                docket=d, entry_number=pq.document_number
            )
        except DocketEntry.DoesNotExist as exc:
            logger.warning(
                "Unable to find docket entry for processing "
                "queue '%s'." % pq
            )
            msg = "Unable to find docket entry for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
                return None
            else:
                mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
                raise self.retry(exc=exc)
        else:
            # If we're here, we've got the docket and docket
            # entry, but were unable to find the document by
            # pacer_doc_id. This happens when pacer_doc_id is
            # missing, for example. ∴, try to get the document
            # from the docket entry.
            try:
                rd = RECAPDocument.objects.get(
                    docket_entry=de,
                    document_number=pq.document_number,
                    attachment_number=pq.attachment_number,
                    document_type=document_type,
                )
            except (
                RECAPDocument.DoesNotExist,
                RECAPDocument.MultipleObjectsReturned,
            ):
                # Unable to find it. Make a new item.
                rd = RECAPDocument(
                    docket_entry=de,
                    pacer_doc_id=pq.pacer_doc_id,
                    document_type=document_type,
                )

    rd.document_number = pq.document_number
    rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    try:
        content = pq.filepath_local.read()
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (exc.errno, exc.strerror)
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    new_sha1 = sha1(content)
    existing_document = all(
        [
            rd.sha1 == new_sha1,
            rd.is_available,
            rd.filepath_local and os.path.isfile(rd.filepath_local.path),
        ]
    )
    if not existing_document:
        # Different sha1, it wasn't available, or it's missing from disk. Move
        # the new file over from the processing queue storage.
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        if not pq.debug:
            rd.filepath_local.save(file_name, cf, save=False)

            # Do page count and extraction
            extension = rd.filepath_local.path.split(".")[-1]
            rd.page_count = get_page_count(rd.filepath_local.path, extension)
            rd.file_size = rd.filepath_local.size

        rd.ocr_status = None
        rd.is_available = True
        rd.sha1 = new_sha1
        rd.date_upload = now()

    if not pq.debug:
        try:
            rd.save()
        except (IntegrityError, ValidationError):
            msg = "Duplicate key on unique_together constraint"
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            rd.filepath_local.delete(save=False)
            return None

    if not existing_document and not pq.debug:
        extract_recap_pdf(rd.pk)
        add_items_to_solr([rd.pk], "search.RECAPDocument")

    mark_pq_successful(
        pq,
        d_id=rd.docket_entry.docket_id,
        de_id=rd.docket_entry_id,
        rd_id=rd.pk,
    )
    mark_ia_upload_needed(rd.docket_entry.docket, save_docket=True)
    return rd
Example 12
0
def get_pacer_doc_by_rd_and_description(self,
                                        rd_pk,
                                        description_re,
                                        session,
                                        fallback_to_main_doc=False,
                                        tag=None):
    """Using a RECAPDocument object ID and a description of a document, get the
    document from PACER.

    This function was originally meant to get civil cover sheets, but can be
    repurposed as needed.

    :param rd_pk: The PK of a RECAPDocument object to use as a source.
    :param description_re: A compiled regular expression to search against the
    description provided by the attachment page.
    :param session: The PACER session object to use.
    :param fallback_to_main_doc: Should we grab the main doc if none of the
    attachments match the regex?
    :param tag: A tag name to apply to any downloaded content.
    :return: None
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    if not rd.pacer_doc_id:
        # Some docket entries are just text/don't have a pacer_doc_id.
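        # Clearing the callbacks prevents any chained tasks from running.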
        self.request.callbacks = None
        return

    d = rd.docket_entry.docket
    pacer_court_id = map_cl_to_pacer_id(d.court_id)
    att_report = AttachmentPage(pacer_court_id, session)
    try:
        att_report.query(rd.pacer_doc_id)
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        logger.warning("Unable to get PDF for %s" % rd)
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [
                HTTP_500_INTERNAL_SERVER_ERROR, HTTP_504_GATEWAY_TIMEOUT
        ]:
            logger.warning("Ran into HTTPError: %s. Retrying." %
                           exc.response.status_code)
            raise self.retry(exc=exc)
        else:
            msg = "Ran into unknown HTTPError. %s. Aborting." % \
                  exc.response.status_code
            logger.error(msg)
            self.request.callbacks = None
            return

    att_found = None
    for attachment in att_report.data.get('attachments', []):
        if description_re.search(attachment['description']):
            att_found = attachment.copy()
            document_type = RECAPDocument.ATTACHMENT
            break

    if not att_found:
        if fallback_to_main_doc:
            logger.info("Falling back to main document for pacer_doc_id: %s" %
                        rd.pacer_doc_id)
            att_found = att_report.data
            document_type = RECAPDocument.PACER_DOCUMENT
        else:
            msg = "Aborting. Did not find civil cover sheet for %s." % rd
            logger.error(msg)
            self.request.callbacks = None
            return

    if not att_found.get('pacer_doc_id'):
        logger.warning("No pacer_doc_id for document (is it sealed?)")
        self.request.callbacks = None
        return

    # Try to find the attachment already in the collection
    rd, _ = RECAPDocument.objects.get_or_create(
        docket_entry=rd.docket_entry,
        attachment_number=att_found.get('attachment_number'),
        document_number=rd.document_number,
        pacer_doc_id=att_found['pacer_doc_id'],
        document_type=document_type,
        defaults={
            'date_upload': now(),
        },
    )
    # Replace the description if we have description data; else fall back
    # on the old one.
    rd.description = att_found.get('description', '') or rd.description
    if tag is not None:
        tag, _ = Tag.objects.get_or_create(name=tag)
        rd.tags.add(tag)

    if rd.is_available:
        # Great. Call it a day.
        rd.save(do_extraction=False, index=False)
        return

    # Not available. Go get it.
    try:
        pacer_case_id = rd.docket_entry.docket.pacer_case_id
        r = att_report.download_pdf(pacer_case_id, att_found['pacer_doc_id'])
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        logger.warning("Unable to get PDF for %s" % att_found['pacer_doc_id'])
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [
                HTTP_500_INTERNAL_SERVER_ERROR, HTTP_504_GATEWAY_TIMEOUT
        ]:
            logger.warning("Ran into HTTPError: %s. Retrying." %
                           exc.response.status_code)
            raise self.retry(exc=exc)
        else:
            msg = "Ran into unknown HTTPError. %s. Aborting." % \
                  exc.response.status_code
            logger.error(msg)
            self.request.callbacks = None
            return

    if r is None:
        msg = "Unable to get PDF for %s at PACER court '%s' with doc id %s" % \
              (rd, pacer_court_id, rd.pacer_doc_id)
        logger.error(msg)
        self.request.callbacks = None
        return

    file_name = get_document_filename(
        d.court_id,
        pacer_case_id,
        rd.document_number,
        rd.attachment_number,
    )
    cf = ContentFile(r.content)
    rd.filepath_local.save(file_name, cf, save=False)
    rd.is_available = True  # We've got the PDF.

    # request.content is sometimes a str, sometimes unicode, force it all to be
    # bytes, pleasing hashlib.
    rd.sha1 = hashlib.sha1(force_bytes(r.content)).hexdigest()
    rd.page_count = get_page_count(rd.filepath_local.path, 'pdf')

    # Save, extract, then save to Solr. Skip OCR for now. Don't do these async.
    rd.save(do_extraction=False, index=False)
    extract_recap_pdf(rd.pk, skip_ocr=True)
    add_or_update_recap_document([rd.pk])
Example 13
0
def process_recap_pdf(self, pk):
    """Save a RECAP PDF to the database."""
    pq = ProcessingQueue.objects.get(pk=pk)
    pq.status = pq.PROCESSING_IN_PROGRESS
    pq.save()
    logger.info("Processing RECAP item (debug is: %s): %s " % (pq.debug, pq))
    try:
        rd = RECAPDocument.objects.get(
            docket_entry__docket__pacer_case_id=pq.pacer_case_id,
            pacer_doc_id=pq.pacer_doc_id,
        )
    except RECAPDocument.DoesNotExist:
        try:
            d = Docket.objects.get(pacer_case_id=pq.pacer_case_id,
                                   court_id=pq.court_id)
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully the docket
            # will be in place soon (it could be in a different upload task that
            # hasn't yet been processed).
            logger.warning("Unable to find docket for processing queue '%s'. "
                           "Retrying if max_retries is not exceeded." % pq)
            pq.error_message = "Unable to find docket for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                pq.status = pq.PROCESSING_FAILED
                pq.save()
                return None
            else:
                pq.status = pq.QUEUED_FOR_RETRY
                pq.save()
                raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            logger.error(msg)
            pq.error_message = msg
            pq.status = pq.PROCESSING_FAILED
            pq.save()
            return None
        else:
            # Got the Docket, attempt to get/create the DocketEntry, and then
            # create the RECAPDocument
            try:
                de = DocketEntry.objects.get(docket=d,
                                             entry_number=pq.document_number)
            except DocketEntry.DoesNotExist as exc:
                logger.warning("Unable to find docket entry for processing "
                               "queue '%s'. Retrying if max_retries is not "
                               "exceeded." % pq)
                pq.error_message = "Unable to find docket entry for item."
                if (self.request.retries == self.max_retries) or pq.debug:
                    pq.status = pq.PROCESSING_FAILED
                    pq.save()
                    return None
                else:
                    pq.status = pq.QUEUED_FOR_RETRY
                    pq.save()
                    raise self.retry(exc=exc)

        # All objects accounted for. Make some data.
        rd = RECAPDocument(
            docket_entry=de,
            pacer_doc_id=pq.pacer_doc_id,
            date_upload=timezone.now(),
        )
        if pq.attachment_number is None:
            rd.document_type = RECAPDocument.PACER_DOCUMENT
        else:
            rd.document_type = RECAPDocument.ATTACHMENT

    rd.document_number = pq.document_number
    rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    content = pq.filepath_local.read()
    new_sha1 = hashlib.sha1(content).hexdigest()
    existing_document = all([
        rd.sha1 == new_sha1,
        rd.is_available,
        rd.filepath_local and os.path.isfile(rd.filepath_local.path)
    ])
    if not existing_document:
        # Different sha1, it wasn't available, or it's missing from disk. Move
        # the new file over from the processing queue storage.
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        rd.filepath_local.save(file_name, cf, save=False)
        rd.is_available = True
        rd.sha1 = new_sha1

        # Do page count and extraction
        extension = rd.filepath_local.path.split('.')[-1]
        rd.page_count = get_page_count(rd.filepath_local.path, extension)
        rd.ocr_status = None

    if not pq.debug:
        rd.save()

    if not existing_document and not pq.debug:
        extract_recap_pdf(rd.pk)
        add_or_update_recap_document([rd.pk], force_commit=False)

    mark_pq_successful(pq, d_id=rd.docket_entry.docket_id,
                       de_id=rd.docket_entry_id, rd_id=rd.pk)
    return rd
Example 14
0
def process_recap_pdf(self, pk):
    """Save a RECAP PDF to the database."""
    pq = ProcessingQueue.objects.get(pk=pk)
    pq.status = pq.PROCESSING_IN_PROGRESS
    pq.save()
    logger.info("Processing RECAP item: %s" % pq)
    try:
        rd = RECAPDocument.objects.get(
            docket_entry__docket__pacer_case_id=pq.pacer_case_id,
            pacer_doc_id=pq.pacer_doc_id,
        )
    except RECAPDocument.DoesNotExist:
        try:
            d = Docket.objects.get(pacer_case_id=pq.pacer_case_id,
                                   court_id=pq.court_id)
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully the docket
            # will be in place soon (it could be in a different upload task that
            # hasn't yet been processed).
            logger.warning("Unable to find docket for processing queue '%s'. "
                           "Retrying if max_retries is not exceeded." % pq)
            pq.error_message = "Unable to find docket for item."
            if self.request.retries == self.max_retries:
                pq.status = pq.PROCESSING_FAILED
            else:
                pq.status = pq.QUEUED_FOR_RETRY
            pq.save()
            raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            logger.error(msg)
            pq.error_message = msg
            pq.status = pq.PROCESSING_FAILED
            pq.save()
            return None
        else:
            # Got the Docket, attempt to get/create the DocketEntry, and then
            # create the RECAPDocument
            try:
                de = DocketEntry.objects.get(docket=d,
                                             entry_number=pq.document_number)
            except DocketEntry.DoesNotExist as exc:
                logger.warning("Unable to find docket entry for processing "
                               "queue '%s'. Retrying if max_retries is not "
                               "exceeded." % pq)
                pq.error_message = "Unable to find docket entry for item."
                if self.request.retries == self.max_retries:
                    pq.status = pq.PROCESSING_FAILED
                else:
                    pq.status = pq.QUEUED_FOR_RETRY
                pq.save()
                raise self.retry(exc=exc)

        # All objects accounted for. Make some data.
        rd = RECAPDocument(
            docket_entry=de,
            pacer_doc_id=pq.pacer_doc_id,
            date_upload=timezone.now(),
        )
        if pq.attachment_number is None:
            rd.document_type = RECAPDocument.PACER_DOCUMENT
        else:
            rd.document_type = RECAPDocument.ATTACHMENT

    rd.document_number = pq.document_number
    rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    content = pq.filepath_local.read()
    new_sha1 = hashlib.sha1(content).hexdigest()
    if all([rd.sha1 == new_sha1,
            rd.is_available,
            rd.filepath_local and os.path.isfile(rd.filepath_local.path)]):
        # All good. Press on.
        new_document = False
    else:
        # Different sha1, it wasn't available, or it's missing from disk. Move
        # the new file over from the processing queue storage.
        new_document = True
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        rd.filepath_local.save(file_name, cf, save=False)
        rd.is_available = True
        rd.sha1 = new_sha1

        # Do page count and extraction
        extension = rd.filepath_local.path.split('.')[-1]
        rd.page_count = get_page_count(rd.filepath_local.path, extension)
        rd.ocr_status = None

    # Ditch the original file
    pq.filepath_local.delete(save=False)
    pq.error_message = ''  # Clear out errors b/c successful
    pq.status = pq.PROCESSING_SUCCESSFUL
    pq.save()

    rd.save()
    if new_document:
        extract_recap_pdf(rd.pk)
        add_or_update_recap_document([rd.pk], force_commit=False)

    return rd