Example #1
0
def upload_recap_json(self, pk):
    """Make a JSON object for a RECAP docket and upload it to IA"""
    docket, json_str = generate_ia_json(pk)

    filename = get_docket_filename(docket.court_id, docket.pacer_case_id,
                                   'json')
    bucket = get_bucket_name(docket.court_id, docket.pacer_case_id)
    responses = upload_to_ia(
        self,
        identifier=bucket,
        files={filename: StringIO(json_str)},
        title=best_case_name(docket),
        collection=settings.IA_COLLECTIONS,
        court_id=docket.court_id,
        source_url='https://www.courtlistener.com%s' %
                   docket.get_absolute_url(),
        media_type='texts',
        description="This item represents a case in PACER, the U.S. "
        "Government's website for federal case data. This "
        "information is uploaded quarterly. To see our most "
        "recent version please use the source url parameter, "
        "linked below. To see the canonical source for this data, "
        "please consult PACER directly.",
    )
    # A None response means the upload was aborted before any request went
    # out; count it as a failure so the docket gets retried later.
    if responses is None:
        increment_failure_count(docket)
        return

    if not all(r.ok for r in responses):
        increment_failure_count(docket)
        return

    # Success: clear failure bookkeeping and record where the JSON lives.
    docket.ia_upload_failure_count = None
    docket.ia_date_first_changed = None
    docket.ia_needs_upload = False
    docket.filepath_ia_json = "https://archive.org/download/%s/%s" % (
        bucket, filename)
    docket.save()
def make_pdf_path(instance, filename, thumbs=False):
    """Make the storage path for a PDF (or its thumbnail).

    :param instance: The model instance the PDF belongs to. Supported
        models are RECAPDocument, ClaimHistory, and LASCPDF.
    :param filename: The name of the file being stored.
    :param thumbs: If True, store under the "-thumbnails" sibling directory
        instead (not applicable to LASCPDF, which returns early).
    :return: A path (for the storage backend) where the file should live.
    :raises ValueError: If instance is not one of the supported models.
    """
    # Imports are done at call time, presumably to avoid circular imports
    # at module load.
    from cl.search.models import ClaimHistory, RECAPDocument
    from cl.lasc.models import LASCPDF

    # Use isinstance() rather than `type(x) == T` (PEP 8 / E721) so that
    # proxy models and subclasses are routed correctly too.
    if isinstance(instance, RECAPDocument):
        root = "recap"
        court_id = instance.docket_entry.docket.court_id
        pacer_case_id = instance.docket_entry.docket.pacer_case_id
    elif isinstance(instance, ClaimHistory):
        root = "claim"
        court_id = instance.claim.docket.court_id
        pacer_case_id = instance.pacer_case_id
    elif isinstance(instance, LASCPDF):
        slug = slugify(trunc(filename, 40))
        root = "/us/state/ca/lasc/%s/" % instance.docket_number
        file_name = "gov.ca.lasc.%s.%s.%s.pdf" % (
            instance.docket_number,
            instance.document_id,
            slug,
        )

        # LASC PDFs have their own naming scheme and no thumbnail variant.
        return os.path.join(root, file_name)
    else:
        raise ValueError("Unknown model type in make_pdf_path "
                         "function: %s" % type(instance))

    if thumbs:
        root = root + "-thumbnails"
    return os.path.join(root, get_bucket_name(court_id, pacer_case_id),
                        filename)
Example #3
0
def upload_pdf_to_ia(self, rd_pk):
    """Upload a RECAP document's locally stored PDF to the Internet Archive.

    On success, clears the failure counter and records the resulting IA
    download URL on the document; otherwise increments the failure count.

    :param rd_pk: Primary key of the RECAPDocument to upload.
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    docket = rd.docket_entry.docket
    filename = get_document_filename(
        docket.court_id,
        docket.pacer_case_id,
        rd.document_number,
        rd.attachment_number or 0,
    )
    bucket = get_bucket_name(docket.court_id, docket.pacer_case_id)
    responses = upload_to_ia(
        self,
        identifier=bucket,
        files=rd.filepath_local.path,
        title=best_case_name(docket),
        collection=settings.IA_COLLECTIONS,
        court_id=docket.court_id,
        source_url='https://www.courtlistener.com%s' % rd.get_absolute_url(),
        media_type='texts',
        description="This item represents a case in PACER, the U.S. "
        "Government's website for federal case data. If you wish "
        "to see the entire case, please consult PACER directly.",
    )
    # None means the upload never happened; record the failure and bail.
    if responses is None:
        increment_failure_count(rd)
        return

    if not all(resp.ok for resp in responses):
        increment_failure_count(rd)
        return

    rd.ia_upload_failure_count = None
    rd.filepath_ia = "https://archive.org/download/%s/%s" % (bucket, filename)
    rd.save()
Example #4
0
def upload_audio_to_ia(self, af_pk):
    """Upload an oral argument audio file to the Internet Archive.

    On success, clears the failure counter and records the resulting IA
    download URL on the Audio object; otherwise increments the failure count.

    :param af_pk: Primary key of the Audio object to upload.
    """
    af = Audio.objects.get(pk=af_pk)
    docket = af.docket
    # Pull the file extension off the original file's path.
    # NOTE(review): rsplit assumes the path contains a '.'; an
    # extensionless file would raise IndexError — confirm upstream always
    # stores an extension.
    extension = af.local_path_original_file.path.rsplit('.', 1)[1]
    filename = make_af_filename(
        docket.court_id,
        docket.docket_number,
        docket.date_argued,
        extension,
    )
    bucket = get_bucket_name(docket.court_id, slugify(docket.docket_number))
    responses = upload_to_ia(
        self,
        identifier=bucket,
        files={filename: af.local_path_original_file.path},
        title=best_case_name(docket),
        collection=settings.IA_OA_COLLECTIONS,
        court_id=docket.court_id,
        source_url='https://www.courtlistener.com%s' % af.get_absolute_url(),
        media_type='audio',
        description='This item represents an oral argument audio file as '
        'scraped from a U.S. Government website by Free Law '
        'Project.',
    )
    if responses is None:
        increment_failure_count(af)
        return

    if not all(resp.ok for resp in responses):
        increment_failure_count(af)
        return

    af.ia_upload_failure_count = None
    af.filepath_ia = "https://archive.org/download/%s/%s" % (bucket, filename)
    af.save()
Example #5
0
def upload_free_opinion_to_ia(self, rd_pk):
    """Upload a free PACER opinion's PDF to the Internet Archive.

    Derives the IA bucket and file names from the document's docket,
    uploads the locally stored PDF with descriptive metadata, and on
    success records the resulting IA URL on the document. Transient IA
    failures are retried via self.retry; permanent failures (403/400)
    return the failed response for the caller to inspect.

    :param rd_pk: Primary key of the RECAPDocument to upload.
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    d = rd.docket_entry.docket
    file_name = get_document_filename(
        d.court_id,
        d.pacer_case_id,
        rd.document_number,
        0,  # Attachment number is zero for all free opinions.
    )
    bucket_name = get_bucket_name(d.court_id, d.pacer_case_id)
    try:
        responses = upload_to_ia(
            identifier=bucket_name,
            files=rd.filepath_local.path,
            metadata={
                'title': best_case_name(d),
                'collection': settings.IA_COLLECTIONS,
                'contributor': '<a href="https://free.law">Free Law Project</a>',
                'court': d.court_id,
                'language': 'eng',
                'mediatype': 'texts',
                'description': "This item represents a case in PACER, "
                               "the U.S. Government's website for "
                               "federal case data. If you wish to see "
                               "the entire case, please consult PACER "
                               "directly.",
                'licenseurl': 'https://www.usa.gov/government-works',
            },
        )
    except (OverloadedException, ExpatError) as exc:
        # Overloaded: IA wants us to slow down.
        # ExpatError: The syntax of the XML file that's supposed to be returned
        #             by IA is bad (or something).
        if self.request.retries == self.max_retries:
            # Give up for now. It'll get done next time cron is run.
            return
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [
            HTTP_403_FORBIDDEN,    # Can't access bucket, typically.
            HTTP_400_BAD_REQUEST,  # Corrupt PDF, typically.
        ]:
            # Permanent failure; retrying won't help. Hand the response back.
            return [exc.response]
        if self.request.retries == self.max_retries:
            # This exception is also raised when the endpoint is overloaded, but
            # doesn't get caught in the OverloadedException handler above due to
            # multiple processes running at the same time. Just give up for now.
            return
        raise self.retry(exc=exc)
    except (requests.Timeout, requests.RequestException) as exc:
        # NOTE(review): requests.Timeout is a subclass of RequestException,
        # so listing both is redundant (but harmless).
        logger.warning("Timeout or unknown RequestException. Unable to upload "
                       "to IA. Trying again if retries not exceeded: %s" % rd)
        if self.request.retries == self.max_retries:
            # Give up for now. It'll get done next time cron is run.
            return
        raise self.retry(exc=exc)
    if all(r.ok for r in responses):
        # Every request succeeded; record the canonical IA download URL.
        rd.filepath_ia = "https://archive.org/download/%s/%s" % (
            bucket_name, file_name)
        rd.save(do_extraction=False, index=False)
Example #6
0
def upload_free_opinion_to_ia(self, rd_pk):
    """Upload a free PACER opinion's PDF to the Internet Archive.

    Derives the IA bucket and file names from the document's docket,
    uploads the locally stored PDF with descriptive metadata, and on
    success records the resulting IA URL on the document. Transient IA
    failures are retried via self.retry; permanent failures (403/400)
    return the failed response for the caller to inspect.

    :param rd_pk: Primary key of the RECAPDocument to upload.
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    d = rd.docket_entry.docket
    file_name = get_document_filename(
        d.court_id,
        d.pacer_case_id,
        rd.document_number,
        0,  # Attachment number is zero for all free opinions.
    )
    bucket_name = get_bucket_name(d.court_id, d.pacer_case_id)
    try:
        responses = upload_to_ia(
            identifier=bucket_name,
            files=rd.filepath_local.path,
            metadata={
                'title': best_case_name(d),
                'collection': settings.IA_COLLECTIONS,
                'contributor': '<a href="https://free.law">Free Law Project</a>',
                'court': d.court_id,
                'language': 'eng',
                'mediatype': 'texts',
                'description': "This item represents a case in PACER, "
                               "the U.S. Government's website for "
                               "federal case data. If you wish to see "
                               "the entire case, please consult PACER "
                               "directly.",
                'licenseurl': 'https://www.usa.gov/government-works',
            },
        )
    except (OverloadedException, ExpatError) as exc:
        # Overloaded: IA wants us to slow down.
        # ExpatError: The syntax of the XML file that's supposed to be returned
        #             by IA is bad (or something).
        if self.request.retries == self.max_retries:
            # Give up for now. It'll get done next time cron is run.
            return
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [
            HTTP_403_FORBIDDEN,    # Can't access bucket, typically.
            HTTP_400_BAD_REQUEST,  # Corrupt PDF, typically.
        ]:
            # Permanent failure; retrying won't help. Hand the response back.
            return [exc.response]
        if self.request.retries == self.max_retries:
            # This exception is also raised when the endpoint is overloaded, but
            # doesn't get caught in the OverloadedException handler above due to
            # multiple processes running at the same time. Just give up for now.
            return
        raise self.retry(exc=exc)
    except (requests.Timeout, requests.RequestException) as exc:
        # NOTE(review): requests.Timeout is a subclass of RequestException,
        # so listing both is redundant (but harmless).
        logger.warning("Timeout or unknown RequestException. Unable to upload "
                       "to IA. Trying again if retries not exceeded: %s" % rd)
        if self.request.retries == self.max_retries:
            # Give up for now. It'll get done next time cron is run.
            return
        raise self.retry(exc=exc)
    if all(r.ok for r in responses):
        # Every request succeeded; record the canonical IA download URL.
        rd.filepath_ia = "https://archive.org/download/%s/%s" % (
            bucket_name, file_name)
        rd.save(do_extraction=False, index=False)
Example #7
0
def upload_recap_json(self, pk):
    """Make a JSON object for a RECAP docket and upload it to IA

    Renders the docket (with its entries, parties, etc.) to JSON, uploads
    it to the Internet Archive, and updates the docket's IA bookkeeping
    fields based on whether the upload succeeded.

    :param pk: Primary key of the Docket to serialize and upload.
    """
    # This is a pretty highly optimized query that uses only 13 hits to the DB
    # when generating a docket JSON rendering, regardless of how many related
    # objects the docket has such as docket entries, parties, etc.
    ds = Docket.objects.filter(pk=pk).select_related(
        'originating_court_information',
    ).prefetch_related(
        'panel',
        'parties__attorneys__roles',
        'parties__party_types__criminal_complaints',
        'parties__party_types__criminal_counts',
        # Django appears to have a bug where you can't defer a field on a
        # queryset where you prefetch the values. If you try to, it crashes.
        # We should be able to just do the prefetch below like the ones above
        # and then do the defer statement at the end, but that throws an error.
        Prefetch(
            'docket_entries__recap_documents',
            queryset=RECAPDocument.objects.all().defer('plain_text')
        )
    )
    # pk filtering matches at most one row; raises IndexError if missing.
    d = ds[0]
    renderer = JSONRenderer()
    json_str = renderer.render(
        IADocketSerializer(d).data,
        accepted_media_type='application/json; indent=2',
    )

    file_name = get_docket_filename(d.court_id, d.pacer_case_id, 'json')
    bucket_name = get_bucket_name(d.court_id, d.pacer_case_id)
    responses = upload_to_ia(
        self,
        identifier=bucket_name,
        files={file_name: StringIO(json_str)},
        title=best_case_name(d),
        collection=settings.IA_COLLECTIONS,
        court_id=d.court_id,
        source_url='https://www.courtlistener.com%s' % d.get_absolute_url(),
        media_type='texts',
        description="This item represents a case in PACER, the U.S. "
                    "Government's website for federal case data. This "
                    "information is uploaded quarterly. To see our most "
                    "recent version please use the source url parameter, "
                    "linked below. To see the canonical source for this data, "
                    "please consult PACER directly.",
    )
    # None means the upload never went out; count it as a failure.
    if responses is None:
        increment_failure_count(d)
        return

    if all(r.ok for r in responses):
        # Success: reset failure bookkeeping and record the IA URL, then
        # recompute whether the docket needs another upload.
        d.ia_upload_failure_count = None
        d.ia_date_first_changed = None
        d.filepath_ia_json = "https://archive.org/download/%s/%s" % (
            bucket_name, file_name)
        mark_ia_upload_needed(d)
        d.save()
    else:
        increment_failure_count(d)
def base_recap_path(instance, filename, base_dir):
    """Make a filepath, accepting an extra parameter for the base directory

    Mirrors technique used by original RECAP server to upload PDFs to IA.
    """
    docket = instance.docket_entry.docket
    bucket = get_bucket_name(docket.court_id, docket.pacer_case_id)
    return os.path.join(base_dir, bucket, filename)
def base_recap_path(instance, filename, base_dir):
    """Make a filepath, accepting an extra parameter for the base directory

    Mirrors technique used by original RECAP server to upload PDFs to IA.
    """
    docket = instance.docket_entry.docket
    parts = [
        base_dir,
        get_bucket_name(docket.court_id, docket.pacer_case_id),
        filename,
    ]
    return os.path.join(*parts)
Example #10
0
def make_recap_pdf_path(instance, filename):
    """Make a path for storing a PACER document in RECAP.

    Mirrors technique used by original RECAP server to upload PDFs to IA.
    """
    docket = instance.docket_entry.docket
    bucket = get_bucket_name(docket.court_id, docket.pacer_case_id)
    return os.path.join("recap", bucket, filename)
Example #11
0
def make_recap_pdf_path(instance, filename):
    """Make a path for storing a PACER document in RECAP.

    Mirrors technique used by original RECAP server to upload PDFs to IA.
    """
    docket = instance.docket_entry.docket
    parts = [
        "recap",
        get_bucket_name(docket.court_id, docket.pacer_case_id),
        filename,
    ]
    return os.path.join(*parts)