Example #1
0
def lookup_and_save(new, debug=False):
    """Merge new docket info into the database.

    Start by attempting to lookup an existing Docket. If that's not found,
    create a new one. Either way, merge all the attributes of `new` into the
    Docket found, and then save the Docket.

    Returns None if an error occurs, else, return the new or updated Docket.
    """
    try:
        d = Docket.objects.get(pacer_case_id=new.pacer_case_id,
                               court=new.court)
    except (Docket.DoesNotExist, Docket.MultipleObjectsReturned):
        d = None

    if d is None:
        ds = Docket.objects.filter(docket_number=new.docket_number,
                                   court=new.court).order_by('-date_filed')
        count = ds.count()
        if count < 1:
            # Can't find it by pacer_case_id or docket_number. Make a new item.
            d = Docket(source=Docket.RECAP)
        elif count == 1:
            # Nailed it!
            d = ds[0]
        elif count > 1:
            # Too many dockets returned. Disambiguate.
            logger.error("Got multiple results while attempting save.")

            def is_different(x):
                return x.pacer_case_id and x.pacer_case_id != new.pacer_case_id

            if all([is_different(d) for d in ds]):
                # All the dockets found match on docket number, but have
                # different pacer_case_ids. This means that the docket has
                # multiple pacer_case_ids in PACER, and we should mirror that
                # in CL by creating a new docket for the new item.
                d = Docket(source=Docket.RECAP)
            else:
                # Just use the most recent docket. Looking at the data, this is
                # OK. Nearly all of these are dockets associated with clusters
                # that can be merged (however, that's a project for clusters).
                d = ds[0]

    # Add RECAP as a source if it's not already.
    if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
        d.source = Docket.RECAP_AND_SCRAPER
    elif d.source == Docket.COLUMBIA:
        d.source = Docket.COLUMBIA_AND_RECAP
    elif d.source == Docket.COLUMBIA_AND_SCRAPER:
        d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER

    for attr, v in new.__dict__.items():
        setattr(d, attr, v)

    if not debug:
        d.save()
        logger.info("Saved as Docket %s: https://www.courtlistener.com%s" %
                    (d.pk, d.get_absolute_url()))
    return d
Example #2
0
def lookup_and_save(new, debug=False):
    """Merge new docket info into the database.

    Start by attempting to lookup an existing Docket. If that's not found,
    create a new one. Either way, merge all the attributes of `new` into the
    Docket found, and then save the Docket.

    Returns None if an error occurs, else, return the new or updated Docket.
    """
    try:
        d = Docket.objects.get(pacer_case_id=new.pacer_case_id,
                               court=new.court)
    except (Docket.DoesNotExist, Docket.MultipleObjectsReturned):
        d = None

    if d is None:
        ds = Docket.objects.filter(docket_number=new.docket_number,
                                   court=new.court).order_by('-date_filed')
        count = ds.count()
        if count < 1:
            # Can't find it by pacer_case_id or docket_number. Make a new item.
            d = Docket(source=Docket.RECAP)
        elif count == 1:
            # Nailed it!
            d = ds[0]
        elif count > 1:
            # Too many dockets returned. Disambiguate.
            logger.error("Got multiple results while attempting save.")

            def is_different(x):
                return x.pacer_case_id and x.pacer_case_id != new.pacer_case_id
            if all([is_different(d) for d in ds]):
                # All the dockets found match on docket number, but have
                # different pacer_case_ids. This means that the docket has
                # multiple pacer_case_ids in PACER, and we should mirror that
                # in CL by creating a new docket for the new item.
                d = Docket(source=Docket.RECAP)
            else:
                # Just use the most recent docket. Looking at the data, this is
                # OK. Nearly all of these are dockets associated with clusters
                # that can be merged (however, that's a project for clusters).
                d = ds[0]

    # Add RECAP as a source if it's not already.
    if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
        d.source = Docket.RECAP_AND_SCRAPER
    elif d.source == Docket.COLUMBIA:
        d.source = Docket.COLUMBIA_AND_RECAP
    elif d.source == Docket.COLUMBIA_AND_SCRAPER:
        d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER

    for attr, v in new.__dict__.items():
        setattr(d, attr, v)

    if not debug:
        d.save()
        logger.info("Saved as Docket %s: https://www.courtlistener.com%s" %
                    (d.pk, d.get_absolute_url()))
    return d
Example #3
0
def process_recap_docket(self, pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work on.
    :returns: A dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or recap document
            // was created (implying a Solr needs updating).
            'needs_solr_update': True,
        }

    This value is a dict so that it can be ingested in a Celery chain.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')

    if 'History/Documents' in text:
        # Prior to 1.1.8, we did not separate docket history reports into their
        # own upload_type. Alas, we still have some old clients around, so we
        # need to handle those clients here.
        pq.upload_type = pq.DOCKET_HISTORY_REPORT
        pq.save()
        process_recap_docket_history_report(pk)
        self.request.callbacks = None
        return None

    report._parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if docket_data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, pq.INVALID_CONTENT)
        self.request.callbacks = None
        return None

    # Merge the contents of the docket into CL. Attempt several lookups of
    # decreasing specificity. Note that pacer_case_id is required for Docket
    # uploads.
    d = None
    for kwargs in [{'pacer_case_id': pq.pacer_case_id,
                    'docket_number': docket_data['docket_number']},
                   {'pacer_case_id': pq.pacer_case_id},
                   {'docket_number': docket_data['docket_number'],
                    'pacer_case_id': None}]:
        try:
            d = Docket.objects.get(court_id=pq.court_id, **kwargs)
            break
        except Docket.DoesNotExist:
            continue
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to look up '%s'" % pq
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            self.request.callbacks = None
            return None

    if d is None:
        # Couldn't find it. Make a new one.
        d = Docket(
            source=Docket.RECAP,
            pacer_case_id=pq.pacer_case_id,
            court_id=pq.court_id
        )

    # Add RECAP as a source if it's not already.
    if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
        d.source = Docket.RECAP_AND_SCRAPER
    elif d.source == Docket.COLUMBIA:
        d.source = Docket.COLUMBIA_AND_RECAP
    elif d.source == Docket.COLUMBIA_AND_SCRAPER:
        d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER

    update_docket_metadata(d, docket_data)

    if pq.debug:
        mark_pq_successful(pq, d_id=d.pk)
        self.request.callbacks = None
        return {'docket_pk': d.pk, 'needs_solr_update': False}

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    # Docket entries & documents
    rds_created = []
    needs_solr_update = False
    for docket_entry in docket_data['docket_entries']:
        try:
            de, de_created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                }
            )
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq)
            )
            continue
        if de_created:
            needs_solr_update = True

        # Then make the RECAPDocument object. Try to find it. If we do, update
        # the pacer_doc_id field if it's blank. If we can't find it, create it
        # or throw an error.
        params = {
            'docket_entry': de,
            # No attachments when uploading dockets.
            'document_type': RECAPDocument.PACER_DOCUMENT,
            'document_number': docket_entry['document_number'],
        }
        try:
            rd = RECAPDocument.objects.get(**params)
        except RECAPDocument.DoesNotExist:
            rd = RECAPDocument.objects.create(
                pacer_doc_id=docket_entry['pacer_doc_id'],
                is_available=False,
                **params
            )
            rds_created.append(rd)
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry number'%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq)
            )
            continue
        else:
            rd.pacer_doc_id = rd.pacer_doc_id or pq.pacer_doc_id

    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    mark_pq_successful(pq, d_id=d.pk)
    return {
        'docket_pk': d.pk,
        'needs_solr_update': bool(rds_created or needs_solr_update),
    }
Example #4
0
def process_recap_docket(pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work on.
    :return: The docket that's created or updated.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')
    report._parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if docket_data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, pq.INVALID_CONTENT)
        return None

    # Merge the contents of the docket into CL. Attempt several lookups of
    # decreasing specificity.
    d = None
    for kwargs in [{
            'pacer_case_id': pq.pacer_case_id,
            'docket_number': docket_data['docket_number']
    }, {
            'pacer_case_id': pq.pacer_case_id
    }, {
            'docket_number': docket_data['docket_number']
    }]:
        try:
            d = Docket.objects.get(court_id=pq.court_id, **kwargs)
            break
        except Docket.DoesNotExist:
            continue
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to look up '%s'" % pq
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            return None

    if d is None:
        # Couldn't find it. Make a new one.
        d = Docket(source=Docket.RECAP,
                   pacer_case_id=pq.pacer_case_id,
                   court_id=pq.court_id)

    # Add RECAP as a source if it's not already.
    if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
        d.source = Docket.RECAP_AND_SCRAPER
    elif d.source == Docket.COLUMBIA:
        d.source = Docket.COLUMBIA_AND_RECAP
    elif d.source == Docket.COLUMBIA_AND_SCRAPER:
        d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER

    update_docket_metadata(d, docket_data)

    if pq.debug:
        mark_pq_successful(pq, d_id=d.pk)
        return d

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    # Docket entries
    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                })
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s'" %
                (docket_entry['document_number'], pq))
            continue

        # Then make the RECAPDocument object. Try to find it. If we do, update
        # the pacer_doc_id field if it's blank. If we can't find it, create it
        # or throw an error.
        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            try:
                RECAPDocument.objects.create(
                    docket_entry=de,
                    # No attachments when uploading dockets.
                    document_type=RECAPDocument.PACER_DOCUMENT,
                    document_number=docket_entry['document_number'],
                    pacer_doc_id=docket_entry['pacer_doc_id'],
                    is_available=False,
                )
            except IntegrityError:
                logger.warn(
                    "Creating new document with pacer_doc_id of '%s' violates "
                    "unique constraint on pacer_doc_id field." %
                    docket_entry['pacer_doc_id'])
                continue
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry number'%s' "
                "while processing '%s'" %
                (docket_entry['document_number'], pq))
            continue
        else:
            rd.pacer_doc_id = rd.pacer_doc_id or pq.pacer_doc_id

    add_parties_and_attorneys(d, docket_data['parties'])
    mark_pq_successful(pq, d_id=d.pk)
    return d