Esempio n. 1
0
def fetch_docket_by_pacer_case_id(
    session, court_id, pacer_case_id, fq,
):
    """Query PACER for a docket report and merge the result into CL.

    :param session: A PacerSession object to work with
    :param court_id: The CL ID of the court
    :param pacer_case_id: The pacer_case_id of the docket, if known
    :param fq: The PacerFetchQueue object
    :return: a dict with the pk of the docket ("docket_pk") and a boolean
    ("content_updated") telling whether the merge created or updated content
    :raises ParsingException: if the docket report contains no data
    """
    pacer_court_id = map_cl_to_pacer_id(court_id)
    docket_report = DocketReport(pacer_court_id, session)
    query_kwargs = get_fq_docket_kwargs(fq)
    docket_report.query(pacer_case_id, **query_kwargs)

    parsed = docket_report.data
    if not parsed:
        raise ParsingException("No data found in docket report.")

    if not fq.docket_id:
        # No docket was pinned on the fetch queue item, so look one up.
        docket, match_count = find_docket_object(
            court_id, pacer_case_id, parsed["docket_number"]
        )
        if match_count > 1:
            # Several candidates matched; keep the one created first.
            docket = docket.earliest("date_created")
    else:
        docket = Docket.objects.get(pk=fq.docket_id)

    new_rds, changed = merge_pacer_docket_into_cl_docket(
        docket, pacer_case_id, parsed, docket_report, appellate=False,
    )
    return {
        "docket_pk": docket.pk,
        "content_updated": bool(new_rds or changed),
    }
Esempio n. 2
0
def reprocess_docket_data(d, filepath, report_type):
    """Reprocess docket data that we already have.

    :param d: A docket object to work on.
    :param filepath: The path to a saved HTML file containing docket or docket
    history report data, encoded as UTF-8.
    :param report_type: Whether it's a docket or a docket history report.
    :return: The pk of the docket, or None if the parser returned no data.
    :raises NotImplementedError: If report_type is not one of the supported
    report types.
    """
    from cl.recap.tasks import update_docket_metadata, add_docket_entries, \
        add_parties_and_attorneys
    if report_type == DOCKET:
        report = DocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(map_cl_to_pacer_id(d.court_id))
    else:
        # Fail with a clear message instead of an UnboundLocalError on
        # `report` further down.
        raise NotImplementedError("The report type with id '%s' is not yet "
                                  "supported. Perhaps you need to add it?" %
                                  report_type)
    # Read bytes and decode explicitly; a text-mode read followed by
    # .decode() only works on Python 2.
    with open(filepath, 'rb') as f:
        text = f.read().decode('utf-8')
    report._parse_text(text)
    data = report.data
    if data == {}:
        # The parser did not recognize the file as a valid report.
        return None
    update_docket_metadata(d, data)
    d.save()
    add_docket_entries(d, data['docket_entries'])
    if report_type == DOCKET:
        # Parties are only merged for full docket reports.
        add_parties_and_attorneys(d, data['parties'])
    return d.pk
Esempio n. 3
0
def process_docket_data(d, filepath, report_type):
    """Process docket data file.

    Parses the saved report at ``filepath`` with the parser that matches
    ``report_type`` and merges the parsed data into the docket.

    :param d: A docket object to work on.
    :param filepath: The path to a saved file containing the report data,
    encoded as UTF-8.
    :param report_type: The UPLOAD_TYPE value describing the saved report.
    :return: The pk of the docket, or None if the parser returned no data.
    :raises NotImplementedError: If report_type has no parser assigned here.
    """
    from cl.recap.mergers import (
        add_docket_entries,
        add_parties_and_attorneys,
        update_docket_appellate_metadata,
        update_docket_metadata,
        add_bankruptcy_data_to_docket,
        add_claims_to_docket,
    )

    court_id = map_cl_to_pacer_id(d.court_id)
    if report_type == UPLOAD_TYPE.DOCKET:
        report = DocketReport(court_id)
    elif report_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(court_id)
    elif report_type == UPLOAD_TYPE.APPELLATE_DOCKET:
        report = AppellateDocketReport(court_id)
    elif report_type == UPLOAD_TYPE.IA_XML_FILE:
        report = InternetArchive(court_id)
    elif report_type == UPLOAD_TYPE.CASE_REPORT_PAGE:
        report = CaseQuery(court_id)
    elif report_type == UPLOAD_TYPE.CLAIMS_REGISTER:
        report = ClaimsRegister(court_id)
    else:
        raise NotImplementedError("The report type with id '%s' is not yet "
                                  "supported. Perhaps you need to add it?" %
                                  report_type)
    # Read bytes and decode explicitly; a text-mode read followed by
    # .decode() only works on Python 2.
    with open(filepath, "rb") as f:
        text = f.read().decode("utf-8")
    report._parse_text(text)
    data = report.data
    if data == {}:
        # The parser did not recognize the file as a valid report.
        return None

    if report_type == UPLOAD_TYPE.CLAIMS_REGISTER:
        add_bankruptcy_data_to_docket(d, data)
        add_claims_to_docket(d, data["claims"])
    else:
        update_docket_metadata(d, data)
        d, og_info = update_docket_appellate_metadata(d, data)
        if og_info is not None:
            # Save the originating court info before attaching it.
            og_info.save()
            d.originating_court_information = og_info
        d.save()
        if data.get("docket_entries"):
            add_docket_entries(d, data["docket_entries"])
    if report_type in (
            UPLOAD_TYPE.DOCKET,
            UPLOAD_TYPE.APPELLATE_DOCKET,
            UPLOAD_TYPE.IA_XML_FILE,
    ):
        # Only these report types are merged for parties here.
        add_parties_and_attorneys(d, data["parties"])
    return d.pk
Esempio n. 4
0
def process_docket_data(d, filepath, report_type):
    """Process docket data file.

    :param d: A docket object to work on.
    :param filepath: The path to a saved file containing the report data,
    encoded as UTF-8.
    :param report_type: The UPLOAD_TYPE value describing the saved report.
    :return: The pk of the docket, or None if the parser returned no data.
    :raises NotImplementedError: If report_type has no parser assigned here.
    """
    from cl.recap.tasks import update_docket_metadata, add_docket_entries, \
        add_parties_and_attorneys, update_docket_appellate_metadata
    if report_type == UPLOAD_TYPE.DOCKET:
        report = DocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.APPELLATE_DOCKET:
        report = AppellateDocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.IA_XML_FILE:
        report = InternetArchive()
    else:
        # Fail with a clear message instead of an UnboundLocalError on
        # `report` further down.
        raise NotImplementedError("The report type with id '%s' is not yet "
                                  "supported. Perhaps you need to add it?" %
                                  report_type)
    # Read bytes and decode explicitly; a text-mode read followed by
    # .decode() only works on Python 2.
    with open(filepath, 'rb') as f:
        text = f.read().decode('utf-8')
    report._parse_text(text)
    data = report.data
    if data == {}:
        # The parser did not recognize the file as a valid report.
        return None
    update_docket_metadata(d, data)
    d, og_info = update_docket_appellate_metadata(d, data)
    if og_info is not None:
        # Save the originating court info before attaching it.
        og_info.save()
        d.originating_court_information = og_info
    d.save()
    add_docket_entries(d, data['docket_entries'])
    if report_type in (UPLOAD_TYPE.DOCKET, UPLOAD_TYPE.APPELLATE_DOCKET,
                       UPLOAD_TYPE.IA_XML_FILE):
        # Only these report types are merged for parties here.
        add_parties_and_attorneys(d, data['parties'])
    return d.pk
Esempio n. 5
0
    def get_docket_json(self):
        """Fetch PACER dockets for pending "fjc" rows and save them to disk.

        Every row of the "fjc" table in db/master.json that matches the
        pending query (PACER_CASE_ID not equal to "" and JSON equal to
        "False") is queried through DocketReport. The parsed data is written
        as JSON, the raw response text as HTML, and the row is then flagged
        as done together with the pacer_doc_id of its first docket entry.

        :return: None
        """
        query = Query()
        db = TinyDB("db/master.json")
        fjc_table = db.table("fjc")

        # The same report options are used for every case.
        report_options = {
            "show_parties_and_counsel": True,
            "show_terminated_parties": True,
            "show_list_of_member_cases": True,
            "include_pdf_headers": True,
            "show_multiple_docs": False,
        }

        pending_rows = fjc_table.search(
            ~(query.PACER_CASE_ID == "") & (query.JSON == "False")
        )
        for row in pending_rows:
            report = DocketReport(row["COURT"], self.s)
            case_id = row["PACER_CASE_ID"]
            report.query(case_id, **report_options)

            json_path = "downloads/json/pacer_docket_%s.json" % case_id
            with open(json_path, "w") as json_out:
                json.dump(
                    report.data, json_out, indent=4, sort_keys=True, default=str
                )

            html_path = "downloads/html/pacer_docket_%s.html" % case_id
            with open(html_path, "w") as html_out:
                html_out.write(report.response.text)

            # NOTE(review): indexing [0] raises if the docket has no entries,
            # leaving the row flagged as not done - confirm that is intended.
            first_entry = report.data["docket_entries"][0]
            fjc_table.update(
                {"JSON": "True", "pacer_doc_id": first_entry["pacer_doc_id"]},
                doc_ids=[row.doc_id],
            )

        logging.info("Finished collecting JSON and HTML")
Esempio n. 6
0
def process_docket_data(d, filepath, report_type):
    """Process docket data file.

    :param d: A docket object to work on.
    :param filepath: The path to a saved file containing the report data,
    encoded as UTF-8.
    :param report_type: The UPLOAD_TYPE value describing the saved report.
    :return: The pk of the docket, or None if the parser returned no data.
    :raises NotImplementedError: If report_type has no parser assigned here.
    """
    from cl.recap.tasks import update_docket_metadata, add_docket_entries, \
        add_parties_and_attorneys, update_docket_appellate_metadata
    if report_type == UPLOAD_TYPE.DOCKET:
        report = DocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.APPELLATE_DOCKET:
        report = AppellateDocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.IA_XML_FILE:
        report = InternetArchive()
    elif report_type == UPLOAD_TYPE.CASE_REPORT_PAGE:
        report = CaseQuery(map_cl_to_pacer_id(d.court_id))
    else:
        # Fail with a clear message instead of an UnboundLocalError on
        # `report` further down.
        raise NotImplementedError("The report type with id '%s' is not yet "
                                  "supported. Perhaps you need to add it?" %
                                  report_type)
    # Read bytes and decode explicitly; a text-mode read followed by
    # .decode() only works on Python 2.
    with open(filepath, 'rb') as f:
        text = f.read().decode('utf-8')
    report._parse_text(text)
    data = report.data
    if data == {}:
        # The parser did not recognize the file as a valid report.
        return None
    update_docket_metadata(d, data)
    d, og_info = update_docket_appellate_metadata(d, data)
    if og_info is not None:
        # Save the originating court info before attaching it.
        og_info.save()
        d.originating_court_information = og_info
    d.save()
    if data.get('docket_entries'):
        add_docket_entries(d, data['docket_entries'])
    if report_type in (UPLOAD_TYPE.DOCKET, UPLOAD_TYPE.APPELLATE_DOCKET,
                       UPLOAD_TYPE.IA_XML_FILE):
        # Only these report types are merged for parties here.
        add_parties_and_attorneys(d, data['parties'])
    return d.pk
Esempio n. 7
0
    def run_parsers_on_path(self, path_root,
                            required_fields=(
                                'date_filed', 'case_name', 'docket_number')):
        """Test all the parsers, faking the network query.

        Walks path_root for ``*.html`` fixtures, parses each one with
        DocketReport and compares the result with the JSON file that has the
        same base name. If that JSON file does not exist it is created from
        the parsed data and the comparison is skipped for that fixture.

        :param path_root: Directory searched recursively for fixtures.
        :param required_fields: Keys that must hold a truthy value in every
        non-empty parsed docket. The default is an immutable tuple so it can
        never be shared and mutated across calls.
        """
        paths = []
        for root, _dirnames, filenames in os.walk(path_root):
            for filename in fnmatch.filter(filenames, '*.html'):
                paths.append(os.path.join(root, filename))
        paths.sort()
        # NOTE: max() raises ValueError when no fixtures were found.
        path_max_len = max(len(path) for path in paths) + 2
        for i, path in enumerate(paths):

            sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
            t1 = time.time()
            dirname, filename = os.path.split(path)
            filename_sans_ext = filename.split('.')[0]
            json_path = os.path.join(dirname, '%s.json' % filename_sans_ext)
            # The court id is the part of the file name before the first "_".
            court = filename_sans_ext.split('_')[0]

            report = DocketReport(court)
            # Read bytes and decode explicitly; a text-mode read followed by
            # .decode() only works on Python 2.
            with open(path, 'rb') as f:
                report._parse_text(f.read().decode('utf-8'))
            data = report.data

            if data != {}:
                # If the docket is a valid docket, make sure some required
                # fields are populated.
                for field in required_fields:
                    self.assertTrue(
                        data[field],
                        msg="Unable to find truthy value for field %s" % field,
                    )

                self.assertEqual(data['court_id'], court)

                # Party-specific tests...
                for party in data['parties']:
                    self.assertTrue(
                        party.get('name', False),
                        msg="Every party must have a name attribute. Did not "
                            "get a value for:\n\n%s" % party
                    )
                    # Protect against effed up adversary proceedings cases that
                    # don't parse properly. See: cacb, 2:08-ap-01570-BB
                    self.assertNotIn('----', party['name'])

            if not os.path.isfile(json_path):
                bar = "*" * 50
                print("\n\n%s\nJSON FILE DID NOT EXIST. CREATING IT AT:"
                      "\n\n  %s\n\n"
                      "Please test the data in this file before assuming "
                      "everything worked.\n%s\n" % (bar, json_path, bar))
                with open(json_path, 'w') as f:
                    json.dump(data, f, indent=2, sort_keys=True)
                # Nothing to compare against yet; move to the next fixture.
                continue

            with open(json_path) as f:
                j = json.load(f)
                if j != {}:
                    # Compare docket entries and parties first, for easier
                    # debugging, then compare whole objects to be sure.
                    self.assertEqual(j['docket_entries'], data['docket_entries'])
                    self.assertEqual(j['parties'], data['parties'])
                self.assertEqual(j, data)
            t2 = time.time()

            duration = t2 - t1
            warn_or_crash_slow_parser(duration, max_duration=1)
            sys.stdout.write("✓ - %0.1fs\n" % (t2-t1))
Esempio n. 8
0
def process_recap_docket(self, pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work on.
    :returns: A dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or recap document
            // was created (implying Solr needs updating).
            'needs_solr_update': True,
        }

    This value is a dict so that it can be ingested in a Celery chain. None is
    returned, and the callbacks are cleared, when the upload is handed off
    as a docket history report, is not a valid docket, or matches more than
    one docket.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')

    if 'History/Documents' in text:
        # Prior to 1.1.8, we did not separate docket history reports into their
        # own upload_type. Alas, we still have some old clients around, so we
        # need to handle those clients here.
        pq.upload_type = pq.DOCKET_HISTORY_REPORT
        pq.save()
        process_recap_docket_history_report(pk)
        self.request.callbacks = None
        return None

    report._parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if docket_data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, pq.INVALID_CONTENT)
        self.request.callbacks = None
        return None

    # Merge the contents of the docket into CL. Attempt several lookups of
    # decreasing specificity. Note that pacer_case_id is required for Docket
    # uploads.
    d = None
    for kwargs in [{'pacer_case_id': pq.pacer_case_id,
                    'docket_number': docket_data['docket_number']},
                   {'pacer_case_id': pq.pacer_case_id},
                   {'docket_number': docket_data['docket_number'],
                    'pacer_case_id': None}]:
        try:
            d = Docket.objects.get(court_id=pq.court_id, **kwargs)
            break
        except Docket.DoesNotExist:
            continue
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to look up '%s'" % pq
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            self.request.callbacks = None
            return None

    if d is None:
        # Couldn't find it. Make a new one.
        d = Docket(
            source=Docket.RECAP,
            pacer_case_id=pq.pacer_case_id,
            court_id=pq.court_id
        )

    # Add RECAP as a source if it's not already.
    if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
        d.source = Docket.RECAP_AND_SCRAPER
    elif d.source == Docket.COLUMBIA:
        d.source = Docket.COLUMBIA_AND_RECAP
    elif d.source == Docket.COLUMBIA_AND_SCRAPER:
        d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER

    update_docket_metadata(d, docket_data)

    if pq.debug:
        # Debug uploads stop here, before anything is saved.
        mark_pq_successful(pq, d_id=d.pk)
        self.request.callbacks = None
        return {'docket_pk': d.pk, 'needs_solr_update': False}

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    # Docket entries & documents
    rds_created = []
    needs_solr_update = False
    for docket_entry in docket_data['docket_entries']:
        try:
            de, de_created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                }
            )
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq)
            )
            continue
        if de_created:
            needs_solr_update = True

        # Then make the RECAPDocument object. Try to find it. If we do, update
        # the pacer_doc_id field if it's blank. If we can't find it, create it
        # or throw an error.
        params = {
            'docket_entry': de,
            # No attachments when uploading dockets.
            'document_type': RECAPDocument.PACER_DOCUMENT,
            'document_number': docket_entry['document_number'],
        }
        try:
            rd = RECAPDocument.objects.get(**params)
        except RECAPDocument.DoesNotExist:
            rd = RECAPDocument.objects.create(
                pacer_doc_id=docket_entry['pacer_doc_id'],
                is_available=False,
                **params
            )
            rds_created.append(rd)
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry number'%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq)
            )
            continue
        else:
            # The document already exists. Fill in a blank pacer_doc_id from
            # this entry (the same value the create branch uses) and persist
            # it; assigning without saving would silently lose the update.
            if not rd.pacer_doc_id and docket_entry['pacer_doc_id']:
                rd.pacer_doc_id = docket_entry['pacer_doc_id']
                rd.save()

    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    mark_pq_successful(pq, d_id=d.pk)
    return {
        'docket_pk': d.pk,
        'needs_solr_update': bool(rds_created or needs_solr_update),
    }
Esempio n. 9
0
    def run_parsers_on_path(
            self,
            path_root,
            required_fields=('date_filed', 'case_name', 'docket_number')):
        """Test all the parsers, faking the network query.

        Walks path_root for ``*.html`` fixtures, parses each one with
        DocketReport and compares the result with the JSON file that has the
        same base name. If that JSON file does not exist it is created from
        the parsed data and the comparison is skipped for that fixture.

        :param path_root: Directory searched recursively for fixtures.
        :param required_fields: Keys that must hold a truthy value in every
        non-empty parsed docket. The default is an immutable tuple so it can
        never be shared and mutated across calls.
        :raises SlownessException: If parsing and checking one fixture takes
        longer than the maximum duration, unless a trace function is active
        or IS_TRAVIS is set.
        """
        paths = []
        for root, _dirnames, filenames in os.walk(path_root):
            for filename in fnmatch.filter(filenames, '*.html'):
                paths.append(os.path.join(root, filename))
        paths.sort()
        # NOTE: max() raises ValueError when no fixtures were found.
        path_max_len = max(len(path) for path in paths) + 2
        for i, path in enumerate(paths):
            sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
            t1 = time.time()
            dirname, filename = os.path.split(path)
            filename_sans_ext = filename.split('.')[0]
            json_path = os.path.join(dirname, '%s.json' % filename_sans_ext)
            # The court id is the part of the file name before the first "_".
            court = filename_sans_ext.split('_')[0]

            report = DocketReport(court)
            # Read bytes and decode explicitly; a text-mode read followed by
            # .decode() only works on Python 2.
            with open(path, 'rb') as f:
                report._parse_text(f.read().decode('utf-8'))
            data = report.data

            if data != {}:
                # If the docket is a valid docket, make sure some required
                # fields are populated.
                for field in required_fields:
                    self.assertTrue(
                        data[field],
                        msg="Unable to find truthy value for field %s" % field,
                    )

                self.assertEqual(data['court_id'], court)

                # Party-specific tests...
                for party in data['parties']:
                    self.assertTrue(
                        party.get('name', False),
                        msg="Every party must have a name attribute. Did not "
                        "get a value for:\n\n%s" % party)
                    # Protect against effed up adversary proceedings cases that
                    # don't parse properly. See: cacb, 2:08-ap-01570-BB
                    self.assertNotIn('----', party['name'])

            if not os.path.isfile(json_path):
                bar = "*" * 50
                print("\n\n%s\nJSON FILE DID NOT EXIST. CREATING IT AT:"
                      "\n\n  %s\n\n"
                      "Please test the data in this file before assuming "
                      "everything worked.\n%s\n" % (bar, json_path, bar))
                with open(json_path, 'w') as f:
                    json.dump(data, f, indent=2, sort_keys=True)
                # Nothing to compare against yet; move to the next fixture.
                continue

            with open(json_path) as f:
                j = json.load(f)
                if j != {}:
                    # Compare docket entries and parties first, for easier
                    # debugging, then compare whole objects to be sure.
                    self.assertEqual(j['docket_entries'],
                                     data['docket_entries'])
                    self.assertEqual(j['parties'], data['parties'])
                self.assertEqual(j, data)
            t2 = time.time()

            max_duration = 1
            duration = t2 - t1
            if duration > max_duration:
                if sys.gettrace() is None and not IS_TRAVIS:
                    # Don't do this if we're debugging.
                    raise SlownessException(
                        "The parser for '{fn}' took {duration}s to test, "
                        "which is more than the maximum allowed duration of "
                        "{max_duration}s.".format(
                            fn=filename,
                            duration=duration,
                            max_duration=max_duration,
                        ))
            sys.stdout.write("✓ - %0.1fs\n" % (t2 - t1))
Esempio n. 10
0
 def setUpClass(cls):
     """Attach a DocketReport and a PACER case id to the class."""
     cls.report = DocketReport(
         'psc',
         PacerSession(username='******', password='******'),
     )
     # Case: 1:07-cr-00001-RJA-HKS USA v. Green
     cls.pacer_case_id = '62866'
Esempio n. 11
0
#!/usr/bin/env python
#
#  Takes an .html file on the command line, parses it using the PACER
#  Docket Report parser, and outputs json to stdout.

import jsondate3 as json
import sys

from juriscraper.pacer.http import PacerSession
from juriscraper.pacer import DocketReport

pacer_session = PacerSession(username="******", password="******")
report = DocketReport("psc", pacer_session)

# Parse every file named on the command line and dump its data as JSON.
for path in sys.argv[1:]:
    # Read bytes and decode explicitly; a text-mode read followed by
    # .decode() only works on Python 2.
    with open(path, "rb") as f:
        report._parse_text(f.read().decode("utf-8"))
    data = report.data
    # Call print as a function with a single argument so the line is valid
    # and prints the same text on both Python 2 and Python 3.
    print(json.dumps(data, indent=2, sort_keys=True, separators=(",", ": ")))
Esempio n. 12
0
def process_recap_docket(pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work on.
    :return: The docket that was created or updated, or None if more than one
    existing docket matched the upload. In debug mode the docket is returned
    without being saved.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    pq.status = pq.PROCESSING_IN_PROGRESS
    pq.save()
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')
    report._parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed of item %s" % pq)

    # Merge the contents of the docket into CL
    try:
        d = Docket.objects.get(
            Q(pacer_case_id=pq.pacer_case_id) |
            Q(docket_number=docket_data['docket_number']),
            court_id=pq.court_id,
        )
        # Add RECAP as a source if it's not already.
        if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
            d.source = Docket.RECAP_AND_SCRAPER
        elif d.source == Docket.COLUMBIA:
            d.source = Docket.COLUMBIA_AND_RECAP
        elif d.source == Docket.COLUMBIA_AND_SCRAPER:
            d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER
    except Docket.DoesNotExist:
        d = Docket(
            source=Docket.RECAP,
            pacer_case_id=pq.pacer_case_id,
            court_id=pq.court_id
        )
    except Docket.MultipleObjectsReturned:
        msg = "Too many dockets found when trying to look up '%s'" % pq
        logger.error(msg)
        pq.error_message = msg
        pq.status = pq.PROCESSING_FAILED
        pq.save()
        return None

    update_docket_metadata(d, docket_data)

    if pq.debug:
        # Debug uploads stop here, before anything is saved.
        mark_pq_successful(pq, d_id=d.pk)
        return d

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    # Docket entries
    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                }
            )
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq)
            )
            continue

        # Then make the RECAPDocument object. Try to find it. If we do, update
        # the pacer_doc_id field if it's blank. If we can't find it, create it
        # or throw an error.
        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            RECAPDocument.objects.create(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
                pacer_doc_id=docket_entry['pacer_doc_id'],
                is_available=False,
            )
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry number'%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq)
            )
            continue
        else:
            # The document already exists. Fill in a blank pacer_doc_id from
            # this entry (the same value the create branch uses) and persist
            # it; assigning without saving would silently lose the update.
            if not rd.pacer_doc_id and docket_entry['pacer_doc_id']:
                rd.pacer_doc_id = docket_entry['pacer_doc_id']
                rd.save()

    add_parties_and_attorneys(d, docket_data['parties'])
    mark_pq_successful(pq, d_id=d.pk)
    return d
Esempio n. 13
0
def get_docket_by_pacer_case_id(self,
                                data,
                                court_id,
                                cookies,
                                tag_names=None,
                                **kwargs):
    """Download a docket from PACER by case id and merge it into the DB.

    Any extra keyword arguments are forwarded to DocketReport.query(); see
    that method for the parameters it accepts.

    :param data: A dict containing:
        Required: 'pacer_case_id': The internal case ID of the item in PACER.
        Optional: 'docket_pk': The ID of the docket to work on to avoid lookups
                  if it's known in advance.
    :param court_id: A courtlistener court ID.
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a
    logged-in PACER user.
    :param tag_names: A list of tag names that should be stored with the item
    in the DB.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    :return: A dict with the docket pk and whether content was updated, or
    None (with the callbacks cleared) if data is None or the report is empty.
    """
    session = PacerSession(cookies=cookies)
    if data is None:
        logger.info("Empty data argument. Terminating chains and exiting.")
        self.request.callbacks = None
        return

    pacer_case_id = data.get('pacer_case_id')
    report = DocketReport(map_cl_to_pacer_id(court_id), session)
    logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id))

    # Find the docket we already have, if any.
    known_pk = data.get('docket_pk')
    if known_pk is None:
        try:
            docket = Docket.objects.get(
                pacer_case_id=pacer_case_id,
                court_id=court_id,
            )
        except (Docket.DoesNotExist, Docket.MultipleObjectsReturned):
            # No unique match; decide after the report has been parsed.
            docket = None
    else:
        docket = Docket.objects.get(pk=known_pk)

    if docket is not None:
        first_missing_id = get_first_missing_de_number(docket)
        if first_missing_id > 1 and 'doc_num_start' not in kwargs:
            # We don't have to get the whole thing!
            kwargs['doc_num_start'] = first_missing_id

    report.query(pacer_case_id, **kwargs)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s" %
                (court_id, pacer_case_id))

    if not docket_data:
        logger.info("No valid docket data for %s.%s", court_id, pacer_case_id)
        self.request.callbacks = None
        return

    # Merge the contents into CL.
    if docket is None:
        docket, match_count = find_docket_object(
            court_id, pacer_case_id, docket_data['docket_number'])
        if match_count > 1:
            # Several candidates matched; keep the one created first.
            docket = docket.earliest('date_created')

    add_recap_source(docket)
    update_docket_metadata(docket, docket_data)
    docket.save()

    tags = []
    if tag_names is not None:
        for tag_name in tag_names:
            tag = Tag.objects.get_or_create(name=tag_name)[0]
            tag.tag_object(docket)
            tags.append(tag)

    # Add the HTML to the docket in case we need it someday.
    html_content = ContentFile(report.response.text)
    pacer_file = PacerHtmlFiles(content_object=docket,
                                upload_type=UPLOAD_TYPE.DOCKET)
    # We only care about the ext w/UUIDFileSystemStorage
    pacer_file.filepath.save('docket.html', html_content)

    rds_created, content_updated = add_docket_entries(
        docket, docket_data['docket_entries'], tags=tags)
    add_parties_and_attorneys(docket, docket_data['parties'])
    process_orphan_documents(rds_created, docket.court_id, docket.date_filed)
    logger.info("Created/updated docket: %s" % docket)
    return {
        'docket_pk': docket.pk,
        'content_updated': bool(rds_created or content_updated),
    }
Esempio n. 14
0
def process_docket_data(
    d: Docket,
    report_type: int,
    filepath: Optional[str] = None,
) -> Optional[int]:
    """Process docket data file.

    Parses the report with the parser that matches ``report_type`` and merges
    the parsed data into the docket.

    :param d: A docket object to work on.
    :param report_type: The UPLOAD_TYPE value describing the report.
    :param filepath: A local path where the item can be found. If not provided,
    the filepath_local field of the docket object will be attempted.
    :return: The pk of the docket, or None if the parser returned no data.
    :raises NotImplementedError: If report_type has no parser assigned here.
    """
    from cl.recap.mergers import (
        add_bankruptcy_data_to_docket,
        add_claims_to_docket,
        add_docket_entries,
        add_parties_and_attorneys,
        update_docket_appellate_metadata,
        update_docket_metadata,
    )

    court_id = map_cl_to_pacer_id(d.court_id)
    if report_type == UPLOAD_TYPE.DOCKET:
        report = DocketReport(court_id)
    elif report_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(court_id)
    elif report_type == UPLOAD_TYPE.APPELLATE_DOCKET:
        report = AppellateDocketReport(court_id)
    elif report_type == UPLOAD_TYPE.IA_XML_FILE:
        report = InternetArchive(court_id)
    elif report_type == UPLOAD_TYPE.CASE_REPORT_PAGE:
        report = CaseQuery(court_id)
    elif report_type == UPLOAD_TYPE.CLAIMS_REGISTER:
        report = ClaimsRegister(court_id)
    else:
        raise NotImplementedError(
            "The report type with id '%s' is not yet "
            "supported. Perhaps you need to add it?" % report_type
        )

    if filepath:
        # Decode as UTF-8 explicitly so the local read does not depend on the
        # platform's default encoding and matches the remote branch below.
        with open(filepath, "r", encoding="utf-8") as f:
            text = f.read()
    else:
        # This is an S3 path, so get it remotely.
        text = d.filepath_local.read().decode()

    report._parse_text(text)
    data = report.data
    if data == {}:
        # The parser did not recognize the file as a valid report.
        return None

    if report_type == UPLOAD_TYPE.CLAIMS_REGISTER:
        add_bankruptcy_data_to_docket(d, data)
        add_claims_to_docket(d, data["claims"])
    else:
        update_docket_metadata(d, data)
        d, og_info = update_docket_appellate_metadata(d, data)
        if og_info is not None:
            # Save the originating court info before attaching it.
            og_info.save()
            d.originating_court_information = og_info
        d.save()
        if data.get("docket_entries"):
            add_docket_entries(d, data["docket_entries"])
    if report_type in (
        UPLOAD_TYPE.DOCKET,
        UPLOAD_TYPE.APPELLATE_DOCKET,
        UPLOAD_TYPE.IA_XML_FILE,
    ):
        # Only these report types are merged for parties here.
        add_parties_and_attorneys(d, data["parties"])
    return d.pk
Esempio n. 15
0
def process_recap_docket(pk):
    """Parse an uploaded docket and merge its contents into the database.

    The docket, its docket entries, their RECAP documents, and the parties
    (with their attorneys) are created or updated from the parsed upload.

    :param pk: The primary key of the ProcessingQueue item to work on.
    :return: The Docket that was created or updated, or None when the lookup
    matched more than one existing docket.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    pq.status = pq.PROCESSING_IN_PROGRESS
    pq.save()
    logger.info("Processing RECAP item: %s" % pq)

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')
    report.parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed of item %s" % pq)

    # Merge the contents of the docket into CL
    try:
        d = Docket.objects.get(
            Q(pacer_case_id=pq.pacer_case_id) |
            Q(docket_number=docket_data['docket_number']),
            court_id=pq.court_id,
        )
        # Add RECAP as a source if it's not already.
        if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
            d.source = Docket.RECAP_AND_SCRAPER
        elif d.source == Docket.COLUMBIA:
            d.source = Docket.COLUMBIA_AND_RECAP
        elif d.source == Docket.COLUMBIA_AND_SCRAPER:
            d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER
    except Docket.DoesNotExist:
        d = Docket(
            source=Docket.RECAP,
            pacer_case_id=pq.pacer_case_id,
            court_id=pq.court_id
        )
    except Docket.MultipleObjectsReturned:
        # Ambiguous match: record the failure on the queue item and give up.
        msg = "Too many dockets found when trying to look up '%s'" % pq
        logger.error(msg)
        pq.error_message = msg
        pq.status = pq.PROCESSING_FAILED
        pq.save()
        return None

    update_docket_metadata(d, docket_data)
    d.save()

    # Docket entries
    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                }
            )
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq)
            )
            continue

        # Then make the RECAPDocument object. Try to find it. If we do, update
        # the pacer_doc_id field if it's blank. If we can't find it, create it
        # or throw an error.
        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            RECAPDocument.objects.create(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
                pacer_doc_id=docket_entry['pacer_doc_id'],
                is_available=False,
            )
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry number'%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq)
            )
            continue
        else:
            # docket_entry is a dict, so the ID has to be read by key; the
            # backfilled value must also be saved or the update is lost.
            if not rd.pacer_doc_id and docket_entry['pacer_doc_id']:
                rd.pacer_doc_id = docket_entry['pacer_doc_id']
                rd.save()

    # Parties
    for party in docket_data['parties']:
        try:
            p = Party.objects.get(name=party['name'])
        except Party.DoesNotExist:
            p = Party.objects.create(
                name=party['name'],
                extra_info=party['extra_info'],
            )
        except Party.MultipleObjectsReturned:
            # Can't tell which party is meant, so skip this one.
            continue
        else:
            if party['extra_info']:
                p.extra_info = party['extra_info']
                p.save()

        # If the party type doesn't exist, make a new one.
        if not p.party_types.filter(docket=d, name=party['type']).exists():
            PartyType.objects.create(docket=d, party=p, name=party['type'])

        # Attorneys
        for atty in party.get('attorneys', []):
            add_attorney(atty, p, d)

    pq.error_message = ''  # Clear out errors b/c successful
    pq.status = pq.PROCESSING_SUCCESSFUL
    pq.save()

    return d
 def setUp(self):
     """Create a PACER session, log in, and build the "cand" report."""
     session = self.session = get_pacer_session()
     session.login()
     self.report = DocketReport("cand", session)
     # 4:06-cv-07294 Foley v. Bates
     self.pacer_case_id = "186730"
class PacerDocketReportTest(unittest.TestCase):
    """Tests that run docket report queries and inspect what comes back."""

    def setUp(self):
        """Create a PACER session, log in, and build the "cand" report."""
        session = self.session = get_pacer_session()
        session.login()
        self.report = DocketReport("cand", session)
        # 4:06-cv-07294 Foley v. Bates
        self.pacer_case_id = "186730"

    @staticmethod
    def _count_rows(html):
        """Count the rows in the docket report.

        :param html: The HTML of the docket report.
        :return: The number of matched table rows minus one for the header.
        """
        rows = get_html_parsed_text(html).xpath("//table[./tr/td[3]]/tr")
        return len(rows) - 1

    def _query_row_count(self, **query_kwargs):
        """Query the default case with the given filters and count the rows.

        :param query_kwargs: Keyword arguments forwarded to the report query.
        :return: The row count of the response, as given by _count_rows.
        """
        self.report.query(self.pacer_case_id, **query_kwargs)
        return self._count_rows(self.report.response.text)

    @SKIP_IF_NO_PACER_LOGIN
    def test_queries(self):
        """Do a variety of queries work?"""
        report = self.report
        case_id = self.pacer_case_id
        nov_1 = date(2007, 11, 1)
        nov_28 = date(2007, 11, 28)

        # No filters at all.
        report.query(case_id)
        self.assertIn(
            "Foley v. Bates",
            report.response.text,
            msg="Super basic query failed",
        )

        # Start date only.
        found = self._query_row_count(date_start=nov_1)
        self.assertEqual(
            2,
            found,
            msg="Didn't get expected number of rows when filtering by "
            "start date. Got %s." % found,
        )

        # Start and end dates.
        found = self._query_row_count(date_start=nov_1, date_end=nov_28)
        self.assertEqual(
            1,
            found,
            msg="Didn't get expected number of rows when filtering by "
            "start and end dates. Got %s." % found,
        )

        # A single document number.
        found = self._query_row_count(doc_num_start=5, doc_num_end=5)
        self.assertEqual(
            1,
            found,
            msg="Didn't get expected number of rows when filtering by "
            "doc number. Got %s" % found,
        )

        # Date range applied with the "Entered" range type.
        found = self._query_row_count(
            date_start=nov_1,
            date_end=nov_28,
            date_range_type="Entered",
        )
        self.assertEqual(
            1,
            found,
            msg="Didn't get expected number of rows when filtering by "
            "start and end dates and date_range_type of Entered. "
            "Got %s" % found,
        )

        # Party information should appear only when it is asked for.
        report.query(
            case_id,
            doc_num_start=500,
            show_parties_and_counsel=True,
        )
        self.assertIn(
            "Cheema",
            report.response.text,
            msg="Didn't find party info when it was explicitly requested.",
        )
        report.query(
            case_id,
            doc_num_start=500,
            show_parties_and_counsel=False,
        )
        self.assertNotIn(
            "Cheema",
            report.response.text,
            msg="Got party info but it was not requested.",
        )

    @SKIP_IF_NO_PACER_LOGIN
    def test_using_same_report_twice(self):
        """Do the caches get properly nuked between runs?

        See issue #187.
        """
        report = self.report

        # Query the first one...
        report.query(self.pacer_case_id)
        first_data = report.data.copy()

        # Then the second one...
        # 1:07-cv-00035-RJA-HKS Anson v. USA
        report.query("63111")
        second_data = report.data.copy()

        self.assertNotEqual(
            first_data,
            second_data,
            msg="Got same values for docket data of two different queries. "
            "Is there a problem with the caches on the DocketReport?",
        )
Esempio n. 18
0
 def setUp(self):
     """Log in with the configured PACER credentials and build the report."""
     credentials = {'username': PACER_USERNAME, 'password': PACER_PASSWORD}
     session = PacerSession(**credentials)
     session.login()
     self.report = DocketReport('cand', session)
     # 4:06-cv-07294 Foley v. Bates
     self.pacer_case_id = '186730'
Esempio n. 19
0
 def setUpClass(cls):
     """Build a 'cand' docket report shared by every test in the class."""
     credentials = {'username': PACER_USERNAME, 'password': PACER_PASSWORD}
     session = PacerSession(**credentials)
     cls.report = DocketReport('cand', session)
     # 4:06-cv-07294 Foley v. Bates
     cls.pacer_case_id = '186730'
Esempio n. 20
0
def get_docket_by_pacer_case_id(self,
                                pacer_case_id,
                                court_id,
                                session,
                                tag=None,
                                **kwargs):
    """Query the PACER docket report for a case and merge the result into CL.

    Any extra keyword arguments are handed to DocketReport.query(); see that
    method for the accepted parameters.

    :param pacer_case_id: The internal case ID of the item in PACER.
    :param court_id: A courtlistener court ID.
    :param session: A valid PacerSession object.
    :param tag: The tag name that should be stored with the item in the DB.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    :return: A dict with the docket's primary key under 'docket_pk' and a
    boolean under 'needs_solr_update'.
    """
    report = DocketReport(map_cl_to_pacer_id(court_id), session)
    logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id))

    lookup = {'pacer_case_id': pacer_case_id, 'court_id': court_id}
    try:
        docket = Docket.objects.get(**lookup)
    except (Docket.DoesNotExist, Docket.MultipleObjectsReturned):
        # Zero or several matches; the lookup is retried after the query.
        docket = None

    if docket is not None:
        first_missing = get_first_missing_de_number(docket)
        if first_missing > 1:
            # We don't have to get the whole thing!
            kwargs.setdefault('doc_num_start', first_missing)

    report.query(pacer_case_id, **kwargs)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s" %
                (court_id, pacer_case_id))

    # Merge the contents into CL.
    if docket is None:
        docket, match_count = find_docket_object(
            court_id,
            pacer_case_id,
            docket_data['docket_number'],
        )
        if match_count > 1:
            docket = docket.earliest('date_created')

    add_recap_source(docket)
    update_docket_metadata(docket, docket_data)
    docket.save()

    # The tag name is swapped for its database object, which is then also
    # passed along when the docket entries are added.
    tag_obj = None
    if tag is not None:
        tag_obj, _ = Tag.objects.get_or_create(name=tag)
        docket.tags.add(tag_obj)

    # Add the HTML to the docket in case we need it someday.
    html_record = PacerHtmlFiles(content_object=docket, upload_type=DOCKET)
    html_content = ContentFile(report.response.text)
    # We only care about the ext w/UUIDFileSystemStorage
    html_record.filepath.save('docket.html', html_content)

    rds_created, needs_solr_update = add_docket_entries(
        docket, docket_data['docket_entries'], tag=tag_obj)
    add_parties_and_attorneys(docket, docket_data['parties'])
    process_orphan_documents(rds_created, docket.court_id, docket.date_filed)
    logger.info("Created/updated docket: %s" % docket)

    solr_flag = bool(rds_created or needs_solr_update)
    return {
        'docket_pk': docket.pk,
        'needs_solr_update': solr_flag,
    }
Esempio n. 21
0
def get_docket_by_pacer_case_id(self, pacer_case_id, court_id, session,
                                tag=None, **kwargs):
    """Get a docket by PACER case id, CL court ID, and a collection of kwargs
    that can be passed to the DocketReport query.

    For details of acceptable parameters, see DocketReport.query()

    :param pacer_case_id: The internal case ID of the item in PACER.
    :param court_id: A courtlistener court ID.
    :param session: A valid PacerSession object.
    :param tag: The tag name that should be stored with the item in the DB.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    :return: The Docket that was created or updated, or None when the lookup
    after the query matched more than one docket.
    """
    report = DocketReport(map_cl_to_pacer_id(court_id), session)
    logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id))
    try:
        d = Docket.objects.get(
            pacer_case_id=pacer_case_id,
            court_id=court_id,
        )
    except (Docket.DoesNotExist, Docket.MultipleObjectsReturned):
        # Zero or several matches; a broader lookup is done after the query.
        d = None

    if d is not None:
        first_missing_id = get_first_missing_de_number(d)
        if first_missing_id > 1:
            # We don't have to get the whole thing!
            kwargs.setdefault('doc_num_start', first_missing_id)

    report.query(pacer_case_id, **kwargs)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s" % (court_id,
                                                             pacer_case_id))

    # Merge the contents into CL.
    try:
        if d is None:
            d = Docket.objects.get(
                Q(pacer_case_id=pacer_case_id) |
                Q(docket_number=docket_data['docket_number']),
                court_id=court_id,
            )
        # Add RECAP as a source if it's not already.
        if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
            d.source = Docket.RECAP_AND_SCRAPER
        elif d.source == Docket.COLUMBIA:
            d.source = Docket.COLUMBIA_AND_RECAP
        elif d.source == Docket.COLUMBIA_AND_SCRAPER:
            d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER
    except Docket.DoesNotExist:
        d = Docket(
            source=Docket.RECAP,
            pacer_case_id=pacer_case_id,
            court_id=court_id
        )
    except Docket.MultipleObjectsReturned:
        logger.error("Too many dockets returned when trying to look up '%s.%s'" %
                     (court_id, pacer_case_id))
        return None

    update_docket_metadata(d, docket_data)
    d.save()
    if tag is not None:
        # From here on, tag is the database object rather than its name.
        tag, _ = Tag.objects.get_or_create(name=tag)
        d.tags.add(tag)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                }
            )
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s.%s'" % (docket_entry['document_number'],
                                              court_id, pacer_case_id)
            )
            continue
        else:
            if tag is not None:
                de.tags.add(tag)

        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            try:
                rd = RECAPDocument.objects.create(
                    docket_entry=de,
                    # No attachments when uploading dockets.
                    document_type=RECAPDocument.PACER_DOCUMENT,
                    document_number=docket_entry['document_number'],
                    pacer_doc_id=docket_entry['pacer_doc_id'],
                    is_available=False,
                )
            except IntegrityError:
                # Race condition. The item was created after our get failed.
                rd = RECAPDocument.objects.get(
                    docket_entry=de,
                    # No attachments when uploading dockets.
                    document_type=RECAPDocument.PACER_DOCUMENT,
                    document_number=docket_entry['document_number'],
                )
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry "
                "number: '%s', docket: %s" % (docket_entry['document_number'], d)
            )
            continue

        # Backfill a blank pacer_doc_id and save it; assigning the attribute
        # without saving would drop the value.
        if not rd.pacer_doc_id and docket_entry['pacer_doc_id']:
            rd.pacer_doc_id = docket_entry['pacer_doc_id']
            try:
                rd.save()
            except IntegrityError:
                # Best effort: keep processing the remaining entries.
                logger.warning(
                    "Unable to save pacer_doc_id '%s' on the document for "
                    "entry number '%s', docket: %s" % (
                        docket_entry['pacer_doc_id'],
                        docket_entry['document_number'],
                        d,
                    )
                )
        if tag is not None:
            rd.tags.add(tag)

    add_parties_and_attorneys(d, docket_data['parties'])
    logger.info("Created/updated docket: %s" % d)

    return d
Esempio n. 22
0
def process_recap_docket(pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work on.
    :return: The docket that's created or updated. None is returned instead
    when the upload is not a valid docket or when a lookup matches more than
    one docket. In debug mode the docket is returned without being saved.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')
    report._parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if docket_data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, pq.INVALID_CONTENT)
        return None

    # Merge the contents of the docket into CL. Attempt several lookups of
    # decreasing specificity.
    d = None
    for kwargs in [{
            'pacer_case_id': pq.pacer_case_id,
            'docket_number': docket_data['docket_number']
    }, {
            'pacer_case_id': pq.pacer_case_id
    }, {
            'docket_number': docket_data['docket_number']
    }]:
        try:
            d = Docket.objects.get(court_id=pq.court_id, **kwargs)
            break
        except Docket.DoesNotExist:
            continue
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to look up '%s'" % pq
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            return None

    if d is None:
        # Couldn't find it. Make a new one.
        d = Docket(source=Docket.RECAP,
                   pacer_case_id=pq.pacer_case_id,
                   court_id=pq.court_id)

    # Add RECAP as a source if it's not already.
    if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
        d.source = Docket.RECAP_AND_SCRAPER
    elif d.source == Docket.COLUMBIA:
        d.source = Docket.COLUMBIA_AND_RECAP
    elif d.source == Docket.COLUMBIA_AND_SCRAPER:
        d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER

    update_docket_metadata(d, docket_data)

    if pq.debug:
        # Debug uploads stop here, before anything is saved.
        mark_pq_successful(pq, d_id=d.pk)
        return d

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    # Docket entries
    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                })
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s'" %
                (docket_entry['document_number'], pq))
            continue

        # Then make the RECAPDocument object. Try to find it. If we do, update
        # the pacer_doc_id field if it's blank. If we can't find it, create it
        # or throw an error.
        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            try:
                RECAPDocument.objects.create(
                    docket_entry=de,
                    # No attachments when uploading dockets.
                    document_type=RECAPDocument.PACER_DOCUMENT,
                    document_number=docket_entry['document_number'],
                    pacer_doc_id=docket_entry['pacer_doc_id'],
                    is_available=False,
                )
            except IntegrityError:
                logger.warning(
                    "Creating new document with pacer_doc_id of '%s' violates "
                    "unique constraint on pacer_doc_id field." %
                    docket_entry['pacer_doc_id'])
                continue
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry number'%s' "
                "while processing '%s'" %
                (docket_entry['document_number'], pq))
            continue
        else:
            # Backfill from this entry's own ID (the same value the create
            # branch uses), not from the queue item, and save it so the
            # update is not lost.
            if not rd.pacer_doc_id and docket_entry['pacer_doc_id']:
                rd.pacer_doc_id = docket_entry['pacer_doc_id']
                try:
                    rd.save()
                except IntegrityError:
                    # Best effort: keep processing the remaining entries.
                    logger.warning(
                        "Updating document with pacer_doc_id of '%s' "
                        "violates unique constraint on pacer_doc_id field." %
                        docket_entry['pacer_doc_id'])
                    continue

    add_parties_and_attorneys(d, docket_data['parties'])
    mark_pq_successful(pq, d_id=d.pk)
    return d
Esempio n. 23
0
    def run_parsers_on_path(
        self,
        path_root,
        required_fields=["date_filed", "case_name", "docket_number"],
    ):
        """Parse every saved HTML docket under a directory and check the result.

        Each "*.html" file is parsed with DocketReport, with no network
        query, and compared against the ".json" file of the same base name
        next to it. If that JSON file is missing, it is written from the
        parsed data and the comparison is skipped for that file.

        :param path_root: The directory to walk for "*.html" files.
        :param required_fields: Keys that must be truthy in the data of any
        docket that parses to something other than an empty dict.
        """
        html_paths = sorted(
            os.path.join(folder, name)
            for folder, _subdirs, names in os.walk(path_root)
            for name in fnmatch.filter(names, "*.html")
        )
        # Pad every path to the same width so the progress output lines up.
        column_width = max(len(html_path) for html_path in html_paths) + 2

        for index, html_path in enumerate(html_paths):
            padded = html_path.ljust(column_width)
            sys.stdout.write("%s. Doing %s" % (index, padded))
            started = time.time()

            folder, html_name = os.path.split(html_path)
            stem = html_name.split(".")[0]
            json_path = os.path.join(folder, "%s.json" % stem)
            # The court ID is the part of the file name before the first "_".
            court = stem.split("_")[0]

            report = DocketReport(court)
            with open(html_path, "rb") as html_file:
                raw_html = html_file.read()
                report._parse_text(raw_html.decode("utf-8"))
            data = report.data

            if data != {}:
                # If the docket is a valid docket, make sure some required
                # fields are populated.
                for field in required_fields:
                    self.assertTrue(
                        data[field],
                        msg="Unable to find truthy value for field %s" % field,
                    )

                self.assertEqual(data["court_id"], court)

                # Party-specific tests...
                for party in data["parties"]:
                    self.assertTrue(
                        party.get("name", False),
                        msg="Every party must have a name attribute. Did not "
                        "get a value for:\n\n%s" % party,
                    )
                    # Protect against effed up adversary proceedings cases that
                    # don't parse properly. See: cacb, 2:08-ap-01570-BB
                    self.assertNotIn("----", party["name"])

            if not os.path.isfile(json_path):
                banner = "*" * 50
                print(
                    "\n\n%s\nJSON FILE DID NOT EXIST. CREATING IT AT:"
                    "\n\n  %s\n\n"
                    "Please test the data in this file before assuming "
                    "everything worked.\n%s\n" % (banner, json_path, banner)
                )
                with open(json_path, "w") as json_file:
                    json.dump(data, json_file, indent=2, sort_keys=True)
                # Nothing to compare against yet, so move to the next file.
                continue

            with open(json_path) as json_file:
                expected = json.load(json_file)
            if expected != {}:
                # Compare docket entries and parties first, for easier
                # debugging, then compare whole objects to be sure.
                self.assertEqual(
                    expected["docket_entries"], data["docket_entries"]
                )
                self.assertEqual(expected["parties"], data["parties"])
            self.assertEqual(expected, data)

            duration = time.time() - started
            warn_or_crash_slow_parser(duration, max_duration=1)
            sys.stdout.write("✓ - %0.1fs\n" % duration)
Esempio n. 24
0
 def setUpClass(cls):
     """Log in for the 'psc' court and build the report the tests share."""
     court_id = 'psc'
     session = login(court_id, 'tr1234', 'Pass!234')
     cls.report = DocketReport(court_id, session)
     cls.pacer_case_id = '62866'
Esempio n. 25
0
class PacerDocketReportTest(unittest.TestCase):
    """Tests that run docket report queries and inspect what comes back."""

    def setUp(self):
        """Log in with the configured PACER credentials and build the report."""
        session = PacerSession(username=PACER_USERNAME,
                               password=PACER_PASSWORD)
        session.login()
        self.report = DocketReport('cand', session)
        # 4:06-cv-07294 Foley v. Bates
        self.pacer_case_id = '186730'

    @staticmethod
    def _count_rows(html):
        """Count the rows in the docket report.

        :param html: The HTML of the docket report.
        :return: The number of matched table rows minus one for the header.
        """
        rows = get_html_parsed_text(html).xpath('//table[./tr/td[3]]/tr')
        return len(rows) - 1

    @SKIP_IF_NO_PACER_LOGIN
    def test_queries(self):
        """Do a variety of queries work?"""
        self.report.query(self.pacer_case_id)
        self.assertIn('Foley v. Bates', self.report.response.text,
                      msg="Super basic query failed")

        # Each case is: expected row count, query filters, failure message.
        row_count_cases = (
            (2,
             {'date_start': date(2007, 11, 1)},
             "Didn't get expected number of rows when filtering by "
             "start date. Got %s."),
            (1,
             {'date_start': date(2007, 11, 1),
              'date_end': date(2007, 11, 28)},
             "Didn't get expected number of rows when filtering by "
             "start and end dates. Got %s."),
            (1,
             {'doc_num_start': 5, 'doc_num_end': 5},
             "Didn't get expected number of rows when filtering by "
             "doc number. Got %s"),
            (1,
             {'date_start': date(2007, 11, 1),
              'date_end': date(2007, 11, 28),
              'date_range_type': "Entered"},
             "Didn't get expected number of rows when filtering by "
             "start and end dates and date_range_type of Entered. "
             "Got %s"),
        )
        for expected, filters, template in row_count_cases:
            self.report.query(self.pacer_case_id, **filters)
            row_count = self._count_rows(self.report.response.text)
            self.assertEqual(expected, row_count, msg=template % row_count)

        # Party information should appear only when it is asked for.
        self.report.query(self.pacer_case_id, doc_num_start=500,
                          show_parties_and_counsel=True)
        self.assertIn(
            'Cheema',
            self.report.response.text,
            msg="Didn't find party info when it was explicitly requested.",
        )
        self.report.query(self.pacer_case_id, doc_num_start=500,
                          show_parties_and_counsel=False)
        self.assertNotIn(
            'Cheema',
            self.report.response.text,
            msg="Got party info but it was not requested.",
        )

    @SKIP_IF_NO_PACER_LOGIN
    def test_using_same_report_twice(self):
        """Do the caches get properly nuked between runs?

        See issue #187.
        """
        # Query the first one...
        self.report.query(self.pacer_case_id)
        first_data = self.report.data.copy()

        # Then the second one...
        # 1:07-cv-00035-RJA-HKS Anson v. USA
        self.report.query('63111')
        second_data = self.report.data.copy()

        self.assertNotEqual(
            first_data,
            second_data,
            msg="Got same values for docket data of two different queries. "
                "Is there a problem with the caches on the DocketReport?",
        )
Esempio n. 26
0
#!/usr/bin/env python
#
#  Takes an .html file on the command line, parses it using the PACER
#  Docket Report parser, and outputs json to stdout.

import jsondate as json
import sys

from juriscraper.pacer.http import PacerSession
from juriscraper.pacer import DocketReport

# Only parsing is done below; the session is just passed to the report.
pacer_session = PacerSession(username='******',
                             password='******')
report = DocketReport('psc', pacer_session)

# Every command-line argument is the path of an HTML file to parse.
for path in sys.argv[1:]:
    # Read bytes and decode explicitly: a text-mode read has no .decode()
    # on Python 3, while this form works on both Python 2 and 3.
    with open(path, 'rb') as f:
        report._parse_text(f.read().decode('utf-8'))
    data = report.data
    # A single-argument print call outputs the same text on Python 2 and 3.
    print(json.dumps(data, indent=2, sort_keys=True, separators=(',', ': ')))
Esempio n. 27
0
def process_recap_docket(self, pk):
    """Parse a docket uploaded through the RECAP API and merge it into CL.

    :param pk: The primary key of the processing queue item you want to work on.
    :returns: None when the upload is handed off as a docket history report
    or is not a valid docket. Otherwise a dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean built from the results of adding the docket
            // entries (implying Solr needs updating). Always False for
            // debug uploads.
            'needs_solr_update': True,
        }

    This value is a dict so that it can be ingested in a Celery chain.
    """
    queue_item = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(queue_item, '', queue_item.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" %
                (queue_item.debug, queue_item))

    report = DocketReport(map_cl_to_pacer_id(queue_item.court_id))
    html = queue_item.filepath_local.read().decode('utf-8')

    if 'History/Documents' in html:
        # Prior to 1.1.8, we did not separate docket history reports into their
        # own upload_type. Alas, we still have some old clients around, so we
        # need to handle those clients here.
        queue_item.upload_type = DOCKET_HISTORY_REPORT
        queue_item.save()
        process_recap_docket_history_report(pk)
        self.request.callbacks = None
        return None

    report._parse_text(html)
    parsed = report.data
    logger.info("Parsing completed of item %s" % queue_item)

    if parsed == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        mark_pq_status(queue_item, "Not a valid docket upload.",
                       queue_item.INVALID_CONTENT)
        self.request.callbacks = None
        return None

    # Merge the contents of the docket into CL.
    docket, match_count = find_docket_object(
        queue_item.court_id,
        queue_item.pacer_case_id,
        parsed['docket_number'],
    )
    if match_count > 1:
        logger.info("Found %s dockets during lookup. Choosing oldest." %
                    match_count)
        docket = docket.earliest('date_created')

    add_recap_source(docket)
    update_docket_metadata(docket, parsed)

    if queue_item.debug:
        # Debug uploads stop here, before the docket is saved.
        mark_pq_successful(queue_item, d_id=docket.pk)
        self.request.callbacks = None
        return {'docket_pk': docket.pk, 'needs_solr_update': False}

    docket.save()

    # Add the HTML to the docket in case we need it someday.
    html_record = PacerHtmlFiles(content_object=docket, upload_type=DOCKET)
    # We only care about the ext w/UUIDFileSystemStorage
    html_record.filepath.save('docket.html', ContentFile(html))

    rds_created, needs_solr_update = add_docket_entries(
        docket, parsed['docket_entries'])
    add_parties_and_attorneys(docket, parsed['parties'])
    process_orphan_documents(rds_created, queue_item.court_id,
                             docket.date_filed)
    mark_pq_successful(queue_item, d_id=docket.pk)

    solr_flag = bool(rds_created or needs_solr_update)
    return {'docket_pk': docket.pk, 'needs_solr_update': solr_flag}
Esempio n. 28
0
 def setUp(self):
     """Open and log in a PACER session, then build the 'cand' report."""
     session = PacerSession(
         username=PACER_USERNAME,
         password=PACER_PASSWORD,
     )
     session.login()
     self.report = DocketReport('cand', session)
     # 4:06-cv-07294 Foley v. Bates
     self.pacer_case_id = '186730'
Esempio n. 29
0
def process_recap_docket(self, pk):
    """Parse a docket uploaded through the RECAP API and merge it into CL.

    :param self: The task instance. Its retry state is consulted when the
    upload cannot be read, and its chain is cleared on early exits
    (presumably a bound Celery task -- confirm against the decorator).
    :param pk: The primary key of the processing queue item to work on.
    :returns: None when the upload could not be read and no retry is
    attempted, when the upload is handed off as a docket history report, or
    when it does not parse as a docket. Otherwise a dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // True when RECAP documents were created or the docket
            // entries reported updated content; always False in
            // debug mode.
            'content_updated': True,
        }

    The value is a dict so that it can be ingested in a Celery chain.

    """
    started = now()
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    docket_report = DocketReport(map_cl_to_pacer_id(pq.court_id))

    try:
        html = pq.filepath_local.read().decode("utf-8")
    except IOError as exc:
        error = "Internal processing error (%s: %s)." % (
            exc.errno,
            exc.strerror,
        )
        if self.request.retries != self.max_retries and not pq.debug:
            # Not at the retry limit and not a debug run: try again later.
            mark_pq_status(pq, error, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)
        mark_pq_status(pq, error, PROCESSING_STATUS.FAILED)
        return None

    if "History/Documents" in html:
        # Clients older than 1.1.8 did not upload docket history reports
        # under their own upload_type, so re-label the item here and hand it
        # to the history report processor instead.
        pq.upload_type = UPLOAD_TYPE.DOCKET_HISTORY_REPORT
        pq.save()
        process_recap_docket_history_report(pk)
        self.request.chain = None
        return None

    docket_report._parse_text(html)
    parsed = docket_report.data
    logger.info("Parsing completed of item %s" % pq)

    if parsed == {}:
        # Some sort of invalid document rather than a docket (see
        # Juriscraper).
        mark_pq_status(
            pq,
            "Not a valid docket upload.",
            PROCESSING_STATUS.INVALID_CONTENT,
        )
        self.request.chain = None
        return None

    # Merge the contents of the docket into CL.
    docket, docket_count = find_docket_object(
        pq.court_id, pq.pacer_case_id, parsed["docket_number"]
    )
    if docket_count > 1:
        logger.info(
            "Found %s dockets during lookup. Choosing oldest." % docket_count
        )
        docket = docket.earliest("date_created")

    docket.add_recap_source()
    update_docket_metadata(docket, parsed)
    if not docket.pacer_case_id:
        docket.pacer_case_id = pq.pacer_case_id

    if pq.debug:
        # Debug runs return before docket.save() is called.
        mark_pq_successful(pq, d_id=docket.pk)
        self.request.chain = None
        return {"docket_pk": docket.pk, "content_updated": False}

    docket.save()

    # Keep the uploaded HTML with the docket in case we need it someday.
    html_record = PacerHtmlFiles(
        content_object=docket, upload_type=UPLOAD_TYPE.DOCKET
    )
    # We only care about the extension w/UUIDFileSystemStorage.
    html_record.filepath.save("docket.html", ContentFile(html))

    rds_created, content_updated = add_docket_entries(
        docket, parsed["docket_entries"]
    )
    add_parties_and_attorneys(docket, parsed["parties"])
    process_orphan_documents(rds_created, pq.court_id, docket.date_filed)
    # Alerts are only considered when the lookup found at least one docket,
    # and only sent when the alert was newly enqueued.
    if (
        content_updated
        and docket_count > 0
        and enqueue_docket_alert(docket.pk)
    ):
        send_docket_alert(docket.pk, started)
    mark_pq_successful(pq, d_id=docket.pk)
    return {
        "docket_pk": docket.pk,
        "content_updated": bool(rds_created or content_updated),
    }
Esempio n. 30
0
def process_recap_docket(pk):
    """Process an uploaded docket and merge its contents into CL.

    The upload is parsed with a DocketReport, then the docket, its docket
    entries, their RECAP documents, its parties, party types and attorneys
    are looked up and created or updated.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :returns: The Docket that was created or updated, or None if more than
    one docket matched the lookup (the queue item is then marked as failed).
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    pq.status = pq.PROCESSING_IN_PROGRESS
    pq.save()
    logger.info("Processing RECAP item: %s", pq)

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')
    report.parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed of item %s", pq)

    # Merge the contents of the docket into CL
    try:
        d = Docket.objects.get(
            Q(pacer_case_id=pq.pacer_case_id)
            | Q(docket_number=docket_data['docket_number']),
            court_id=pq.court_id,
        )
        # Add RECAP as a source if it's not already.
        if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
            d.source = Docket.RECAP_AND_SCRAPER
        elif d.source == Docket.COLUMBIA:
            d.source = Docket.COLUMBIA_AND_RECAP
        elif d.source == Docket.COLUMBIA_AND_SCRAPER:
            d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER
    except Docket.DoesNotExist:
        d = Docket(source=Docket.RECAP,
                   pacer_case_id=pq.pacer_case_id,
                   court_id=pq.court_id)
    except Docket.MultipleObjectsReturned:
        msg = "Too many dockets found when trying to look up '%s'" % pq
        logger.error(msg)
        pq.error_message = msg
        pq.status = pq.PROCESSING_FAILED
        pq.save()
        return None

    update_docket_metadata(d, docket_data)
    d.save()

    # Docket entries
    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                })
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s'",
                docket_entry['document_number'], pq)
            continue

        # Then make the RECAPDocument object. Try to find it. If we do, update
        # the pacer_doc_id field if it's blank. If we can't find it, create it
        # or throw an error.
        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            RECAPDocument.objects.create(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
                pacer_doc_id=docket_entry['pacer_doc_id'],
                is_available=False,
            )
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry number'%s' "
                "while processing '%s'",
                docket_entry['document_number'], pq)
            continue
        else:
            # docket_entry is a dict, so the ID must be read by key. Only a
            # blank pacer_doc_id is filled in, and the change is saved so
            # that it is not lost.
            new_pacer_doc_id = docket_entry['pacer_doc_id']
            if not rd.pacer_doc_id and new_pacer_doc_id:
                rd.pacer_doc_id = new_pacer_doc_id
                rd.save()

    # Parties
    for party in docket_data['parties']:
        try:
            p = Party.objects.get(name=party['name'])
        except Party.DoesNotExist:
            p = Party.objects.create(
                name=party['name'],
                extra_info=party['extra_info'],
            )
        except Party.MultipleObjectsReturned:
            # Still skip ambiguous parties, but leave a trace like the other
            # duplicate-handling branches do.
            logger.error(
                "Multiple parties found with name '%s' while processing '%s'",
                party['name'], pq)
            continue
        else:
            if party['extra_info']:
                p.extra_info = party['extra_info']
                p.save()

        # If the party type doesn't exist, make a new one.
        if not p.party_types.filter(docket=d, name=party['type']).exists():
            PartyType.objects.create(docket=d, party=p, name=party['type'])

        # Attorneys
        for atty in party.get('attorneys', []):
            add_attorney(atty, p, d)

    pq.error_message = ''  # Clear out errors b/c successful
    pq.status = pq.PROCESSING_SUCCESSFUL
    pq.save()

    return d