Example #1
def fetch_docket_by_pacer_case_id(
    session, court_id, pacer_case_id, fq,
):
    """Download the docket from PACER and merge it into CL

    :param session: A PacerSession object to work with
    :param court_id: The CL ID of the court
    :param pacer_case_id: The pacer_case_id of the docket, if known
    :param fq: The PacerFetchQueue object
    :return: a dict with information about the docket and the new data
    """
    report = DocketReport(map_cl_to_pacer_id(court_id), session)
    report.query(pacer_case_id, **get_fq_docket_kwargs(fq))

    docket_data = report.data
    if not docket_data:
        raise ParsingException("No data found in docket report.")
    if fq.docket_id:
        d = Docket.objects.get(pk=fq.docket_id)
    else:
        d, count = find_docket_object(
            court_id, pacer_case_id, docket_data["docket_number"]
        )
        if count > 1:
            d = d.earliest("date_created")
    rds_created, content_updated = merge_pacer_docket_into_cl_docket(
        d, pacer_case_id, docket_data, report, appellate=False,
    )
    return {
        "docket_pk": d.pk,
        "content_updated": bool(rds_created or content_updated),
    }
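
A minimal invocation sketch, assuming a logged-in PacerSession and an existing PacerFetchQueue row; the credential constants and the primary key here are placeholders, not part of the example above:

# Hedged usage sketch; the credentials and the fetch-queue primary key
# are assumptions, not part of the example above.
session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
session.login()
fq = PacerFetchQueue.objects.get(pk=1)  # hypothetical existing queue row
result = fetch_docket_by_pacer_case_id(session, "cand", "186730", fq)
print(result["docket_pk"], result["content_updated"])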
Example #2
    def get_docket_json(self):
        """Download docket to disk from Pacer

        :return: None
        """
        q = Query()
        db = TinyDB("db/master.json")
        fjc_table = db.table("fjc")
        for row in fjc_table.search(~(q.PACER_CASE_ID == "") & (q.JSON == "False")):
            rep = DocketReport(row["COURT"], self.s)
            rep.query(
                row["PACER_CASE_ID"],
                show_parties_and_counsel=True,
                show_terminated_parties=True,
                show_list_of_member_cases=True,
                include_pdf_headers=True,
                show_multiple_docs=False,
            )
            with open(
                "downloads/json/pacer_docket_%s.json" % row["PACER_CASE_ID"], "w"
            ) as write_file:
                json.dump(rep.data, write_file, indent=4, sort_keys=True, default=str)
            with open(
                "downloads/html/pacer_docket_%s.html" % row["PACER_CASE_ID"], "w"
            ) as file:
                file.write(rep.response.text)

            fjc_table.update(
                {
                    "JSON": "True",
                    "pacer_doc_id": rep.data["docket_entries"][0]["pacer_doc_id"],
                },
                doc_ids=[row.doc_id],
            )

        logging.info("Finished collecting JSON and HTML")
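
For context, a short sketch of how the fjc table consumed above might be seeded; the row values are illustrative assumptions:

# Hypothetical seeding sketch for the TinyDB table read above; the
# COURT and PACER_CASE_ID values are illustrative assumptions.
from tinydb import TinyDB

db = TinyDB("db/master.json")
fjc_table = db.table("fjc")
fjc_table.insert({"COURT": "cand", "PACER_CASE_ID": "186730", "JSON": "False"})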
Example #3
def get_docket_by_pacer_case_id(self,
                                pacer_case_id,
                                court_id,
                                session,
                                tag=None,
                                **kwargs):
    """Get a docket by PACER case id, CL court ID, and a collection of kwargs
    that can be passed to the DocketReport query.

    For details of acceptable parameters, see DocketReport.query()

    :param pacer_case_id: The internal case ID of the item in PACER.
    :param court_id: A courtlistener court ID.
    :param session: A valid PacerSession object.
    :param tag: The tag name that should be stored with the item in the DB.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    """
    report = DocketReport(map_cl_to_pacer_id(court_id), session)
    logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id))
    try:
        d = Docket.objects.get(
            pacer_case_id=pacer_case_id,
            court_id=court_id,
        )
    except Docket.DoesNotExist:
        d = None
    except Docket.MultipleObjectsReturned:
        d = None

    if d is not None:
        first_missing_id = get_first_missing_de_number(d)
        if first_missing_id > 1:
            # We don't have to get the whole thing!
            kwargs.setdefault('doc_num_start', first_missing_id)

    report.query(pacer_case_id, **kwargs)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s" %
                (court_id, pacer_case_id))

    # Merge the contents into CL.
    if d is None:
        d, count = find_docket_object(court_id, pacer_case_id,
                                      docket_data['docket_number'])
        if count > 1:
            d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, docket_data)
    d.save()
    if tag is not None:
        tag, _ = Tag.objects.get_or_create(name=tag)
        d.tags.add(tag)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d, upload_type=DOCKET)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    rds_created, needs_solr_update = add_docket_entries(
        d, docket_data['docket_entries'], tag=tag)
    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, d.court_id, d.date_filed)
    logger.info("Created/updated docket: %s" % d)
    return {
        'docket_pk': d.pk,
        'needs_solr_update': bool(rds_created or needs_solr_update),
    }
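
Assuming the function is registered as a bound Celery task (not shown above), a queued call might look like the sketch below; the tag name is a placeholder, and whether a live session object survives serialization onto the queue depends on task setup that is assumed here:

# Hedged invocation sketch; task registration and session handling
# are assumptions, not part of the example above.
get_docket_by_pacer_case_id.delay("186730", "cand", session, tag="import-2018")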
Example #4
def get_docket_by_pacer_case_id(self,
                                data,
                                court_id,
                                cookies,
                                tag_names=None,
                                **kwargs):
    """Get a docket by PACER case id, CL court ID, and a collection of kwargs
    that can be passed to the DocketReport query.

    For details of acceptable parameters, see DocketReport.query()

    :param data: A dict containing:
        Required: 'pacer_case_id': The internal case ID of the item in PACER.
        Optional: 'docket_pk': The ID of the docket to work on to avoid lookups
                  if it's known in advance.
    :param court_id: A courtlistener court ID.
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a
    logged-in PACER user.
    :param tag_names: A list of tag names that should be stored with the item
    in the DB.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    :return: A dict indicating if we need to update Solr.
    """
    s = PacerSession(cookies=cookies)
    if data is None:
        logger.info("Empty data argument. Terminating " "chains and exiting.")
        self.request.callbacks = None
        return

    pacer_case_id = data.get('pacer_case_id')
    report = DocketReport(map_cl_to_pacer_id(court_id), s)
    logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id))
    if data.get('docket_pk') is not None:
        d = Docket.objects.get(pk=data['docket_pk'])
    else:
        try:
            d = Docket.objects.get(
                pacer_case_id=pacer_case_id,
                court_id=court_id,
            )
        except Docket.DoesNotExist:
            d = None
        except Docket.MultipleObjectsReturned:
            d = None

    if d is not None:
        first_missing_id = get_first_missing_de_number(d)
        if first_missing_id > 1:
            # We don't have to get the whole thing!
            kwargs.setdefault('doc_num_start', first_missing_id)

    report.query(pacer_case_id, **kwargs)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s" %
                (court_id, pacer_case_id))

    if not docket_data:
        logger.info("No valid docket data for %s.%s", court_id, pacer_case_id)
        self.request.callbacks = None
        return

    # Merge the contents into CL.
    if d is None:
        d, count = find_docket_object(court_id, pacer_case_id,
                                      docket_data['docket_number'])
        if count > 1:
            d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, docket_data)
    d.save()
    tags = []
    if tag_names is not None:
        for tag_name in tag_names:
            tag, _ = Tag.objects.get_or_create(name=tag_name)
            tag.tag_object(d)
            tags.append(tag)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d,
                                upload_type=UPLOAD_TYPE.DOCKET)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    rds_created, content_updated = add_docket_entries(
        d, docket_data['docket_entries'], tags=tags)
    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, d.court_id, d.date_filed)
    logger.info("Created/updated docket: %s" % d)
    return {
        'docket_pk': d.pk,
        'content_updated': bool(rds_created or content_updated),
    }
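
Because the data dict arrives from an upstream lookup, this task is naturally run in a chain; a sketch under that assumption (the upstream task name and its return shape are hypothetical):

# Hypothetical Celery chain; get_pacer_case_id_and_title is assumed to
# return a dict containing 'pacer_case_id', which becomes the data arg.
from celery import chain

chain(
    get_pacer_case_id_and_title.s(docket_number, court_id, cookies),
    get_docket_by_pacer_case_id.s(court_id, cookies, tag_names=["my-tag"]),
).apply_async()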
Example #5
def get_docket_by_pacer_case_id(self, pacer_case_id, court_id, session,
                                tag=None, **kwargs):
    """Get a docket by PACER case id, CL court ID, and a collection of kwargs
    that can be passed to the DocketReport query.

    For details of acceptable parameters, see DocketReport.query()

    :param pacer_case_id: The internal case ID of the item in PACER.
    :param court_id: A courtlistener court ID.
    :param session: A valid PacerSession object.
    :param tag: The tag name that should be stored with the item in the DB.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    """
    report = DocketReport(map_cl_to_pacer_id(court_id), session)
    logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id))
    try:
        d = Docket.objects.get(
            pacer_case_id=pacer_case_id,
            court_id=court_id,
        )
    except Docket.DoesNotExist:
        d = None
    except Docket.MultipleObjectsReturned:
        d = None

    if d is not None:
        first_missing_id = get_first_missing_de_number(d)
        if first_missing_id > 1:
            # We don't have to get the whole thing!
            kwargs.setdefault('doc_num_start', first_missing_id)

    report.query(pacer_case_id, **kwargs)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s" % (court_id,
                                                             pacer_case_id))

    # Merge the contents into CL.
    try:
        if d is None:
            d = Docket.objects.get(
                Q(pacer_case_id=pacer_case_id) |
                Q(docket_number=docket_data['docket_number']),
                court_id=court_id,
            )
        # Add RECAP as a source if it's not already.
        if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
            d.source = Docket.RECAP_AND_SCRAPER
        elif d.source == Docket.COLUMBIA:
            d.source = Docket.COLUMBIA_AND_RECAP
        elif d.source == Docket.COLUMBIA_AND_SCRAPER:
            d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER
    except Docket.DoesNotExist:
        d = Docket(
            source=Docket.RECAP,
            pacer_case_id=pacer_case_id,
            court_id=court_id
        )
    except Docket.MultipleObjectsReturned:
        logger.error("Too many dockets returned when trying to look up '%s.%s'" %
                     (court_id, pacer_case_id))
        return None

    update_docket_metadata(d, docket_data)
    d.save()
    if tag is not None:
        tag, _ = Tag.objects.get_or_create(name=tag)
        d.tags.add(tag)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                }
            )
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s.%s'" % (docket_entry['document_number'],
                                              court_id, pacer_case_id)
            )
            continue
        else:
            if tag is not None:
                de.tags.add(tag)

        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            try:
                rd = RECAPDocument.objects.create(
                    docket_entry=de,
                    # No attachments when uploading dockets.
                    document_type=RECAPDocument.PACER_DOCUMENT,
                    document_number=docket_entry['document_number'],
                    pacer_doc_id=docket_entry['pacer_doc_id'],
                    is_available=False,
                )
            except IntegrityError:
                # Race condition. The item was created after our get failed.
                rd = RECAPDocument.objects.get(
                    docket_entry=de,
                    # No attachments when uploading dockets.
                    document_type=RECAPDocument.PACER_DOCUMENT,
                    document_number=docket_entry['document_number'],
                )
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry "
                "number: '%s', docket: %s" % (docket_entry['document_number'], d)
            )
            continue

        rd.pacer_doc_id = rd.pacer_doc_id or docket_entry['pacer_doc_id']
        rd.save()  # Persist the pacer_doc_id assigned above.
        if tag is not None:
            rd.tags.add(tag)

    add_parties_and_attorneys(d, docket_data['parties'])
    logger.info("Created/updated docket: %s" % d)

    return d
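
The get/create/IntegrityError dance above is the usual race-safe alternative to get_or_create when the create call carries extra fields beyond the lookup; a condensed sketch of the same pattern with hypothetical names:

# Generic race-safe get-or-create pattern, as used above; Model and
# lookup are hypothetical stand-ins.
try:
    obj = Model.objects.get(**lookup)
except Model.DoesNotExist:
    try:
        obj = Model.objects.create(**lookup)
    except IntegrityError:
        # Another worker created the row between get() and create().
        obj = Model.objects.get(**lookup)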
Example #6
class PacerDocketReportTest(unittest.TestCase):
    """A variety of tests for the docket report"""

    def setUp(self):
        pacer_session = PacerSession(username=PACER_USERNAME,
                                     password=PACER_PASSWORD)
        pacer_session.login()
        self.report = DocketReport('cand', pacer_session)
        self.pacer_case_id = '186730'  # 4:06-cv-07294 Foley v. Bates

    @staticmethod
    def _count_rows(html):
        """Count the rows in the docket report.

        :param html: The HTML of the docket report.
        :return: The count of the number of rows.
        """
        tree = get_html_parsed_text(html)
        return len(tree.xpath('//table[./tr/td[3]]/tr')) - 1  # No header row

    @SKIP_IF_NO_PACER_LOGIN
    def test_queries(self):
        """Do a variety of queries work?"""
        self.report.query(self.pacer_case_id)
        self.assertIn('Foley v. Bates', self.report.response.text,
                      msg="Super basic query failed")

        self.report.query(self.pacer_case_id, date_start=date(2007, 11, 1))
        row_count = self._count_rows(self.report.response.text)
        self.assertEqual(2, row_count, msg="Didn't get expected number of "
                                            "rows when filtering by start "
                                            "date. Got %s." % row_count)

        self.report.query(self.pacer_case_id, date_start=date(2007, 11, 1),
                          date_end=date(2007, 11, 28))
        row_count = self._count_rows(self.report.response.text)
        self.assertEqual(1, row_count, msg="Didn't get expected number of "
                                           "rows when filtering by start and "
                                           "end dates. Got %s." % row_count)

        self.report.query(self.pacer_case_id, doc_num_start=5,
                          doc_num_end=5)
        row_count = self._count_rows(self.report.response.text)
        self.assertEqual(1, row_count, msg="Didn't get expected number of rows "
                                           "when filtering by doc number. Got "
                                           "%s" % row_count)

        self.report.query(self.pacer_case_id, date_start=date(2007, 11, 1),
                          date_end=date(2007, 11, 28),
                          date_range_type="Entered")
        row_count = self._count_rows(self.report.response.text)
        self.assertEqual(1, row_count, msg="Didn't get expected number of rows "
                                           "when filtering by start and end "
                                           "dates and date_range_type of "
                                           "Entered. Got %s" % row_count)

        self.report.query(self.pacer_case_id, doc_num_start=500,
                          show_parties_and_counsel=True)
        self.assertIn('Cheema', self.report.response.text,
                      msg="Didn't find party info when it was explicitly "
                          "requested.")
        self.report.query(self.pacer_case_id, doc_num_start=500,
                          show_parties_and_counsel=False)
        self.assertNotIn('Cheema', self.report.response.text,
                         msg="Got party info but it was not requested.")

    @SKIP_IF_NO_PACER_LOGIN
    def test_using_same_report_twice(self):
        """Do the caches get properly nuked between runs?

        See issue #187.
        """
        # Query the first one...
        self.report.query(self.pacer_case_id)
        d = self.report.data.copy()

        # Then the second one...
        second_pacer_case_id = '63111'  # 1:07-cv-00035-RJA-HKS Anson v. USA
        self.report.query(second_pacer_case_id)
        d2 = self.report.data.copy()
        self.assertNotEqual(
            d,
            d2,
            msg="Got same values for docket data of two different queries. "
                "Is there a problem with the caches on the DocketReport?"
        )
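
These tests hit the live PACER site and are skipped without credentials; a sketch of running just this case programmatically, assuming the module's imports resolve:

# Hedged test-runner sketch using only the standard unittest API.
import unittest

suite = unittest.defaultTestLoader.loadTestsFromTestCase(PacerDocketReportTest)
unittest.TextTestRunner(verbosity=2).run(suite)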