コード例 #1
0
def reprocess_docket_data(d, filepath, report_type):
    """Reprocess docket data that we already have.

    :param d: A docket object to work on.
    :param filepath: The path to a saved HTML file containing docket or docket
    history report data.
    :param report_type: Whether it's a docket or a docket history report.
    """
    from cl.recap.tasks import update_docket_metadata, add_docket_entries, \
        add_parties_and_attorneys
    if report_type == DOCKET:
        report = DocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(map_cl_to_pacer_id(d.court_id))
    with open(filepath, 'r') as f:
        text = f.read().decode('utf-8')
    report._parse_text(text)
    data = report.data
    if data == {}:
        return None
    update_docket_metadata(d, data)
    d.save()
    add_docket_entries(d, data['docket_entries'])
    if report_type == DOCKET:
        add_parties_and_attorneys(d, data['parties'])
    return d.pk
コード例 #2
0
def process_docket_data(d, filepath, report_type):
    """Process docket data file.

    :param d: A docket object to work on.
    :param filepath: The path to a saved HTML file containing docket or docket
    history report data.
    :param report_type: Whether it's a docket or a docket history report.
    """
    from cl.recap.mergers import (
        add_docket_entries,
        add_parties_and_attorneys,
        update_docket_appellate_metadata,
        update_docket_metadata,
        add_bankruptcy_data_to_docket,
        add_claims_to_docket,
    )

    court_id = map_cl_to_pacer_id(d.court_id)
    if report_type == UPLOAD_TYPE.DOCKET:
        report = DocketReport(court_id)
    elif report_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(court_id)
    elif report_type == UPLOAD_TYPE.APPELLATE_DOCKET:
        report = AppellateDocketReport(court_id)
    elif report_type == UPLOAD_TYPE.IA_XML_FILE:
        report = InternetArchive(court_id)
    elif report_type == UPLOAD_TYPE.CASE_REPORT_PAGE:
        report = CaseQuery(court_id)
    elif report_type == UPLOAD_TYPE.CLAIMS_REGISTER:
        report = ClaimsRegister(court_id)
    else:
        raise NotImplementedError("The report type with id '%s' is not yet "
                                  "supported. Perhaps you need to add it?" %
                                  report_type)
    with open(filepath, "r") as f:
        text = f.read().decode("utf-8")
    report._parse_text(text)
    data = report.data
    if data == {}:
        return None

    if report_type == UPLOAD_TYPE.CLAIMS_REGISTER:
        add_bankruptcy_data_to_docket(d, data)
        add_claims_to_docket(d, data["claims"])
    else:
        update_docket_metadata(d, data)
        d, og_info = update_docket_appellate_metadata(d, data)
        if og_info is not None:
            og_info.save()
            d.originating_court_information = og_info
        d.save()
        if data.get("docket_entries"):
            add_docket_entries(d, data["docket_entries"])
    if report_type in (
            UPLOAD_TYPE.DOCKET,
            UPLOAD_TYPE.APPELLATE_DOCKET,
            UPLOAD_TYPE.IA_XML_FILE,
    ):
        add_parties_and_attorneys(d, data["parties"])
    return d.pk
コード例 #3
0
ファイル: pacer.py プロジェクト: saizai/courtlistener
def process_docket_data(d, filepath, report_type):
    """Process docket data file.

    :param d: A docket object to work on.
    :param filepath: The path to a saved HTML file containing docket or docket
    history report data.
    :param report_type: Whether it's a docket or a docket history report.
    """
    from cl.recap.tasks import update_docket_metadata, add_docket_entries, \
        add_parties_and_attorneys, update_docket_appellate_metadata
    if report_type == UPLOAD_TYPE.DOCKET:
        report = DocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.APPELLATE_DOCKET:
        report = AppellateDocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.IA_XML_FILE:
        report = InternetArchive()
    with open(filepath, 'r') as f:
        text = f.read().decode('utf-8')
    report._parse_text(text)
    data = report.data
    if data == {}:
        return None
    update_docket_metadata(d, data)
    d, og_info = update_docket_appellate_metadata(d, data)
    if og_info is not None:
        og_info.save()
        d.originating_court_information = og_info
    d.save()
    add_docket_entries(d, data['docket_entries'])
    if report_type in (UPLOAD_TYPE.DOCKET, UPLOAD_TYPE.APPELLATE_DOCKET,
                       UPLOAD_TYPE.IA_XML_FILE):
        add_parties_and_attorneys(d, data['parties'])
    return d.pk
コード例 #4
0
ファイル: pacer.py プロジェクト: freelawproject/courtlistener
def process_docket_data(d, filepath, report_type):
    """Process docket data file.

    :param d: A docket object to work on.
    :param filepath: The path to a saved HTML file containing docket or docket
    history report data.
    :param report_type: Whether it's a docket or a docket history report.
    """
    from cl.recap.tasks import update_docket_metadata, add_docket_entries, \
        add_parties_and_attorneys, update_docket_appellate_metadata
    if report_type == UPLOAD_TYPE.DOCKET:
        report = DocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.APPELLATE_DOCKET:
        report = AppellateDocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.IA_XML_FILE:
        report = InternetArchive()
    elif report_type == UPLOAD_TYPE.CASE_REPORT_PAGE:
        report = CaseQuery(map_cl_to_pacer_id(d.court_id))
    with open(filepath, 'r') as f:
        text = f.read().decode('utf-8')
    report._parse_text(text)
    data = report.data
    if data == {}:
        return None
    update_docket_metadata(d, data)
    d, og_info = update_docket_appellate_metadata(d, data)
    if og_info is not None:
        og_info.save()
        d.originating_court_information = og_info
    d.save()
    if data.get('docket_entries'):
        add_docket_entries(d, data['docket_entries'])
    if report_type in (UPLOAD_TYPE.DOCKET, UPLOAD_TYPE.APPELLATE_DOCKET,
                       UPLOAD_TYPE.IA_XML_FILE):
        add_parties_and_attorneys(d, data['parties'])
    return d.pk
コード例 #5
0
def process_recap_docket(self, pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :returns: A dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or
            // recap document was created (implying a Solr needs
            // updating).
            'content_updated': True,
        }

    This value is a dict so that it can be ingested in a Celery chain.

    """
    start_time = now()
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))

    try:
        text = pq.filepath_local.read().decode("utf-8")
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (exc.errno, exc.strerror)
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    if "History/Documents" in text:
        # Prior to 1.1.8, we did not separate docket history reports into their
        # own upload_type. Alas, we still have some old clients around, so we
        # need to handle those clients here.
        pq.upload_type = UPLOAD_TYPE.DOCKET_HISTORY_REPORT
        pq.save()
        process_recap_docket_history_report(pk)
        self.request.chain = None
        return None

    report._parse_text(text)
    data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return None

    # Merge the contents of the docket into CL.
    d, docket_count = find_docket_object(
        pq.court_id, pq.pacer_case_id, data["docket_number"]
    )
    if docket_count > 1:
        logger.info(
            "Found %s dockets during lookup. Choosing oldest." % docket_count
        )
        d = d.earliest("date_created")

    d.add_recap_source()
    update_docket_metadata(d, data)
    if not d.pacer_case_id:
        d.pacer_case_id = pq.pacer_case_id

    if pq.debug:
        mark_pq_successful(pq, d_id=d.pk)
        self.request.chain = None
        return {"docket_pk": d.pk, "content_updated": False}

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(
        content_object=d, upload_type=UPLOAD_TYPE.DOCKET
    )
    pacer_file.filepath.save(
        "docket.html",  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    rds_created, content_updated = add_docket_entries(
        d, data["docket_entries"]
    )
    add_parties_and_attorneys(d, data["parties"])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    if content_updated and docket_count > 0:
        newly_enqueued = enqueue_docket_alert(d.pk)
        if newly_enqueued:
            send_docket_alert(d.pk, start_time)
    mark_pq_successful(pq, d_id=d.pk)
    return {
        "docket_pk": d.pk,
        "content_updated": bool(rds_created or content_updated),
    }
コード例 #6
0
ファイル: tasks.py プロジェクト: devuri/courtlistener
def process_recap_docket(self, pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work on.
    :returns: A dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or recap document
            // was created (implying a Solr needs updating).
            'needs_solr_update': True,
        }

    This value is a dict so that it can be ingested in a Celery chain.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')

    if 'History/Documents' in text:
        # Prior to 1.1.8, we did not separate docket history reports into their
        # own upload_type. Alas, we still have some old clients around, so we
        # need to handle those clients here.
        pq.upload_type = DOCKET_HISTORY_REPORT
        pq.save()
        process_recap_docket_history_report(pk)
        self.request.callbacks = None
        return None

    report._parse_text(text)
    data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, pq.INVALID_CONTENT)
        self.request.callbacks = None
        return None

    # Merge the contents of the docket into CL.
    d, count = find_docket_object(pq.court_id, pq.pacer_case_id,
                                  data['docket_number'])
    if count > 1:
        logger.info("Found %s dockets during lookup. Choosing oldest." % count)
        d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, data)

    if pq.debug:
        mark_pq_successful(pq, d_id=d.pk)
        self.request.callbacks = None
        return {'docket_pk': d.pk, 'needs_solr_update': False}

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d, upload_type=DOCKET)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    rds_created, needs_solr_update = add_docket_entries(d, data['docket_entries'])
    add_parties_and_attorneys(d, data['parties'])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    mark_pq_successful(pq, d_id=d.pk)
    return {
        'docket_pk': d.pk,
        'needs_solr_update': bool(rds_created or needs_solr_update),
    }
コード例 #7
0
ファイル: tasks.py プロジェクト: haoyangt/courtlistener
def process_recap_docket(pk):
    """Process an uploaded docket from the RECAP API endpoint.

    param pk: The primary key of the processing queue item you want to work on.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    pq.status = pq.PROCESSING_IN_PROGRESS
    pq.save()
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')
    report._parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed of item %s" % pq)

    # Merge the contents of the docket into CL
    try:
        d = Docket.objects.get(
            Q(pacer_case_id=pq.pacer_case_id) |
            Q(docket_number=docket_data['docket_number']),
            court_id=pq.court_id,
        )
        # Add RECAP as a source if it's not already.
        if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
            d.source = Docket.RECAP_AND_SCRAPER
        elif d.source == Docket.COLUMBIA:
            d.source = Docket.COLUMBIA_AND_RECAP
        elif d.source == Docket.COLUMBIA_AND_SCRAPER:
            d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER
    except Docket.DoesNotExist:
        d = Docket(
            source=Docket.RECAP,
            pacer_case_id=pq.pacer_case_id,
            court_id=pq.court_id
        )
    except Docket.MultipleObjectsReturned:
        msg = "Too many dockets found when trying to look up '%s'" % pq
        logger.error(msg)
        pq.error_message = msg
        pq.status = pq.PROCESSING_FAILED
        pq.save()
        return None

    update_docket_metadata(d, docket_data)

    if pq.debug:
        mark_pq_successful(pq, d_id=d.pk)
        return d

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    # Docket entries
    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                }
            )
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq)
            )
            continue

        # Then make the RECAPDocument object. Try to find it. If we do, update
        # the pacer_doc_id field if it's blank. If we can't find it, create it
        # or throw an error.
        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            RECAPDocument.objects.create(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
                pacer_doc_id=docket_entry['pacer_doc_id'],
                is_available=False,
            )
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry number'%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq)
            )
            continue
        else:
            rd.pacer_doc_id = rd.pacer_doc_id or pq.pacer_doc_id

    add_parties_and_attorneys(d, docket_data['parties'])
    mark_pq_successful(pq, d_id=d.pk)
    return d
コード例 #8
0
    def run_parsers_on_path(
        self,
        path_root,
        required_fields=["date_filed", "case_name", "docket_number"],
    ):
        """Test all the parsers, faking the network query."""
        paths = []
        for root, dirnames, filenames in os.walk(path_root):
            for filename in fnmatch.filter(filenames, "*.html"):
                paths.append(os.path.join(root, filename))
        paths.sort()
        path_max_len = max(len(path) for path in paths) + 2
        for i, path in enumerate(paths):

            sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
            t1 = time.time()
            dirname, filename = os.path.split(path)
            filename_sans_ext = filename.split(".")[0]
            json_path = os.path.join(dirname, "%s.json" % filename_sans_ext)
            court = filename_sans_ext.split("_")[0]

            report = DocketReport(court)
            with open(path, "rb") as f:
                report._parse_text(f.read().decode("utf-8"))
            data = report.data

            if data != {}:
                # If the docket is a valid docket, make sure some required
                # fields are populated.
                for field in required_fields:
                    self.assertTrue(
                        data[field],
                        msg="Unable to find truthy value for field %s" % field,
                    )

                self.assertEqual(data["court_id"], court)

                # Party-specific tests...
                for party in data["parties"]:
                    self.assertTrue(
                        party.get("name", False),
                        msg="Every party must have a name attribute. Did not "
                        "get a value for:\n\n%s" % party,
                    )
                    # Protect against effed up adversary proceedings cases that
                    # don't parse properly. See: cacb, 2:08-ap-01570-BB
                    self.assertNotIn("----", party["name"])

            if not os.path.isfile(json_path):
                bar = "*" * 50
                print(
                    "\n\n%s\nJSON FILE DID NOT EXIST. CREATING IT AT:"
                    "\n\n  %s\n\n"
                    "Please test the data in this file before assuming "
                    "everything worked.\n%s\n" % (bar, json_path, bar)
                )
                with open(json_path, "w") as f:
                    json.dump(data, f, indent=2, sort_keys=True)
                    # self.assertFalse(True)
                    continue

            with open(json_path) as f:
                j = json.load(f)
                if j != {}:
                    # Compare docket entries and parties first, for easier
                    # debugging, then compare whole objects to be sure.
                    self.assertEqual(
                        j["docket_entries"], data["docket_entries"]
                    )
                    self.assertEqual(j["parties"], data["parties"])
                self.assertEqual(j, data)
            t2 = time.time()

            duration = t2 - t1
            warn_or_crash_slow_parser(duration, max_duration=1)
            sys.stdout.write("✓ - %0.1fs\n" % (t2 - t1))
コード例 #9
0
ファイル: test_pacer.py プロジェクト: janderse/juriscraper
    def run_parsers_on_path(self, path_root,
                            required_fields=[
                                'date_filed', 'case_name', 'docket_number']):
        """Test all the parsers, faking the network query."""
        paths = []
        for root, dirnames, filenames in os.walk(path_root):
            for filename in fnmatch.filter(filenames, '*.html'):
                paths.append(os.path.join(root, filename))
        paths.sort()
        path_max_len = max(len(path) for path in paths) + 2
        for i, path in enumerate(paths):

            sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
            t1 = time.time()
            dirname, filename = os.path.split(path)
            filename_sans_ext = filename.split('.')[0]
            json_path = os.path.join(dirname, '%s.json' % filename_sans_ext)
            court = filename_sans_ext.split('_')[0]

            report = DocketReport(court)
            with open(path, 'r') as f:
                report._parse_text(f.read().decode('utf-8'))
            data = report.data

            if data != {}:
                # If the docket is a valid docket, make sure some required
                # fields are populated.
                for field in required_fields:
                    self.assertTrue(
                        data[field],
                        msg="Unable to find truthy value for field %s" % field,
                    )

                self.assertEqual(data['court_id'], court)

                # Party-specific tests...
                for party in data['parties']:
                    self.assertTrue(
                        party.get('name', False),
                        msg="Every party must have a name attribute. Did not "
                            "get a value for:\n\n%s" % party
                    )
                    # Protect against effed up adversary proceedings cases that
                    # don't parse properly. See: cacb, 2:08-ap-01570-BB
                    self.assertNotIn('----', party['name'])

            if not os.path.isfile(json_path):
                bar = "*" * 50
                print("\n\n%s\nJSON FILE DID NOT EXIST. CREATING IT AT:"
                      "\n\n  %s\n\n"
                      "Please test the data in this file before assuming "
                      "everything worked.\n%s\n" % (bar, json_path, bar))
                with open(json_path, 'w') as f:
                    json.dump(data, f, indent=2, sort_keys=True)
                    #self.assertFalse(True)
                    continue

            with open(json_path) as f:
                j = json.load(f)
                if j != {}:
                    # Compare docket entries and parties first, for easier
                    # debugging, then compare whole objects to be sure.
                    self.assertEqual(j['docket_entries'], data['docket_entries'])
                    self.assertEqual(j['parties'], data['parties'])
                self.assertEqual(j, data)
            t2 = time.time()

            duration = t2 - t1
            warn_or_crash_slow_parser(duration, max_duration=1)
            sys.stdout.write("✓ - %0.1fs\n" % (t2-t1))
コード例 #10
0
def process_recap_docket(self, pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work on.
    :returns: A dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or recap document
            // was created (implying a Solr needs updating).
            'needs_solr_update': True,
        }

    This value is a dict so that it can be ingested in a Celery chain.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')

    if 'History/Documents' in text:
        # Prior to 1.1.8, we did not separate docket history reports into their
        # own upload_type. Alas, we still have some old clients around, so we
        # need to handle those clients here.
        pq.upload_type = pq.DOCKET_HISTORY_REPORT
        pq.save()
        process_recap_docket_history_report(pk)
        self.request.callbacks = None
        return None

    report._parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if docket_data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, pq.INVALID_CONTENT)
        self.request.callbacks = None
        return None

    # Merge the contents of the docket into CL. Attempt several lookups of
    # decreasing specificity. Note that pacer_case_id is required for Docket
    # uploads.
    d = None
    for kwargs in [{'pacer_case_id': pq.pacer_case_id,
                    'docket_number': docket_data['docket_number']},
                   {'pacer_case_id': pq.pacer_case_id},
                   {'docket_number': docket_data['docket_number'],
                    'pacer_case_id': None}]:
        try:
            d = Docket.objects.get(court_id=pq.court_id, **kwargs)
            break
        except Docket.DoesNotExist:
            continue
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to look up '%s'" % pq
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            self.request.callbacks = None
            return None

    if d is None:
        # Couldn't find it. Make a new one.
        d = Docket(
            source=Docket.RECAP,
            pacer_case_id=pq.pacer_case_id,
            court_id=pq.court_id
        )

    # Add RECAP as a source if it's not already.
    if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
        d.source = Docket.RECAP_AND_SCRAPER
    elif d.source == Docket.COLUMBIA:
        d.source = Docket.COLUMBIA_AND_RECAP
    elif d.source == Docket.COLUMBIA_AND_SCRAPER:
        d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER

    update_docket_metadata(d, docket_data)

    if pq.debug:
        mark_pq_successful(pq, d_id=d.pk)
        self.request.callbacks = None
        return {'docket_pk': d.pk, 'needs_solr_update': False}

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    # Docket entries & documents
    rds_created = []
    needs_solr_update = False
    for docket_entry in docket_data['docket_entries']:
        try:
            de, de_created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                }
            )
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq)
            )
            continue
        if de_created:
            needs_solr_update = True

        # Then make the RECAPDocument object. Try to find it. If we do, update
        # the pacer_doc_id field if it's blank. If we can't find it, create it
        # or throw an error.
        params = {
            'docket_entry': de,
            # No attachments when uploading dockets.
            'document_type': RECAPDocument.PACER_DOCUMENT,
            'document_number': docket_entry['document_number'],
        }
        try:
            rd = RECAPDocument.objects.get(**params)
        except RECAPDocument.DoesNotExist:
            rd = RECAPDocument.objects.create(
                pacer_doc_id=docket_entry['pacer_doc_id'],
                is_available=False,
                **params
            )
            rds_created.append(rd)
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry number'%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq)
            )
            continue
        else:
            rd.pacer_doc_id = rd.pacer_doc_id or pq.pacer_doc_id

    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    mark_pq_successful(pq, d_id=d.pk)
    return {
        'docket_pk': d.pk,
        'needs_solr_update': bool(rds_created or needs_solr_update),
    }
コード例 #11
0
    def run_parsers_on_path(
            self,
            path_root,
            required_fields=['date_filed', 'case_name', 'docket_number']):
        """Test all the parsers, faking the network query."""
        paths = []
        for root, dirnames, filenames in os.walk(path_root):
            for filename in fnmatch.filter(filenames, '*.html'):
                paths.append(os.path.join(root, filename))
        paths.sort()
        path_max_len = max(len(path) for path in paths) + 2
        for i, path in enumerate(paths):
            sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
            t1 = time.time()
            dirname, filename = os.path.split(path)
            filename_sans_ext = filename.split('.')[0]
            json_path = os.path.join(dirname, '%s.json' % filename_sans_ext)
            court = filename_sans_ext.split('_')[0]

            report = DocketReport(court)
            with open(path, 'r') as f:
                report._parse_text(f.read().decode('utf-8'))
            data = report.data

            if data != {}:
                # If the docket is a valid docket, make sure some required
                # fields are populated.
                for field in required_fields:
                    self.assertTrue(
                        data[field],
                        msg="Unable to find truthy value for field %s" % field,
                    )

                self.assertEqual(data['court_id'], court)

                # Party-specific tests...
                for party in data['parties']:
                    self.assertTrue(
                        party.get('name', False),
                        msg="Every party must have a name attribute. Did not "
                        "get a value for:\n\n%s" % party)
                    # Protect against effed up adversary proceedings cases that
                    # don't parse properly. See: cacb, 2:08-ap-01570-BB
                    self.assertNotIn('----', party['name'])

            if not os.path.isfile(json_path):
                bar = "*" * 50
                print("\n\n%s\nJSON FILE DID NOT EXIST. CREATING IT AT:"
                      "\n\n  %s\n\n"
                      "Please test the data in this file before assuming "
                      "everything worked.\n%s\n" % (bar, json_path, bar))
                with open(json_path, 'w') as f:
                    json.dump(data, f, indent=2, sort_keys=True)
                    #self.assertFalse(True)
                    continue

            with open(json_path) as f:
                j = json.load(f)
                if j != {}:
                    # Compare docket entries and parties first, for easier
                    # debugging, then compare whole objects to be sure.
                    self.assertEqual(j['docket_entries'],
                                     data['docket_entries'])
                    self.assertEqual(j['parties'], data['parties'])
                self.assertEqual(j, data)
            t2 = time.time()

            max_duration = 1
            duration = t2 - t1
            if duration > max_duration:
                if sys.gettrace() is None and not IS_TRAVIS:
                    # Don't do this if we're debugging.
                    raise SlownessException(
                        "The parser for '{fn}' took {duration}s to test, "
                        "which is more than the maximum allowed duration of "
                        "{max_duration}s.".format(
                            fn=filename,
                            duration=duration,
                            max_duration=max_duration,
                        ))
            sys.stdout.write("✓ - %0.1fs\n" % (t2 - t1))
コード例 #12
0
def process_docket_data(
    d: Docket,
    report_type: int,
    filepath: str = None,
) -> Optional[int]:
    """Process docket data file.

    :param d: A docket object to work on.
    :param report_type: Whether it's a docket or a docket history report.
    :param filepath: A local path where the item can be found. If not provided,
    the filepath_local field of the docket object will be attempted.
    """
    from cl.recap.mergers import (
        add_bankruptcy_data_to_docket,
        add_claims_to_docket,
        add_docket_entries,
        add_parties_and_attorneys,
        update_docket_appellate_metadata,
        update_docket_metadata,
    )

    court_id = map_cl_to_pacer_id(d.court_id)
    if report_type == UPLOAD_TYPE.DOCKET:
        report = DocketReport(court_id)
    elif report_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(court_id)
    elif report_type == UPLOAD_TYPE.APPELLATE_DOCKET:
        report = AppellateDocketReport(court_id)
    elif report_type == UPLOAD_TYPE.IA_XML_FILE:
        report = InternetArchive(court_id)
    elif report_type == UPLOAD_TYPE.CASE_REPORT_PAGE:
        report = CaseQuery(court_id)
    elif report_type == UPLOAD_TYPE.CLAIMS_REGISTER:
        report = ClaimsRegister(court_id)
    else:
        raise NotImplementedError(
            "The report type with id '%s' is not yet "
            "supported. Perhaps you need to add it?" % report_type
        )

    if filepath:
        with open(filepath, "r") as f:
            text = f.read()
    else:
        # This is an S3 path, so get it remotely.
        text = d.filepath_local.read().decode()

    report._parse_text(text)
    data = report.data
    if data == {}:
        return None

    if report_type == UPLOAD_TYPE.CLAIMS_REGISTER:
        add_bankruptcy_data_to_docket(d, data)
        add_claims_to_docket(d, data["claims"])
    else:
        update_docket_metadata(d, data)
        d, og_info = update_docket_appellate_metadata(d, data)
        if og_info is not None:
            og_info.save()
            d.originating_court_information = og_info
        d.save()
        if data.get("docket_entries"):
            add_docket_entries(d, data["docket_entries"])
    if report_type in (
        UPLOAD_TYPE.DOCKET,
        UPLOAD_TYPE.APPELLATE_DOCKET,
        UPLOAD_TYPE.IA_XML_FILE,
    ):
        add_parties_and_attorneys(d, data["parties"])
    return d.pk
コード例 #13
0
#!/usr/bin/env python
#
#  Takes an .html file on the command line, parses it using the PACER
#  Docket Report parser, and outputs json to stdout.

import jsondate3 as json
import sys

from juriscraper.pacer.http import PacerSession
from juriscraper.pacer import DocketReport

pacer_session = PacerSession(username="******", password="******")
report = DocketReport("psc", pacer_session)

for path in sys.argv[1:]:
    with open(path, "r") as f:
        report._parse_text(f.read().decode("utf-8"))
    data = report.data
    print json.dumps(data, indent=2, sort_keys=True, separators=(",", ": "))
コード例 #14
0
#!/usr/bin/env python
#
#  Takes an .html file on the command line, parses it using the PACER
#  Docket Report parser, and outputs json to stdout.

import jsondate as json
import sys

from juriscraper.pacer.http import PacerSession
from juriscraper.pacer import DocketReport

pacer_session = PacerSession(username='******',
                             password='******')
report = DocketReport('psc', pacer_session)

for path in sys.argv[1:]:
    with open(path, 'r') as f:
        report._parse_text(f.read().decode('utf-8'))
    data = report.data
    print json.dumps(data, indent=2, sort_keys=True, separators=(',', ': '))
コード例 #15
0
def process_recap_docket(pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work on.
    :return: The docket that's created or updated.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')
    report._parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if docket_data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, pq.INVALID_CONTENT)
        return None

    # Merge the contents of the docket into CL. Attempt several lookups of
    # decreasing specificity.
    d = None
    for kwargs in [{
            'pacer_case_id': pq.pacer_case_id,
            'docket_number': docket_data['docket_number']
    }, {
            'pacer_case_id': pq.pacer_case_id
    }, {
            'docket_number': docket_data['docket_number']
    }]:
        try:
            d = Docket.objects.get(court_id=pq.court_id, **kwargs)
            break
        except Docket.DoesNotExist:
            continue
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to look up '%s'" % pq
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            return None

    if d is None:
        # Couldn't find it. Make a new one.
        d = Docket(source=Docket.RECAP,
                   pacer_case_id=pq.pacer_case_id,
                   court_id=pq.court_id)

    # Add RECAP as a source if it's not already.
    if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
        d.source = Docket.RECAP_AND_SCRAPER
    elif d.source == Docket.COLUMBIA:
        d.source = Docket.COLUMBIA_AND_RECAP
    elif d.source == Docket.COLUMBIA_AND_SCRAPER:
        d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER

    update_docket_metadata(d, docket_data)

    if pq.debug:
        mark_pq_successful(pq, d_id=d.pk)
        return d

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    # Docket entries
    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                })
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s'" %
                (docket_entry['document_number'], pq))
            continue

        # Then make the RECAPDocument object. Try to find it. If we do, update
        # the pacer_doc_id field if it's blank. If we can't find it, create it
        # or throw an error.
        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            try:
                RECAPDocument.objects.create(
                    docket_entry=de,
                    # No attachments when uploading dockets.
                    document_type=RECAPDocument.PACER_DOCUMENT,
                    document_number=docket_entry['document_number'],
                    pacer_doc_id=docket_entry['pacer_doc_id'],
                    is_available=False,
                )
            except IntegrityError:
                logger.warn(
                    "Creating new document with pacer_doc_id of '%s' violates "
                    "unique constraint on pacer_doc_id field." %
                    docket_entry['pacer_doc_id'])
                continue
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry number'%s' "
                "while processing '%s'" %
                (docket_entry['document_number'], pq))
            continue
        else:
            rd.pacer_doc_id = rd.pacer_doc_id or pq.pacer_doc_id

    add_parties_and_attorneys(d, docket_data['parties'])
    mark_pq_successful(pq, d_id=d.pk)
    return d