def test_parsing_results(self):
    """Can we do a simple query and parse?"""
    paths = []
    path_root = os.path.join(TESTS_ROOT, "examples", "pacer", "attachment_pages")
    for root, dirnames, filenames in os.walk(path_root):
        for filename in fnmatch.filter(filenames, '*.html'):
            paths.append(os.path.join(root, filename))
    paths.sort()
    path_max_len = max(len(path) for path in paths) + 2
    for i, path in enumerate(paths):
        sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
        dirname, filename = os.path.split(path)
        filename_sans_ext = filename.split('.')[0]
        json_path = os.path.join(dirname, '%s.json' % filename_sans_ext)
        court = filename_sans_ext.split('_')[0]

        report = AttachmentPage(court)
        with open(path, 'r') as f:
            report._parse_text(f.read().decode('utf-8'))
        data = report.data

        if not os.path.exists(json_path):
            with open(json_path, 'w') as f:
                print("Creating new file at %s" % json_path)
                json.dump(data, f, indent=2, sort_keys=True)
            continue

        with open(json_path) as f:
            j = json.load(f)
        self.assertEqual(j, data)
        sys.stdout.write("✓\n")
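The test above pairs each example HTML file with a JSON file holding the expected parser output, writing the JSON on the first run. A sketch of the fixture convention it implies (file names here are hypothetical):

# examples/pacer/attachment_pages/cand_123.html  -> parsed with AttachmentPage("cand")
# examples/pacer/attachment_pages/cand_123.json  -> expected report.data, created on first run
# The court ID is the portion of the file name before the first underscore.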
def test_parsing_results(self):
    """Can we do a simple query and parse?"""
    paths = []
    path_root = os.path.join(TESTS_ROOT, "examples", "pacer", "attachment_pages")
    for root, dirnames, filenames in os.walk(path_root):
        for filename in fnmatch.filter(filenames, '*.html'):
            paths.append(os.path.join(root, filename))
    paths.sort()
    path_max_len = max(len(path) for path in paths) + 2
    for i, path in enumerate(paths):
        sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
        dirname, filename = os.path.split(path)
        filename_sans_ext = filename.split('.')[0]
        json_path = os.path.join(dirname, '%s.json' % filename_sans_ext)
        court = filename_sans_ext.split('_')[0]

        report = AttachmentPage(court)
        with open(path, 'r') as f:
            report._parse_text(f.read().decode('utf-8'))
        data = report.data

        with open(json_path) as f:
            j = json.load(f)
        self.assertEqual(j, data)
        sys.stdout.write("✓\n")
def process_recap_attachment(self, pk):
    """Process an uploaded attachment page from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work on
    :return: None
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    att_page = AttachmentPage(map_cl_to_pacer_id(pq.court_id))
    with open(pq.filepath_local.path) as f:
        text = f.read().decode('utf-8')
    att_page._parse_text(text)
    att_data = att_page.data
    logger.info("Parsing completed for item %s" % pq)

    if pq.pacer_case_id in ['undefined', 'null']:
        # Bad data from the client. Fix it with parsed data.
        pq.pacer_case_id = att_data.get('pacer_case_id')
        pq.save()

    # Merge the contents of the data into CL.
    try:
        params = {
            'pacer_doc_id': att_data['pacer_doc_id'],
            'docket_entry__docket__court': pq.court,
        }
        if pq.pacer_case_id:
            params['docket_entry__docket__pacer_case_id'] = pq.pacer_case_id
        main_rd = RECAPDocument.objects.get(**params)
    except RECAPDocument.MultipleObjectsReturned:
        # Unclear how to proceed and we don't want to associate this data with
        # the wrong case. We must punt.
        msg = "Too many documents found when attempting to associate " \
              "attachment data"
        mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
        return None
    except RECAPDocument.DoesNotExist as exc:
        msg = "Could not find docket to associate with attachment metadata"
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            return None
        else:
            mark_pq_status(pq, msg, pq.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    # We got the right item. Update/create all the attachments for
    # the docket entry.
    de = main_rd.docket_entry
    if att_data['document_number'] is None:
        # Bankruptcy attachment page. Use the document number from the Main doc
        att_data['document_number'] = main_rd.document_number

    rds_created = []
    if not pq.debug:
        # Save the old HTML to the docket entry.
        pacer_file = PacerHtmlFiles(content_object=de,
                                    upload_type=ATTACHMENT_PAGE)
        pacer_file.filepath.save(
            'attachment_page.html',  # Irrelevant b/c UUIDFileSystemStorage
            ContentFile(text),
        )

        # Create/update the attachment items.
        for attachment in att_data['attachments']:
            if all([attachment['attachment_number'],
                    # Missing on sealed items.
                    attachment.get('pacer_doc_id', False),
                    # Missing on some restricted docs (see Juriscraper)
                    attachment['page_count'] is not None,
                    attachment['description']]):
                rd, created = RECAPDocument.objects.update_or_create(
                    docket_entry=de,
                    document_number=att_data['document_number'],
                    attachment_number=attachment['attachment_number'],
                    document_type=RECAPDocument.ATTACHMENT,
                )
                if created:
                    rds_created.append(rd)
                needs_save = False
                for field in ['description', 'pacer_doc_id']:
                    if attachment[field]:
                        setattr(rd, field, attachment[field])
                        needs_save = True
                if needs_save:
                    rd.save()

                # Do *not* do this async — that can cause race conditions.
                add_or_update_recap_document([rd.pk], force_commit=False)

    mark_pq_successful(pq, d_id=de.docket_id, de_id=de.pk)
    process_orphan_documents(rds_created, pq.court_id,
                             main_rd.docket_entry.docket.date_filed)
def process_recap_attachment(self, pk, tag_names=None):
    """Process an uploaded attachment page from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work on
    :param tag_names: A list of tag names to add to all items created or
    modified in this function.
    :return: None
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    att_page = AttachmentPage(map_cl_to_pacer_id(pq.court_id))
    with open(pq.filepath_local.path) as f:
        text = f.read().decode("utf-8")
    att_page._parse_text(text)
    att_data = att_page.data
    logger.info("Parsing completed for item %s" % pq)

    if att_data == {}:
        # Bad attachment page.
        msg = "Not a valid attachment page upload."
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return None

    if pq.pacer_case_id in ["undefined", "null"]:
        # Bad data from the client. Fix it with parsed data.
        pq.pacer_case_id = att_data.get("pacer_case_id")
        pq.save()

    # Merge the contents of the data into CL.
    try:
        params = {
            "pacer_doc_id": att_data["pacer_doc_id"],
            "docket_entry__docket__court": pq.court,
        }
        if pq.pacer_case_id:
            params["docket_entry__docket__pacer_case_id"] = pq.pacer_case_id
        main_rd = RECAPDocument.objects.get(**params)
    except RECAPDocument.MultipleObjectsReturned:
        # Unclear how to proceed and we don't want to associate this data with
        # the wrong case. We must punt.
        msg = (
            "Too many documents found when attempting to associate "
            "attachment data"
        )
        mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
        return None
    except RECAPDocument.DoesNotExist as exc:
        msg = "Could not find docket to associate with attachment metadata"
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    # We got the right item. Update/create all the attachments for
    # the docket entry.
    de = main_rd.docket_entry
    if att_data["document_number"] is None:
        # Bankruptcy attachment page. Use the document number from the Main doc
        att_data["document_number"] = main_rd.document_number

    rds_created = []
    if not pq.debug:
        # Save the old HTML to the docket entry.
        pacer_file = PacerHtmlFiles(
            content_object=de, upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE
        )
        pacer_file.filepath.save(
            "attachment_page.html",  # Irrelevant b/c UUIDFileSystemStorage
            ContentFile(text),
        )

        # Create/update the attachment items.
        tags = []
        if tag_names:
            for tag_name in tag_names:
                tag, _ = Tag.objects.get_or_create(name=tag_name)
                tags.append(tag)
        for attachment in att_data["attachments"]:
            if all(
                [
                    attachment["attachment_number"],
                    # Missing on sealed items.
                    attachment.get("pacer_doc_id", False),
                    # Missing on some restricted docs (see Juriscraper)
                    attachment["page_count"] is not None,
                    attachment["description"],
                ]
            ):
                rd, created = RECAPDocument.objects.update_or_create(
                    docket_entry=de,
                    document_number=att_data["document_number"],
                    attachment_number=attachment["attachment_number"],
                    document_type=RECAPDocument.ATTACHMENT,
                )
                if created:
                    rds_created.append(rd)
                needs_save = False
                for field in ["description", "pacer_doc_id"]:
                    if attachment[field]:
                        setattr(rd, field, attachment[field])
                        needs_save = True

                # Only set page_count and file_size if they're blank, in case
                # we got the real value by measuring.
                if rd.page_count is None:
                    rd.page_count = attachment["page_count"]
                if rd.file_size is None and attachment["file_size_str"]:
                    try:
                        rd.file_size = convert_size_to_bytes(
                            attachment["file_size_str"]
                        )
                    except ValueError:
                        pass

                if needs_save:
                    rd.save()
                if tags:
                    for tag in tags:
                        tag.tag_object(rd)

                # Do *not* do this async — that can cause race conditions.
                add_items_to_solr([rd.pk], "search.RECAPDocument")

    mark_pq_successful(pq, d_id=de.docket_id, de_id=de.pk)
    process_orphan_documents(
        rds_created, pq.court_id, main_rd.docket_entry.docket.date_filed
    )
    changed = mark_ia_upload_needed(de.docket)
    if changed:
        de.docket.save()
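A minimal usage sketch, not taken from the source: the `self` parameter, `self.request.retries`, and `self.retry(...)` above imply a bound Celery task, so callers would normally enqueue it with the primary key of an existing ProcessingQueue row. The import path, queue item id, and tag name below are assumptions.

# Hedged sketch: assumes process_recap_attachment is registered as a bound Celery task.
from cl.recap.tasks import process_recap_attachment  # hypothetical import path

pq = ProcessingQueue.objects.get(pk=some_pq_id)  # an already-uploaded attachment page
process_recap_attachment.delay(pq.pk, tag_names=["my-collection"])  # hypothetical tag name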
def get_data_from_att_report(text: str, court_id: str) -> Dict[str, str]:
    """Parse the HTML of a PACER attachment page and return the extracted data."""
    att_page = AttachmentPage(map_cl_to_pacer_id(court_id))
    att_page._parse_text(text)
    att_data = att_page.data
    return att_data
def get_data_from_att_report(text, court_id):
    """Parse the HTML of a PACER attachment page and return the extracted data."""
    att_page = AttachmentPage(map_cl_to_pacer_id(court_id))
    att_page._parse_text(text)
    att_data = att_page.data
    return att_data
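A minimal usage sketch for the helper above (the file name and court ID are hypothetical): read the raw HTML of an attachment page and inspect the parsed payload, whose keys match those consumed by process_recap_attachment.

with open("attachment_page.html") as f:  # hypothetical local copy of an attachment page
    text = f.read()
att_data = get_data_from_att_report(text, "cand")  # "cand" is a hypothetical court ID
print(att_data["pacer_doc_id"], att_data["document_number"])
for attachment in att_data["attachments"]:
    print(attachment["attachment_number"], attachment["description"])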
def process_recap_attachment(self, pk):
    """Process an uploaded attachment page from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work on
    :return: None
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    att_page = AttachmentPage(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')
    att_page._parse_text(text)
    att_data = att_page.data
    logger.info("Parsing completed for item %s" % pq)

    # Merge the contents of the data into CL.
    try:
        rd = RECAPDocument.objects.get(
            pacer_doc_id=att_data['pacer_doc_id'],
            docket_entry__docket__court=pq.court,
        )
    except RECAPDocument.DoesNotExist as exc:
        msg = "Could not find docket to associate with attachment metadata"
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            return None
        else:
            mark_pq_status(pq, msg, pq.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    # We got the right item. Update/create all the attachments for
    # the docket entry.
    de = rd.docket_entry
    if att_data['document_number'] is None:
        # Bankruptcy attachment page. Use the document number from the Main doc
        att_data['document_number'] = rd.document_number

    if not pq.debug:
        # Save the old HTML to the docket entry.
        pacer_file = PacerHtmlFiles(content_object=de)
        pacer_file.filepath.save(
            'attachment_page.html',  # Irrelevant b/c UUIDFileSystemStorage
            ContentFile(text),
        )

        # Create/update the attachment items.
        for attachment in att_data['attachments']:
            if all([
                attachment['attachment_number'],
                # Missing on sealed items.
                attachment.get('pacer_doc_id', False),
                # Missing on some restricted docs (see Juriscraper)
                attachment['page_count'] is not None,
                attachment['description']
            ]):
                rd, created = RECAPDocument.objects.update_or_create(
                    docket_entry=de,
                    document_number=att_data['document_number'],
                    attachment_number=attachment['attachment_number'],
                    document_type=RECAPDocument.ATTACHMENT,
                )
                needs_save = False
                for field in ['description', 'pacer_doc_id']:
                    if attachment[field]:
                        setattr(rd, field, attachment[field])
                        needs_save = True
                if needs_save:
                    try:
                        rd.save()
                    except IntegrityError:
                        # Happens when we hit courtlistener/issues#765, in which
                        # we violate the unique constraint on pacer_doc_id.
                        continue

                # Do *not* do this async — that can cause race conditions.
                add_or_update_recap_document([rd.pk], force_commit=False)

    mark_pq_successful(pq, d_id=de.docket_id, de_id=de.pk)