Example #1
 def to_model(self):
     # Create a citation object as in our models. Eventually, the version in
     # our models should probably be the only object named "Citation". Until
     # then, this function helps map from this object to the Citation object
     # in the models.
     c = ModelCitation(
         **{
             key: value
             for key, value in self.__dict__.items()
             if key in ModelCitation._meta.get_all_field_names()
         })
     canon = REPORTERS[self.canonical_reporter]
     cite_type = canon[self.lookup_index]["cite_type"]
     c.type = map_reporter_db_cite_type(cite_type)
     return c
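
# Hedged illustration of the field-filtering idiom used in to_model() above:
# only the attributes whose names match fields on the target model are copied
# into the constructor kwargs. The stub class and field set below are
# stand-ins, not the project's Django models.
_MODEL_FIELDS = {"volume", "reporter", "page"}


class _CitationStub:
    def __init__(self):
        self.volume = 2
        self.reporter = "U.S."
        self.page = "199"
        self.parse_artifact = "not a model field, so it is filtered out"


_kwargs = {
    key: value
    for key, value in _CitationStub().__dict__.items()
    if key in _MODEL_FIELDS
}
assert _kwargs == {"volume": 2, "reporter": "U.S.", "page": "199"}
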
def merge_or_add_opinions(
    cluster_id: int,
    html_str: str,
    data: Dict[str, Any],
    date_argued: datetime.date,
    date_filed: datetime.date,
    case_names: Dict[str, str],
    status: str,
    docket_number: str,
    found_citations: List[FoundCitation],
) -> Optional[Docket]:
    """Merge opinions if applicable.

    If the opinion is not already in the system, merge it into or add it to
    the cluster. If the opinion already in the system came from Harvard, add
    the new opinion to the cluster; otherwise, merge the new opinion data
    into the scraped opinion.

    :param cluster_id: Opinion Cluster id.
    :param html_str: HTML opinion to add.
    :param data: Case data to import.
    :param date_argued: Date case was argued.
    :param date_filed: Date case was filed.
    :param case_names: A dict with the three case name types.
    :param status: The precedential status.
    :param docket_number: The docket number.
    :param found_citations: A list of FoundCitation objects.
    :return: The merged Docket, or None if the opinion is already in the
    system.
    """
    does_exist = (
        Opinion.objects.filter(cluster_id=cluster_id)
        .exclude(html_anon_2020="")
        .exists()
    )
    if does_exist:
        logger.info(f"Opinion already in database at {cluster_id}")
        return

    logger.info(f"Starting merger of opinions in cluster {cluster_id}.")

    cluster = OpinionCluster.objects.get(pk=cluster_id)
    docket = cluster.docket

    # Dates are uniformly good in our dataset; validation and is_approx are
    # not needed.

    # Merge docket information
    docket.add_anon_2020_source()
    docket.date_argued = date_argued or docket.date_argued
    docket.docket_number = docket_number or docket.docket_number
    docket.case_name_short = (case_names["case_name_short"]
                              or docket.case_name_short)
    docket.case_name = case_names["case_name"] or docket.case_name
    docket.case_name_full = (case_names["case_name_full"]
                             or docket.case_name_full)

    # Merge cluster information
    cluster.date_filed = date_filed or cluster.date_filed
    cluster.precedential_status = status or cluster.precedential_status
    cluster.attorneys = data["representation"] or cluster.attorneys
    cluster.disposition = data["summary_disposition"] or cluster.disposition
    cluster.summary = data["summary_court"] or cluster.summary
    cluster.history = data["history"] or cluster.history
    cluster.cross_reference = (data["history_docket_numbers"]
                               or cluster.cross_reference)
    cluster.correction = data["publication_status_note"] or cluster.correction
    if data["judges"]:
        cluster.judges = (data["judges"].replace("{", "").replace("}", "")
                          or cluster.judges)
    cluster.case_name_short = (case_names["case_name_short"]
                               or cluster.case_name_short)
    cluster.case_name = case_names["case_name"] or cluster.case_name
    cluster.case_name_full = (case_names["case_name_full"]
                              or cluster.case_name_full)

    docket.save()
    cluster.save()

    # Add citations to cluster if applicable
    for citation in found_citations:
        Citation.objects.get_or_create(
            volume=citation.volume,
            reporter=citation.reporter,
            page=citation.page,
            type=map_reporter_db_cite_type(
                REPORTERS[citation.canonical_reporter][0]["cite_type"]),
            cluster_id=cluster.id,
        )

    # Merge into the scraped opinion, or add the new opinion to the Harvard
    # cluster.
    if cluster.source == "C":
        opinion = Opinion.objects.get(cluster_id=cluster_id)
        logger.info("Merging new opinion data into the scraped opinion")
        opinion.html_anon_2020 = html_str
    else:
        opinion = Opinion(
            cluster_id=cluster.id,
            type=Opinion.COMBINED,
            html_anon_2020=html_str,
            extracted_by_ocr=False,
        )
    opinion.save()
    logger.info(f"Finished merging opinion in cluster {cluster_id}.")
    return docket
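
# Hedged sketch (illustrative, not the project's implementation) of what
# map_reporter_db_cite_type is assumed to do: translate a reporters-db
# "cite_type" string into the integer constant stored on Citation.type.
# The integer values below are placeholders.
_CITE_TYPE_SKETCH = {
    "federal": 1,
    "state": 2,
    "state_regional": 3,
    "specialty": 4,
    "neutral": 8,
}


def map_reporter_db_cite_type_sketch(cite_type: str) -> int:
    """Return a Citation.type value for a reporters-db cite_type string."""
    return _CITE_TYPE_SKETCH.get(cite_type, 4)
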
def parse_harvard_opinions(reporter, volume, make_searchable):
    """
    Parse the downloaded CaseLaw Corpus from the Internet Archive and add the
    cases to our database.

    Optionally uses a reporter abbreviation, as used by IA, to identify cases
    to download (e.g. T.C. => tc).

    Optionally uses a volume integer.

    If neither is provided, the code will cycle through all downloaded files.

    :param reporter: Reporter string as slugified (optional), e.g. "tc" for T.C.
    :param volume: The volume (int) of the reporter (optional), e.g. 10.
    :param make_searchable: Whether to add the new opinions to Solr.
    :return: None
    """
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        ia_download_url = "/".join(
            ["https://archive.org/download", file_path.split("/", 9)[-1]]
        )

        if OpinionCluster.objects.filter(
            filepath_json_harvard=file_path
        ).exists():
            logger.info("Skipping - already in system %s" % ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            logger.warning("Empty json: missing case at: %s" % ia_download_url)
            continue
        except Exception as e:
            logger.warning("Unknown error %s for: %s" % (e, ia_download_url))
            continue

        cites = get_citations(data["citations"][0]["cite"])
        if not cites:
            logger.info(
                "No citation found for %s." % data["citations"][0]["cite"]
            )
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name, file_path):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML
        # Flag them for a later crawl by using the placeholder '[[Image]]'
        judge_list = [
            extract_judge_last_name(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            extract_judge_last_name(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe list of judges
        judges = ", ".join(
            sorted(
                list(
                    set(
                        itertools.chain.from_iterable(judge_list + author_list)
                    )
                )
            )
        )
        judges = titlecase(judges)
        docket_string = (
            data["docket_number"]
            .replace("Docket No.", "")
            .replace("Docket Nos.", "")
            .strip()
        )

        short_fields = ["attorneys", "disposition", "otherdate", "seealso"]

        long_fields = [
            "syllabus",
            "summary",
            "history",
            "headnotes",
            "correction",
        ]

        short_data = parse_extra_fields(soup, short_fields, False)
        long_data = parse_extra_fields(soup, long_fields, True)

        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )
            try:
                with transaction.atomic():
                    docket.save()
            except OperationalError as e:
                if "exceeds maximum" in str(e):
                    docket.docket_number = (
                        "%s, See Corrections for full Docket Number"
                        % trunc(docket_string, length=5000, ellipsis="...")
                    )
                    docket.save()
                    long_data["correction"] = "%s <br> %s" % (
                        data["docket_number"],
                        long_data["correction"],
                    )
            # Handle partial dates by adding -01 to YYYY-MM dates
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=short_data["attorneys"],
                disposition=short_data["disposition"],
                syllabus=long_data["syllabus"],
                summary=long_data["summary"],
                history=long_data["history"],
                other_dates=short_data["otherdate"],
                cross_reference=short_data["seealso"],
                headnotes=long_data["headnotes"],
                correction=long_data["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )
            cluster.save(index=False)

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.canonical_reporter][0]["cite_type"]
                ),
                cluster_id=cluster.id,
            )
            new_op_pks = []
            for op in soup.find_all("opinion"):
                # Strip page-number tags from the author tag before
                # processing. This is particularly useful for identifying
                # Per Curiam opinions.
                auth = op.find("author")
                if auth is not None:
                    for page_number in auth.find_all("page-number"):
                        page_number.extract()
                    author_tag_str = titlecase(auth.text.strip(":"))
                    author_str = titlecase(
                        "".join(extract_judge_last_name(author_tag_str))
                    )
                else:
                    author_str = ""
                    author_tag_str = ""

                per_curiam = author_tag_str == "Per Curiam"
                # If Per Curiam is True set author string to Per Curiam
                if per_curiam:
                    author_str = "Per Curiam"

                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info("Adding opinion for: %s", citation.base_citation())
                opinion = Opinion(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    per_curiam=per_curiam,
                    extracted_by_ocr=True,
                )
                # Don't index now; do so later if desired
                opinion.save(index=False)
                new_op_pks.append(opinion.pk)

        if make_searchable:
            add_items_to_solr.delay(new_op_pks, "search.Opinion")

        logger.info("Finished: %s", citation.base_citation())
def add_new_records(
    html_str: str,
    data: Dict[str, Any],
    date_argued: datetime.date,
    date_filed: datetime.date,
    case_names: Dict[str, str],
    status: str,
    docket_number: str,
    found_citations: List[FoundCitation],
    court_id: str,
) -> Docket:
    """Create new records in the DB based on parsed data

    :param html_str: HTML opinion to add
    :param data: Case data to import
    :param date_argued: Date case was argued.
    :param date_filed: Date case was filed.
    :param case_names: A dict with the three case name types
    :param status: Whether it's precedential
    :param docket_number: The docket number
    :param found_citations: A list of FoundCitation objects.
    :param court_id: The CL id of the court
    :return: The newly created Docket.
    """
    docket = Docket.objects.create(
        **case_names,
        docket_number=docket_number,
        court_id=court_id,
        source=Docket.ANON_2020,
        ia_needs_upload=False,
        date_argued=date_argued,
    )

    logger.info("Add cluster for: %s", found_citations[0].base_citation())
    judges = data["judges"] or ""
    cluster = OpinionCluster(
        **case_names,
        precedential_status=status,
        docket_id=docket.id,
        source=docket.ANON_2020,
        date_filed=date_filed,
        attorneys=data["representation"] or "",
        disposition=data["summary_disposition"] or "",
        summary=data["summary_court"] or "",
        history=data["history"] or "",
        cross_reference=data["history_docket_numbers"] or "",
        correction=data["publication_status_note"] or "",
        judges=judges.replace("{", "").replace("}", "") or "",
    )
    cluster.save(index=False)

    for citation in found_citations:
        logger.info("Adding citation for: %s", citation.base_citation())
        Citation.objects.get_or_create(
            volume=citation.volume,
            reporter=citation.reporter,
            page=citation.page,
            type=map_reporter_db_cite_type(
                REPORTERS[citation.canonical_reporter][0]["cite_type"]),
            cluster_id=cluster.id,
        )

    op = Opinion(
        cluster_id=cluster.id,
        type=Opinion.COMBINED,
        html_anon_2020=html_str,
        extracted_by_ocr=False,
    )
    op.save()
    logger.info(
        f"Finished importing cluster {cluster.id}; {found_citations[0].base_citation()}"
    )
    return docket
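
# Hedged usage sketch: a caller is assumed to first try to match the parsed
# case to an existing cluster (e.g. via its citations) and then dispatch to
# merge_or_add_opinions or add_new_records. The lookup result and the layout
# of the "parsed" dict are illustrative assumptions, not the project's API.
def import_anon_2020_case_sketch(html_str, data, parsed, court_id):
    cluster_id = parsed.get("matched_cluster_id")  # hypothetical match result
    if cluster_id:
        return merge_or_add_opinions(
            cluster_id,
            html_str,
            data,
            parsed["date_argued"],
            parsed["date_filed"],
            parsed["case_names"],
            parsed["status"],
            parsed["docket_number"],
            parsed["found_citations"],
        )
    return add_new_records(
        html_str,
        data,
        parsed["date_argued"],
        parsed["date_filed"],
        parsed["case_names"],
        parsed["status"],
        parsed["docket_number"],
        parsed["found_citations"],
        court_id,
    )
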
def parse_harvard_opinions(reporter, volume):
    """
    Parse the downloaded CaseLaw Corpus from the Internet Archive and add the
    cases to our database.

    Optionally uses a reporter abbreviation, as used by IA, to identify cases
    to download (e.g. T.C. => tc).

    Optionally uses a volume integer.

    If neither is provided, the code will cycle through all downloaded files.

    :param reporter: Reporter string as slugified (optional), e.g. "tc" for T.C.
    :param volume: The volume (int) of the reporter (optional), e.g. 10.
    :return: None
    """
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        ia_download_url = "/".join(
            ["https://archive.org/download", file_path.split("/", 9)[-1]]
        )

        if OpinionCluster.objects.filter(
            filepath_json_harvard=file_path
        ).exists():
            logger.info("Skipping - already in system %s" % ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            logger.warning("Empty json: missing case at: %s" % ia_download_url)
            continue
        except Exception as e:
            logger.warning("Unknown error %s for: %s" % (e, ia_download_url))
            continue

        cites = get_citations(data["citations"][0]["cite"], html=False)
        if not cites:
            logger.info(
                "No citation found for %s." % data["citations"][0]["cite"]
            )
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML
        # Flag them for a later crawl by using the placeholder '[[Image]]'
        judge_list = [
            find_judge_names(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            find_judge_names(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe list of judges
        judges = ", ".join(
            list(set(itertools.chain.from_iterable(judge_list + author_list)))
        )
        judges = titlecase(judges)
        docket_string = (
            data["docket_number"]
            .replace("Docket No.", "")
            .replace("Docket Nos.", "")
            .strip()
        )

        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket.objects.create(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )
            # Iterate over the other XML fields in the Harvard data set and
            # save them as strings for further processing at a later date.
            json_fields = [
                "attorneys",
                "disposition",
                "syllabus",
                "summary",
                "history",
                "otherdate",
                "seealso",
                "headnotes",
                "correction",
            ]
            data_set = {}
            for key in json_fields:
                data_set[key] = "|".join([x.text for x in soup.find_all(key)])

            # Handle partial dates by adding -01 to YYYY-MM dates
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster.objects.create(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=data_set["attorneys"],
                disposition=data_set["disposition"],
                syllabus=data_set["syllabus"],
                summary=data_set["summary"],
                history=data_set["history"],
                other_dates=data_set["otherdate"],
                cross_reference=data_set["seealso"],
                headnotes=data_set["headnotes"],
                correction=data_set["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.canonical_reporter][0]["cite_type"]
                ),
                cluster_id=cluster.id,
            )
            for op in soup.find_all("opinion"):
                joined_by_str = titlecase(
                    " ".join(
                        list(set(itertools.chain.from_iterable(judge_list)))
                    )
                )
                author_str = titlecase(
                    " ".join(
                        list(set(itertools.chain.from_iterable(author_list)))
                    )
                )

                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info("Adding opinion for: %s", citation.base_citation())
                Opinion.objects.create(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    joined_by_str=joined_by_str,
                    extracted_by_ocr=True,
                )

        logger.info("Finished: %s", citation.base_citation())