Example #1
 def get_judges(self, node):
     """Parse out the judge string and then look it up in the DB"""
     try:
         s = self.case_details.xpath('%s/text()' % node)[0].strip()
     except IndexError:
         print "  Couldn't get judge for node: %s" % node
         return None, ''
     else:
         judge_names = find_judge_names(s)
         judges = []
         for judge_name in judge_names:
             judges.append(
                 find_person(judge_name,
                             self.court.pk,
                             case_date=self.date_filed))
         judges = [c for c in judges if c is not None]
         if len(judges) == 0:
             print "  No judges found after lookup."
             logger.info("No judge for: %s" %
                         ((s, self.court.pk, self.date_filed), ))
             return None, s
         elif len(judges) == 1:
             return judges[0], s
         elif len(judges) > 1:
             print "  Too many judges found: %s" % len(judges)
             return None, s
Example #2
 def get_judges(self, node):
     """Parse out the judge string and then look it up in the DB"""
     try:
         s = self.case_details.xpath('%s/text()' % node)[0].strip()
     except IndexError:
         print "  Couldn't get judge for node: %s" % node
         return None, ''
     else:
         judge_names = find_judge_names(s)
         judges = []
         for judge_name in judge_names:
             judges.append(find_person(judge_name, self.court.pk,
                                       case_date=self.date_filed))
         judges = [c for c in judges if c is not None]
         if len(judges) == 0:
             print "  No judges found after lookup."
             logger.info("No judge for: %s" % (
                 (s, self.court.pk, self.date_filed),
             ))
             return None, s
         elif len(judges) == 1:
             return judges[0], s
         elif len(judges) > 1:
             print "  Too many judges found: %s" % len(judges)
             return None, s
Example #3
def get_candidate_judge_objects(judge_str, court_id, event_date):
    """Take a string of text in a time and place and figure out which judges
    match up to it.
    """
    judges = find_judge_names(judge_str)

    if len(judges) == 0:
        return []

    candidates = []
    for judge in judges:
        candidates.append(find_person(judge, court_id, case_date=event_date))
    return [c for c in candidates if c is not None]
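A minimal usage sketch for the helper above; the court id 'ca1' and the date are made-up inputs, and the call assumes the same environment that provides find_judge_names and find_person.

# Hypothetical call, for illustration only.
from datetime import date

people = get_candidate_judge_objects("OVERTON, J. and BURWELL, J.",
                                     "ca1", date(1998, 3, 2))
for person in people:
    print(person)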
Example #4
 def test_get_judge_from_string_columbia(self):
     """Can we cleanly get a judge value from a string?"""
     tests = (
         (
             "CLAYTON <italic>Ch. Jus. of the Superior Court,</italic> "
             "delivered the following opinion of this Court: ",
             ["clayton"],
         ),
         ("OVERTON, J. &#8212; ", ["overton"]),
         ("BURWELL, J.:", ["burwell"]),
     )
     for q, a in tests:
         self.assertEqual(find_judge_names(q), a)
Example #5
 def test_get_judge_from_string_columbia(self):
     """Can we cleanly get a judge value from a string?"""
     tests = ((
         'CLAYTON <italic>Ch. Jus. of the Superior Court,</italic> '
         'delivered the following opinion of this Court: ',
         ['clayton'],
     ), (
         'OVERTON, J. &#8212; ',
         ['overton'],
     ), (
         'BURWELL, J.:',
         ['burwell'],
     ))
     for q, a in tests:
         self.assertEqual(find_judge_names(q), a)
Example #6
 def test_get_judge_from_string_columbia(self):
     """Can we cleanly get a judge value from a string?"""
     tests = ((
         'CLAYTON <italic>Ch. Jus. of the Superior Court,</italic> '
         'delivered the following opinion of this Court: ',
         ['clayton'],
     ), (
         'OVERTON, J. &#8212; ',
         ['overton'],
     ), (
         'BURWELL, J.:',
         ['burwell'],
     ))
     for q, a in tests:
         self.assertEqual(find_judge_names(q), a)
Example #7
def get_candidate_judges(judge_str, court_id, event_date):
    """Figure out who a judge is from a string and some metadata.

    :param judge_str: A string containing the judge's name.
    :param court_id: A CL Court ID where the case occurred.
    :param event_date: The date of the case.
    :return: None if judge_str is empty; otherwise a list of the Person
    objects that could be matched to the judge names found in the string
    (possibly empty).
    """
    if not judge_str:
        return None

    judges = find_judge_names(judge_str)

    if len(judges) == 0:
        return []

    candidates = []
    for judge in judges:
        candidates.append(find_person(judge, court_id, case_date=event_date))
    return [c for c in candidates if c is not None]
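A sketch of how a caller might act on that return value, mirroring the one-match/many-match branching used in the scripts later in this section; the wrapper name is hypothetical.

def pick_author_or_panel(judge_str, court_id, event_date):
    # Hypothetical wrapper: one match -> author, several -> panel, none -> nothing.
    candidates = get_candidate_judges(judge_str, court_id, event_date) or []
    if len(candidates) == 1:
        return 'author', candidates[0]
    if len(candidates) > 1:
        return 'panel', candidates
    return None, None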
Example #8
def parse_harvard_opinions(reporter, volume, make_searchable):
    """
    Parse the downloaded CaseLaw corpus from the Internet Archive and add
    the cases to our database.

    Optionally uses a reporter abbreviation, as used by IA, to identify the
    cases to download (e.g. T.C. => tc).

    Optionally uses a volume integer.

    If neither is provided, the code cycles through all downloaded files.

    :param volume: The volume of the reporter, as an integer (optional, e.g. 10)
    :param reporter: The slugified reporter abbreviation (optional, e.g. "tc"
    for T.C.)
    :param make_searchable: Whether to add the new opinions to Solr
    :return: None
    """
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        ia_download_url = "/".join(
            ["https://archive.org/download",
             file_path.split("/", 9)[-1]])

        if OpinionCluster.objects.filter(
                filepath_json_harvard=file_path).exists():
            logger.info("Skipping - already in system %s" % ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            logger.warning("Empty json: missing case at: %s" % ia_download_url)
            continue
        except Exception as e:
            logger.warning("Unknown error %s for: %s" % (e, ia_download_url))
            continue

        cites = get_citations(data["citations"][0]["cite"], html=False)
        if not cites:
            logger.info("No citation found for %s." %
                        data["citations"][0]["cite"])
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name, file_path):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML
        # Flag them for a later crawl by using the placeholder '[[Image]]'
        judge_list = [
            find_judge_names(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            find_judge_names(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe list of judges
        judges = ", ".join(
            sorted(
                list(
                    set(itertools.chain.from_iterable(judge_list +
                                                      author_list)))))
        judges = titlecase(judges)
        docket_string = (data["docket_number"].replace(
            "Docket No.", "").replace("Docket Nos.", "").strip())

        short_fields = ["attorneys", "disposition", "otherdate", "seealso"]

        long_fields = [
            "syllabus",
            "summary",
            "history",
            "headnotes",
            "correction",
        ]

        short_data = parse_extra_fields(soup, short_fields, False)
        long_data = parse_extra_fields(soup, long_fields, True)

        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )
            try:
                with transaction.atomic():
                    docket.save()
            except OperationalError as e:
                if "exceeds maximum" in str(e):
                    docket.docket_number = (
                        "%s, See Corrections for full Docket Number" %
                        trunc(docket_string, length=5000, ellipsis="..."))
                    docket.save()
                    long_data["correction"] = "%s <br> %s" % (
                        data["docket_number"],
                        long_data["correction"],
                    )
            # Handle partial dates by adding -01 to YYYY-MM dates
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=short_data["attorneys"],
                disposition=short_data["disposition"],
                syllabus=long_data["syllabus"],
                summary=long_data["summary"],
                history=long_data["history"],
                other_dates=short_data["otherdate"],
                cross_reference=short_data["seealso"],
                headnotes=long_data["headnotes"],
                correction=long_data["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )
            cluster.save(index=False)

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.canonical_reporter][0]["cite_type"]),
                cluster_id=cluster.id,
            )
            new_op_pks = []
            for op in soup.find_all("opinion"):
                # Clean the author tag before processing by stripping out any
                # page-number tags. This is particularly useful for
                # identifying Per Curiam opinions.
                author_elem = op.find("author")
                if author_elem is not None:
                    for page_number in author_elem.find_all("page-number"):
                        page_number.extract()

                auth = op.find("author")
                if auth is not None:
                    author_tag_str = titlecase(auth.text.strip(":"))
                    author_str = titlecase("".join(
                        find_judge_names(author_tag_str)))
                else:
                    author_str = ""
                    author_tag_str = ""

                per_curiam = author_tag_str == "Per Curiam"
                # If Per Curiam is True set author string to Per Curiam
                if per_curiam:
                    author_str = "Per Curiam"

                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info("Adding opinion for: %s", citation.base_citation())
                op = Opinion(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    per_curiam=per_curiam,
                    extracted_by_ocr=True,
                )
                # Don't index now; do so later if desired
                op.save(index=False)
                new_op_pks.append(op.pk)

        if make_searchable:
            add_items_to_solr.delay(new_op_pks, "search.Opinion")

        logger.info("Finished: %s", citation.base_citation())
Example #9
df = pd.read_csv('/vagrant/flp/columbia_data/judges/fed-judges-test.csv')

cas = ['ca' + str(n) for n in range(1, 12)]

matchcount = 0
panelcount = 0
zerocount = 0

for i, row in df.iterrows():
    #if row.court_id not in cas:
    #    continue
    if pd.isnull(row.judges):
        continue

    judges = find_judge_names(row.judges)
    date_filed = dt.strptime(row.date_filed, "%Y-%m-%d")
    candidates = []
    for judge in judges:
        candidates.append(
            find_person(judge, row.court_id, case_date=date_filed))

    candidates = [c for c in candidates if c is not None]

    if len(candidates) == 1:
        author = candidates[0]
        matchcount += 1
        print(author)
    elif len(candidates) > 1:
        panel = candidates
        panelcount += 1
        print(panel)
    else:
        zerocount += 1
        print('No match.', row.judges)
Example #10
def assign_authors(testing=False):

    clusters = (OpinionCluster.objects.exclude(judges='').exclude(
        docket__court__jurisdiction='FB').select_related(
            'docket__court__id').only('date_filed', 'judges',
                                      'docket__court_id'))
    total = clusters.count()
    i = 0

    for cluster in clusters:
        i += 1
        print u"(%s/%s): Processing: %s, %s" % (i, total, cluster.pk,
                                                cluster.date_filed)
        #print u"  Judge string: %s".encode('utf-8') % cluster.judges

        judgestr = unidecode(cluster.judges)
        print "  Judge string: %s" % judgestr

        if 'curiam' in judgestr.lower():
            opinion = cluster.sub_opinions.all()[0]
            opinion.per_curiam = True
            print u'  Per Curiam assigned.'
            if not testing:
                opinion.save(index=False)
            continue

        #judges = find_judge_names(cluster.judges)

        judges = find_judge_names(judgestr)

        if len(judges) == 0:
            continue

        candidates = []
        for judge in judges:
            candidates.append(
                find_person(judge,
                            cluster.docket.court_id,
                            case_date=cluster.date_filed))
        candidates = [c for c in candidates if c is not None]

        if len(candidates) == 0:
            # more than one judge token, but no DB matches, continue
            print u'  No match.'
            continue

        if len(candidates) > 1:
            # more than one DB match, assign panel and continue
            print u'  Panel assigned: %s' % candidates
            if not testing:
                for candidate in candidates:
                    cluster.panel.add(candidate)
            continue

        # only one candidate, assign author
        opinion = cluster.sub_opinions.all()[0]
        if len(judges) == 1:
            # one judge token, one DB match
            opinion.author = candidates[0]
            print '  Author assigned: %s' % unidecode(str(candidates[0]))
        else:
            # multiple judge tokens, one DB match
            opinion.author = candidates[0]
            print '  Author assigned: %s (with %d missing tokens)' % (
                unidecode(str(candidates[0])), len(judges) - 1)

        if not testing:
            opinion.save(index=False)
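Because the function takes a testing flag, a dry run is the natural first step; nothing is saved, so the printed assignments can be reviewed beforehand.

# Dry run: with testing=True, authors and panels are printed but never saved.
assign_authors(testing=True)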
Example #11
def parse_harvard_opinions(reporter, volume):
    """
    Parse the downloaded CaseLaw corpus from the Internet Archive and add
    the cases to our database.

    Optionally uses a reporter abbreviation, as used by IA, to identify the
    cases to download (e.g. T.C. => tc).

    Optionally uses a volume integer.

    If neither is provided, the code cycles through all downloaded files.

    :param volume: The volume of the reporter, as an integer (optional, e.g. 10)
    :param reporter: The slugified reporter abbreviation (optional, e.g. "tc"
    for T.C.)
    :return: None
    """
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        ia_download_url = "/".join(
            ["https://archive.org/download", file_path.split("/", 9)[-1]]
        )

        if OpinionCluster.objects.filter(
            filepath_json_harvard=file_path
        ).exists():
            logger.info("Skipping - already in system %s" % ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            logger.warning("Empty json: missing case at: %s" % ia_download_url)
            continue
        except Exception as e:
            logger.warning("Unknown error %s for: %s" % (e, ia_download_url))
            continue

        cites = get_citations(data["citations"][0]["cite"], html=False)
        if not cites:
            logger.info(
                "No citation found for %s." % data["citations"][0]["cite"]
            )
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML
        # Flag them for a later crawl by using the placeholder '[[Image]]'
        judge_list = [
            find_judge_names(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            find_judge_names(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe list of judges
        judges = ", ".join(
            list(set(itertools.chain.from_iterable(judge_list + author_list)))
        )
        judges = titlecase(judges)
        docket_string = (
            data["docket_number"]
            .replace("Docket No.", "")
            .replace("Docket Nos.", "")
            .strip()
        )

        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket.objects.create(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )
            # Iterate over the other XML fields in the Harvard data set and
            # save each as a pipe-joined string for further processing later.
            json_fields = [
                "attorneys",
                "disposition",
                "syllabus",
                "summary",
                "history",
                "otherdate",
                "seealso",
                "headnotes",
                "correction",
            ]
            data_set = {}
            for key in json_fields:
                data_set[key] = "|".join([x.text for x in soup.find_all(key)])

            # Handle partial dates by adding -01 to YYYY-MM dates
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster.objects.create(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=data_set["attorneys"],
                disposition=data_set["disposition"],
                syllabus=data_set["syllabus"],
                summary=data_set["summary"],
                history=data_set["history"],
                other_dates=data_set["otherdate"],
                cross_reference=data_set["seealso"],
                headnotes=data_set["headnotes"],
                correction=data_set["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.reporter][0]["cite_type"]
                ),
                cluster_id=cluster.id,
            )
            for op in soup.find_all("opinion"):
                joined_by_str = titlecase(
                    " ".join(
                        list(set(itertools.chain.from_iterable(judge_list)))
                    )
                )
                author_str = titlecase(
                    " ".join(
                        list(set(itertools.chain.from_iterable(author_list)))
                    )
                )

                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info("Adding opinion for: %s", citation.base_citation())
                Opinion.objects.create(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    joined_by_str=joined_by_str,
                    extracted_by_ocr=True,
                )

        logger.info("Finished: %s", citation.base_citation())
Example #12
def assign_authors(testing=False):

    clusters = (OpinionCluster.objects
                .exclude(judges='')
                .exclude(docket__court__jurisdiction='FB')
                .select_related('docket__court__id')
                .only('date_filed', 'judges', 'docket__court_id'))
    total = clusters.count()
    i = 0

    for cluster in clusters:
        i += 1
        print u"(%s/%s): Processing: %s, %s" % (i, total, cluster.pk,
                                               cluster.date_filed)
        #print u"  Judge string: %s".encode('utf-8') % cluster.judges

        judgestr = unidecode(cluster.judges)
        print "  Judge string: %s" % judgestr

        if 'curiam' in judgestr.lower():
            opinion = cluster.sub_opinions.all()[0]
            opinion.per_curiam = True
            print u'  Per Curiam assigned.'
            if not testing:
                opinion.save(index=False)
            continue

        #judges = find_judge_names(cluster.judges)

        judges = find_judge_names(judgestr)

        if len(judges) == 0:
            continue

        candidates = []
        for judge in judges:
            candidates.append(find_person(judge,
                                          cluster.docket.court_id,
                                          case_date=cluster.date_filed))
        candidates = [c for c in candidates if c is not None]

        if len(candidates) == 0:
            # more than one judge token, but no DB matches, continue
            print u'  No match.'
            continue

        if len(candidates) > 1:
            # more than one DB match, assign panel and continue
            print u'  Panel assigned: %s' % candidates
            if not testing:
                for candidate in candidates:
                    cluster.panel.add(candidate)
            continue

        # only one candidate, assign author
        opinion = cluster.sub_opinions.all()[0]
        if len(judges) == 1:
            # one judge token, one DB match
            opinion.author = candidates[0]
            print '  Author assigned: %s' % unidecode(str(candidates[0]))
        else:
            # multiple judge tokens, one DB match
            opinion.author = candidates[0]
            print '  Author assigned: %s (with %d missing tokens)' % (
                unidecode(str(candidates[0])),
                len(judges)-1
            )

        if not testing:
            opinion.save(index=False)
Example #13
df = pd.read_csv('/vagrant/flp/columbia_data/judges/fed-judges-test.csv')

cas = ['ca' + str(n) for n in range(1, 12)]

matchcount = 0
panelcount = 0
zerocount = 0

for i, row in df.iterrows():
    #if row.court_id not in cas:
    #    continue
    if pd.isnull(row.judges):
        continue

    judges = find_judge_names(row.judges)
    date_filed = dt.strptime(row.date_filed, "%Y-%m-%d")
    candidates = []
    for judge in judges:
        candidates.append(find_person(judge, row.court_id, case_date=date_filed))

    candidates = [c for c in candidates if c is not None]

    if len(candidates) == 1:
        author = candidates[0]
        matchcount += 1
        print(author)
    elif len(candidates) > 1:
        panel = candidates
        panelcount += 1
        print(panel)
    else:
        zerocount += 1
        print('No match.', row.judges)
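A small follow-up sketch: once the loop finishes, the counters declared at the top can be printed as a summary (this assumes they are incremented inside the loop, as above).

print('authors: %d, panels: %d, no match: %d'
      % (matchcount, panelcount, zerocount))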