Example #1
0
def disambiguate_reporters(
    citations: List[Union[Citation, NonopinionCitation]]
) -> List[Union[Citation, NonopinionCitation]]:
    """Filter a list of citations down to those with an unambiguous reporter.

    For every citation that carries a reporter, this tries to settle:
     - citation.canonical_reporter (the key used to index REPORTERS)
     - citation.lookup_index (which entry under that key is meant)

    The reporter string as written may be:
     - an edition known to EDITIONS, whose key holds one or several entries;
     - a variation (alternate spelling) that maps to exactly one edition,
       whose key again holds one or several entries;
     - a variation that maps to several editions.

    When several entries are possible, the citation's year is used to pick
    the one whose editions cover that year; for a single-edition variation
    the original spelling is tried as a last resort.

    Citations that are neither FullCitation nor ShortformCitation are passed
    through untouched. Citations that cannot be narrowed to exactly one entry
    are dropped from the returned list. Kept citations are mutated in place.
    """
    resolved = []
    for citation in citations:
        # Anything that has no reporter to resolve goes straight through.
        if not isinstance(citation, (FullCitation, ShortformCitation)):
            resolved.append(citation)
            continue

        # Case 1: the reporter is a known edition (P.R.R., A.2d, Wash., ...).
        if REPORTERS.get(EDITIONS.get(citation.reporter)) is not None:
            citation.canonical_reporter = EDITIONS[citation.reporter]
            books = REPORTERS[EDITIONS[citation.reporter]]
            if len(books) == 1:
                # Only one entry under the key: nothing to choose between.
                citation.lookup_index = 0
                resolved.append(citation)
            elif citation.year:
                # Several entries share the key; keep those covering the year.
                hits = [
                    (citation.reporter, idx)
                    for idx, book in enumerate(books)
                    if is_date_in_reporter(book["editions"], citation.year)
                ]
                if len(hits) == 1:
                    citation.reporter, citation.lookup_index = hits[0]
                    resolved.append(citation)
            # Either appended above or left out as unresolvable.
            continue

        # Case 2: the reporter might be a variation of one or more editions.
        candidates = VARIATIONS_ONLY.get(citation.reporter)
        if candidates is None:
            # Unknown reporter string: drop the citation.
            continue

        if len(candidates) == 1:
            # Exactly one edition is spelled this way; switch to it.
            as_written = citation.reporter
            citation.canonical_reporter = EDITIONS[candidates[0]]
            citation.reporter = candidates[0]
            books = REPORTERS[citation.canonical_reporter]

            if len(books) == 1:
                # A single entry under a misspelled key.
                citation.lookup_index = 0
                resolved.append(citation)
                continue

            # Several entries under a single misspelled key
            # (e.g. Wn.2d --> Wash --> Va Reports, Wash or
            #                          Washington Reports).
            if citation.year:
                by_year = [
                    idx
                    for idx, book in enumerate(books)
                    if is_date_in_reporter(book["editions"], citation.year)
                ]
                if len(by_year) == 1:
                    citation.lookup_index = by_year[0]
                    resolved.append(citation)
                    continue

            # Last resort: the spelling itself may be listed by only one of
            # the entries (e.g. Cr. can only be Cranch[0]).
            by_spelling = [
                idx
                for idx, book in enumerate(books)
                for spelling, _target in book["variations"].items()
                if spelling == as_written
            ]
            if len(by_spelling) == 1:
                citation.lookup_index = by_spelling[0]
                resolved.append(citation)
            continue

        # Case 3: the spelling maps to several editions. Collect every
        # (edition, entry index) pair whose editions cover the year; the
        # number of entries under each key does not matter here.
        hits = []
        for edition_key in candidates:
            books = REPORTERS[EDITIONS[edition_key]]
            for idx, book in enumerate(books):
                if citation.year and is_date_in_reporter(
                    book["editions"], citation.year
                ):
                    hits.append((edition_key, idx))
        if len(hits) == 1:
            winner_key, winner_idx = hits[0]
            citation.canonical_reporter = EDITIONS[winner_key]
            citation.reporter = winner_key
            citation.lookup_index = winner_idx
            resolved.append(citation)

    return resolved
Example #2
0
def get_date_filed(clean_html_tree, citations, case_path=None, court=None):
    """Work out the most plausible filing date for a case.

    The following sources are tried in order, each one only when the
    previous ones produced no date:

    1. Dates parsed from the text of the <center> elements selected by the
       XPath below. Elements that look like "date argued" lines or that
       mention "denied" are skipped. When the citations' reporters yield a
       date range, parsed dates are constrained to that range padded by
       about twenty years on each side and capped at the present.
    2. For court == "scotus", the scotus_dates table, keyed by
       "<volume> <reporter> <page>". It is used only when exactly one
       unambiguous date is found across all citations.
    3. January 1 of the citation year, when every citation that has a year
       agrees on it.
    4. A previously stored entry in fixes for case_path. Failing that, the
       DEBUG flags "input_dates" (prompt for a date and store it via
       add_fix) and "log_bad_dates" (append case_path to missing_dates.txt)
       are honoured.

    clean_html_tree only needs an xpath() method here. Each citation is read
    for its reporter, volume, page and year attributes.

    Returns the latest date found, or an empty list when no date could be
    determined.
    """
    path = '//center[descendant::text()[not(starts-with(normalize-space(.), "No.") or starts-with(normalize-space(.), "Case No.") or starts-with(normalize-space(.), "Record No."))]]'

    # Get a reasonable date range based on reporters in the citations.
    range_dates = []
    for citation in citations:
        reporter_key = citation.reporter
        # An abbreviation that is missing from EDITIONS or REPORTERS simply
        # contributes nothing to the range; iterating the None that .get()
        # returns would raise TypeError.
        for reporter in REPORTERS.get(EDITIONS.get(reporter_key)) or []:
            try:
                range_dates.extend(reporter["editions"][reporter_key])
            except KeyError:
                # Happens when a reporter_key points to more than one
                # reporter, one of which doesn't have the edition queried.
                # For example, Wash. 2d isn't in
                # REPORTERS['Wash.']['editions'][0].
                pass

    start = end = None
    if range_dates:
        # Pad the range on both sides, but never accept a future date.
        padding = timedelta(weeks=20 * 52)
        start = min(range_dates) - padding
        end = min(max(range_dates) + padding, now())

    dates = []
    for e in clean_html_tree.xpath(path):
        text = tostring(e, method="text", encoding="unicode")
        # Items like "February 4, 1991, at 9:05 A.M." stump the lexer in the
        # date parser. Consequently, we purge the word at, and anything
        # after it.
        text = re.sub(r" at .*", "", text)

        # The parser recognizes numbers like 121118 as a date. This corpus
        # does not have dates in that format.
        text = re.sub(r"\d{5,}", "", text)

        # The parser can't handle 'Sept.' so we tweak it.
        text = text.replace("Sept.", "Sep.")

        # The parser recognizes dates like December 3, 4, 1908 as
        # 2004-12-3 19:08. These are always date argued, thus skip them.
        if re.search(r"\d{1,2}, \d{1,2}, \d{4}", text):
            continue

        # The parser recognizes dates like October 12-13, 1948 as
        # 2013-10-12, not as 1948-10-12. These are always date argued too.
        # See: https://www.courtlistener.com/scotus/9ANY/grand-river-dam-authority-v-grand-hydro/
        if re.search(r"\d{1,2}-\d{1,2}, \d{4}", text):
            continue

        # Sometimes there's a string like "Review Denied July 26, 2006".
        # That is not the filing date, so skip it.
        if "denied" in text.lower():
            continue

        try:
            if range_dates:
                found = parse_dates.parse_dates(text, sane_start=start, sane_end=end)
            else:
                found = parse_dates.parse_dates(text, sane_end=now())
            if found:
                dates.extend(found)
        except UnicodeEncodeError:
            # Text with unicode in it crashes dateutil's parser, but such
            # text is unlikely to be the date.
            pass

    # Get the date from our SCOTUS date table
    if not dates and court == "scotus":
        scotus_dates_found = []
        for citation in citations:
            key = "%s %s %s" % (citation.volume, citation.reporter, citation.page)
            try:
                # Scotus dates are in the form of a list, since a single
                # citation can refer to several dates.
                found = scotus_dates[key]
            except KeyError:
                continue
            if len(found) == 1:
                scotus_dates_found.extend(found)
        if len(scotus_dates_found) == 1:
            dates = scotus_dates_found

    if not dates:
        # Try to grab the year from the citations, if it's the same in all
        # of them.
        years = {citation.year for citation in citations if citation.year}
        if len(years) == 1:
            dates.append(datetime.datetime(years.pop(), 1, 1))

    if not dates:
        try:
            dates = fixes[case_path]["dates"]
        except KeyError:
            if "input_dates" in DEBUG:
                # subprocess.Popen(['firefox', 'file://%s' % case_path], shell=False).communicate()
                # Parenthesised single-argument form prints the same text
                # whether print is a statement or a function.
                print("  No date found for: file://%s" % case_path)
                input_date = raw_input("  What should be here (YYYY-MM-DD)? ")
                parsed = datetime.datetime.strptime(input_date, "%Y-%m-%d")
                add_fix(case_path, {"dates": [parsed]})
                dates = [parsed]
            if "log_bad_dates" in DEBUG:
                # Write the failed case out to file.
                with open("missing_dates.txt", "a") as out:
                    out.write("%s\n" % case_path)

    if dates:
        latest = max(dates)
        if "date" in DEBUG:
            log_print("  Using date: %s of dates found: %s" % (latest, dates))
        return latest
    else:
        if "date" in DEBUG:
            log_print("  No dates found")
        # Callers receive an empty list, not None, when nothing was found.
        return []
Example #3
0
def reporter_or_volume_handler(request, reporter, volume=None):
    """Show all the volumes for a given reporter abbreviation or all the cases
    for a reporter-volume dyad.

    Two things going on here:
    1. We don't know which reporter the user actually wants when they provide
       an ambiguous abbreviation. Just show them all.
    2. We want to also show off that we know all these reporter abbreviations.

    `reporter` is the abbreviation taken from the request; it must be a key
    of EDITIONS or a 404 is returned. With `volume` omitted, the distinct
    volumes having a Citation for that reporter are listed; otherwise the
    OpinionClusters cited in that reporter and volume are listed, 250 per
    page, with the page number read from the "page" query parameter. A 404
    is also returned when there are no volumes or no cases to show.
    """
    root_reporter = EDITIONS.get(reporter)
    if not root_reporter:
        return throw_404(
            request,
            {
                "no_reporters": True,
                "reporter": reporter,
                "private": True,
            },
        )

    volume_names = [r["name"] for r in REPORTERS[root_reporter]]

    # Names of other reporters that this abbreviation is also a variation
    # of, mapped to the abbreviation they are filed under.
    variation_names = {}
    for abbrev in VARIATIONS_ONLY.get(reporter, []):
        # REPORTERS is indexed by the key that EDITIONS yields (as done for
        # `reporter` above), so resolve abbrev the same way. Fall back to
        # abbrev itself, and to no entries at all, rather than raising
        # KeyError and failing the whole page.
        for r in REPORTERS.get(EDITIONS.get(abbrev, abbrev), []):
            if r["name"] not in volume_names:
                variation_names[r["name"]] = abbrev

    if volume is None:
        # Show all the volumes for the case
        volumes_in_reporter = list(
            Citation.objects.filter(reporter=reporter)
            .order_by("reporter", "volume")
            .values_list("volume", flat=True)
            .distinct()
        )

        if not volumes_in_reporter:
            return throw_404(
                request,
                {
                    "no_volumes": True,
                    "reporter": reporter,
                    "volume_names": volume_names,
                    "private": True,
                },
            )

        return render(
            request,
            "volumes_for_reporter.html",
            {
                "reporter": reporter,
                "volume_names": volume_names,
                "volumes": volumes_in_reporter,
                "variation_names": variation_names,
                "private": False,
            },
        )

    # Show all the cases for a volume-reporter dyad
    cases_in_volume = OpinionCluster.objects.filter(
        citations__reporter=reporter, citations__volume=volume
    ).order_by("date_filed", "citations__page")

    if not cases_in_volume:
        return throw_404(
            request,
            {
                "no_cases": True,
                "reporter": reporter,
                "volume_names": volume_names,
                "volume": volume,
                "private": True,
            },
        )

    paginator = Paginator(cases_in_volume, 250, orphans=5)
    page = request.GET.get("page")
    try:
        cases = paginator.page(page)
    except PageNotAnInteger:
        # Missing or non-numeric page parameter: show the first page.
        cases = paginator.page(1)
    except EmptyPage:
        # Page number out of range: show the last page.
        cases = paginator.page(paginator.num_pages)

    return render(
        request,
        "volumes_for_reporter.html",
        {
            "cases": cases,
            "reporter": reporter,
            "variation_names": variation_names,
            "volume": volume,
            "volume_names": volume_names,
            "private": True,
        },
    )
Example #4
0
def get_date_filed(clean_html_tree, citations, case_path=None, court=None):
    """Determine the filing date of a case, falling back through several
    sources.

    Sources, in the order tried (a later one is consulted only when all
    earlier ones found nothing):

    1. The text of the <center> elements matched by the XPath below, run
       through parse_dates. Lines that look like "date argued" or mention
       "denied" are ignored. If the reporters cited give a date range, the
       parser is limited to that range widened by about twenty years each
       way and capped at the current time.
    2. The scotus_dates table (only when court == 'scotus'), looked up by
       "<volume> <reporter> <page>" and accepted only if exactly one
       single-date entry is found over all citations.
    3. January 1 of the year shared by all citations that have a year.
    4. fixes[case_path]['dates']. If absent, DEBUG may request an
       interactive prompt ('input_dates') and/or logging the path to
       missing_dates.txt ('log_bad_dates').

    Only xpath() is called on clean_html_tree; citations are read for their
    reporter, volume, page and year attributes.

    Returns the maximum of the dates found, or an empty list if there are
    none.
    """
    path = '//center[descendant::text()[not(starts-with(normalize-space(.), "No.") or starts-with(normalize-space(.), "Case No.") or starts-with(normalize-space(.), "Record No."))]]'

    # Get a reasonable date range based on reporters in the citations.
    reporter_keys = [citation.reporter for citation in citations]
    range_dates = []
    for reporter_key in reporter_keys:
        # .get() yields None for an abbreviation that EDITIONS or REPORTERS
        # does not know; treat that as "no reporters" instead of letting the
        # loop raise TypeError on None.
        reporters = REPORTERS.get(EDITIONS.get(reporter_key)) or []
        for reporter in reporters:
            try:
                range_dates.extend(reporter['editions'][reporter_key])
            except KeyError:
                # Fails when a reporter_key points to more than one
                # reporter, one of which doesn't have the edition queried.
                # For example, Wash. 2d isn't in
                # REPORTERS['Wash.']['editions'][0].
                pass

    start = end = None
    if range_dates:
        margin = timedelta(weeks=20 * 52)
        start = min(range_dates) - margin
        end = max(range_dates) + margin
        # Never let the upper bound run into the future.
        current_time = now()
        if end > current_time:
            end = current_time

    dates = []
    for e in clean_html_tree.xpath(path):
        text = tostring(e, method='text', encoding='unicode')
        # Items like "February 4, 1991, at 9:05 A.M." stump the lexer in the
        # date parser. Consequently, we purge the word at, and anything
        # after it.
        text = re.sub(r' at .*', '', text)

        # The parser recognizes numbers like 121118 as a date. This corpus
        # does not have dates in that format.
        text = re.sub(r'\d{5,}', '', text)

        # The parser can't handle 'Sept.' so we tweak it.
        text = text.replace('Sept.', 'Sep.')

        # Two shapes are always "date argued" lines and are misread by the
        # parser, so skip any element containing them:
        #  - December 3, 4, 1908 (parsed as 2004-12-3 19:08)
        #  - October 12-13, 1948 (parsed as 2013-10-12, not 1948-10-12)
        # See: https://www.courtlistener.com/scotus/9ANY/grand-river-dam-authority-v-grand-hydro/
        if re.search(r'\d{1,2}, \d{1,2}, \d{4}', text):
            continue
        if re.search(r'\d{1,2}-\d{1,2}, \d{4}', text):
            continue

        # Sometimes there's a string like "Review Denied July 26, 2006".
        # Skip this.
        if 'denied' in text.lower():
            continue

        try:
            if range_dates:
                found = parse_dates.parse_dates(text,
                                                sane_start=start,
                                                sane_end=end)
            else:
                found = parse_dates.parse_dates(text, sane_end=now())
            if found:
                dates.extend(found)
        except UnicodeEncodeError:
            # Unicode in the text crashes dateutil's parser, but such text
            # is unlikely to be the date.
            pass

    # Get the date from our SCOTUS date table
    if not dates and court == 'scotus':
        scotus_dates_found = []
        for citation in citations:
            lookup_key = "%s %s %s" % (citation.volume, citation.reporter,
                                       citation.page)
            try:
                # Scotus dates are in the form of a list, since a single
                # citation can refer to several dates.
                found = scotus_dates[lookup_key]
            except KeyError:
                continue
            if len(found) == 1:
                scotus_dates_found.extend(found)
        if len(scotus_dates_found) == 1:
            dates = scotus_dates_found

    if not dates:
        # Try to grab the year from the citations, if it's the same in all
        # of them.
        years = {citation.year for citation in citations if citation.year}
        if len(years) == 1:
            dates.append(datetime.datetime(years.pop(), 1, 1))

    if not dates:
        try:
            dates = fixes[case_path]['dates']
        except KeyError:
            if 'input_dates' in DEBUG:
                #subprocess.Popen(['firefox', 'file://%s' % case_path], shell=False).communicate()
                # A parenthesised single argument prints identically whether
                # print is a statement or a function.
                print('  No date found for: file://%s' % case_path)
                input_date = raw_input('  What should be here (YYYY-MM-DD)? ')
                entered = datetime.datetime.strptime(input_date, '%Y-%m-%d')
                add_fix(case_path, {'dates': [entered]})
                dates = [entered]
            if 'log_bad_dates' in DEBUG:
                # Write the failed case out to file.
                with open('missing_dates.txt', 'a') as out:
                    out.write('%s\n' % case_path)

    if dates:
        newest = max(dates)
        if 'date' in DEBUG:
            log_print("  Using date: %s of dates found: %s" %
                      (newest, dates))
        return newest
    else:
        if 'date' in DEBUG:
            log_print("  No dates found")
        # An empty list (not None) signals that nothing was found.
        return []
Example #5
0
def disambiguate_reporters(citations):
    """Convert a list of citations to a list of unambiguous ones.

    Goal is to figure out:
     - citation.canonical_reporter
     - citation.lookup_index

    And there are a few things that can be ambiguous:
     - More than one variation.
     - More than one reporter for the key.
     - Could be an edition (or not)
     - All combinations of the above:
        - More than one variation.
        - More than one variation, with more than one reporter for the key.
        - More than one variation, with more than one reporter for the key,
          which is an edition.
        - More than one variation, which is an edition
        - ...

    For variants, we just need to sort out the canonical_reporter.

    If it's not possible to disambiguate the reporter, we simply have to drop
    it.

    Citations are modified in place; the returned list holds only those that
    could be resolved, in their original order. Resolution by date is only
    attempted for citations that have a year.
    """
    unambiguous_citations = []
    for citation in citations:
        # Non-variant items (P.R.R., A.2d, Wash., etc.)
        if REPORTERS.get(EDITIONS.get(citation.reporter)) is not None:
            citation.canonical_reporter = EDITIONS[citation.reporter]
            reporters = REPORTERS[citation.canonical_reporter]
            if len(reporters) == 1:
                # Single reporter, easy-peasy.
                citation.lookup_index = 0
                unambiguous_citations.append(citation)
                continue

            # Multiple books under this key, but which is correct?
            if citation.year:
                # attempt resolution by date
                possible_indexes = [
                    i for i, reporter in enumerate(reporters)
                    if is_date_in_reporter(reporter["editions"], citation.year)
                ]
                if len(possible_indexes) == 1:
                    # We were able to identify only one hit after filtering
                    # by year.
                    citation.lookup_index = possible_indexes[0]
                    unambiguous_citations.append(citation)
                    continue

        # Try doing a variation of an edition.
        elif VARIATIONS_ONLY.get(citation.reporter) is not None:
            variations = VARIATIONS_ONLY[citation.reporter]
            if len(variations) == 1:
                # Only one variation -- great, use it.
                citation.canonical_reporter = EDITIONS[variations[0]]
                cached_variation = citation.reporter
                citation.reporter = variations[0]
                reporters = REPORTERS[citation.canonical_reporter]
                if len(reporters) == 1:
                    # It's a single reporter under a misspelled key.
                    citation.lookup_index = 0
                    unambiguous_citations.append(citation)
                    continue

                # Multiple reporters under a single misspelled key
                # (e.g. Wn.2d --> Wash --> Va Reports, Wash or
                #                          Washington Reports).
                if citation.year:
                    # attempt resolution by date
                    possible_indexes = [
                        i for i, reporter in enumerate(reporters)
                        if is_date_in_reporter(reporter["editions"],
                                               citation.year)
                    ]
                    if len(possible_indexes) == 1:
                        # We were able to identify only one hit after
                        # filtering by year.
                        citation.lookup_index = possible_indexes[0]
                        unambiguous_citations.append(citation)
                        continue

                # Attempt resolution by unique variation
                # (e.g. Cr. can only be Cranch[0])
                possible_indexes = [
                    i for i, reporter in enumerate(reporters)
                    if cached_variation in reporter["variations"]
                ]
                if len(possible_indexes) == 1:
                    # We were able to find a single match after filtering by
                    # variation.
                    citation.lookup_index = possible_indexes[0]
                    unambiguous_citations.append(citation)
                    continue
            else:
                # Multiple variations, deal with them. The only tool for
                # this is the year, so without one there is nothing to
                # compare and the citation stays unresolved.
                possible_citations = []
                if citation.year:
                    for reporter_key in variations:
                        # This inner loop works regardless of the number of
                        # reporters under the key.
                        for i, reporter in enumerate(
                                REPORTERS[EDITIONS[reporter_key]]):
                            if is_date_in_reporter(reporter["editions"],
                                                   citation.year):
                                possible_citations.append((reporter_key, i))
                if len(possible_citations) == 1:
                    # We were able to identify only one hit after filtering
                    # by year.
                    reporter_key, index = possible_citations[0]
                    citation.canonical_reporter = EDITIONS[reporter_key]
                    citation.reporter = reporter_key
                    citation.lookup_index = index
                    unambiguous_citations.append(citation)
                    continue

    return unambiguous_citations
Example #6
0
def reporter_or_volume_handler(request, reporter, volume=None):
    """Show all the volumes for a given reporter abbreviation or all the cases
    for a reporter-volume dyad.

    Two things going on here:
    1. We don't know which reporter the user actually wants when they provide
       an ambiguous abbreviation. Just show them all.
    2. We want to also show off that we know all these reporter abbreviations.

    A 404 is returned when `reporter` is not a key of EDITIONS, when no
    volumes exist for it (volume omitted), or when no cases exist for the
    reporter/volume pair. Case listings are paginated 250 to a page using
    the "page" query parameter; a missing or non-numeric value shows the
    first page and an out-of-range value shows the last.
    """
    root_reporter = EDITIONS.get(reporter)
    if not root_reporter:
        return throw_404(request, {
            'no_reporters': True,
            'reporter': reporter,
            'private': True,
        })

    volume_names = [r['name'] for r in REPORTERS[root_reporter]]

    # Collect the names of other reporters this abbreviation is a variation
    # of, keyed by name, so the page can mention them as well.
    variation_names = {}
    variation_abbrevs = VARIATIONS_ONLY.get(reporter, [])
    for abbrev in variation_abbrevs:
        # Just as `reporter` had to go through EDITIONS to obtain a
        # REPORTERS key, so does abbrev. Indexing REPORTERS with it directly
        # raises KeyError whenever abbrev is not itself such a key. Unknown
        # abbreviations contribute no names.
        related_key = EDITIONS.get(abbrev, abbrev)
        for r in REPORTERS.get(related_key, []):
            if r['name'] not in volume_names:
                variation_names[r['name']] = abbrev

    if volume is None:
        # Show all the volumes for the case
        volumes_in_reporter = list(Citation.objects
                                   .filter(reporter=reporter)
                                   .order_by('reporter', 'volume')
                                   .values_list('volume', flat=True)
                                   .distinct())

        if not volumes_in_reporter:
            return throw_404(request, {
                'no_volumes': True,
                'reporter': reporter,
                'volume_names': volume_names,
                'private': True,
            })

        return render(request, 'volumes_for_reporter.html', {
            'reporter': reporter,
            'volume_names': volume_names,
            'volumes': volumes_in_reporter,
            'variation_names': variation_names,
            'private': False,
        })

    # Show all the cases for a volume-reporter dyad
    cases_in_volume = (OpinionCluster.objects
                       .filter(citations__reporter=reporter,
                               citations__volume=volume)
                       .order_by('date_filed', 'citations__page'))

    if not cases_in_volume:
        return throw_404(request, {
            'no_cases': True,
            'reporter': reporter,
            'volume_names': volume_names,
            'volume': volume,
            'private': True,
        })

    paginator = Paginator(cases_in_volume, 250, orphans=5)
    page = request.GET.get('page')
    try:
        cases = paginator.page(page)
    except PageNotAnInteger:
        cases = paginator.page(1)
    except EmptyPage:
        cases = paginator.page(paginator.num_pages)

    return render(request, 'volumes_for_reporter.html', {
        'cases': cases,
        'reporter': reporter,
        'variation_names': variation_names,
        'volume': volume,
        'volume_names': volume_names,
        'private': True,
    })