Code example #1
    def get_filter_query_params(self, request, view):
        def lc_values(values):
            return [
                value.lower() for value in values if isinstance(value, str)
            ]

        query_params = super().get_filter_query_params(request, view)

        if 'cite' in query_params:
            query_params['cite']['values'] = [
                models.normalize_cite(cite)
                for cite in lc_values(query_params['cite']['values'])
            ]

        if 'court' in query_params:
            query_params['court']['values'] = lc_values(
                query_params['court']['values'])

        if 'jurisdiction' in query_params:
            query_params['jurisdiction']['values'] = lc_values(
                query_params['jurisdiction']['values'])

        if 'cites_to' in query_params:
            old_cites_to = query_params['cites_to']['values']
            query_params['cites_to']['values'] = []
            for cite in old_cites_to:
                # check if case id is passed in
                if cite.isdigit():
                    try:
                        case = CaseDocument.get(id=cite)
                        # add all citations relating to case
                        query_params['cites_to']['values'] += [
                            c['normalized_cite'] for c in case.citations
                        ]
                    except NotFoundError:
                        pass
                else:
                    query_params['cites_to']['values'].append(
                        normalize_cite(cite))
Code example #2
File: test_tasks.py  Project: anastasia/capstone
def test_extract_citations(case_factory, tmpdir, settings, elasticsearch):
    from scripts.extract_cites import EDITIONS as processed_editions
    settings.MISSED_CITATIONS_DIR = str(tmpdir)
    blocked_by_date = set(
        k for k in list(EDITIONS.keys()) + list(VARIATIONS_ONLY.keys())
        if all(c['start_year'] > 2000 for c in processed_editions[k]))
    legitimate_cites = [
        "225 F. Supp. 552",  # correct
        ["125 f supp 152", "125 F. Supp. 152"],  # normalized
        ["125 Burnett (Wis.) 152", "125 Bur. 152"],  # normalized
        ["1 F. 2d 2", "1 F.2d 2"],  # not matched as "1 F. 2"
        "2 1/2 Mass. 1",  # special volume numbers
        "3 Suppl. Mass. 2",  # special volume numbers
        "1 La.App. 5 Cir. 2",  # not matched as "1 La.App. 5"
        "2000 WL 12345",  # vendor cite
    ]
    legitimate_cites += [
        "1 %s 1" % c for c in EDITIONS.keys() if c not in blocked_by_date
    ]
    legitimate_cites += [["1 %s 1" % k, "1 %s 1" % v]
                         for k, vv in VARIATIONS_ONLY.items() for v in vv
                         if k not in blocked_by_date]
    legitimate_cites_normalized = set(
        normalize_cite(c if type(c) is str else c[1])
        for c in legitimate_cites)
    legitimate_cites = [
        c if type(c) is str else c[0] for c in legitimate_cites
    ]
    illegitimate_cites = [
        "2 Dogs 3",  # unrecognized reporter
        "3 Dogs 4",  # duplicate unrecognized reporter
        "1 or 2",  # not matched as 1 Or. 2
        "word1 Mass. 2word",  # not matched if part of larger word
        "1 Mass.\n 2",  # no match across newlines
        "1 A.3d 1",  # no match to reporter that started publishing in 2010
    ]
    illegitimate_cites += ["1 %s 1" % c for c in blocked_by_date]
    case = case_factory(
        body_cache__text=", some text, ".join(legitimate_cites +
                                              illegitimate_cites),
        decision_date=datetime(2000, 1, 1))
    fabfile.extract_all_citations()
    update_elasticsearch_from_queue()

    # check extracted cites
    cites = list(ExtractedCitation.objects.all())
    cite_set = set(c.cite for c in cites)
    normalized_cite_set = set(c.normalized_cite for c in cites)
    assert cite_set == set(legitimate_cites)
    assert normalized_cite_set == legitimate_cites_normalized
    assert all(c.cited_by_id == case.pk for c in cites)
Code example #3
    def get_filter_query_params(self, request, view):
        query_params = super().get_filter_query_params(request, view)
        if not query_params:
            raise ValidationError("Query parameter 'q' is required")
        extracted_cites = {
            normalize_cite(i[1]): i[0]
            for v in query_params['q']['values']
            for i in extract_citations_from_text(v)
        }
        if not extracted_cites:
            raise ValidationError("No citations found in query.")
        query_params['q']['values'] = set(extracted_cites)
        request.extracted_cites = extracted_cites
        return query_params
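The dict comprehension above reshapes the tuples yielded by extract_citations_from_text (shown in code example #6 below) into a mapping from normalized cite to the original string found in the query. A minimal sketch of that reshaping; the sample tuples are invented for illustration:

# Illustration only: each yielded tuple is (cite, normalized_cite, vol_num, reporter_str, page_num),
# matching the generator in code example #6.
sample_tuples = [
    ("125 f supp 152", "125 F. Supp. 152", "125", "f supp", "152"),
    ("1 F. 2d 2", "1 F.2d 2", "1", "F. 2d", "2"),
]
# maps normalize_cite(normalized form) -> original string, mirroring the comprehension above
extracted_cites = {normalize_cite(t[1]): t[0] for t in sample_tuples}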
Code example #4
    def retrieve(self, request, *args, **kwargs):
        # For the user's convenience, if the user requests /cases/casecitation or /cases/Case Citation
        # (or any other non-numeric value), redirect to /cases/?cite=casecitation
        id = kwargs[self.lookup_field]
        if not id.isdigit():
            normalized_cite = models.normalize_cite(id)
            query_string = urllib.parse.urlencode(dict(
                self.request.query_params, cite=normalized_cite),
                                                  doseq=True)
            new_url = reverse('cases-list') + "?" + query_string
            return HttpResponseRedirect(new_url)

        if self.request.query_params.get('format') == 'html':
            # if previously-supported format=html is requested, redirect to frontend_url
            case = models.CaseMetadata.objects.filter(id=id).first()
            if case:
                return HttpResponseRedirect(case.get_full_frontend_url())
Code example #5
def extract_citations(case):
    misses = defaultdict(lambda: defaultdict(int))
    # Don't count instances where a case cites to itself. Typically this is listing the parallel cite,
    # which leads to false matches when the parallel page has more than one case.
    self_cites = {c.normalized_cite for c in case.citations.all()}
    case_citations = [
        ExtractedCitation(cite=cite,
                          normalized_cite=normalize_cite(normalized_cite),
                          cited_by=case,
                          reporter_name_original=reporter_str,
                          volume_number_original=vol_num,
                          page_number_original=page_num)
        for cite, normalized_cite, vol_num,
        reporter_str, page_num in extract_citations_from_text(
            case.body_cache.text, case.decision_date.year, misses)
        if normalized_cite not in self_cites
    ]
Code example #6
def extract_citations_from_text(text, max_year=9999, misses=None):
    # use dict.fromkeys to remove dupes while preserving order
    for match in dict.fromkeys(re.findall(cite_extracting_regex, text)):
        vol_num, reporter_str, page_num = match

        # fix known OCR errors
        if reporter_str in TRANSLATIONS:
            reporter_str = TRANSLATIONS[reporter_str]

        # skip strings like 'or' that are known non-citations
        if reporter_str in INVALID_REPORTERS:
            if misses is not None:
                misses['blocked'][reporter_str] += 1
            continue

        # Look for found reporter string in the official and nominative REPORTER dicts
        # Try exact match, then normalized match
        candidates = EDITIONS.get(reporter_str) or EDITIONS.get(
            normalize_cite(reporter_str))
        if not candidates:
            # reporter not found; skip this cite and record it in the misses list
            if misses is not None:
                misses['not_found'][reporter_str] += 1
            continue

        # Find a candidate reporter that was in operation prior to this case being published.
        # Reporters are sorted by end date, so this will prefer newer reporters.
        best_candidate = next(
            (c for c in candidates if c['start_year'] <= max_year), None)
        if not best_candidate:
            if misses is not None:
                misses['invalid_date'][reporter_str] += 1
            continue

        cite = " ".join(match)
        normalized_cite = "%s %s %s" % (vol_num, best_candidate['reporter'],
                                        page_num)

        yield cite, normalized_cite, vol_num, reporter_str, page_num
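For orientation, a minimal usage sketch of the generator above; the sample text and call site are invented, not taken from the project:

# Illustration only: iterate the generator and print whatever the regex and
# EDITIONS lookup recognize in a string.
sample_text = "See 225 F. Supp. 552 and 2000 WL 12345 for details."
for cite, normalized_cite, vol_num, reporter_str, page_num in extract_citations_from_text(
        sample_text, max_year=2005):
    print(cite, "->", normalized_cite)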
Code example #7
def main(dry_run='true', output_missing='false'):
    # download data
    cap_cites_path = base_path / 'us_cites.csv'
    if not cap_cites_path.exists():
        print("pre-loading cap cites")
        us_reporter = Reporter.objects.get(short_name='U.S.')
        with connections['capdb'].cursor() as cursor:
            cursor.execute("""
                select m.id, m.volume_id, cite, name, name_abbreviation, decision_date 
                from capdb_casemetadata m, capdb_citation c where m.reporter_id=%s and c.case_id=m.id and c.cite like '%%U.S.%%'
            """, [us_reporter.id])
            with cap_cites_path.open('w') as out_file:
                csv_writer = csv.writer(out_file)
                for row in cursor.fetchall():
                    csv_writer.writerow(row)

    # load data
    print("loading data")
    scdb_new_cites_path = base_path / 'SCDB_2019_01_caseCentered_Citation.csv'
    scdb_old_cites_path = base_path / 'SCDB_Legacy_05_caseCentered_Citation.csv'
    cap_cites = list(csv.DictReader((line.replace('\xad', '') for line in cap_cites_path.open()), exported_columns))
    scdb_cites = list(csv.DictReader(scdb_new_cites_path.open(encoding='iso-8859-1'))) + list(csv.DictReader(scdb_old_cites_path.open(encoding='iso-8859-1')))
    scdb_cites = [c for c in scdb_cites if c['usCite']]
    cap_cites_by_id = {c['id']:c for c in cap_cites}
    scdb_cites_by_id = {c['caseId']:c for c in scdb_cites}
    scdb_cites_lookup = group_by(scdb_cites, lambda c: c['usCite'])

    # count terms for tf_idf
    print("counting terms")
    word_counts = Counter()
    word_counts.update(i for c in cap_cites for i in tokenize(c['name']))
    word_counts.update(i for c in scdb_cites for i in tokenize(c['caseName']))
    document_count = len(cap_cites)+len(scdb_cites)

    ### first pass at checking for matches -- find all cases where cites are the same and titles are similar.
    # These are "strong" matches.
    print("checking for matches")
    matched_cap_case_ids = set()
    for cap_cite in tqdm(cap_cites):
        cite = cap_cite['cite']

        # strip nominatives from CAP cites
        cite = re.sub(r'\(.*\) ', '', cite)

        # skip CAP cites that don't look like "123 U.S. 456"
        if not re.match(r'\w+ U\.S\. \w+$', cite):
            continue

        if cite in scdb_cites_lookup:
            candidates = scdb_cites_lookup[cite]
            candidates_by_name = {c['caseName'].lower(): c for c in candidates}
            best = get_best_match(
                [cap_cite['name'].lower(), cap_cite['name_abbreviation'].lower()],
                candidates_by_name.keys(),
                word_counts, document_count
            )
            if best:
                c = candidates_by_name[best[0]]
                c['cap_cite'] = cap_cite
                c['match_quality'] = 'strong'
                matched_cap_case_ids.add(cap_cite['id'])
            else:
                for c in candidates:
                    c.setdefault('failed_matches', []).append(cap_cite)

    # apply manual_matches overrides
    for k, v in manual_matches.items():
        c = scdb_cites_by_id[k]
        c['cap_cite'] = cap_cites_by_id[v]
        c['match_quality'] = 'confirmed'
        matched_cap_case_ids.add(c['cap_cite']['id'])

    # write SCDB cites to database
    print("Applying corrections")
    edit_out = csv.writer((base_path / 'scdb_cite_edits.csv').open('w'))
    cite_objs = Citation.objects.filter(case_id__in=matched_cap_case_ids).select_related('case')
    cite_objs_by_case_id = group_by(cite_objs, lambda c: c.case_id)
    to_create = []
    to_update = []
    for scdb_cite in scdb_cites:
        if 'cap_cite' not in scdb_cite:
            continue
        case_id = int(scdb_cite['cap_cite']['id'])
        existing_cite_objs_by_reporter = {get_cite_reporter(c.cite): c for c in cite_objs_by_case_id[case_id]}
        expected_cites = [['SCDB', 'SCDB %s' % scdb_cite['caseId'], 'vendor']]
        for scdb_key, cite_type in [["usCite", "official"], ["sctCite", "parallel"], ["ledCite", "parallel"], ["lexisCite", "vendor"]]:
            cite_val = scdb_cite[scdb_key]
            if cite_val:
                expected_cites.append([get_cite_reporter(cite_val), cite_val, cite_type])
        for reporter, cite_val, cite_type in expected_cites:
            if reporter in existing_cite_objs_by_reporter:
                new_cite = existing_cite_objs_by_reporter.pop(reporter)
                if new_cite.cite == cite_val:
                    edit_out.writerow([case_id, 'skip', new_cite.id, cite_val])
                else:
                    edit_out.writerow([case_id, 'update', new_cite.id, new_cite.cite, cite_val])
                    new_cite.cite = cite_val
                    to_update.append(new_cite)
            else:
                new_cite = Citation(cite=cite_val, type=cite_type, case_id=case_id, normalized_cite=normalize_cite(cite_val))
                to_create.append(new_cite)
                edit_out.writerow([case_id, 'create', new_cite.type, new_cite.cite])
        if existing_cite_objs_by_reporter:
            edit_out.writerow([case_id, 'warning', 'ignored cite']+[c.cite for c in existing_cite_objs_by_reporter.values()])

    if dry_run == 'false':
        with EditLog(description='Add SCDB cites').record():
            Citation.objects.bulk_create(to_create)
            Citation.objects.bulk_update(to_update, ['cite'])

    if output_missing != 'true':
        return

    ### second pass at checking for matches -- for all SCDB cites that don't have matches, fetch all cases we have for the
    # same volume, and look for similar titles. These are "weak" matches.
    print("checking for fallback matches")
    no_match = [c for c in scdb_cites if 'cap_cite' not in c]
    missing_by_volume = group_by(no_match, lambda c: c['usCite'].split()[0])
    cap_cites_by_volume = group_by(cap_cites, lambda c: c['cite'].split()[0])

    # fetch all cases from the DB that belong to volumes where SCDB cases are missing
    target_volumes = set(c['volume_id'] for v in missing_by_volume for c in cap_cites_by_volume[v])
    db_cites = Citation.objects.filter(cite__contains='U.S.', case__volume_id__in=target_volumes).select_related('case')
    db_cases = []
    for cite in db_cites:
        # skip cases already affirmatively matched
        if str(cite.case_id) in matched_cap_case_ids:
            continue

        # conform DB objects to export format from csv
        c = cite.case
        c.cite = cite
        c.decision_date = c.decision_date.strftime("%Y-%m-%d")
        c.name = c.name.replace('\xad', '')
        c.name_abbreviation = c.name_abbreviation.replace('\xad', '')
        db_cases.append(c)
    db_cases_by_volume_id = group_by(db_cases, lambda d: d.volume_id)

    # check each missing SCDB cite for cases in the same volume with similar titles
    for v, missing in tqdm(missing_by_volume.items()):
        if v in cap_cites_by_volume:
            cases = sum((db_cases_by_volume_id[vol_id] for vol_id in set(c['volume_id'] for c in cap_cites_by_volume[v])), [])
            cases_by_name = {}
            for c in cases:
                cases_by_name[c.name.lower()] = c
                cases_by_name[c.name_abbreviation.lower()] = c

            for m in missing:
                if re.match(r'131 U\.S\. [a-z]+$', m['usCite']):
                    # special case -- skip cites like "131 U.S. lxxxiii", as we know we don't have them
                    continue
                best = get_best_match([m['caseName']], cases_by_name.keys(), word_counts, document_count, tf_threshold=10)
                if not best:
                    continue
                c = cases_by_name[best[0]]
                m['cap_cite'] = {k: getattr(c, k) for k in exported_columns}
                m['match_quality'] = 'weak'

    # output
    csv_out = csv.writer((base_path / 'scdb_cite_matchup.csv').open('w'))
    csv_out.writerow(['match quality', 'volume number', 'SCDB name', 'SCDB cite', 'SCDB date', 'SCDB ID', 'CAP name', 'CAP cite', 'CAP date', 'CAP ID', 'CAP vol id'])
    for c in scdb_cites:
        match = c.get('cap_cite')
        match_row = [match['name'], match['cite'], match['decision_date'], match['id'], match['volume_id']] if match else []
        csv_out.writerow([c.get('match_quality', 'none'), c['usCite'].split()[0], c['caseName'], c['usCite'], c['dateDecision'], c['caseId']]+match_row)
Code example #8
def main(dry_run='true'):
    # includes all patterns to look for and how they should be modified
    with Path(__file__).parent.joinpath(
            "modification_instructions.json").open() as fp:
        modification_instructions = json.load(fp,
                                              object_pairs_hook=OrderedDict)

    # fetch and cache citations
    print("Prefetching citations")
    all_cites = Citation.objects.filter(type="parallel").values_list(
        'id', 'cite')

    # opens and scopes the log file handler
    with Path.home().joinpath("citation_update_logger.tsv").open(
            "a") as logfile:

        # write the header row of the log
        csvlog = csv.writer(logfile, delimiter='\t', quotechar='"')
        csvlog.writerow([
            "timestamp", "action", "New ID", "Case ID", "New Value",
            "Old Value", "Old ID", "Split String", "Replacement Explainer",
            "Filtered", "Regex Verified", "Dry Run"
        ])

        for current_pattern, data in modification_instructions.items():
            print("Fixing %s" % current_pattern)
            modifications = data['modifications']
            jack_count = data['counts_and_examples'][
                'count']  # the count from Jack's original report
            example_cases = [
                pk for entry_set in data['counts_and_examples']['examples']
                for pk in entry_set
            ]  # original examples
            print(example_cases)
            regex = '^' + get_escaped_regex(
                current_pattern) + '$'  # turn the citation pattern into a regex
            matcher = re.compile(regex)
            matching_citation_ids = [
                id for id, cite in all_cites if matcher.match(cite)
            ]
            matching_cite_query = Citation.objects.filter(
                id__in=matching_citation_ids)

            # simplify the list of example cases to make sure our search regex gets them
            example_cite_ids = []
            for epk in example_cases:
                case = CaseMetadata.objects.get(pk=epk)
                for cite in case.citations.filter(type="parallel"):
                    example_cite_ids.append(cite.pk)

            to_update = []
            to_insert = []
            to_log = []
            matching_citation_count = 0

            for matching_citation in matching_cite_query:
                matching_citation_count += 1
                csv_replacement_explainer = ""  # string that will say in fairly plain english what was done
                regex_verified = [
                ]  # simple splits don't have a verification regex— it just splits and checks to make sure the output doesn't match the original pattern. I want to specify in the log if it was regex verified
                cite = matching_citation.cite
                case_id = matching_citation.case_id
                if matching_citation.pk in example_cite_ids:
                    example_cite_ids.remove(
                        matching_citation.pk
                    )  # this list should be empty with every pattern

                # The modifications can include a split string which will split a citation up into mutiple citations,
                # 'filters' which is a list of regex substitution pairs, and "kill" which drops one section. The list orders
                # are important because they have the same indexes as the split order, so you can know where to apply what

                if 'splitstring' in modifications:
                    new_cites = [
                        c.strip()
                        for c in cite.split(modifications['splitstring'])
                        if c.strip()
                    ]
                else:
                    new_cites = [cite]

                if 'filters' in modifications:
                    new_cites_filtered = []
                    filters = modifications['filters']
                    csv_replacement_explainer += "Using {} replacement".format(
                        len(filters)) if len(
                            filters) == 1 else "Using {} replacements".format(
                                len(filters))
                    assert len(new_cites) == len(filters)
                    for index, (filter_dict,
                                new_cite) in enumerate(zip(filters,
                                                           new_cites)):
                        if 'kill' in filter_dict:
                            # print("Dropping {}".format(new_cite))
                            csv_replacement_explainer += ", drop split field {} ({})".format(
                                index, new_cite)
                            continue
                        for pattern in filter_dict['patterns']:
                            csv_replacement_explainer += ", replace '{}' with '{}' in split field {} ({})".format(
                                pattern[0], pattern[1], index, new_cite)
                            new_cite = re.sub(pattern[0], pattern[1], new_cite)
                        # The 'goal' is a pattern that the new citation should match after being processed.
                        if 'goal' in filter_dict:
                            csv_replacement_explainer += " to get '{}'".format(
                                filter_dict['goal'])
                            regex_verified.append(new_cite)
                            if not re.match(
                                    '^' +
                                    get_escaped_regex(filter_dict['goal']) +
                                    '$', new_cite):
                                raise Exception(
                                    "Doesn't Match: '{}'\nCurrent Pattern:'{}'\nRegex: '{}'\nCite_Section: '{}'\n"
                                    "Goal: '{}'\nEscaped Goal: '{}'".format(
                                        cite, current_pattern,
                                        get_escaped_regex(current_pattern),
                                        new_cite, filter_dict['goal'],
                                        get_escaped_regex(
                                            filter_dict['goal'])))
                        new_cites_filtered.append(new_cite)
                    new_cites = new_cites_filtered

                # if it matches the original pattern, it's wrong
                for c in new_cites:
                    if re.match(matcher, c):
                        raise Exception(
                            "New Cite Matches Original Regex: '{}'\nCurrent Pattern:'{}'\nRegex: '{}'\n"
                            "Cite_Section: '{}'\n".format(
                                cite, current_pattern,
                                get_escaped_regex(current_pattern), c))
                # update records and log
                for index, new_citation in enumerate(new_cites):
                    reg = new_citation in regex_verified
                    action = "update" if index == 0 else "create"
                    if action == 'update':
                        # print("Updating: {}".format(new_citation))
                        matching_citation.cite = new_citation
                        matching_citation.normalized_cite = normalize_cite(
                            new_citation)
                        new_citation_obj = matching_citation
                        to_update.append(matching_citation)
                    elif action == 'create':
                        # print("Creating: {}".format(new_citation))
                        new_citation_obj = Citation(
                            case_id=case_id,
                            type="parallel",
                            cite=new_citation,
                            normalized_cite=normalize_cite(new_citation),
                        )
                        to_insert.append(new_citation_obj)

                    to_log.append(
                        (cite, reg, action, csv_replacement_explainer,
                         matching_citation, new_citation_obj))

            if not to_log:
                print("- nothing to do")
                continue

            if dry_run == 'false':
                with EditLog(description='Fix citations matching %s' %
                             current_pattern).record() as edit:
                    Citation.objects.bulk_update(to_update,
                                                 ['cite', 'normalized_cite'])
                    Citation.objects.bulk_create(to_insert)
                timestamp = edit.timestamp
            else:
                timestamp = datetime.now()

            log_filters = str(modifications['filters']
                              ) if 'filters' in modifications else None
            log_splitstring = str(modifications['splitstring']
                                  ) if 'splitstring' in modifications else None
            for cite, reg, action, csv_replacement_explainer, matching_citation, new_citation_obj in to_log:
                # "timestamp", "action", "New ID", "Case ID", "New Value", "Old Value", "Old ID",
                # "Split String", "Replacement Explainer", "Filtered", "Regex Verified", "Dry Run"
                csvlog.writerow([
                    timestamp,
                    action,
                    new_citation_obj.pk,
                    matching_citation.case_id,
                    new_citation_obj.cite,
                    cite,
                    matching_citation.pk,
                    log_splitstring,
                    csv_replacement_explainer,
                    log_filters,
                    reg,
                    dry_run,
                ])

            if matching_citation_count != jack_count:
                # This didn't happen after a dry run with production data
                print(
                    "non-matching Jack Count: {}, Query Count: {}, Pattern: {}"
                    .format(jack_count, matching_citation_count,
                            current_pattern))

            if len(example_cite_ids) > 0:
                # This didn't happen after a dry run with production data
                raise Exception("non-matching example in {}: {}".format(
                    current_pattern, ", ".join(example_cite_ids)))
Code example #9
def load_cluster(args):
    """
        Load a single CourtListener cluster with its opinions from disk, and return metadata.
        This is called within a process pool; see ingest_courtlistener for how it's used.
    """
    cluster_member, opinions_dir = args
    with cluster_member.open() as f:
        cluster = json.load(f)

    # skip clusters without citations
    if not cluster['citations']:
        return None

    # load text of all opinions for this cluster as a single string with html stripped, for simhashing
    opinion_texts = []
    for opinion_url in cluster['sub_opinions']:
        opinion_id = opinion_url.split('/')[-2]
        try:
            with opinions_dir.joinpath(f'{opinion_id}.json').open() as f:
                opinion = json.load(f)
        except FileNotFoundError:
            print("- Opinion file not found:", opinion_id)
            continue
        opinion_text = next((opinion[k] for k in [
            'html_with_citations', 'plain_text', 'html', 'html_lawbox',
            'html_columbia', 'xml_harvard'
        ] if opinion[k]), '')
        opinion_texts.append(re.sub(r'<.+?>', '', opinion_text))

    # process citations
    citations = []
    for c in cluster['citations']:
        cite = f"{c['volume']} {c['reporter']} {c['page']}"
        try:
            page_int = int(c['page'])
        except (TypeError, ValueError):
            page_int = None
        citations.append({
            'cite': cite,
            'normalized_cite': normalize_cite(cite),
            'type': c['type'],
            'volume': c['volume'],
            'reporter': c['reporter'],
            'page': c['page'],
            'page_int': page_int,
        })

    # return metadata
    return {
        'id': f"cl-{cluster['id']}",
        'source': 'cl',
        'source_id': cluster['id'],
        'citations': citations,
        'name_short': cluster['case_name'],
        'name_full': cluster['case_name_full'],
        'decision_date': cluster['date_filed'],
        'frontend_url':
        'https://www.courtlistener.com' + cluster['absolute_url'],
        'api_url': cluster['resource_uri'].replace(':80/', '/'),
        'simhash': get_simhash("\n".join(opinion_texts))
    }
Code example #10
    def append(k, v):
        for key in (k, normalize_cite(k)):
            if v not in editions[key]:
                editions[key].append(v)
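A guess at the surrounding context this helper seems to assume, based on how EDITIONS is consulted in code example #6. The defaultdict and the sample call below are assumptions rather than project code; normalize_cite is the project function these examples document:

from collections import defaultdict

# Assumed context: a lookup that maps both the exact reporter string and its
# normalized form to the same list of candidate editions.
editions = defaultdict(list)

def append(k, v):  # repeated from the snippet above so the sketch is self-contained
    for key in (k, normalize_cite(k)):
        if v not in editions[key]:
            editions[key].append(v)

# hypothetical edition record, registered under "F. Supp." and its normalized key
append("F. Supp.", {"reporter": "F. Supp.", "start_year": 1932})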
Code example #11
def fetch(request):
    """ Extract citations from text and link to PDFs. """

    # zip file download
    error = None
    if request.method == 'POST' and request.POST.get('download'):
        if not request.user.is_authenticated:
            return HttpResponseForbidden()
        case_ids = set(request.POST.getlist('case_ids'))
        if not request.user.unlimited_access_in_effect(
        ) and request.user.case_allowance_remaining < len(case_ids):
            error = "You do not have sufficient downloads remaining to fetch the requested cases"
        else:
            cases_by_id = {
                c.id: c
                for c in CaseMetadata.objects.filter(
                    pk__in=case_ids).prefetch_related('citations')
            }
            try:
                tmp = tempfile.NamedTemporaryFile(delete=False)
                with ZipFile(tmp.name, 'w') as zip:
                    for case_id in case_ids:
                        api_response = api_request(request,
                                                   CaseDocumentViewSet,
                                                   'retrieve', {'id': case_id},
                                                   {
                                                       'format': 'pdf',
                                                       'full_case': 'true'
                                                   })
                        if not hasattr(
                                api_response,
                                'data') or api_response.status_code != 200:
                            return api_response
                        zip.writestr(
                            "cases/" +
                            cases_by_id[int(case_id)].get_pdf_name(),
                            api_response.data)
                return FileResponse(open(tmp.name, 'rb'),
                                    as_attachment=True,
                                    filename='cases.zip')
            finally:
                os.remove(tmp.name)

    # prefer POST because it doesn't record queried text in server logs, but also accept GET to allow linking to search results
    text = request.POST.get('q', '') or request.GET.get('q', '')
    citations = None

    if text:
        citations = extract_citations_from_text(text)
        if citations:
            # extract citations
            citations = [{
                'cite': c[0],
                'normalized_cite': normalize_cite(c[1]),
                'before': '',
                'after': ''
            } for c in extract_citations_from_text(text)]

            # get possible cases matching each extracted cite
            cases = CaseMetadata.objects.in_scope().filter(
                citations__normalized_cite__in=[
                    c['normalized_cite'] for c in citations
                ]).prefetch_related('citations').distinct()
            cases_by_cite = defaultdict(list)
            for case in cases:
                for cite in case.citations.all():
                    cases_by_cite[cite.normalized_cite].append(case)

            for result in citations:
                result['cases'] = cases_by_cite.get(result['normalized_cite'],
                                                    [])

                # add context before and after matched cite
                context_before = 40
                context_after = 30
                m = re.search(
                    r'([^\n]{,%s})\b%s\b([^\n]{,%s})' %
                    (context_before, re.escape(result['cite']), context_after),
                    text)
                if m:
                    result['before'] = ('... ' if len(m[1]) == context_before
                                        else '') + m[1]
                    result['after'] = m[2] + (' ...' if len(m[2])
                                              == context_after else '')