Example 1
def manual_pre_edits():
    """
        These fixes were determined from inspecting the initial output of main() for error messages.
    """
    with EditLog(
        description='Swap reporter and volume number for NOTALEPH001894 (was 26 S.C.L. (McMul.)) '
        'and NOTALEPH001893 (was 7 S.C. Eq. (McCord Eq.))'
    ).record():
        v1 = VolumeMetadata.objects.get(barcode='NOTALEPH001894', reporter__short_name='S.C.L. (McMul.)')
        v2 = VolumeMetadata.objects.get(barcode='NOTALEPH001893', reporter__short_name='S.C. Eq. (McCord Eq.)')
        new_r1 = v2.reporter
        new_volnum1 = v2.volume_number
        new_r2 = v1.reporter
        new_volnum2 = v1.volume_number

        v1.volume_number = new_volnum1
        v1.set_reporter(new_r1)

        v2.volume_number = new_volnum2
        v2.set_reporter(new_r2)

    with EditLog(
        description='Combine reporters 909 and 1075 into just 1075, and 1076 and 919 into just 919. '
        'This combines the first and second run of S.C.L. (Rich.) and S.C. Eq. (Rich. Eq.), respectively. '
        'They are combined as they use the same numbering sequence -- see '
        'https://www.sccourts.org/courtreg/displayRule.cfm?ruleID=268.0&subRuleID=&ruleType=APP'
    ).record():
        for first_id, second_id in ((909, 1075), (1076, 919)):
            r1 = Reporter.objects.get(id=first_id)
            r2 = Reporter.objects.get(id=second_id)
            r2.start_year = r1.start_year
            r2.volume_count += r1.volume_count
            r2.hollis += r1.hollis
            r2.save()
            for v in r1.volumes.all():
                v.set_reporter(r2)
            r1.delete()

    with EditLog(
        description='Mark 32044078662582 (3A Tenn. (Cooke)) as a duplicate of (the appendix to) 32044078663788 (158 Tenn.). '
        'See E. Lucy Ogden, A Note on 3A Tenn. Reports, 19 Tenn. L. Rev. 74 (1945), explaining that former was separately published '
        'for attorneys who did not subscribe to latter and should not be cited.'
    ).record():
        old = VolumeMetadata.objects.get(pk='32044078662582')
        new = VolumeMetadata.objects.get(pk='32044078663788')
        if old.duplicate_of != new:
            old.set_duplicate(True, new)

    with EditLog(
        description='Update case citation from "50 50 Ky. (B. Mon.) 178 183" to "50 Ky. (B. Mon.) 183".'
    ).record():
        c = Citation.objects.get(cite="50 50 Ky. (B. Mon.) 178 183")
        c.cite = "50 Ky. (B. Mon.) 183"
        c.save()
Example 2
def manual_pre_edits(dry_run='true'):
    """
        These fixes were determined from inspecting the initial output of main() for error messages.
    """

    ## delete "U. S." cites and manually detected typo cites

    to_delete = []
    changed_cites = []
    potential_matches = list(Citation.objects.filter(cite__contains=' U. S. '))
    actual_matches = [c for c in potential_matches if ';' not in c.cite and re.match(r'\d+ U\. S\. \d+', c.cite)]
    actual_matches += [
        Citation.objects.get(case_id=1493694, cite="822 F.2d 52"),
        Citation.objects.get(case_id=6210086, cite="For"),
    ]
    for cite in actual_matches:
        print("Would delete %s" % cite)
        to_delete.append(cite)
        changed_cites.append(cite.cite)

    if dry_run == 'false':
        with EditLog(
            description=r'Delete incorrectly identified cites matching "\d+ U. S. \d+" (extra space). '
            'These all refer to prior history of the case.'
        ).record():
            for cite in to_delete:
                cite.delete()
            CaseMetadata.update_frontend_urls(changed_cites)
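The filter above keeps only stand-alone cites of the form "123 U. S. 456" (with the stray space) and skips anything containing a semicolon. A quick standalone illustration of that rule, with made-up cite strings:

# Standalone illustration of the matching rule above (cite values are invented):
import re

for cite in ["347 U. S. 483", "347 U.S. 483", "410 U. S. 113; prior history"]:
    is_match = ';' not in cite and bool(re.match(r'\d+ U\. S\. \d+', cite))
    print("%s -> %s" % (cite, "delete" if is_match else "keep"))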
Example 3
def test_data_edit(volume_metadata):
    with EditLog(description="test").record() as edit:
        volume_metadata.publisher = "foo"
        volume_metadata.save()
    transactions = list(edit.transactions.all())
    assert len(transactions) == 1
    volume_metadata.refresh_from_db()
    assert transactions[0].timestamp == volume_metadata.sys_period.lower
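The test depends on EditLog(...).record() wrapping all edits in a single logged transaction whose timestamp matches the lower bound of the edited row's sys_period (its temporal-table validity range). The real implementation isn't shown in these examples; a minimal sketch of the shape, using a hypothetical SimpleEditLog stand-in rather than capstone's class, might look like:

# Minimal sketch only -- not capstone's EditLog. It shows the shape the test
# relies on: one database transaction per block, with a recorded timestamp.
from contextlib import contextmanager
from django.db import transaction
from django.utils import timezone

class SimpleEditLog:
    def __init__(self, description):
        self.description = description
        self.timestamp = None

    @contextmanager
    def record(self):
        with transaction.atomic():  # edits inside the block commit together
            self.timestamp = timezone.now()
            yield self
            # a real implementation would also persist the log entry and its
            # per-table transaction records here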
Example 4
def main(dry_run='true', output_missing='false'):
    # download data
    cap_cites_path = base_path / 'us_cites.csv'
    if not cap_cites_path.exists():
        print("pre-loading cap cites")
        us_reporter = Reporter.objects.get(short_name='U.S.')
        with connections['capdb'].cursor() as cursor:
            cursor.execute("""
                select m.id, m.volume_id, cite, name, name_abbreviation, decision_date 
                from capdb_casemetadata m, capdb_citation c where m.reporter_id=%s and c.case_id=m.id and c.cite like '%%U.S.%%'
            """, [us_reporter.id])
            with cap_cites_path.open('w') as out_file:
                csv_writer = csv.writer(out_file)
                for row in cursor.fetchall():
                    csv_writer.writerow(row)

    # load data
    print("loading data")
    scdb_new_cites_path = base_path / 'SCDB_2019_01_caseCentered_Citation.csv'
    scdb_old_cites_path = base_path / 'SCDB_Legacy_05_caseCentered_Citation.csv'
    cap_cites = list(csv.DictReader((line.replace('\xad', '') for line in cap_cites_path.open()), exported_columns))
    scdb_cites = list(csv.DictReader(scdb_new_cites_path.open(encoding='iso-8859-1'))) + list(csv.DictReader(scdb_old_cites_path.open(encoding='iso-8859-1')))
    scdb_cites = [c for c in scdb_cites if c['usCite']]
    cap_cites_by_id = {c['id']:c for c in cap_cites}
    scdb_cites_by_id = {c['caseId']:c for c in scdb_cites}
    scdb_cites_lookup = group_by(scdb_cites, lambda c: c['usCite'])

    # count terms for tf_idf
    print("counting terms")
    word_counts = Counter()
    word_counts.update(i for c in cap_cites for i in tokenize(c['name']))
    word_counts.update(i for c in scdb_cites for i in tokenize(c['caseName']))
    document_count = len(cap_cites)+len(scdb_cites)

    ### first pass at checking for matches -- find all cases where cites are the same and titles are similar.
    # These are "strong" matches.
    print("checking for matches")
    matched_cap_case_ids = set()
    for cap_cite in tqdm(cap_cites):
        cite = cap_cite['cite']

        # strip nominatives from CAP cites
        cite = re.sub(r'\(.*\) ', '', cite)

        # skip CAP cites that don't look like "123 U.S. 456"
        if not re.match(r'\w+ U\.S\. \w+$', cite):
            continue

        if cite in scdb_cites_lookup:
            candidates = scdb_cites_lookup[cite]
            candidates_by_name = {c['caseName'].lower(): c for c in candidates}
            best = get_best_match(
                [cap_cite['name'].lower(), cap_cite['name_abbreviation'].lower()],
                candidates_by_name.keys(),
                word_counts, document_count
            )
            if best:
                c = candidates_by_name[best[0]]
                c['cap_cite'] = cap_cite
                c['match_quality'] = 'strong'
                matched_cap_case_ids.add(cap_cite['id'])
            else:
                for c in candidates:
                    c.setdefault('failed_matches', []).append(cap_cite)

    # apply manual_matches overrides
    for k, v in manual_matches.items():
        c = scdb_cites_by_id[k]
        c['cap_cite'] = cap_cites_by_id[v]
        c['match_quality'] = 'confirmed'
        matched_cap_case_ids.add(c['cap_cite']['id'])

    # write SCDB cites to database
    print("Applying corrections")
    edit_out = csv.writer((base_path / 'scdb_cite_edits.csv').open('w'))
    cite_objs = Citation.objects.filter(case_id__in=matched_cap_case_ids).select_related('case')
    cite_objs_by_case_id = group_by(cite_objs, lambda c: c.case_id)
    to_create = []
    to_update = []
    for scdb_cite in scdb_cites:
        if 'cap_cite' not in scdb_cite:
            continue
        case_id = int(scdb_cite['cap_cite']['id'])
        existing_cite_objs_by_reporter = {get_cite_reporter(c.cite): c for c in cite_objs_by_case_id[case_id]}
        expected_cites = [['SCDB', 'SCDB %s' % scdb_cite['caseId'], 'vendor']]
        for scdb_key, cite_type in [["usCite", "official"], ["sctCite", "parallel"], ["ledCite", "parallel"], ["lexisCite", "vendor"]]:
            cite_val = scdb_cite[scdb_key]
            if cite_val:
                expected_cites.append([get_cite_reporter(cite_val), cite_val, cite_type])
        for reporter, cite_val, cite_type in expected_cites:
            if reporter in existing_cite_objs_by_reporter:
                new_cite = existing_cite_objs_by_reporter.pop(reporter)
                if new_cite.cite == cite_val:
                    edit_out.writerow([case_id, 'skip', new_cite.id, cite_val])
                else:
                    edit_out.writerow([case_id, 'update', new_cite.id, new_cite.cite, cite_val])
                    new_cite.cite = cite_val
                    to_update.append(new_cite)
            else:
                new_cite = Citation(cite=cite_val, type=cite_type, case_id=case_id, normalized_cite=normalize_cite(cite_val))
                to_create.append(new_cite)
                edit_out.writerow([case_id, 'create', new_cite.type, new_cite.cite])
        if existing_cite_objs_by_reporter:
            edit_out.writerow([case_id, 'warning', 'ignored cite']+[c.cite for c in existing_cite_objs_by_reporter.values()])

    if dry_run == 'false':
        with EditLog(description='Add SCDB cites').record():
            Citation.objects.bulk_create(to_create)
            Citation.objects.bulk_update(to_update, ['cite'])

    if output_missing != 'true':
        return

    ### second pass at checking for matches -- for all SCDB cites that don't have matches, fetch all cases we have for the
    # same volume, and look for similar titles. These are "weak" matches.
    print("checking for fallback matches")
    no_match = [c for c in scdb_cites if 'cap_cite' not in c]
    missing_by_volume = group_by(no_match, lambda c: c['usCite'].split()[0])
    cap_cites_by_volume = group_by(cap_cites, lambda c: c['cite'].split()[0])

    # fetch all cases from the DB that belong to volumes where SCDB cases are missing
    target_volumes = set(c['volume_id'] for v in missing_by_volume for c in cap_cites_by_volume.get(v, []))
    db_cites = Citation.objects.filter(cite__contains='U.S.', case__volume_id__in=target_volumes).select_related('case')
    db_cases = []
    for cite in db_cites:
        # skip cases already affirmatively matched (matched_cap_case_ids holds CAP case ids as strings)
        if str(cite.case_id) in matched_cap_case_ids:
            continue

        # conform DB objects to export format from csv
        c = cite.case
        c.cite = cite
        c.decision_date = c.decision_date.strftime("%Y-%m-%d")
        c.name = c.name.replace('\xad', '')
        c.name_abbreviation = c.name_abbreviation.replace('\xad', '')
        db_cases.append(c)
    db_cases_by_volume_id = group_by(db_cases, lambda d: d.volume_id)

    # check each missing SCDB cite for cases in the same volume with similar titles
    for v, missing in tqdm(missing_by_volume.items()):
        if v in cap_cites_by_volume:
            cases = sum((db_cases_by_volume_id[vol_id] for vol_id in set(c['volume_id'] for c in cap_cites_by_volume[v])), [])
            cases_by_name = {}
            for c in cases:
                cases_by_name[c.name.lower()] = c
                cases_by_name[c.name_abbreviation.lower()] = c

            for m in missing:
                if re.match(r'131 U\.S\. [a-z]+$', m['usCite']):
                    # special case -- skip cites like "131 U.S. lxxxiii", as we know we don't have them
                    continue
                best = get_best_match([m['caseName']], cases_by_name.keys(), word_counts, document_count, tf_threshold=10)
                if not best:
                    continue
                c = cases_by_name[best[0]]
                m['cap_cite'] = {k: getattr(c, k) for k in exported_columns}
                m['match_quality'] = 'weak'

    # output
    csv_out = csv.writer((base_path / 'scdb_cite_matchup.csv').open('w'))
    csv_out.writerow(['match quality', 'volume number', 'SCDB name', 'SCDB cite', 'SCDB date', 'SCDB ID', 'CAP name', 'CAP cite', 'CAP date', 'CAP ID', 'CAP vol id'])
    for c in scdb_cites:
        match = c.get('cap_cite')
        match_row = [match['name'], match['cite'], match['decision_date'], match['id'], match['volume_id']] if match else []
        csv_out.writerow([c.get('match_quality', 'none'), c['usCite'].split()[0], c['caseName'], c['usCite'], c['dateDecision'], c['caseId']]+match_row)
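tokenize and get_best_match are defined elsewhere in the script. Purely as an illustration of the tf-idf idea they implement (rarer title words count for more, and a minimum score gates the match), a rough sketch -- the real helpers almost certainly differ in weighting and thresholds:

# Rough sketch of the helpers used above; names and defaults are assumptions.
import re
from math import log

def tokenize(text):
    # hypothetical tokenizer: lowercase word tokens
    return re.findall(r'\w+', text.lower())

def get_best_match(names, candidates, word_counts, document_count, tf_threshold=20):
    # score each candidate by the summed idf of tokens it shares with any of
    # the query names; return the best (candidate, score) pair above the bar
    best = None
    for candidate in candidates:
        candidate_tokens = set(tokenize(candidate))
        for name in names:
            shared = candidate_tokens & set(tokenize(name))
            score = sum(log(document_count / (1 + word_counts[t])) for t in shared)
            if score >= tf_threshold and (best is None or score > best[1]):
                best = (candidate, score)
    return best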
Example 5
def main(dry_run='true'):
    # includes all patterns to look for and how they should be modified
    with Path(__file__).parent.joinpath("modification_instructions.json").open() as fp:
        modification_instructions = json.load(fp, object_pairs_hook=OrderedDict)

    # fetch and cache citations
    print("Prefetching citations")
    all_cites = Citation.objects.filter(type="parallel").values_list('id', 'cite')

    # opens and scopes the log file handler
    with Path.home().joinpath("citation_update_logger.tsv").open("a") as logfile:

        # write the header row of the log
        csvlog = csv.writer(logfile, delimiter='\t', quotechar='"')
        csvlog.writerow([
            "timestamp", "action", "New ID", "Case ID", "New Value",
            "Old Value", "Old ID", "Split String", "Replacement Explainer",
            "Filtered", "Regex Verified", "Dry Run"
        ])

        for current_pattern, data in modification_instructions.items():
            print("Fixing %s" % current_pattern)
            modifications = data['modifications']
            jack_count = data['counts_and_examples']['count']  # the count from Jack's original report
            example_cases = [pk for entry_set in data['counts_and_examples']['examples'] for pk in entry_set]  # original examples
            print(example_cases)
            regex = '^' + get_escaped_regex(current_pattern) + '$'  # turn the citation pattern into a regex
            matcher = re.compile(regex)
            matching_citation_ids = [id for id, cite in all_cites if matcher.match(cite)]
            matching_cite_query = Citation.objects.filter(id__in=matching_citation_ids)

            # simplify the list of example cases to make sure our search regex gets them
            example_cite_ids = []
            for epk in example_cases:
                case = CaseMetadata.objects.get(pk=epk)
                for cite in case.citations.filter(type="parallel"):
                    example_cite_ids.append(cite.pk)

            to_update = []
            to_insert = []
            to_log = []
            matching_citation_count = 0

            for matching_citation in matching_cite_query:
                matching_citation_count += 1
                csv_replacement_explainer = ""  # string that will say in fairly plain english what was done
                regex_verified = [
                ]  # simple splits don't have a verification regex— it just splits and checks to make sure the output doesn't match the original pattern. I want to specify in the log if it was regex verified
                cite = matching_citation.cite
                case_id = matching_citation.case_id
                if matching_citation.pk in example_cite_ids:
                    example_cite_ids.remove(
                        matching_citation.pk
                    )  # this list should be empty with every pattern

                # The modifications can include a split string, which splits a citation into multiple citations;
                # 'filters', a list of regex substitution pairs; and "kill", which drops one section. The list orders
                # matter because they share indexes with the split order, so you know where to apply what.

                if 'splitstring' in modifications:
                    new_cites = [c.strip() for c in cite.split(modifications['splitstring']) if c.strip()]
                else:
                    new_cites = [cite]

                if 'filters' in modifications:
                    new_cites_filtered = []
                    filters = modifications['filters']
                    csv_replacement_explainer += "Using {} replacement{}".format(len(filters), "" if len(filters) == 1 else "s")
                    assert len(new_cites) == len(filters)
                    for index, (filter_dict, new_cite) in enumerate(zip(filters, new_cites)):
                        if 'kill' in filter_dict:
                            # print("Dropping {}".format(new_cite))
                            csv_replacement_explainer += ", drop split field {} ({})".format(index, new_cite)
                            continue
                        for pattern in filter_dict['patterns']:
                            csv_replacement_explainer += ", replace '{}' with '{}' in split field {} ({})".format(
                                pattern[0], pattern[1], index, new_cite)
                            new_cite = re.sub(pattern[0], pattern[1], new_cite)
                        # The 'goal' is a pattern that the new citation should match after being processed.
                        if 'goal' in filter_dict:
                            csv_replacement_explainer += " to get '{}'".format(filter_dict['goal'])
                            regex_verified.append(new_cite)
                            if not re.match('^' + get_escaped_regex(filter_dict['goal']) + '$', new_cite):
                                raise Exception(
                                    "Doesn't Match: '{}'\nCurrent Pattern: '{}'\nRegex: '{}'\nCite_Section: '{}'\n"
                                    "Goal: '{}'\nEscaped Goal: '{}'".format(
                                        cite, current_pattern, get_escaped_regex(current_pattern),
                                        new_cite, filter_dict['goal'], get_escaped_regex(filter_dict['goal'])))
                        new_cites_filtered.append(new_cite)
                    new_cites = new_cites_filtered

                # if a new cite still matches the original pattern, it's wrong
                for c in new_cites:
                    if matcher.match(c):
                        raise Exception(
                            "New Cite Matches Original Regex: '{}'\nCurrent Pattern: '{}'\nRegex: '{}'\n"
                            "Cite_Section: '{}'\n".format(cite, current_pattern, get_escaped_regex(current_pattern), c))
                # update records and log
                for index, new_citation in enumerate(new_cites):
                    reg = new_citation in regex_verified
                    action = "update" if index == 0 else "create"
                    if action == 'update':
                        # print("Updating: {}".format(new_citation))
                        matching_citation.cite = new_citation
                        matching_citation.normalized_cite = normalize_cite(new_citation)
                        new_citation_obj = matching_citation
                        to_update.append(matching_citation)
                    else:
                        # print("Creating: {}".format(new_citation))
                        new_citation_obj = Citation(
                            case_id=case_id,
                            type="parallel",
                            cite=new_citation,
                            normalized_cite=normalize_cite(new_citation),
                        )
                        to_insert.append(new_citation_obj)

                    to_log.append((cite, reg, action, csv_replacement_explainer, matching_citation, new_citation_obj))

            if not to_log:
                print("- nothing to do")
                continue

            if dry_run == 'false':
                with EditLog(description='Fix citations matching %s' % current_pattern).record() as edit:
                    Citation.objects.bulk_update(to_update, ['cite', 'normalized_cite'])
                    Citation.objects.bulk_create(to_insert)
                timestamp = edit.timestamp
            else:
                timestamp = datetime.now()

            log_filters = str(modifications['filters']) if 'filters' in modifications else None
            log_splitstring = str(modifications['splitstring']) if 'splitstring' in modifications else None
            for cite, reg, action, csv_replacement_explainer, matching_citation, new_citation_obj in to_log:
                # "timestamp", "action", "New ID", "Case ID", "New Value", "Old Value", "Old ID",
                # "Split String", "Replacement Explainer", "Filtered", "Regex Verified", "Dry Run"
                csvlog.writerow([
                    timestamp,
                    action,
                    new_citation_obj.pk,
                    matching_citation.case_id,
                    new_citation_obj.cite,
                    cite,
                    matching_citation.pk,
                    log_splitstring,
                    csv_replacement_explainer,
                    log_filters,
                    reg,
                    dry_run,
                ])

            if matching_citation_count != jack_count:
                # this didn't happen after a dry run with production data
                print("non-matching Jack Count: {}, Query Count: {}, Pattern: {}".format(
                    jack_count, matching_citation_count, current_pattern))

            if example_cite_ids:
                # this didn't happen after a dry run with production data
                raise Exception("non-matching example in {}: {}".format(
                    current_pattern, ", ".join(str(pk) for pk in example_cite_ids)))
Example 6
def main(dry_run='true'):

    # handle each line from manual_fixes.csv
    fixes = csv.DictReader(Path(__file__).parent.joinpath('manual_fixes.csv').open())
    for fix in fixes:
        if not fix['official']:
            continue

        with EditLog(
            description='Mark reporter %s as nominative for reporter %s' % (fix['short'], fix['official'])
        ).record():
            nominative_reporter = Reporter.objects.get(id=fix['id'])
            print("Updating %s" % nominative_reporter)
            if nominative_reporter.is_nominative:
                print("- skipping, already fixed")
                continue

            ## associate nominative reporter with official reporter

            if fix['official'].isnumeric():
                official_reporter = Reporter.objects.get(id=fix['official'])
            else:
                official_reporter = Reporter.objects.get(short_name=fix['official'])
            nominative_reporter.nominative_for = official_reporter
            nominative_reporter.is_nominative = True
            nominative_reporter.short_name = fix['nominative']

            print("- update %s to be nominative for %s" % (nominative_reporter, official_reporter))
            if dry_run == 'false':
                nominative_reporter.save()

            ## prepare to process each volume in nominative reporter

            print("- update volumes")
            volumes = natsorted(nominative_reporter.volumes.filter(duplicate=False).order_by('volume_number'), key=lambda v: v.volume_number)
            last_volume_numbers = []
            volume_index = 0

            # the 'official offset' column indicates how official volume numbers are derived from nominative volume numbers.
            # it can be in two formats
            official_offsets = {}
            official_offset_default = None
            if ',' in fix['official offset']:
                # the first format is a set of ranges, like "1-2: 18, 3-14: 24", meaning volumes 1 and 2 were renumbered
                # to 18, 19, etc., and 3-14 were renumbered to 24, 25, etc. Parse this into a dict like {1: 18, 2: 19, 3: 24, 4: 25 ...}
                for offset_range in fix['official offset'].split(', '):
                    start_stop, offset = offset_range.split(': ')
                    start, stop = start_stop.split('-')
                    offset = int(offset)
                    for i, vol_num in enumerate(range(int(start), int(stop)+1)):
                        official_offsets[vol_num] = offset + i
            else:
                # the second format is just a single number
                official_offset_default = int(fix['official offset'])

            ## process each volume

            for volume in volumes:

                ## update volume to have correct volume_number, nominative_volume_number, and references to its official reporter and its nominative reporter

                volume_number = int(volume.volume_number)
                if volume_number in last_volume_numbers:
                    print(" - WARNING: duplicate volume number %s" % volume_number)
                else:
                    volume_index += 1
                expected_official_volume_number = official_offsets[volume_index] if official_offsets else volume_index + official_offset_default - 1
                expected_nominative_volume_number = volume_index
                if volume_number != expected_nominative_volume_number and volume_number != expected_official_volume_number:
                    print(" - ERROR: Unexpected volume number: %s" % volume_number)
                    continue
                last_volume_numbers = [expected_official_volume_number, expected_nominative_volume_number]
                volume.nominative_volume_number = expected_nominative_volume_number
                volume.volume_number = expected_official_volume_number
                volume.reporter = official_reporter
                volume.nominative_reporter = nominative_reporter

                print(" - update %s to %s,%s" % (volume_number, volume.volume_number, volume.nominative_volume_number))
                if dry_run == 'false':
                    volume.save()

                ## update citations for each case in volume

                # Do some sanity checking here -- if the case is supposed to end up with official citation "5 Mass. 123"
                # and nominative citation "1 Bar 123", then we expect the current official citation to start with either
                # a "1" or "5", followed by either "Mass." or "Bar" or "Mass. (Bar)".

                # figure out what we expect:
                print("  - update cases")
                official_cite_prefix = "%s %s " % (volume.volume_number, official_reporter.short_name)
                nominative_cite_prefix = "%s %s " % (volume.nominative_volume_number, nominative_reporter.short_name)
                expected_short_names = [official_reporter.short_name, nominative_reporter.short_name, "%s (%s)" % (official_reporter.short_name, nominative_reporter.short_name)]
                expected_prefixes = [alphanum("%s %s" % (n, prefix)) for n in [expected_official_volume_number, expected_nominative_volume_number] for prefix in expected_short_names]
                if volume.barcode in cite_overrides:
                    wrong_old_prefix, fixed_old_prefix = cite_overrides[volume.barcode]
                else:
                    wrong_old_prefix, fixed_old_prefix = None, None

                for case in volume.case_metadatas.prefetch_related('citations'):

                    # check if existing cite matches expectations:
                    official_cite = next(c for c in case.citations.all() if c.type == 'official')
                    old_official_cite = official_cite.cite
                    old_prefix, old_page_num = old_official_cite.rsplit(' ', 1)
                    if fixed_old_prefix and wrong_old_prefix == old_prefix:
                        old_prefix = fixed_old_prefix
                    if alphanum(old_prefix) not in expected_prefixes:
                        print("   - ERROR: cite %s not expected" % old_official_cite)
                        continue

                    # create new official and nominative cites:
                    official_cite.cite = official_cite_prefix + old_page_num
                    nominative_cite = deepcopy(official_cite)
                    nominative_cite.cite = nominative_cite_prefix + old_page_num
                    nominative_cite.type = 'nominative'
                    nominative_cite.pk = None
                    print("   - update %s to %s and %s" % (old_official_cite, official_cite, nominative_cite))
                    if dry_run == 'false':
                        official_cite.save()
                        nominative_cite.save()

            if dry_run != 'false':
                raise EditLog.Cancel
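The range format for the 'official offset' column can be checked in isolation; this snippet replays the parsing logic above on the made-up value from the comment:

# Standalone check of the range format described above (input value invented):
official_offsets = {}
for offset_range in "1-2: 18, 3-14: 24".split(', '):
    start_stop, offset = offset_range.split(': ')
    start, stop = start_stop.split('-')
    for i, vol_num in enumerate(range(int(start), int(stop) + 1)):
        official_offsets[vol_num] = int(offset) + i
assert official_offsets[1] == 18 and official_offsets[2] == 19
assert official_offsets[3] == 24 and official_offsets[14] == 35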
Example 7
def main(dry_run="true"):
    with EditLog(description='Fix duplicate volumes').record(
            dry_run=dry_run != "false"):
        # mark duplicates
        for duplicate_volume in to_suppress_to_keep:
            preferred_volume = VolumeMetadata.objects.get(pk=duplicate_volume[1])
            for suppress_this in duplicate_volume[0]:
                vol = VolumeMetadata.objects.get(pk=suppress_this)
                if dry_run == "false":
                    vol.set_duplicate(preferred_volume)
                print("set_duplicate,%s,%s" % (vol.barcode, preferred_volume.barcode))

        # mark second parts
        for parts in part_2:
            vol_entry_1 = VolumeMetadata.objects.get(pk=parts[0])
            vol_entry_2 = VolumeMetadata.objects.get(pk=parts[1])

            # find the last numeric page label in each volume, scanning pages from the back
            def last_numeric_label(vol_entry):
                for page in vol_entry.page_structures.order_by('-order'):
                    if page.label.isdigit():
                        return int(page.label)

            label_1 = last_numeric_label(vol_entry_1)
            label_2 = last_numeric_label(vol_entry_2)

            if label_1 > label_2:
                vol_entry_1, vol_entry_2 = vol_entry_2, vol_entry_1
            vol_entry_2.second_part_of = vol_entry_1
            if dry_run == "false":
                vol_entry_2.save()

            print("second_part_of,%s,%s" %
                  (vol_entry_2.barcode, vol_entry_1.barcode))

        # update Wn2d_185
        Wn2d_185 = VolumeMetadata.objects.get(pk="Wn2d_185")
        reporter = Reporter.objects.get(short_name="Wash. 2d")
        if dry_run == "false":
            Wn2d_185.set_reporter(reporter)
        print("set_reporter,%s,%s" % (Wn2d_185.barcode, reporter.pk))

        # update 32044057887291
        nc_vol = VolumeMetadata.objects.get(pk="32044057887291")
        if dry_run == "false":
            nc_vol.set_volume_number("5")
        print("set_volume_number,%s,%s" % (nc_vol.barcode, "5"))

        # update 25 Tex. Supp.
        tex_supp = VolumeMetadata.objects.get(pk="32044078588621")
        if dry_run == "false":
            tex_supp.set_volume_number("25 Supp.")
        print("set_volume_number,%s,%s" % (tex_supp.barcode, "25 Supp."))

        for case in tex_supp.case_metadatas.all():
            cite = "25 Tex. Supp. {}".format(case.first_page)
            if dry_run == "false":
                Citation.objects.create(type="parallel", cite=cite, case=case)
            print("new_citation,%s,%s" % (tex_supp.barcode, cite))

        # update 32044078699600
        volume = VolumeMetadata.objects.get(pk="32044078699600")
        duplicate_of = VolumeMetadata.objects.get(pk="32044078592631")
        if dry_run == "false":
            volume.set_volume_number('25')
            volume.set_duplicate(duplicate_of)
        print("set_volume_number,%s,%s" % (volume.barcode, '25'))
        print("set_duplicate,%s,%s" % (volume.barcode, duplicate_of.pk))

        # update volume numbers
        for replace in simple_replace:
            vol = VolumeMetadata.objects.get(pk=replace[0])
            if dry_run == "false":
                vol.set_volume_number(replace[1])
            print("set_volume_number,%s,%s" % (vol.barcode, replace[1]))
Example 8
def main(dry_run='true', log_file='/tmp/fix_reporter_jurs.log'):
    dry_run = dry_run != 'false'
    with open(log_file, 'a') as log:
        for fix in fixes:
            cases = CaseMetadata.objects.filter(reporter_id=fix['Reporter'])
            if fix['Volume'] != '*':
                cases = cases.filter(volume__volume_number=fix['Volume'])

            if fix['Correct Citation']:
                cites = Citation.objects.filter(case__in=cases, cite__contains=fix['Wrong Citation'])
                with EditLog(
                    description='Correct citations for reporter %s volume %s from %s to %s' % (
                        fix['Reporter'], fix['Volume'], fix['Wrong Citation'], fix['Correct Citation']),
                    dry_run=dry_run,
                ).record():
                    actions = Citation.replace_reporter(
                        cites, fix['Wrong Citation'], fix['Correct Citation'], dry_run=dry_run)
                for cite, old_cite, new_cite in actions:
                    write_log(log, {
                        "action": 'fix_cite',
                        "cite_id": cite.id,
                        "old": old_cite,
                        "new": new_cite,
                    })

            elif fix['Correct Reporter']:
                volume = VolumeMetadata.objects.filter(
                    volume_number=fix['Volume'], reporter_id=fix['Reporter']).first()
                if volume is None:
                    print("WARNING: nothing to do for %s; skipping" % fix)
                    continue
                new_reporter = Reporter.objects.get(id=fix['Correct Reporter'])
                write_log(log, {
                    "action": 'fix_reporter',
                    "volume_id": volume.pk,
                    "old_reporter_id": volume.reporter_id,
                    "new_reporter_id": new_reporter.id,
                })
                if not dry_run:
                    with EditLog(
                        description='Correct reporter for volume %s from %s to %s' % (
                            volume.pk, volume.reporter_id, new_reporter.id)
                    ).record():
                        volume.set_reporter(new_reporter)

            else:
                cases = cases.filter(jurisdiction__name=fix['Wrong Jur'])
                new_jur = Jurisdiction.objects.get(name=fix['Correct Jur'])
                new_court = None
                if fix['Correct Court']:
                    new_court = Court.objects.get(name=fix['Correct Court'])
                to_update = []
                for case in cases:
                    case.jurisdiction = new_jur
                    message = {
                        "action": 'fix_jur',
                        "case_id": case.pk,
                        "old_jur": fix['Wrong Jur'],
                        "new_jur": fix['Correct Jur']
                    }
                    if new_court:
                        message.update(old_court=case.court_id, new_court=new_court.id)
                        case.court = new_court
                        case.court = new_court
                    to_update.append(case)
                    write_log(log, message)
                if not to_update:
                    print("WARNING: nothing to do for %s; skipping" % fix)
                    continue
                if not dry_run:
                    with EditLog(
                        description='Correct jurisdictions in reporter %s volume %s from %s to %s' % (
                            fix['Reporter'], fix['Volume'], fix['Wrong Jur'], fix['Correct Jur'])
                    ).record():
                        CaseMetadata.objects.bulk_update(to_update, ['court', 'jurisdiction'])
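Neither `fixes` nor `write_log` is shown in this example. From the keys the loop reads, the rows presumably look something like the following (all values invented as placeholders), and write_log could be as simple as a JSON-lines writer:

import json

# Hypothetical rows in the shape the loop expects; exactly one of
# 'Correct Citation', 'Correct Reporter', or 'Correct Jur' is set per row.
fixes = [
    {'Reporter': '123', 'Volume': '4', 'Wrong Citation': '4 Fo.', 'Correct Citation': '4 Foo.',
     'Correct Reporter': '', 'Wrong Jur': '', 'Correct Jur': '', 'Correct Court': ''},
    {'Reporter': '123', 'Volume': '*', 'Wrong Citation': '', 'Correct Citation': '',
     'Correct Reporter': '', 'Wrong Jur': 'Old Jur', 'Correct Jur': 'New Jur', 'Correct Court': ''},
]

def write_log(log, message):
    # hypothetical implementation: one JSON object per line
    log.write(json.dumps(message) + "\n")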
Example 9
                out.writerow([case.id, "skip", second.cite])
                continue

            # type 3:
            if re.match(r'\d+ F\.2d \d+;.*$', second.cite):
                old_val = second.cite
                second.cite = old_val.split(';', 1)[0]
                to_update.append(second)
                changed_cites.add(old_val)
                changed_cites.add(second.cite)
                out.writerow([case.id, "update", old_val, second.cite])
                continue

        # type 4:
        delete_cites = [c.cite for c in rest]
        changed_cites.update(delete_cites)
        out.writerow([case.id, "delete"] + delete_cites)
        to_delete.extend(rest)

    # apply edits:
    if dry_run == 'false':
        with EditLog(
            description='Remove incorrectly-detected citations from Ct. Cl. reporter. '
            'See https://github.com/harvard-lil/capstone/issues/1192'
        ).record():
            Citation.objects.bulk_update(to_update, ['cite'])
            for obj in to_delete:
                obj.delete()
            CaseMetadata.update_frontend_urls(changed_cites)
Example 10
                        if old_token != old_val:
                            raise Exception("attempt to edit out-of-date token")
                        tokens[index] = new_val

        # update case, if needed
        metadata_count = 0
        for field in metadata_fields:
            if field not in data['metadata']:
                continue
            old_val, new_val = data['metadata'][field]
            if getattr(case, field) == old_val:
                metadata_count += 1
                setattr(case, field, new_val)

        if metadata_count or word_count:
            with EditLog(description='Case %s edited by user %s: %s metadata fields, %s words' % (case.id, request.user.id, metadata_count, word_count), user_id=request.user.id).record():
                CorrectionLog(description=data['description'], user_id=request.user.id, case=case).save()
                if pages_to_save:
                    PageStructure.objects.bulk_update(pages_to_save, ['blocks'])
                    case.sync_case_body_cache(blocks_by_id=PageStructure.blocks_by_id(pages))

                    # re-extract citations
                    existing_cites = {c.cite: c for c in ExtractedCitation.objects.filter(cited_by=case)}
                    new_cites = {c.cite: c for c in case.extract_citations()[0]}
                    ExtractedCitation.objects.filter(id__in=[v.id for k, v in existing_cites.items() if k not in new_cites]).delete()
                    ExtractedCitation.objects.bulk_create([v for k, v in new_cites.items() if k not in existing_cites])
                case.save()
                case.reindex()  # manual reindex for instant results

        return HttpResponse('OK')
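The re-extraction block above is a set-difference update keyed on the cite string: extracted cites that disappeared are deleted, newly extracted ones are bulk-created, and cites present on both sides are left alone. The same pattern in isolation, with plain dicts standing in for ExtractedCitation rows:

# The diff pattern from the view above, reduced to plain data (values invented):
existing_cites = {"1 U.S. 1": "db-row-1", "2 U.S. 2": "db-row-2"}   # from the database
new_cites = {"2 U.S. 2": "extracted-2", "3 U.S. 3": "extracted-3"}  # from re-extraction

to_delete = [v for k, v in existing_cites.items() if k not in new_cites]  # ["db-row-1"]
to_create = [v for k, v in new_cites.items() if k not in existing_cites]  # ["extracted-3"]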