Esempio n. 1
0
def _read_checkpoint(path):
    """Return the integer stored on the first line of ``path``, or 0.

    0 is returned when the file does not exist yet, or when its first line
    cannot be parsed as an integer (for instance because the file is empty).
    Any other I/O error is propagated to the caller.
    """
    import errno

    try:
        with open(path, "r") as checkpoint:
            return int(checkpoint.readline())
    except ValueError:
        # The checkpoint is empty or otherwise unparseable.
        return 0
    except IOError as e:
        if e.errno != errno.ENOENT:
            raise
        # No checkpoint has been written yet: start from the beginning.
        return 0


def _write_checkpoint(path, value):
    """Overwrite ``path`` so that it contains only ``str(value)``."""
    with open(path, "w") as checkpoint:
        checkpoint.write(str(value))


def import_f3():
    """Iterate over the F3 documents and import them.

    A couple complications belie what would otherwise be a simple process:
     1. Duplicate detection. This is done by filtering by query and then
        refining the results that are found. For more details, see the
        dup_finder code.
     2. Merging duplicate documents. See their code in the f3_helpers module.

    Progress is checkpointed in two files under ``Resource.org/logs``:
    ``vol_file.txt`` holds the index of the volume being processed and
    ``case_file.txt`` holds the index of the next case inside that volume.
    An interrupted run resumes from those values; a missing or empty
    checkpoint counts as 0.
    """
    simulate = False
    corpus = dup_helpers.Corpus("%s/Resource.org/data/F3/" % settings.INSTALL_ROOT)
    vol_path = "%s/Resource.org/logs/vol_file.txt" % settings.INSTALL_ROOT
    case_path = "%s/Resource.org/logs/case_file.txt" % settings.INSTALL_ROOT

    volume_num = _read_checkpoint(vol_path)
    # The stored case offset only applies to the volume being resumed; every
    # later volume starts from its first case (see the reset further down).
    case_num = _read_checkpoint(case_path)

    # NOTE: print(...) with a single argument produces the same output under
    # Python 2's print statement, which the rest of this file relies on.
    for volume in corpus[volume_num:]:
        print("################")
        print(" Vol: %s" % volume_num)
        print("################")
        print("Case: %s" % case_num)
        for case in volume[case_num:]:
            if dup_helpers.need_dup_check_for_date_and_court(case):
                run_dup_check(case, simulate)
            else:
                print("Dup check not needed. Adding the opinion.")
                if not simulate:
                    dup_helpers.add_case(case)

            # save our location within the volume
            case_num += 1
            _write_checkpoint(case_path, case_num)

        # The volume is finished. Reset the case offset *before* advancing the
        # volume number: if the run is interrupted between the two writes, the
        # next run repeats this volume instead of silently skipping the first
        # cases of the following one.
        case_num = 0
        _write_checkpoint(case_path, case_num)

        # save our location within the corpus
        volume_num += 1
        _write_checkpoint(vol_path, volume_num)
Esempio n. 2
0
def _prompt_for_duplicates(case, candidates):
    """Ask the operator, one candidate at a time, whether it duplicates ``case``.

    Prints the case name, docket number and URLs of both documents, then
    reads an answer from stdin. An empty answer counts as "yes", and the
    answer is compared case-insensitively so that "Y" matches the ``[Y/n]``
    prompt.

    Returns the list of ``candidate["id"]`` values confirmed as duplicates.
    """
    duplicates = []
    for number, candidate in enumerate(candidates, 1):
        # Have to determine by "hand"
        print("  %s) Case name: %s" % (number, case.case_name))
        print("                %s" % candidate["caseName"])
        print("      Docket nums: %s" % case.docket_number)
        print("                   %s" % candidate["docketNumber"])
        print("      Candidate URL: %s" % case.download_url)
        print("      Match URL: https://www.courtlistener.com%s" % candidate["absolute_url"])

        choice = raw_input("Is this a duplicate? [Y/n]: ").strip().lower() or "y"
        if choice in ("y", "yes"):
            duplicates.append(candidate["id"])
    return duplicates


def run_dup_check(case, simulate=True):
    """Runs a series of duplicate checking code, generating and analyzing
    stats about whether the case is a duplicate.

    The checks run from cheapest to most expensive and stop at the first one
    that decides the outcome:
     1. No candidates: the case is added.
     2. Exactly one candidate with a matching docket number: simple merge.
     3. One or more candidates sharing the docket number (according to
        dup_helpers.find_same_docket_numbers): simple or complex merge.
     4. Otherwise the candidates are filtered by their stats, and whatever
        remains is shown to the operator for a manual decision.

    When ``simulate`` is true the decisions are only printed; nothing is added
    or merged.
    """
    print("Running dup check...")
    # stats takes the form: [count_from_search] or
    #                       [count_from_search,
    #                        count_from_docket_num,
    #                        [case_name_diff_1, diff_2, diff_3, etc],
    #                        [content_length_percent_diff_1, 2, 3],
    #                        [content_diff_1, 2, 3]
    #                       ]
    # candidates is a list of 0 to n possible duplicates
    stats, candidates = dup_finder.get_dup_stats(case)
    if len(candidates) == 0:
        print("  No candidates found. Adding the opinion.")
        if not simulate:
            dup_helpers.add_case(case)
        return

    # Docket numbers are compared after dropping every non-digit and every
    # zero, so differences in formatting and zero-padding are ignored.
    if len(candidates) == 1 and (
        re.sub(r"(\D|0)", "", case.docket_number) == re.sub(r"(\D|0)", "", candidates[0]["docketNumber"])
    ):
        # If the docket numbers are identical, and there was only
        # one result
        print("  Match made on docket number of single candidate. Merging the opinions.")
        if not simulate:
            dup_helpers.merge_cases_simple(case, candidates[0]["id"])
        return

    # Computed once and reused below; this assumes the helper returns the same
    # result for the same arguments (it used to be called for every check).
    same_docket = dup_helpers.find_same_docket_numbers(case, candidates)
    if len(same_docket) == 1:
        print("  One of the %s candidates had an identical docket number. Merging the opinions." % len(candidates))
        if not simulate:
            dup_helpers.merge_cases_simple(case, same_docket[0]["id"])
        return
    if len(same_docket) > 1:
        print("  Several of the %s candidates had an identical docket number. Merging the opinions." % len(candidates))
        if not simulate:
            target_ids = [can["id"] for can in same_docket]
            dup_helpers.merge_cases_complex(case, target_ids)
        return

    # Possible duplicate, filter out obviously bad cases, and
    # then pass forward for manual review if necessary.
    filtered_candidates, stats = dup_helpers.filter_by_stats(candidates, stats)
    if len(filtered_candidates) == 0:
        print("After filtering, no candidates remain. Adding the opinion.")
        if not simulate:
            dup_helpers.add_case(case)
        return

    print("FILTERED STATS: %s" % stats)
    duplicates = _prompt_for_duplicates(case, filtered_candidates)

    if len(duplicates) == 0:
        print("No duplicates found after manual determination. Adding the opinion.")
        if not simulate:
            dup_helpers.add_case(case)
    elif len(duplicates) == 1:
        print("Single duplicate found after manual determination. Merging the opinions.")
        if not simulate:
            dup_helpers.merge_cases_simple(case, duplicates[0])
    else:
        print("Multiple duplicates found after manual determination. Merging the opinions.")
        if not simulate:
            dup_helpers.merge_cases_complex(case, duplicates)