Example #1
# Assumes the surrounding project provides Document, trunc, slugify, anonymize,
# now, and save_doc_and_cite.
def merge_cases_simple(new, target_id):
    """Add `new` to the database, merging it into the Document with pk `target_id`.

    Merging is done by picking the best fields from each item.
    """
    target = Document.objects.get(pk=target_id)
    print("Merging %s with" % new.citation.case_name)
    print("        %s" % target.citation.case_name)

    cached_source = target.source  # Original value is needed below.
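    # Prepend 'L' (Lawbox) to the existing source code; 'C' and 'R' appear to
    # mean court and resource.org sources, judging by the comments below (an
    # inference, not documented here).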
    if target.source == 'C':
        target.source = 'LC'
    elif target.source == 'R':
        target.source = 'LR'
    elif target.source == 'CR':
        target.source = 'LCR'

    # Add the URL if it's not a court one, replacing resource.org's info in some cases.
    if cached_source == 'R':
        target.download_url = new.download_url

    # Recreate the slug from the new case name (this changes the URL, but the old one will keep working).
    target.citation.slug = trunc(slugify(new.citation.case_name), 50)

    # Take the case name from the new item; they tend to be pretty good
    target.citation.case_name = new.citation.case_name

    # Add the docket number if the old one is missing; keep the existing one otherwise.
    if not target.citation.docket_number:
        target.citation.docket_number = new.citation.docket_number

    # Get the citations from the new item (ditch the old).
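    # Note: these assignments are unconditional, so an empty cite on `new`
    # will overwrite an existing cite on `target`.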
    target.citation.federal_cite_one = new.citation.federal_cite_one
    target.citation.federal_cite_two = new.citation.federal_cite_two
    target.citation.federal_cite_three = new.citation.federal_cite_three
    target.citation.state_cite_one = new.citation.state_cite_one
    target.citation.state_cite_two = new.citation.state_cite_two
    target.citation.state_cite_three = new.citation.state_cite_three
    target.citation.state_cite_regional = new.citation.state_cite_regional
    target.citation.specialty_cite_one = new.citation.specialty_cite_one
    target.citation.scotus_early_cite = new.citation.scotus_early_cite
    target.citation.lexis_cite = new.citation.lexis_cite
    target.citation.westlaw_cite = new.citation.westlaw_cite
    target.citation.neutral_cite = new.citation.neutral_cite

    # Add judge information if lacking. The new data is dirty, but better than none.
    if not target.judges:
        target.judges = new.judges

    # Add the text.
    target.html_lawbox, blocked = anonymize(new.html)
    if blocked:
        target.blocked = True
        target.date_blocked = now()

    target.extracted_by_ocr = False  # No longer true for any LB case.

    save_doc_and_cite(target, index=False)
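
A minimal usage sketch (the pk 42 and the file path below are hypothetical; import_law_box_case is the loader used in Example #2):

new_doc = import_law_box_case("/path/to/case.html")  # hypothetical path
merge_cases_simple(new_doc, target_id=42)  # merge into the Document with pk 42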
Example #2
import argparse
import datetime
import os
import pickle
import random
import sys
import traceback

# Project-specific helpers (readable_dir, import_law_box_case, find_duplicates,
# anonymize, now, save_doc_and_cite, dup_helpers, log_print, db, DEBUG, and the
# `fixes` object) are assumed to come from the surrounding module.


def main():
    parser = argparse.ArgumentParser(description="Import the corpus provided by lawbox")
    parser.add_argument(
        "-s",
        "--simulate",
        default=False,
        required=False,
        action="store_true",
        help="Run the code in simulate mode, making no permanent changes.",
    )
    parser.add_argument("-d", "--dir", type=readable_dir, help="The directory where the lawbox bulk data can be found.")
    parser.add_argument(
        "-f",
        "--file",
        type=str,
        default="index.txt",
        required=False,
        dest="file_name",
        help="The file that has all the URLs to import, one per line.",
    )
    parser.add_argument(
        "-l",
        "--line",
        type=int,
        default=1,
        required=False,
        help="If provided, this will be the line number in the index file where we resume processing.",
    )
    parser.add_argument(
        "-r",
        "--resume",
        default=False,
        required=False,
        action="store_true",
        help="Use the saved marker to resume operation where it last failed.",
    )
    parser.add_argument(
        "-x",
        "--random",
        default=False,
        required=False,
        action="store_true",
        help="Pick cases randomly rather than serially.",
    )
    parser.add_argument(
        "-m",
        "--marker",
        type=str,
        default="lawbox_progress_marker.txt",
        required=False,
        help="The name of the file that tracks the progress (useful if multiple versions run at same time)",
    )
    parser.add_argument(
        "-e", "--end", type=int, required=False, default=2000000, help="An optional endpoint for an importer."
    )
    args = parser.parse_args()

    if args.dir:

        def case_generator(dir_root):
            """Yield case file paths one at a time by walking the import directory."""
            for root, _dirnames, filenames in os.walk(dir_root):
                for filename in filenames:
                    yield os.path.join(root, filename)

        cases = case_generator(args.dir)
        i = 0
    else:

        def generate_random_line(file_name):
            """Yield lines picked from random byte offsets in the file.

            Longer lines are proportionally more likely to be chosen.
            """
            while True:
                total_bytes = os.stat(file_name).st_size
                random_point = random.randint(0, total_bytes)
                with open(file_name) as f:
                    f.seek(random_point)
                    f.readline()  # Discard the (likely partial) line at the seek point.
                    yield f.readline().strip()

        def case_generator(line_number):
            """Yield cases from the index file, starting at line_number."""
            start = line_number - 1  # enumerate() is zero-indexed; file lines are one-indexed.
            with open(args.file_name) as index_file:
                for i, line in enumerate(index_file):
                    if i >= start:
                        yield line.strip()

        if args.random:
            cases = generate_random_line(args.file_name)
            i = 0
        elif args.resume:
            with open(args.marker) as marker:
                resume_point = int(marker.read().strip())
            cases = case_generator(resume_point)
            i = resume_point
        else:
            cases = case_generator(args.line)
            i = args.line

    for case_path in cases:
        if i % 1000 == 0:
            db.reset_queries()  # Else we leak memory when DEBUG is True

        if "counter" in DEBUG:  # and i % 1000 == 0:
            log_print("\n%s: Doing case (%s): file://%s" % (datetime.datetime.now(), i, case_path))
        try:
            doc = import_law_box_case(case_path)
            duplicates = find_duplicates(doc, case_path)
            if not args.simulate:
                if len(duplicates) == 0:
                    doc.html_lawbox, blocked = anonymize(doc.html)
                    doc.html = ""
                    if blocked:
                        doc.blocked = True
                        doc.date_blocked = now()
                    # Save nothing to the index for now (it'll get done when we find citations).
                    save_doc_and_cite(doc, index=False)
                elif len(duplicates) == 1:
                    dup_helpers.merge_cases_simple(doc, duplicates[0])
                else:
                    # A complex merge of several duplicates isn't implemented;
                    # just log the case so it can be handled later.
                    if "log_multimerge" in DEBUG:
                        with open("index_multimerge.txt", "a") as log:
                            log.write("%s\n" % case_path)
            if args.resume:
                # Don't change the progress marker unless you're in resume mode.
                with open(args.marker, "w") as marker:
                    marker.write(str(i + 1))  # File lines are one-indexed, not zero-indexed.
            with open("lawbox_fix_file.pkl", "wb") as fix_file:
                pickle.dump(fixes, fix_file)
            i += 1
            if i == args.end:
                log_print("Hit the endpoint after importing number %s. Breaking." % i)
                break
        except Exception:
            log_print(traceback.format_exc())
            sys.exit(1)
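
Example invocations (the script name import_law_box.py is an assumption; the flags are the ones defined above):

python import_law_box.py --dir /data/lawbox  # walk a directory of case files
python import_law_box.py --line 5000         # start at line 5000 of index.txt
python import_law_box.py --resume            # pick up from the saved progress marker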