Example #1
def extract_doc_content(pk, callback=None, citation_countdown=0):
    """
    Given a document, we extract it, sniffing its extension, then store its
    contents in the database.  Finally, we asynchronously find citations in
    the document content and match them to other documents.

    TODO: this implementation cannot be distributed due to using local paths.
    """
    d = Document.objects.get(pk=pk)

    path = d.local_path.path

    extension = path.split('.')[-1]
    if extension == 'doc':
        content, err = extract_from_doc(path, DEVNULL)
    elif extension == 'html':
        content, err = extract_from_html(path)
    elif extension == 'pdf':
        d, content, err = extract_from_pdf(d, path, DEVNULL, callback)
    elif extension == 'txt':
        content, err = extract_from_txt(path)
    elif extension == 'wpd':
        d, content, err = extract_from_wpd(d, path, DEVNULL)
    else:
        print(
            '*****Unable to extract content due to unknown extension: %s '
            'on d: %s****' % (extension, d))
        return 2

    if extension in ['html', 'wpd']:
        d.html, blocked = anonymize(content)
    else:
        d.plain_text, blocked = anonymize(content)

    if blocked:
        d.blocked = True
        d.date_blocked = now()

    if err:
        print "****Error extracting text from %s: %s****" % (extension, d)
        return d

    try:
        if citation_countdown == 0:
            # No waiting around. Save to the database now, but don't bother
            # with the index yet because citations are being done imminently.
            d.save(index=False)
        else:
            # Save to the index now, citations come later, commit comes
            # according to schedule
            d.save(index=True)
    except Exception, e:
        print "****Error saving text to the db for: %s****" % d
        print traceback.format_exc()
        return d
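A quick way to exercise the function above is a direct, synchronous call. The docstring says citation matching happens asynchronously in the full project, but no task decorator appears in this excerpt, so the sketch below simply calls the function; pk=42 is a hypothetical primary key for an existing Document.

# Hedged usage sketch: pk=42 is a made-up primary key, not a real record.
result = extract_doc_content(42, citation_countdown=0)
if result == 2:
    print("Unknown extension; nothing was extracted.")
elif result is not None:
    # A Document only comes back on the error paths (extraction error or a
    # failed save); a clean run falls through the function and returns None.
    print("Extraction hit a problem for document: %s" % result)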
Example #2
def extract_doc_content(pk, callback=None, citation_countdown=0):
    """
    Given a document, we extract it, sniffing its extension, then store its
    contents in the database.  Finally, we asynchronously find citations in
    the document content and match them to other documents.

    TODO: this implementation cannot be distributed due to using local paths.
    """
    doc = Document.objects.get(pk=pk)

    path = str(doc.local_path)
    path = os.path.join(settings.MEDIA_ROOT, path)

    extension = path.split('.')[-1]
    if extension == 'doc':
        content, err = extract_from_doc(path, DEVNULL)
    elif extension == 'html':
        content, err = extract_from_html(path)
    elif extension == 'pdf':
        doc, content, err = extract_from_pdf(doc, path, DEVNULL, callback)
    elif extension == 'txt':
        content, err = extract_from_txt(path)
    elif extension == 'wpd':
        doc, content, err = extract_from_wpd(doc, path, DEVNULL)
    else:
        print ('*****Unable to extract content due to unknown extension: %s '
               'on doc: %s****' % (extension, doc))
        return 2

    if extension in ['html', 'wpd']:
        doc.html, blocked = anonymize(content)
    else:
        doc.plain_text, blocked = anonymize(content)

    if blocked:
        doc.blocked = True
        doc.date_blocked = now()

    if err:
        print "****Error extracting text from %s: %s****" % (extension, doc)
        return doc

    try:
        if citation_countdown == 0:
            # No waiting around. Save to the database now, but don't bother
            # with the index yet because citations are being done imminently.
            doc.save(index=False)
        else:
            # Save to the index now, citations come later, commit comes
            # according to schedule
            doc.save(index=True)
    except Exception, e:
        print "****Error saving text to the db for: %s****" % doc
        print traceback.format_exc()
        return doc
Example #3
def extract_doc_content(pk, callback=None, citation_countdown=0):
    """
    Given a document, we extract it, sniffing its extension, then store its
    contents in the database.  Finally, we asynchronously find citations in
    the document content and match them to other documents.

    TODO: this implementation cannot be distributed due to using local paths.
    """
    doc = Document.objects.get(pk=pk)

    path = str(doc.local_path)
    path = os.path.join(settings.MEDIA_ROOT, path)

    extension = path.split('.')[-1]
    if extension == 'doc':
        content, err = extract_from_doc(path, DEVNULL)
    elif extension == 'html':
        content, err = extract_from_html(path)
    elif extension == 'pdf':
        doc, content, err = extract_from_pdf(doc, path, DEVNULL, callback)
    elif extension == 'txt':
        content, err = extract_from_txt(path)
    elif extension == 'wpd':
        doc, content, err = extract_from_wpd(doc, path, DEVNULL)
    else:
        print(
            '*****Unable to extract content due to unknown extension: %s '
            'on doc: %s****' % (extension, doc))
        return 2

    if extension in ['html', 'wpd']:
        doc.html, blocked = anonymize(content)
    else:
        doc.plain_text, blocked = anonymize(content)

    if blocked:
        doc.blocked = True
        doc.date_blocked = now()

    if err:
        print "****Error extracting text from %s: %s****" % (extension, doc)
        return doc

    try:
        if citation_countdown == 0:
            doc.save(index=False)
        else:
            doc.save(index=True, commit=False)
    except Exception, e:
        print "****Error saving text to the db for: %s****" % doc
        print traceback.format_exc()
        return doc
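All three variants above, and every other example on this page, unpack anonymize() into a (text, blocked) pair and set blocked/date_blocked whenever the second value is truthy. The real implementation is not included here; the sketch below shows a compatible helper, with the social-security-style pattern chosen purely for illustration.

import re

# Hypothetical pattern; the project's actual redaction rules are not shown here.
_ID_PATTERN = re.compile(r'\b\d{3}-\d{2}-\d{4}\b')

def anonymize(text):
    """Return (cleaned_text, modified) the way every caller on this page expects.

    Sketch only: the real rules are more involved (see the cleaner() docstring
    below about IDs surrounded by punctuation).
    """
    cleaned, num_subs = _ID_PATTERN.subn('XXX-XX-XXXX', text)
    return cleaned, num_subs > 0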
Example #4
def cleaner(simulate=False, verbose=False):
    """Re-run the anonymize function across the whole corpus.

    The anonymize function was previously missing any documents that contained
    punctuation before or after an ID. This script re-runs the function, fixing
    the error.
    """
    docs = queryset_generator(Document.objects.all())
    for doc in docs:
        text = doc.plain_text
        clean_lines = []
        any_mods = []
        for line in text.split('\n'):
            clean_line, modified = anonymize(line)
            if modified:
                print "Fixing text in document: %s" % doc.pk
                print "Line reads: %s" % line
                fix = raw_input("Fix the line? [Y/n]: ") or 'y'
                if fix.lower() == 'y':
                    clean_lines.append(clean_line)
                    any_mods.append(modified)
                else:
                    clean_lines.append(line)
            else:
                clean_lines.append(line)

        if not simulate and any(any_mods):
            doc.plain_text = '\n'.join(clean_lines)
            doc.blocked = True
            doc.date_blocked = now()
            doc.save()
Example #5
def cleaner(simulate=False, verbose=False):
    """Re-run the anonymize function across the whole corpus.

    The anonymize function was previously missing any documents that contained
    punctuation before or after an ID. This script re-runs the function, fixing
    the error.
    """
    docs = queryset_generator(Document.objects.all())
    for doc in docs:
        text = doc.plain_text
        clean_lines = []
        any_mods = []
        for line in text.split('\n'):
            clean_line, modified = anonymize(line)
            if modified:
                print "Fixing text in document: %s" % doc.pk
                print "Line reads: %s" % line
                fix = raw_input("Fix the line? [Y/n]: ") or 'y'
                if fix.lower() == 'y':
                    clean_lines.append(clean_line)
                    any_mods.append(modified)
                else:
                    clean_lines.append(line)
            else:
                clean_lines.append(line)

        if not simulate and any(any_mods):
            doc.plain_text = '\n'.join(clean_lines)
            doc.blocked = True
            doc.date_blocked = now()
            doc.save()
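Both copies of cleaner() walk the corpus through queryset_generator() instead of looping over Document.objects.all() directly, which keeps memory bounded on a large table. That utility is not shown on this page; below is a rough sketch of one standard way to build such a generator by chunking on the primary key (the chunk size and pk ordering are assumptions).

def queryset_generator_sketch(queryset, chunk_size=1000):
    """Yield rows in primary-key order one chunk at a time so Django never
    materializes the whole corpus at once. Illustrative only."""
    last_pk = 0
    while True:
        chunk = list(queryset.filter(pk__gt=last_pk).order_by('pk')[:chunk_size])
        if not chunk:
            break
        for obj in chunk:
            yield obj
        last_pk = chunk[-1].pk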
Example #6
def merge_cases_simple(new, target_id):
    """Add `new` to the database, merging with target_id

     Merging is done by picking the best fields from each item.
    """
    target = Document.objects.get(pk=target_id)
    print "Merging %s with" % new.citation.case_name
    print "        %s" % target.citation.case_name

    cached_source = target.source  # Original value is needed below.
    if target.source == 'C':
        target.source = 'LC'
    elif target.source == 'R':
        target.source = 'LR'
    elif target.source == 'CR':
        target.source = 'LCR'

    # Add the URL if it's not a court one, replacing resource.org's info in some cases.
    if cached_source == 'R':
        target.download_url = new.download_url

    # Recreate the slug from the new case name (this changes the URL, but the old will continue working)
    target.citation.slug = trunc(slugify(new.citation.case_name), 50)

    # Take the case name from the new item; they tend to be pretty good
    target.citation.case_name = new.citation.case_name

    # Add the docket number if the old doesn't exist, but keep the old if one does.
    if not target.citation.docket_number:
        target.citation.docket_number = new.citation.docket_number

    # Get the citations from the new item (ditch the old).
    target.citation.federal_cite_one = new.citation.federal_cite_one
    target.citation.federal_cite_two = new.citation.federal_cite_two
    target.citation.federal_cite_three = new.citation.federal_cite_three
    target.citation.state_cite_one = new.citation.state_cite_one
    target.citation.state_cite_two = new.citation.state_cite_two
    target.citation.state_cite_three = new.citation.state_cite_three
    target.citation.state_cite_regional = new.citation.state_cite_regional
    target.citation.specialty_cite_one = new.citation.specialty_cite_one
    target.citation.scotus_early_cite = new.citation.scotus_early_cite
    target.citation.lexis_cite = new.citation.lexis_cite
    target.citation.westlaw_cite = new.citation.westlaw_cite
    target.citation.neutral_cite = new.citation.neutral_cite

    # Add judge information if lacking. New is dirty, but better than none.
    if not target.judges:
        target.judges = new.judges

    # Add the text.
    target.html_lawbox, blocked = anonymize(new.html)
    if blocked:
        target.blocked = True
        target.date_blocked = now()

    target.extracted_by_ocr = False  # No longer true for any LB case.

    save_doc_and_cite(target, index=False)
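The if/elif chain at the top of merge_cases_simple prepends an 'L' (lawbox) to whatever source code the target already carries: 'C' becomes 'LC', 'R' becomes 'LR', 'CR' becomes 'LCR'. The same bookkeeping can be written as a lookup table; a sketch equivalent to the chain above, where unlisted codes are left untouched:

# Equivalent to the if/elif chain above; codes not in the table are kept as-is.
_LAWBOX_SOURCE_UPGRADE = {'C': 'LC', 'R': 'LR', 'CR': 'LCR'}
target.source = _LAWBOX_SOURCE_UPGRADE.get(target.source, target.source)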
Example #7
def merge_cases_simple(new, target_id):
    """Add `new` to the database, merging with target_id

     Merging is done by picking the best fields from each item.
    """
    target = Document.objects.get(pk=target_id)
    print "Merging %s with" % new.citation.case_name
    print "        %s" % target.citation.case_name

    cached_source = target.source  # Original value is needed below.
    if target.source == 'C':
        target.source = 'LC'
    elif target.source == 'R':
        target.source = 'LR'
    elif target.source == 'CR':
        target.source = 'LCR'

    # Add the URL if it's not a court one, replacing resource.org's info in some cases.
    if cached_source == 'R':
        target.download_url = new.download_url

    # Recreate the slug from the new case name (this changes the URL, but the old will continue working)
    target.citation.slug = trunc(slugify(new.citation.case_name), 50)

    # Take the case name from the new item; they tend to be pretty good
    target.citation.case_name = new.citation.case_name

    # Add the docket number if the old doesn't exist, but keep the old if one does.
    if not target.citation.docket_number:
        target.citation.docket_number = new.citation.docket_number

    # Get the citations from the new item (ditch the old).
    target.citation.federal_cite_one = new.citation.federal_cite_one
    target.citation.federal_cite_two = new.citation.federal_cite_two
    target.citation.federal_cite_three = new.citation.federal_cite_three
    target.citation.state_cite_one = new.citation.state_cite_one
    target.citation.state_cite_two = new.citation.state_cite_two
    target.citation.state_cite_three = new.citation.state_cite_three
    target.citation.state_cite_regional = new.citation.state_cite_regional
    target.citation.specialty_cite_one = new.citation.specialty_cite_one
    target.citation.scotus_early_cite = new.citation.scotus_early_cite
    target.citation.lexis_cite = new.citation.lexis_cite
    target.citation.westlaw_cite = new.citation.westlaw_cite
    target.citation.neutral_cite = new.citation.neutral_cite

    # Add judge information if lacking. New is dirty, but better than none.
    if not target.judges:
        target.judges = new.judges

    # Add the text.
    target.html_lawbox, blocked = anonymize(new.html)
    if blocked:
        target.blocked = True
        target.date_blocked = now()

    target.extracted_by_ocr = False  # No longer true for any LB case.

    save_doc_and_cite(target, index=False)
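Both copies of the merge routine finish with save_doc_and_cite(target, index=False), and main() below calls the same helper on freshly imported documents. Its body is not part of this page; judging from the import_resource_org_item examples further down, which save the Citation first and re-attach it before saving the Document, a plausible sketch looks like this (assumed behavior, not the project's actual code):

def save_doc_and_cite_sketch(doc, index=False):
    """Persist the related Citation, re-attach it so the fresh primary key is
    picked up, then save the Document, forwarding the index flag to
    Document.save(). Assumed behavior based on the calls seen on this page."""
    cite = doc.citation
    cite.save()
    doc.citation = cite
    doc.save(index=index)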
def main():
    parser = argparse.ArgumentParser(description="Import the corpus provided by lawbox")
    parser.add_argument(
        "-s",
        "--simulate",
        default=False,
        required=False,
        action="store_true",
        help="Run the code in simulate mode, making no permanent changes.",
    )
    parser.add_argument("-d", "--dir", type=readable_dir, help="The directory where the lawbox bulk data can be found.")
    parser.add_argument(
        "-f",
        "--file",
        type=str,
        default="index.txt",
        required=False,
        dest="file_name",
        help="The file that has all the URLs to import, one per line.",
    )
    parser.add_argument(
        "-l",
        "--line",
        type=int,
        default=1,
        required=False,
        help="If provided, this will be the line number in the index file where we resume processing.",
    )
    parser.add_argument(
        "-r",
        "--resume",
        default=False,
        required=False,
        action="store_true",
        help="Use the saved marker to resume operation where it last failed.",
    )
    parser.add_argument(
        "-x",
        "--random",
        default=False,
        required=False,
        action="store_true",
        help="Pick cases randomly rather than serially.",
    )
    parser.add_argument(
        "-m",
        "--marker",
        type=str,
        default="lawbox_progress_marker.txt",
        required=False,
        help="The name of the file that tracks the progress (useful if multiple versions run at same time)",
    )
    parser.add_argument(
        "-e", "--end", type=int, required=False, default=2000000, help="An optional endpoint for an importer."
    )
    args = parser.parse_args()

    if args.dir:

        def case_generator(dir_root):
            """Yield cases, one by one to the importer by recursing and iterating the import directory"""
            for root, dirnames, filenames in os.walk(dir_root):
                for filename in fnmatch.filter(filenames, "*"):
                    yield os.path.join(root, filename)

        cases = case_generator(args.dir)
        i = 0
    else:

        def generate_random_line(file_name):
            while True:
                total_bytes = os.stat(file_name).st_size
                random_point = random.randint(0, total_bytes)
                f = open(file_name)
                f.seek(random_point)
                f.readline()  # skip this line to clear the partial line
                yield f.readline().strip()

        def case_generator(line_number):
            """Yield cases from the index file."""
            enumerated_line_number = line_number - 1  # The enumeration is zero-index, but files are one-index.
            index_file = open(args.file_name)
            for i, line in enumerate(index_file):
                if i >= enumerated_line_number:
                    yield line.strip()

        if args.random:
            cases = generate_random_line(args.file_name)
            i = 0
        elif args.resume:
            with open(args.marker) as marker:
                resume_point = int(marker.read().strip())
            cases = case_generator(resume_point)
            i = resume_point
        else:
            cases = case_generator(args.line)
            i = args.line

    for case_path in cases:
        if i % 1000 == 0:
            db.reset_queries()  # Else we leak memory when DEBUG is True

        if "counter" in DEBUG:  # and i % 1000 == 0:
            log_print("\n%s: Doing case (%s): file://%s" % (datetime.datetime.now(), i, case_path))
        try:
            doc = import_law_box_case(case_path)
            duplicates = find_duplicates(doc, case_path)
            if not args.simulate:
                if len(duplicates) == 0:
                    doc.html_lawbox, blocked = anonymize(doc.html)
                    doc.html = ""
                    if blocked:
                        doc.blocked = True
                        doc.date_blocked = now()
                        # Save nothing to the index for now (it'll get done when we find citations)
                    save_doc_and_cite(doc, index=False)
                if len(duplicates) == 1:
                    dup_helpers.merge_cases_simple(doc, duplicates[0])
                if len(duplicates) > 1:
                    # complex_merge
                    if "log_multimerge" in DEBUG:
                        with open("index_multimerge.txt", "a") as log:
                            log.write("%s\n" % case_path)
            if args.resume:
                # Don't change the progress marker unless you're in resume mode.
                with open(args.marker, "w") as marker:
                    marker.write(str(i + 1))  # Files are one-index, not zero-index
            with open("lawbox_fix_file.pkl", "wb") as fix_file:
                pickle.dump(fixes, fix_file)
            i += 1
            if i == args.end:
                log_print("Hit the endpoint after importing number %s. Breaking." % i)
                break
        except Exception, err:
            log_print(traceback.format_exc())
            exit(1)
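The --dir option is validated through type=readable_dir, a callable that argparse invokes on the raw string. That validator is not shown here; a sketch of what such a type function usually looks like, raising ArgumentTypeError so argparse reports a clean usage error:

import argparse
import os

def readable_dir_sketch(path):
    """argparse 'type' callable in the spirit of readable_dir above: accept the
    value only if it names an existing, readable directory. Sketch only."""
    if not os.path.isdir(path):
        raise argparse.ArgumentTypeError("%s is not a directory" % path)
    if not os.access(path, os.R_OK):
        raise argparse.ArgumentTypeError("%s is not readable" % path)
    return path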
def main():
    parser = argparse.ArgumentParser(
        description='Import the corpus provided by lawbox')
    parser.add_argument(
        '-s',
        '--simulate',
        default=False,
        required=False,
        action='store_true',
        help='Run the code in simulate mode, making no permanent changes.')
    parser.add_argument(
        '-d',
        '--dir',
        type=readable_dir,
        help='The directory where the lawbox bulk data can be found.')
    parser.add_argument(
        '-f',
        '--file',
        type=str,
        default="index.txt",
        required=False,
        dest="file_name",
        help="The file that has all the URLs to import, one per line.")
    parser.add_argument(
        '-l',
        '--line',
        type=int,
        default=1,
        required=False,
        help=
        'If provided, this will be the line number in the index file where we resume processing.'
    )
    parser.add_argument(
        '-r',
        '--resume',
        default=False,
        required=False,
        action='store_true',
        help='Use the saved marker to resume operation where it last failed.')
    parser.add_argument('-x',
                        '--random',
                        default=False,
                        required=False,
                        action='store_true',
                        help='Pick cases randomly rather than serially.')
    parser.add_argument(
        '-m',
        '--marker',
        type=str,
        default='lawbox_progress_marker.txt',
        required=False,
        help=
        "The name of the file that tracks the progress (useful if multiple versions run at same time)"
    )
    parser.add_argument('-e',
                        '--end',
                        type=int,
                        required=False,
                        default=2000000,
                        help="An optional endpoint for an importer.")
    args = parser.parse_args()

    if args.dir:

        def case_generator(dir_root):
            """Yield cases, one by one to the importer by recursing and iterating the import directory"""
            for root, dirnames, filenames in os.walk(dir_root):
                for filename in fnmatch.filter(filenames, '*'):
                    yield os.path.join(root, filename)

        cases = case_generator(args.dir)
        i = 0
    else:

        def generate_random_line(file_name):
            while True:
                total_bytes = os.stat(file_name).st_size
                random_point = random.randint(0, total_bytes)
                f = open(file_name)
                f.seek(random_point)
                f.readline()  # skip this line to clear the partial line
                yield f.readline().strip()

        def case_generator(line_number):
            """Yield cases from the index file."""
            enumerated_line_number = line_number - 1  # The enumeration is zero-index, but files are one-index.
            index_file = open(args.file_name)
            for i, line in enumerate(index_file):
                if i >= enumerated_line_number:
                    yield line.strip()

        if args.random:
            cases = generate_random_line(args.file_name)
            i = 0
        elif args.resume:
            with open(args.marker) as marker:
                resume_point = int(marker.read().strip())
            cases = case_generator(resume_point)
            i = resume_point
        else:
            cases = case_generator(args.line)
            i = args.line

    for case_path in cases:
        if i % 1000 == 0:
            db.reset_queries()  # Else we leak memory when DEBUG is True

        if 'counter' in DEBUG:  #and i % 1000 == 0:
            log_print("\n%s: Doing case (%s): file://%s" %
                      (datetime.datetime.now(), i, case_path))
        try:
            doc = import_law_box_case(case_path)
            duplicates = find_duplicates(doc, case_path)
            if not args.simulate:
                if len(duplicates) == 0:
                    doc.html_lawbox, blocked = anonymize(doc.html)
                    doc.html = ''
                    if blocked:
                        doc.blocked = True
                        doc.date_blocked = now()
                        # Save nothing to the index for now (it'll get done when we find citations)
                    save_doc_and_cite(doc, index=False)
                if len(duplicates) == 1:
                    dup_helpers.merge_cases_simple(doc, duplicates[0])
                if len(duplicates) > 1:
                    #complex_merge
                    if 'log_multimerge' in DEBUG:
                        with open('index_multimerge.txt', 'a') as log:
                            log.write('%s\n' % case_path)
            if args.resume:
                # Don't change the progress marker unless you're in resume mode.
                with open(args.marker, 'w') as marker:
                    marker.write(str(i +
                                     1))  # Files are one-index, not zero-index
            with open('lawbox_fix_file.pkl', 'wb') as fix_file:
                pickle.dump(fixes, fix_file)
            i += 1
            if i == args.end:
                log_print(
                    "Hit the endpoint after importing number %s. Breaking." %
                    i)
                break
        except Exception, err:
            log_print(traceback.format_exc())
            exit(1)
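generate_random_line in both main() variants seeks to a random byte offset, discards the partial line it landed in, and yields the next full one. If the offset falls inside the last line, the second readline() returns an empty string, and long lines make their successors more likely to be picked. The sketch below retries on the empty-string case; the mild bias is left alone, since uniform sampling is presumably not critical for spot-checking imports.

import os
import random

def generate_random_line_sketch(file_name):
    """Yield random complete lines from file_name, retrying when the random
    seek lands in the trailing line and readline() comes back empty."""
    total_bytes = os.stat(file_name).st_size
    while True:
        with open(file_name) as f:
            f.seek(random.randint(0, total_bytes))
            f.readline()  # throw away the partial line at the seek point
            line = f.readline().strip()
        if line:
            yield line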
def import_resource_org_item(case_location):
    """Using the path to a case, import it, gathering all needed meta data.

    Path is any valid URI that the requests library can handle.
    """
    def get_file(location):
        if location.startswith('/'):
            with open(location) as f:
                r = requests.Session()
                r.content = f.read()
        else:
            r = requests.get(location)
        return fromstring(r.content), get_clean_body_content(r.content)

    # Get trees and text for the opinion itself and for the index page
    # that links to it. Each has useful data.
    case_tree, case_text = get_file(case_location)
    vol_location = case_location.rsplit('/', 1)[-2] + '/index.html'
    vol_tree, vol_text = get_file(vol_location)

    html, blocked = anonymize(get_case_body(case_tree))

    case_location_relative = case_location.rsplit('/', 1)[1]
    case_name, status = get_case_name_and_status(
        vol_tree, case_location_relative)
    cite = Citation(
        case_name=case_name,
        docket_number=get_docket_number(case_location),
        federal_cite_one=get_west_cite(vol_tree, case_location_relative),
    )
    docket = Docket(
        court=Court.objects.get(pk=get_court_id(case_tree)),
        case_name=case_name,
    )
    doc = Document(
        date_filed=get_date_filed(vol_tree, case_location_relative),
        source='R',
        sha1=hashlib.sha1(case_text).hexdigest(),
        citation=cite,
        docket=docket,
        download_url=case_location,
        html=html,
        precedential_status=status,
    )
    if blocked:
        doc.blocked = True
        docket.blocked = True
        doc.date_blocked = datetime.date.today()
        docket.date_blocked = datetime.date.today()

    cite.save()
    docket.save()
    doc.docket = docket
    doc.citation = cite
    doc.save()

    # Update the citation graph
    from alert.citations.tasks import update_document_by_id
    update_document_by_id(doc.pk)

    return doc
def import_resource_org_item(case_location):
    """Using the path to a case, import it, gathering all needed meta data.

    Path is any valid URI that the requests library can handle.
    """
    def get_file(location):
        if location.startswith('/'):
            with open(location) as f:
                r = requests.Session()
                r.content = f.read()
        else:
            r = requests.get(location)
        return fromstring(r.content), get_clean_body_content(r.content)

    # Get trees and text for the opinion itself and for the index page
    # that links to it. Each has useful data.
    case_tree, case_text = get_file(case_location)
    vol_location = case_location.rsplit('/', 1)[-2] + '/index.html'
    vol_tree, vol_text = get_file(vol_location)

    html, blocked = anonymize(get_case_body(case_tree))

    case_location_relative = case_location.rsplit('/', 1)[1]
    case_name, status = get_case_name_and_status(vol_tree,
                                                 case_location_relative)
    cite = Citation(
        case_name=case_name,
        docket_number=get_docket_number(case_location),
        federal_cite_one=get_west_cite(vol_tree, case_location_relative),
    )
    docket = Docket(
        court=Court.objects.get(pk=get_court_id(case_tree)),
        case_name=case_name,
    )
    doc = Document(
        date_filed=get_date_filed(vol_tree, case_location_relative),
        source='R',
        sha1=hashlib.sha1(case_text).hexdigest(),
        citation=cite,
        docket=docket,
        download_url=case_location,
        html=html,
        precedential_status=status,
    )
    if blocked:
        doc.blocked = True
        docket.blocked = True
        doc.date_blocked = datetime.date.today()
        docket.date_blocked = datetime.date.today()

    cite.save()
    docket.save()
    doc.docket = docket
    doc.citation = cite
    doc.save()

    # Update the citation graph
    from alert.citations.tasks import update_document_by_id
    update_document_by_id(doc.pk)

    return doc
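Because get_file treats anything starting with '/' as a local file and hands everything else to requests, the importer can be pointed at a file on disk or at a live URL. A usage sketch with a hypothetical location; note that the parent directory (or URL) must also serve an index.html, since the volume index is fetched alongside the opinion itself.

# Hypothetical location; substitute a real volume layout. The function saves
# the Citation, Docket, and Document, kicks off citation matching, and then
# returns the saved Document.
doc = import_resource_org_item('/data/resource_org/us/500/123.html')
print("Imported %s (pk=%s)" % (doc, doc.pk))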