def extract_doc_content(pk, callback=None, citation_countdown=0):
    """
    Given a document, we extract it, sniffing its extension, then store its
    contents in the database. Finally, we asynchronously find citations in
    the document content and match them to other documents.

    TODO: this implementation cannot be distributed due to using local paths.
    """
    d = Document.objects.get(pk=pk)
    path = d.local_path.path
    extension = path.split('.')[-1]
    if extension == 'doc':
        content, err = extract_from_doc(path, DEVNULL)
    elif extension == 'html':
        content, err = extract_from_html(path)
    elif extension == 'pdf':
        d, content, err = extract_from_pdf(d, path, DEVNULL, callback)
    elif extension == 'txt':
        content, err = extract_from_txt(path)
    elif extension == 'wpd':
        d, content, err = extract_from_wpd(d, path, DEVNULL)
    else:
        print('*****Unable to extract content due to unknown extension: %s '
              'on d: %s****' % (extension, d))
        return 2

    if extension in ['html', 'wpd']:
        d.html, blocked = anonymize(content)
    else:
        d.plain_text, blocked = anonymize(content)
    if blocked:
        d.blocked = True
        d.date_blocked = now()

    if err:
        print "****Error extracting text from %s: %s****" % (extension, d)
        return d

    try:
        if citation_countdown == 0:
            # No waiting around. Save to the database now, but don't bother
            # with the index yet because citations are being done imminently.
            d.save(index=False)
        else:
            # Save to the index now, citations come later, commit comes
            # according to schedule
            d.save(index=True)
    except Exception, e:
        print "****Error saving text to the db for: %s****" % d
        print traceback.format_exc()
    return d
def extract_doc_content(pk, callback=None, citation_countdown=0):
    """
    Given a document, we extract it, sniffing its extension, then store its
    contents in the database. Finally, we asynchronously find citations in
    the document content and match them to other documents.

    TODO: this implementation cannot be distributed due to using local paths.
    """
    doc = Document.objects.get(pk=pk)
    path = str(doc.local_path)
    path = os.path.join(settings.MEDIA_ROOT, path)
    extension = path.split('.')[-1]
    if extension == 'doc':
        content, err = extract_from_doc(path, DEVNULL)
    elif extension == 'html':
        content, err = extract_from_html(path)
    elif extension == 'pdf':
        doc, content, err = extract_from_pdf(doc, path, DEVNULL, callback)
    elif extension == 'txt':
        content, err = extract_from_txt(path)
    elif extension == 'wpd':
        doc, content, err = extract_from_wpd(doc, path, DEVNULL)
    else:
        print('*****Unable to extract content due to unknown extension: %s '
              'on doc: %s****' % (extension, doc))
        return 2

    if extension in ['html', 'wpd']:
        doc.html, blocked = anonymize(content)
    else:
        doc.plain_text, blocked = anonymize(content)
    if blocked:
        doc.blocked = True
        doc.date_blocked = now()

    if err:
        print "****Error extracting text from %s: %s****" % (extension, doc)
        return doc

    try:
        if citation_countdown == 0:
            # No waiting around. Save to the database now, but don't bother
            # with the index yet because citations are being done imminently.
            doc.save(index=False)
        else:
            # Save to the index now, citations come later, commit comes
            # according to schedule
            doc.save(index=True)
    except Exception, e:
        print "****Error saving text to the db for: %s****" % doc
        print traceback.format_exc()
    return doc
def extract_doc_content(pk, callback=None, citation_countdown=0):
    """
    Given a document, we extract it, sniffing its extension, then store its
    contents in the database. Finally, we asynchronously find citations in
    the document content and match them to other documents.

    TODO: this implementation cannot be distributed due to using local paths.
    """
    doc = Document.objects.get(pk=pk)
    path = str(doc.local_path)
    path = os.path.join(settings.MEDIA_ROOT, path)
    extension = path.split('.')[-1]
    if extension == 'doc':
        content, err = extract_from_doc(path, DEVNULL)
    elif extension == 'html':
        content, err = extract_from_html(path)
    elif extension == 'pdf':
        doc, content, err = extract_from_pdf(doc, path, DEVNULL, callback)
    elif extension == 'txt':
        content, err = extract_from_txt(path)
    elif extension == 'wpd':
        doc, content, err = extract_from_wpd(doc, path, DEVNULL)
    else:
        print('*****Unable to extract content due to unknown extension: %s '
              'on doc: %s****' % (extension, doc))
        return 2

    if extension in ['html', 'wpd']:
        doc.html, blocked = anonymize(content)
    else:
        doc.plain_text, blocked = anonymize(content)
    if blocked:
        doc.blocked = True
        doc.date_blocked = now()

    if err:
        print "****Error extracting text from %s: %s****" % (extension, doc)
        return doc

    try:
        if citation_countdown == 0:
            doc.save(index=False)
        else:
            doc.save(index=True, commit=False)
    except Exception, e:
        print "****Error saving text to the db for: %s****" % doc
        print traceback.format_exc()
    return doc
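# A minimal usage sketch, not part of the original module: it assumes the
# Django settings, the Document model, and the extract_from_* helpers above
# are importable, and that the document's local_path ends in one of the
# supported extensions (.doc, .html, .pdf, .txt, .wpd). The helper name is
# hypothetical.
def _example_extract_single_doc(pk):
    result = extract_doc_content(pk, callback=None, citation_countdown=0)
    if result == 2:
        # extract_doc_content signals an unsupported extension by returning 2.
        print "Extraction skipped for pk %s: unknown extension" % pk
    return result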
def cleaner(simulate=False, verbose=False):
    """Re-run the anonymize function across the whole corpus.

    The anonymize function was previously missing any documents that
    contained punctuation before or after an ID. This script re-runs the
    function, fixing the error.
    """
    docs = queryset_generator(Document.objects.all())
    for doc in docs:
        text = doc.plain_text
        clean_lines = []
        any_mods = []
        for line in text.split('\n'):
            clean_line, modified = anonymize(line)
            if modified:
                print "Fixing text in document: %s" % doc.pk
                print "Line reads: %s" % line
                fix = raw_input("Fix the line? [Y/n]: ") or 'y'
                if fix.lower() == 'y':
                    clean_lines.append(clean_line)
                    any_mods.append(modified)
                else:
                    clean_lines.append(line)
            else:
                clean_lines.append(line)

        if not simulate and any(any_mods):
            doc.plain_text = '\n'.join(clean_lines)
            doc.blocked = True
            doc.date_blocked = now()
            doc.save()
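# A minimal usage sketch, not part of the original module: cleaner() prompts
# via raw_input for every flagged line, so a simulate-mode pass is the safe
# way to preview which documents would be re-anonymized before anything is
# saved. The helper name is hypothetical.
def _example_preview_cleanup():
    # With simulate=True nothing is written back, even for lines answered 'y'.
    cleaner(simulate=True, verbose=True)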
def merge_cases_simple(new, target_id):
    """Add `new` to the database, merging with target_id

    Merging is done by picking the best fields from each item.
    """
    target = Document.objects.get(pk=target_id)
    print "Merging %s with" % new.citation.case_name
    print "        %s" % target.citation.case_name

    cached_source = target.source  # Original value is needed below.
    if target.source == 'C':
        target.source = 'LC'
    elif target.source == 'R':
        target.source = 'LR'
    elif target.source == 'CR':
        target.source = 'LCR'

    # Add the URL if it's not a court one, replacing resource.org's info in
    # some cases.
    if cached_source == 'R':
        target.download_url = new.download_url

    # Recreate the slug from the new case name (this changes the URL, but the
    # old will continue working)
    target.citation.slug = trunc(slugify(new.citation.case_name), 50)

    # Take the case name from the new item; they tend to be pretty good
    target.citation.case_name = new.citation.case_name

    # Add the docket number if the old doesn't exist, but keep the old if one
    # does.
    if not target.citation.docket_number:
        target.citation.docket_number = new.citation.docket_number

    # Get the citations from the new item (ditch the old).
    target.citation.federal_cite_one = new.citation.federal_cite_one
    target.citation.federal_cite_two = new.citation.federal_cite_two
    target.citation.federal_cite_three = new.citation.federal_cite_three
    target.citation.state_cite_one = new.citation.state_cite_one
    target.citation.state_cite_two = new.citation.state_cite_two
    target.citation.state_cite_three = new.citation.state_cite_three
    target.citation.state_cite_regional = new.citation.state_cite_regional
    target.citation.specialty_cite_one = new.citation.specialty_cite_one
    target.citation.scotus_early_cite = new.citation.scotus_early_cite
    target.citation.lexis_cite = new.citation.lexis_cite
    target.citation.westlaw_cite = new.citation.westlaw_cite
    target.citation.neutral_cite = new.citation.neutral_cite

    # Add judge information if lacking. New is dirty, but better than none.
    if not target.judges:
        target.judges = new.judges

    # Add the text.
    target.html_lawbox, blocked = anonymize(new.html)
    if blocked:
        target.blocked = True
        target.date_blocked = now()

    target.extracted_by_ocr = False  # No longer true for any LB case.

    save_doc_and_cite(target, index=False)
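# A minimal usage sketch, not part of the original module: given a freshly
# parsed lawbox Document and the pk of an existing duplicate, the merge keeps
# the existing record, upgrades its source code ('C' -> 'LC', 'R' -> 'LR',
# 'CR' -> 'LCR'), and copies the newer citation fields onto it. The helper
# name and the target pk are hypothetical.
def _example_merge(new_doc):
    merge_cases_simple(new_doc, target_id=12345)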
def main():
    parser = argparse.ArgumentParser(
        description="Import the corpus provided by lawbox")
    parser.add_argument(
        "-s", "--simulate", default=False, required=False,
        action="store_true",
        help="Run the code in simulate mode, making no permanent changes.",
    )
    parser.add_argument(
        "-d", "--dir", type=readable_dir,
        help="The directory where the lawbox bulk data can be found.",
    )
    parser.add_argument(
        "-f", "--file", type=str, default="index.txt", required=False,
        dest="file_name",
        help="The file that has all the URLs to import, one per line.",
    )
    parser.add_argument(
        "-l", "--line", type=int, default=1, required=False,
        help="If provided, this will be the line number in the index file "
             "where we resume processing.",
    )
    parser.add_argument(
        "-r", "--resume", default=False, required=False,
        action="store_true",
        help="Use the saved marker to resume operation where it last failed.",
    )
    parser.add_argument(
        "-x", "--random", default=False, required=False,
        action="store_true",
        help="Pick cases randomly rather than serially.",
    )
    parser.add_argument(
        "-m", "--marker", type=str, default="lawbox_progress_marker.txt",
        required=False,
        help="The name of the file that tracks the progress (useful if "
             "multiple versions run at same time)",
    )
    parser.add_argument(
        "-e", "--end", type=int, required=False, default=2000000,
        help="An optional endpoint for an importer.",
    )
    args = parser.parse_args()

    if args.dir:
        def case_generator(dir_root):
            """Yield cases, one by one to the importer by recursing and
            iterating the import directory"""
            for root, dirnames, filenames in os.walk(dir_root):
                for filename in fnmatch.filter(filenames, "*"):
                    yield os.path.join(root, filename)

        cases = case_generator(args.dir)  # argparse stores --dir as args.dir
        i = 0
    else:
        def generate_random_line(file_name):
            while True:
                total_bytes = os.stat(file_name).st_size
                random_point = random.randint(0, total_bytes)
                f = open(file_name)
                f.seek(random_point)
                f.readline()  # skip this line to clear the partial line
                yield f.readline().strip()

        def case_generator(line_number):
            """Yield cases from the index file."""
            # The enumeration is zero-index, but files are one-index.
            enumerated_line_number = line_number - 1
            index_file = open(args.file_name)
            for i, line in enumerate(index_file):
                if i >= enumerated_line_number:
                    yield line.strip()

        if args.random:
            cases = generate_random_line(args.file_name)
            i = 0
        elif args.resume:
            with open(args.marker) as marker:
                resume_point = int(marker.read().strip())
            cases = case_generator(resume_point)
            i = resume_point
        else:
            cases = case_generator(args.line)
            i = args.line

    for case_path in cases:
        if i % 1000 == 0:
            db.reset_queries()  # Else we leak memory when DEBUG is True
        if "counter" in DEBUG:  # and i % 1000 == 0:
            log_print("\n%s: Doing case (%s): file://%s" %
                      (datetime.datetime.now(), i, case_path))
        try:
            doc = import_law_box_case(case_path)
            duplicates = find_duplicates(doc, case_path)
            if not args.simulate:
                if len(duplicates) == 0:
                    doc.html_lawbox, blocked = anonymize(doc.html)
                    doc.html = ""
                    if blocked:
                        doc.blocked = True
                        doc.date_blocked = now()
                    # Save nothing to the index for now (it'll get done when
                    # we find citations)
                    save_doc_and_cite(doc, index=False)
                if len(duplicates) == 1:
                    dup_helpers.merge_cases_simple(doc, duplicates[0])
                if len(duplicates) > 1:
                    # complex_merge
                    if "log_multimerge" in DEBUG:
                        with open("index_multimerge.txt", "a") as log:
                            log.write("%s\n" % case_path)
            if args.resume:
                # Don't change the progress marker unless you're in resume
                # mode.
                with open(args.marker, "w") as marker:
                    marker.write(str(i + 1))  # Files are one-index, not zero-index
            with open("lawbox_fix_file.pkl", "wb") as fix_file:
                pickle.dump(fixes, fix_file)
            i += 1
            if i == args.end:
                log_print("Hit the endpoint after importing number %s. "
                          "Breaking." % i)
                break
        except Exception, err:
            log_print(traceback.format_exc())
            exit(1)
def import_resource_org_item(case_location):
    """Using the path to a case, import it, gathering all needed meta data.

    Path is any valid URI that the requests library can handle.
    """
    def get_file(location):
        if location.startswith('/'):
            with open(location) as f:
                r = requests.Session()
                r.content = f.read()
        else:
            r = requests.get(location)
        return fromstring(r.content), get_clean_body_content(r.content)

    # Get trees and text for the opinion itself and for the index page
    # that links to it. Each has useful data.
    case_tree, case_text = get_file(case_location)
    vol_location = case_location.rsplit('/', 1)[-2] + '/index.html'
    vol_tree, vol_text = get_file(vol_location)

    html, blocked = anonymize(get_case_body(case_tree))

    case_location_relative = case_location.rsplit('/', 1)[1]
    case_name, status = get_case_name_and_status(
        vol_tree, case_location_relative)
    cite = Citation(
        case_name=case_name,
        docket_number=get_docket_number(case_location),
        federal_cite_one=get_west_cite(vol_tree, case_location_relative),
    )
    docket = Docket(
        court=Court.objects.get(pk=get_court_id(case_tree)),
        case_name=case_name,
    )
    doc = Document(
        date_filed=get_date_filed(vol_tree, case_location_relative),
        source='R',
        sha1=hashlib.sha1(case_text).hexdigest(),
        citation=cite,
        docket=docket,
        download_url=case_location,
        html=html,
        precedential_status=status,
    )
    if blocked:
        doc.blocked = True
        docket.blocked = True
        doc.date_blocked = datetime.date.today()
        docket.date_blocked = datetime.date.today()

    cite.save()
    docket.save()
    doc.docket = docket
    doc.citation = cite
    doc.save()

    # Update the citation graph
    from alert.citations.tasks import update_document_by_id
    update_document_by_id(doc.pk)

    return doc
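# A minimal usage sketch, not part of the original module: get_file() branches
# on a leading '/', so the importer accepts either an absolute local path or
# an HTTP URL. Both locations below are purely illustrative, and the helper
# name is hypothetical.
def _example_import_resource_org():
    # A file already on local disk:
    import_resource_org_item('/mnt/resource_org/F3/999/999.F3d.1.html')
    # Or the same case fetched over HTTP by requests:
    # import_resource_org_item('http://example.com/US/999/999.F3d.1.html')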