def test_anonymize(self):
    """Can we properly anonymize SSNs, EINs, and A-Numbers?"""
    # Simple cases. Anonymize them.
    self.assertEqual(anonymize('111-11-1111'), ('XXX-XX-XXXX', True))
    self.assertEqual(anonymize('11-1111111'), ('XX-XXXXXXX', True))
    self.assertEqual(anonymize('A11111111'), ('AXXXXXXXX', True))
    self.assertEqual(anonymize('A111111111'), ('AXXXXXXXX', True))

    # Starting or ending with letters isn't an SSN
    self.assertEqual(anonymize('A111-11-1111'), ('A111-11-1111', False))
    self.assertEqual(anonymize('111-11-1111A'), ('111-11-1111A', False))

    # Matches in a sentence
    self.assertEqual(
        anonymize('Term 111-11-1111 Term'),
        ('Term XXX-XX-XXXX Term', True),
    )
    self.assertEqual(
        anonymize('Term 11-1111111 Term'),
        ('Term XX-XXXXXXX Term', True),
    )
    self.assertEqual(
        anonymize('Term A11111111 Term'),
        ('Term AXXXXXXXX Term', True),
    )

    # Multiple matches
    self.assertEqual(
        anonymize("Term 111-11-1111 Term 111-11-1111 Term"),
        ('Term XXX-XX-XXXX Term XXX-XX-XXXX Term', True),
    )
def test_anonymize(self) -> None:
    """Can we properly anonymize SSNs, EINs, and A-Numbers?"""
    # Simple cases. Anonymize them.
    self.assertEqual(anonymize("111-11-1111"), ("XXX-XX-XXXX", True))
    self.assertEqual(anonymize("11-1111111"), ("XX-XXXXXXX", True))
    self.assertEqual(anonymize("A11111111"), ("AXXXXXXXX", True))
    self.assertEqual(anonymize("A111111111"), ("AXXXXXXXX", True))

    # Starting or ending with letters isn't an SSN
    self.assertEqual(anonymize("A111-11-1111"), ("A111-11-1111", False))
    self.assertEqual(anonymize("111-11-1111A"), ("111-11-1111A", False))

    # Matches in a sentence
    self.assertEqual(
        anonymize("Term 111-11-1111 Term"),
        ("Term XXX-XX-XXXX Term", True),
    )
    self.assertEqual(
        anonymize("Term 11-1111111 Term"), ("Term XX-XXXXXXX Term", True)
    )
    self.assertEqual(
        anonymize("Term A11111111 Term"), ("Term AXXXXXXXX Term", True)
    )

    # Multiple matches
    self.assertEqual(
        anonymize("Term 111-11-1111 Term 111-11-1111 Term"),
        ("Term XXX-XX-XXXX Term XXX-XX-XXXX Term", True),
    )
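# The two test variants above pin down the contract of anonymize(): it
# returns a (text, modified) pair, masks SSNs as XXX-XX-XXXX, EINs as
# XX-XXXXXXX, and 8- or 9-digit A-Numbers as a fixed AXXXXXXXX, and leaves
# candidates glued to letters untouched. A minimal regex-based sketch that
# satisfies those tests (an illustrative assumption, not the project's actual
# implementation) could look like this:

import re
from typing import Tuple

# Word boundaries keep 'A111-11-1111' and '111-11-1111A' from matching, as
# the tests above require.
SSN_RE = re.compile(r"\b\d{3}-\d{2}-\d{4}\b")
EIN_RE = re.compile(r"\b\d{2}-\d{7}\b")
A_NUMBER_RE = re.compile(r"\bA\d{8,9}\b")


def anonymize(text: str) -> Tuple[str, bool]:
    """Mask SSNs, EINs, and A-Numbers, returning (text, was_modified)."""
    text, ssn_count = SSN_RE.subn("XXX-XX-XXXX", text)
    text, ein_count = EIN_RE.subn("XX-XXXXXXX", text)
    text, a_count = A_NUMBER_RE.subn("AXXXXXXXX", text)
    return text, bool(ssn_count + ein_count + a_count)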
def extract_doc_content(pk, callback=None, citation_countdown=0):
    """
    Given a document, we extract it, sniffing its extension, then store its
    contents in the database. Finally, we asynchronously find citations in
    the document content and match them to other documents.

    TODO: this implementation cannot be distributed due to using local paths.
    """
    opinion = Opinion.objects.get(pk=pk)

    path = opinion.local_path.path

    extension = path.split('.')[-1]
    if extension == 'doc':
        content, err = extract_from_doc(path, DEVNULL)
    elif extension == 'html':
        content, err = extract_from_html(path)
    elif extension == 'pdf':
        opinion, content, err = extract_from_pdf(opinion, path, DEVNULL,
                                                 callback)
    elif extension == 'txt':
        content, err = extract_from_txt(path)
    elif extension == 'wpd':
        opinion, content, err = extract_from_wpd(opinion, path, DEVNULL)
    else:
        print('*****Unable to extract content due to unknown extension: %s '
              'on opinion: %s****' % (extension, opinion))
        return 2

    if extension in ['html', 'wpd']:
        opinion.html, blocked = anonymize(content)
    else:
        opinion.plain_text, blocked = anonymize(content)
    if blocked:
        opinion.cluster.blocked = True
        opinion.cluster.date_blocked = now()

    if err:
        print("****Error extracting text from %s: %s****" %
              (extension, opinion))
        return opinion

    try:
        if citation_countdown == 0:
            # No waiting around. Save to the database now, but don't bother
            # with the index yet because citations are being done imminently.
            opinion.cluster.save(index=False)
            opinion.save(index=False)
        else:
            # Save to the index now, citations come later, commit comes
            # according to schedule
            opinion.cluster.save(index=False)
            opinion.save(index=True)
    except Exception, e:
        print "****Error saving text to the db for: %s****" % opinion
        print traceback.format_exc()

    return opinion
def extract_recap_pdf(pk, skip_ocr=False):
    doc = RECAPDocument.objects.get(pk=pk)
    path = doc.filepath_local.path
    process = make_pdftotext_process(path)
    content, err = process.communicate()

    if needs_ocr(content):
        if not skip_ocr:
            # probably an image PDF. Send it to OCR.
            success, content = extract_by_ocr(path)
            if success:
                doc.ocr_status = RECAPDocument.OCR_COMPLETE
            elif content == u"" or not success:
                content = u"Unable to extract document content."
                doc.ocr_status = RECAPDocument.OCR_FAILED
        else:
            content = u""
            doc.ocr_status = RECAPDocument.OCR_NEEDED
    else:
        doc.ocr_status = RECAPDocument.OCR_UNNECESSARY

    doc.plain_text, _ = anonymize(content)
    doc.save()
    return path
def import_resource_org_item(case_location):
    """Using the path to a case, import it, gathering all needed meta data.

    Path is any valid URI that the requests library can handle.
    """
    def get_file(location):
        if location.startswith('/'):
            with open(location) as f:
                r = requests.Session()
                r.content = f.read()
        else:
            r = requests.get(location)
        return fromstring(r.content), get_clean_body_content(r.content)

    # Get trees and text for the opinion itself and for the index page
    # that links to it. Each has useful data.
    case_tree, case_text = get_file(case_location)
    vol_location = case_location.rsplit('/', 1)[-2] + '/index.html'
    vol_tree, vol_text = get_file(vol_location)

    html, blocked = anonymize(get_case_body(case_tree))

    case_location_relative = case_location.rsplit('/', 1)[1]
    case_name, status = get_case_name_and_status(
        vol_tree, case_location_relative)
    docket = Docket(
        docket_number=get_docket_number(case_location),
        court=Court.objects.get(pk=get_court_id(case_tree)),
        case_name=case_name,
    )
    doc = Document(
        case_name=case_name,
        federal_cite_one=get_west_cite(vol_tree, case_location_relative),
        date_filed=get_date_filed(vol_tree, case_location_relative),
        source='R',
        sha1=hashlib.sha1(case_text).hexdigest(),
        docket=docket,
        download_url=case_location,
        html=html,
        precedential_status=status,
    )
    if blocked:
        doc.blocked = True
        docket.blocked = True
        doc.date_blocked = datetime.date.today()
        docket.date_blocked = datetime.date.today()

    docket.save()
    doc.docket = docket
    doc.save()

    # Update the citation graph
    from cl.citations.tasks import update_document_by_id
    update_document_by_id(doc.pk)

    return doc
def extract_recap_pdf(
    pks: Union[int, List[int]],
    skip_ocr: bool = False,
    check_if_needed: bool = True,
) -> List[int]:
    """Extract the contents from a RECAP PDF if necessary."""
    if not is_iter(pks):
        pks = [pks]

    processed = []
    for pk in pks:
        rd = RECAPDocument.objects.get(pk=pk)
        if check_if_needed and not rd.needs_extraction:
            # Early abort if the item doesn't need extraction and the user
            # hasn't disabled early abortion.
            processed.append(pk)
            continue

        with NamedTemporaryFile(
            prefix="extract_file_",
            suffix=".pdf",
            buffering=0,  # Make sure it's on disk when we try to use it
        ) as tmp:
            tmp.write(rd.filepath_local.read())
            process = make_pdftotext_process(tmp.name)
            content, err = process.communicate()
            content = content.decode()
            if needs_ocr(content):
                if not skip_ocr:
                    # probably an image PDF. Send it to OCR.
                    success, content = extract_by_ocr(tmp.name)
                    if success:
                        rd.ocr_status = RECAPDocument.OCR_COMPLETE
                    elif content == "" or not success:
                        content = "Unable to extract document content."
                        rd.ocr_status = RECAPDocument.OCR_FAILED
                else:
                    content = ""
                    rd.ocr_status = RECAPDocument.OCR_NEEDED
            else:
                rd.ocr_status = RECAPDocument.OCR_UNNECESSARY

        rd.plain_text, _ = anonymize(content)

        # Do not do indexing here. Creates race condition in celery.
        rd.save(index=False, do_extraction=False)
        processed.append(pk)

    return processed
def extract_recap_pdf(pks, skip_ocr=False, check_if_needed=True):
    """Extract the contents from a RECAP PDF if necessary."""
    if not is_iter(pks):
        pks = [pks]

    processed = []
    for pk in pks:
        rd = RECAPDocument.objects.get(pk=pk)
        if check_if_needed and not rd.needs_extraction:
            # Early abort if the item doesn't need extraction and the user
            # hasn't disabled early abortion.
            processed.append(pk)
            continue

        path = rd.filepath_local.path
        process = make_pdftotext_process(path)
        content, err = process.communicate()
        content = content.decode()
        if needs_ocr(content):
            if not skip_ocr:
                # probably an image PDF. Send it to OCR.
                success, content = extract_by_ocr(path)
                if success:
                    rd.ocr_status = RECAPDocument.OCR_COMPLETE
                elif content == "" or not success:
                    content = "Unable to extract document content."
                    rd.ocr_status = RECAPDocument.OCR_FAILED
            else:
                content = ""
                rd.ocr_status = RECAPDocument.OCR_NEEDED
        else:
            rd.ocr_status = RECAPDocument.OCR_UNNECESSARY

        rd.plain_text, _ = anonymize(content)

        # Do not do indexing here. Creates race condition in celery.
        rd.save(index=False, do_extraction=False)
        processed.append(pk)

    return processed
def extract_recap_pdf(pks, skip_ocr=False, check_if_needed=True):
    """Extract the contents from a RECAP PDF if necessary."""
    if not is_iter(pks):
        pks = [pks]

    processed = []
    for pk in pks:
        rd = RECAPDocument.objects.get(pk=pk)
        if check_if_needed and not rd.needs_extraction:
            # Early abort if the item doesn't need extraction and the user
            # hasn't disabled early abortion.
            processed.append(pk)
            continue

        path = rd.filepath_local.path
        process = make_pdftotext_process(path)
        content, err = process.communicate()
        if needs_ocr(content):
            if not skip_ocr:
                # probably an image PDF. Send it to OCR.
                success, content = extract_by_ocr(path)
                if success:
                    rd.ocr_status = RECAPDocument.OCR_COMPLETE
                elif content == u'' or not success:
                    content = u'Unable to extract document content.'
                    rd.ocr_status = RECAPDocument.OCR_FAILED
            else:
                content = u''
                rd.ocr_status = RECAPDocument.OCR_NEEDED
        else:
            rd.ocr_status = RECAPDocument.OCR_UNNECESSARY

        rd.plain_text, _ = anonymize(content)

        # Do not do indexing here. Creates race condition in celery.
        rd.save(index=False, do_extraction=False)
        processed.append(pk)

    return processed
def main():
    parser = argparse.ArgumentParser(
        description='Import the corpus provided by lawbox')
    parser.add_argument(
        '-s', '--simulate', default=False, required=False,
        action='store_true',
        help='Run the code in simulate mode, making no permanent changes.')
    parser.add_argument(
        '-d', '--dir', type=readable_dir,
        help='The directory where the lawbox bulk data can be found.')
    parser.add_argument(
        '-f', '--file', type=str, default="index.txt", required=False,
        dest="file_name",
        help="The file that has all the URLs to import, one per line.")
    parser.add_argument(
        '-l', '--line', type=int, default=1, required=False,
        help='If provided, this will be the line number in the index file '
             'where we resume processing.')
    parser.add_argument(
        '-r', '--resume', default=False, required=False,
        action='store_true',
        help='Use the saved marker to resume operation where it last failed.')
    parser.add_argument(
        '-x', '--random', default=False, required=False,
        action='store_true',
        help='Pick cases randomly rather than serially.')
    parser.add_argument(
        '-m', '--marker', type=str, default='lawbox_progress_marker.txt',
        required=False,
        help="The name of the file that tracks the progress (useful if "
             "multiple versions run at same time)")
    parser.add_argument(
        '-e', '--end', type=int, required=False, default=2000000,
        help="An optional endpoint for an importer.")
    args = parser.parse_args()

    if args.dir:
        def case_generator(dir_root):
            """Yield cases, one by one to the importer by recursing and
            iterating the import directory"""
            for root, dirnames, filenames in os.walk(dir_root):
                for filename in fnmatch.filter(filenames, '*'):
                    yield os.path.join(root, filename)

        cases = case_generator(args.dir)
        i = 0
    else:
        def generate_random_line(file_name):
            while True:
                total_bytes = os.stat(file_name).st_size
                random_point = random.randint(0, total_bytes)
                f = open(file_name)
                f.seek(random_point)
                f.readline()  # skip this line to clear the partial line
                yield f.readline().strip()

        def case_generator(line_number):
            """Yield cases from the index file."""
            # The enumeration is zero-index, but files are one-index.
            enumerated_line_number = line_number - 1
            index_file = open(args.file_name)
            for i, line in enumerate(index_file):
                if i >= enumerated_line_number:
                    yield line.strip()

        if args.random:
            cases = generate_random_line(args.file_name)
            i = 0
        elif args.resume:
            with open(args.marker) as marker:
                resume_point = int(marker.read().strip())
            cases = case_generator(resume_point)
            i = resume_point
        else:
            cases = case_generator(args.line)
            i = args.line

    for case_path in cases:
        if i % 1000 == 0:
            db.reset_queries()  # Else we leak memory when DEBUG is True

        if 'counter' in DEBUG:  # and i % 1000 == 0:
            log_print("\n%s: Doing case (%s): file://%s" %
                      (datetime.datetime.now(), i, case_path))
        try:
            doc = import_law_box_case(case_path)
            duplicates = find_duplicates(doc, case_path)
            if not args.simulate:
                if len(duplicates) == 0:
                    doc.html_lawbox, blocked = anonymize(doc.html)
                    doc.html = ''
                    if blocked:
                        doc.blocked = True
                        doc.date_blocked = now()
                    # Save nothing to the index for now (it'll get done
                    # when we find citations)
                    doc.save(index=False)
                if len(duplicates) == 1:
                    dup_helpers.merge_cases_simple(doc, duplicates[0])
                if len(duplicates) > 1:
                    # complex_merge
                    if 'log_multimerge' in DEBUG:
                        with open('index_multimerge.txt', 'a') as log:
                            log.write('%s\n' % case_path)
            if args.resume:
                # Don't change the progress marker unless you're in resume
                # mode.
                with open(args.marker, 'w') as marker:
                    # Files are one-index, not zero-index
                    marker.write(str(i + 1))
            with open('lawbox_fix_file.pkl', 'wb') as fix_file:
                pickle.dump(fixes, fix_file)
            i += 1
            if i == args.end:
                log_print("Hit the endpoint after importing number %s. "
                          "Breaking." % i)
                break
        except Exception, err:
            log_print(traceback.format_exc())
            exit(1)
def update_docket_appellate_metadata(d, docket_data):
    """Update the metadata specific to appellate cases."""
    if not any([
        docket_data.get('originating_court_information'),
        docket_data.get('appeal_from'),
        docket_data.get('panel')
    ]):
        # Probably not appellate.
        return d, None

    d.panel_str = ', '.join(docket_data.get('panel', [])) or d.panel_str
    d.appellate_fee_status = docket_data.get(
        'fee_status', '') or d.appellate_fee_status
    d.appellate_case_type_information = docket_data.get(
        'case_type_information', '') or d.appellate_case_type_information
    d.appeal_from_str = docket_data.get(
        'appeal_from', '') or d.appeal_from_str

    # Do originating court information dict
    og_info = docket_data.get('originating_court_information')
    if not og_info:
        return d, None

    if og_info.get('court_id'):
        cl_id = map_pacer_to_cl_id(og_info['court_id'])
        if Court.objects.filter(pk=cl_id).exists():
            # Ensure the court exists. Sometimes PACER does weird things,
            # like in 14-1743 in CA3, where it says the court_id is 'uspci'.
            # If we don't do this check, the court ID could be invalid, and
            # our whole save of the docket fails.
            d.appeal_from_id = cl_id

    if d.originating_court_information:
        d_og_info = d.originating_court_information
    else:
        d_og_info = OriginatingCourtInformation()

    # Ensure we don't share A-Numbers, which can sometimes be in the docket
    # number field.
    docket_number = og_info.get(
        'docket_number', '') or d_og_info.docket_number
    docket_number, _ = anonymize(docket_number)
    d_og_info.docket_number = docket_number
    d_og_info.court_reporter = og_info.get(
        'court_reporter', '') or d_og_info.court_reporter
    d_og_info.date_disposed = og_info.get(
        'date_disposed') or d_og_info.date_disposed
    d_og_info.date_filed = og_info.get('date_filed') or d_og_info.date_filed
    d_og_info.date_judgment = og_info.get(
        'date_judgment') or d_og_info.date_judgment
    d_og_info.date_judgment_eod = og_info.get(
        'date_judgment_eod') or d_og_info.date_judgment_eod
    d_og_info.date_filed_noa = og_info.get(
        'date_filed_noa') or d_og_info.date_filed_noa
    d_og_info.date_received_coa = og_info.get(
        'date_received_coa') or d_og_info.date_received_coa
    d_og_info.assigned_to_str = og_info.get(
        'assigned_to') or d_og_info.assigned_to_str
    d_og_info.ordering_judge_str = og_info.get(
        'ordering_judge') or d_og_info.ordering_judge_str

    if not all([d.appeal_from_id, d_og_info.date_filed]):
        # Can't do judge lookups. Call it quits.
        return d, d_og_info

    if og_info.get('assigned_to'):
        judges = get_candidate_judges(og_info['assigned_to'],
                                      d.appeal_from_id, d_og_info.date_filed)
        if judges is not None and len(judges) == 1:
            d_og_info.assigned_to = judges[0]

    if og_info.get('ordering_judge'):
        judges = get_candidate_judges(og_info['ordering_judge'],
                                      d.appeal_from_id, d_og_info.date_filed)
        if judges is not None and len(judges) == 1:
            d_og_info.ordering_judge = judges[0]

    return d, d_og_info
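# The A-Number comment in update_docket_appellate_metadata() above is where
# anonymize() earns its keep outside of opinion text: PACER sometimes puts an
# immigration A-Number in the originating court's docket number field, so it
# is masked before being stored. A tiny illustration using the masking
# behavior established by the tests at the top of this section (the value
# below is made up):

docket_number, modified = anonymize("A111111111")
assert (docket_number, modified) == ("AXXXXXXXX", True)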
def extract_doc_content(pk, do_ocr=False, citation_jitter=False):
    """
    Given an opinion PK, we extract it, sniffing its extension, then store
    its contents in the database. Finally, we asynchronously find citations
    in the document content and match them to other documents.

    This implementation uses local paths.

    :param pk: The opinion primary key to work on
    :param do_ocr: Whether the PDF converting function should use OCR
    :param citation_jitter: Whether to apply jitter before running the
    citation parsing code. This can be useful to spread these tasks out when
    doing a larger scrape.
    """
    opinion = Opinion.objects.get(pk=pk)

    path = opinion.local_path.path

    extension = path.split(".")[-1]
    if extension == "doc":
        content, err = extract_from_doc(path)
    elif extension == "docx":
        content, err = extract_from_docx(path)
    elif extension == "html":
        content, err = extract_from_html(path)
    elif extension == "pdf":
        content, err = extract_from_pdf(path, opinion, do_ocr)
    elif extension == "txt":
        content, err = extract_from_txt(path)
    elif extension == "wpd":
        content, err = extract_from_wpd(path, opinion)
    else:
        print("*****Unable to extract content due to unknown extension: %s "
              "on opinion: %s****" % (extension, opinion))
        return

    assert isinstance(
        content, str), "content must be of type str, not %s" % type(content)

    # Do page count, if possible
    opinion.page_count = get_page_count(path, extension)

    # Do blocked status
    if extension in ["html", "wpd"]:
        opinion.html, blocked = anonymize(content)
    else:
        opinion.plain_text, blocked = anonymize(content)
    if blocked:
        opinion.cluster.blocked = True
        opinion.cluster.date_blocked = now()

    update_document_from_text(opinion)

    if err:
        print(err)
        print("****Error extracting text from %s: %s****" %
              (extension, opinion))
        return

    # Save item, and index Solr if needed.
    # noinspection PyBroadException
    try:
        opinion.cluster.docket.save()
        opinion.cluster.save(index=False)
        if not citation_jitter:
            # No waiting around. Save to the database now, but don't bother
            # with the index yet because citations are being done imminently.
            opinion.save(index=False)
        else:
            # Save to the index now, citations come later, commit comes
            # according to schedule
            opinion.save(index=True)
    except Exception:
        print("****Error saving text to the db for: %s****\n%s" %
              (opinion, traceback.format_exc()))
        return

    # Identify and link citations within the document content
    find_citations_for_opinion_by_pks.apply_async(
        ([opinion.pk],), countdown=random.randint(0, 3600))
def merge_cases_simple(new, target_id):
    """Add `new` to the database, merging with target_id

    Merging is done by picking the best fields from each item.
    """
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # !! THIS CODE IS OUT OF DATE AND UNMAINTAINED. FEEL FREE TO FIX IT, BUT !!
    # !! DO NOT TRUST IT IN ITS CURRENT STATE.                               !!
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    target = OpinionCluster.objects.get(pk=target_id)
    print "Merging %s with" % new.case_name
    print "        %s" % target.case_name

    cached_source = target.source  # Original value is needed below.
    if target.source == 'C':
        target.source = 'LC'
    elif target.source == 'R':
        target.source = 'LR'
    elif target.source == 'CR':
        target.source = 'LCR'

    # Add the URL if it's not a court one, replacing public.resource.org's
    # info in some cases.
    if cached_source == 'R':
        target.download_url = new.download_url

    # Recreate the slug from the new case name (this changes the URL, but the
    # old will continue working)
    target.slug = slugify(trunc(new.case_name, 75))

    # Take the case name from the new item; they tend to be pretty good
    target.case_name = new.case_name

    # Add the docket number if the old doesn't exist, but keep the old if one
    # does.
    if not target.docket.docket_number:
        target.docket.docket_number = new.docket.docket_number

    # Get the citations from the new item (ditch the old).
    target.federal_cite_one = new.federal_cite_one
    target.federal_cite_two = new.federal_cite_two
    target.federal_cite_three = new.federal_cite_three
    target.state_cite_one = new.state_cite_one
    target.state_cite_two = new.state_cite_two
    target.state_cite_three = new.state_cite_three
    target.state_cite_regional = new.state_cite_regional
    target.specialty_cite_one = new.specialty_cite_one
    target.scotus_early_cite = new.scotus_early_cite
    target.lexis_cite = new.lexis_cite
    target.westlaw_cite = new.westlaw_cite
    target.neutral_cite = new.neutral_cite

    # Add judge information if lacking. New is dirty, but better than none.
    if not target.judges:
        target.judges = new.judges

    # Add the text.
    target.html_lawbox, blocked = anonymize(new.html)
    if blocked:
        target.blocked = True
        target.date_blocked = now()

    target.extracted_by_ocr = False  # No longer true for any LB case.
def extract_doc_content(pk, callback=None, citation_countdown=0):
    """
    Given a document, we extract it, sniffing its extension, then store its
    contents in the database. Finally, we asynchronously find citations in
    the document content and match them to other documents.

    TODO: this implementation cannot be distributed due to using local paths.
    """
    opinion = Opinion.objects.get(pk=pk)

    path = opinion.local_path.path

    extension = path.split(".")[-1]
    if extension == "doc":
        content, err = extract_from_doc(path)
    elif extension == "html":
        content, err = extract_from_html(path)
    elif extension == "pdf":
        opinion, content, err = extract_from_pdf(opinion, path, callback)
    elif extension == "txt":
        content, err = extract_from_txt(path)
    elif extension == "wpd":
        opinion, content, err = extract_from_wpd(opinion, path)
    else:
        print(
            "*****Unable to extract content due to unknown extension: %s "
            "on opinion: %s****" % (extension, opinion)
        )
        return 2

    # Do page count, if possible
    opinion.page_count = get_page_count(path, extension)

    # Do blocked status
    if extension in ["html", "wpd"]:
        opinion.html, blocked = anonymize(content)
    else:
        opinion.plain_text, blocked = anonymize(content)
    if blocked:
        opinion.cluster.blocked = True
        opinion.cluster.date_blocked = now()

    if err:
        print("****Error extracting text from %s: %s****" %
              (extension, opinion))
        return opinion

    # Save item, and index Solr if needed.
    try:
        if citation_countdown == 0:
            # No waiting around. Save to the database now, but don't bother
            # with the index yet because citations are being done imminently.
            opinion.cluster.save(index=False)
            opinion.save(index=False)
        else:
            # Save to the index now, citations come later, commit comes
            # according to schedule
            opinion.cluster.save(index=False)
            opinion.save(index=True)
    except Exception:
        print "****Error saving text to the db for: %s****" % opinion
        print traceback.format_exc()
        return opinion

    # Identify and link citations within the document content
    update_document_by_id.apply_async((opinion.pk,),
                                      countdown=citation_countdown)

    return opinion
def extract_doc_content(pk, do_ocr=False, citation_jitter=False):
    """
    Given an opinion PK, we extract it, sniffing its extension, then store
    its contents in the database. Finally, we asynchronously find citations
    in the document content and match them to other documents.

    This implementation uses local paths.

    :param pk: The opinion primary key to work on
    :param do_ocr: Whether the PDF converting function should use OCR
    :param citation_jitter: Whether to apply jitter before running the
    citation parsing code. This can be useful to spread these tasks out when
    doing a larger scrape.
    """
    opinion = Opinion.objects.get(pk=pk)

    path = opinion.local_path.path

    extension = path.split('.')[-1]
    if extension == 'doc':
        content, err = extract_from_doc(path)
    elif extension == 'docx':
        content, err = extract_from_docx(path)
    elif extension == 'html':
        content, err = extract_from_html(path)
    elif extension == 'pdf':
        content, err = extract_from_pdf(path, opinion, do_ocr)
    elif extension == 'txt':
        content, err = extract_from_txt(path)
    elif extension == 'wpd':
        content, err = extract_from_wpd(path, opinion)
    else:
        print('*****Unable to extract content due to unknown extension: %s '
              'on opinion: %s****' % (extension, opinion))
        return

    # Do page count, if possible
    opinion.page_count = get_page_count(path, extension)

    # Do blocked status
    if extension in ['html', 'wpd']:
        opinion.html, blocked = anonymize(content)
    else:
        opinion.plain_text, blocked = anonymize(content)
    if blocked:
        opinion.cluster.blocked = True
        opinion.cluster.date_blocked = now()

    if err:
        print("****Error extracting text from %s: %s****" %
              (extension, opinion))
        return

    # Save item, and index Solr if needed.
    # noinspection PyBroadException
    try:
        if not citation_jitter:
            # No waiting around. Save to the database now, but don't bother
            # with the index yet because citations are being done imminently.
            opinion.cluster.save(index=False)
            opinion.save(index=False)
        else:
            # Save to the index now, citations come later, commit comes
            # according to schedule
            opinion.cluster.save(index=False)
            opinion.save(index=True)
    except Exception:
        print("****Error saving text to the db for: %s****\n%s" %
              (opinion, traceback.format_exc()))
        return

    # Identify and link citations within the document content
    find_citations_for_opinion_by_pks.apply_async(
        ([opinion.pk],), countdown=random.randint(0, 3600)
    )
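# For context, a minimal usage sketch of the function above (the surrounding
# setup is an assumption for illustration, not part of the project code):
# extract text for opinions whose files were just scraped, letting OCR handle
# image-only PDFs. extract_doc_content then anonymizes the text, saves the
# cluster and opinion, and queues citation matching itself with a random
# countdown.

for opinion_pk in newly_scraped_opinion_pks:  # hypothetical iterable of PKs
    extract_doc_content(opinion_pk, do_ocr=True)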