def do_many(dir_path, limit=None, random_order=False, status_interval=100): """Runs through a directory of the form /data/[state]/[sub]/.../[folders]/[.xml documents]. Parses each .xml document, instantiates the associated model object, and saves the object. Prints/logs status updates and tracebacks instead of raising exceptions. :param dir_path: The directory. :param limit: A limit on how many files to run through. If None, will run through all (or if random order, forever). :param random_order: If true, will run through the directories and files in random order. :param status_interval: How often a status update will be given. """ if limit: total = limit elif not random_order: print "Getting an initial file count ..." total = 0 for _, _, file_names in os.walk(dir_path): total += len(fnmatch.filter(file_names, '*.xml')) else: total = None # go through the files, yielding parsed files and printing status updates as we go count = 0 for path in file_generator(dir_path, random_order, limit): # grab the fallback text from the path if it's there court_fallback = '' matches = re.compile('data/([a-z_]+?/[a-z_]+?)/').findall(path) if matches: court_fallback = matches[0] # try to parse/save the case and print any exceptions with full tracebacks try: parsed = parse_file(path, court_fallback=court_fallback) make_and_save(parsed) except Exception as e: # print simple exception summaries for known problems if 'mismatched tag' in str(e): print "Mismatched tag exception encountered in file '%s':%s" % ( path, str(e).split(':', 1)[1]) elif 'Failed to get a citation' in str(e): print "Exception in file '%s': %s" % (path, str(e)) else: # otherwise, print generic traceback print print "Exception encountered in file '%s':" % path print traceback.format_exc() print # status update count += 1 if count % status_interval == 0: if total: print "Finished %s out of %s files." % (count, total) else: print "Finished %s files." % count
def do_many(dir_path, limit=None, random_order=False, status_interval=100): """Runs through a directory of the form /data/[state]/[sub]/.../[folders]/[.xml documents]. Parses each .xml document, instantiates the associated model object, and saves the object. Prints/logs status updates and tracebacks instead of raising exceptions. :param dir_path: The directory. :param limit: A limit on how many files to run through. If None, will run through all (or if random order, forever). :param random_order: If true, will run through the directories and files in random order. :param status_interval: How often a status update will be given. """ if limit: total = limit elif not random_order: print "Getting an initial file count ..." total = 0 for _, _, file_names in os.walk(dir_path): total += len(fnmatch.filter(file_names, '*.xml')) else: total = None # go through the files, yielding parsed files and printing status updates as we go count = 0 for path in file_generator(dir_path, random_order, limit): # grab the fallback text from the path if it's there court_fallback = '' matches = re.compile('data/([a-z_]+?/[a-z_]+?)/').findall(path) if matches: court_fallback = matches[0] # try to parse/save the case and print any exceptions with full tracebacks try: parsed = parse_file(path, court_fallback=court_fallback) make_and_save(parsed) except Exception as e: # print simple exception summaries for known problems if 'mismatched tag' in str(e): print "Mismatched tag exception encountered in file '%s':%s" % (path, str(e).split(':', 1)[1]) elif 'Failed to get a citation' in str(e): print "Exception in file '%s': %s" % (path, str(e)) else: # otherwise, print generic traceback print print "Exception encountered in file '%s':" % path print traceback.format_exc() print # status update count += 1 if count % status_interval == 0: if total: print "Finished %s out of %s files." % (count, total) else: print "Finished %s files." % count
def do_many(dir_path, limit, random_order, status_interval, log_file, newcases,
            skipdupes, skip_newcases, avoid_nocites, courtdates, startfolder,
            startfile, debug):
    """Runs through a directory of the form /data/[state]/[sub]/.../[folders]/[.xml documents].
    Parses each .xml document, instantiates the associated model object, and
    saves the object. Prints/logs status updates and tracebacks instead of
    raising exceptions.

    :param dir_path: The directory.
    :param limit: A limit on how many files to run through. If None, will run
        through all (or if random order, forever).
    :param random_order: If true, will run through the directories and files
        in random order.
    :param status_interval: How often a status update will be given.
    :param log_file: If not None, file paths that raise Exceptions will be
        logged to this file.
    :param newcases: If true, skip court-years that already have data.
    :param skipdupes: If true, skip duplicates.
    :param skip_newcases: If true, skip cases imported under newcases.
    :param avoid_nocites: If true, skip cases from dates after any case with
        no cite.
    :param courtdates: If true, skip cases with dates before court
        established.
    :param startfolder: If not None, start on startfolder
    :param startfile: If not None, start on this file (for resuming)
    :param debug: Passed through to make_and_save; presumably suppresses
        writes -- TODO confirm against make_and_save.
    """
    if limit:
        total = limit
    elif not random_order:
        # count the .xml files up front so progress can be reported
        print("Getting an initial file count ...")
        print
        total = 0
        for _, _, file_names in os.walk(dir_path):
            total += len(fnmatch.filter(file_names, '*.xml'))
    else:
        # random order with no limit runs indefinitely; no meaningful total
        total = None
    # set up a dedicated file logger for problematic paths, if requested
    log = None
    if log_file:
        print("Logging problematic file paths to '%s' ..."
              % log_file)
        print
        log = logging.getLogger(__name__)
        log.setLevel(logging.INFO)
        log.addHandler(logging.FileHandler(log_file))
    # go through the files, yielding parsed files and printing status updates as
    # we go
    folders = glob(dir_path + '/*')
    folders.sort()
    count = 0
    # get earliest dates for each court
    if newcases:
        print('Only new cases: getting earliest dates by court.')
        min_dates = get_min_dates()
    else:
        min_dates = None
    if avoid_nocites:
        # these two options both repopulate min_dates and are mutually
        # exclusive
        if newcases:
            raise Exception(
                "Cannot use both avoid_nocites and newcases options.")
        print(
            'Avoiding no cites: getting earliest dates by court with no citation.'
        )
        min_dates = get_min_nocite()
    if courtdates:
        start_dates = get_courtdates()
    else:
        start_dates = None
    # check if skipping first columbias cases
    if skip_newcases:
        skiplist = get_path_list()
    else:
        skiplist = set()
    # start/resume functionality
    if startfolder is not None:
        skipfolder = True
    else:
        skipfolder = False
    if startfile is not None:
        skipfile = True
    else:
        skipfile = False
    for folder in folders:
        # skip whole folders until startfolder is reached (resume support)
        if skipfolder:
            if startfolder is not None:
                checkfolder = folder.split('/')[-1]
                if checkfolder == startfolder:
                    skipfolder = False
                else:
                    continue
        print(folder)
        for path in file_generator(folder, random_order, limit):
            # skip files until startfile is reached (resume support)
            if skipfile:
                if startfile is not None:
                    checkfile = path.split('/')[-1]
                    if checkfile == startfile:
                        skipfile = False
                    else:
                        continue
            # skip paths already imported under the newcases option
            if path in skiplist:
                continue
            # skip cases in 'misc*' folders -- they are relatively different
            # than the other cases, so we'll deal with them later
            if 'miscellaneous_court_opinions' in path:
                continue
            print(path)
            # try to parse/save the case and print any exceptions with full
            # tracebacks
            try:
                parsed = parse_file(path)
                make_and_save(parsed, skipdupes, min_dates, start_dates, debug)
            except Exception as e:
                # log the file name
                if log:
                    log.info(path)
                # print simple exception summaries for known problems
                known = [
                    'mismatched tag', 'Failed to get a citation',
                    'Failed to find a court ID',
                    'null value in column "date_filed"', 'duplicate(s)'
                ]
                if any(k in str(e) for k in known):
                    print
                    print "Known exception in file '%s':" % path
                    print str(e)
                    print
                else:
                    # otherwise, print generic traceback
                    print
                    print "Unknown exception in file '%s':" % path
                    print traceback.format_exc()
                    print
            # status update
            count += 1
            if count % status_interval == 0:
                print
                if total:
                    print "Finished %s out of %s files." % (count, total)
                else:
                    print "Finished %s files." % count
                print
def do_many(dir_path, limit, random_order, status_interval, log_file, newcases, skipdupes, skip_newcases, avoid_nocites, courtdates, startfolder, startfile, debug): """Runs through a directory of the form /data/[state]/[sub]/.../[folders]/[.xml documents]. Parses each .xml document, instantiates the associated model object, and saves the object. Prints/logs status updates and tracebacks instead of raising exceptions. :param dir_path: The directory. :param limit: A limit on how many files to run through. If None, will run through all (or if random order, forever). :param random_order: If true, will run through the directories and files in random order. :param status_interval: How often a status update will be given. :param log_file: If not None, file paths that raise Exceptions will be logged to this file. :param newcases: If true, skip court-years that already have data. :param skipdupes: If true, skip duplicates. :param skip_newcases: If true, skip cases imported under newcases. :param avoid_nocites: If true, skip cases from dates after any case with no cite. :param courtdates: If true, skip cases with dates before court established. :param startfolder: If not None, start on startfolder :param startfile: If not None, start on this file (for resuming) """ if limit: total = limit elif not random_order: print ("Getting an initial file count ...") print total = 0 for _, _, file_names in os.walk(dir_path): total += len(fnmatch.filter(file_names, '*.xml')) else: total = None log = None if log_file: print ("Logging problematic file paths to '%s' ..." 
% log_file) print log = logging.getLogger(__name__) log.setLevel(logging.INFO) log.addHandler(logging.FileHandler(log_file)) # go through the files, yielding parsed files and printing status updates as # we go folders = glob(dir_path+'/*') folders.sort() count = 0 # get earliest dates for each court if newcases: print('Only new cases: getting earliest dates by court.') min_dates = get_min_dates() else: min_dates = None if avoid_nocites: if newcases: raise Exception("Cannot use both avoid_nocites and newcases options.") print('Avoiding no cites: getting earliest dates by court with no citation.') min_dates = get_min_nocite() if courtdates: start_dates = get_courtdates() else: start_dates = None # check if skipping first columbias cases if skip_newcases: skiplist = get_path_list() else: skiplist = set() # start/resume functionality if startfolder is not None: skipfolder = True else: skipfolder = False if startfile is not None: skipfile = True else: skipfile = False for folder in folders: if skipfolder: if startfolder is not None: checkfolder = folder.split('/')[-1] if checkfolder == startfolder: skipfolder = False else: continue print(folder) for path in file_generator(folder, random_order, limit): if skipfile: if startfile is not None: checkfile = path.split('/')[-1] if checkfile == startfile: skipfile = False else: continue if path in skiplist: continue # skip cases in 'misc*' folders -- they are relatively different # than the other cases, so we'll deal with them later if 'miscellaneous_court_opinions' in path: continue print(path) # try to parse/save the case and print any exceptions with full # tracebacks try: parsed = parse_file(path) make_and_save(parsed, skipdupes, min_dates, start_dates, debug) except Exception as e: # log the file name if log: log.info(path) # print simple exception summaries for known problems known = [ 'mismatched tag', 'Failed to get a citation', 'Failed to find a court ID', 'null value in column "date_filed"', 'duplicate(s)' ] if any(k 
in str(e) for k in known): print print "Known exception in file '%s':" % path print str(e) print else: # otherwise, print generic traceback print print "Unknown exception in file '%s':" % path print traceback.format_exc() print # status update count += 1 if count % status_interval == 0: print if total: print "Finished %s out of %s files." % (count, total) else: print "Finished %s files." % count print
def do_many(
    dir_path,
    limit,
    random_order,
    status_interval,
    newcases,
    skipdupes,
    skip_newcases,
    avoid_nocites,
    courtdates,
    startfolder,
    startfile,
    debug,
):
    """Runs through a directory of the form
    /data/[state]/[sub]/.../[folders]/[.xml documents]. Parses each .xml
    document, instantiates the associated model object, and saves the
    object. Logs status updates and tracebacks instead of raising
    exceptions.

    :param dir_path: The directory.
    :param limit: A limit on how many files to run through. If None, will run
        through all (or if random order, forever).
    :param random_order: If true, will run through the directories and files
        in random order.
    :param status_interval: How often a status update will be given.
    :param newcases: If true, skip court-years that already have data.
    :param skipdupes: If true, skip duplicates.
    :param skip_newcases: If true, skip cases imported under newcases.
    :param avoid_nocites: If true, skip cases from dates after any case with
        no cite.
    :param courtdates: If true, skip cases with dates before court
        established.
    :param startfolder: If not None, start on startfolder
    :param startfile: If not None, start on this file (for resuming)
    :param debug: Forwarded to make_and_save.
    :raises ValueError: if both avoid_nocites and newcases are set.
    """
    # fail fast on incompatible options before doing any per-court lookups
    if avoid_nocites and newcases:
        # ValueError (a subclass of Exception) is still caught by existing
        # `except Exception` callers
        raise ValueError("Cannot use both avoid_nocites and newcases options.")

    if limit:
        total = limit
    elif not random_order:
        logger.info("Getting an initial file count...")
        total = 0
        for _, _, file_names in os.walk(dir_path):
            total += len(fnmatch.filter(file_names, "*.xml"))
    else:
        # random order with no limit runs indefinitely; no meaningful total
        total = None

    # go through the files, yielding parsed files and logging status updates
    # as we go
    folders = sorted(glob(f"{dir_path}/*"))
    count = 0

    # get earliest dates for each court
    if newcases:
        logger.info("Only new cases: getting earliest dates by court.")
        min_dates = get_min_dates()
    else:
        min_dates = None
    if avoid_nocites:
        logger.info("Avoiding no cites: getting earliest dates by court with "
                    "no citation.")
        min_dates = get_min_nocite()
    # earliest allowed filing date per court, when requested
    start_dates = get_courtdates() if courtdates else None
    # check if skipping first columbias cases
    skiplist = get_path_list() if skip_newcases else set()
    # start/resume functionality: keep skipping until the named folder/file
    # is seen
    skipfolder = startfolder is not None
    skipfile = startfile is not None

    # known failure modes that get a short summary instead of a traceback;
    # built once instead of on every exception
    known = (
        "mismatched tag",
        "Failed to get a citation",
        "Failed to find a court ID",
        'null value in column "date_filed"',
        "duplicate(s)",
    )

    for folder in folders:
        if skipfolder:
            # skipfolder is only ever True when startfolder is not None
            if folder.split("/")[-1] == startfolder:
                skipfolder = False
            else:
                continue
        logger.debug(folder)
        for path in file_generator(folder, random_order, limit):
            if skipfile:
                # skipfile is only ever True when startfile is not None
                if path.split("/")[-1] == startfile:
                    skipfile = False
                else:
                    continue
            if path in skiplist:
                continue
            # skip cases in 'misc*' folders -- they are relatively different
            # than the other cases, so we'll deal with them later
            if "miscellaneous_court_opinions" in path:
                continue
            logger.debug(path)
            # try to parse/save the case and show any exceptions with full
            # tracebacks
            try:
                parsed = parse_file(path)
                make_and_save(parsed, skipdupes, min_dates, start_dates, debug)
            except Exception as e:
                logger.info(path)
                # show simple exception summaries for known problems; lazy
                # %-style args avoid formatting when INFO is disabled
                if any(k in str(e) for k in known):
                    logger.info("Known exception in file '%s':", path)
                    logger.info(str(e))
                else:
                    logger.info("Unknown exception in file '%s':", path)
                    logger.info(traceback.format_exc())
            # status update
            count += 1
            if count % status_interval == 0:
                if total:
                    logger.info("Finished %s out of %s files.", count, total)
                else:
                    logger.info("Finished %s files.", count)