def do_many(
    dir_path,
    limit,
    random_order,
    status_interval,
    newcases,
    skipdupes,
    skip_newcases,
    avoid_nocites,
    courtdates,
    startfolder,
    startfile,
    debug,
):
    """Runs through a directory of the form
    /data/[state]/[sub]/.../[folders]/[.xml documents]. Parses each .xml
    document, instantiates the associated model object, and saves the
    object. Logs status updates and tracebacks instead of raising
    exceptions.

    :param dir_path: The directory.
    :param limit: A limit on how many files to run through. If None, will
        run through all (or if random order, forever).
    :param random_order: If true, will run through the directories and
        files in random order.
    :param status_interval: How often a status update will be given.
    :param newcases: If true, skip court-years that already have data.
    :param skipdupes: If true, skip duplicates.
    :param skip_newcases: If true, skip cases imported under newcases.
    :param avoid_nocites: If true, skip cases from dates after any case
        with no cite.
    :param courtdates: If true, skip cases with dates before court
        established.
    :param startfolder: If not None, start on startfolder.
    :param startfile: If not None, start on this file (for resuming).
    :raises Exception: if both avoid_nocites and newcases are requested.
    """
    # Fail fast on mutually exclusive modes, before any expensive setup
    # (the original raised only after counting files and querying dates).
    if avoid_nocites and newcases:
        raise Exception(
            "Cannot use both avoid_nocites and newcases options.")

    # Work out the expected file count so status updates can show progress.
    if limit:
        total = limit
    elif not random_order:
        logger.info("Getting an initial file count...")
        total = 0
        for _, _, file_names in os.walk(dir_path):
            total += len(fnmatch.filter(file_names, "*.xml"))
    else:
        # Random order with no limit runs indefinitely; no total to report.
        total = None

    # Go through the files, yielding parsed files and printing status
    # updates as we go.
    folders = sorted(glob(f"{dir_path}/*"))
    count = 0

    # Earliest acceptable date per court, depending on the selected mode.
    if newcases:
        logger.info("Only new cases: getting earliest dates by court.")
        min_dates = get_min_dates()
    elif avoid_nocites:
        logger.info("Avoiding no cites: getting earliest dates by court with "
                    "no citation.")
        min_dates = get_min_nocite()
    else:
        min_dates = None

    start_dates = get_courtdates() if courtdates else None

    # Paths already imported under the newcases option, to be skipped.
    skiplist = get_path_list() if skip_newcases else set()

    # Start/resume functionality: skip folders/files until the requested
    # starting point is reached.
    skipfolder = startfolder is not None
    skipfile = startfile is not None

    for folder in folders:
        if skipfolder:
            if folder.split("/")[-1] != startfolder:
                continue
            skipfolder = False
        logger.debug(folder)
        for path in file_generator(folder, random_order, limit):
            if skipfile:
                if path.split("/")[-1] != startfile:
                    continue
                skipfile = False
            if path in skiplist:
                continue
            # Skip cases in 'misc*' folders -- they are relatively different
            # than the other cases, so we'll deal with them later.
            if "miscellaneous_court_opinions" in path:
                continue
            logger.debug(path)
            # Try to parse/save the case and log (rather than raise) any
            # exception so one bad file cannot abort the whole run.
            try:
                parsed = parse_file(path)
                make_and_save(parsed, skipdupes, min_dates, start_dates, debug)
            except Exception as e:
                logger.info(path)
                # Compute the message once instead of per known pattern.
                message = str(e)
                # Show simple exception summaries for known problems.
                known = (
                    "mismatched tag",
                    "Failed to get a citation",
                    "Failed to find a court ID",
                    'null value in column "date_filed"',
                    "duplicate(s)",
                )
                if any(k in message for k in known):
                    logger.info(f"Known exception in file '{path}':")
                    logger.info(message)
                else:
                    # Unknown problem: keep the full traceback for debugging.
                    logger.info(f"Unknown exception in file '{path}':")
                    logger.info(traceback.format_exc())
            # Status update.
            count += 1
            if count % status_interval == 0:
                if total:
                    logger.info(f"Finished {count} out of {total} files.")
                else:
                    logger.info(f"Finished {count} files.")
def do_many(dir_path, limit, random_order, status_interval, newcases,
            skipdupes, skip_newcases, avoid_nocites, courtdates, startfolder,
            startfile, debug):
    """Walk a tree of the form /data/[state]/[sub]/.../[folders]/[.xml
    documents], parse every .xml document, build the associated model
    object, and save it. Failures are logged (with tracebacks for unknown
    problems) instead of raised, and progress messages are emitted
    periodically.

    :param dir_path: Root directory to walk.
    :param limit: Maximum number of files to process; None means all (or,
        with random_order, run forever).
    :param random_order: Traverse directories and files in random order.
    :param status_interval: Emit a progress message every this-many files.
    :param newcases: Skip court-years that already have data.
    :param skipdupes: Skip duplicates.
    :param skip_newcases: Skip cases already imported under newcases.
    :param avoid_nocites: Skip cases dated after any case with no cite.
    :param courtdates: Skip cases dated before the court was established.
    :param startfolder: If not None, begin at this folder.
    :param startfile: If not None, begin at this file (for resuming).
    """
    # Determine how many files we expect, so progress lines show a total.
    if limit:
        total = limit
    elif not random_order:
        logger.info("Getting an initial file count...")
        total = 0
        for _, _, file_names in os.walk(dir_path):
            total += len(fnmatch.filter(file_names, '*.xml'))
    else:
        total = None

    # Walk folders in sorted order, parsing files and reporting as we go.
    folders = sorted(glob(dir_path + '/*'))
    count = 0

    # Per-court earliest-date cutoffs, depending on the selected mode.
    min_dates = None
    if newcases:
        logger.info('Only new cases: getting earliest dates by court.')
        min_dates = get_min_dates()
    if avoid_nocites:
        if newcases:
            raise Exception("Cannot use both avoid_nocites and newcases options.")
        logger.info('Avoiding no cites: getting earliest dates by court with '
                    'no citation.')
        min_dates = get_min_nocite()

    start_dates = get_courtdates() if courtdates else None

    # Paths imported during the first columbia pass, to be skipped.
    skiplist = get_path_list() if skip_newcases else set()

    # Resume support: keep skipping until startfolder/startfile is reached.
    skipfolder = startfolder is not None
    skipfile = startfile is not None

    for folder in folders:
        if skipfolder:
            # Still searching for the resume folder.
            if folder.split('/')[-1] != startfolder:
                continue
            skipfolder = False
        logger.debug(folder)
        for path in file_generator(folder, random_order, limit):
            if skipfile:
                # Still searching for the resume file.
                if path.split('/')[-1] != startfile:
                    continue
                skipfile = False
            if path in skiplist:
                continue
            # 'misc*' folders hold relatively different cases; they are
            # dealt with later.
            if 'miscellaneous_court_opinions' in path:
                continue
            logger.debug(path)
            # Parse and save; surface exceptions in the log rather than
            # letting them propagate.
            try:
                make_and_save(parse_file(path), skipdupes, min_dates,
                              start_dates, debug)
            except Exception as e:
                logger.info(path)
                # Known problems get a short summary instead of a traceback.
                known = [
                    'mismatched tag',
                    'Failed to get a citation',
                    'Failed to find a court ID',
                    'null value in column "date_filed"',
                    'duplicate(s)'
                ]
                if any(needle in str(e) for needle in known):
                    logger.info("Known exception in file '%s':" % path)
                    logger.info(str(e))
                else:
                    logger.info("Unknown exception in file '%s':" % path)
                    logger.info(traceback.format_exc())
            # Progress reporting.
            count += 1
            if count % status_interval == 0:
                if total:
                    logger.info("Finished %s out of %s files." % (count, total))
                else:
                    logger.info("Finished %s files." % count)