Ejemplo n.º 1
0
def do_many(
    dir_path,
    limit,
    random_order,
    status_interval,
    newcases,
    skipdupes,
    skip_newcases,
    avoid_nocites,
    courtdates,
    startfolder,
    startfile,
    debug,
):
    """Runs through a directory of the form /data/[state]/[sub]/.../[folders]/[.xml documents].
    Parses each .xml document, instantiates the associated model object, and
    saves the object. Prints/logs status updates and tracebacks instead of
    raising exceptions.

    :param dir_path: The directory.
    :param limit: A limit on how many files to run through. If None, will run
    through all (or if random order, forever).
    :param random_order: If true, will run through the directories and files in
    random order.
    :param status_interval: How often a status update will be given.
    :param newcases: If true, skip court-years that already have data.
    :param skipdupes: If true, skip duplicates.
    :param skip_newcases: If true, skip cases imported under newcases.
    :param avoid_nocites: If true, skip cases from dates after any case with no cite.
    :param courtdates: If true, skip cases with dates before court established.
    :param startfolder: If not None, start on startfolder
    :param startfile: If not None, start on this file (for resuming)
    :param debug: Passed through to make_and_save (presumably a dry-run flag
    that suppresses saving -- confirm against make_and_save).
    :raises ValueError: If both avoid_nocites and newcases are requested.
    """
    # Figure out the total number of files for progress reporting. With
    # random_order and no limit the run is unbounded, so no total is known.
    if limit:
        total = limit
    elif not random_order:
        logger.info("Getting an initial file count...")
        total = 0
        for _, _, file_names in os.walk(dir_path):
            total += len(fnmatch.filter(file_names, "*.xml"))
    else:
        total = None
    # go through the files, yielding parsed files and printing status updates as
    # we go
    folders = glob(f"{dir_path}/*")
    folders.sort()
    count = 0

    # get earliest dates for each court
    if newcases:
        logger.info("Only new cases: getting earliest dates by court.")
        min_dates = get_min_dates()
    else:
        min_dates = None

    if avoid_nocites:
        if newcases:
            # ValueError is a subclass of Exception, so existing callers that
            # catch Exception still work.
            raise ValueError(
                "Cannot use both avoid_nocites and newcases options.")
        logger.info("Avoiding no cites: getting earliest dates by court with "
                    "no citation.")
        min_dates = get_min_nocite()

    if courtdates:
        start_dates = get_courtdates()
    else:
        start_dates = None

    # check if skipping first columbias cases

    if skip_newcases:
        skiplist = get_path_list()
    else:
        skiplist = set()

    # start/resume functionality: skip everything until the requested
    # folder/file is reached, then process normally from there on.
    if startfolder is not None:
        skipfolder = True
    else:
        skipfolder = False
    if startfile is not None:
        skipfile = True
    else:
        skipfile = False

    for folder in folders:
        if skipfolder:
            if startfolder is not None:
                checkfolder = folder.split("/")[-1]
                if checkfolder == startfolder:
                    skipfolder = False
                else:
                    continue
        logger.debug(folder)

        for path in file_generator(folder, random_order, limit):

            if skipfile:
                if startfile is not None:
                    checkfile = path.split("/")[-1]
                    if checkfile == startfile:
                        skipfile = False
                    else:
                        continue

            if path in skiplist:
                continue

            # skip cases in 'misc*' folders -- they are relatively different
            # than the other cases, so we'll deal with them later
            if "miscellaneous_court_opinions" in path:
                continue

            logger.debug(path)

            # try to parse/save the case and show any exceptions with full
            # tracebacks
            try:
                parsed = parse_file(path)
                make_and_save(parsed, skipdupes, min_dates, start_dates, debug)
            except Exception as e:
                logger.info(path)
                # show simple exception summaries for known problems
                known = [
                    "mismatched tag",
                    "Failed to get a citation",
                    "Failed to find a court ID",
                    'null value in column "date_filed"',
                    "duplicate(s)",
                ]
                if any(k in str(e) for k in known):
                    logger.info(f"Known exception in file '{path}':")
                    logger.info(str(e))
                else:
                    logger.info(f"Unknown exception in file '{path}':")
                    logger.info(traceback.format_exc())

            # status update -- count per *file*, inside the inner loop, so it
            # matches `total` (a file count). Previously this sat at folder
            # level and counted folders while reporting "files".
            count += 1
            if count % status_interval == 0:
                if total:
                    logger.info(f"Finished {count} out of {total} files.")
                else:
                    logger.info(f"Finished {count} files.")
def do_many(dir_path, limit, random_order, status_interval,
            newcases, skipdupes, skip_newcases, avoid_nocites, courtdates,
            startfolder, startfile, debug):
    """Runs through a directory of the form /data/[state]/[sub]/.../[folders]/[.xml documents].
    Parses each .xml document, instantiates the associated model object, and
    saves the object. Prints/logs status updates and tracebacks instead of
    raising exceptions.

    :param dir_path: The directory.
    :param limit: A limit on how many files to run through. If None, will run
    through all (or if random order, forever).
    :param random_order: If true, will run through the directories and files in
    random order.
    :param status_interval: How often a status update will be given.
    :param newcases: If true, skip court-years that already have data.
    :param skipdupes: If true, skip duplicates.
    :param skip_newcases: If true, skip cases imported under newcases.
    :param avoid_nocites: If true, skip cases from dates after any case with no cite.
    :param courtdates: If true, skip cases with dates before court established.
    :param startfolder: If not None, start on startfolder
    :param startfile: If not None, start on this file (for resuming)
    :param debug: Passed through to make_and_save (presumably a dry-run flag
    that suppresses saving -- confirm against make_and_save).
    :raises ValueError: If both avoid_nocites and newcases are requested.
    """
    # Figure out the total number of files for progress reporting. With
    # random_order and no limit the run is unbounded, so no total is known.
    if limit:
        total = limit
    elif not random_order:
        logger.info("Getting an initial file count...")
        total = 0
        for _, _, file_names in os.walk(dir_path):
            total += len(fnmatch.filter(file_names, '*.xml'))
    else:
        total = None
    # go through the files, yielding parsed files and printing status updates as
    # we go
    folders = glob(dir_path+'/*')
    folders.sort()
    count = 0

    # get earliest dates for each court
    if newcases:
        logger.info('Only new cases: getting earliest dates by court.')
        min_dates = get_min_dates()
    else:
        min_dates = None

    if avoid_nocites:
        if newcases:
            # ValueError is a subclass of Exception, so existing callers that
            # catch Exception still work.
            raise ValueError("Cannot use both avoid_nocites and newcases options.")
        logger.info('Avoiding no cites: getting earliest dates by court with '
                    'no citation.')
        min_dates = get_min_nocite()

    if courtdates:
        start_dates = get_courtdates()
    else:
        start_dates = None

    # check if skipping first columbias cases

    if skip_newcases:
        skiplist = get_path_list()
    else:
        skiplist = set()

    # start/resume functionality: skip everything until the requested
    # folder/file is reached, then process normally from there on.
    if startfolder is not None:
        skipfolder = True
    else:
        skipfolder = False
    if startfile is not None:
        skipfile = True
    else:
        skipfile = False

    for folder in folders:
        if skipfolder:
            if startfolder is not None:
                checkfolder = folder.split('/')[-1]
                if checkfolder == startfolder:
                    skipfolder = False
                else:
                    continue
        logger.debug(folder)

        for path in file_generator(folder, random_order, limit):

            if skipfile:
                if startfile is not None:
                    checkfile = path.split('/')[-1]
                    if checkfile == startfile:
                        skipfile = False
                    else:
                        continue

            if path in skiplist:
                continue

            # skip cases in 'misc*' folders -- they are relatively different
            # than the other cases, so we'll deal with them later
            if 'miscellaneous_court_opinions' in path:
                continue

            logger.debug(path)

            # try to parse/save the case and show any exceptions with full
            # tracebacks
            try:
                parsed = parse_file(path)
                make_and_save(parsed, skipdupes, min_dates, start_dates, debug)
            except Exception as e:
                logger.info(path)
                # show simple exception summaries for known problems
                known = [
                    'mismatched tag', 'Failed to get a citation',
                    'Failed to find a court ID',
                    'null value in column "date_filed"', 'duplicate(s)'
                ]
                if any(k in str(e) for k in known):
                    logger.info("Known exception in file '%s':" % path)
                    logger.info(str(e))
                else:
                    logger.info("Unknown exception in file '%s':" % path)
                    logger.info(traceback.format_exc())

            # status update -- count per *file*, inside the inner loop, so it
            # matches `total` (a file count). Previously this sat at folder
            # level and counted folders while reporting "files".
            count += 1
            if count % status_interval == 0:
                if total:
                    logger.info("Finished %s out of %s files." % (count, total))
                else:
                    logger.info("Finished %s files." % count)