Example #1
def open_job_history(history_file):
    """
    Given a history file name, opens the corresponding SQLite db file and returns the handle
    """
    if not history_file:
        history_file = get_default_history_file_name()

    return dbdict.open(history_file, picklevalues=True)
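
# Usage sketch, not part of the original module: dbdict.open() returns a
# dict-like handle backed by SQLite (keys are file names, values are pickled
# checksum records); "my_history.sqlite" is a hypothetical file name.
#
#   job_history = open_job_history("my_history.sqlite")
#   "results/out.txt" in job_history     # was this output recorded as complete?
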
def needs_update_check_modify_time(*params, **kwargs):
    """
    Given input and output files, check whether they all exist and whether the output files are newer than the input files.
    Each can be:

        #. string: assumed to be a filename "file1"
        #. any other type
        #. arbitrary nested sequence of (1) and (2)

    """
    # conditions for rerunning a job:
    #   1. forced to rerun entire taskset
    #   2. 1+ Output files don't exist
    #   3. 1+ of input files is newer than 1+ output files  -- ruffus does this level right now...
    #   4. internal completion time for that file is out of date   # incomplete runs will be rerun automatically
    #   5. checksum of code that ran the file is out of date       # changes to function body result in rerun
    #   6. checksum of the args that ran the file are out of date  # appropriate config file changes result in rerun
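    #
    # (the CHECKSUM_* constants are assumed to be ordered integers, i.e.
    #  CHECKSUM_FILE_TIMESTAMPS < CHECKSUM_HISTORY_TIMESTAMPS
    #  < CHECKSUM_FUNCTIONS < CHECKSUM_FUNCTIONS_AND_PARAMS,
    #  so each ">=" comparison below enables the checks cumulatively)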
    try:
        task = kwargs['task']
    except KeyError:
        # allow the task not to be specified and fall back to classic
        # file timestamp behavior (either this or fix all the test cases,
        # which often don't have proper tasks)
        class Namespace:
            pass
        task = Namespace()
        task.checksum_level = CHECKSUM_FILE_TIMESTAMPS

    job_history = dbdict.open(RUFFUS_HISTORY_FILE, picklevalues=True)

    # missing output parameters means build
    if len(params) < 2:
        return True, "Missing output parameters"

    i, o = params[0:2]
    i = get_strings_in_nested_sequence(i)
    o = get_strings_in_nested_sequence(o)

    #
    # build: missing output file
    #
    if len(o) == 0:
        return True, "Missing output file"

    # missing input / output file means always build
    missing_files = []
    for io in (i, o):
        for p in io:
            if not os.path.exists(p):
                missing_files.append(p)
    if missing_files:
        return True, "Missing file%s [%s]" % ("s" if len(missing_files) > 1 else "",
                                              ", ".join(missing_files))

    # existing files, but from previous interrupted runs
    if task.checksum_level >= CHECKSUM_HISTORY_TIMESTAMPS:
        incomplete_files = []
        func_changed_files = []
        param_changed_files = []
        #for io in (i, o):
        #    for p in io:
        #        if p not in job_history:
        #            incomplete_files.append(p)
        for p in o:
            if p not in job_history:
                incomplete_files.append(p)
        if incomplete_files:
            return True, "Previous incomplete run leftover%s: [%s]" % ("s" if len(incomplete_files) > 1 else "",
                                                                       ", ".join(incomplete_files))
        # check if function that generated our output file has changed
        for p in o:
            old_chksum = job_history[p]
            new_chksum = JobHistoryChecksum(p, None, params[2:], task)
            if task.checksum_level >= CHECKSUM_FUNCTIONS_AND_PARAMS and \
                            new_chksum.chksum_params != old_chksum.chksum_params:
                param_changed_files.append(p)
            elif task.checksum_level >= CHECKSUM_FUNCTIONS and \
                            new_chksum.chksum_func != old_chksum.chksum_func:
                func_changed_files.append(p)
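        # note: when both the params and the function changed for a file, only
        # the "parameters changed" branch fires because of the elif above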

        if func_changed_files:
            return True, "Pipeline function has changed: [%s]" % (", ".join(func_changed_files))
        if param_changed_files:
            return True, "Pipeline parameters have changed: [%s]" % (", ".join(param_changed_files))

    #
    #   missing input -> build only if output absent or function is out of date;
    #   both of those cases were handled above, so the job is up to date here
    #
    if len(i) == 0:
        return False, "Missing input files"


    #
    #   get sorted modified times for all input and output files
    #
    filename_to_times = [[], []]
    file_times = [[], []]



    #_____________________________________________________________________________________

    #   pretty_io_with_date_times

    #_____________________________________________________________________________________
    def pretty_io_with_date_times (filename_to_times):

        # sort input and output lists by modification time
        for io in range(2):
            filename_to_times[io].sort()


        #
        #   add asterisk for all files which are causing this job to be out of date
        #
        file_name_to_asterisk = dict()
        oldest_output_mtime = filename_to_times[1][0][0]
        for mtime, file_name in filename_to_times[0]:
            file_name_to_asterisk[file_name] = "*" if mtime >= oldest_output_mtime else " "
        # n.b. this is the newest *input* mtime, used to flag out-of-date outputs
        newest_input_mtime = filename_to_times[0][-1][0]
        for mtime, file_name in filename_to_times[1]:
            file_name_to_asterisk[file_name] = "*" if mtime <= newest_input_mtime else " "



        #
        #   try to fit in 100 - 15 = 85 char lines
        #   date time ~ 25 characters so limit file name to 55 characters
        #
        msg = "\n"
        category_names = "Input", "Output"
        for io in range(2):
            msg += "  %s files:\n" % category_names[io]
            for mtime, file_name in filename_to_times[io]:
                file_datetime_str = epoch_seconds_to_str(mtime)
                msg += ("   " +                                         # indent
                        file_name_to_asterisk[file_name] + " " +        # asterisked out of date files
                        file_datetime_str + ": " +                      # date time of file
                        get_readable_path_str(file_name, 55) + "\n")    # file name truncated to 55
        return msg
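
    # The rendered message looks roughly like this (hypothetical files and
    # times; "*" marks the files that make the job out of date):
    #
    #   Input files:
    #    * 2024-01-01 12:00:00: data/in.txt
    #   Output files:
    #    * 2024-01-01 11:00:00: results/out.txt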


    #
    #   Ignore an output file if it also appears in the list of input files:
    #       by definition the two have the same timestamp,
    #       so the job would otherwise always appear to be out of date
    #
    #   Symbolic links are followed
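    #   (e.g. a task that updates a file in place lists the same name as both
    #    input and output; without this rule the job would rerun every time)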
    real_input_file_names = set()
    for input_file_name in i:
        real_input_file_names.add(os.path.realpath(input_file_name))
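        # take the newer of the on-disk mtime and the recorded history mtime,
        # presumably erring toward treating inputs as recent (favouring reruns)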
        if task.checksum_level >= CHECKSUM_HISTORY_TIMESTAMPS and input_file_name in job_history:
            mtime = max(os.path.getmtime(input_file_name), job_history[input_file_name].mtime)
        else:
            mtime = os.path.getmtime(input_file_name)
        filename_to_times[0].append((mtime, input_file_name))
        file_times[0].append(mtime)


    # for output files, we need to check modification time *in addition* to
    # function and argument checksums...
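    # here min() is the conservative choice: use the older of the on-disk
    # mtime and the recorded completion time, so that touching an output
    # file after the run cannot hide an out-of-date job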
    for output_file_name in o:
        real_file_name = os.path.realpath(output_file_name)
        if task.checksum_level >= CHECKSUM_HISTORY_TIMESTAMPS:
            old_chksum = job_history[output_file_name]
            mtime = min(os.path.getmtime(output_file_name), old_chksum.mtime)
        else:
            mtime = os.path.getmtime(output_file_name)
        if real_file_name not in real_input_file_names:
            file_times[1].append(mtime)
        filename_to_times[1].append((mtime, output_file_name))


    #
    #   Debug: Force print modified file names and times
    #
    #if len(file_times[0]) and len(file_times[1]):
    #    print(pretty_io_with_date_times(filename_to_times), file_times,
    #          (max(file_times[0]) >= min(file_times[1])), file=sys.stderr)
    #else:
    #    print(i, o, file=sys.stderr)

    #
    #   update if any input file >= (more recent) output file
    #
    if file_times[0] and file_times[1] and max(file_times[0]) >= min(file_times[1]):
        return True, pretty_io_with_date_times(filename_to_times)
    return False, "Up to date"
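

# Minimal usage sketch, not part of the original module. File names are
# hypothetical; "task" is omitted, so the function falls back to plain
# file-timestamp comparison (CHECKSUM_FILE_TIMESTAMPS):
#
#   update_needed, reason = needs_update_check_modify_time(
#       ["data/in.txt"],          # inputs: strings or nested sequences
#       ["results/out.txt"])      # outputs
#   if update_needed:
#       print("job must rerun:", reason)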