Example #1
def synchronize_files(inputpaths, outpath, database=None, tqdm_bar=None, report_file=None, ptee=None, verbose=False):
    ''' Main function to synchronize file contents by majority vote.
    The main job of this function is to walk through the input folders and align the files, so that we can compare every file across all folders, one by one.
    The whole trick here is the alignment: it lets us avoid loading all the files into memory while still comparing all equivalent files together. To do that, we walk through the input directories in alphabetical order and always pick the relative filepath that comes first alphabetically; this aligns the files between the different folders without memorizing the whole tree structures.
    '''
    # (Generator) Files Synchronization Algorithm:
    # Needs a function stable_dir_walking, which will walk through directories recursively, always in the same order on all platforms (same order for files but also for folders); any order works, as long as it is stable.
    # Until there's no file in any of the input folders to be processed:
    # - curfiles <- load first file for each folder by using stable_dir_walking on each input folder.
    # - curfiles_grouped <- group curfiles_ordered:
    #    * curfiles_ordered <- order curfiles alphabetically (need to separate the relative parent directory and the filename, to account for both without ambiguity)
    #    * curfiles_grouped <- empty list
    #    * curfiles_grouped[0] = add first element in curfiles_ordered
    #    * last_group = 0
    #    * for every subsequent element nextelt in curfiles_ordered:
    #        . if nextelt == curfiles_grouped[last_group][0]: add nextelt into curfiles_grouped[last_group] (the latest group in curfiles_grouped)
    #        . else: create a new group in curfiles_grouped (last_group += 1) and add nextelt into curfiles_grouped[last_group]
    # At this stage, curfiles_grouped[0] should contain a group of files with the same relative filepath from different input folders, and since we used stable_dir_walking, we are guaranteed that this file is the next to be processed in alphabetical order.
    # - Majority vote byte-by-byte for each of curfiles_grouped[0], and output winning byte to the output file.
    # - Update files list alignment: we now drop the files of curfiles_grouped[0] from curfiles and replace them with the next file from each respective folder. Since we processed in alphabetical (or whatever stable) order, the next loaded files will match the files in the other curfiles_grouped groups that we could not process before.
    # At this point (after the loop), all input files have been processed in order, without maintaining the whole files list in memory, just one file per input folder. (A minimal sketch of the grouping step is given after this function.)

    # Init files walking generator for each inputpaths
    recgen = [recwalk(path, sorting=True) for path in inputpaths]
    curfiles = {}
    recgen_exhausted = {}
    recgen_exhausted_count = 0
    nbpaths = len(inputpaths)
    retcode = 0

    if not ptee: ptee = sys.stdout

    # Open report file and write header
    if report_file is not None:
        rfile = open(report_file, 'wb')
        r_writer = csv.writer(rfile, delimiter='|', lineterminator='\n', quotechar='"')
        r_header = ["filepath"] + ["dir%i" % (i+1) for i in xrange(nbpaths)] + ["hash-correct", "error_code", "errors"]
        r_length = len(r_header)
        r_writer.writerow(r_header)

    # Initialization: load the first batch of files, one for each folder
    for i in xrange(len(recgen)):
        recgen_exhausted[i] = False
        try:
            if curfiles.get(i, None) is None:
                curfiles[i] = relpath_posix(recgen[i].next(), inputpaths[i])[1]
        except StopIteration:
            recgen_exhausted[i] = True
            recgen_exhausted_count += 1

    # Files lists alignment loop
    while recgen_exhausted_count < nbpaths:
        errcode = 0
        errmsg = None

        # Init a new report's row
        if report_file: r_row = ["-"] * r_length

        # -- Group equivalent relative filepaths together
        #print curfiles # debug
        curfiles_grouped = sort_group(curfiles, True)

        # -- Extract first group of equivalent filepaths (this allows us to process with the same alphabetical order on all platforms)
        # Note that the remaining files in the other groups will be processed later, because their alphabetical order is higher than the first group's, which means the first group must be processed now
        to_process = curfiles_grouped[0]
        #print to_process # debug

        # -- Byte-by-byte majority vote on the first group of files
        # Need the relative filepath also (note that there's only one since it's a group of equivalent relative filepaths: only the absolute path differs between files of the same group)
        relfilepath = path2unix(os.path.join(*to_process[0][1]))
        if report_file: r_row[0] = relfilepath
        if verbose: ptee.write("- Processing file %s." % relfilepath)
        # Generate output path
        outpathfull = os.path.join(outpath, relfilepath)
        create_dir_if_not_exist(os.path.dirname(outpathfull))
        # Initialize the list of absolute filepaths
        fileslist = []
        for elt in to_process:
            i = elt[0]
            fileslist.append(os.path.join(inputpaths[i], os.path.join(*elt[1])))
            if report_file: r_row[i+1] = 'X' # put an X in the report file below each folder that contains this file
        # If there's only one file, just copy it over
        if len(to_process) == 1:
            shutil.copyfile(fileslist[0], outpathfull)
            id = to_process[0][0]
            if report_file: r_row[id+1] = 'O'
        # Else, merge by majority vote
        else:
            # Before-merge check using rfigc database, if provided
            # If one of the files in the input folders is already correct, just copy it over
            correct_file = None
            if database:
                for id, filepath in enumerate(fileslist):
                    if rfigc.main("-i \"%s\" -d \"%s\" -m --silent" % (filepath, database)) == 0:
                        correct_file = filepath
                        correct_id = to_process[id][0]
                        break

            # If one correct file was found, copy it over
            if correct_file:
                create_dir_if_not_exist(os.path.dirname(outpathfull))
                shutil.copyfile(correct_file, outpathfull)
                if report_file:
                    r_row[correct_id+1] = "O"
                    r_row[-3] = "OK"
            # Else, we need to do the majority vote merge
            else:
                # Do the majority vote merge
                errcode, errmsg = majority_vote_byte_scan(relfilepath, fileslist, outpath)

        # After-merge/move check using rfigc database, if provided
        if database:
            if rfigc.main("-i \"%s\" -d \"%s\" -m --silent" % (outpathfull, database)) == 1:
                errcode = 1
                if report_file: r_row[-3] = "KO"  # r_row only exists when a report file was requested
                if not errmsg: errmsg = ''
                errmsg += " File could not be totally repaired according to rfigc database."
            else:
                if report_file: r_row[-3] = "OK"
                if errmsg: errmsg += " But merged file is correct according to rfigc database."

        # Display errors if any
        if errcode:
            if report_file:
                r_row[-2] = "KO"
                r_row[-1] = errmsg
            ptee.write(errmsg)
            retcode = 1
        else:
            if report_file: r_row[-2] = "OK"

        # Save current report's row
        if report_file:
            r_writer.writerow(r_row)

        # -- Update files lists alignment (ie, retrieve new files while trying to keep the alignment)
        for elt in to_process:  # for files of the first group (the ones we processed)
            i = elt[0]
            # Walk their respective folders and load up the next file
            try:
                if not recgen_exhausted.get(i, False):
                    curfiles[i] = relpath_posix(recgen[i].next(), inputpaths[i])[1]
            # If there's no file left in this folder, mark this input folder as exhausted and continue with the others
            except StopIteration:
                curfiles[i] = None
                recgen_exhausted[i] = True
                recgen_exhausted_count += 1
        if tqdm_bar: tqdm_bar.update()
    if tqdm_bar: tqdm_bar.close()

    # Closing report file
    if report_file:
        # Write list of directories and legend
        rfile.write("\n=> Input directories:")
        for id, ipath in enumerate(inputpaths):
            rfile.write("\n\t- dir%i = %s" % ((id+1), ipath))
        rfile.write("\n=> Output directory: %s" % outpath)
        rfile.write("\n=> Legend: X=existing/selected for majority vote, O=only used this file, - = not existing, OK = check correct, KO = check incorrect (file was not recovered)\n")
        # Close the report file handle
        rfile.close()

    return retcode
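
The grouping step delegated to sort_group() above can be illustrated with a minimal sketch. This is an assumed reconstruction from the algorithm comments, not the actual helper: it takes the curfiles dict mapping a folder index to a relative filepath (or None when that folder is exhausted), orders the pending files, and groups identical relative filepaths coming from different folders.

import itertools

def sort_group_sketch(curfiles, only_first=False):
    '''Hypothetical sketch of the grouping step (the real sort_group may differ).'''
    # Keep only the folders that still have a pending file
    pending = [(i, relfile) for i, relfile in sorted(curfiles.items()) if relfile is not None]
    # Order by relative filepath, so that every platform processes files in the same order
    pending.sort(key=lambda item: item[1])
    # Group together identical relative filepaths coming from different input folders
    grouped = [list(group) for _, group in itertools.groupby(pending, key=lambda item: item[1])]
    return grouped[:1] if only_first else grouped

# Folders 0 and 2 both hold "a/x.txt" next, while folder 1 holds "b/y.txt":
# sort_group_sketch({0: ('a', 'x.txt'), 1: ('b', 'y.txt'), 2: ('a', 'x.txt')})
# -> [[(0, ('a', 'x.txt')), (2, ('a', 'x.txt'))], [(1, ('b', 'y.txt'))]]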
Example #2
def majority_vote_byte_scan(relfilepath, fileslist, outpath, blocksize=65535, default_char_null=False):
    '''Takes a list of files (filepaths or file handles) representing the same data, and disambiguates them by majority vote: for each position in the content, if the byte is not the same across all entries, we keep the majority one. If there is no majority (full ambiguity), the byte of the first file is used by default, or a null byte (or any custom character) if default_char_null is set, because we can't know which of the entries is correct about this byte.
    relfilepath is the filename or the file path relative to the parent directory (ie, the relative path, so that we can compare the files from several directories).'''
    # The idea of replication combined with ECC was a bit inspired by this paper: Friedman, Roy, Yoav Kantor, and Amir Kantor. "Combining Erasure-Code and Replication Redundancy Schemes for Increased Storage and Repair Efficiency in P2P Storage Systems.", 2013, Technion, Computer Science Department, Technical Report CS-2013-03
    # But it is a very well known concept in redundancy engineering, usually called triple-modular redundancy (which is here extended to n-modular since we can supply any number of files we want, not just three).
    # Preference in case of ambiguity is always given to the file of the first folder.

    fileshandles = []
    for filepath in fileslist:
        if filepath:
            # Already a file handle? Just store it in the fileshandles list
            if hasattr(filepath, 'read'):
                fileshandles.append(filepath)
            # Else it's a string filepath, open the file
            else:
                fileshandles.append(open(filepath, 'rb'))

    # Create and open output (merged) file, except if we were already given a file handle
    if hasattr(outpath, 'write'):
        outfile = outpath
    else:
        outpathfull = os.path.join(outpath, relfilepath)
        pardir = os.path.dirname(outpathfull)
        if not os.path.exists(pardir):
            os.makedirs(pardir)
        outfile = open(outpathfull, 'wb')

    # Cannot vote if there are fewer than 3 files!
    # In this case, just copy the file from the first folder, verbatim
    if len(fileshandles) < 3:
        # If there's at least one input file, then copy it verbatim to the output folder
        if fileshandles:
            if not hasattr(outpath, 'write'):  # outpathfull is defined only when outpath is a path, not an already opened file handle
                create_dir_if_not_exist(os.path.dirname(outpathfull))
            buf = 1
            while buf:
                buf = fileshandles[0].read()
                outfile.write(buf)
                outfile.flush()
        # Close our handles before returning, mirroring the end of this function
        for fh in fileshandles:
            fh.close()
        if outfile != outpath:
            outfile.close()
        return (1, "Error with file %s: only %i copies available, cannot vote (need at least 3)! Copied the first file from the first folder, verbatim." % (relfilepath, len(fileshandles)))

    errors = []
    entries = [1]*len(fileshandles)  # init with a non-empty placeholder so the while loop below starts
    while (entries.count('') < len(fileshandles)):
        final_entry = []
        # Read a block from all input files into memory
        for i in xrange(len(fileshandles)):
            entries[i] = fileshandles[i].read(blocksize)

        # End of file for all files, we exit
        if entries.count('') == len(fileshandles):
            break
        # Else if there's only one file, just copy the file's content over
        elif len(entries) == 1:
            final_entry = entries[0]

        # Else, do the majority vote
        else:
            # Walk along each column (imagine the strings being rows in a matrix; at each iteration we pick one column = all the characters at position i of each string), so that we can compare these characters easily
            for i in xrange(max(len(entry) for entry in entries)):
                hist = {} # kind of histogram: we count how many times each character appears at position i across all strings TODO: use collections.Counter instead of dict()?
                # Extract the character at position i of each string and compute the histogram at the same time (number of times this character appears among all strings at this position i)
                for entry in entries:
                    # Check that we are not beyond the current entry's length
                    if i < len(entry): # TODO: check this line, this should allow the vote to continue even if some files are shorter than others
                        # Extract the character and use it to contribute to the histogram
                        # TODO: add a warning message when one file is not the same size as the others
                        key = str(ord(entry[i])) # convert to the byte's ordinal value to avoid any funky problems with encoding in dict keys
                        hist[key] = hist.get(key, 0) + 1 # increment the histogram for this value, defaulting to 0 if the key does not exist yet (essentially equivalent to hist[key] += 1 but without a KeyError for new keys)
                # If there's only one character (it's the same across all strings at position i), then it's an exact match: we just save the character and skip to the next iteration
                if len(hist) == 1:
                    final_entry.append(chr(int(hist.iterkeys().next())))
                    continue
                # Else, the character differs between entries: we will pick the majority one (the mode)
                elif len(hist) > 1:
                    # Sort the dict by value (reversed, because we want the most frequent first)
                    skeys = sorted(hist, key=hist.get, reverse=True)
                    # Ambiguity! If every entry presents a different character (so the most frequent one has a count of only 1), then it's too ambiguous: we either write a (null or custom) byte to signal it, or fall back to the first file's byte, depending on default_char_null
                    if hist[skeys[0]] == 1:
                        if default_char_null:
                            if default_char_null is True:
                                final_entry.append("\x00")
                            else:
                                final_entry.append(default_char_null)
                        else:
                            # Use the entry of the first file that is still open
                            first_char = ''
                            for entry in entries:
                                # Found the first file that has a character at this position: store it and break loop
                                if i < len(entry):
                                    first_char = entry[i]
                                    break
                            # Use this character in spite of ambiguity
                            final_entry.append(first_char)
                        errors.append(outfile.tell() + i) # remember the position of the ambiguous byte so that it can be reported to the user later
                    # Else if there is a tie (at least two characters appear with the same frequency), then we just pick one of them
                    elif hist[skeys[0]] == hist[skeys[1]]:
                        final_entry.append(chr(int(skeys[0]))) # TODO: find a way to account for both characters. Maybe return two different strings that will both have to be tested? (eg: maybe one has a tampered hash, both will be tested and if one correction pass the hash then it's ok we found the correct one)
                    # Else we have a clear majority character that appears in more entries than any other, so we keep this one
                    else:
                        final_entry.append(chr(int(skeys[0]))) # alternative one-liner: max(hist.iteritems(), key=operator.itemgetter(1))[0]
                    continue
            # Concatenate into a string (this is faster than building up a string and concatenating at each iteration, because Python strings are immutable, so Python would have to copy the whole string over each time: that's O(n^2))
            final_entry = ''.join(final_entry)
            # Commit to output file
            outfile.write(final_entry)
            outfile.flush()

    # Close all input files
    for fh in fileshandles:
        fh.close()
    # Close output file
    if outfile != outpath:  # close only if we were not given a file handle in the first place
        outfile.flush()
        outfile.close()
    # Errors signaling (handles are closed first, so the error path does not leak them)
    if errors:
        error_msg = "Unrecoverable corruptions (because of ambiguity) in file %s on characters: %s." % (relfilepath, [hex(int(x)) for x in errors]) # Signal to the user that this file has unrecoverable corruptions (they may try to fix the bytes manually or with their own script)
        return (1, error_msg) # return an error
    return (0, None)
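
The per-position vote above can be condensed with collections.Counter, as the in-code TODO suggests. The following is a self-contained sketch of the core idea only, assuming equal-length strings; the real function above additionally handles block-wise reading, unequal lengths, ties and the default_char_null option.

from collections import Counter

def majority_vote_sketch(entries):
    '''Byte-wise majority vote over a list of equal-length strings (illustration only).'''
    out = []
    for i in range(len(entries[0])):
        counts = Counter(entry[i] for entry in entries)
        char, freq = counts.most_common(1)[0]
        # If every entry disagrees (top count == 1), fall back to the first entry's
        # byte, mirroring the first-folder preference documented above
        out.append(entries[0][i] if freq == 1 else char)
    return ''.join(out)

# Two intact copies out-vote the corrupted byte of the third one:
# majority_vote_sketch(["hello", "hellp", "hello"]) -> "hello"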
Example #3
def main(argv=None):
    if argv is None: # if argv was not supplied, fetch it from the commandline
        argv = sys.argv[1:]
    elif isinstance(argv, basestring): # else if argv is supplied as a plain string, we need to parse it into a list of arguments before handing it to argparse or any other argument parser
        argv = shlex.split(argv) # parse the string just like argv, using shlex

    #==== COMMANDLINE PARSER ====

    #== Commandline description
    desc = '''Resiliency Tester
Description: Given a directory and a configuration file (containing the commands to execute before and after file tampering), this script will generate a testing tree, where the files will be corrupted randomly and then the supplied repair commands will be executed, and repair stats will be computed at each step (for each stage and repair commands).

The testing process works in stages:
1- Before_tamper stage: Run preparatory commands before tampering (useful to generate ecc/database files).
2- Tamper stage: Tamper the files and/or databases.
3- After_tamper stage: Run after tampering commands, aka preparatory commands before repair stage.
4- Repair stage: Run repair commands, each repair command reusing the files generated (partially repaired) by the previous one. This is the repair chain that you define here.
5- Statistics are generated for each stage.

Note that the original files are never tampered with; we only tamper the copy made inside the test folder.
Also note that the test folder will not be removed at the end, so that you can inspect for yourself the files resulting from each stage, and possibly use other tools to compute additional stats.
    '''
    ep = '''Use --gui as the first argument to use with a GUI (via Gooey).
'''

    #== Commandline arguments
    #-- Constructing the parser
    # Use GooeyParser if we want the GUI because it will provide better widgets
    if len(argv) > 0 and (argv[0] == '--gui' and '--ignore-gooey' not in argv):  # pragma: no cover
        # Initialize the Gooey parser
        main_parser = gooey.GooeyParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter)
        # Define Gooey widget types explicitly (because type auto-detection doesn't work quite well)
        widget_dir = {"widget": "DirChooser"}
        widget_filesave = {"widget": "FileSaver"}
        widget_file = {"widget": "FileChooser"}
        widget_text = {"widget": "TextField"}
        widget_multidir = {"widget": "MultiDirChooser"}
    else: # Else in command-line usage, use the standard argparse
        # Delete the special argument to avoid unrecognized argument error in argparse
        if '--ignore-gooey' in argv: argv.remove('--ignore-gooey') # this argument is automatically fed by Gooey when the user clicks on Start (check the full list, not just argv[0], else a substring match would make remove() raise ValueError)
        # Initialize the normal argparse parser
        main_parser = argparse.ArgumentParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter)
        # Define dummy dicts to keep compatibility with command-line usage
        widget_dir = {}
        widget_filesave = {}
        widget_file = {}
        widget_text = {}
        widget_multidir = {}

    # Required arguments
    main_parser.add_argument('-i', '--input', metavar='"/path/to/original/files/"', type=is_dir_or_file, nargs=1, required=True,
                        help='Specify the path to the directory containing the sample data.', **widget_dir)
    main_parser.add_argument('-o', '--output', metavar='/test/folder/', nargs=1, required=True,
                        help='Path to the test folder that will be created to store temporary test files.', **widget_dir)
    main_parser.add_argument('-c', '--config', metavar='/some/folder/config.txt', type=str, nargs=1, required=True, #type=argparse.FileType('rt')
                        help='Path to the configuration file (containing the commands to execute, Makefile format). Possible entries: before_tamper, tamper, after_tamper, repair. Note that you can use a few special tags to trigger string interpolation: {inputdir}, {dbdir}, {outputdir}.', **widget_file)

    # Optional arguments
    main_parser.add_argument('-p', '--parallel', action='store_true', required=False,
                        help='If true, repair commands will be run on the tampered files, not on the previous repair results. Useful if you want to try different strategies/commands/programs. By default, false, thus the repair commands will take advantage of the results of previous repair commands.')
    main_parser.add_argument('-m', '--multiple', metavar=1, type=int, default=1, required=False,
                        help='Run the resiliency test multiple times, and average the stats.', **widget_text)

    # Optional general arguments
    main_parser.add_argument('-l', '--log', metavar='/some/folder/filename.log', type=str, nargs=1, required=False,
                        help='Path to the log file. (Output will be piped to both stdout and the log file)', **widget_filesave)
    main_parser.add_argument('-f', '--force', action='store_true', required=False, default=False,
                        help='Force overwriting the output folder even if it already exists.')
    main_parser.add_argument('-v', '--verbose', action='store_true', required=False, default=False,
                        help='Verbose mode (show more output).')
    main_parser.add_argument('--silent', action='store_true', required=False, default=False,
                        help='No console output (but if --log specified, the log will still be saved in the specified file).')

    #== Parsing the arguments
    args = main_parser.parse_args(argv) # Storing all arguments to args

    #-- Set variables from arguments
    origpath = fullpath(args.input[0]) # path to the input directory (where the original, sample data is)
    outputpath = fullpath(args.output[0])
    configfile = fullpath(args.config[0])
    parallel = args.parallel
    multiple = args.multiple
    force = args.force
    verbose = args.verbose
    silent = args.silent

    #if os.path.isfile(inputpath): # if inputpath is a single file (instead of a folder), then define the rootfolderpath as the parent directory (for correct relative path generation, else it will also truncate the filename!)
        #rootfolderpath = os.path.dirname(inputpath)

    # -- Checking arguments
    if not os.path.isdir(origpath):
        raise NameError("Input path needs to be a directory!")

    if not os.path.exists(configfile):
        raise NameError("Please provide a configuration file in order to run a test!")
    else:
        commands = parse_configfile(configfile)

    if os.path.exists(outputpath) and not force:
        raise NameError("Specified test folder (output path) %s already exists! Use --force to overwrite this directory." % outputpath)
    else:
        remove_if_exist(outputpath)

    if multiple < 1:
        multiple = 1

    # -- Configure the log file if enabled (ptee.write() will write to both stdout/console and to the log file)
    if args.log:
        ptee = Tee(args.log[0], 'a', nostdout=silent)
        sys.stderr = Tee(args.log[0], 'a', nostdout=silent)
    else:
        ptee = Tee(nostdout=silent)

    # == PROCESSING BRANCHING == #

    # == Main branch
    ptee.write("====================================")
    ptee.write("Resiliency tester, started on %s" % datetime.datetime.now().isoformat())
    ptee.write("====================================")
    
    ptee.write("Testing folder %s into test folder %s for %i run(s)." % (origpath, outputpath, multiple))

    fstats = {}
    for m in xrange(multiple):
        run_nb = m + 1

        ptee.write("===== Resiliency tester: starting run %i =====" % run_nb)

        # -- Define directories tree for this test run
        # testpath is the basepath for the current run
        # Generate a specific subdirectory for the current run
        testpath = os.path.join(outputpath, "run%i" % run_nb)
        dbdir = fullpath(os.path.join(testpath, "db"))
        origdbdir = fullpath(os.path.join(testpath, "origdb"))
        tamperdir = fullpath(os.path.join(testpath, "tampered"))
        repairdir = fullpath(os.path.join(testpath, "repair"))

        # == START TEST RUN
        # Create test folder
        create_dir_if_not_exist(testpath)

        # Before tampering
        ptee.write("=== BEFORE TAMPERING ===")
        create_dir_if_not_exist(dbdir)
        for i, cmd in enumerate(commands["before_tamper"]):
            scmd = interpolate_dict(cmd, interp_args={"inputdir": origpath, "dbdir": dbdir})
            ptee.write("Executing command: %s" % scmd)
            execute_command(scmd, ptee=ptee)
        copy_any(dbdir, origdbdir) # make a copy because we may tamper the db files

        # Tampering
        ptee.write("=== TAMPERING ===")
        copy_any(origpath, tamperdir)
        for i, cmd in enumerate(commands["tamper"]):
            scmd = interpolate_dict(cmd, interp_args={"inputdir": tamperdir, "dbdir": dbdir})
            ptee.write("- RTEST: Executing command: %s" % scmd)
            execute_command(scmd, ptee=ptee)

        # After tampering
        ptee.write("=== AFTER TAMPERING ===")
        for i, cmd in enumerate(commands["after_tamper"]):
            scmd = interpolate_dict(cmd, interp_args={"inputdir": tamperdir, "dbdir": dbdir})
            ptee.write("- RTEST: Executing command: %s" % scmd)
            execute_command(scmd, ptee=ptee)

        # Repairing
        ptee.write("=== REPAIRING ===")
        indir = tamperdir
        finalrepairdir = ''
        for i, cmd in enumerate(commands["repair"]):
            outdir = "%s%i" % (repairdir, i)
            scmd = interpolate_dict(cmd, interp_args={"inputdir": indir, "dbdir": dbdir, "outputdir": outdir})
            ptee.write("- RTEST: Executing command: %s" % scmd)
            create_dir_if_not_exist(outdir)
            execute_command(scmd, ptee=ptee)
            copy_any(indir, outdir, only_missing=True) # copy the files that did not need any repair (or could not be repaired at all!)
            finalrepairdir = outdir
            # If parallel, do not reuse the previous repair resulting files, repair from the tampered files directly
            if not parallel: indir = outdir

        # Stats
        stats = compute_all_diff_stats(commands, origpath, tamperdir, repairdir, finalrepairdir)
        ptee.write("========== Resiliency tester results for run %i ==========" % run_nb)
        for key, stat in stats.iteritems():
            ptee.write("=> Stage: %s" % key)
            ptee.write(pretty_print_stats(stat))

        if run_nb == 1:
            fstats = stats
        else:
            fstats = stats_running_average(fstats, stats, run_nb-1)

    ptee.write("============================")
    ptee.write("RESILIENCY TESTER FINAL AVERAGED RESULTS OVER %i RUNS" % multiple)
    ptee.write("============================")
    for key, stat in fstats.iteritems():
        ptee.write("=> Stage: %s" % key)
        ptee.write(pretty_print_stats(stat))

    # Shutting down
    del ptee
    # Did the last run completely repair all the files? If so, return OK (0)
    if stats["final"]["error"] == 0:
        return 0
    else:
        return 1
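
For reference, a configuration file consumed by parse_configfile() might look like the sketch below. The stage names (before_tamper, tamper, after_tamper, repair) and the interpolation tags ({inputdir}, {dbdir}, {outputdir}) come from the help text above, but the exact Makefile-like syntax and all the command names are assumptions for illustration only:

# config.txt -- hypothetical example, the actual syntax depends on parse_configfile()
before_tamper:
	python make_db.py -i "{inputdir}" -d "{dbdir}"
tamper:
	python tamper_files.py -i "{inputdir}"
after_tamper:
	python update_db.py -i "{inputdir}" -d "{dbdir}"
repair:
	python repair_files.py -i "{inputdir}" -d "{dbdir}" -o "{outputdir}"

With such a file, a test averaging the stats over 3 runs could then be launched with something like: python resiliency_tester.py -i samples/ -o testdir/ -c config.txt -m 3 (the script filename here is also an assumption).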