Example #1
def main(argv=None):
    if argv is None: # if argv is empty, fetch from the commandline
        argv = sys.argv[1:]
    elif isinstance(argv, basestring): # else if argv is supplied but it's a simple string, we need to parse it to a list of arguments before handing to argparse or any other argument parser
        argv = shlex.split(argv) # Parse string just like argv using shlex
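    # Note: this means main() can be called programmatically either with a single shell-like string (e.g. main('-i "copy1" "copy2" "copy3" -o "out"')) or with an already-split list of arguments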

    #==== COMMANDLINE PARSER ====

    #== Commandline description
    desc = '''Replication Repair
Description: Given a set of directories (or files), try to repair your files by scanning each byte, casting a majority vote among all copies, and then outputting the winning byte. This process is usually called triple-modular redundancy (but here it should be called n-modular redundancy since you can use as many copies as you have).
For long-term storage, it is recommended to store several copies of your files on different storage media. Everything's fine until all your copies are partially corrupted. In this case, this script can help you by taking advantage of your multiple copies, without requiring a pregenerated ecc file. Just specify the path to every copy, and the script will try to recover them.
Replication can repair exactly r-2 errors using majority vote (you need at least 2 blocks for majority vote to work), where r is the number of replications: if r=3, you get a redundancy rate of 1/3, if r=4, rate is 2/4, etc.
This script can also take advantage of a database generated by rfigc.py to make sure that the recovered files are correct, or to select files that are already correct.

Note: in case the end result is not what you expected, you can try a different order of input directories: in case of ambiguity, the first input folder has precedence over subsequent folders.
Note2: in case some files with the same names are of different length, the merging will continue until the longest file is exhausted.
Note3: last modification date is not (yet) accounted for.
    '''
    ep = '''Use --gui as the first argument to use with a GUI (via Gooey).
'''

    #== Commandline arguments
    #-- Constructing the parser
    # Use GooeyParser if we want the GUI because it will provide better widgets
    if len(argv) > 0 and (argv[0] == '--gui' and not '--ignore-gooey' in argv):  # pragma: no cover
        # Initialize the Gooey parser
        main_parser = gooey.GooeyParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter)
        # Define Gooey widget types explicitly (because type auto-detection doesn't work quite well)
        widget_dir = {"widget": "DirChooser"}
        widget_filesave = {"widget": "FileSaver"}
        widget_file = {"widget": "FileChooser"}
        widget_text = {"widget": "TextField"}
        widget_multidir = {"widget": "MultiDirChooser"}
    else: # Else in command-line usage, use the standard argparse
        # Delete the special argument to avoid unrecognized argument error in argparse
        if '--ignore-gooey' in argv: argv.remove('--ignore-gooey') # this argument is automatically fed by Gooey when the user clicks on Start
        # Initialize the normal argparse parser
        main_parser = argparse.ArgumentParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter)
        # Define dummy dicts to keep compatibility with command-line usage
        widget_dir = {}
        widget_filesave = {}
        widget_file = {}
        widget_text = {}
        widget_multidir = {}

    # Required arguments
    main_parser.add_argument('-i', '--input', metavar='"/path/to/copy1/" "/path/to/copy2/" "etc."', type=is_dir_or_file, nargs='+', required=True,
                        help='Specify the path to every copy you have (minimum 3 copies, else it won\'t work!). Can be folders or files (if you want to repair only one file). Order matters: in case of ambiguity, the first folder where the file exists will be chosen.', **widget_multidir)
    main_parser.add_argument('-o', '--output', metavar='/output/folder/', nargs=1, required=True,
                        help='Where the recovered files will be stored.', **widget_dir)

    # Optional general arguments
    main_parser.add_argument('-d', '--database', metavar='database.csv', type=is_file, required=False,
                        help='Path to a previously generated rfigc.py database. If provided, this will be used to check that the repaired files are correct (and also to find already correct files in copies).', **widget_file)
    main_parser.add_argument('-r', '--report', metavar='/some/folder/report.csv', type=str, required=False,
                        help='Save all results of the repair process in a report file, with detailed descriptions of ambiguous repairs (ie, when majority vote came to a draw).', **widget_filesave)
    main_parser.add_argument('-l', '--log', metavar='/some/folder/filename.log', type=str, nargs=1, required=False,
                        help='Path to the log file. (Output will be piped to both the stdout and the log file)', **widget_filesave)
    main_parser.add_argument('-f', '--force', action='store_true', required=False, default=False,
                        help='Force overwriting the output folder even if it already exists.')
    main_parser.add_argument('-v', '--verbose', action='store_true', required=False, default=False,
                        help='Verbose mode (show more output).')
    main_parser.add_argument('--silent', action='store_true', required=False, default=False,
                        help='No console output (but if --log specified, the log will still be saved in the specified file).')

    #== Parsing the arguments
    args = main_parser.parse_args(argv) # Storing all arguments to args

    #-- Set variables from arguments
    inputpaths = [fullpath(x) for x in args.input] # path to the files to repair (ie, paths to all the different copies the user has)
    outputpath = fullpath(args.output[0])
    force = args.force
    verbose = args.verbose
    silent = args.silent

    if len(inputpaths) < 3:
        raise Exception('Need at least 3 copies to do a replication repair/majority vote!')

    #if os.path.isfile(inputpath): # if inputpath is a single file (instead of a folder), then define the rootfolderpath as the parent directory (for correct relative path generation, else it will also truncate the filename!)
        #rootfolderpath = os.path.dirname(inputpath)

    report_file = None
    if args.report: report_file = os.path.basename(fullpath(args.report))
    database = None
    if args.database: database = args.database

    # -- Checking arguments
    if os.path.exists(outputpath) and not force:
        raise NameError('Specified output path %s already exists! Use --force if you want to overwrite.' % outputpath)

    if database and not os.path.isfile(database):
        raise NameError('Specified rfigc database file %s does not exist!' % database)

    # -- Configure the log file if enabled (ptee.write() will write to both stdout/console and to the log file)
    if args.log:
        ptee = Tee(args.log[0], 'a', nostdout=silent)
        #sys.stdout = Tee(args.log[0], 'a')
        sys.stderr = Tee(args.log[0], 'a', nostdout=silent)
    else:
        ptee = Tee(nostdout=silent)


    # == PROCESSING BRANCHING == #

    # == Precomputation of the files list and total size
    # Precomputing is important so that the user can know how many files and how much data will be processed before starting (and so that a progress bar with an ETA can be shown).
    filescount = 0
    sizetotal = 0
    sizeheaders = 0
    visitedfiles = {}
    ptee.write("Precomputing list of files and predicted statistics...")
    prebar = tqdm.tqdm(file=ptee, disable=silent)
    for inputpath in inputpaths:
        for (dirpath, filename) in recwalk(inputpath):
            # Get full absolute filepath
            filepath = os.path.join(dirpath, filename)
            relfilepath = path2unix(os.path.relpath(filepath, inputpath)) # File relative path from the root (we truncate the rootfolderpath so that we can easily check the files later even if the absolute path is different)

            # Only increase the files count if we didn't see this file before
            if not visitedfiles.get(relfilepath, None):
                # Counting the total number of files we will process (so that we can show a progress bar with ETA)
                filescount = filescount + 1
                # Add the file to the list of already visited files
                visitedfiles[relfilepath] = True
                # Get the current file's size
                size = os.stat(filepath).st_size
                # Compute total size of all files
                sizetotal = sizetotal + size
            prebar.update()
    prebar.close()
    ptee.write("Precomputing done.")

    # == Majority vote repair
    # For each folder, align the files lists and then majority vote over each byte to repair
    ptee.write("====================================")
    ptee.write("Replication repair, started on %s" % datetime.datetime.now().isoformat())
    ptee.write("====================================")

    # Prepare progress bar if necessary
    if silent:
        tqdm_bar = None
    else:
        tqdm_bar = tqdm.tqdm(total=filescount, file=ptee, leave=True, unit="files")
    # Call the main function to synchronize files using majority vote
    errcode = synchronize_files(inputpaths, outputpath, database=database, tqdm_bar=tqdm_bar, report_file=report_file, ptee=ptee, verbose=verbose)
    #ptee.write("All done! Stats:\n- Total files processed: %i\n- Total files corrupted: %i\n- Total files repaired completely: %i\n- Total files repaired partially: %i\n- Total files corrupted but not repaired at all: %i\n- Total files skipped: %i" % (files_count, files_corrupted, files_repaired_completely, files_repaired_partially, files_corrupted - (files_repaired_partially + files_repaired_completely), files_skipped) )
    if tqdm_bar: tqdm_bar.close()
    ptee.write("All done!")
    if report_file: ptee.write("Saved replication repair results in report file: %s" % report_file)
    del ptee
    return errcode
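
The description above boils down to a byte-wise majority vote across copies, with the first copy winning ties. As a rough illustration of that idea only (a hypothetical helper, not the script's actual synchronize_files() routine, whose signature is merely inferred from the call above):

from collections import Counter

def majority_vote_bytes(copies):
    # Hypothetical helper, only to illustrate the byte-wise majority vote
    # described above; it is NOT the script's synchronize_files() function.
    # `copies` is a list of byte strings (the same file read from each copy);
    # ties go to the earliest copy, matching the precedence rule in the help text.
    longest = max(len(c) for c in copies)
    out = bytearray()
    for i in range(longest):
        # Gather the byte at position i from every copy that is long enough
        candidates = [c[i:i+1] for c in copies if i < len(c)]
        counts = Counter(candidates)
        best = max(counts.values())
        for b in candidates:  # first copy wins on a draw
            if counts[b] == best:
                out.extend(b)
                break
    return bytes(out)

# e.g. majority_vote_bytes([b"hello", b"hexlo", b"hello"]) == b"hello"
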
Example #2
def main(argv=None):
    if argv is None: # if argv is empty, fetch from the commandline
        argv = sys.argv[1:]
    elif isinstance(argv, basestring): # else if argv is supplied but it's a simple string, we need to parse it to a list of arguments before handing to argparse or any other argument parser
        argv = shlex.split(argv) # Parse string just like argv using shlex

    #==== COMMANDLINE PARSER ====

    #== Commandline description
    desc = '''ECC file repairer
Description: Repair the structure of an ecc file, mainly the ecc markers, so that the ecc correction can at least correctly align the ecc entries and fields.
Note: An ecc structure repair does NOT allow you to recover from more errors in your files; it only repairs the ecc file so that its structure is valid and can be read correctly.
    '''
    ep = ''' '''

    #== Commandline arguments
    #-- Constructing the parser
    # Use GooeyParser if we want the GUI because it will provide better widgets
    if len(argv) > 0 and (argv[0] == '--gui' and not '--ignore-gooey' in argv):  # pragma: no cover
        # Initialize the Gooey parser
        main_parser = gooey.GooeyParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter)
        # Define Gooey widget types explicitly (because type auto-detection doesn't work quite well)
        widget_dir = {"widget": "DirChooser"}
        widget_filesave = {"widget": "FileSaver"}
        widget_file = {"widget": "FileChooser"}
        widget_text = {"widget": "TextField"}
    else: # Else in command-line usage, use the standard argparse
        # Delete the special argument to avoid unrecognized argument error in argparse
        if '--ignore-gooey' in argv: argv.remove('--ignore-gooey') # this argument is automatically fed by Gooey when the user clicks on Start
        # Initialize the normal argparse parser
        main_parser = argparse.ArgumentParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter)
        # Define dummy dicts to keep compatibility with command-line usage
        widget_dir = {}
        widget_filesave = {}
        widget_file = {}
        widget_text = {}
    # Required arguments
    main_parser.add_argument('-i', '--input', metavar='eccfile.txt', type=str, required=True,
                        help='Path to the ecc file to repair.', **widget_file)
    main_parser.add_argument('-o', '--output', metavar='eccfile_repaired.txt', type=str, required=True, #type=argparse.FileType('rt')
                        help='Output path where to save the repaired ecc file.', **widget_filesave)
    main_parser.add_argument('-t', '--threshold', type=float, default=0.3, required=False,
                        help='Distance threshold for the heuristic Hamming distance repair. This must be a float; e.g., 0.2 means that if at most 20%% of the characters differ between an ecc marker and a substring in the ecc file, it will be detected as a marker and corrected.', **widget_text)

    # Optional general arguments
    main_parser.add_argument('--index', metavar='eccfile.txt.idx', type=str, required=False,
                        help='Path to the index backup file corresponding to the ecc file (optional but helps a lot).', **widget_file)
    main_parser.add_argument('--ecc_algo', type=int, default=1, required=False,
                        help='Which algorithm to use to generate and verify the ECC? Possible values: 1-4. 1 is the formal, fully verified Reed-Solomon in base 3 ; 2 is a faster implementation but still based on the formal base 3 ; 3 is an even faster implementation but based on another library which may not be correct ; 4 is the fastest implementation, supporting the US FAA ADSB UAT RS FEC standard, but it is totally incompatible with the other three (text encoded with any of modes 1-3 can be decoded by any of those three, but not by mode 4).', **widget_text)
    main_parser.add_argument('-l', '--log', metavar='/some/folder/filename.log', type=str, required=False,
                        help='Path to the log file. (Output will be piped to both the stdout and the log file)', **widget_filesave)
    main_parser.add_argument('-v', '--verbose', action='store_true', required=False, default=False,
                        help='Verbose mode (show more output).')
    main_parser.add_argument('--silent', action='store_true', required=False, default=False,
                        help='No console output (but if --log specified, the log will still be saved in the specified file).')

    main_parser.add_argument('-f', '--force', action='store_true', required=False, default=False,
                        help='Force overwriting the repaired ecc file even if it already exists.')


    #== Parsing the arguments
    args = main_parser.parse_args(argv) # Storing all arguments to args

    #-- Set hard-coded variables
    entrymarker = "\xFE\xFF\xFE\xFF\xFE\xFF\xFE\xFF\xFE\xFF" # marker that will signal the beginning of an ecc entry - use an alternating pattern of several characters, this avoids confusion (eg: if you use "AAA" as a pattern, and the ecc block of the previous file ends with "EGA", then the full string will be "EGAAAAC:\yourfolder\filea.jpg" and the entry reader will detect the first "AAA" occurrence as the entry start - this should not break the next entry because there is an automatic trim - but the previous ecc block will miss one character that could be used to repair the block, because it will be "EG" instead of "EGA"!)
    field_delim = "\xFA\xFF\xFA\xFF\xFA" # delimiter between fields (filepath, filesize, hash+ecc blocks) inside an ecc entry
    markers = [entrymarker, field_delim] # put them in a list for easy reference
    max_block_size = 27
    resilience_rate = 1

    #-- Set variables from arguments
    inputpath = fullpath(args.input)
    outputpath = fullpath(args.output)
    distance_threshold = args.threshold
    indexpath = None
    if args.index: indexpath = fullpath(args.index)
    force = args.force
    ecc_algo = args.ecc_algo
    verbose = args.verbose
    silent = args.silent

    # -- Checking arguments
    if not os.path.isfile(inputpath):
        raise NameError('Specified database ecc file %s does not exist!' % inputpath)
    if os.path.isfile(outputpath) and not force:
        raise NameError('Specified output path for the repaired ecc file %s already exists! Use --force if you want to overwrite.' % outputpath)
    if indexpath and not os.path.isfile(indexpath):
        raise NameError('Specified index backup file %s does not exist!' % indexpath)

    if max_block_size < 2 or max_block_size > 255:
        raise ValueError('RS max block size must be between 2 and 255.')

    # -- Configure the log file if enabled (ptee.write() will write to both stdout/console and to the log file)
    if args.log:
        ptee = Tee(args.log, 'a', nostdout=silent)
        #sys.stdout = Tee(args.log, 'a')
        sys.stderr = Tee(args.log, 'a', nostdout=silent)
    else:
        ptee = Tee(nostdout=silent)


    # == PROCESSING BRANCHING == #

    # Precompute some parameters and load up ecc manager objects (big optimization as g_exp and g_log tables calculation is done only once)
    hasher_none = Hasher('none') # for index ecc we don't use any hash
    ecc_params_idx = compute_ecc_params(max_block_size, resilience_rate, hasher_none)
    ecc_manager_idx = ECCMan(max_block_size, ecc_params_idx["message_size"], algo=ecc_algo)

    # == Main loop
    ptee.write("====================================")
    ptee.write("ECC repair, started on %s" % datetime.datetime.now().isoformat())
    ptee.write("====================================")
    ptee.write("Please note that this tool may not know if it found all the markers, so it may miss too much corrupted markers but it will repair the ones it finds (except if you have a fully valid index file, then you are guaranteed to always find all markers).")

    ecc_size = os.stat(inputpath).st_size
    if indexpath: idx_size = os.stat(indexpath).st_size
    shutil.copy2(inputpath, outputpath)
    blocksize = 65535
    with open(outputpath, 'r+b') as db:

        # == Index backup repair
        # This repair needs an index backup file which is normally generated at the same time as the ecc file. The index backup file is a file that stores the position of all ecc markers in the corresponding ecc file, and protects those positions using ecc.
        if indexpath:
            ptee.write("Using the index backup file %s to repair ecc markers, please wait..." % args.index)
            db.seek(0) # seek to the beginning of the database file
            idx_corrupted = 0
            idx_corrected = 0
            idx_total = 0
            markers_repaired = [0] * len(markers)
            bardisp = tqdm.tqdm(total=idx_size, file=ptee, leave=True, desc='IDXREAD', unit='B', unit_scale=True) # display progress bar based on reading the database file (since we don't know how many files we will process beforehand nor how many total entries we have)
            with open(indexpath, 'rb') as dbidx:
                buf = 1
                while buf:
                    # The format of the index backup file is pretty simple: for each entrymarker or field_delim, a block is appended. Each such block is made of: the type on one byte (1 for entrymarker, 2 for field_delim), then the marker's position in the ecc file encoded as an unsigned long long (thus on a fixed 8 bytes), and finally an ecc covering both the type and the marker's position, which is of fixed size (since we know that the marker's type + position = 9 bytes). Each such block is appended right after the previous one, so we can easily read them, and such a structure cannot be tampered with by a soft error (there's no way a hard drive failure can change the structure of the data, but a malicious user can! It's then easy for a human user to fix, since you can clearly see the patterns where the markers' positions begin and end).
                    # Note that this constant sized structure of blocks is made on purpose, so that the structure of the index backup file is implicit, while the structure of the ecc file is explicit (it needs uncorrupted markers, which is a weak point that we try to address with the index backup file).
                    # eg of two blocks: 10000008Aecceccecc2000000F2ecceccecc
                    #
                    # Read one index block
                    curpos = dbidx.tell() # backup current position for error messages
                    buf = dbidx.read(max_block_size)
                    # Update progress bar
                    bardisp.update(dbidx.tell()-bardisp.n)
                    # If we have reached EOF, then we stop here
                    if not buf: break

                    # Else it's ok we have an index block, we process it
                    idx_total += 1
                    # Extract the marker's infos and the ecc
                    marker_str = buf[:ecc_params_idx["message_size"]]
                    ecc = buf[ecc_params_idx["message_size"]:]
                    # Check if the marker's infos are corrupted, if yes, then we will try to fix that using the ecc
                    if not ecc_manager_idx.check(marker_str, ecc):
                        # Trying to fix the marker's infos using the ecc
                        idx_corrupted += 1
                        marker_repaired, repaired_ecc = ecc_manager_idx.decode(marker_str, ecc)
                        # Repaired the marker's infos, all is good!
                        if ecc_manager_idx.check(marker_repaired, repaired_ecc):
                            marker_str = marker_repaired
                            idx_corrected += 1
                        # Else it's corrupted beyond repair, just skip
                        else:
                            ptee.write("\n- Index backup file: error on block starting at %i, corrupted and could not fix it. Skipping." % curpos)
                            marker_str = None
                            continue
                    if not marker_str: continue

                    # Repair ecc file's marker using our correct (or repaired) marker's infos
                    marker_type = int(marker_str[0]) # marker's type is always stored on the first byte/character
                    marker_pos = struct.unpack('>Q', marker_str[1:]) # marker's position is encoded as a big-endian unsigned long long, in a 8 bytes/chars string
                    db.seek(marker_pos[0]) # move the ecc reading cursor to the beginning of the marker
                    current_marker = db.read(len(markers[marker_type-1])) # read the current marker (potentially corrupted)
                    db.seek(marker_pos[0])
                    if verbose:
                        print "- Found marker by index file: type=%i content=" % (marker_type)
                        print db.read(len(markers[marker_type-1])+4)
                        db.seek(marker_pos[0]) # replace the reading cursor back in place before the marker
                    if current_marker != markers[marker_type-1]: # check if we really need to repair this marker
                        # Rewrite the marker over the ecc file
                        db.write(markers[marker_type-1])
                        markers_repaired[marker_type-1] += 1
                    else:
                        print "skipped, no need to repair"
            # Done the index backup repair
            if bardisp.n > bardisp.total: bardisp.n = bardisp.total # just a workaround in case there's one byte more than the predicted total
            bardisp.close()
            ptee.write("Done. Total: %i/%i markers repaired (%i entrymarkers and %i field_delim), %i indexes corrupted and %i indexes repaired (%i indexes lost).\n" % (markers_repaired[0]+markers_repaired[1], idx_total, markers_repaired[0], markers_repaired[1], idx_corrupted, idx_corrected, idx_corrupted-idx_corrected) )

        # == Heuristic greedy Hamming distance repair
        # This is a heuristic (it doesn't need any file other than the ecc file) 2-pass algorithm: the first pass tries to find the markers' positions, and then the second pass simply reads the original ecc file and copies it while repairing the found markers.
        # The first pass is obviously the most interesting, here's a description: we use a kind of greedy algorithm but with backtracking, meaning that we simply read through all the substrings sequentially, compare each with the markers and compute the Hamming distance: if the Hamming distance gets below the threshold, we trigger the found marker flag. Then if the Hamming distance decreases, we save this marker position and disable the found marker flag. However, there can be false positives (eg, the marker is corrupted in the middle), so we have a backtracking mechanism: if a later substring is found to have a Hamming distance below the threshold, we check whether the just previously found marker is in range (ie, the distance between the new position and the previous one is smaller than the marker's length) and, if the new Hamming distance is smaller, we replace the previous marker with the new marker's position, because the previous one was most likely a false positive.
        # This method doesn't require any file other than the ecc file, but it may not work on ecc markers that are too heavily tampered with, and if the detection threshold is too low or the markers are too small, there may be lots of false positives.
        # So try to use long markers (consisting of many characters, preferably an alternating pattern different from the null byte \x00) and a high enough detection threshold.
        ptee.write("Using heuristics (Hamming distance) to fix markers with a threshold of %i%%, please wait..." % (round(distance_threshold*100, 0)) )

        # Main loop for heuristical repair, try to find the substrings that minimize the hamming distance to one of the ecc markers
        markers_repaired = [0] * len(markers) # stat counter
        already_valid = 0 # stat counter
        db.seek(0) # seek to the beginning of the database file
        buf = 1 # init the buffer to 1 to initiate the while loop
        markers_pos = [[] for i in xrange(len(markers))] # will contain the list of positions where a corrupted marker has been detected (not valid markers, they will be skipped)
        distance_thresholds = [round(len(x)*distance_threshold, 0) for x in markers] # calculate the number of characters maximum for distance
        skip_until = -1 # when a valid marker (non corrupted) is found, we use this variable to skip to after the marker length (to avoid detecting partial parts of this marker, which will have a hamming distance even if the marker is completely valid because the reading window will be after the beginning of the marker)
        bardisp = tqdm.tqdm(total=ecc_size, file=ptee, leave=True, desc='DBREAD', unit='B', unit_scale=True) # display progress bar based on reading the database file (since we don't know how many files we will process beforehand nor how many total entries we have)
        while buf: # until we have walked through the whole ecc file
            # Read a part of the ecc file into a buffer, this allows to process more quickly than just loading the size of a marker
            curpos = db.tell() # keep the current reading position
            buf = db.read(blocksize)
            # Update progress bar
            bardisp.update(db.tell()-bardisp.n)
            if not buf: break # reached EOF? quitting here

            # Scan the buffer, by splitting the buffer into substrings the length of the ecc markers
            for i in xrange(len(buf)-max(len(entrymarker),len(field_delim))):
                # If we just came across a non-corrupted ecc marker, we skip until we are after this ecc marker (to avoid misdetections)
                if i < skip_until: continue
                # Compare each ecc marker type to this substring and compute the Hamming distance
                for m in xrange(len(markers)):
                    d = hamming(buf[i:i+len(markers[m])], markers[m]) # Compute the Hamming distance (simply the number of different characters)
                    mcurpos = curpos+i # current absolute position of this ecc marker
                    
                    # If there's no difference, then it's a valid, non-corrupted ecc marker
                    if d == 0:
                        already_valid += 1 # stats...
                        # If we previously wrongly detected a corrupted ecc marker near here, then it's probably a misdetection (because we just had a partial view on this marker until now), thus we just remove it from our list of markers to repair
                        if len(markers_pos[m]) > 0 and (mcurpos - markers_pos[m][-1][0]) <= len(markers[m]): # to detect that, we just check if the latest marker to repair is near the current marker (if its position is at maximum the length of the marker). This works because in the other condition below, we update the latest marker to repair if we find another one with a lower hamming distance very near.
                            del markers_pos[m][-1]
                        # Skip scanning until we are after the current marker to avoid misdetections
                        su = i+len(markers[m])
                        if su > skip_until: skip_until = su # update with the biggest marker (because both markers can be detected here if the pattern is similar)
                        break
                    # Else there's a difference/distance but it's below the threshold: we have a corrupted marker!
                    elif d > 0 and d <= distance_thresholds[m]:
                        # Updating case: If the latest marker to repair is quite close to the current one, but the current detection has a lower distance, we are probably detecting the same marker but we are better positioned now, so we update the previous marker's position with this one.
                        if len(markers_pos[m]) > 0 and (mcurpos - markers_pos[m][-1][0]) <= len(markers[m]):
                            if d < markers_pos[m][-1][1]: # Update only if the distance is less
                                markers_pos[m][-1] = [mcurpos, d]
                            else: # Else, we probably are detecting the same marker as the last detected one, but since our scanning window has moved forward, we have increased the distance. Just skip it, we should not repair at this position (else we will probably be overwriting over real content).
                                continue
                        # Adding case: Else we just add this marker as a new one to repair by appending to the list
                        else:
                            markers_pos[m].append([mcurpos, d])
                    # Else the distance is too great for the threshold, it's not a marker at all, we go on to the next substring
            if db.tell() < ecc_size: db.seek(db.tell()-max(len(entrymarker),len(field_delim)))
        if bardisp.n > bardisp.total: bardisp.n = bardisp.total # just a workaround in case there's one byte more than the predicted total
        bardisp.close()

        # Committing the repair into the ecc file
        for m in xrange(len(markers)): # for each type of markers
            marker = markers[m]
            if len(markers_pos[m]) > 0: # If there is any detected marker to repair for this type
                for pos in markers_pos[m]: # for each detected marker to repair, we rewrite it over into the file at the detected position
                    if verbose: ptee.write("- Detected marker type %i at position %i with distance %i (%i%%): repairing." % (m+1, pos[0], pos[1], (float(pos[1])/len(markers[m]))*100) )
                    db.seek(pos[0])
                    db.write(marker)

        #print(markers_pos)
        ptee.write("Done. Hamming heuristic with threshold %i%% repaired %i entrymarkers and %i field_delim (%i total) and %i were already valid.\n" % (round(distance_threshold*100, 0), len(markers_pos[0]), len(markers_pos[1]), len(markers_pos[0])+len(markers_pos[1]), already_valid) )
        del ptee
        return 0
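
For reference, the index backup blocks read in the loop above are fixed-size records: one type character ('1' for entrymarker, '2' for field_delim), an 8-byte big-endian position, then an ecc over those 9 bytes. Below is a minimal sketch of packing and parsing such a block, assuming Python 2 byte strings as in the example and using ecc_encode as a hypothetical stand-in for the real ecc encoder:

import struct

def pack_index_block(marker_type, marker_pos, ecc_encode):
    # Hypothetical writer-side sketch: 1 type char + 8 bytes position = 9-byte
    # message, followed by its ecc (ecc_encode is an assumed helper, not ECCMan).
    message = str(marker_type) + struct.pack('>Q', marker_pos)
    return message + ecc_encode(message)

def parse_index_block(buf, message_size=9):
    # Mirror of the reading code above: split the block into message and ecc,
    # then decode the type (first char) and the big-endian position.
    marker_str = buf[:message_size]
    ecc = buf[message_size:]
    marker_type = int(marker_str[0])
    marker_pos = struct.unpack('>Q', marker_str[1:message_size])[0]
    return marker_type, marker_pos, ecc
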
Example #3
def main(argv=None):
    if argv is None:  # if argv is empty, fetch from the commandline
        argv = sys.argv[1:]
    elif isinstance(
            argv, basestring
    ):  # else if argv is supplied but it's a simple string, we need to parse it to a list of arguments before handing to argparse or any other argument parser
        argv = shlex.split(argv)  # Parse string just like argv using shlex

    #==== COMMANDLINE PARSER ====

    #== Commandline description
    desc = '''ECC file repairer
Description: Repair the structure of an ecc file, mainly the ecc markers, so that the ecc correction can at least correctly align the ecc entries and fields.
Note: An ecc structure repair does NOT allow you to recover from more errors in your files; it only repairs the ecc file so that its structure is valid and can be read correctly.
    '''
    ep = ''' '''

    #== Commandline arguments
    #-- Constructing the parser
    # Use GooeyParser if we want the GUI because it will provide better widgets
    if len(argv) > 0 and (argv[0] == '--gui' and
                          not '--ignore-gooey' in argv):  # pragma: no cover
        # Initialize the Gooey parser
        main_parser = gooey.GooeyParser(
            add_help=True,
            description=desc,
            epilog=ep,
            formatter_class=argparse.RawTextHelpFormatter)
        # Define Gooey widget types explicitly (because type auto-detection doesn't work quite well)
        widget_dir = {"widget": "DirChooser"}
        widget_filesave = {"widget": "FileSaver"}
        widget_file = {"widget": "FileChooser"}
        widget_text = {"widget": "TextField"}
    else:  # Else in command-line usage, use the standard argparse
        # Delete the special argument to avoid unrecognized argument error in argparse
        if '--ignore-gooey' in argv:
            argv.remove(
                '--ignore-gooey'
            )  # this argument is automatically fed by Gooey when the user clicks on Start
        # Initialize the normal argparse parser
        main_parser = argparse.ArgumentParser(
            add_help=True,
            description=desc,
            epilog=ep,
            formatter_class=argparse.RawTextHelpFormatter)
        # Define dummy dicts to keep compatibility with command-line usage
        widget_dir = {}
        widget_filesave = {}
        widget_file = {}
        widget_text = {}
    # Required arguments
    main_parser.add_argument('-i',
                             '--input',
                             metavar='eccfile.txt',
                             type=str,
                             required=True,
                             help='Path to the ecc file to repair.',
                             **widget_file)
    main_parser.add_argument(
        '-o',
        '--output',
        metavar='eccfile_repaired.txt',
        type=str,
        required=True,  #type=argparse.FileType('rt')
        help='Output path where to save the repaired ecc file.',
        **widget_filesave)
    main_parser.add_argument(
        '-t',
        '--threshold',
        type=float,
        default=0.3,
        required=False,
        help=
        'Distance threshold for the heuristic Hamming distance repair. This must be a float; e.g., 0.2 means that if at most 20%% of the characters differ between an ecc marker and a substring in the ecc file, it will be detected as a marker and corrected.',
        **widget_text)

    # Optional general arguments
    main_parser.add_argument(
        '--index',
        metavar='eccfile.txt.idx',
        type=str,
        required=False,
        help=
        'Path to the index backup file corresponding to the ecc file (optional but helps a lot).',
        **widget_file)
    main_parser.add_argument(
        '--ecc_algo',
        type=int,
        default=1,
        required=False,
        help=
        'Which algorithm to use to generate and verify the ECC? Possible values: 1-4. 1 is the formal, fully verified Reed-Solomon in base 3 ; 2 is a faster implementation but still based on the formal base 3 ; 3 is an even faster implementation but based on another library which may not be correct ; 4 is the fastest implementation, supporting the US FAA ADSB UAT RS FEC standard, but it is totally incompatible with the other three (text encoded with any of modes 1-3 can be decoded by any of those three, but not by mode 4).',
        **widget_text)
    main_parser.add_argument(
        '-l',
        '--log',
        metavar='/some/folder/filename.log',
        type=str,
        required=False,
        help=
        'Path to the log file. (Output will be piped to both the stdout and the log file)',
        **widget_filesave)
    main_parser.add_argument('-v',
                             '--verbose',
                             action='store_true',
                             required=False,
                             default=False,
                             help='Verbose mode (show more output).')
    main_parser.add_argument(
        '--silent',
        action='store_true',
        required=False,
        default=False,
        help=
        'No console output (but if --log specified, the log will still be saved in the specified file).'
    )

    main_parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        required=False,
        default=False,
        help=
        'Force overwriting the repaired ecc file even if it already exists.'
    )

    #== Parsing the arguments
    args = main_parser.parse_args(argv)  # Storing all arguments to args

    #-- Set hard-coded variables
    entrymarker = "\xFE\xFF\xFE\xFF\xFE\xFF\xFE\xFF\xFE\xFF"  # marker that will signal the beginning of an ecc entry - use an alternating pattern of several characters, this avoids confusion (eg: if you use "AAA" as a pattern, and the ecc block of the previous file ends with "EGA", then the full string will be "EGAAAAC:\yourfolder\filea.jpg" and the entry reader will detect the first "AAA" occurrence as the entry start - this should not break the next entry because there is an automatic trim - but the previous ecc block will miss one character that could be used to repair the block, because it will be "EG" instead of "EGA"!)
    field_delim = "\xFA\xFF\xFA\xFF\xFA"  # delimiter between fields (filepath, filesize, hash+ecc blocks) inside an ecc entry
    markers = [entrymarker,
               field_delim]  # put them in a list for easy reference
    max_block_size = 27
    resilience_rate = 1

    #-- Set variables from arguments
    inputpath = fullpath(args.input)
    outputpath = fullpath(args.output)
    distance_threshold = args.threshold
    indexpath = None
    if args.index: indexpath = fullpath(args.index)
    force = args.force
    ecc_algo = args.ecc_algo
    verbose = args.verbose
    silent = args.silent

    # -- Checking arguments
    if not os.path.isfile(inputpath):
        raise NameError('Specified database ecc file %s does not exist!' %
                        inputpath)
    if os.path.isfile(outputpath) and not force:
        raise NameError(
            'Specified output path for the repaired ecc file %s already exists! Use --force if you want to overwrite.'
            % outputpath)
    if indexpath and not os.path.isfile(indexpath):
        raise NameError('Specified index backup file %s does not exist!' %
                        indexpath)

    if max_block_size < 2 or max_block_size > 255:
        raise ValueError('RS max block size must be between 2 and 255.')

    # -- Configure the log file if enabled (ptee.write() will write to both stdout/console and to the log file)
    if args.log:
        ptee = Tee(args.log, 'a', nostdout=silent)
        #sys.stdout = Tee(args.log, 'a')
        sys.stderr = Tee(args.log, 'a', nostdout=silent)
    else:
        ptee = Tee(nostdout=silent)

    # == PROCESSING BRANCHING == #

    # Precompute some parameters and load up ecc manager objects (big optimization as g_exp and g_log tables calculation is done only once)
    hasher_none = Hasher('none')  # for index ecc we don't use any hash
    ecc_params_idx = compute_ecc_params(max_block_size, resilience_rate,
                                        hasher_none)
    ecc_manager_idx = ECCMan(max_block_size,
                             ecc_params_idx["message_size"],
                             algo=ecc_algo)

    # == Main loop
    ptee.write("====================================")
    ptee.write("ECC repair, started on %s" %
               datetime.datetime.now().isoformat())
    ptee.write("====================================")
    ptee.write(
        "Please note that this tool may not know if it found all the markers, so it may miss too much corrupted markers but it will repair the ones it finds (except if you have a fully valid index file, then you are guaranteed to always find all markers)."
    )

    ecc_size = os.stat(inputpath).st_size
    if indexpath: idx_size = os.stat(indexpath).st_size
    shutil.copy2(inputpath, outputpath)
    blocksize = 65535
    with open(outputpath, 'r+b') as db:

        # == Index backup repair
        # This repair needs an index backup file which is normally generated at the same time as the ecc file. The index backup file is a file that stores the position of all ecc markers in the corresponding ecc file, and protects those positions using ecc.
        if indexpath:
            ptee.write(
                "Using the index backup file %s to repair ecc markers, please wait..."
                % args.index)
            db.seek(0)  # seek to the beginning of the database file
            idx_corrupted = 0
            idx_corrected = 0
            idx_total = 0
            markers_repaired = [0] * len(markers)
            bardisp = tqdm.tqdm(
                total=idx_size,
                file=ptee,
                leave=True,
                desc='IDXREAD',
                unit='B',
                unit_scale=True
            )  # display progress bar based on reading the database file (since we don't know how many files we will process beforehand nor how many total entries we have)
            with open(indexpath, 'rb') as dbidx:
                buf = 1
                while buf:
                    # The format of the index backup file is pretty simple: for each entrymarker or field_delim, a block is appended. Each such block is made of: the type on one byte (1 for entrymarker, 2 for field_delim), then the marker's position in the ecc file encoded as an unsigned long long (thus on a fixed 8 bytes), and finally an ecc covering both the type and the marker's position, which is of fixed size (since we know that the marker's type + position = 9 bytes). Each such block is appended right after the previous one, so we can easily read them, and such a structure cannot be tampered with by a soft error (there's no way a hard drive failure can change the structure of the data, but a malicious user can! It's then easy for a human user to fix, since you can clearly see the patterns where the markers' positions begin and end).
                    # Note that this constant sized structure of blocks is made on purpose, so that the structure of the index backup file is implicit, while the structure of the ecc file is explicit (it needs uncorrupted markers, which is a weak point that we try to address with the index backup file).
                    # eg of two blocks: 10000008Aecceccecc2000000F2ecceccecc
                    #
                    # Read one index block
                    curpos = dbidx.tell(
                    )  # backup current position for error messages
                    buf = dbidx.read(max_block_size)
                    # Update progress bar
                    bardisp.update(dbidx.tell() - bardisp.n)
                    # If we have reached EOF, then we stop here
                    if not buf: break

                    # Else it's ok we have an index block, we process it
                    idx_total += 1
                    # Extract the marker's infos and the ecc
                    marker_str = buf[:ecc_params_idx["message_size"]]
                    ecc = buf[ecc_params_idx["message_size"]:]
                    # Check if the marker's infos are corrupted, if yes, then we will try to fix that using the ecc
                    if not ecc_manager_idx.check(marker_str, ecc):
                        # Trying to fix the marker's infos using the ecc
                        idx_corrupted += 1
                        marker_repaired, repaired_ecc = ecc_manager_idx.decode(
                            marker_str, ecc)
                        # Repaired the marker's infos, all is good!
                        if ecc_manager_idx.check(marker_repaired,
                                                 repaired_ecc):
                            marker_str = marker_repaired
                            idx_corrected += 1
                        # Else it's corrupted beyond repair, just skip
                        else:
                            ptee.write(
                                "\n- Index backup file: error on block starting at %i, corrupted and could not fix it. Skipping."
                                % curpos)
                            marker_str = None
                            continue
                    if not marker_str: continue

                    # Repair ecc file's marker using our correct (or repaired) marker's infos
                    marker_type = int(
                        marker_str[0]
                    )  # marker's type is always stored on the first byte/character
                    marker_pos = struct.unpack(
                        '>Q', marker_str[1:]
                    )  # marker's position is encoded as a big-endian unsigned long long, in a 8 bytes/chars string
                    db.seek(
                        marker_pos[0]
                    )  # move the ecc reading cursor to the beginning of the marker
                    current_marker = db.read(len(markers[
                        marker_type -
                        1]))  # read the current marker (potentially corrupted)
                    db.seek(marker_pos[0])
                    if verbose:
                        print "- Found marker by index file: type=%i content=" % (
                            marker_type)
                        print db.read(len(markers[marker_type - 1]) + 4)
                        db.seek(
                            marker_pos[0]
                        )  # replace the reading cursor back in place before the marker
                    if current_marker != markers[
                            marker_type -
                            1]:  # check if we really need to repair this marker
                        # Rewrite the marker over the ecc file
                        db.write(markers[marker_type - 1])
                        markers_repaired[marker_type - 1] += 1
                    else:
                        print "skipped, no need to repair"
            # Done the index backup repair
            if bardisp.n > bardisp.total:
                bardisp.n = bardisp.total  # just a workaround in case there's one byte more than the predicted total
            bardisp.close()
            ptee.write(
                "Done. Total: %i/%i markers repaired (%i entrymarkers and %i field_delim), %i indexes corrupted and %i indexes repaired (%i indexes lost).\n"
                % (markers_repaired[0] + markers_repaired[1], idx_total,
                   markers_repaired[0], markers_repaired[1], idx_corrupted,
                   idx_corrected, idx_corrupted - idx_corrected))

        # == Heuristic greedy Hamming distance repair
        # This is a heuristic (it doesn't need any file other than the ecc file) 2-pass algorithm: the first pass tries to find the markers' positions, and then the second pass simply reads the original ecc file and copies it while repairing the found markers.
        # The first pass is obviously the most interesting, here's a description: we use a kind of greedy algorithm but with backtracking, meaning that we simply read through all the substrings sequentially, compare each with the markers and compute the Hamming distance: if the Hamming distance gets below the threshold, we trigger the found marker flag. Then if the Hamming distance decreases, we save this marker position and disable the found marker flag. However, there can be false positives (eg, the marker is corrupted in the middle), so we have a backtracking mechanism: if a later substring is found to have a Hamming distance below the threshold, we check whether the just previously found marker is in range (ie, the distance between the new position and the previous one is smaller than the marker's length) and, if the new Hamming distance is smaller, we replace the previous marker with the new marker's position, because the previous one was most likely a false positive.
        # This method doesn't require any file other than the ecc file, but it may not work on ecc markers that are too heavily tampered with, and if the detection threshold is too low or the markers are too small, there may be lots of false positives.
        # So try to use long markers (consisting of many characters, preferably an alternating pattern different from the null byte \x00) and a high enough detection threshold.
        ptee.write(
            "Using heuristics (Hamming distance) to fix markers with a threshold of %i%%, please wait..."
            % (round(distance_threshold * 100, 0)))

        # Main loop for heuristical repair, try to find the substrings that minimize the hamming distance to one of the ecc markers
        markers_repaired = [0] * len(markers)  # stat counter
        already_valid = 0  # stat counter
        db.seek(0)  # seek to the beginning of the database file
        buf = 1  # init the buffer to 1 to initiate the while loop
        markers_pos = [
            [] for i in xrange(len(markers))
        ]  # will contain the list of positions where a corrupted marker has been detected (not valid markers, they will be skipped)
        distance_thresholds = [
            round(len(x) * distance_threshold, 0) for x in markers
        ]  # calculate the number of characters maximum for distance
        skip_until = -1  # when a valid marker (non corrupted) is found, we use this variable to skip to after the marker length (to avoid detecting partial parts of this marker, which will have a hamming distance even if the marker is completely valid because the reading window will be after the beginning of the marker)
        bardisp = tqdm.tqdm(
            total=ecc_size,
            file=ptee,
            leave=True,
            desc='DBREAD',
            unit='B',
            unit_scale=True
        )  # display progress bar based on reading the database file (since we don't know how many files we will process beforehand nor how many total entries we have)
        while buf:  # until we have walked through the whole ecc file
            # Read a part of the ecc file into a buffer, this allows to process more quickly than just loading the size of a marker
            curpos = db.tell()  # keep the current reading position
            buf = db.read(blocksize)
            # Update progress bar
            bardisp.update(db.tell() - bardisp.n)
            if not buf: break  # reached EOF? quitting here

            # Scan the buffer, by splitting the buffer into substrings the length of the ecc markers
            for i in xrange(
                    len(buf) - max(len(entrymarker), len(field_delim))):
                # If we just came across a non-corrupted ecc marker, we skip until we are after this ecc marker (to avoid misdetections)
                if i < skip_until: continue
                # Compare each ecc marker type to this substring and compute the Hamming distance
                for m in xrange(len(markers)):
                    d = hamming(
                        buf[i:i + len(markers[m])], markers[m]
                    )  # Compute the Hamming distance (simply the number of different characters)
                    mcurpos = curpos + i  # current absolute position of this ecc marker

                    # If there's no difference, then it's a valid, non-corrupted ecc marker
                    if d == 0:
                        already_valid += 1  # stats...
                        # If we previously wrongly detected a corrupted ecc marker near here, then it's probably a misdetection (because we just had a partial view on this marker until now), thus we just remove it from our list of markers to repair
                        if len(
                                markers_pos[m]
                        ) > 0 and (mcurpos - markers_pos[m][-1][0]) <= len(
                                markers[m]
                        ):  # to detect that, we just check if the latest marker to repair is near the current marker (if its position is at maximum the length of the marker). This works because in the other condition below, we update the latest marker to repair if we find another one with a lower hamming distance very near.
                            del markers_pos[m][-1]
                        # Skip scanning until we are after the current marker to avoid misdetections
                        su = i + len(markers[m])
                        if su > skip_until:
                            skip_until = su  # update with the biggest marker (because both markers can be detected here if the pattern is similar)
                        break
                    # Else there's a difference/distance but it's below the threshold: we have a corrupted marker!
                    elif d > 0 and d <= distance_thresholds[m]:
                        # Updating case: If the latest marker to repair is quite close to the current one, but the current detection has a lower distance, we are probably detecting the same marker but we are better positioned now, so we update the previous marker's position with this one.
                        if len(markers_pos[m]) > 0 and (
                                mcurpos - markers_pos[m][-1][0]) <= len(
                                    markers[m]):
                            if d < markers_pos[m][-1][
                                    1]:  # Update only if the distance is less
                                markers_pos[m][-1] = [mcurpos, d]
                            else:  # Else, we probably are detecting the same marker as the last detected one, but since our scanning window has moved forward, we have increased the distance. Just skip it, we should not repair at this position (else we will probably be overwriting over real content).
                                continue
                        # Adding case: Else we just add this marker as a new one to repair by appending to the list
                        else:
                            markers_pos[m].append([mcurpos, d])
                    # Else the distance is too great for the threshold, it's not a marker at all, we go on to the next substring
            if db.tell() < ecc_size:
                db.seek(db.tell() - max(len(entrymarker), len(field_delim)))
        if bardisp.n > bardisp.total:
            bardisp.n = bardisp.total  # just a workaround in case there's one byte more than the predicted total
        bardisp.close()

        # Committing the repair into the ecc file
        for m in xrange(len(markers)):  # for each type of markers
            marker = markers[m]
            if len(
                    markers_pos[m]
            ) > 0:  # If there is any detected marker to repair for this type
                for pos in markers_pos[
                        m]:  # for each detected marker to repair, we rewrite it over into the file at the detected position
                    if verbose:
                        ptee.write(
                            "- Detected marker type %i at position %i with distance %i (%i%%): repairing."
                            % (m + 1, pos[0], pos[1],
                               (float(pos[1]) / len(markers[m])) * 100))
                    db.seek(pos[0])
                    db.write(marker)

        #print(markers_pos)
        ptee.write(
            "Done. Hamming heuristic with threshold %i%% repaired %i entrymarkers and %i field_delim (%i total) and %i were already valid.\n"
            % (round(distance_threshold * 100,
                     0), len(markers_pos[0]), len(markers_pos[1]),
               len(markers_pos[0]) + len(markers_pos[1]), already_valid))
        del ptee
        return 0
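
# --- Illustrative sketch (not part of the original script) ---
# The marker-repair heuristic above compares a scanning window against each known
# marker using a Hamming distance, and only accepts a match when the distance is
# non-zero but below a relative threshold. A minimal standalone version of that
# test might look like this (the names `hamming` and `is_corrupted_marker` are
# hypothetical, and the real script precomputes one integer threshold per marker):
def hamming(s1, s2):
    """Count differing characters between two equal-length strings."""
    return sum(c1 != c2 for c1, c2 in zip(s1, s2))

def is_corrupted_marker(window, marker, rel_threshold=0.3):
    """Return (detected, distance): detected is True when the window is close
    enough to the marker to be considered a corrupted occurrence of it."""
    d = hamming(window, marker)
    return (0 < d <= int(len(marker) * rel_threshold)), d

# Example: a marker with a single flipped byte out of 10 is detected as repairable,
# while an identical window (distance 0) is considered already valid.
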
def main(argv=None):
    if argv is None:  # if argv is empty, fetch from the commandline
        argv = sys.argv[1:]
    elif isinstance(argv, basestring):
        # Else if argv is supplied as a simple string, we need to parse it to a list of arguments before handing it to argparse or any other argument parser
        argv = shlex.split(argv)  # Parse string just like argv using shlex

    #==== COMMANDLINE PARSER ====

    #== Commandline description
    desc = '''Replication Repair
Description: Given a set of directories (or files), try to repair your files by scanning each byte, cast a majority vote among all copies, and then output the winning byte. This process is usually called triple-modular redundancy (but here it should be called n-modular redundancy since you can use as many copies as you have).
It is recommended for long term storage to store several copies of your files on different storage mediums. Everything's fine until all your copies are partially corrupted. In this case, this script can help you, by taking advantage of your multiple copies, without requiring a pregenerated ecc file. Just specify the path to every copies, and the script will try to recover them.
Replication can repair exactly r-2 errors using majority vote (you need at least 2 blocks for majority vote to work), where r is the number of replications: if r=3, you get a redundancy rate of 1/3, if r=4, rate is 2/4, etc.
This script can also take advantage of a database generated by rfigc.py to make sure that the recovered files are correct, or to select files that are already correct.

Note: in case the end result is not what you expected, you can try a different order of input directories: in case of ambiguity, the first input folder has precedence over subsequent folders.
Note2: in case some files with the same names are of different length, the merging will continue until the longest file is exhausted.
Note3: last modification date is not (yet) accounted for.
    '''
    ep = '''Use --gui as the first argument to use with a GUI (via Gooey).
'''

    #== Commandline arguments
    #-- Constructing the parser
    # Use GooeyParser if we want the GUI because it will provide better widgets
    if len(argv) > 0 and (argv[0] == '--gui' and
                          not '--ignore-gooey' in argv):  # pragma: no cover
        # Initialize the Gooey parser
        main_parser = gooey.GooeyParser(
            add_help=True,
            description=desc,
            epilog=ep,
            formatter_class=argparse.RawTextHelpFormatter)
        # Define Gooey widget types explicitly (because type auto-detection doesn't work quite well)
        widget_dir = {"widget": "DirChooser"}
        widget_filesave = {"widget": "FileSaver"}
        widget_file = {"widget": "FileChooser"}
        widget_text = {"widget": "TextField"}
        widget_multidir = {"widget": "MultiDirChooser"}
    else:  # Else in command-line usage, use the standard argparse
        # Delete the special argument to avoid unrecognized argument error in argparse
        if '--ignore-gooey' in argv:
            # This argument is automatically fed by Gooey when the user clicks on Start
            argv.remove('--ignore-gooey')
        # Initialize the normal argparse parser
        main_parser = argparse.ArgumentParser(
            add_help=True,
            description=desc,
            epilog=ep,
            formatter_class=argparse.RawTextHelpFormatter)
        # Define dummy dicts to stay compatible with command-line usage
        widget_dir = {}
        widget_filesave = {}
        widget_file = {}
        widget_text = {}
        widget_multidir = {}

    # Required arguments
    main_parser.add_argument(
        '-i',
        '--input',
        metavar='"/path/to/copy1/" "/path/to/copy2/" "etc."',
        type=is_dir_or_file,
        nargs='+',
        required=True,
        help=
        'Specify the paths to every copy you have (minimum 3 copies, else it won\'t work!). Can be folders or files (if you want to repair only one file). Order matters: in case of ambiguity, the first folder where the file exists will be chosen.',
        **widget_multidir)
    main_parser.add_argument('-o',
                             '--output',
                             metavar='/output/folder/',
                             nargs=1,
                             required=True,
                             help='Where the recovered files will be stored.',
                             **widget_dir)

    # Optional general arguments
    main_parser.add_argument(
        '-d',
        '--database',
        metavar='database.csv',
        type=is_file,
        required=False,
        help=
        'Path to a previously generated rfigc.py database. If provided, this will be used to check that the repaired files are correct (and also to find already correct files in copies).',
        **widget_file)
    main_parser.add_argument(
        '-r',
        '--report',
        metavar='/some/folder/report.csv',
        type=str,
        required=False,
        help=
        'Save all results of the repair process in a report file, with detailed descriptions of ambiguous repairs (i.e., when the majority vote came to a draw).',
        **widget_filesave)
    main_parser.add_argument(
        '-l',
        '--log',
        metavar='/some/folder/filename.log',
        type=str,
        nargs=1,
        required=False,
        help=
        'Path to the log file. (Output will be piped to both the stdout and the log file)',
        **widget_filesave)
    main_parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        required=False,
        default=False,
        help='Force overwriting the output folder even if it already exists.')
    main_parser.add_argument('-v',
                             '--verbose',
                             action='store_true',
                             required=False,
                             default=False,
                             help='Verbose mode (show more output).')
    main_parser.add_argument(
        '--silent',
        action='store_true',
        required=False,
        default=False,
        help=
        'No console output (but if --log specified, the log will still be saved in the specified file).'
    )

    #== Parsing the arguments
    args = main_parser.parse_args(argv)  # Storing all arguments to args

    #-- Set variables from arguments
    # Paths to the files to repair (ie, paths to all the different copies the user has)
    inputpaths = [fullpath(x) for x in args.input]
    outputpath = fullpath(args.output[0])
    force = args.force
    verbose = args.verbose
    silent = args.silent

    if len(inputpaths) < 3:
        raise Exception(
            'Need at least 3 copies to do a replication repair/majority vote!')

    #if os.path.isfile(inputpath): # if inputpath is a single file (instead of a folder), then define the rootfolderpath as the parent directory (for correct relative path generation, else it will also truncate the filename!)
    #rootfolderpath = os.path.dirname(inputpath)

    report_file = None
    if args.report: report_file = os.path.basename(fullpath(args.report))
    database = None
    if args.database: database = args.database

    # -- Checking arguments
    if os.path.exists(outputpath) and not force:
        raise NameError(
            'Specified output path %s already exists! Use --force if you want to overwrite.'
            % outputpath)

    if database and not os.path.isfile(database):
        raise NameError('Specified rfigc database file %s does not exist!' %
                        database)

    # -- Configure the log file if enabled (ptee.write() will write to both stdout/console and to the log file)
    if args.log:
        ptee = Tee(args.log[0], 'a', nostdout=silent)
        #sys.stdout = Tee(args.log[0], 'a')
        sys.stderr = Tee(args.log[0], 'a', nostdout=silent)
    else:
        ptee = Tee(nostdout=silent)

    # == PROCESSING BRANCHING == #

    # == Precomputation of ecc file size
    # Precomputing is important so that the user can know what size to expect before starting (and how much time it will take...).
    filescount = 0
    sizetotal = 0
    sizeheaders = 0
    visitedfiles = {}
    ptee.write("Precomputing list of files and predicted statistics...")
    prebar = tqdm.tqdm(file=ptee, disable=silent)
    for inputpath in inputpaths:
        for (dirpath, filename) in recwalk(inputpath):
            # Get full absolute filepath
            filepath = os.path.join(dirpath, filename)
            # File relative path from the root (we truncate the rootfolderpath so that we can easily check the files later even if the absolute path is different)
            relfilepath = path2unix(os.path.relpath(filepath, inputpath))

            # Only increase the files count if we didn't see this file before
            if not visitedfiles.get(relfilepath, None):
                # Counting the total number of files we will process (so that we can show a progress bar with ETA)
                filescount = filescount + 1
                # Add the file to the list of already visited files
                visitedfiles[relfilepath] = True
                # Get the current file's size
                size = os.stat(filepath).st_size
                # Compute total size of all files
                sizetotal = sizetotal + size
            prebar.update()
    prebar.close()
    ptee.write("Precomputing done.")

    # == Majority vote repair
    # For each folder, align the files lists and then majority vote over each byte to repair
    ptee.write("====================================")
    ptee.write("Replication repair, started on %s" %
               datetime.datetime.now().isoformat())
    ptee.write("====================================")

    # Prepare progress bar if necessary
    if silent:
        tqdm_bar = None
    else:
        tqdm_bar = tqdm.tqdm(total=filescount,
                             file=ptee,
                             leave=True,
                             unit="files")
    # Call the main function to synchronize files using majority vote
    errcode = synchronize_files(inputpaths,
                                outputpath,
                                database=database,
                                tqdm_bar=tqdm_bar,
                                report_file=report_file,
                                ptee=ptee,
                                verbose=verbose)
    #ptee.write("All done! Stats:\n- Total files processed: %i\n- Total files corrupted: %i\n- Total files repaired completely: %i\n- Total files repaired partially: %i\n- Total files corrupted but not repaired at all: %i\n- Total files skipped: %i" % (files_count, files_corrupted, files_repaired_completely, files_repaired_partially, files_corrupted - (files_repaired_partially + files_repaired_completely), files_skipped) )
    if tqdm_bar: tqdm_bar.close()
    ptee.write("All done!")
    if report_file:
        ptee.write("Saved replication repair results in report file: %s" %
                   report_file)
    del ptee
    return errcode
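
# --- Illustrative sketch (not part of the original script) ---
# The replication repair above delegates the actual work to synchronize_files(),
# but the core idea is a byte-wise majority vote across all copies: for each byte
# position, the value present in most copies wins. A minimal, hypothetical version
# of that vote (ignoring file alignment, unequal lengths and reporting) could be:
from collections import Counter

def majority_vote_bytes(copies):
    """Given a list of equal-length strings (the copies), return the byte-wise
    majority vote. Ties fall back to the first copy when possible, mirroring the
    'first input folder has precedence' rule."""
    result = []
    for column in zip(*copies):
        counts = Counter(column)
        best = max(counts.values())
        winner = column[0] if counts[column[0]] == best else counts.most_common(1)[0][0]
        result.append(winner)
    return ''.join(result)

# Example: majority_vote_bytes(["hxllo", "hello", "hellq"]) returns 'hello',
# because each corruption is outvoted by the two other copies.
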
def main(argv=None):
    if argv is None:  # if argv is empty, fetch from the commandline
        argv = sys.argv[1:]
    elif isinstance(argv, basestring):
        # Else if argv is supplied as a simple string, we need to parse it to a list of arguments before handing it to argparse or any other argument parser
        argv = shlex.split(argv)  # Parse string just like argv using shlex

    #==== COMMANDLINE PARSER ====

    #== Commandline description
    desc = '''Resiliency Tester
Description: Given a directory and a configuration file (containing the commands to execute before and after file tampering), this script will generate a testing tree where the files are randomly corrupted, the supplied repair commands are executed, and repair stats are computed at each step (for each stage and each repair command).

The testing process works in stages:
1- Before_tamper stage: Run preparatory commands before tampering (useful to generate ecc/database files).
2- Tamper stage: Tamper the files and/or databases.
3- After_tamper stage: Run post-tampering commands, i.e., preparatory commands before the repair stage.
4- Repair stage: Run repair commands, each repair command reusing the files generated (partially repaired) by the previous repair command. This is the repair workchain that you define here.
5- Statistics are generated for each stage.

Note that the original files are never tampered with; we tamper only the copy made inside the test folder.
Also note that the test folder is not removed at the end, so that you can inspect the files resulting from each stage, and possibly use other tools to compute additional stats.
    '''
    ep = '''Use --gui as the first argument to use with a GUI (via Gooey).
'''

    #== Commandline arguments
    #-- Constructing the parser
    # Use GooeyParser if we want the GUI because it will provide better widgets
    if len(argv) > 0 and (argv[0] == '--gui' and
                          not '--ignore-gooey' in argv):  # pragma: no cover
        # Initialize the Gooey parser
        main_parser = gooey.GooeyParser(
            add_help=True,
            description=desc,
            epilog=ep,
            formatter_class=argparse.RawTextHelpFormatter)
        # Define Gooey widget types explicitly (because type auto-detection doesn't work quite well)
        widget_dir = {"widget": "DirChooser"}
        widget_filesave = {"widget": "FileSaver"}
        widget_file = {"widget": "FileChooser"}
        widget_text = {"widget": "TextField"}
        widget_multidir = {"widget": "MultiDirChooser"}
    else:  # Else in command-line usage, use the standard argparse
        # Delete the special argument to avoid unrecognized argument error in argparse
        if '--ignore-gooey' in argv:
            # This argument is automatically fed by Gooey when the user clicks on Start
            argv.remove('--ignore-gooey')
        # Initialize the normal argparse parser
        main_parser = argparse.ArgumentParser(
            add_help=True,
            description=desc,
            epilog=ep,
            formatter_class=argparse.RawTextHelpFormatter)
        # Define dummy dicts to stay compatible with command-line usage
        widget_dir = {}
        widget_filesave = {}
        widget_file = {}
        widget_text = {}
        widget_multidir = {}

    # Required arguments
    main_parser.add_argument(
        '-i',
        '--input',
        metavar='"/path/to/original/files/"',
        type=is_dir_or_file,
        nargs=1,
        required=True,
        help='Specify the path to the directory containing the sample data.',
        **widget_dir)
    main_parser.add_argument(
        '-o',
        '--output',
        metavar='/test/folder/',
        nargs=1,
        required=True,
        help=
        'Path to the test folder that will be created to store temporary test files.',
        **widget_dir)
    main_parser.add_argument(
        '-c',
        '--config',
        metavar='/some/folder/config.txt',
        type=str,
        nargs=1,
        required=True,  #type=argparse.FileType('rt')
        help=
        'Path to the configuration file (containing the commands to execute, Makefile format). Possible entries: before_tamper, tamper, after_tamper, repair. Note that you can use a few special tags to trigger string interpolation: {inputdir}, {dbdir}, {outputdir}.',
        **widget_file)

    # Optional arguments
    main_parser.add_argument(
        '-p',
        '--parallel',
        action='store_true',
        required=False,
        help=
        'If true, repair commands will be run on the tampered files, not on the previous repair results. Useful if you want to try different strategies/commands/programs. By default, false, thus the repair commands will take advantage of the results of previous repair commands.'
    )
    main_parser.add_argument(
        '-m',
        '--multiple',
        metavar=1,
        type=int,
        default=1,
        required=False,
        help='Run the resiliency test multiple times and average the stats.',
        **widget_text)

    # Optional general arguments
    main_parser.add_argument(
        '-l',
        '--log',
        metavar='/some/folder/filename.log',
        type=str,
        nargs=1,
        required=False,
        help=
        'Path to the log file. (Output will be piped to both the stdout and the log file)',
        **widget_filesave)
    main_parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        required=False,
        default=False,
        help='Force overwriting the output folder even if it already exists.')
    main_parser.add_argument('-v',
                             '--verbose',
                             action='store_true',
                             required=False,
                             default=False,
                             help='Verbose mode (show more output).')
    main_parser.add_argument(
        '--silent',
        action='store_true',
        required=False,
        default=False,
        help=
        'No console output (but if --log specified, the log will still be saved in the specified file).'
    )

    #== Parsing the arguments
    args = main_parser.parse_args(argv)  # Storing all arguments to args

    #-- Set variables from arguments
    # Path to the input directory (where the original, sample data is)
    origpath = fullpath(args.input[0])
    outputpath = fullpath(args.output[0])
    configfile = fullpath(args.config[0])
    parallel = args.parallel
    multiple = args.multiple
    force = args.force
    verbose = args.verbose
    silent = args.silent

    #if os.path.isfile(inputpath): # if inputpath is a single file (instead of a folder), then define the rootfolderpath as the parent directory (for correct relative path generation, else it will also truncate the filename!)
    #rootfolderpath = os.path.dirname(inputpath)

    # -- Checking arguments
    if not os.path.isdir(origpath):
        raise NameError("Input path needs to be a directory!")

    if not os.path.exists(configfile):
        raise NameError(
            "Please provide a configuration file in order to run a test!")
    else:
        commands = parse_configfile(configfile)

    if os.path.exists(outputpath) and not force:
        raise NameError(
            "Specified test folder (output path) %s already exists! Use --force to overwrite this directory."
            % outputpath)
    else:
        remove_if_exist(outputpath)

    if multiple < 1:
        multiple = 1

    # -- Configure the log file if enabled (ptee.write() will write to both stdout/console and to the log file)
    if args.log:
        ptee = Tee(args.log[0], 'a', nostdout=silent)
        sys.stderr = Tee(args.log[0], 'a', nostdout=silent)
    else:
        ptee = Tee(nostdout=silent)

    # == PROCESSING BRANCHING == #

    # == Main branch
    ptee.write("====================================")
    ptee.write("Resiliency tester, started on %s" %
               datetime.datetime.now().isoformat())
    ptee.write("====================================")

    ptee.write("Testing folder %s into test folder %s for %i run(s)." %
               (origpath, outputpath, multiple))

    fstats = {}
    for m in xrange(multiple):
        run_nb = m + 1

        ptee.write("===== Resiliency tester: starting run %i =====" % run_nb)

        # -- Define directories tree for this test run
        # testpath is the basepath for the current run
        # Generate a specific subdirectory for the current run
        testpath = os.path.join(outputpath, "run%i" % run_nb)
        dbdir = fullpath(os.path.join(testpath, "db"))
        origdbdir = fullpath(os.path.join(testpath, "origdb"))
        tamperdir = fullpath(os.path.join(testpath, "tampered"))
        repairdir = fullpath(os.path.join(testpath, "repair"))

        # == START TEST RUN
        # Create test folder
        create_dir_if_not_exist(testpath)

        # Before tampering
        ptee.write("=== BEFORE TAMPERING ===")
        create_dir_if_not_exist(dbdir)
        for i, cmd in enumerate(commands["before_tamper"]):
            scmd = interpolate_dict(cmd,
                                    interp_args={
                                        "inputdir": origpath,
                                        "dbdir": dbdir
                                    })
            ptee.write("Executing command: %s" % scmd)
            execute_command(scmd, ptee=ptee)
        copy_any(dbdir,
                 origdbdir)  # make a copy because we may tamper the db files

        # Tampering
        ptee.write("=== TAMPERING ===")
        copy_any(origpath, tamperdir)
        for i, cmd in enumerate(commands["tamper"]):
            scmd = interpolate_dict(cmd,
                                    interp_args={
                                        "inputdir": tamperdir,
                                        "dbdir": dbdir
                                    })
            ptee.write("- RTEST: Executing command: %s" % scmd)
            execute_command(scmd, ptee=ptee)

        # After tampering
        ptee.write("=== AFTER TAMPERING ===")
        for i, cmd in enumerate(commands["after_tamper"]):
            scmd = interpolate_dict(cmd,
                                    interp_args={
                                        "inputdir": tamperdir,
                                        "dbdir": dbdir
                                    })
            ptee.write("- RTEST: Executing command: %s" % scmd)
            execute_command(scmd, ptee=ptee)

        # Repairing
        ptee.write("=== REPAIRING ===")
        indir = tamperdir
        finalrepairdir = ''
        for i, cmd in enumerate(commands["repair"]):
            outdir = "%s%i" % (repairdir, i)
            scmd = interpolate_dict(cmd,
                                    interp_args={
                                        "inputdir": indir,
                                        "dbdir": dbdir,
                                        "outputdir": outdir
                                    })
            ptee.write("- RTEST: Executing command: %s" % scmd)
            create_dir_if_not_exist(outdir)
            execute_command(scmd, ptee=ptee)
            copy_any(
                indir, outdir, only_missing=True
            )  # copy the files that did not need any repair (or could not be repaired at all!)
            finalrepairdir = outdir
            # If parallel, do not reuse the previous repair resulting files, repair from the tampered files directly
            if not parallel: indir = outdir

        # Stats
        stats = compute_all_diff_stats(commands, origpath, tamperdir,
                                       repairdir, finalrepairdir)
        ptee.write(
            "========== Resiliency tester results for run %i ==========" %
            run_nb)
        for key, stat in stats.iteritems():
            ptee.write("=> Stage: %s" % key)
            ptee.write(pretty_print_stats(stat))

        if run_nb == 1:
            fstats = stats
        else:
            fstats = stats_running_average(fstats, stats, run_nb - 1)

    ptee.write("============================")
    ptee.write("RESILIENCY TESTER FINAL AVERAGED RESULTS OVER %i RUNS" %
               multiple)
    ptee.write("============================")
    for key, stat in fstats.iteritems():
        ptee.write("=> Stage: %s" % key)
        ptee.write(pretty_print_stats(stat))

    # Shutting down
    del ptee
    # Did we completely repair all the files? If so, return OK (0)
    if stats["final"]["error"] == 0:
        return 0
    else:
        return 1
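
# --- Illustrative sketch (not part of the original script) ---
# The resiliency tester above is driven by a configuration file whose entries
# (before_tamper, tamper, after_tamper, repair) list the commands to run at each
# stage, with the {inputdir}, {dbdir} and {outputdir} tags interpolated per run.
# The commands below are placeholders (not real tools), and interpolate_example()
# is only a stand-in for interpolate_dict(), whose real behavior may differ:
example_commands = {
    "before_tamper": ['generate_ecc "{inputdir}" "{dbdir}/ecc.db"'],
    "tamper": ['tamper_files "{inputdir}"'],
    "after_tamper": [],
    "repair": ['repair_ecc "{inputdir}" "{dbdir}/ecc.db" "{outputdir}"'],
}

def interpolate_example(cmd, interp_args):
    """Replace the special tags in a command string with the per-run paths."""
    return cmd.format(**interp_args)

# Example: interpolate_example(example_commands["repair"][0],
#                              {"inputdir": "test/run1/tampered",
#                               "dbdir": "test/run1/db",
#                               "outputdir": "test/run1/repair0"})
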
Example #6
0
def main(argv=None):
    if argv is None: # if argv is empty, fetch from the commandline
        argv = sys.argv[1:]
    elif isinstance(argv, basestring): # else if argv is supplied but it's a simple string, we need to parse it to a list of arguments before handing to argparse or any other argument parser
        argv = shlex.split(argv) # Parse string just like argv using shlex

    #==== COMMANDLINE PARSER ====

    #== Commandline description
    desc = '''Recursive/Relative Files Integrity Generator and Checker
Description: Recursively generate or check the integrity of files by MD5 and SHA1 hashes, size, modification date or by data structure integrity (only for images).

This script is originally meant for data archival, providing an easy way to check for silent file corruption. Thus, this script uses relative paths so that you can easily compute and check the same redundant data copied on different mediums (hard drives, optical discs, etc.). This script is not meant for system file corruption notification, but rather to be used from time to time to check up on your data archives' integrity.
    '''
    ep = '''Example usage:
- To generate the database (only needed once):
python rfigc.py -i "folderimages" -d "dbhash.csv" -g
- To check:
python rfigc.py -i "folderimages" -d "dbhash.csv" -l log.txt -s
- To update your database by appending new files:
python rfigc.py -i "folderimages" -d "dbhash.csv" -u -a 
- To update your database by appending new files AND removing inexistent files:
python rfigc.py -i "folderimages" -d "dbhash.csv" -u -a -r
- To use with a gui:
python rfigc.py --gui

Note that the script is in check mode by default, to avoid wrong manipulations. It will also alert you if you try to generate over an already existing database file.
Note2: you can use PyPy to speed up the generation, but you should avoid using PyPy in checking mode (from our tests, it slows things down a lot).
'''

    #== Commandline arguments
    #-- Constructing the parser
    # Use GooeyParser if we want the GUI because it will provide better widgets
    if len(argv) > 0 and (argv[0] == '--gui' and not '--ignore-gooey' in argv):  # pragma: no cover
        # Initialize the Gooey parser
        main_parser = gooey.GooeyParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter)
        # Define Gooey widget types explicitly (because type auto-detection doesn't work quite well)
        widget_dir = {"widget": "DirChooser"}
        widget_filesave = {"widget": "FileSaver"}
        widget_file = {"widget": "FileChooser"}
        widget_text = {"widget": "TextField"}
    else: # Else in command-line usage, use the standard argparse
        # Delete the special argument to avoid unrecognized argument error in argparse
        if '--ignore-gooey' in argv: argv.remove('--ignore-gooey') # this argument is automatically fed by Gooey when the user clicks on Start
        # Initialize the normal argparse parser
        main_parser = argparse.ArgumentParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter)
        # Define dummy dicts to stay compatible with command-line usage
        widget_dir = {}
        widget_filesave = {}
        widget_file = {}
        widget_text = {}
    # Required arguments
    main_parser.add_argument('-i', '--input', metavar='/path/to/root/folder', type=is_dir_or_file, nargs=1, required=True,
                        help='Path to the root folder (or a single file) from where the scanning will occur.', **widget_dir)
    main_parser.add_argument('-d', '--database', metavar='/some/folder/databasefile.csv', type=str, nargs=1, required=True, #type=argparse.FileType('rt')
                        help='Path to the csv file containing the hash information.', **widget_filesave)

    # Optional general arguments
    main_parser.add_argument('-l', '--log', metavar='/some/folder/filename.log', type=str, nargs=1, required=False,
                        help='Path to the log file. (Output will be piped to both the stdout and the log file)', **widget_filesave)
    main_parser.add_argument('--skip_hash', action='store_true', required=False, default=False,
                        help='Skip hash computation/checking (checks only the other metadata, this is a lot quicker).')
    main_parser.add_argument('-v', '--verbose', action='store_true', required=False, default=False,
                        help='Verbose mode (show more output).')
    main_parser.add_argument('--silent', action='store_true', required=False, default=False,
                        help='No console output (but if --log specified, the log will still be saved in the specified file).')

    # Checking mode arguments
    main_parser.add_argument('-s', '--structure_check', action='store_true', required=False, default=False,
                        help='Check images structures for corruption?')
    main_parser.add_argument('-e', '--errors_file', metavar='/some/folder/errorsfile.csv', type=str, nargs=1, required=False, #type=argparse.FileType('rt')
                        help='Path to the error file, where errors found at check time will be stored in CSV for further processing by other software (such as file repair software).', **widget_filesave)
    main_parser.add_argument('-m', '--disable_modification_date_checking', action='store_true', required=False, default=False,
                        help='Disable modification date checking.')
    main_parser.add_argument('--skip_missing', action='store_true', required=False, default=False,
                        help='Skip missing files when checking (useful if you split your files into several mediums, for example on optical discs with limited capacity).')

    # Generate mode arguments
    main_parser.add_argument('-g', '--generate', action='store_true', required=False, default=False,
                        help='Generate the database? (omit this parameter to check instead of generating).')
    main_parser.add_argument('-f', '--force', action='store_true', required=False, default=False,
                        help='Force overwriting the database file even if it already exists (if --generate).')

    # Update mode arguments
    main_parser.add_argument('-u', '--update', action='store_true', required=False, default=False,
                        help='Update database (you must also specify --append or --remove).')
    main_parser.add_argument('-a', '--append', action='store_true', required=False, default=False,
                        help='Append new files (if --update).')
    main_parser.add_argument('-r', '--remove', action='store_true', required=False, default=False,
                        help='Remove missing files (if --update).')

    # Recover from file scraping
    main_parser.add_argument('--filescraping_recovery', action='store_true', required=False, default=False,
                        help='Given a folder of unorganized files, compare to the database and restore the filename and directory structure into the output folder.')
    main_parser.add_argument('-o', '--output', metavar='/path/to/root/folder', type=is_dir, nargs=1, required=False,
                        help='Path to the output folder where the reorganized files will be copied (when using --filescraping_recovery).', **widget_dir)

    #== Parsing the arguments
    args = main_parser.parse_args(argv) # Storing all arguments to args

    #-- Set variables from arguments
    inputpath = fullpath(args.input[0]) # path to the files to protect (either a folder or a single file)
    rootfolderpath = inputpath # path to the root folder (to compute relative paths)
    #database = os.path.basename(fullpath(args.database[0])) # Take only the filename.
    database = fullpath(args.database[0])
    generate = args.generate
    structure_check = args.structure_check
    force = args.force
    disable_modification_date_checking = args.disable_modification_date_checking
    skip_missing = args.skip_missing
    skip_hash = args.skip_hash
    update = args.update
    append = args.append
    remove = args.remove
    outputpath = None
    if args.output: outputpath = fullpath(args.output[0])
    filescraping = args.filescraping_recovery
    verbose = args.verbose
    silent = args.silent

    if os.path.isfile(inputpath): # if inputpath is a single file (instead of a folder), then define the rootfolderpath as the parent directory (for correct relative path generation, else it will also truncate the filename!)
        rootfolderpath = os.path.dirname(inputpath)

    errors_file = None
    if args.errors_file: errors_file = fullpath(args.errors_file[0])

    # -- Checking arguments
    if structure_check and not structure_check_import:
        raise ImportError('PIL (Python Imaging Library) could not be imported. PIL is needed to do structure check, please install PIL (or you can disable structure check to continue).')

    if update and (not append and not remove):
        raise ValueError('--update specified but not --append nor --remove. You must specify at least one of these modes when using --update!')

    if filescraping and not outputpath:
        raise ValueError('Output path needed when using --filescraping_recovery.')

    # -- Configure the log file if enabled (ptee.write() will write to both stdout/console and to the log file)
    if args.log:
        ptee = Tee(args.log[0], 'a', nostdout=silent)
        #sys.stdout = Tee(args.log[0], 'a')
        sys.stderr = Tee(args.log[0], 'a', nostdout=silent)
    else:
        ptee = Tee(nostdout=silent)


    # == PROCESSING BRANCHING == #
    retval = 0 # Returned value: 0 OK, 1 KO (files in error), -1 Error

    # -- Update the database file by removing missing files
    if update and remove:
        if not os.path.isfile(database):
            raise NameError('Specified database file does not exist, can\'t update!')

        ptee.write("====================================")
        ptee.write("RIFGC Database Update Removal of missing files, started on %s" % datetime.datetime.now().isoformat())
        ptee.write("====================================")

        # Precompute the total number of lines to process (this should be fairly quick)
        filestodocount = 0
        with open(database, 'rb') as dbf:
            for row in csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"'):
                filestodocount = filestodocount + 1

            # Preparing CSV writer for the temporary file that will have the lines removed
            with open(database+'.rem', 'wb') as dbfilerem:
                csv_writer = csv.writer(dbfilerem, lineterminator='\n', delimiter='|', quotechar='"')

                # Printing CSV headers
                csv_headers = ['path', 'md5', 'sha1', 'last_modification_timestamp', 'last_modification_date', 'size', 'ext']
                csv_writer.writerow(csv_headers)

                dbf.seek(0)
                dbfile = csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"') # recreate the reader after seeking back to the beginning of the file, to reset the generator position
                delcount = 0
                filescount = 0
                for row in tqdm.tqdm(dbfile, file=ptee, total=filestodocount, leave=True):
                    filescount = filescount + 1
                    filepath = os.path.join(rootfolderpath, row['path']) # Build the absolute file path

                    # Single-file mode: skip if this is not the file we are looking for
                    if inputpath != rootfolderpath and inputpath != filepath: continue

                    if verbose: ptee.write("\n- Processing file %s" % row['path'])
                    errors = []
                    if not os.path.isfile(filepath):
                        delcount = delcount + 1
                        ptee.write("\n- File %s is missing, removed from database." % row['path'])
                    else:
                        csv_writer.writerow( [ path2unix(row['path']), row['md5'], row['sha1'], row['last_modification_timestamp'], row['last_modification_date'], row['size'], row['ext'] ] )

        # REMOVE UPDATE DONE, we remove the old database file and replace it with the new
        os.remove(database) # delete old database
        os.rename(database+'.rem', database) # rename new database to match old name
        # Show some stats
        ptee.write("----------------------------------------------------")
        ptee.write("All files processed: Total: %i - Removed/Missing: %i.\n\n" % (filescount, delcount))

    # -- Generate the database file or update/append (both will walk through the filesystem to get new files, contrary to the other branches, which walk through the database csv)
    if generate or (update and append):
        if not force and os.path.isfile(database) and not update:
            raise NameError('Database file already exists. Please choose another name to generate your database file.')

        if generate:
            dbmode = 'wb'
        elif (update and append):
            dbmode = 'ab'
        with open(database, dbmode) as dbfile: # Must open in write + binary, because on Windows it will do weird things otherwise (at least with Python 2.7)
            ptee.write("====================================")
            if generate:
                ptee.write("RIFGC Database Generation started on %s" % datetime.datetime.now().isoformat())
            elif update and append:
                ptee.write("RIFGC Database Update Append new files, started on %s" % datetime.datetime.now().isoformat())
            ptee.write("====================================")

            # Preparing CSV writer
            csv_writer = csv.writer(dbfile, lineterminator='\n', delimiter='|', quotechar='"')

            if generate:
                # Printing CSV headers
                csv_headers = ['path', 'md5', 'sha1', 'last_modification_timestamp', 'last_modification_date', 'size', 'ext']
                csv_writer.writerow(csv_headers)

            if (update and append):
                # Extract all paths already stored in the database to avoid re-adding them
                db_paths = {}
                with open(database, 'rb') as dbf:
                    for row in csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"'):
                        db_paths[row['path']] = True

            # Counting the total number of files that we will have to process
            ptee.write("Counting total number of files to process, please wait...")
            filestodocount = 0
            for _ in tqdm.tqdm(recwalk(inputpath), file=ptee):
                filestodocount = filestodocount + 1
            ptee.write("Counting done.")

            # Recursively traversing the root directory and save the metadata in the db for each file
            ptee.write("Processing files to compute metadata to store in database, please wait...")
            filescount = 0
            addcount = 0
            for (dirpath, filename) in tqdm.tqdm(recwalk(inputpath), file=ptee, total=filestodocount, leave=True):
                    filescount = filescount + 1
                    # Get full absolute filepath
                    filepath = os.path.join(dirpath, filename)
                    # Get database relative path (from scanning root folder)
                    relfilepath = path2unix(os.path.relpath(filepath, rootfolderpath)) # File relative path from the root (so that we can easily check the files later even if the absolute path is different)
                    if verbose: ptee.write("\n- Processing file %s" % relfilepath)

                    # If update + append mode, then if the file is already in the database we skip it (we continue computing metadata only for new files)
                    if update and append and relfilepath in db_paths:
                        if verbose: ptee.write("... skipped")
                        continue
                    else:
                        addcount = addcount + 1

                    # Compute the hashes (leave this outside the with block because generate_hashes() opens the file by itself, so that both hashes can be computed in a single sweep of the file)
                    if not skip_hash:
                        md5hash, sha1hash = generate_hashes(filepath)
                    else:
                        md5hash = sha1hash = 0
                    # Compute other metadata
                    with open(filepath) as thisfile:
                        # Check file structure if option is enabled
                        if structure_check:
                            struct_result = check_structure(filepath)
                            # Print/Log an error only if there's one (else we won't say anything)
                            if struct_result:
                                ptee.write("\n- Structure error with file "+filepath+": "+struct_result)
                        ext = os.path.splitext(filepath)[1] # File's extension
                        statinfos = os.stat(filepath) # Various OS filesystem infos about the file
                        size = statinfos.st_size # File size
                        lastmodif = statinfos.st_mtime # File last modified date (as a timestamp)
                        lastmodif_readable = datetime.datetime.fromtimestamp(lastmodif).strftime("%Y-%m-%d %H:%M:%S") # File last modified date as a human readable date (ISO universal time)

                        csv_row = [path2unix(relfilepath), md5hash, sha1hash, lastmodif, lastmodif_readable, size, ext] # Prepare the CSV row
                        csv_writer.writerow(csv_row) # Save to the file
        ptee.write("----------------------------------------------------")
        ptee.write("All files processed: Total: %i - Added: %i.\n\n" % (filescount, addcount))

    # -- Filescraping recovery mode
    # We will compare all files from the input path and reorganize the ones that are recognized into the output path
    elif filescraping:
        import shutil
        ptee.write("====================================")
        ptee.write("RIFGC File Scraping Recovery started on %s" % datetime.datetime.now().isoformat())
        ptee.write("====================================")

        ptee.write("Loading the database into memory, please wait...")
        md5list = {}
        sha1list = {}
        dbrows = {} # TODO: instead of memorizing everything in memory, store just the reading cursor position at the beginning of the line with the size and then just read when necessary from the db file directly
        id = 0
        with open(database, 'rb') as db:
            for row in csv.DictReader(db, lineterminator='\n', delimiter='|', quotechar='"'):
                id += 1
                if (len(row['md5']) > 0 and len(row['sha1']) > 0):
                    md5list[row['md5']] = id
                    sha1list[row['sha1']] = id
                    dbrows[id] = row
        ptee.write("Loading done.")

        if len(dbrows) == 0:
            ptee.write("Nothing to do, there's no md5 nor sha1 hashes in the database file!")
            del ptee
            return 1 # return with an error

        # Counting the total number of files that we will have to process
        ptee.write("Counting total number of files to process, please wait...")
        filestodocount = 0
        for _ in tqdm.tqdm(recwalk(inputpath), file=ptee):
            filestodocount = filestodocount + 1
        ptee.write("Counting done.")
        
        # Recursively traversing the root directory and save the metadata in the db for each file
        ptee.write("Processing file scraping recovery, walking through all files from input folder...")
        filescount = 0
        copiedcount = 0
        for (dirpath, filename) in tqdm.tqdm(recwalk(inputpath), file=ptee, total=filestodocount, leave=True):
                filescount = filescount + 1
                # Get full absolute filepath
                filepath = os.path.join(dirpath,filename)
                # Get database relative path (from scanning root folder)
                relfilepath = path2unix(os.path.relpath(filepath, rootfolderpath)) # File relative path from the root (we truncate the rootfolderpath so that we can easily check the files later even if the absolute path is different)
                if verbose: ptee.write("\n- Processing file %s" % relfilepath)

                # Generate the hashes from the currently inspected file
                md5hash, sha1hash = generate_hashes(filepath)
                # If it matches a file in the database, we will copy it over with the correct name, directory structure, file extension and last modification date
                if md5hash in md5list and sha1hash in sha1list and md5list[md5hash] == sha1list[sha1hash]:
                    # Load the db infos for this file
                    row = dbrows[md5list[md5hash]]
                    ptee.write("- Found: %s --> %s.\n" % (filepath, row['path']))
                    # Generate full absolute filepath of the output file
                    outfilepath = os.path.join(outputpath, row['path'])
                    # Recursively create the directory tree structure
                    outfiledir = os.path.dirname(outfilepath)
                    if not os.path.isdir(outfiledir): os.makedirs(outfiledir) # if the target directory does not exist, create it (and create recursively all parent directories too)
                    # Copy over and set attributes
                    shutil.copy2(filepath, outfilepath)
                    filestats = os.stat(filepath)
                    os.utime(outfilepath, (filestats.st_atime, float(row['last_modification_timestamp'])))
                    # Counter...
                    copiedcount += 1
        ptee.write("----------------------------------------------------")
        ptee.write("All files processed: Total: %i - Recovered: %i.\n\n" % (filescount, copiedcount))

    # -- Check mode: check the files using a database file
    elif not update and not generate and not filescraping:
        ptee.write("====================================")
        ptee.write("RIFGC Check started on %s" % datetime.datetime.now().isoformat())
        ptee.write("====================================")

        # Open the errors file if supplied (where we will store every error in a formatted csv so that it can later be easily processed by other software, such as repair software)
        if errors_file is not None:
            efile = open(errors_file, 'wb')
            e_writer = csv.writer(efile, delimiter='|', lineterminator='\n', quotechar='"')

        # Precompute the total number of lines to process (this should be fairly quick)
        filestodocount = 0
        with open(database, 'rb') as dbf:
            for row in csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"'):
                filestodocount = filestodocount + 1

            # Processing the files using the database list
            ptee.write("Checking for files corruption based on database %s on input path %s, please wait..." % (database, inputpath))
            dbf.seek(0)
            dbfile = csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"') # recreate the reader after seeking back to the beginning of the file, to reset the generator position
            errorscount = 0
            filescount = 0
            for row in tqdm.tqdm(dbfile, file=ptee, total=filestodocount, leave=True):
                filescount = filescount + 1
                filepath = os.path.join(rootfolderpath, row['path'])

                # Single-file mode: skip if this is not the file we are looking for
                if inputpath != rootfolderpath and inputpath != filepath: continue

                if verbose: ptee.write("\n- Processing file %s" % row['path'])
                errors = []
                if not os.path.isfile(filepath):
                    if not skip_missing: errors.append('file is missing')
                # First generate the current file's metadata given the filepath from the CSV, and then we will check the differences from database
                else:
                    try: # Try to be resilient to various file access errors
                        # Generate hash
                        if not skip_hash:
                            md5hash, sha1hash = generate_hashes(filepath)
                        else:
                            md5hash = sha1hash = 0
                        # Check structure integrity if enabled
                        if structure_check:
                            struct_result = check_structure(filepath)
                            if struct_result:
                                errors.append("structure error (%s)" % struct_result)
                        # Compute other metadata
                        with open(filepath) as thisfile:
                            ext = os.path.splitext(filepath)[1]
                            statinfos = os.stat(filepath)
                            size = statinfos.st_size
                            lastmodif = statinfos.st_mtime
                            lastmodif_readable = datetime.datetime.fromtimestamp(lastmodif).strftime("%Y-%m-%d %H:%M:%S")

                            # CHECK THE DIFFERENCES
                            if not skip_hash and md5hash != row['md5'] and sha1hash != row['sha1']:
                                errors.append('both md5 and sha1 hash failed')
                            elif not skip_hash and ((md5hash == row['md5'] and sha1hash != row['sha1']) or (md5hash != row['md5'] and sha1hash == row['sha1'])):
                                errors.append('one of the hashes failed but not the other (which may indicate that the database file is corrupted)')
                            if ext != row['ext']:
                                errors.append('extension has changed')
                            if size != int(row['size']):
                                errors.append("size has changed (before: %s - now: %s)" % (row['size'], size))
                            if not disable_modification_date_checking and (lastmodif != float(row['last_modification_timestamp']) and round(lastmodif,0) != round(float(row['last_modification_timestamp']),0)): # for usage with PyPy: last modification time is differently managed (rounded), thus we need to round here manually to compare against PyPy.
                                errors.append("modification date has changed (before: %s - now: %s)" % (row['last_modification_date'], lastmodif_readable))
                    except IOError as e: # Catch IOError as a file error
                        errors.append('file can\'t be read, IOError (inaccessible, maybe bad sector?)')
                    except Exception as e: # Any other exception when accessing the file will also be caught as a file error
                        errors.append('file can\'t be accessed: %s' % e)
                # Print/Log all errors for this file if any happened
                if errors:
                    errorscount = errorscount + 1
                    ptee.write("\n- Error for file %s: %s." % (row['path'], ', '.join(errors)))
                    if errors_file is not None: # Write the error in a csv file if supplied (for easy processing later by other software such as file repair software)
                        e_writer.writerow( [row['path'], ', '.join(errors)] )
        # END OF CHECKING: show some stats
        ptee.write("----------------------------------------------------")
        ptee.write("All files checked: Total: %i - Files with errors: %i.\n\n" % (filescount, errorscount))
        retval = (errorscount > 0)

    del ptee
    return retval # return error code if any
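
# --- Illustrative sketch (not part of the original script) ---
# A comment in the generation branch above notes that generate_hashes() computes
# both the MD5 and the SHA1 of a file in a single sweep. A minimal, hypothetical
# implementation of that idea (the real generate_hashes() may differ) simply feeds
# every chunk to both hash objects while reading the file only once:
import hashlib

def generate_hashes_sketch(filepath, blocksize=65536):
    """Return (md5_hexdigest, sha1_hexdigest), reading the file only once."""
    md5 = hashlib.md5()
    sha1 = hashlib.sha1()
    with open(filepath, 'rb') as f:
        chunk = f.read(blocksize)
        while chunk:
            md5.update(chunk)
            sha1.update(chunk)
            chunk = f.read(blocksize)
    return md5.hexdigest(), sha1.hexdigest()
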
Example #7
0
def main(argv=None):
    if argv is None: # if argv is empty, fetch from the commandline
        argv = sys.argv[1:]
    elif isinstance(argv, basestring): # else if argv is supplied but it's a simple string, we need to parse it to a list of arguments before handing to argparse or any other argument parser
        argv = shlex.split(argv) # Parse string just like argv using shlex

    #==== COMMANDLINE PARSER ====

    #== Commandline description
    desc = '''Resiliency Tester
Description: Given a directory and a configuration file (containing the commands to execute before and after file tampering), this script will generate a testing tree where the files are randomly corrupted, the supplied repair commands are executed, and repair stats are computed at each step (for each stage and each repair command).

The testing process works in stages:
1- Before_tamper stage: Run preparatory commands before tampering (useful to generate ecc/database files).
2- Tamper stage: Tamper the files and/or databases.
3- After_tamper stage: Run post-tampering commands, i.e., preparatory commands before the repair stage.
4- Repair stage: Run repair commands, each repair command reusing the files generated (partially repaired) by the previous repair command. This is the repair workchain that you define here.
5- Statistics are generated for each stage.

Note that the original files are never tampered with; we tamper only the copy made inside the test folder.
Also note that the test folder is not removed at the end, so that you can inspect the files resulting from each stage, and possibly use other tools to compute additional stats.
    '''
    ep = '''Use --gui as the first argument to use with a GUI (via Gooey).
'''

    #== Commandline arguments
    #-- Constructing the parser
    # Use GooeyParser if we want the GUI because it will provide better widgets
    if len(argv) > 0 and (argv[0] == '--gui' and not '--ignore-gooey' in argv):  # pragma: no cover
        # Initialize the Gooey parser
        main_parser = gooey.GooeyParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter)
        # Define Gooey widget types explicitly (because type auto-detection doesn't work quite well)
        widget_dir = {"widget": "DirChooser"}
        widget_filesave = {"widget": "FileSaver"}
        widget_file = {"widget": "FileChooser"}
        widget_text = {"widget": "TextField"}
        widget_multidir = {"widget": "MultiDirChooser"}
    else: # Else in command-line usage, use the standard argparse
        # Delete the special argument to avoid unrecognized argument error in argparse
        if '--ignore-gooey' in argv: argv.remove('--ignore-gooey') # this argument is automatically fed by Gooey when the user clicks on Start
        # Initialize the normal argparse parser
        main_parser = argparse.ArgumentParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter)
        # Define dummy dicts to stay compatible with command-line usage
        widget_dir = {}
        widget_filesave = {}
        widget_file = {}
        widget_text = {}
        widget_multidir = {}

    # Required arguments
    main_parser.add_argument('-i', '--input', metavar='"/path/to/original/files/"', type=is_dir_or_file, nargs=1, required=True,
                        help='Specify the path to the directory containing the sample data.', **widget_dir)
    main_parser.add_argument('-o', '--output', metavar='/test/folder/', nargs=1, required=True,
                        help='Path to the test folder that will be created to store temporary test files.', **widget_dir)
    main_parser.add_argument('-c', '--config', metavar='/some/folder/config.txt', type=str, nargs=1, required=True, #type=argparse.FileType('rt')
                        help='Path to the configuration file (containing the commands to execute, Makefile format). Possible entries: before_tamper, tamper, after_tamper, repair. Note that you can use a few special tags to trigger string interpolation: {inputdir}, {dbdir}, {outputdir}.', **widget_file)

    # Optional arguments
    main_parser.add_argument('-p', '--parallel', action='store_true', required=False,
                        help='If true, repair commands will be run on the tampered files, not on the previous repair results. Useful if you want to try different strategies/commands/programs. By default, false, thus the repair commands will take advantage of the results of previous repair commands.')
    main_parser.add_argument('-m', '--multiple', metavar=1, type=int, default=1, required=False,
                        help='Run the resiliency test multiple times and average the stats.', **widget_text)

    # Optional general arguments
    main_parser.add_argument('-l', '--log', metavar='/some/folder/filename.log', type=str, nargs=1, required=False,
                        help='Path to the log file. (Output will be piped to both stdout and the log file)', **widget_filesave)
    main_parser.add_argument('-f', '--force', action='store_true', required=False, default=False,
                        help='Force overwriting the output folder even if it already exists.')
    main_parser.add_argument('-v', '--verbose', action='store_true', required=False, default=False,
                        help='Verbose mode (show more output).')
    main_parser.add_argument('--silent', action='store_true', required=False, default=False,
                        help='No console output (but if --log specified, the log will still be saved in the specified file).')

    #== Parsing the arguments
    args = main_parser.parse_args(argv) # Storing all arguments to args

    #-- Set variables from arguments
    origpath = fullpath(args.input[0]) # path to the input directory (where the original, sample data is)
    outputpath = fullpath(args.output[0])
    configfile = fullpath(args.config[0])
    parallel = args.parallel
    multiple = args.multiple
    force = args.force
    verbose = args.verbose
    silent = args.silent

    #if os.path.isfile(inputpath): # if inputpath is a single file (instead of a folder), then define the rootfolderpath as the parent directory (for correct relative path generation, else it will also truncate the filename!)
        #rootfolderpath = os.path.dirname(inputpath)

    # -- Checking arguments
    if not os.path.isdir(origpath):
        raise NameError("Input path needs to be a directory!")

    if not os.path.exists(configfile):
        raise NameError("Please provide a configuration file in order to run a test!")
    else:
        commands = parse_configfile(configfile)

    if os.path.exists(outputpath) and not force:
        raise NameError("Specified test folder (output path) %s already exists! Use --force to overwrite this directory." % outputpath)
    else:
        remove_if_exist(outputpath)

    if multiple < 1:
        multiple = 1

    # -- Configure the log file if enabled (ptee.write() will write to both stdout/console and to the log file)
    if args.log:
        ptee = Tee(args.log[0], 'a', nostdout=silent)
        sys.stderr = Tee(args.log[0], 'a', nostdout=silent)
    else:
        ptee = Tee(nostdout=silent)

    # == PROCESSING BRANCHING == #

    # == Main branch
    ptee.write("====================================")
    ptee.write("Resiliency tester, started on %s" % datetime.datetime.now().isoformat())
    ptee.write("====================================")
    
    ptee.write("Testing folder %s into test folder %s for %i run(s)." % (origpath, outputpath, multiple))

    fstats = {}
    for m in xrange(multiple):
        run_nb = m + 1

        ptee.write("===== Resiliency tester: starting run %i =====" % run_nb)

        # -- Define directories tree for this test run
        # testpath is the basepath for the current run
        # Generate a specific subdirectory for the current run
        testpath = os.path.join(outputpath, "run%i" % run_nb)
        dbdir = fullpath(os.path.join(testpath, "db"))
        origdbdir = fullpath(os.path.join(testpath, "origdb"))
        tamperdir = fullpath(os.path.join(testpath, "tampered"))
        repairdir = fullpath(os.path.join(testpath, "repair"))

        # == START TEST RUN
        # Create test folder
        create_dir_if_not_exist(testpath)

        # Before tampering
        ptee.write("=== BEFORE TAMPERING ===")
        create_dir_if_not_exist(dbdir)
        for i, cmd in enumerate(commands["before_tamper"]):
            scmd = interpolate_dict(cmd, interp_args={"inputdir": origpath, "dbdir": dbdir})
            ptee.write("Executing command: %s" % scmd)
            execute_command(scmd, ptee=ptee)
        copy_any(dbdir, origdbdir) # make a copy because we may tamper the db files

        # Tampering
        ptee.write("=== TAMPERING ===")
        copy_any(origpath, tamperdir)
        for i, cmd in enumerate(commands["tamper"]):
            scmd = interpolate_dict(cmd, interp_args={"inputdir": tamperdir, "dbdir": dbdir})
            ptee.write("- RTEST: Executing command: %s" % scmd)
            execute_command(scmd, ptee=ptee)

        # After tampering
        ptee.write("=== AFTER TAMPERING ===")
        for i, cmd in enumerate(commands["after_tamper"]):
            scmd = interpolate_dict(cmd, interp_args={"inputdir": tamperdir, "dbdir": dbdir})
            ptee.write("- RTEST: Executing command: %s" % scmd)
            execute_command(scmd, ptee=ptee)

        # Repairing
        ptee.write("=== REPAIRING ===")
        indir = tamperdir
        finalrepairdir = ''
        for i, cmd in enumerate(commands["repair"]):
            outdir = "%s%i" % (repairdir, i)
            scmd = interpolate_dict(cmd, interp_args={"inputdir": indir, "dbdir": dbdir, "outputdir": outdir})
            ptee.write("- RTEST: Executing command: %s" % scmd)
            create_dir_if_not_exist(outdir)
            execute_command(scmd, ptee=ptee)
            copy_any(indir, outdir, only_missing=True) # copy the files that did not need any repair (or could not be repaired at all!)
            finalrepairdir = outdir
            # If parallel, do not reuse the previous repair resulting files, repair from the tampered files directly
            if not parallel: indir = outdir
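            # Added note: with two repair commands A and B, the default sequential mode chains
            # them (tampered -> A -> B), whereas --parallel runs each command against the
            # tampered files directly (tampered -> A, tampered -> B).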

        # Stats
        stats = compute_all_diff_stats(commands, origpath, tamperdir, repairdir, finalrepairdir)
        ptee.write("========== Resiliency tester results for run %i ==========" % run_nb)
        for key, stat in stats.iteritems():
            ptee.write("=> Stage: %s" % key)
            ptee.write(pretty_print_stats(stat))

        if run_nb == 1:
            fstats = stats
        else:
            fstats = stats_running_average(fstats, stats, run_nb-1)
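        # Added note: stats_running_average() is assumed (it is not shown here) to fold the new
        # run into the previous mean incrementally, i.e. avg_new = avg_old + (x - avg_old) / run_nb
        # for each stat, so that fstats always holds the average over the runs completed so far.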

    ptee.write("============================")
    ptee.write("RESILIENCY TESTER FINAL AVERAGED RESULTS OVER %i RUNS" % multiple)
    ptee.write("============================")
    for key, stat in fstats.iteritems():
        ptee.write("=> Stage: %s" % key)
        ptee.write(pretty_print_stats(stat))

    # Shutting down
    del ptee
    # Did the last run completely repair all the files? If so, return OK
    if stats["final"]["error"] == 0:
        return 0
    else:
        return 1
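Since main() accepts argv either as a list or as a plain string (which is split with shlex), the resiliency tester above can also be driven from another Python script. A minimal, hypothetical invocation (module name, paths and config file name are placeholders) might look like:

import resiliency_tester

# Run the whole test suite 3 times on the sample data and average the stats
rc = resiliency_tester.main('-i "samples/" -o "testrun/" -c "config.txt" -m 3 -f')
print("resiliency test %s" % ("passed" if rc == 0 else "failed"))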
Example No. 8
0
def main(argv=None):
    if argv is None: # if argv is empty, fetch from the commandline
        argv = sys.argv[1:]
    elif isinstance(argv, basestring): # else if argv is supplied but it's a simple string, we need to parse it to a list of arguments before handing to argparse or any other argument parser
        argv = shlex.split(argv) # Parse string just like argv using shlex

    #==== COMMANDLINE PARSER ====

    #== Commandline description
    desc = '''Random file/directory characters tamperer in Python
Description: Randomly tampers characters in a file or recursively in a directory tree (useful to test integrity checking and repair afterwards).
WARNING: this will tamper the file you specify. Please ensure you keep a copy of the original!
    '''
    ep = '''NOTE: this script tampers at the character (byte) level, not at the bit level! Thus the measures you will get here may differ from those you will find in papers (you must divide your probability by 8).'''
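    # Added worked example (not in the original): with --probability 0.008, each character has
    # a 0.8% chance of being tampered; since a byte holds 8 bits, this corresponds roughly to
    # the 0.001 bit-error rates quoted in the literature (0.008 / 8).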

    #-- Constructing the parser
    # Use GooeyParser if we want the GUI because it will provide better widgets
    if len(argv) > 0 and (argv[0] == '--gui' and not '--ignore-gooey' in argv):  # pragma: no cover
        # Initialize the Gooey parser
        main_parser = gooey.GooeyParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter)
        # Define Gooey widget types explicitly (because type auto-detection doesn't work quite well)
        widget_dir = {"widget": "DirChooser"}
        widget_filesave = {"widget": "FileSaver"}
        widget_file = {"widget": "FileChooser"}
        widget_text = {"widget": "TextField"}
    else: # Else in command-line usage, use the standard argparse
        # Delete the special argument to avoid unrecognized argument error in argparse
        if '--ignore-gooey' in argv: argv.remove('--ignore-gooey') # this argument is automatically fed by Gooey when the user clicks on Start (checking the whole list avoids an IndexError when argv is empty)
        # Initialize the normal argparse parser
        main_parser = argparse.ArgumentParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter)
        # Define dummy dicts to keep compatibility with command-line usage
        widget_dir = {}
        widget_filesave = {}
        widget_file = {}
        widget_text = {}
    # Required arguments
    main_parser.add_argument('-i', '--input', metavar='filetotamper.ext', type=is_dir_or_file, nargs=1, required=True,
                        help='Path to the file (or directory tree) to tamper.', **widget_dir)
    main_parser.add_argument('-m', '--mode', metavar='e, erasure, n, noise', type=str, nargs=1, required=True,
                        help='Tampering mode: erasure or noise?', **widget_text)
    main_parser.add_argument('-p', '--probability', type=float, nargs=1, required=True,
                        help='Probability of corruption (float between 0.0 and 1.0)', **widget_text)
    # Optional arguments
    main_parser.add_argument('--block_probability', type=float, nargs=1, required=False,
                        help='Probability of block tampering (between 0.0 and 1.0; do not set it if you want errors spread evenly, but research has shown that errors tend to occur at the block level rather than being evenly distributed)', **widget_text)
    main_parser.add_argument('-b', '--burst_length', metavar="startint|endint", type=str, required=False,
                        help='If specified, this will define the number of consecutive characters that will be corrupted when the corruption probability (--probability) is triggered. Specify a range startint|endint, the burst length will be uniformly sampled over this range.')
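    # Added illustration for --burst_length: e.g. -b "5|15" (quoted so the shell does not treat
    # | as a pipe) corrupts a burst of 5 to 15 consecutive characters each time --probability triggers.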
    main_parser.add_argument('--header', type=int, required=False,
                        help='Only tamper the first given number of characters (the header) of the file.')

    main_parser.add_argument('-l', '--log', metavar='/some/folder/filename.log', type=str, nargs=1, required=False,
                        help='Path to the log file. (Output will be piped to both stdout and the log file)', **widget_filesave)
    main_parser.add_argument('-v', '--verbose', action='store_true', required=False, default=False,
                        help='Verbose mode (show more output).')
    main_parser.add_argument('--silent', action='store_true', required=False, default=False,
                        help='No console output (but if --log specified, the log will still be saved in the specified file).')

    #== Parsing the arguments
    args = main_parser.parse_args(argv) # Storing all arguments to args

    #-- Set variables from arguments
    filepath = fullpath(args.input[0])
    mode = args.mode[0]
    proba = float(args.probability[0])
    verbose = args.verbose
    silent = args.silent

    burst_length = args.burst_length
    if burst_length: burst_length = [int(r) for r in burst_length.split('|')] # split range and convert to int

    block_proba = None
    if args.block_probability:
        block_proba = float(args.block_probability[0])

    blocksize = 65536
    header = args.header

    # -- Configure the log file if enabled (ptee.write() will write to both stdout/console and to the log file)
    if args.log:
        ptee = Tee(args.log[0], 'a', nostdout=silent)
        #sys.stdout = Tee(args.log[0], 'a')
        sys.stderr = Tee(args.log[0], 'a', nostdout=silent)
    else:
        ptee = Tee(nostdout=silent)

    # == PROCESSING BRANCHING == #
    # Sanity check
    if not os.path.exists(filepath):
        raise RuntimeError("Path does not exist: %s" % filepath)
    else:
        # -- Tampering a file
        if os.path.isfile(filepath):
            ptee.write('Tampering the file %s, please wait...' % os.path.basename(filepath))
            tcount, tsize = tamper_file(filepath, mode=mode, proba=proba, block_proba=block_proba, blocksize=blocksize, burst_length=burst_length, header=header, silent=silent)
            ptee.write("Tampering done: %i/%i (%.2f%%) characters tampered." % (tcount, tsize, tcount / max(1, tsize) * 100))
        # -- Tampering a directory tree recursively
        elif os.path.isdir(filepath):
            ptee.write('Tampering all files in directory %s, please wait...' % filepath)
            files_tampered, filescount, tcount, tsize = tamper_dir(filepath, mode=mode, proba=proba, block_proba=block_proba, blocksize=blocksize, burst_length=burst_length, header=header, silent=silent)
            ptee.write("Tampering done: %i/%i files tampered and overall %i/%i (%.2f%%) characters were tampered." % (files_tampered, filescount, tcount, tsize, tcount / max(1, tsize) * 100))

    del ptee
    return 0
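As with the other entry points, this tamperer's main() also accepts its arguments as a single string, which makes it easy to call from a test harness. A hypothetical invocation (module name and file path are placeholders; work on a throwaway copy, since the file is modified in place) could be:

import filetamper

# Tamper roughly 1% of the characters of a disposable copy with random noise
filetamper.main('-i "copy_of_testfile.txt" -m noise -p 0.01')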
Example No. 9
0
def main(argv=None):
    if argv is None:  # if argv is empty, fetch from the commandline
        argv = sys.argv[1:]
    elif isinstance(
            argv, basestring
    ):  # else if argv is supplied but it's a simple string, we need to parse it to a list of arguments before handing to argparse or any other argument parser
        argv = shlex.split(argv)  # Parse string just like argv using shlex

    #==== COMMANDLINE PARSER ====

    #== Commandline description
    desc = '''Recursive/Relative Files Integrity Generator and Checker
Description: Recursively generate or check the integrity of files by MD5 and SHA1 hashes, size, modification date or by data structure integrity (only for images).

This script is originally meant to be used for data archival, by allowing an easy way to check for silent file corruption. Thus, this script uses relative paths so that you can easily compute and check the same redundant data copied on different mediums (hard drives, optical discs, etc.). This script is not meant to notify you of system file corruption in real time, but rather to be used from time to time to check up on the integrity of your data archives.
    '''
    ep = '''Example usage:
- To generate the database (only needed once):
python rfigc.py -i "folderimages" -d "dbhash.csv" -g
- To check:
python rfigc.py -i "folderimages" -d "dbhash.csv" -l log.txt -s
- To update your database by appending new files:
python rfigc.py -i "folderimages" -d "dbhash.csv" -u -a 
- To update your database by appending new files AND removing inexistent files:
python rfigc.py -i "folderimages" -d "dbhash.csv" -u -a -r
- To use with a gui:
python rfigc.py --gui

Note that the script runs in check mode by default, to avoid wrong manipulations. It will also alert you if you try to generate over an already existing database file.
Note2: you can use PyPy to speed up generation, but you should avoid PyPy in check mode (from our tests, it slows things down a lot).
'''

    #== Commandline arguments
    #-- Constructing the parser
    # Use GooeyParser if we want the GUI because it will provide better widgets
    if len(argv) > 0 and (argv[0] == '--gui' and
                          not '--ignore-gooey' in argv):  # pragma: no cover
        # Initialize the Gooey parser
        main_parser = gooey.GooeyParser(
            add_help=True,
            description=desc,
            epilog=ep,
            formatter_class=argparse.RawTextHelpFormatter)
        # Define Gooey widget types explicitly (because type auto-detection doesn't work quite well)
        widget_dir = {"widget": "DirChooser"}
        widget_filesave = {"widget": "FileSaver"}
        widget_file = {"widget": "FileChooser"}
        widget_text = {"widget": "TextField"}
    else:  # Else in command-line usage, use the standard argparse
        # Delete the special argument to avoid unrecognized argument error in argparse
        if '--ignore-gooey' in argv:  # checking the whole list avoids an IndexError when argv is empty
            argv.remove(
                '--ignore-gooey'
            )  # this argument is automatically fed by Gooey when the user clicks on Start
        # Initialize the normal argparse parser
        main_parser = argparse.ArgumentParser(
            add_help=True,
            description=desc,
            epilog=ep,
            formatter_class=argparse.RawTextHelpFormatter)
        # Define dummy dicts to keep compatibility with command-line usage
        widget_dir = {}
        widget_filesave = {}
        widget_file = {}
        widget_text = {}
    # Required arguments
    main_parser.add_argument(
        '-i',
        '--input',
        metavar='/path/to/root/folder',
        type=is_dir_or_file,
        nargs=1,
        required=True,
        help=
        'Path to the root folder (or a single file) from where the scanning will occur.',
        **widget_dir)
    main_parser.add_argument(
        '-d',
        '--database',
        metavar='/some/folder/databasefile.csv',
        type=str,
        nargs=1,
        required=True,  #type=argparse.FileType('rt')
        help='Path to the csv file containing the hash information.',
        **widget_filesave)

    # Optional general arguments
    main_parser.add_argument(
        '-l',
        '--log',
        metavar='/some/folder/filename.log',
        type=str,
        nargs=1,
        required=False,
        help=
        'Path to the log file. (Output will be piped to both stdout and the log file)',
        **widget_filesave)
    main_parser.add_argument(
        '--skip_hash',
        action='store_true',
        required=False,
        default=False,
        help=
        'Skip hash computation/checking (checks only the other metadata, this is a lot quicker).'
    )
    main_parser.add_argument('-v',
                             '--verbose',
                             action='store_true',
                             required=False,
                             default=False,
                             help='Verbose mode (show more output).')
    main_parser.add_argument(
        '--silent',
        action='store_true',
        required=False,
        default=False,
        help=
        'No console output (but if --log specified, the log will still be saved in the specified file).'
    )

    # Checking mode arguments
    main_parser.add_argument('-s',
                             '--structure_check',
                             action='store_true',
                             required=False,
                             default=False,
                             help='Check image structures for corruption?')
    main_parser.add_argument(
        '-e',
        '--errors_file',
        metavar='/some/folder/errorsfile.csv',
        type=str,
        nargs=1,
        required=False,  #type=argparse.FileType('rt')
        help=
        'Path to the errors file, where errors found at check time will be stored in CSV format for further processing by other software (such as file repair tools).',
        **widget_filesave)
    main_parser.add_argument('-m',
                             '--disable_modification_date_checking',
                             action='store_true',
                             required=False,
                             default=False,
                             help='Disable modification date checking.')
    main_parser.add_argument(
        '--skip_missing',
        action='store_true',
        required=False,
        default=False,
        help=
        'Skip missing files when checking (useful if you split your files into several mediums, for example on optical discs with limited capacity).'
    )

    # Generate mode arguments
    main_parser.add_argument(
        '-g',
        '--generate',
        action='store_true',
        required=False,
        default=False,
        help=
        'Generate the database? (omit this parameter to check instead of generating).'
    )
    main_parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        required=False,
        default=False,
        help=
        'Force overwriting the database file even if it already exists (if --generate).'
    )

    # Update mode arguments
    main_parser.add_argument(
        '-u',
        '--update',
        action='store_true',
        required=False,
        default=False,
        help='Update database (you must also specify --append or --remove).')
    main_parser.add_argument('-a',
                             '--append',
                             action='store_true',
                             required=False,
                             default=False,
                             help='Append new files (if --update).')
    main_parser.add_argument('-r',
                             '--remove',
                             action='store_true',
                             required=False,
                             default=False,
                             help='Remove missing files (if --update).')

    # Recover from file scraping
    main_parser.add_argument(
        '--filescraping_recovery',
        action='store_true',
        required=False,
        default=False,
        help=
        'Given a folder of unorganized files, compare to the database and restore the filename and directory structure into the output folder.'
    )
    main_parser.add_argument(
        '-o',
        '--output',
        metavar='/path/to/root/folder',
        type=is_dir,
        nargs=1,
        required=False,
        help=
        'Path to the output folder where the reorganized files will be copied when using --filescraping_recovery.',
        **widget_dir)

    #== Parsing the arguments
    args = main_parser.parse_args(argv)  # Storing all arguments to args

    #-- Set variables from arguments
    inputpath = fullpath(
        args.input[0]
    )  # path to the files to protect (either a folder or a single file)
    rootfolderpath = inputpath  # path to the root folder (to compute relative paths)
    #database = os.path.basename(fullpath(args.database[0])) # Take only the filename.
    database = fullpath(args.database[0])
    generate = args.generate
    structure_check = args.structure_check
    force = args.force
    disable_modification_date_checking = args.disable_modification_date_checking
    skip_missing = args.skip_missing
    skip_hash = args.skip_hash
    update = args.update
    append = args.append
    remove = args.remove
    outputpath = None
    if args.output: outputpath = fullpath(args.output[0])
    filescraping = args.filescraping_recovery
    verbose = args.verbose
    silent = args.silent

    if os.path.isfile(
            inputpath
    ):  # if inputpath is a single file (instead of a folder), then define the rootfolderpath as the parent directory (for correct relative path generation, else it will also truncate the filename!)
        rootfolderpath = os.path.dirname(inputpath)

    errors_file = None
    if args.errors_file: errors_file = fullpath(args.errors_file[0])

    # -- Checking arguments
    if structure_check and not structure_check_import:
        raise ImportError(
            'PIL (Python Imaging Library) could not be imported. PIL is needed to do structure check, please install PIL (or you can disable structure check to continue).'
        )

    if update and (not append and not remove):
        raise ValueError(
            '--update specified but not --append nor --remove. You must specify at least one of these modes when using --update!'
        )

    if filescraping and not outputpath:
        raise ValueError(
            'Output path (--output) is needed when using --filescraping_recovery.')

    # -- Configure the log file if enabled (ptee.write() will write to both stdout/console and to the log file)
    if args.log:
        ptee = Tee(args.log[0], 'a', nostdout=silent)
        #sys.stdout = Tee(args.log[0], 'a')
        sys.stderr = Tee(args.log[0], 'a', nostdout=silent)
    else:
        ptee = Tee(nostdout=silent)

    # == PROCESSING BRANCHING == #
    retval = 0  # Returned value: 0 OK, 1 KO (files in error), -1 Error

    # -- Update the database file by removing missing files
    if update and remove:
        if not os.path.isfile(database):
            raise NameError(
                'Specified database file does not exist, can\'t update!')

        ptee.write("====================================")
        ptee.write(
            "RIFGC Database Update Removal of missing files, started on %s" %
            datetime.datetime.now().isoformat())
        ptee.write("====================================")

        # Precompute the total number of lines to process (this should be fairly quick)
        filestodocount = 0
        with open(database, 'rb') as dbf:
            for row in csv.DictReader(dbf,
                                      lineterminator='\n',
                                      delimiter='|',
                                      quotechar='"'):
                filestodocount = filestodocount + 1

            # Preparing CSV writer for the temporary file that will have the lines removed
            with open(database + '.rem', 'wb') as dbfilerem:
                csv_writer = csv.writer(dbfilerem,
                                        lineterminator='\n',
                                        delimiter='|',
                                        quotechar='"')

                # Printing CSV headers
                csv_headers = [
                    'path', 'md5', 'sha1', 'last_modification_timestamp',
                    'last_modification_date', 'size', 'ext'
                ]
                csv_writer.writerow(csv_headers)
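                # Added illustration: a data row of this pipe-delimited CSV would then look roughly
                # like the following (all values are hypothetical, hashes truncated):
                #   photos/img001.jpg|9e107d9d...|2fd4e1c6...|1400000000.0|2014-05-13 16:53:20|523412|.jpg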

                dbf.seek(0)
                dbfile = csv.DictReader(
                    dbf, lineterminator='\n', delimiter='|', quotechar='"'
                )  # we need to reopen the file to put the reading cursor (the generator position) back to the beginning
                delcount = 0
                filescount = 0
                for row in tqdm.tqdm(dbfile,
                                     file=ptee,
                                     total=filestodocount,
                                     leave=True):
                    filescount = filescount + 1
                    filepath = os.path.join(
                        rootfolderpath,
                        row['path'])  # Build the absolute file path

                    # Single-file mode: skip if this is not the file we are looking for
                    if inputpath != rootfolderpath and inputpath != filepath:
                        continue

                    if verbose:
                        ptee.write("\n- Processing file %s" % row['path'])
                    errors = []
                    if not os.path.isfile(filepath):
                        delcount = delcount + 1
                        ptee.write(
                            "\n- File %s is missing, removed from database." %
                            row['path'])
                    else:
                        csv_writer.writerow([
                            path2unix(row['path']), row['md5'], row['sha1'],
                            row['last_modification_timestamp'],
                            row['last_modification_date'], row['size'],
                            row['ext']
                        ])

        # REMOVE UPDATE DONE, we remove the old database file and replace it with the new
        os.remove(database)  # delete old database
        os.rename(database + '.rem',
                  database)  # rename new database to match old name
        # Show some stats
        ptee.write("----------------------------------------------------")
        ptee.write(
            "All files processed: Total: %i - Removed/Missing: %i.\n\n" %
            (filescount, delcount))

    # -- Generate the database file or update/append (both will walk through the filesystem to get new files, contrary to the other branches which walk through the database csv)
    if generate or (update and append):
        if not force and os.path.isfile(database) and not update:
            raise NameError(
                'Database file already exists. Use --force to overwrite it, or choose another filename for your database file.'
            )

        if generate:
            dbmode = 'wb'
        elif (update and append):
            dbmode = 'ab'
        with open(
                database, dbmode
        ) as dbfile:  # Must open in write + binary, because on Windows it will do weird things otherwise (at least with Python 2.7)
            ptee.write("====================================")
            if generate:
                ptee.write("RIFGC Database Generation started on %s" %
                           datetime.datetime.now().isoformat())
            elif update and append:
                ptee.write(
                    "RIFGC Database Update Append new files, started on %s" %
                    datetime.datetime.now().isoformat())
            ptee.write("====================================")

            # Preparing CSV writer
            csv_writer = csv.writer(dbfile,
                                    lineterminator='\n',
                                    delimiter='|',
                                    quotechar='"')

            if generate:
                # Printing CSV headers
                csv_headers = [
                    'path', 'md5', 'sha1', 'last_modification_timestamp',
                    'last_modification_date', 'size', 'ext'
                ]
                csv_writer.writerow(csv_headers)

            if (update and append):
                # Extract all paths already stored in the database to avoid re-adding them
                db_paths = {}
                with open(database, 'rb') as dbf:
                    for row in csv.DictReader(dbf,
                                              lineterminator='\n',
                                              delimiter='|',
                                              quotechar='"'):
                        db_paths[row['path']] = True

            # Counting the total number of files that we will have to process
            ptee.write(
                "Counting total number of files to process, please wait...")
            filestodocount = 0
            for _ in tqdm.tqdm(recwalk(inputpath), file=ptee):
                filestodocount = filestodocount + 1
            ptee.write("Counting done.")

            # Recursively traversing the root directory and save the metadata in the db for each file
            ptee.write(
                "Processing files to compute metadata to store in database, please wait..."
            )
            filescount = 0
            addcount = 0
            for (dirpath, filename) in tqdm.tqdm(recwalk(inputpath),
                                                 file=ptee,
                                                 total=filestodocount,
                                                 leave=True):
                filescount = filescount + 1
                # Get full absolute filepath
                filepath = os.path.join(dirpath, filename)
                # Get database relative path (from scanning root folder)
                relfilepath = path2unix(
                    os.path.relpath(filepath, rootfolderpath)
                )  # File relative path from the root (so that we can easily check the files later even if the absolute path is different)
                if verbose: ptee.write("\n- Processing file %s" % relfilepath)

                # If update + append mode, then if the file is already in the database we skip it (we continue computing metadata only for new files)
                if update and append and relfilepath in db_paths:
                    if verbose: ptee.write("... skipped")
                    continue
                else:
                    addcount = addcount + 1

                # Compute the hashes (leave it outside the with statement because generate_hashes() opens the file by itself, so that both hashes can be computed in a single sweep of the file at the same time)
                if not skip_hash:
                    md5hash, sha1hash = generate_hashes(filepath)
                else:
                    md5hash = sha1hash = 0
                # Compute other metadata
                with open(filepath) as thisfile:
                    # Check file structure if option is enabled
                    if structure_check:
                        struct_result = check_structure(filepath)
                        # Print/Log an error only if there's one (else we won't say anything)
                        if struct_result:
                            ptee.write("\n- Structure error with file " +
                                       filepath + ": " + struct_result)
                    ext = os.path.splitext(filepath)[1]  # File's extension
                    statinfos = os.stat(
                        filepath)  # Various OS filesystem infos about the file
                    size = statinfos.st_size  # File size
                    lastmodif = statinfos.st_mtime  # File last modified date (as a timestamp)
                    lastmodif_readable = datetime.datetime.fromtimestamp(
                        lastmodif
                    ).strftime(
                        "%Y-%m-%d %H:%M:%S"
                    )  # File last modified date as a human readable date (ISO universal time)

                    csv_row = [
                        path2unix(relfilepath), md5hash, sha1hash, lastmodif,
                        lastmodif_readable, size, ext
                    ]  # Prepare the CSV row
                    csv_writer.writerow(csv_row)  # Save to the file
        ptee.write("----------------------------------------------------")
        ptee.write("All files processed: Total: %i - Added: %i.\n\n" %
                   (filescount, addcount))

    # -- Filescraping recovery mode
    # We will compare all files from the input path and reorganize the ones that are recognized into the output path
    elif filescraping:
        import shutil
        ptee.write("====================================")
        ptee.write("RIFGC File Scraping Recovery started on %s" %
                   datetime.datetime.now().isoformat())
        ptee.write("====================================")

        ptee.write("Loading the database into memory, please wait...")
        md5list = {}
        sha1list = {}
        dbrows = {
        }  # TODO: instead of memorizing everything in memory, store just the reading cursor position at the beginning of the line with the size and then just read when necessary from the db file directly
        id = 0
        with open(database, 'rb') as db:
            for row in csv.DictReader(db,
                                      lineterminator='\n',
                                      delimiter='|',
                                      quotechar='"'):
                id += 1
                if (len(row['md5']) > 0 and len(row['sha1']) > 0):
                    md5list[row['md5']] = id
                    sha1list[row['sha1']] = id
                    dbrows[id] = row
        ptee.write("Loading done.")

        if len(dbrows) == 0:
            ptee.write(
                "Nothing to do, there's no md5 nor sha1 hashes in the database file!"
            )
            del ptee
            return 1  # return with an error

        # Counting the total number of files that we will have to process
        ptee.write("Counting total number of files to process, please wait...")
        filestodocount = 0
        for _ in tqdm.tqdm(recwalk(inputpath), file=ptee):
            filestodocount = filestodocount + 1
        ptee.write("Counting done.")

        # Recursively traversing the root directory and save the metadata in the db for each file
        ptee.write(
            "Processing file scraping recovery, walking through all files from input folder..."
        )
        filescount = 0
        copiedcount = 0
        for (dirpath, filename) in tqdm.tqdm(recwalk(inputpath),
                                             file=ptee,
                                             total=filestodocount,
                                             leave=True):
            filescount = filescount + 1
            # Get full absolute filepath
            filepath = os.path.join(dirpath, filename)
            # Get database relative path (from scanning root folder)
            relfilepath = path2unix(
                os.path.relpath(filepath, rootfolderpath)
            )  # File relative path from the root (we truncate the rootfolderpath so that we can easily check the files later even if the absolute path is different)
            if verbose: ptee.write("\n- Processing file %s" % relfilepath)

            # Generate the hashes from the currently inspected file
            md5hash, sha1hash = generate_hashes(filepath)
            # If it matches a file in the database, we will copy it over with the correct name, directory structure, file extension and last modification date
            if md5hash in md5list and sha1hash in sha1list and md5list[
                    md5hash] == sha1list[sha1hash]:
                # Load the db infos for this file
                row = dbrows[md5list[md5hash]]
                ptee.write("- Found: %s --> %s.\n" % (filepath, row['path']))
                # Generate full absolute filepath of the output file
                outfilepath = os.path.join(outputpath, row['path'])
                # Recursively create the directory tree structure
                outfiledir = os.path.dirname(outfilepath)
                if not os.path.isdir(outfiledir):
                    os.makedirs(
                        outfiledir
                    )  # if the target directory does not exist, create it (and create recursively all parent directories too)
                # Copy over and set attributes
                shutil.copy2(filepath, outfilepath)
                filestats = os.stat(filepath)
                os.utime(outfilepath,
                         (filestats.st_atime,
                          float(row['last_modification_timestamp'])))
                # Counter...
                copiedcount += 1
        ptee.write("----------------------------------------------------")
        ptee.write("All files processed: Total: %i - Recovered: %i.\n\n" %
                   (filescount, copiedcount))

    # -- Check mode: check the files using a database file
    elif not update and not generate and not filescraping:
        ptee.write("====================================")
        ptee.write("RIFGC Check started on %s" %
                   datetime.datetime.now().isoformat())
        ptee.write("====================================")

        # Open the errors file if supplied (where we will store every error in a formatted csv so that it can later be easily processed by other software, such as repair tools)
        if errors_file is not None:
            efile = open(errors_file, 'wb')
            e_writer = csv.writer(efile,
                                  delimiter='|',
                                  lineterminator='\n',
                                  quotechar='"')

        # Precompute the total number of lines to process (this should be fairly quick)
        filestodocount = 0
        with open(database, 'rb') as dbf:
            for row in csv.DictReader(dbf,
                                      lineterminator='\n',
                                      delimiter='|',
                                      quotechar='"'):
                filestodocount = filestodocount + 1

            # Processing the files using the database list
            ptee.write(
                "Checking for files corruption based on database %s on input path %s, please wait..."
                % (database, inputpath))
            dbf.seek(0)
            dbfile = csv.DictReader(
                dbf, lineterminator='\n', delimiter='|', quotechar='"'
            )  # we need to reopen the file to put the reading cursor (the generator position) back to the beginning
            errorscount = 0
            filescount = 0
            for row in tqdm.tqdm(dbfile,
                                 file=ptee,
                                 total=filestodocount,
                                 leave=True):
                filescount = filescount + 1
                filepath = os.path.join(rootfolderpath, row['path'])

                # Single-file mode: skip if this is not the file we are looking for
                if inputpath != rootfolderpath and inputpath != filepath:
                    continue

                if verbose: ptee.write("\n- Processing file %s" % row['path'])
                errors = []
                if not os.path.isfile(filepath):
                    if not skip_missing: errors.append('file is missing')
                # First generate the current file's metadata given the filepath from the CSV, then check for differences against the database
                else:
                    try:  # Try to be resilient to various file access errors
                        # Generate hash
                        if not skip_hash:
                            md5hash, sha1hash = generate_hashes(filepath)
                        else:
                            md5hash = sha1hash = 0
                        # Check structure integrity if enabled
                        if structure_check:
                            struct_result = check_structure(filepath)
                            if struct_result:
                                errors.append("structure error (%s)" %
                                              struct_result)
                        # Compute other metadata
                        with open(filepath) as thisfile:
                            ext = os.path.splitext(filepath)[1]
                            statinfos = os.stat(filepath)
                            size = statinfos.st_size
                            lastmodif = statinfos.st_mtime
                            lastmodif_readable = datetime.datetime.fromtimestamp(
                                lastmodif).strftime("%Y-%m-%d %H:%M:%S")

                            # CHECK THE DIFFERENCES
                            if not skip_hash and md5hash != row[
                                    'md5'] and sha1hash != row['sha1']:
                                errors.append('both md5 and sha1 hash failed')
                            elif not skip_hash and (
                                (md5hash == row['md5']
                                 and sha1hash != row['sha1']) or
                                (md5hash != row['md5']
                                 and sha1hash == row['sha1'])):
                                errors.append(
                                    'one of the hashes failed but not the other (which may indicate that the database file is corrupted)'
                                )
                            if ext != row['ext']:
                                errors.append('extension has changed')
                            if size != int(row['size']):
                                errors.append(
                                    "size has changed (before: %s - now: %s)" %
                                    (row['size'], size))
                            if not disable_modification_date_checking and (
                                    lastmodif != float(
                                        row['last_modification_timestamp'])
                                    and round(lastmodif, 0) != round(
                                        float(row[
                                            'last_modification_timestamp']), 0)
                            ):  # for usage with PyPy: last modification time is differently managed (rounded), thus we need to round here manually to compare against PyPy.
                                errors.append(
                                    "modification date has changed (before: %s - now: %s)"
                                    % (row['last_modification_date'],
                                       lastmodif_readable))
                    except IOError as e:  # Catch IOError as a file error
                        errors.append(
                            'file can\'t be read, IOError (inaccessible, maybe bad sector?)'
                        )
                    except Exception as e:  # Any other exception when accessing the file will also be caught as a file error
                        errors.append('file can\'t be accessed: %s' % e)
                # Print/Log all errors for this file if any happened
                if errors:
                    errorscount = errorscount + 1
                    ptee.write("\n- Error for file %s: %s." %
                               (row['path'], ', '.join(errors)))
                    if errors_file is not None:  # Write error in a csv file if supplied (for easy processing later by other softwares such as file repair softwares)
                        e_writer.writerow([row['path'], ', '.join(errors)])
        # END OF CHECKING: show some stats
        ptee.write("----------------------------------------------------")
        ptee.write(
            "All files checked: Total: %i - Files with errors: %i.\n\n" %
            (filescount, errorscount))
        retval = (errorscount > 0)

    del ptee
    return retval  # return error code if any
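The generate_hashes() helper used throughout this last example is not shown; according to the comment above, it opens the file itself so that both digests are computed in a single read pass. A minimal sketch under that assumption (buffer size and return format are guesses, consistent with how md5hash/sha1hash are compared against the CSV columns) could be:

import hashlib

def generate_hashes(filepath, blocksize=65536):
    """Compute the MD5 and SHA1 hex digests of a file in a single read pass."""
    md5 = hashlib.md5()
    sha1 = hashlib.sha1()
    with open(filepath, 'rb') as f:
        buf = f.read(blocksize)
        while buf:
            md5.update(buf)
            sha1.update(buf)
            buf = f.read(blocksize)
    return md5.hexdigest(), sha1.hexdigest()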