def main(argv=None):
    if argv is None:  # if argv is empty, fetch from the commandline
        argv = sys.argv[1:]
    elif isinstance(argv, basestring):  # else if argv is supplied but it's a simple string, we need to parse it to a list of arguments before handing to argparse or any other argument parser
        argv = shlex.split(argv)  # Parse string just like argv using shlex

    #==== COMMANDLINE PARSER ====

    #== Commandline description
    desc = '''Replication Repair
Description: Given a set of directories (or files), try to repair your files by scanning each byte, casting a majority vote among all copies, and then outputting the winning byte. This process is usually called triple-modular redundancy (but here it should be called n-modular redundancy, since you can use as many copies as you have).
It is recommended for long term storage to store several copies of your files on different storage mediums. Everything's fine until all your copies are partially corrupted. In this case, this script can help you by taking advantage of your multiple copies, without requiring a pregenerated ecc file. Just specify the path to every copy, and the script will try to recover them.
Replication can repair exactly r-2 errors using majority vote (you need at least 2 blocks for majority vote to work), where r is the number of replications: if r=3, you get a redundancy rate of 1/3, if r=4, the rate is 2/4, etc.
This script can also take advantage of a database generated by rfigc.py to make sure that the recovered files are correct, or to select files that are already correct.

Note: in case the end result is not what you expected, you can try a different order of input directories: in case of ambiguity, the first input folder has precedence over subsequent folders.
Note2: in case some files with the same names are of different length, the merging will continue until the longest file is exhausted.
Note3: last modification date is not (yet) accounted for.
    '''
    ep = '''Use --gui as the first argument to use with a GUI (via Gooey).
    '''

    #== Commandline arguments
    #-- Constructing the parser
    # Use GooeyParser if we want the GUI because it will provide better widgets
    if len(argv) > 0 and (argv[0] == '--gui' and not '--ignore-gooey' in argv):  # pragma: no cover
        # Initialize the Gooey parser
        main_parser = gooey.GooeyParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter)
        # Define Gooey widget types explicitly (because type auto-detection doesn't work quite well)
        widget_dir = {"widget": "DirChooser"}
        widget_filesave = {"widget": "FileSaver"}
        widget_file = {"widget": "FileChooser"}
        widget_text = {"widget": "TextField"}
        widget_multidir = {"widget": "MultiDirChooser"}
    else:  # Else in command-line usage, use the standard argparse
        # Delete the special argument to avoid unrecognized argument error in argparse
        if '--ignore-gooey' in argv[0]:
            argv.remove('--ignore-gooey')  # this argument is automatically fed by Gooey when the user clicks on Start
        # Initialize the normal argparse parser
        main_parser = argparse.ArgumentParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter)
        # Define dummy dicts to keep compatibility with command-line usage
        widget_dir = {}
        widget_filesave = {}
        widget_file = {}
        widget_text = {}
        widget_multidir = {}

    # Required arguments
    main_parser.add_argument('-i', '--input', metavar='"/path/to/copy1/" "/path/to/copy2/" "etc."', type=is_dir_or_file, nargs='+', required=True,
                        help='Specify the paths to every copy you have (minimum 3 copies, else it won\'t work!). Can be folders or files (if you want to repair only one file). Order matters: in case of ambiguity, the first folder where the file exists will be chosen.', **widget_multidir)
    main_parser.add_argument('-o', '--output', metavar='/output/folder/', nargs=1, required=True,
                        help='Where the recovered files will be stored.', **widget_dir)

    # Optional general arguments
    main_parser.add_argument('-d', '--database', metavar='database.csv', type=is_file, required=False,
                        help='Path to a previously generated rfigc.py database. If provided, this will be used to check that the repaired files are correct (and also to find already correct files in copies).', **widget_file)
    main_parser.add_argument('-r', '--report', metavar='/some/folder/report.csv', type=str, required=False,
                        help='Save all results of the repair process in a report file, with detailed descriptions of ambiguous repairs (ie, when majority vote came to a draw).', **widget_filesave)
    main_parser.add_argument('-l', '--log', metavar='/some/folder/filename.log', type=str, nargs=1, required=False,
                        help='Path to the log file. (Output will be piped to both the stdout and the log file)', **widget_filesave)
    main_parser.add_argument('-f', '--force', action='store_true', required=False, default=False,
                        help='Force overwriting the output folder even if it already exists.')
    main_parser.add_argument('-v', '--verbose', action='store_true', required=False, default=False,
                        help='Verbose mode (show more output).')
    main_parser.add_argument('--silent', action='store_true', required=False, default=False,
                        help='No console output (but if --log is specified, the log will still be saved in the specified file).')

    #== Parsing the arguments
    args = main_parser.parse_args(argv)  # Storing all arguments to args

    #-- Set variables from arguments
    inputpaths = [fullpath(x) for x in args.input]  # paths to the files to repair (ie, paths to all the different copies the user has)
    outputpath = fullpath(args.output[0])
    force = args.force
    verbose = args.verbose
    silent = args.silent

    if len(inputpaths) < 3:
        raise Exception('Need at least 3 copies to do a replication repair/majority vote!')

    #if os.path.isfile(inputpath): # if inputpath is a single file (instead of a folder), then define the rootfolderpath as the parent directory (for correct relative path generation, else it will also truncate the filename!)
        #rootfolderpath = os.path.dirname(inputpath)

    report_file = None
    if args.report:
        report_file = os.path.basename(fullpath(args.report))
    database = None
    if args.database:
        database = args.database

    # -- Checking arguments
    if os.path.exists(outputpath) and not force:
        raise NameError('Specified output path %s already exists! Use --force if you want to overwrite.' % outputpath)

    if database and not os.path.isfile(database):
        raise NameError('Specified rfigc database file %s does not exist!' % database)

    # -- Configure the log file if enabled (ptee.write() will write to both stdout/console and to the log file)
    if args.log:
        ptee = Tee(args.log[0], 'a', nostdout=silent)
        #sys.stdout = Tee(args.log[0], 'a')
        sys.stderr = Tee(args.log[0], 'a', nostdout=silent)
    else:
        ptee = Tee(nostdout=silent)

    # == PROCESSING BRANCHING == #

    # == Precomputation of ecc file size
    # Precomputing is important so that the user can know what size to expect before starting (and how much time it will take...).
    filescount = 0
    sizetotal = 0
    sizeheaders = 0
    visitedfiles = {}
    ptee.write("Precomputing list of files and predicted statistics...")
    prebar = tqdm.tqdm(file=ptee, disable=silent)
    for inputpath in inputpaths:
        for (dirpath, filename) in recwalk(inputpath):
            # Get full absolute filepath
            filepath = os.path.join(dirpath, filename)
            # File relative path from the root (we truncate the rootfolderpath so that we can easily check the files later even if the absolute path is different)
            relfilepath = path2unix(os.path.relpath(filepath, inputpath))

            # Only increase the files count if we didn't see this file before
            if not visitedfiles.get(relfilepath, None):
                # Counting the total number of files we will process (so that we can show a progress bar with ETA)
                filescount = filescount + 1
                # Add the file to the list of already visited files
                visitedfiles[relfilepath] = True
                # Get the current file's size
                size = os.stat(filepath).st_size
                # Compute total size of all files
                sizetotal = sizetotal + size
            prebar.update()
    prebar.close()
    ptee.write("Precomputing done.")

    # == Majority vote repair
    # For each folder, align the files lists and then majority vote over each byte to repair
    ptee.write("====================================")
    ptee.write("Replication repair, started on %s" % datetime.datetime.now().isoformat())
    ptee.write("====================================")

    # Prepare progress bar if necessary
    if silent:
        tqdm_bar = None
    else:
        tqdm_bar = tqdm.tqdm(total=filescount, file=ptee, leave=True, unit="files")
    # Call the main function to synchronize files using majority vote
    errcode = synchronize_files(inputpaths, outputpath, database=database, tqdm_bar=tqdm_bar, report_file=report_file, ptee=ptee, verbose=verbose)
    #ptee.write("All done! Stats:\n- Total files processed: %i\n- Total files corrupted: %i\n- Total files repaired completely: %i\n- Total files repaired partially: %i\n- Total files corrupted but not repaired at all: %i\n- Total files skipped: %i" % (files_count, files_corrupted, files_repaired_completely, files_repaired_partially, files_corrupted - (files_repaired_partially + files_repaired_completely), files_skipped) )
    if tqdm_bar:
        tqdm_bar.close()
    ptee.write("All done!")
    if report_file:
        ptee.write("Saved replication repair results in report file: %s" % report_file)
    del ptee
    return errcode
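
# ---------------------------------------------------------------------------
# Illustrative sketch only (an assumption, not part of the original module).
# The docstring of main() above describes a byte-wise majority vote across
# copies; the hypothetical helper below shows that idea on in-memory byte
# strings of equal length, including how draws are resolved in favor of the
# earliest copy (which is why the order of input folders matters). The real
# work is done by synchronize_files(), defined elsewhere.
# ---------------------------------------------------------------------------
def _majority_vote_bytes(copies):
    """Sketch: byte-wise majority vote over same-length copies.

    Returns (repaired_bytes, draw_positions), where draw_positions lists the
    offsets where the vote came to a draw (ambiguous repairs)."""
    from collections import Counter
    repaired = bytearray()
    draws = []
    for i, column in enumerate(zip(*copies)):  # column = the byte at offset i in every copy
        counts = Counter(column).most_common()
        winner = counts[0][0]
        if len(counts) > 1 and counts[0][1] == counts[1][1]:
            # Draw between candidate bytes: keep the candidate coming from the
            # earliest copy (the first input folder has precedence) and record
            # the position so it can be reported as an ambiguous repair.
            tied = set(b for b, c in counts if c == counts[0][1])
            winner = next(b for b in column if b in tied)
            draws.append(i)
        repaired.append(ord(winner) if isinstance(winner, str) else winner)
    return bytes(repaired), draws
# Example: with three copies "good", "gXod", "goYd", each corrupted byte is
# outvoted by the two intact copies, so the result is "good" with no draws.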

def main(argv=None):
    if argv is None:  # if argv is empty, fetch from the commandline
        argv = sys.argv[1:]
    elif isinstance(argv, basestring):  # else if argv is supplied but it's a simple string, we need to parse it to a list of arguments before handing to argparse or any other argument parser
        argv = shlex.split(argv)  # Parse string just like argv using shlex

    #==== COMMANDLINE PARSER ====

    #== Commandline description
    desc = '''ECC file repairer
Description: Repair the structure of an ecc file, mainly the ecc markers, so that at least the ecc correction can correctly align the ecc entries and fields.
Note: an ecc structure repair does NOT allow to recover from more errors on your files, it only repairs an ecc file so that its structure is valid and can be read correctly.
    '''
    ep = ''' '''

    #== Commandline arguments
    #-- Constructing the parser
    # Use GooeyParser if we want the GUI because it will provide better widgets
    if len(argv) > 0 and (argv[0] == '--gui' and not '--ignore-gooey' in argv):  # pragma: no cover
        # Initialize the Gooey parser
        main_parser = gooey.GooeyParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter)
        # Define Gooey widget types explicitly (because type auto-detection doesn't work quite well)
        widget_dir = {"widget": "DirChooser"}
        widget_filesave = {"widget": "FileSaver"}
        widget_file = {"widget": "FileChooser"}
        widget_text = {"widget": "TextField"}
    else:  # Else in command-line usage, use the standard argparse
        # Delete the special argument to avoid unrecognized argument error in argparse
        if '--ignore-gooey' in argv[0]:
            argv.remove('--ignore-gooey')  # this argument is automatically fed by Gooey when the user clicks on Start
        # Initialize the normal argparse parser
        main_parser = argparse.ArgumentParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter)
        # Define dummy dicts to keep compatibility with command-line usage
        widget_dir = {}
        widget_filesave = {}
        widget_file = {}
        widget_text = {}

    # Required arguments
    main_parser.add_argument('-i', '--input', metavar='eccfile.txt', type=str, required=True,
                        help='Path to the ecc file to repair.', **widget_file)
    main_parser.add_argument('-o', '--output', metavar='eccfile_repaired.txt', type=str, required=True,  #type=argparse.FileType('rt')
                        help='Output path where to save the repaired ecc file.', **widget_filesave)
    main_parser.add_argument('-t', '--threshold', type=float, default=0.3, required=False,
                        help='Distance threshold for the heuristic Hamming distance repair. This must be a float, eg, 0.2 means that if 20%% of the characters differ between an ecc marker and a substring in the ecc file, it will be detected as a marker and corrected.', **widget_text)

    # Optional general arguments
    main_parser.add_argument('--index', metavar='eccfile.txt.idx', type=str, required=False,
                        help='Path to the index backup file corresponding to the ecc file (optional but helps a lot).', **widget_file)
    main_parser.add_argument('--ecc_algo', type=int, default=1, required=False,
                        help='Which algorithm to use to generate and verify the ECC? Possible values: 1-4. 1 is the formal, fully verified Reed-Solomon in base 3; 2 is a faster implementation but still based on the formal base 3; 3 is an even faster implementation but based on another library which may not be correct; 4 is the fastest implementation, supporting the US FAA ADSB UAT RS FEC standard, but it is totally incompatible with the other three (a text encoded with any of modes 1-3 will be decodable with any of those three).', **widget_text)
    main_parser.add_argument('-l', '--log', metavar='/some/folder/filename.log', type=str, required=False,
                        help='Path to the log file. (Output will be piped to both the stdout and the log file)', **widget_filesave)
    main_parser.add_argument('-v', '--verbose', action='store_true', required=False, default=False,
                        help='Verbose mode (show more output).')
    main_parser.add_argument('--silent', action='store_true', required=False, default=False,
                        help='No console output (but if --log is specified, the log will still be saved in the specified file).')
    main_parser.add_argument('-f', '--force', action='store_true', required=False, default=False,
                        help='Force overwriting the ecc file even if it already exists (if --generate).')

    #== Parsing the arguments
    args = main_parser.parse_args(argv)  # Storing all arguments to args

    #-- Set hard-coded variables
    entrymarker = "\xFE\xFF\xFE\xFF\xFE\xFF\xFE\xFF\xFE\xFF"  # marker that will signal the beginning of an ecc entry - use an alternating pattern of several characters, this avoids confusion (eg: if you use "AAA" as a pattern and the ecc block of the previous file ends with "EGA", then the full string will be "EGAAAAC:\yourfolder\filea.jpg" and the entry reader will detect the first "AAA" occurrence as the entry start - this should not make the next entry bug because there is an automatic trim - but the previous ecc block will miss one character that could be used to repair the block, because it will be "EG" instead of "EGA"!)
    field_delim = "\xFA\xFF\xFA\xFF\xFA"  # delimiter between fields (filepath, filesize, hash+ecc blocks) inside an ecc entry
    markers = [entrymarker, field_delim]  # put them in a list for easy reference
    max_block_size = 27
    resilience_rate = 1

    #-- Set variables from arguments
    inputpath = fullpath(args.input)
    outputpath = fullpath(args.output)
    distance_threshold = args.threshold
    indexpath = None
    if args.index:
        indexpath = fullpath(args.index)
    force = args.force
    ecc_algo = args.ecc_algo
    verbose = args.verbose
    silent = args.silent

    # -- Checking arguments
    if not os.path.isfile(inputpath):
        raise NameError('Specified database ecc file %s does not exist!' % inputpath)
    if os.path.isfile(outputpath) and not force:
        raise NameError('Specified output path for the repaired ecc file %s already exists! Use --force if you want to overwrite.' % outputpath)
    if indexpath and not os.path.isfile(indexpath):
        raise NameError('Specified index backup file %s does not exist!' % indexpath)

    if max_block_size < 2 or max_block_size > 255:
        raise ValueError('RS max block size must be between 2 and 255.')

    # -- Configure the log file if enabled (ptee.write() will write to both stdout/console and to the log file)
    if args.log:
        ptee = Tee(args.log, 'a', nostdout=silent)
        #sys.stdout = Tee(args.log, 'a')
        sys.stderr = Tee(args.log, 'a', nostdout=silent)
    else:
        ptee = Tee(nostdout=silent)

    # == PROCESSING BRANCHING == #

    # Precompute some parameters and load up ecc manager objects (big optimization as g_exp and g_log tables calculation is done only once)
    hasher_none = Hasher('none')  # for the index ecc we don't use any hash
    ecc_params_idx = compute_ecc_params(max_block_size, resilience_rate, hasher_none)
    ecc_manager_idx = ECCMan(max_block_size, ecc_params_idx["message_size"], algo=ecc_algo)

    # == Main loop
    ptee.write("====================================")
    ptee.write("ECC repair, started on %s" % datetime.datetime.now().isoformat())
    ptee.write("====================================")
    ptee.write("Please note that this tool may not know if it found all the markers, so it may miss too many corrupted markers, but it will repair the ones it finds (except if you have a fully valid index file, in which case you are guaranteed to always find all markers).")

    ecc_size = os.stat(inputpath).st_size
    if indexpath:
        idx_size = os.stat(indexpath).st_size

    shutil.copy2(inputpath, outputpath)
    blocksize = 65535
    with open(outputpath, 'r+b') as db:

        # == Index backup repair
        # This repair needs an index backup file, which is normally generated at the same time as the ecc file. The index backup file stores the position of all ecc markers in the corresponding ecc file, and protects those positions using ecc.
        if indexpath:
            ptee.write("Using the index backup file %s to repair ecc markers, please wait..." % args.index)
            db.seek(0)  # seek to the beginning of the database file
            idx_corrupted = 0
            idx_corrected = 0
            idx_total = 0
            markers_repaired = [0] * len(markers)
            bardisp = tqdm.tqdm(total=idx_size, file=ptee, leave=True, desc='IDXREAD', unit='B', unit_scale=True)  # display progress bar based on reading the database file (since we don't know how many files we will process beforehand nor how many total entries we have)
            with open(indexpath, 'rb') as dbidx:
                buf = 1
                while buf:
                    # The format of the index backup file is pretty simple: for each entrymarker or field_delim, a block is appended. Each block is made of: the type on one byte (1 for entrymarker, 2 for field_delim), then the marker's position in the ecc file encoded as an unsigned long long (thus a fixed 8 bytes), and finally an ecc covering both the type and the marker's position, which is of fixed size (since we know that the marker's type + position = 9 bytes). Each block is appended right after the previous one, so we can easily read them, and such a structure cannot be tampered by a soft error (there's no way a hard drive failure can change the structure of the data, but a malicious user can! But that is easy for a human user to fix: you can clearly see the patterns, where each marker's position begins and ends).
                    # Note that this constant-sized block structure is made on purpose, so that the structure of the index backup file is implicit, while the structure of the ecc file is explicit (it needs uncorrupted markers, which is a weak point that we try to address with the index backup file).
                    # eg of two blocks: 10000008Aecceccecc2000000F2ecceccecc
                    #
                    # Read one index block
                    curpos = dbidx.tell()  # backup current position for error messages
                    buf = dbidx.read(max_block_size)
                    # Update progress bar
                    bardisp.update(dbidx.tell()-bardisp.n)
                    # If we have reached EOF, then we stop here
                    if not buf:
                        break

                    # Else it's ok, we have an index block, we process it
                    idx_total += 1
                    # Extract the marker's infos and the ecc
                    marker_str = buf[:ecc_params_idx["message_size"]]
                    ecc = buf[ecc_params_idx["message_size"]:]
                    # Check if the marker's infos are corrupted, if yes, then we will try to fix that using the ecc
                    if not ecc_manager_idx.check(marker_str, ecc):
                        # Trying to fix the marker's infos using the ecc
                        idx_corrupted += 1
                        marker_repaired, repaired_ecc = ecc_manager_idx.decode(marker_str, ecc)
                        # Repaired the marker's infos, all is good!
                        if ecc_manager_idx.check(marker_repaired, repaired_ecc):
                            marker_str = marker_repaired
                            idx_corrected += 1
                        # Else it's corrupted beyond repair, just skip
                        else:
                            ptee.write("\n- Index backup file: error on block starting at %i, corrupted and could not fix it. Skipping." % curpos)
                            marker_str = None
                            continue
                    if not marker_str:
                        continue

                    # Repair the ecc file's marker using our correct (or repaired) marker's infos
                    marker_type = int(marker_str[0])  # marker's type is always stored on the first byte/character
                    marker_pos = struct.unpack('>Q', marker_str[1:])  # marker's position is encoded as a big-endian unsigned long long, in an 8 bytes/chars string
                    db.seek(marker_pos[0])  # move the ecc reading cursor to the beginning of the marker
                    current_marker = db.read(len(markers[marker_type-1]))  # read the current marker (potentially corrupted)
                    db.seek(marker_pos[0])
                    if verbose:
                        print "- Found marker by index file: type=%i content=" % (marker_type)
                        print db.read(len(markers[marker_type-1])+4)
                        db.seek(marker_pos[0])  # replace the reading cursor back in place before the marker
                    if current_marker != markers[marker_type-1]:  # check if we really need to repair this marker
                        # Rewrite the marker over the ecc file
                        db.write(markers[marker_type-1])
                        markers_repaired[marker_type-1] += 1
                    else:
                        print "skipped, no need to repair"
            # Done with the index backup repair
            if bardisp.n > bardisp.total: bardisp.n = bardisp.total  # just a workaround in case there's one byte more than the predicted total
            bardisp.close()
            ptee.write("Done. Total: %i/%i markers repaired (%i entrymarkers and %i field_delim), %i indexes corrupted and %i indexes repaired (%i indexes lost).\n" % (markers_repaired[0]+markers_repaired[1], idx_total, markers_repaired[0], markers_repaired[1], idx_corrupted, idx_corrected, idx_corrupted-idx_corrected))

        # == Heuristical greedy Hamming distance repair
        # This is a heuristical (doesn't need any other file than the ecc file) 2-pass algorithm: the first pass tries to find the markers' positions, and then the second pass simply reads the original ecc file and copies it while repairing the found markers.
        # The first pass is obviously the most interesting; here's a description: we use a kind of greedy algorithm, but with backtracking: we simply read through all the substrings sequentially, compare each with the markers and compute the Hamming distance; if the Hamming distance gets below the threshold, we trigger the found-marker flag. Then if the Hamming distance decreases, we save this marker position and disable the found-marker flag. However, there can be false positives (eg, the marker is corrupted in the middle), so we have a backtracking mechanism: if a later substring is found to have a Hamming distance below the threshold, we check whether the previously found marker is in range (ie, the offset between the new position and the previous one is smaller than the marker's length), and if the new Hamming distance is smaller, we replace the previous marker with the new marker's position, because the previous one was most likely a false positive.
        # This method doesn't require any other file than the ecc file, but it may not work on ecc markers that are too heavily tampered, and if the detection threshold is too low or the markers are too small, there may be lots of false positives.
        # So try to use long markers (consisting of many characters, preferably an alternating pattern different from the null byte \x00) and a high enough detection threshold.
        ptee.write("Using heuristics (Hamming distance) to fix markers with a threshold of %i%%, please wait..." % (round(distance_threshold*100, 0)))

        # Main loop for the heuristical repair, try to find the substrings that minimize the Hamming distance to one of the ecc markers
        markers_repaired = [0] * len(markers)  # stat counter
        already_valid = 0  # stat counter
        db.seek(0)  # seek to the beginning of the database file
        buf = 1  # init the buffer to 1 to initiate the while loop
        markers_pos = [[] for i in xrange(len(markers))]  # will contain the list of positions where a corrupted marker has been detected (valid markers are skipped)
        distance_thresholds = [round(len(x)*distance_threshold, 0) for x in markers]  # calculate the maximum number of differing characters allowed per marker
        skip_until = -1  # when a valid (non corrupted) marker is found, we use this variable to skip until after the marker's length (to avoid detecting partial parts of this marker, which would have a non-zero Hamming distance even though the marker is completely valid, because the reading window would start after the beginning of the marker)
        bardisp = tqdm.tqdm(total=ecc_size, file=ptee, leave=True, desc='DBREAD', unit='B', unit_scale=True)  # display progress bar based on reading the database file (since we don't know how many files we will process beforehand nor how many total entries we have)
        while buf:  # until we have walked through the whole ecc file
            # Read a part of the ecc file into a buffer, this allows to process more quickly than loading only the size of a marker
            curpos = db.tell()  # keep the current reading position
            buf = db.read(blocksize)
            # Update progress bar
            bardisp.update(db.tell()-bardisp.n)
            if not buf: break  # reached EOF? quitting here

            # Scan the buffer, by splitting the buffer into substrings the length of the ecc markers
            for i in xrange(len(buf)-max(len(entrymarker), len(field_delim))):
                # If we just came across a non corrupted ecc marker, we skip until we are after this ecc marker (to avoid misdetections)
                if i < skip_until: continue
                # Compare each ecc marker type to this substring and compute the Hamming distance
                for m in xrange(len(markers)):
                    d = hamming(buf[i:i+len(markers[m])], markers[m])  # Compute the Hamming distance (simply the number of different characters)
                    mcurpos = curpos+i  # current absolute position of this ecc marker

                    # If there's no difference, then it's a valid, non-corrupted ecc marker
                    if d == 0:
                        already_valid += 1  # stats...
                        # If we previously wrongly detected a corrupted ecc marker near here, then it's probably a misdetection (because we only had a partial view on this marker until now), thus we just remove it from our list of markers to repair
                        if len(markers_pos[m]) > 0 and (mcurpos - markers_pos[m][-1][0]) <= len(markers[m]):  # to detect that, we just check if the latest marker to repair is near the current marker (its position is at most the length of the marker away). This works because in the other condition below, we update the latest marker to repair if we find another one with a lower Hamming distance very near.
                            del markers_pos[m][-1]
                        # Skip scanning until we are after the current marker to avoid misdetections
                        su = i+len(markers[m])
                        if su > skip_until: skip_until = su  # update with the biggest marker (because both markers can be detected here if the patterns are similar)
                        break
                    # Else there's a difference/distance but it's below the threshold: we have a corrupted marker!
                    elif d > 0 and d <= distance_thresholds[m]:
                        # Updating case: if the latest marker to repair is quite close to the current one, but the current detection has a lower distance, we are probably detecting the same marker but are better positioned now, so we update the previous marker's position with this one.
                        if len(markers_pos[m]) > 0 and (mcurpos - markers_pos[m][-1][0]) <= len(markers[m]):
                            if d < markers_pos[m][-1][1]:  # Update only if the distance is lower
                                markers_pos[m][-1] = [mcurpos, d]
                            else:  # Else, we are probably detecting the same marker as the last detected one, but since our scanning window has moved forward, we have increased the distance. Just skip it, we should not repair at this position (else we would probably overwrite real content).
                                continue
                        # Adding case: else we just add this marker as a new one to repair by appending it to the list
                        else:
                            markers_pos[m].append([mcurpos, d])
                    # Else the distance is too great for the threshold, it's not a marker at all, we go on to the next substring
            if db.tell() < ecc_size:
                db.seek(db.tell()-max(len(entrymarker), len(field_delim)))
        if bardisp.n > bardisp.total: bardisp.n = bardisp.total  # just a workaround in case there's one byte more than the predicted total
        bardisp.close()

        # Committing the repair into the ecc file
        for m in xrange(len(markers)):  # for each type of markers
            marker = markers[m]
            if len(markers_pos[m]) > 0:  # If there is any detected marker to repair for this type
                for pos in markers_pos[m]:  # for each detected marker to repair, we rewrite it over into the file at the detected position
                    if verbose: ptee.write("- Detected marker type %i at position %i with distance %i (%i%%): repairing." % (m+1, pos[0], pos[1], (float(pos[1])/len(markers[m]))*100))
                    db.seek(pos[0])
                    db.write(marker)

    #print(markers_pos)
    ptee.write("Done. Hamming heuristic with threshold %i%% repaired %i entrymarkers and %i field_delim (%i total) and %i were already valid.\n" % (round(distance_threshold*100, 0), len(markers_pos[0]), len(markers_pos[1]), len(markers_pos[0])+len(markers_pos[1]), already_valid))
    del ptee
    return 0
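
# ---------------------------------------------------------------------------
# Illustrative sketch only (an assumption, not part of the original module).
# The comments in main() above describe the index backup file as a sequence of
# constant-sized blocks: a 1-character type ('1' for entrymarker, '2' for
# field_delim), the marker's position as a big-endian unsigned long long
# (8 bytes), then an ecc protecting those 9 bytes. The hypothetical helper
# below shows how the writing side of one such block could look; the ecc part
# is computed elsewhere and passed in as an opaque string. It mirrors the
# reading side above, which does int(marker_str[0]) and
# struct.unpack('>Q', marker_str[1:]).
# ---------------------------------------------------------------------------
def _pack_index_block(marker_type, marker_pos, ecc):
    """Sketch: build one index backup block (type + 8-byte position + ecc)."""
    return str(marker_type) + struct.pack('>Q', marker_pos) + ecc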
def main(argv=None): if argv is None: # if argv is empty, fetch from the commandline argv = sys.argv[1:] elif isinstance( argv, basestring ): # else if argv is supplied but it's a simple string, we need to parse it to a list of arguments before handing to argparse or any other argument parser argv = shlex.split(argv) # Parse string just like argv using shlex #==== COMMANDLINE PARSER ==== #== Commandline description desc = '''Resiliency Tester Description: Given a directory and a configuration file (containing the commands to execute before and after file tampering), this script will generate a testing tree, where the files will be corrupted randomly and then the supplied repair commands will be executed, and repair stats will be computed at each step (for each stage and repair commands). The testing process works in stages: 1- Before_tamper stage: Run preparatory commands before tampering (useful to generate ecc/database files). 2- Tamper stage: Tamper the files and/or databases. 3- After_tamper stage: Run after tampering commands, aka preparatory commands before repair stage. 4- Repair stage: Run repair commands, each repair command reusing the files generated (partially repaired) by the previous stage. This is indeed your repair workchain that you define here. 5- Statistics are generated for each stage. Note that the original files are never tampered, we tamper only the copy we did inside the test folder. Also note that the test folder will not be removed at the end, so that you can see for yourself the files resulting of each stage, and eventually use other tools to compute additional stats. ''' ep = '''Use --gui as the first argument to use with a GUI (via Gooey). ''' #== Commandline arguments #-- Constructing the parser # Use GooeyParser if we want the GUI because it will provide better widgets if len(argv) > 0 and (argv[0] == '--gui' and not '--ignore-gooey' in argv): # pragma: no cover # Initialize the Gooey parser main_parser = gooey.GooeyParser( add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter) # Define Gooey widget types explicitly (because type auto-detection doesn't work quite well) widget_dir = {"widget": "DirChooser"} widget_filesave = {"widget": "FileSaver"} widget_file = {"widget": "FileChooser"} widget_text = {"widget": "TextField"} widget_multidir = {"widget": "MultiDirChooser"} else: # Else in command-line usage, use the standard argparse # Delete the special argument to avoid unrecognized argument error in argparse if '--ignore-gooey' in argv[0]: argv.remove( '--ignore-gooey' ) # this argument is automatically fed by Gooey when the user clicks on Start # Initialize the normal argparse parser main_parser = argparse.ArgumentParser( add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter) # Define dummy dict to keep compatibile with command-line usage widget_dir = {} widget_filesave = {} widget_file = {} widget_text = {} widget_multidir = {} # Required arguments main_parser.add_argument( '-i', '--input', metavar='"/path/to/original/files/"', type=is_dir_or_file, nargs=1, required=True, help='Specify the path to the directory containing the sample data.', **widget_dir) main_parser.add_argument( '-o', '--output', metavar='/test/folder/', nargs=1, required=True, help= 'Path to the test folder that will be created to store temporary test files.', **widget_dir) main_parser.add_argument( '-c', '--config', metavar='/some/folder/config.txt', type=str, nargs=1, required=True, #type=argparse.FileType('rt') 
help= 'Path to the configuration file (containing the commands to execute, Makefile format). Possible entries: before_tamper, tamper, after_tamper, repair. Note that you can use a few special tags to trigger string interpolation: {inputdir}, {dbdir}, {outputdir}.', **widget_file) # Optional arguments main_parser.add_argument( '-p', '--parallel', action='store_true', required=False, help= 'If true, repair commands will be run on the tampered files, not on the previous repair results. Useful if you want to try different strategies/commands/programs. By default, false, thus the repair commands will take advantage of the results of previous repair commands.' ) main_parser.add_argument( '-m', '--multiple', metavar=1, type=int, default=1, required=False, help='Run multiple times the resiliency test, and average the stats.', **widget_text) # Optional general arguments main_parser.add_argument( '-l', '--log', metavar='/some/folder/filename.log', type=str, nargs=1, required=False, help= 'Path to the log file. (Output will be piped to both the stdout and the log file)', **widget_filesave) main_parser.add_argument( '-f', '--force', action='store_true', required=False, default=False, help='Force overwriting the output folder even if it already exists.') main_parser.add_argument('-v', '--verbose', action='store_true', required=False, default=False, help='Verbose mode (show more output).') main_parser.add_argument( '--silent', action='store_true', required=False, default=False, help= 'No console output (but if --log specified, the log will still be saved in the specified file).' ) #== Parsing the arguments args = main_parser.parse_args(argv) # Storing all arguments to args #-- Set variables from arguments origpath = fullpath( args.input[0] ) # path to the input directory (where the original, sample data is) outputpath = fullpath(args.output[0]) configfile = fullpath(args.config[0]) parallel = args.parallel multiple = args.multiple force = args.force verbose = args.verbose silent = args.silent #if os.path.isfile(inputpath): # if inputpath is a single file (instead of a folder), then define the rootfolderpath as the parent directory (for correct relative path generation, else it will also truncate the filename!) #rootfolderpath = os.path.dirname(inputpath) # -- Checking arguments if not os.path.isdir(origpath): raise NameError("Input path needs to be a directory!") if not os.path.exists(configfile): raise NameError( "Please provide a configuration file in order to run a test!") else: commands = parse_configfile(configfile) if os.path.exists(outputpath) and not force: raise NameError( "Specified test folder (output path) %s already exists! Use --force to overwrite this directory." % outputpath) else: remove_if_exist(outputpath) if multiple < 1: multiple = 1 # -- Configure the log file if enabled (ptee.write() will write to both stdout/console and to the log file) if args.log: ptee = Tee(args.log[0], 'a', nostdout=silent) sys.stderr = Tee(args.log[0], 'a', nostdout=silent) else: ptee = Tee(nostdout=silent) # == PROCESSING BRANCHING == # # == Main branch ptee.write("====================================") ptee.write("Resiliency tester, started on %s" % datetime.datetime.now().isoformat()) ptee.write("====================================") ptee.write("Testing folder %s into test folder %s for %i run(s)." 
% (origpath, outputpath, multiple)) fstats = {} for m in xrange(multiple): run_nb = m + 1 ptee.write("===== Resiliency tester: starting run %i =====" % run_nb) # -- Define directories tree for this test run # testpath is the basepath for the current run # Generate a specific subdirectory for the current run testpath = os.path.join(outputpath, "run%i" % run_nb) dbdir = fullpath(os.path.join(testpath, "db")) origdbdir = fullpath(os.path.join(testpath, "origdb")) tamperdir = fullpath(os.path.join(testpath, "tampered")) repairdir = fullpath(os.path.join(testpath, "repair")) # == START TEST RUN # Create test folder create_dir_if_not_exist(testpath) # Before tampering ptee.write("=== BEFORE TAMPERING ===") create_dir_if_not_exist(dbdir) for i, cmd in enumerate(commands["before_tamper"]): scmd = interpolate_dict(cmd, interp_args={ "inputdir": origpath, "dbdir": dbdir }) ptee.write("Executing command: %s" % scmd) execute_command(scmd, ptee=ptee) copy_any(dbdir, origdbdir) # make a copy because we may tamper the db files # Tampering ptee.write("=== TAMPERING ===") copy_any(origpath, tamperdir) for i, cmd in enumerate(commands["tamper"]): scmd = interpolate_dict(cmd, interp_args={ "inputdir": tamperdir, "dbdir": dbdir }) ptee.write("- RTEST: Executing command: %s" % scmd) execute_command(scmd, ptee=ptee) # After tampering ptee.write("=== AFTER TAMPERING ===") for i, cmd in enumerate(commands["after_tamper"]): scmd = interpolate_dict(cmd, interp_args={ "inputdir": tamperdir, "dbdir": dbdir }) ptee.write("- RTEST: Executing command: %s" % scmd) execute_command(scmd, ptee=ptee) # Repairing ptee.write("=== REPAIRING ===") indir = tamperdir finalrepairdir = '' for i, cmd in enumerate(commands["repair"]): outdir = "%s%i" % (repairdir, i) scmd = interpolate_dict(cmd, interp_args={ "inputdir": indir, "dbdir": dbdir, "outputdir": outdir }) ptee.write("- RTEST: Executing command: %s" % scmd) create_dir_if_not_exist(outdir) execute_command(scmd, ptee=ptee) copy_any( indir, outdir, only_missing=True ) # copy the files that did not need any repair (or could not be repaired at all!) finalrepairdir = outdir # If parallel, do not reuse the previous repair resulting files, repair from the tampered files directly if not parallel: indir = outdir # Stats stats = compute_all_diff_stats(commands, origpath, tamperdir, repairdir, finalrepairdir) ptee.write( "========== Resiliency tester results for run %i ==========" % run_nb) for key, stat in stats.iteritems(): ptee.write("=> Stage: %s" % key) ptee.write(pretty_print_stats(stat)) if run_nb == 1: fstats = stats else: fstats = stats_running_average(fstats, stats, run_nb - 1) ptee.write("============================") ptee.write("RESILIENCY TESTER FINAL AVERAGED RESULTS OVER %i RUNS" % multiple) ptee.write("============================") for key, stat in fstats.iteritems(): ptee.write("=> Stage: %s" % key) ptee.write(pretty_print_stats(stat)) # Shutting down del ptee # Completely repair all the files? Return OK if stats["final"]["error"] == 0: return 0 else: return 1
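# --- Illustrative sketch (assumption: parse_configfile() and interpolate_dict() are project
# helpers whose exact behaviour is not shown in this file) ---
# Based on how they are called above (commands["before_tamper"] etc. are lists of command
# strings containing {inputdir}/{dbdir}/{outputdir} tags), a minimal stand-in for the tag
# interpolation could be plain str.format(); the real helper may be more forgiving, e.g.
# with tags that are absent from interp_args (which would raise KeyError here):
def interpolate_dict_sketch(cmd, interp_args=None):
    """Replace {tag} placeholders in a command string with the supplied values."""
    return cmd.format(**(interp_args or {}))

# Hypothetical usage, mirroring the repair stage above (paths are made up for the example):
# interpolate_dict_sketch("python repair.py -i {inputdir} -d {dbdir} -o {outputdir}",
#                         interp_args={"inputdir": "run1/tampered",
#                                      "dbdir": "run1/db",
#                                      "outputdir": "run1/repair0"})
# -> "python repair.py -i run1/tampered -d run1/db -o run1/repair0"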
def main(argv=None): if argv is None: # if argv is empty, fetch from the commandline argv = sys.argv[1:] elif isinstance(argv, basestring): # else if argv is supplied but it's a simple string, we need to parse it to a list of arguments before handing to argparse or any other argument parser argv = shlex.split(argv) # Parse string just like argv using shlex #==== COMMANDLINE PARSER ==== #== Commandline description desc = '''Recursive/Relative Files Integrity Generator and Checker Description: Recursively generate or check the integrity of files by MD5 and SHA1 hashes, size, modification date or by data structure integrity (only for images). This script is originally meant to be used for data archival, by allowing an easy way to check for silent file corruption. Thus, this script uses relative paths so that you can easily compute and check the same redundant data copied on different mediums (hard drives, optical discs, etc.). This script is not meant for system files corruption notification, but is more meant to be used from times-to-times to check up on your data archives integrity. ''' ep = '''Example usage: - To generate the database (only needed once): python rfigc.py -i "folderimages" -d "dbhash.csv" -g - To check: python rfigc.py -i "folderimages" -d "dbhash.csv" -l log.txt -s - To update your database by appending new files: python rfigc.py -i "folderimages" -d "dbhash.csv" -u -a - To update your database by appending new files AND removing inexistent files: python rfigc.py -i "folderimages" -d "dbhash.csv" -u -a -r - To use with a gui: python rfigc.py --gui Note that by default, the script is by default in check mode, to avoid wrong manipulations. It will also alert you if you generate over an already existing database file. Note2: you can use PyPy to speed the generation, but you should avoid using PyPy when in checking mode (from our tests, it will slow things down a lot). 
''' #== Commandline arguments #-- Constructing the parser # Use GooeyParser if we want the GUI because it will provide better widgets if len(argv) > 0 and (argv[0] == '--gui' and not '--ignore-gooey' in argv): # pragma: no cover # Initialize the Gooey parser main_parser = gooey.GooeyParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter) # Define Gooey widget types explicitly (because type auto-detection doesn't work quite well) widget_dir = {"widget": "DirChooser"} widget_filesave = {"widget": "FileSaver"} widget_file = {"widget": "FileChooser"} widget_text = {"widget": "TextField"} else: # Else in command-line usage, use the standard argparse # Delete the special argument to avoid unrecognized argument error in argparse if '--ignore-gooey' in argv[0]: argv.remove('--ignore-gooey') # this argument is automatically fed by Gooey when the user clicks on Start # Initialize the normal argparse parser main_parser = argparse.ArgumentParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter) # Define dummy dict to keep compatibile with command-line usage widget_dir = {} widget_filesave = {} widget_file = {} widget_text = {} # Required arguments main_parser.add_argument('-i', '--input', metavar='/path/to/root/folder', type=is_dir_or_file, nargs=1, required=True, help='Path to the root folder (or a single file) from where the scanning will occur.', **widget_dir) main_parser.add_argument('-d', '--database', metavar='/some/folder/databasefile.csv', type=str, nargs=1, required=True, #type=argparse.FileType('rt') help='Path to the csv file containing the hash informations.', **widget_filesave) # Optional general arguments main_parser.add_argument('-l', '--log', metavar='/some/folder/filename.log', type=str, nargs=1, required=False, help='Path to the log file. (Output will be piped to both the stdout and the log file)', **widget_filesave) main_parser.add_argument('--skip_hash', action='store_true', required=False, default=False, help='Skip hash computation/checking (checks only the other metadata, this is a lot quicker).') main_parser.add_argument('-v', '--verbose', action='store_true', required=False, default=False, help='Verbose mode (show more output).') main_parser.add_argument('--silent', action='store_true', required=False, default=False, help='No console output (but if --log specified, the log will still be saved in the specified file).') # Checking mode arguments main_parser.add_argument('-s', '--structure_check', action='store_true', required=False, default=False, help='Check images structures for corruption?') main_parser.add_argument('-e', '--errors_file', metavar='/some/folder/errorsfile.csv', type=str, nargs=1, required=False, #type=argparse.FileType('rt') help='Path to the error file, where errors at checking will be stored in CSV for further processing by other softwares (such as file repair softwares).', **widget_filesave) main_parser.add_argument('-m', '--disable_modification_date_checking', action='store_true', required=False, default=False, help='Disable modification date checking.') main_parser.add_argument('--skip_missing', action='store_true', required=False, default=False, help='Skip missing files when checking (useful if you split your files into several mediums, for example on optical discs with limited capacity).') # Generate mode arguments main_parser.add_argument('-g', '--generate', action='store_true', required=False, default=False, help='Generate the database? 
(omit this parameter to check instead of generating).') main_parser.add_argument('-f', '--force', action='store_true', required=False, default=False, help='Force overwriting the database file even if it already exists (if --generate).') # Update mode arguments main_parser.add_argument('-u', '--update', action='store_true', required=False, default=False, help='Update database (you must also specify --append or --remove).') main_parser.add_argument('-a', '--append', action='store_true', required=False, default=False, help='Append new files (if --update).') main_parser.add_argument('-r', '--remove', action='store_true', required=False, default=False, help='Remove missing files (if --update).') # Recover from file scraping main_parser.add_argument('--filescraping_recovery', action='store_true', required=False, default=False, help='Given a folder of unorganized files, compare to the database and restore the filename and directory structure into the output folder.') main_parser.add_argument('-o', '--output', metavar='/path/to/root/folder', type=is_dir, nargs=1, required=False, help='Path to the output folder where to output (copy) the files reorganized after --recover_from_filescraping.', **widget_dir) #== Parsing the arguments args = main_parser.parse_args(argv) # Storing all arguments to args #-- Set variables from arguments inputpath = fullpath(args.input[0]) # path to the files to protect (either a folder or a single file) rootfolderpath = inputpath # path to the root folder (to compute relative paths) #database = os.path.basename(fullpath(args.database[0])) # Take only the filename. database = fullpath(args.database[0]) generate = args.generate structure_check = args.structure_check force = args.force disable_modification_date_checking = args.disable_modification_date_checking skip_missing = args.skip_missing skip_hash = args.skip_hash update = args.update append = args.append remove = args.remove outputpath = None if args.output: outputpath = fullpath(args.output[0]) filescraping = args.filescraping_recovery verbose = args.verbose silent = args.silent if os.path.isfile(inputpath): # if inputpath is a single file (instead of a folder), then define the rootfolderpath as the parent directory (for correct relative path generation, else it will also truncate the filename!) rootfolderpath = os.path.dirname(inputpath) errors_file = None if args.errors_file: errors_file = fullpath(args.errors_file[0]) # -- Checking arguments if structure_check and not structure_check_import: raise ImportError('PIL (Python Imaging Library) could not be imported. PIL is needed to do structure check, please install PIL (or you can disable structure check to continue).'); if update and (not append and not remove): raise ValueError('--update specified but not --append nor --remove. 
You must specify at least one of these modes when using --update!') if filescraping and not outputpath: raise ValueError('Output path needed when --recover_from_filescraping.') # -- Configure the log file if enabled (ptee.write() will write to both stdout/console and to the log file) if args.log: ptee = Tee(args.log[0], 'a', nostdout=silent) #sys.stdout = Tee(args.log[0], 'a') sys.stderr = Tee(args.log[0], 'a', nostdout=silent) else: ptee = Tee(nostdout=silent) # == PROCESSING BRANCHING == # retval = 0 # Returned value: 0 OK, 1 KO (files in error), -1 Error # -- Update the database file by removing missing files if update and remove: if not os.path.isfile(database): raise NameError('Specified database file does not exist, can\'t update!') ptee.write("====================================") ptee.write("RIFGC Database Update Removal of missing files, started on %s" % datetime.datetime.now().isoformat()) ptee.write("====================================") # Precompute the total number of lines to process (this should be fairly quick) filestodocount = 0 with open(database, 'rb') as dbf: for row in csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"'): filestodocount = filestodocount + 1 # Preparing CSV writer for the temporary file that will have the lines removed with open(database+'.rem', 'wb') as dbfilerem: csv_writer = csv.writer(dbfilerem, lineterminator='\n', delimiter='|', quotechar='"') # Printing CSV headers csv_headers = ['path', 'md5', 'sha1', 'last_modification_timestamp', 'last_modification_date', 'size', 'ext'] csv_writer.writerow(csv_headers) dbf.seek(0) dbfile = csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"') # we need to reopen the file to put the reading cursor (the generator position) back to the beginning delcount = 0 filescount = 0 for row in tqdm.tqdm(dbfile, file=ptee, total=filestodocount, leave=True): filescount = filescount + 1 filepath = os.path.join(rootfolderpath, row['path']) # Build the absolute file path # Single-file mode: skip if this is not the file we are looking for if inputpath != rootfolderpath and inputpath != filepath: continue if verbose: ptee.write("\n- Processing file %s" % row['path']) errors = [] if not os.path.isfile(filepath): delcount = delcount + 1 ptee.write("\n- File %s is missing, removed from database." % row['path']) else: csv_writer.writerow( [ path2unix(row['path']), row['md5'], row['sha1'], row['last_modification_timestamp'], row['last_modification_date'], row['size'], row['ext'] ] ) # REMOVE UPDATE DONE, we remove the old database file and replace it with the new os.remove(database) # delete old database os.rename(database+'.rem', database) # rename new database to match old name # Show some stats ptee.write("----------------------------------------------------") ptee.write("All files processed: Total: %i - Removed/Missing: %i.\n\n" % (filescount, delcount)) # -- Generate the database file or update/append (both will walk through the filesystem to get new files, contrary to other branchs which walk through the database csv) if generate or (update and append): if not force and os.path.isfile(database) and not update: raise NameError('Database file already exists. 
Please choose another name to generate your database file.') if generate: dbmode = 'wb' elif (update and append): dbmode = 'ab' with open(database, dbmode) as dbfile: # Must open in write + binary, because on Windows it will do weird things otherwise (at least with Python 2.7) ptee.write("====================================") if generate: ptee.write("RIFGC Database Generation started on %s" % datetime.datetime.now().isoformat()) elif update and append: ptee.write("RIFGC Database Update Append new files, started on %s" % datetime.datetime.now().isoformat()) ptee.write("====================================") # Preparing CSV writer csv_writer = csv.writer(dbfile, lineterminator='\n', delimiter='|', quotechar='"') if generate: # Printing CSV headers csv_headers = ['path', 'md5', 'sha1', 'last_modification_timestamp', 'last_modification_date', 'size', 'ext'] csv_writer.writerow(csv_headers) if (update and append): # Extract all paths already stored in database to avoid readding them db_paths = {} with open(database, 'rb') as dbf: for row in csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"'): db_paths[row['path']] = True # Counting the total number of files that we will have to process ptee.write("Counting total number of files to process, please wait...") filestodocount = 0 for _ in tqdm.tqdm(recwalk(inputpath), file=ptee): filestodocount = filestodocount + 1 ptee.write("Counting done.") # Recursively traversing the root directory and save the metadata in the db for each file ptee.write("Processing files to compute metadata to store in database, please wait...") filescount = 0 addcount = 0 for (dirpath, filename) in tqdm.tqdm(recwalk(inputpath), file=ptee, total=filestodocount, leave=True): filescount = filescount + 1 # Get full absolute filepath filepath = os.path.join(dirpath, filename) # Get database relative path (from scanning root folder) relfilepath = path2unix(os.path.relpath(filepath, rootfolderpath)) # File relative path from the root (so that we can easily check the files later even if the absolute path is different) if verbose: ptee.write("\n- Processing file %s" % relfilepath) # If update + append mode, then if the file is already in the database we skip it (we continue computing metadata only for new files) if update and append and relfilepath in db_paths: if verbose: ptee.write("... 
skipped") continue else: addcount = addcount + 1 # Compute the hashes (leave it outside the with command because generate_hashes() open the file by itself, so that both hashes can be computed in a single sweep of the file at the same time) if not skip_hash: md5hash, sha1hash = generate_hashes(filepath) else: md5hash = sha1hash = 0 # Compute other metadata with open(filepath) as thisfile: # Check file structure if option is enabled if structure_check: struct_result = check_structure(filepath) # Print/Log an error only if there's one (else we won't say anything) if struct_result: ptee.write("\n- Structure error with file "+filepath+": "+struct_result) ext = os.path.splitext(filepath)[1] # File's extension statinfos = os.stat(filepath) # Various OS filesystem infos about the file size = statinfos.st_size # File size lastmodif = statinfos.st_mtime # File last modified date (as a timestamp) lastmodif_readable = datetime.datetime.fromtimestamp(lastmodif).strftime("%Y-%m-%d %H:%M:%S") # File last modified date as a human readable date (ISO universal time) csv_row = [path2unix(relfilepath), md5hash, sha1hash, lastmodif, lastmodif_readable, size, ext] # Prepare the CSV row csv_writer.writerow(csv_row) # Save to the file ptee.write("----------------------------------------------------") ptee.write("All files processed: Total: %i - Added: %i.\n\n" % (filescount, addcount)) # -- Filescraping recovery mode # We will compare all files from the input path and reorganize the ones that are recognized into the output path elif filescraping: import shutil ptee.write("====================================") ptee.write("RIFGC File Scraping Recovery started on %s" % datetime.datetime.now().isoformat()) ptee.write("====================================") ptee.write("Loading the database into memory, please wait...") md5list = {} sha1list = {} dbrows = {} # TODO: instead of memorizing everything in memory, store just the reading cursor position at the beginning of the line with the size and then just read when necessary from the db file directly id = 0 with open(database, 'rb') as db: for row in csv.DictReader(db, lineterminator='\n', delimiter='|', quotechar='"'): id += 1 if (len(row['md5']) > 0 and len(row['sha1']) > 0): md5list[row['md5']] = id sha1list[row['sha1']] = id dbrows[id] = row ptee.write("Loading done.") if len(dbrows) == 0: ptee.write("Nothing to do, there's no md5 nor sha1 hashes in the database file!") del ptee return 1 # return with an error # Counting the total number of files that we will have to process ptee.write("Counting total number of files to process, please wait...") filestodocount = 0 for _ in tqdm.tqdm(recwalk(inputpath), file=ptee): filestodocount = filestodocount + 1 ptee.write("Counting done.") # Recursively traversing the root directory and save the metadata in the db for each file ptee.write("Processing file scraping recovery, walking through all files from input folder...") filescount = 0 copiedcount = 0 for (dirpath, filename) in tqdm.tqdm(recwalk(inputpath), file=ptee, total=filestodocount, leave=True): filescount = filescount + 1 # Get full absolute filepath filepath = os.path.join(dirpath,filename) # Get database relative path (from scanning root folder) relfilepath = path2unix(os.path.relpath(filepath, rootfolderpath)) # File relative path from the root (we truncate the rootfolderpath so that we can easily check the files later even if the absolute path is different) if verbose: ptee.write("\n- Processing file %s" % relfilepath) # Generate the hashes from the currently 
inspected file md5hash, sha1hash = generate_hashes(filepath) # If it match with a file in the database, we will copy it over with the correct name, directory structure, file extension and last modification date if md5hash in md5list and sha1hash in sha1list and md5list[md5hash] == sha1list[sha1hash]: # Load the db infos for this file row = dbrows[md5list[md5hash]] ptee.write("- Found: %s --> %s.\n" % (filepath, row['path'])) # Generate full absolute filepath of the output file outfilepath = os.path.join(outputpath, row['path']) # Recursively create the directory tree structure outfiledir = os.path.dirname(outfilepath) if not os.path.isdir(outfiledir): os.makedirs(outfiledir) # if the target directory does not exist, create it (and create recursively all parent directories too) # Copy over and set attributes shutil.copy2(filepath, outfilepath) filestats = os.stat(filepath) os.utime(outfilepath, (filestats.st_atime, float(row['last_modification_timestamp']))) # Counter... copiedcount += 1 ptee.write("----------------------------------------------------") ptee.write("All files processed: Total: %i - Recovered: %i.\n\n" % (filescount, copiedcount)) # -- Check mode: check the files using a database file elif not update and not generate and not filescraping: ptee.write("====================================") ptee.write("RIFGC Check started on %s" % datetime.datetime.now().isoformat()) ptee.write("====================================") # Open errors file if supplied (where we will store every errors in a formatted csv so that it can later be easily processed by other softwares, such as repair softwares) if errors_file is not None: efile = open(errors_file, 'wb') e_writer = csv.writer(efile, delimiter='|', lineterminator='\n', quotechar='"') # Precompute the total number of lines to process (this should be fairly quick) filestodocount = 0 with open(database, 'rb') as dbf: for row in csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"'): filestodocount = filestodocount + 1 # Processing the files using the database list ptee.write("Checking for files corruption based on database %s on input path %s, please wait..." 
% (database, inputpath)) dbf.seek(0) dbfile = csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"') # we need to reopen the file to put the reading cursor (the generator position) back to the beginning errorscount = 0 filescount = 0 for row in tqdm.tqdm(dbfile, file=ptee, total=filestodocount, leave=True): filescount = filescount + 1 filepath = os.path.join(rootfolderpath, row['path']) # Single-file mode: skip if this is not the file we are looking for if inputpath != rootfolderpath and inputpath != filepath: continue if verbose: ptee.write("\n- Processing file %s" % row['path']) errors = [] if not os.path.isfile(filepath): if not skip_missing: errors.append('file is missing') # First generate the current file's metadata given the filepath from the CSV, and then we will check the differences from database else: try: # Try to be resilient to various file access errors # Generate hash if not skip_hash: md5hash, sha1hash = generate_hashes(filepath) else: md5hash = sha1hash = 0 # Check structure integrity if enabled if structure_check: struct_result = check_structure(filepath) if struct_result: errors.append("structure error (%s)" % struct_result) # Compute other metadata with open(filepath) as thisfile: ext = os.path.splitext(filepath)[1] statinfos = os.stat(filepath) size = statinfos.st_size lastmodif = statinfos.st_mtime lastmodif_readable = datetime.datetime.fromtimestamp(lastmodif).strftime("%Y-%m-%d %H:%M:%S") # CHECK THE DIFFERENCES if not skip_hash and md5hash != row['md5'] and sha1hash != row['sha1']: errors.append('both md5 and sha1 hash failed') elif not skip_hash and ((md5hash == row['md5'] and sha1hash != row['sha1']) or (md5hash != row['md5'] and sha1hash == row['sha1'])): errors.append('one of the hash failed but not the other (which may indicate that the database file is corrupted)') if ext != row['ext']: errors.append('extension has changed') if size != int(row['size']): errors.append("size has changed (before: %s - now: %s)" % (row['size'], size)) if not disable_modification_date_checking and (lastmodif != float(row['last_modification_timestamp']) and round(lastmodif,0) != round(float(row['last_modification_timestamp']),0)): # for usage with PyPy: last modification time is differently managed (rounded), thus we need to round here manually to compare against PyPy. errors.append("modification date has changed (before: %s - now: %s)" % (row['last_modification_date'], lastmodif_readable)) except IOError as e: # Catch IOError as a file error errors.append('file can\'t be read, IOError (inaccessible, maybe bad sector?)') except Exception as e: # Any other exception when accessing the file will also be caught as a file error errors.append('file can\'t be accessed: %s' % e) # Print/Log all errors for this file if any happened if errors: errorscount = errorscount + 1 ptee.write("\n- Error for file %s: %s." % (row['path'], ', '.join(errors))) if errors_file is not None: # Write error in a csv file if supplied (for easy processing later by other softwares such as file repair softwares) e_writer.writerow( [row['path'], ', '.join(errors)] ) # END OF CHECKING: show some stats ptee.write("----------------------------------------------------") ptee.write("All files checked: Total: %i - Files with errors: %i.\n\n" % (filescount, errorscount)) retval = (errorscount > 0) del ptee return retval # return error code if any
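# --- Illustrative sketch (assumption: NOT the actual generate_hashes() from this project) ---
# The generation branch above notes that generate_hashes() opens the file itself so that the
# MD5 and SHA1 digests are computed in a single sweep of the file. A minimal version of that
# idea, reading fixed-size chunks (the 64 KiB chunk size is an arbitrary choice):
import hashlib

def generate_hashes_sketch(filepath, blocksize=65536):
    """Return (md5_hexdigest, sha1_hexdigest) computed in one pass over the file."""
    md5 = hashlib.md5()
    sha1 = hashlib.sha1()
    with open(filepath, 'rb') as f:
        for chunk in iter(lambda: f.read(blocksize), b''):
            md5.update(chunk)   # both digests are fed from the same read,
            sha1.update(chunk)  # so the file is only traversed once
    return md5.hexdigest(), sha1.hexdigest()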
def main(argv=None): if argv is None: # if argv is empty, fetch from the commandline argv = sys.argv[1:] elif isinstance(argv, basestring): # else if argv is supplied but it's a simple string, we need to parse it to a list of arguments before handing to argparse or any other argument parser argv = shlex.split(argv) # Parse string just like argv using shlex #==== COMMANDLINE PARSER ==== #== Commandline description desc = '''Random file/directory characters tamperer in Python Description: Randomly tampers characters in a file or in a directory tree recursively (useful to test for integrity/repair after). WARNING: this will tamper the file you specify. Please ensure you keep a copy of the original! ''' ep = '''NOTE: this script tampers at the character (byte) level, not the bits! Thus the measures you will get here may be different from those you will find in papers (you must divide your probability by 8).''' #-- Constructing the parser # Use GooeyParser if we want the GUI because it will provide better widgets if len(argv) > 0 and (argv[0] == '--gui' and not '--ignore-gooey' in argv): # pragma: no cover # Initialize the Gooey parser main_parser = gooey.GooeyParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter) # Define Gooey widget types explicitly (because type auto-detection doesn't work quite well) widget_dir = {"widget": "DirChooser"} widget_filesave = {"widget": "FileSaver"} widget_file = {"widget": "FileChooser"} widget_text = {"widget": "TextField"} else: # Else in command-line usage, use the standard argparse # Delete the special argument to avoid unrecognized argument error in argparse if '--ignore-gooey' in argv[0]: argv.remove('--ignore-gooey') # this argument is automatically fed by Gooey when the user clicks on Start # Initialize the normal argparse parser main_parser = argparse.ArgumentParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter) # Define dummy dict to keep compatibile with command-line usage widget_dir = {} widget_filesave = {} widget_file = {} widget_text = {} # Required arguments main_parser.add_argument('-i', '--input', metavar='filetotamper.ext', type=is_dir_or_file, nargs=1, required=True, help='Path to the file (or directory tree) to tamper.', **widget_dir) main_parser.add_argument('-m', '--mode', metavar='e, erasure, n, noise', type=str, nargs=1, required=True, help='Tampering mode: erasure or noise?', **widget_text) main_parser.add_argument('-p', '--probability', type=float, nargs=1, required=True, help='Probability of corruption (float between 0.0 and 1.0)', **widget_text) # Optional arguments main_parser.add_argument('--block_probability', type=float, nargs=1, required=False, help='Probability of block tampering (between 0.0 and 1.0, do not set it if you want to spread errors evenly, but researchs have shown that errors are rather at block level and not evenly distributed)', **widget_text) main_parser.add_argument('-b', '--burst_length', metavar="startint|endint", type=str, required=False, help='If specified, this will define the number of consecutive characters that will be corrupted when the corruption probability (--probability) is triggered. 
Specify a range startint|endint, the burst length will be uniformly sampled over this range.') main_parser.add_argument('--header', type=int, required=False, help='Only tamper the header of the file') main_parser.add_argument('-l', '--log', metavar='/some/folder/filename.log', type=str, nargs=1, required=False, help='Path to the log file. (Output will be piped to both the stdout and the log file)', **widget_filesave) main_parser.add_argument('-v', '--verbose', action='store_true', required=False, default=False, help='Verbose mode (show more output).') main_parser.add_argument('--silent', action='store_true', required=False, default=False, help='No console output (but if --log specified, the log will still be saved in the specified file).') #== Parsing the arguments args = main_parser.parse_args(argv) # Storing all arguments to args #-- Set variables from arguments filepath = fullpath(args.input[0]) mode = args.mode[0] proba = float(args.probability[0]) verbose = args.verbose silent = args.silent burst_length = args.burst_length if burst_length: burst_length = [int(r) for r in burst_length.split('|')] # split range and convert to int block_proba = None if args.block_probability: block_proba = float(args.block_probability[0]) blocksize = 65536 header = args.header # -- Configure the log file if enabled (ptee.write() will write to both stdout/console and to the log file) if args.log: ptee = Tee(args.log[0], 'a', nostdout=silent) #sys.stdout = Tee(args.log[0], 'a') sys.stderr = Tee(args.log[0], 'a', nostdout=silent) else: ptee = Tee(nostdout=silent) # == PROCESSING BRANCHING == # # Sanity check if not os.path.exists(filepath): raise RuntimeError("Path does not exist: %s" % filepath) else: # -- Tampering a file if os.path.isfile(filepath): ptee.write('Tampering the file %s, please wait...' % os.path.basename(filepath)) tcount, tsize = tamper_file(filepath, mode=mode, proba=proba, block_proba=block_proba, blocksize=blocksize, burst_length=burst_length, header=header, silent=silent) ptee.write("Tampering done: %i/%i (%.2f%%) characters tampered." % (tcount, tsize, tcount / max(1, tsize) * 100)) # -- Tampering a directory tree recursively elif os.path.isdir(filepath): ptee.write('Tampering all files in directory %s, please wait...' % filepath) files_tampered, filescount, tcount, tsize = tamper_dir(filepath, mode=mode, proba=proba, block_proba=block_proba, blocksize=blocksize, burst_length=burst_length, header=header, silent=silent) ptee.write("Tampering done: %i/%i files tampered and overall %i/%i (%.2f%%) characters were tampered." % (files_tampered, filescount, tcount, tsize, tcount / max(1, tsize) * 100)) del ptee return 0
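# --- Illustrative sketch (assumption: NOT the actual tamper_file()/tamper_dir() used above) ---
# The description above gives the idea: walk the data character (byte) by character and, with
# the given probability, either erase the byte (replaced by a null byte in this sketch; the
# real replacement value may differ) or overwrite it with random noise. This toy version works
# on an in-memory buffer only, whereas the real functions tamper files in place:
import random

def tamper_bytes_sketch(data, mode='noise', proba=0.01):
    """Return (tampered_data, tampered_count) for a str buffer."""
    out = []
    tcount = 0
    for ch in data:
        if random.random() < proba:
            tcount += 1
            if mode.startswith('e'):   # 'e' / 'erasure': blank the character
                out.append('\x00')
            else:                      # 'n' / 'noise': random replacement byte
                out.append(chr(random.randint(0, 255)))
        else:
            out.append(ch)
    return ''.join(out), tcount

# Example: tamper roughly 1% of a sample buffer with random noise
# tampered, n = tamper_bytes_sketch("some sample data" * 100, mode='noise', proba=0.01)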
def main(argv=None): if argv is None: # if argv is empty, fetch from the commandline argv = sys.argv[1:] elif isinstance( argv, basestring ): # else if argv is supplied but it's a simple string, we need to parse it to a list of arguments before handing to argparse or any other argument parser argv = shlex.split(argv) # Parse string just like argv using shlex #==== COMMANDLINE PARSER ==== #== Commandline description desc = '''Recursive/Relative Files Integrity Generator and Checker Description: Recursively generate or check the integrity of files by MD5 and SHA1 hashes, size, modification date or by data structure integrity (only for images). This script is originally meant to be used for data archival, by allowing an easy way to check for silent file corruption. Thus, this script uses relative paths so that you can easily compute and check the same redundant data copied on different mediums (hard drives, optical discs, etc.). This script is not meant for system files corruption notification, but is more meant to be used from times-to-times to check up on your data archives integrity. ''' ep = '''Example usage: - To generate the database (only needed once): python rfigc.py -i "folderimages" -d "dbhash.csv" -g - To check: python rfigc.py -i "folderimages" -d "dbhash.csv" -l log.txt -s - To update your database by appending new files: python rfigc.py -i "folderimages" -d "dbhash.csv" -u -a - To update your database by appending new files AND removing inexistent files: python rfigc.py -i "folderimages" -d "dbhash.csv" -u -a -r - To use with a gui: python rfigc.py --gui Note that by default, the script is by default in check mode, to avoid wrong manipulations. It will also alert you if you generate over an already existing database file. Note2: you can use PyPy to speed the generation, but you should avoid using PyPy when in checking mode (from our tests, it will slow things down a lot). 
''' #== Commandline arguments #-- Constructing the parser # Use GooeyParser if we want the GUI because it will provide better widgets if len(argv) > 0 and (argv[0] == '--gui' and not '--ignore-gooey' in argv): # pragma: no cover # Initialize the Gooey parser main_parser = gooey.GooeyParser( add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter) # Define Gooey widget types explicitly (because type auto-detection doesn't work quite well) widget_dir = {"widget": "DirChooser"} widget_filesave = {"widget": "FileSaver"} widget_file = {"widget": "FileChooser"} widget_text = {"widget": "TextField"} else: # Else in command-line usage, use the standard argparse # Delete the special argument to avoid unrecognized argument error in argparse if '--ignore-gooey' in argv[0]: argv.remove( '--ignore-gooey' ) # this argument is automatically fed by Gooey when the user clicks on Start # Initialize the normal argparse parser main_parser = argparse.ArgumentParser( add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter) # Define dummy dict to keep compatibile with command-line usage widget_dir = {} widget_filesave = {} widget_file = {} widget_text = {} # Required arguments main_parser.add_argument( '-i', '--input', metavar='/path/to/root/folder', type=is_dir_or_file, nargs=1, required=True, help= 'Path to the root folder (or a single file) from where the scanning will occur.', **widget_dir) main_parser.add_argument( '-d', '--database', metavar='/some/folder/databasefile.csv', type=str, nargs=1, required=True, #type=argparse.FileType('rt') help='Path to the csv file containing the hash informations.', **widget_filesave) # Optional general arguments main_parser.add_argument( '-l', '--log', metavar='/some/folder/filename.log', type=str, nargs=1, required=False, help= 'Path to the log file. (Output will be piped to both the stdout and the log file)', **widget_filesave) main_parser.add_argument( '--skip_hash', action='store_true', required=False, default=False, help= 'Skip hash computation/checking (checks only the other metadata, this is a lot quicker).' ) main_parser.add_argument('-v', '--verbose', action='store_true', required=False, default=False, help='Verbose mode (show more output).') main_parser.add_argument( '--silent', action='store_true', required=False, default=False, help= 'No console output (but if --log specified, the log will still be saved in the specified file).' ) # Checking mode arguments main_parser.add_argument('-s', '--structure_check', action='store_true', required=False, default=False, help='Check images structures for corruption?') main_parser.add_argument( '-e', '--errors_file', metavar='/some/folder/errorsfile.csv', type=str, nargs=1, required=False, #type=argparse.FileType('rt') help= 'Path to the error file, where errors at checking will be stored in CSV for further processing by other softwares (such as file repair softwares).', **widget_filesave) main_parser.add_argument('-m', '--disable_modification_date_checking', action='store_true', required=False, default=False, help='Disable modification date checking.') main_parser.add_argument( '--skip_missing', action='store_true', required=False, default=False, help= 'Skip missing files when checking (useful if you split your files into several mediums, for example on optical discs with limited capacity).' ) # Generate mode arguments main_parser.add_argument( '-g', '--generate', action='store_true', required=False, default=False, help= 'Generate the database? 
(omit this parameter to check instead of generating).' ) main_parser.add_argument( '-f', '--force', action='store_true', required=False, default=False, help= 'Force overwriting the database file even if it already exists (if --generate).' ) # Update mode arguments main_parser.add_argument( '-u', '--update', action='store_true', required=False, default=False, help='Update database (you must also specify --append or --remove).') main_parser.add_argument('-a', '--append', action='store_true', required=False, default=False, help='Append new files (if --update).') main_parser.add_argument('-r', '--remove', action='store_true', required=False, default=False, help='Remove missing files (if --update).') # Recover from file scraping main_parser.add_argument( '--filescraping_recovery', action='store_true', required=False, default=False, help= 'Given a folder of unorganized files, compare to the database and restore the filename and directory structure into the output folder.' ) main_parser.add_argument( '-o', '--output', metavar='/path/to/root/folder', type=is_dir, nargs=1, required=False, help= 'Path to the output folder where to output (copy) the files reorganized after --recover_from_filescraping.', **widget_dir) #== Parsing the arguments args = main_parser.parse_args(argv) # Storing all arguments to args #-- Set variables from arguments inputpath = fullpath( args.input[0] ) # path to the files to protect (either a folder or a single file) rootfolderpath = inputpath # path to the root folder (to compute relative paths) #database = os.path.basename(fullpath(args.database[0])) # Take only the filename. database = fullpath(args.database[0]) generate = args.generate structure_check = args.structure_check force = args.force disable_modification_date_checking = args.disable_modification_date_checking skip_missing = args.skip_missing skip_hash = args.skip_hash update = args.update append = args.append remove = args.remove outputpath = None if args.output: outputpath = fullpath(args.output[0]) filescraping = args.filescraping_recovery verbose = args.verbose silent = args.silent if os.path.isfile( inputpath ): # if inputpath is a single file (instead of a folder), then define the rootfolderpath as the parent directory (for correct relative path generation, else it will also truncate the filename!) rootfolderpath = os.path.dirname(inputpath) errors_file = None if args.errors_file: errors_file = fullpath(args.errors_file[0]) # -- Checking arguments if structure_check and not structure_check_import: raise ImportError( 'PIL (Python Imaging Library) could not be imported. PIL is needed to do structure check, please install PIL (or you can disable structure check to continue).' ) if update and (not append and not remove): raise ValueError( '--update specified but not --append nor --remove. You must specify at least one of these modes when using --update!' 
    # -- Configure the log file if enabled (ptee.write() will write to both stdout/console and to the log file)
    if args.log:
        ptee = Tee(args.log[0], 'a', nostdout=silent)
        #sys.stdout = Tee(args.log[0], 'a')
        sys.stderr = Tee(args.log[0], 'a', nostdout=silent)
    else:
        ptee = Tee(nostdout=silent)

    # == PROCESSING BRANCHING == #
    retval = 0  # Returned value: 0 OK, 1 KO (files in error), -1 Error

    # -- Update the database file by removing missing files
    if update and remove:
        if not os.path.isfile(database):
            raise NameError('Specified database file does not exist, can\'t update!')

        ptee.write("====================================")
        ptee.write("RIFGC Database Update Removal of missing files, started on %s" % datetime.datetime.now().isoformat())
        ptee.write("====================================")

        # Precompute the total number of lines to process (this should be fairly quick)
        filestodocount = 0
        with open(database, 'rb') as dbf:
            for row in csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"'):
                filestodocount = filestodocount + 1

            # Preparing CSV writer for the temporary file that will have the lines removed
            with open(database + '.rem', 'wb') as dbfilerem:
                csv_writer = csv.writer(dbfilerem, lineterminator='\n', delimiter='|', quotechar='"')
                # Printing CSV headers
                csv_headers = ['path', 'md5', 'sha1', 'last_modification_timestamp', 'last_modification_date', 'size', 'ext']
                csv_writer.writerow(csv_headers)

                dbf.seek(0)
                dbfile = csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"')  # we need to reopen the file to put the reading cursor (the generator position) back to the beginning
                delcount = 0
                filescount = 0
                for row in tqdm.tqdm(dbfile, file=ptee, total=filestodocount, leave=True):
                    filescount = filescount + 1
                    filepath = os.path.join(rootfolderpath, row['path'])  # Build the absolute file path

                    # Single-file mode: skip if this is not the file we are looking for
                    if inputpath != rootfolderpath and inputpath != filepath:
                        continue

                    if verbose: ptee.write("\n- Processing file %s" % row['path'])
                    errors = []
                    if not os.path.isfile(filepath):
                        delcount = delcount + 1
                        ptee.write("\n- File %s is missing, removed from database." % row['path'])
                    else:
                        csv_writer.writerow([path2unix(row['path']), row['md5'], row['sha1'], row['last_modification_timestamp'], row['last_modification_date'], row['size'], row['ext']])

        # REMOVE UPDATE DONE, we remove the old database file and replace it with the new one
        os.remove(database)  # delete old database
        os.rename(database + '.rem', database)  # rename new database to match old name

        # Show some stats
        ptee.write("----------------------------------------------------")
        ptee.write("All files processed: Total: %i - Removed/Missing: %i.\n\n" % (filescount, delcount))
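    # For reference, every database row is pipe-delimited with the csv_headers columns
    # used above; an illustrative row (hypothetical values) would look like:
    #   some/subdir/photo.jpg|9e107d9d372bb6826bd81d3542a419d6|2fd4e1c67a2d28fced849ee1bb76e7391b93eb12|1424043717.0|2015-02-15 23:21:57|5621|.jpg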
    # -- Generate the database file or update/append (both will walk through the filesystem to get new files, unlike the other branches which walk through the database csv)
    if generate or (update and append):
        if not force and os.path.isfile(database) and not update:
            raise NameError('Database file already exists. Please choose another name to generate your database file.')

        if generate:
            dbmode = 'wb'
        elif (update and append):
            dbmode = 'ab'
        with open(database, dbmode) as dbfile:  # Must open in write + binary, because on Windows it will do weird things otherwise (at least with Python 2.7)
            ptee.write("====================================")
            if generate:
                ptee.write("RIFGC Database Generation started on %s" % datetime.datetime.now().isoformat())
            elif update and append:
                ptee.write("RIFGC Database Update Append new files, started on %s" % datetime.datetime.now().isoformat())
            ptee.write("====================================")

            # Preparing CSV writer
            csv_writer = csv.writer(dbfile, lineterminator='\n', delimiter='|', quotechar='"')

            if generate:
                # Printing CSV headers
                csv_headers = ['path', 'md5', 'sha1', 'last_modification_timestamp', 'last_modification_date', 'size', 'ext']
                csv_writer.writerow(csv_headers)

            if (update and append):
                # Extract all paths already stored in the database to avoid re-adding them
                db_paths = {}
                with open(database, 'rb') as dbf:
                    for row in csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"'):
                        db_paths[row['path']] = True

            # Counting the total number of files that we will have to process
            ptee.write("Counting total number of files to process, please wait...")
            filestodocount = 0
            for _ in tqdm.tqdm(recwalk(inputpath), file=ptee):
                filestodocount = filestodocount + 1
            ptee.write("Counting done.")

            # Recursively traverse the root directory and save the metadata in the db for each file
            ptee.write("Processing files to compute metadata to store in database, please wait...")
            filescount = 0
            addcount = 0
skipped") continue else: addcount = addcount + 1 # Compute the hashes (leave it outside the with command because generate_hashes() open the file by itself, so that both hashes can be computed in a single sweep of the file at the same time) if not skip_hash: md5hash, sha1hash = generate_hashes(filepath) else: md5hash = sha1hash = 0 # Compute other metadata with open(filepath) as thisfile: # Check file structure if option is enabled if structure_check: struct_result = check_structure(filepath) # Print/Log an error only if there's one (else we won't say anything) if struct_result: ptee.write("\n- Structure error with file " + filepath + ": " + struct_result) ext = os.path.splitext(filepath)[1] # File's extension statinfos = os.stat( filepath) # Various OS filesystem infos about the file size = statinfos.st_size # File size lastmodif = statinfos.st_mtime # File last modified date (as a timestamp) lastmodif_readable = datetime.datetime.fromtimestamp( lastmodif ).strftime( "%Y-%m-%d %H:%M:%S" ) # File last modified date as a human readable date (ISO universal time) csv_row = [ path2unix(relfilepath), md5hash, sha1hash, lastmodif, lastmodif_readable, size, ext ] # Prepare the CSV row csv_writer.writerow(csv_row) # Save to the file ptee.write("----------------------------------------------------") ptee.write("All files processed: Total: %i - Added: %i.\n\n" % (filescount, addcount)) # -- Filescraping recovery mode # We will compare all files from the input path and reorganize the ones that are recognized into the output path elif filescraping: import shutil ptee.write("====================================") ptee.write("RIFGC File Scraping Recovery started on %s" % datetime.datetime.now().isoformat()) ptee.write("====================================") ptee.write("Loading the database into memory, please wait...") md5list = {} sha1list = {} dbrows = { } # TODO: instead of memorizing everything in memory, store just the reading cursor position at the beginning of the line with the size and then just read when necessary from the db file directly id = 0 with open(database, 'rb') as db: for row in csv.DictReader(db, lineterminator='\n', delimiter='|', quotechar='"'): id += 1 if (len(row['md5']) > 0 and len(row['sha1']) > 0): md5list[row['md5']] = id sha1list[row['sha1']] = id dbrows[id] = row ptee.write("Loading done.") if len(dbrows) == 0: ptee.write( "Nothing to do, there's no md5 nor sha1 hashes in the database file!" ) del ptee return 1 # return with an error # Counting the total number of files that we will have to process ptee.write("Counting total number of files to process, please wait...") filestodocount = 0 for _ in tqdm.tqdm(recwalk(inputpath), file=ptee): filestodocount = filestodocount + 1 ptee.write("Counting done.") # Recursively traversing the root directory and save the metadata in the db for each file ptee.write( "Processing file scraping recovery, walking through all files from input folder..." 
        filescount = 0
        copiedcount = 0
        for (dirpath, filename) in tqdm.tqdm(recwalk(inputpath), file=ptee, total=filestodocount, leave=True):
            filescount = filescount + 1
            # Get full absolute filepath
            filepath = os.path.join(dirpath, filename)
            # Get database relative path (from the scanning root folder)
            relfilepath = path2unix(os.path.relpath(filepath, rootfolderpath))  # File relative path from the root (we truncate the rootfolderpath so that we can easily check the files later even if the absolute path is different)
            if verbose: ptee.write("\n- Processing file %s" % relfilepath)

            # Generate the hashes from the currently inspected file
            md5hash, sha1hash = generate_hashes(filepath)
            # If it matches a file in the database, we will copy it over with the correct name, directory structure, file extension and last modification date
            if md5hash in md5list and sha1hash in sha1list and md5list[md5hash] == sha1list[sha1hash]:
                # Load the db infos for this file
                row = dbrows[md5list[md5hash]]
                ptee.write("- Found: %s --> %s.\n" % (filepath, row['path']))
                # Generate full absolute filepath of the output file
                outfilepath = os.path.join(outputpath, row['path'])
                # Recursively create the directory tree structure
                outfiledir = os.path.dirname(outfilepath)
                if not os.path.isdir(outfiledir):
                    os.makedirs(outfiledir)  # if the target directory does not exist, create it (and recursively create all parent directories too)
                # Copy over and set attributes
                shutil.copy2(filepath, outfilepath)
                filestats = os.stat(filepath)
                os.utime(outfilepath, (filestats.st_atime, float(row['last_modification_timestamp'])))
                # Counter...
                copiedcount += 1
        ptee.write("----------------------------------------------------")
        ptee.write("All files processed: Total: %i - Recovered: %i.\n\n" % (filescount, copiedcount))

    # -- Check mode: check the files using a database file
    elif not update and not generate and not filescraping:
        ptee.write("====================================")
        ptee.write("RIFGC Check started on %s" % datetime.datetime.now().isoformat())
        ptee.write("====================================")

        # Open errors file if supplied (where we will store every error in a formatted csv so that it can later be easily processed by other software, such as repair software)
        if errors_file is not None:
            efile = open(errors_file, 'wb')
            e_writer = csv.writer(efile, delimiter='|', lineterminator='\n', quotechar='"')

        # Precompute the total number of lines to process (this should be fairly quick)
        filestodocount = 0
        with open(database, 'rb') as dbf:
            for row in csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"'):
                filestodocount = filestodocount + 1

            # Processing the files using the database list
            ptee.write("Checking for file corruption based on database %s on input path %s, please wait..." % (database, inputpath))
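            # Each database row is compared against the file currently on disk: hashes
            # (unless --skip_hash), image structure (if --structure_check), extension,
            # size and last modification date (unless -m). Any mismatch is recorded as
            # an error for that file.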
            dbf.seek(0)
            dbfile = csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"')  # we need to reopen the file to put the reading cursor (the generator position) back to the beginning
            errorscount = 0
            filescount = 0
            for row in tqdm.tqdm(dbfile, file=ptee, total=filestodocount, leave=True):
                filescount = filescount + 1
                filepath = os.path.join(rootfolderpath, row['path'])

                # Single-file mode: skip if this is not the file we are looking for
                if inputpath != rootfolderpath and inputpath != filepath:
                    continue

                if verbose: ptee.write("\n- Processing file %s" % row['path'])
                errors = []
                if not os.path.isfile(filepath):
                    if not skip_missing:
                        errors.append('file is missing')
                # First generate the current file's metadata given the filepath from the CSV, and then we will check the differences against the database
                else:
                    try:  # Try to be resilient to various file access errors
                        # Generate hashes
                        if not skip_hash:
                            md5hash, sha1hash = generate_hashes(filepath)
                        else:
                            md5hash = sha1hash = 0
                        # Check structure integrity if enabled
                        if structure_check:
                            struct_result = check_structure(filepath)
                            if struct_result:
                                errors.append("structure error (%s)" % struct_result)
                        # Compute other metadata
                        with open(filepath) as thisfile:
                            ext = os.path.splitext(filepath)[1]
                            statinfos = os.stat(filepath)
                            size = statinfos.st_size
                            lastmodif = statinfos.st_mtime
                            lastmodif_readable = datetime.datetime.fromtimestamp(lastmodif).strftime("%Y-%m-%d %H:%M:%S")

                            # CHECK THE DIFFERENCES
                            if not skip_hash and md5hash != row['md5'] and sha1hash != row['sha1']:
                                errors.append('both md5 and sha1 hashes failed')
                            elif not skip_hash and ((md5hash == row['md5'] and sha1hash != row['sha1']) or (md5hash != row['md5'] and sha1hash == row['sha1'])):
                                errors.append('one of the hashes failed but not the other (which may indicate that the database file is corrupted)')
                            if ext != row['ext']:
                                errors.append('extension has changed')
                            if size != int(row['size']):
                                errors.append("size has changed (before: %s - now: %s)" % (row['size'], size))
                            if not disable_modification_date_checking and (lastmodif != float(row['last_modification_timestamp']) and round(lastmodif, 0) != round(float(row['last_modification_timestamp']), 0)):  # for usage with PyPy: last modification time is managed differently (rounded), thus we need to round here manually to compare against PyPy.
                                errors.append("modification date has changed (before: %s - now: %s)" % (row['last_modification_date'], lastmodif_readable))
                    except IOError as e:  # Catch IOError as a file error
                        errors.append('file can\'t be read, IOError (inaccessible, maybe bad sector?)')
                    except Exception as e:  # Any other exception when accessing the file will also be caught as a file error
                        errors.append('file can\'t be accessed: %s' % e)

                # Print/Log all errors for this file if any happened
                if errors:
                    errorscount = errorscount + 1
                    ptee.write("\n- Error for file %s: %s." % (row['path'], ', '.join(errors)))

                    # Write the error in a csv file if supplied (for easy processing later by other software such as file repair software)
                    if errors_file is not None:
                        e_writer.writerow([row['path'], ', '.join(errors)])

        # END OF CHECKING: show some stats
        if errors_file is not None:
            efile.close()  # close the errors file so that everything is flushed to disk
        ptee.write("----------------------------------------------------")
        ptee.write("All files checked: Total: %i - Files with errors: %i.\n\n" % (filescount, errorscount))
        retval = (errorscount > 0)

    del ptee
    return retval  # return error code if any
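    # Note on the return value (illustrative): a caller can pass it straight to sys.exit(),
    # since it is 0/False when all checked files are intact and 1/True when at least one
    # file is missing or corrupted (the file scraping branch also returns 1 when the
    # database contains no usable hashes).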