def relpath_posix(recwalk_result, pardir, fromwinpath=False):
    ''' Helper function to convert all paths to relative posix like paths (to ease comparison) '''
    return recwalk_result[0], path2unix(os.path.join(os.path.relpath(recwalk_result[0], pardir), recwalk_result[1]), nojoin=True, fromwinpath=fromwinpath)
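
# Illustrative example (hypothetical paths, not executed): for a recwalk()
# result ('/data/copy1/sub', 'file.txt') walked from pardir='/data/copy1',
# relpath_posix() returns the walked directory unchanged plus the file's path
# relative to pardir ('sub/file.txt') normalized to posix style (with
# nojoin=True, path2unix is assumed to return the split path components rather
# than a joined string), so that the same file found in different copies
# yields an identical relative path, whatever the absolute location of each copy.
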
def synchronize_files(inputpaths, outpath, database=None, tqdm_bar=None, report_file=None, ptee=None, verbose=False):
    ''' Main function to synchronize files contents by majority vote
    The main job of this function is to walk through the input folders and align the files, so that we can compare every file across every folder, one by one.
    The whole trick here is to align files so that we don't need to keep all the files in memory, and we compare all equivalent files together: to do that, we ensure that we walk through the input directories in alphabetical order, and we pick the relative filepath at the top of the alphabetical order. This ensures the alignment of files between different folders, without memorizing the whole tree structures.
    '''
    # (Generator) Files Synchronization Algorithm:
    # Needs a function stable_dir_walking, which will walk through directories recursively but always in the same order on all platforms (same order for files but also for folders), whatever order it is, as long as it is stable.
    # Until there's no file left in any of the input folders to be processed:
    # - curfiles <- load first file for each folder by using stable_dir_walking on each input folder.
    # - curfiles_grouped <- group curfiles_ordered:
    #    * curfiles_ordered <- order curfiles alphabetically (need to separate the relative parent directory and the filename, to account for both without ambiguity)
    #    * curfiles_grouped <- empty list
    #    * curfiles_grouped[0] = add first element in curfiles_ordered
    #    * last_group = 0
    #    * for every subsequent element nextelt in curfiles_ordered:
    #        . if nextelt == curfiles_grouped[last_group][0]: add nextelt into curfiles_grouped[last_group] (the latest group in curfiles_grouped)
    #        . else: create a new group in curfiles_grouped (last_group += 1) and add nextelt into curfiles_grouped[last_group]
    # At this stage, curfiles_grouped[0] should contain a group of files with the same relative filepath from different input folders, and since we used stable_dir_walking, we are guaranteed that this file is the next to be processed in alphabetical order.
    # - Majority vote byte-by-byte for each of curfiles_grouped[0], and output the winning byte to the output file.
    # - Update files list alignment: we now ditch the files in curfiles_grouped[0] from curfiles, and replace them by the next file from each respective folder. Since we processed in alphabetical (or whatever stable) order, the next loaded files will match the files in the other curfiles_grouped groups that we could not process before.
    # At this point (after the loop), all input files have been processed in order, without maintaining the whole files list in memory, just one file per input folder.
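    # Illustrative example of the grouping (hypothetical folders, shown for
    # clarity only): if dir1 contains a.txt and c.txt while dir2 contains
    # a.txt and b.txt, the first pass loads a.txt for both folders, so
    # sort_group() yields a single group and a.txt is merged from both copies.
    # On the next pass curfiles holds c.txt (dir1) and b.txt (dir2), giving
    # two groups: b.txt comes first alphabetically and is processed from dir2
    # alone, then c.txt from dir1 alone. Each group entry is roughly a
    # (folder_index, relative_path_parts) pair, which is how it is consumed below.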
    # Init the files walking generator for each input path
    recgen = [recwalk(path, sorting=True) for path in inputpaths]
    curfiles = {}
    recgen_exhausted = {}
    recgen_exhausted_count = 0
    nbpaths = len(inputpaths)
    retcode = 0

    if not ptee: ptee = sys.stdout

    # Open the report file and write the header
    if report_file is not None:
        rfile = open(report_file, 'wb')
        r_writer = csv.writer(rfile, delimiter='|', lineterminator='\n', quotechar='"')
        r_header = ["filepath"] + ["dir%i" % (i+1) for i in xrange(nbpaths)] + ["hash-correct", "error_code", "errors"]
        r_length = len(r_header)
        r_writer.writerow(r_header)

    # Initialization: load the first batch of files, one for each folder
    for i in xrange(len(recgen)):
        recgen_exhausted[i] = False
        try:
            if curfiles.get(i, None) is None:
                curfiles[i] = relpath_posix(recgen[i].next(), inputpaths[i])[1]
        except StopIteration:
            recgen_exhausted[i] = True
            recgen_exhausted_count += 1

    # Files lists alignment loop
    while recgen_exhausted_count < nbpaths:
        errcode = 0
        errmsg = None

        # Init a new report's row
        if report_file: r_row = ["-"] * r_length

        # -- Group equivalent relative filepaths together
        #print curfiles # debug
        curfiles_grouped = sort_group(curfiles, True)

        # -- Extract the first group of equivalent filepaths (this allows us to process with the same alphabetical order on all platforms)
        # Note that the remaining files in the other groups will be processed later, because their alphabetical order is higher than the first group's, which means that the first group is the one to be processed now
        to_process = curfiles_grouped[0]
        #print to_process # debug

        # -- Byte-by-byte majority vote on the first group of files
        # We also need the relative filepath (note that there's only one since it's a group of equivalent relative filepaths, only the absolute path differs between files of a same group)
        relfilepath = path2unix(os.path.join(*to_process[0][1]))
        if report_file: r_row[0] = relfilepath
        if verbose: ptee.write("- Processing file %s." % relfilepath)
        # Generate the output path
        outpathfull = os.path.join(outpath, relfilepath)
        create_dir_if_not_exist(os.path.dirname(outpathfull))
        # Initialize the list of absolute filepaths
        fileslist = []
        for elt in to_process:
            i = elt[0]
            fileslist.append(os.path.join(inputpaths[i], os.path.join(*elt[1])))
            if report_file: r_row[i+1] = 'X' # put an X in the report file below each folder that contains this file

        # If there's only one file, just copy it over
        if len(to_process) == 1:
            shutil.copyfile(fileslist[0], outpathfull)
            id = to_process[0][0]
            if report_file: r_row[id+1] = 'O'
        # Else, merge by majority vote
        else:
            # Before-merge check using the rfigc database, if provided
            # If one of the files in the input folders is already correct, just copy it over
            correct_file = None
            if database:
                for id, filepath in enumerate(fileslist):
                    if rfigc.main("-i \"%s\" -d \"%s\" -m --silent" % (filepath, database)) == 0:
                        correct_file = filepath
                        correct_id = to_process[id][0]
                        break

            # If one correct file was found, copy it over
            if correct_file:
                create_dir_if_not_exist(os.path.dirname(outpathfull))
                shutil.copyfile(correct_file, outpathfull)
                if report_file:
                    r_row[correct_id+1] = "O"
                    r_row[-3] = "OK"
            # Else, we need to do the majority vote merge
            else:
                # Do the majority vote merge
                errcode, errmsg = majority_vote_byte_scan(relfilepath, fileslist, outpath)

                # After-merge/move check using the rfigc database, if provided
                if database:
                    if rfigc.main("-i \"%s\" -d \"%s\" -m --silent" % (outpathfull, database)) == 1:
                        errcode = 1
                        if report_file: r_row[-3] = "KO"
                        if not errmsg: errmsg = ''
                        errmsg += " File could not be totally repaired according to rfigc database."
                    else:
                        if report_file: r_row[-3] = "OK"
                        if errmsg: errmsg += " But merged file is correct according to rfigc database."

        # Display errors if any
        if errcode:
            if report_file:
                r_row[-2] = "KO"
                r_row[-1] = errmsg
            ptee.write(errmsg)
            retcode = 1
        else:
            if report_file: r_row[-2] = "OK"

        # Save the current report's row
        if report_file: r_writer.writerow(r_row)

        # -- Update files lists alignment (ie, retrieve new files while trying to keep the alignment)
        for elt in to_process:  # for files of the first group (the ones we just processed)
            i = elt[0]
            # Walk their respective folders and load up the next file
            try:
                if not recgen_exhausted.get(i, False):
                    curfiles[i] = relpath_posix(recgen[i].next(), inputpaths[i])[1]
            # If there's no file left in this folder, mark this input folder as exhausted and continue with the others
            except StopIteration:
                curfiles[i] = None
                recgen_exhausted[i] = True
                recgen_exhausted_count += 1

        if tqdm_bar: tqdm_bar.update()
    if tqdm_bar: tqdm_bar.close()

    # Closing the report file
    if report_file:
        # Write the list of directories and the legend
        rfile.write("\n=> Input directories:")
        for id, ipath in enumerate(inputpaths):
            rfile.write("\n\t- dir%i = %s" % ((id+1), ipath))
        rfile.write("\n=> Output directory: %s" % outpath)
        rfile.write("\n=> Legend: X=existing/selected for majority vote, O=only used this file, - = not existing, OK = check correct, KO = check incorrect (file was not recovered)\n")
        # Close the report file handle
        rfile.close()

    return retcode
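
# Illustrative sketch of the byte-wise majority vote performed by
# majority_vote_byte_scan() (kept as a comment: this is a simplified,
# in-memory illustration with a hypothetical helper name, not the actual
# streaming implementation used above):
#
#   from collections import Counter
#
#   def _majority_vote_bytes_sketch(copies):
#       '''Return the byte-wise majority vote of several equal-length byte strings.'''
#       out = bytearray()
#       for column in zip(*copies):  # column = the byte at the same offset in every copy
#           out.append(ord(Counter(column).most_common(1)[0][0]))
#       return bytes(out)
#
#   # With r=3 copies, a single corrupted byte at any offset is outvoted:
#   # _majority_vote_bytes_sketch(["abc", "aXc", "abc"]) == "abc"
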
def main(argv=None):
    if argv is None: # if argv is empty, fetch from the commandline
        argv = sys.argv[1:]
    elif isinstance(argv, basestring): # else if argv is supplied but it's a simple string, we need to parse it into a list of arguments before handing it to argparse or any other argument parser
        argv = shlex.split(argv) # Parse string just like argv using shlex

    #==== COMMANDLINE PARSER ====

    #== Commandline description
    desc = '''Replication Repair
Description: Given a set of directories (or files), try to repair your files by scanning each byte, casting a majority vote among all copies, and then outputting the winning byte. This process is usually called triple-modular redundancy (but here it should be called n-modular redundancy since you can use as many copies as you have).
It is recommended for long term storage to store several copies of your files on different storage mediums. Everything's fine until all your copies are partially corrupted. In this case, this script can help you, by taking advantage of your multiple copies, without requiring a pregenerated ecc file. Just specify the path to every copy, and the script will try to recover them.
Replication can repair exactly r-2 errors using majority vote (you need at least 2 intact copies for the majority vote to work), where r is the number of replications: if r=3, you get a redundancy rate of 1/3, if r=4, the rate is 2/4, etc.
This script can also take advantage of a database generated by rfigc.py to make sure that the recovered files are correct, or to select files that are already correct.

Note: in case the end result is not what you expected, you can try a different order of input directories: in case of ambiguity, the first input folder has precedence over subsequent folders.
Note2: in case some files with the same names are of different lengths, the merging will continue until the longest file is exhausted.
Note3: last modification date is not (yet) accounted for.
'''
    ep = '''Use --gui as the first argument to use with a GUI (via Gooey).
'''

    #== Commandline arguments
    #-- Constructing the parser
    # Use GooeyParser if we want the GUI because it will provide better widgets
    if len(argv) > 0 and (argv[0] == '--gui' and not '--ignore-gooey' in argv):  # pragma: no cover
        # Initialize the Gooey parser
        main_parser = gooey.GooeyParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter)
        # Define Gooey widget types explicitly (because type auto-detection doesn't work quite well)
        widget_dir = {"widget": "DirChooser"}
        widget_filesave = {"widget": "FileSaver"}
        widget_file = {"widget": "FileChooser"}
        widget_text = {"widget": "TextField"}
        widget_multidir = {"widget": "MultiDirChooser"}
    else: # Else in command-line usage, use the standard argparse
        # Delete the special argument to avoid an unrecognized argument error in argparse
        if len(argv) > 0 and '--ignore-gooey' in argv[0]: argv.remove('--ignore-gooey') # this argument is automatically fed by Gooey when the user clicks on Start
        # Initialize the normal argparse parser
        main_parser = argparse.ArgumentParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter)
        # Define dummy dicts to keep compatible with command-line usage
        widget_dir = {}
        widget_filesave = {}
        widget_file = {}
        widget_text = {}
        widget_multidir = {}

    # Required arguments
    main_parser.add_argument('-i', '--input', metavar='"/path/to/copy1/" "/path/to/copy2/" "etc."', type=is_dir_or_file, nargs='+', required=True,
                        help='Specify the paths to every copy you have (minimum 3 copies, else it won\'t work!). Can be folders or files (if you want to repair only one file). Order matters: in case of ambiguity, the first folder where the file exists will be chosen.', **widget_multidir)
    main_parser.add_argument('-o', '--output', metavar='/output/folder/', nargs=1, required=True,
                        help='Where the recovered files will be stored.', **widget_dir)

    # Optional general arguments
    main_parser.add_argument('-d', '--database', metavar='database.csv', type=is_file, required=False,
                        help='Path to a previously generated rfigc.py database. If provided, this will be used to check that the repaired files are correct (and also to find already correct files among the copies).', **widget_file)
    main_parser.add_argument('-r', '--report', metavar='/some/folder/report.csv', type=str, required=False,
                        help='Save all results of the repair process in a report file, with detailed descriptions of ambiguous repairs (ie, when the majority vote came to a draw).', **widget_filesave)
    main_parser.add_argument('-l', '--log', metavar='/some/folder/filename.log', type=str, nargs=1, required=False,
                        help='Path to the log file. (Output will be piped to both the stdout and the log file)', **widget_filesave)
    main_parser.add_argument('-f', '--force', action='store_true', required=False, default=False,
                        help='Force overwriting the output folder even if it already exists.')
    main_parser.add_argument('-v', '--verbose', action='store_true', required=False, default=False,
                        help='Verbose mode (show more output).')
    main_parser.add_argument('--silent', action='store_true', required=False, default=False,
                        help='No console output (but if --log is specified, the log will still be saved in the specified file).')

    #== Parsing the arguments
    args = main_parser.parse_args(argv) # Storing all arguments in args

    #-- Set variables from arguments
    inputpaths = [fullpath(x) for x in args.input] # paths to the files to repair (ie, paths to all the different copies the user has)
    outputpath = fullpath(args.output[0])
    force = args.force
    verbose = args.verbose
    silent = args.silent

    if len(inputpaths) < 3:
        raise Exception('Need at least 3 copies to do a replication repair/majority vote!')

    #if os.path.isfile(inputpath): # if inputpath is a single file (instead of a folder), then define the rootfolderpath as the parent directory (for correct relative path generation, else it will also truncate the filename!)
        #rootfolderpath = os.path.dirname(inputpath)

    report_file = None
    if args.report: report_file = os.path.basename(fullpath(args.report))
    database = None
    if args.database: database = args.database

    # -- Checking arguments
    if os.path.exists(outputpath) and not force:
        raise NameError('Specified output path %s already exists! Use --force if you want to overwrite.' % outputpath)

    if database and not os.path.isfile(database):
        raise NameError('Specified rfigc database file %s does not exist!' % database)

    # -- Configure the log file if enabled (ptee.write() will write to both stdout/console and to the log file)
    if args.log:
        ptee = Tee(args.log[0], 'a', nostdout=silent)
        #sys.stdout = Tee(args.log[0], 'a')
        sys.stderr = Tee(args.log[0], 'a', nostdout=silent)
    else:
        ptee = Tee(nostdout=silent)

    # == PROCESSING BRANCHING == #

    # == Precomputation of the files list and total size
    # Precomputing is important so that the user can know what size to expect before starting (and how much time it will take...).
    filescount = 0
    sizetotal = 0
    sizeheaders = 0
    visitedfiles = {}
    ptee.write("Precomputing list of files and predicted statistics...")
    prebar = tqdm.tqdm(file=ptee, disable=silent)
    for inputpath in inputpaths:
        for (dirpath, filename) in recwalk(inputpath):
            # Get the full absolute filepath
            filepath = os.path.join(dirpath, filename)
            relfilepath = path2unix(os.path.relpath(filepath, inputpath)) # File relative path from the root (we truncate the rootfolderpath so that we can easily check the files later even if the absolute path is different)
            # Only increase the files count if we didn't see this file before
            if not visitedfiles.get(relfilepath, None):
                # Counting the total number of files we will process (so that we can show a progress bar with ETA)
                filescount = filescount + 1
                # Add the file to the list of already visited files
                visitedfiles[relfilepath] = True
                # Get the current file's size
                size = os.stat(filepath).st_size
                # Compute the total size of all files
                sizetotal = sizetotal + size
            prebar.update()
    prebar.close()
    ptee.write("Precomputing done.")

    # == Majority vote repair
    # For each folder, align the files lists and then majority vote over each byte to repair
    ptee.write("====================================")
    ptee.write("Replication repair, started on %s" % datetime.datetime.now().isoformat())
    ptee.write("====================================")

    # Prepare the progress bar if necessary
    if silent:
        tqdm_bar = None
    else:
        tqdm_bar = tqdm.tqdm(total=filescount, file=ptee, leave=True, unit="files")
    # Call the main function to synchronize files using majority vote
    errcode = synchronize_files(inputpaths, outputpath, database=database, tqdm_bar=tqdm_bar, report_file=report_file, ptee=ptee, verbose=verbose)
    #ptee.write("All done! Stats:\n- Total files processed: %i\n- Total files corrupted: %i\n- Total files repaired completely: %i\n- Total files repaired partially: %i\n- Total files corrupted but not repaired at all: %i\n- Total files skipped: %i" % (files_count, files_corrupted, files_repaired_completely, files_repaired_partially, files_corrupted - (files_repaired_partially + files_repaired_completely), files_skipped) )
    if tqdm_bar: tqdm_bar.close()
    ptee.write("All done!")
    if report_file: ptee.write("Saved replication repair results in report file: %s" % report_file)
    del ptee
    return errcode
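
# Example invocation (hypothetical paths; the script filename is an assumption,
# since it is not stated in this excerpt):
#
#   python replication_repair.py -i "/backups/copy1" "/backups/copy2" "/backups/copy3" \
#       -o "/backups/repaired" -d "dbhash.csv" -r "/backups/report.csv"
#
# or programmatically, passing a single string that main() will split with shlex:
#
#   main('-i "/backups/copy1" "/backups/copy2" "/backups/copy3" -o "/backups/repaired"')
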
def main(argv=None):
    if argv is None: # if argv is empty, fetch from the commandline
        argv = sys.argv[1:]
    elif isinstance(argv, basestring): # else if argv is supplied but it's a simple string, we need to parse it into a list of arguments before handing it to argparse or any other argument parser
        argv = shlex.split(argv) # Parse string just like argv using shlex

    #==== COMMANDLINE PARSER ====

    #== Commandline description
    desc = '''Recursive/Relative Files Integrity Generator and Checker
Description: Recursively generate or check the integrity of files by MD5 and SHA1 hashes, size, modification date or by data structure integrity (only for images).

This script is originally meant to be used for data archival, by allowing an easy way to check for silent file corruption. Thus, this script uses relative paths so that you can easily compute and check the same redundant data copied on different mediums (hard drives, optical discs, etc.). This script is not meant for system files corruption notification, but is more meant to be used from time to time to check up on your data archives' integrity.
'''
    ep = '''Example usage:
- To generate the database (only needed once):
python rfigc.py -i "folderimages" -d "dbhash.csv" -g
- To check:
python rfigc.py -i "folderimages" -d "dbhash.csv" -l log.txt -s
- To update your database by appending new files:
python rfigc.py -i "folderimages" -d "dbhash.csv" -u -a
- To update your database by appending new files AND removing missing files:
python rfigc.py -i "folderimages" -d "dbhash.csv" -u -a -r
- To use with a gui:
python rfigc.py --gui

Note that the script is in check mode by default, to avoid wrong manipulations. It will also alert you if you generate over an already existing database file.
Note2: you can use PyPy to speed up the generation, but you should avoid using PyPy when in checking mode (from our tests, it will slow things down a lot).
'''

    #== Commandline arguments
    #-- Constructing the parser
    # Use GooeyParser if we want the GUI because it will provide better widgets
    if len(argv) > 0 and (argv[0] == '--gui' and not '--ignore-gooey' in argv):  # pragma: no cover
        # Initialize the Gooey parser
        main_parser = gooey.GooeyParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter)
        # Define Gooey widget types explicitly (because type auto-detection doesn't work quite well)
        widget_dir = {"widget": "DirChooser"}
        widget_filesave = {"widget": "FileSaver"}
        widget_file = {"widget": "FileChooser"}
        widget_text = {"widget": "TextField"}
    else: # Else in command-line usage, use the standard argparse
        # Delete the special argument to avoid an unrecognized argument error in argparse
        if len(argv) > 0 and '--ignore-gooey' in argv[0]: argv.remove('--ignore-gooey') # this argument is automatically fed by Gooey when the user clicks on Start
        # Initialize the normal argparse parser
        main_parser = argparse.ArgumentParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter)
        # Define dummy dicts to keep compatible with command-line usage
        widget_dir = {}
        widget_filesave = {}
        widget_file = {}
        widget_text = {}

    # Required arguments
    main_parser.add_argument('-i', '--input', metavar='/path/to/root/folder', type=is_dir_or_file, nargs=1, required=True,
                        help='Path to the root folder (or a single file) from where the scanning will occur.', **widget_dir)
    main_parser.add_argument('-d', '--database', metavar='/some/folder/databasefile.csv', type=str, nargs=1, required=True, #type=argparse.FileType('rt')
                        help='Path to the csv file containing the hash information.', **widget_filesave)

    # Optional general arguments
    main_parser.add_argument('-l', '--log', metavar='/some/folder/filename.log', type=str, nargs=1, required=False,
                        help='Path to the log file. (Output will be piped to both the stdout and the log file)', **widget_filesave)
    main_parser.add_argument('--skip_hash', action='store_true', required=False, default=False,
                        help='Skip hash computation/checking (checks only the other metadata, this is a lot quicker).')
    main_parser.add_argument('-v', '--verbose', action='store_true', required=False, default=False,
                        help='Verbose mode (show more output).')
    main_parser.add_argument('--silent', action='store_true', required=False, default=False,
                        help='No console output (but if --log is specified, the log will still be saved in the specified file).')

    # Checking mode arguments
    main_parser.add_argument('-s', '--structure_check', action='store_true', required=False, default=False,
                        help='Check images structures for corruption?')
    main_parser.add_argument('-e', '--errors_file', metavar='/some/folder/errorsfile.csv', type=str, nargs=1, required=False, #type=argparse.FileType('rt')
                        help='Path to the error file, where errors at checking will be stored in CSV for further processing by other software (such as file repair software).', **widget_filesave)
    main_parser.add_argument('-m', '--disable_modification_date_checking', action='store_true', required=False, default=False,
                        help='Disable modification date checking.')
    main_parser.add_argument('--skip_missing', action='store_true', required=False, default=False,
                        help='Skip missing files when checking (useful if you split your files into several mediums, for example on optical discs with limited capacity).')

    # Generate mode arguments
    main_parser.add_argument('-g', '--generate', action='store_true', required=False, default=False,
                        help='Generate the database? (omit this parameter to check instead of generating).')
    main_parser.add_argument('-f', '--force', action='store_true', required=False, default=False,
                        help='Force overwriting the database file even if it already exists (if --generate).')

    # Update mode arguments
    main_parser.add_argument('-u', '--update', action='store_true', required=False, default=False,
                        help='Update database (you must also specify --append or --remove).')
    main_parser.add_argument('-a', '--append', action='store_true', required=False, default=False,
                        help='Append new files (if --update).')
    main_parser.add_argument('-r', '--remove', action='store_true', required=False, default=False,
                        help='Remove missing files (if --update).')

    # Recover from file scraping
    main_parser.add_argument('--filescraping_recovery', action='store_true', required=False, default=False,
                        help='Given a folder of unorganized files, compare to the database and restore the filename and directory structure into the output folder.')
    main_parser.add_argument('-o', '--output', metavar='/path/to/root/folder', type=is_dir, nargs=1, required=False,
                        help='Path to the output folder where to output (copy) the files reorganized after --recover_from_filescraping.', **widget_dir)

    #== Parsing the arguments
    args = main_parser.parse_args(argv) # Storing all arguments in args

    #-- Set variables from arguments
    inputpath = fullpath(args.input[0]) # path to the files to protect (either a folder or a single file)
    rootfolderpath = inputpath # path to the root folder (to compute relative paths)
    #database = os.path.basename(fullpath(args.database[0])) # Take only the filename.
    database = fullpath(args.database[0])
    generate = args.generate
    structure_check = args.structure_check
    force = args.force
    disable_modification_date_checking = args.disable_modification_date_checking
    skip_missing = args.skip_missing
    skip_hash = args.skip_hash
    update = args.update
    append = args.append
    remove = args.remove
    outputpath = None
    if args.output: outputpath = fullpath(args.output[0])
    filescraping = args.filescraping_recovery
    verbose = args.verbose
    silent = args.silent

    if os.path.isfile(inputpath): # if inputpath is a single file (instead of a folder), then define the rootfolderpath as the parent directory (for correct relative path generation, else it will also truncate the filename!)
        rootfolderpath = os.path.dirname(inputpath)

    errors_file = None
    if args.errors_file: errors_file = fullpath(args.errors_file[0])

    # -- Checking arguments
    if structure_check and not structure_check_import:
        raise ImportError('PIL (Python Imaging Library) could not be imported. PIL is needed to do structure check, please install PIL (or you can disable structure check to continue).')

    if update and (not append and not remove):
        raise ValueError('--update specified but not --append nor --remove. You must specify at least one of these modes when using --update!')

    if filescraping and not outputpath:
        raise ValueError('Output path needed when --recover_from_filescraping.')

    # -- Configure the log file if enabled (ptee.write() will write to both stdout/console and to the log file)
    if args.log:
        ptee = Tee(args.log[0], 'a', nostdout=silent)
        #sys.stdout = Tee(args.log[0], 'a')
        sys.stderr = Tee(args.log[0], 'a', nostdout=silent)
    else:
        ptee = Tee(nostdout=silent)

    # == PROCESSING BRANCHING == #
    retval = 0 # Returned value: 0 OK, 1 KO (files in error), -1 Error

    # -- Update the database file by removing missing files
    if update and remove:
        if not os.path.isfile(database):
            raise NameError('Specified database file does not exist, can\'t update!')

        ptee.write("====================================")
        ptee.write("RIFGC Database Update Removal of missing files, started on %s" % datetime.datetime.now().isoformat())
        ptee.write("====================================")

        # Precompute the total number of lines to process (this should be fairly quick)
        filestodocount = 0
        with open(database, 'rb') as dbf:
            for row in csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"'):
                filestodocount = filestodocount + 1

            # Preparing the CSV writer for the temporary file that will have the lines removed
            with open(database+'.rem', 'wb') as dbfilerem:
                csv_writer = csv.writer(dbfilerem, lineterminator='\n', delimiter='|', quotechar='"')
                # Printing CSV headers
                csv_headers = ['path', 'md5', 'sha1', 'last_modification_timestamp', 'last_modification_date', 'size', 'ext']
                csv_writer.writerow(csv_headers)

                dbf.seek(0)
                dbfile = csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"') # we need to reopen the file to put the reading cursor (the generator position) back to the beginning
                delcount = 0
                filescount = 0
                for row in tqdm.tqdm(dbfile, file=ptee, total=filestodocount, leave=True):
                    filescount = filescount + 1
                    filepath = os.path.join(rootfolderpath, row['path']) # Build the absolute file path

                    # Single-file mode: skip if this is not the file we are looking for
                    if inputpath != rootfolderpath and inputpath != filepath: continue

                    if verbose: ptee.write("\n- Processing file %s" % row['path'])
                    errors = []
                    if not os.path.isfile(filepath):
                        delcount = delcount + 1
                        ptee.write("\n- File %s is missing, removed from database." % row['path'])
                    else:
                        csv_writer.writerow( [ path2unix(row['path']), row['md5'], row['sha1'], row['last_modification_timestamp'], row['last_modification_date'], row['size'], row['ext'] ] )

        # REMOVE UPDATE DONE, we remove the old database file and replace it with the new one
        os.remove(database) # delete old database
        os.rename(database+'.rem', database) # rename new database to match old name

        # Show some stats
        ptee.write("----------------------------------------------------")
        ptee.write("All files processed: Total: %i - Removed/Missing: %i.\n\n" % (filescount, delcount))

    # -- Generate the database file or update/append (both will walk through the filesystem to get new files, contrary to the other branches which walk through the database csv)
    if generate or (update and append):
        if not force and os.path.isfile(database) and not update:
            raise NameError('Database file already exists. Please choose another name to generate your database file.')

        if generate: dbmode = 'wb'
        elif (update and append): dbmode = 'ab'
        with open(database, dbmode) as dbfile: # Must open in write + binary, because on Windows it will do weird things otherwise (at least with Python 2.7)
            ptee.write("====================================")
            if generate: ptee.write("RIFGC Database Generation started on %s" % datetime.datetime.now().isoformat())
            elif update and append: ptee.write("RIFGC Database Update Append new files, started on %s" % datetime.datetime.now().isoformat())
            ptee.write("====================================")

            # Preparing the CSV writer
            csv_writer = csv.writer(dbfile, lineterminator='\n', delimiter='|', quotechar='"')

            if generate:
                # Printing CSV headers
                csv_headers = ['path', 'md5', 'sha1', 'last_modification_timestamp', 'last_modification_date', 'size', 'ext']
                csv_writer.writerow(csv_headers)

            if (update and append):
                # Extract all paths already stored in the database to avoid re-adding them
                db_paths = {}
                with open(database, 'rb') as dbf:
                    for row in csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"'):
                        db_paths[row['path']] = True

            # Counting the total number of files that we will have to process
            ptee.write("Counting total number of files to process, please wait...")
            filestodocount = 0
            for _ in tqdm.tqdm(recwalk(inputpath), file=ptee):
                filestodocount = filestodocount + 1
            ptee.write("Counting done.")

            # Recursively traverse the root directory and save the metadata in the db for each file
            ptee.write("Processing files to compute metadata to store in database, please wait...")
            filescount = 0
            addcount = 0
            for (dirpath, filename) in tqdm.tqdm(recwalk(inputpath), file=ptee, total=filestodocount, leave=True):
                filescount = filescount + 1
                # Get the full absolute filepath
                filepath = os.path.join(dirpath, filename)
                # Get the database relative path (from the scanning root folder)
                relfilepath = path2unix(os.path.relpath(filepath, rootfolderpath)) # File relative path from the root (so that we can easily check the files later even if the absolute path is different)
                if verbose: ptee.write("\n- Processing file %s" % relfilepath)

                # If update + append mode, then if the file is already in the database we skip it (we continue computing metadata only for new files)
                if update and append and relfilepath in db_paths:
                    if verbose: ptee.write("... skipped")
                    continue
                else:
                    addcount = addcount + 1

                # Compute the hashes (leave it outside the with statement because generate_hashes() opens the file by itself, so that both hashes can be computed in a single sweep of the file at the same time)
                if not skip_hash:
                    md5hash, sha1hash = generate_hashes(filepath)
                else:
                    md5hash = sha1hash = 0
                # Compute other metadata
                with open(filepath) as thisfile:
                    # Check the file structure if the option is enabled
                    if structure_check:
                        struct_result = check_structure(filepath)
                        # Print/Log an error only if there's one (else we won't say anything)
                        if struct_result:
                            ptee.write("\n- Structure error with file "+filepath+": "+struct_result)
                    ext = os.path.splitext(filepath)[1] # File's extension
                    statinfos = os.stat(filepath) # Various OS filesystem infos about the file
                    size = statinfos.st_size # File size
                    lastmodif = statinfos.st_mtime # File last modified date (as a timestamp)
                    lastmodif_readable = datetime.datetime.fromtimestamp(lastmodif).strftime("%Y-%m-%d %H:%M:%S") # File last modified date as a human readable date (ISO universal time)

                    csv_row = [path2unix(relfilepath), md5hash, sha1hash, lastmodif, lastmodif_readable, size, ext] # Prepare the CSV row
                    csv_writer.writerow(csv_row) # Save to the file
        ptee.write("----------------------------------------------------")
        ptee.write("All files processed: Total: %i - Added: %i.\n\n" % (filescount, addcount))

    # -- Filescraping recovery mode
    # We will compare all files from the input path and reorganize the ones that are recognized into the output path
    elif filescraping:
        import shutil
        ptee.write("====================================")
        ptee.write("RIFGC File Scraping Recovery started on %s" % datetime.datetime.now().isoformat())
        ptee.write("====================================")

        ptee.write("Loading the database into memory, please wait...")
        md5list = {}
        sha1list = {}
        dbrows = {} # TODO: instead of memorizing everything in memory, store just the reading cursor position at the beginning of the line with the size, and then just read when necessary from the db file directly
        id = 0
        with open(database, 'rb') as db:
            for row in csv.DictReader(db, lineterminator='\n', delimiter='|', quotechar='"'):
                id += 1
                if (len(row['md5']) > 0 and len(row['sha1']) > 0):
                    md5list[row['md5']] = id
                    sha1list[row['sha1']] = id
                    dbrows[id] = row
        ptee.write("Loading done.")

        if len(dbrows) == 0:
            ptee.write("Nothing to do, there are no md5 nor sha1 hashes in the database file!")
            del ptee
            return 1 # return with an error

        # Counting the total number of files that we will have to process
        ptee.write("Counting total number of files to process, please wait...")
        filestodocount = 0
        for _ in tqdm.tqdm(recwalk(inputpath), file=ptee):
            filestodocount = filestodocount + 1
        ptee.write("Counting done.")

        # Recursively traverse the input directory and recover the files that are recognized
        ptee.write("Processing file scraping recovery, walking through all files from input folder...")
        filescount = 0
        copiedcount = 0
        for (dirpath, filename) in tqdm.tqdm(recwalk(inputpath), file=ptee, total=filestodocount, leave=True):
            filescount = filescount + 1
            # Get the full absolute filepath
            filepath = os.path.join(dirpath, filename)
            # Get the database relative path (from the scanning root folder)
            relfilepath = path2unix(os.path.relpath(filepath, rootfolderpath)) # File relative path from the root (we truncate the rootfolderpath so that we can easily check the files later even if the absolute path is different)
            if verbose: ptee.write("\n- Processing file %s" % relfilepath)

            # Generate the hashes from the currently inspected file
            md5hash, sha1hash = generate_hashes(filepath)
            # If it matches a file in the database, we will copy it over with the correct name, directory structure, file extension and last modification date
            if md5hash in md5list and sha1hash in sha1list and md5list[md5hash] == sha1list[sha1hash]:
                # Load the db infos for this file
                row = dbrows[md5list[md5hash]]
                ptee.write("- Found: %s --> %s.\n" % (filepath, row['path']))
                # Generate the full absolute filepath of the output file
                outfilepath = os.path.join(outputpath, row['path'])
                # Recursively create the directory tree structure
                outfiledir = os.path.dirname(outfilepath)
                if not os.path.isdir(outfiledir): os.makedirs(outfiledir) # if the target directory does not exist, create it (and recursively create all parent directories too)
                # Copy over and set attributes
                shutil.copy2(filepath, outfilepath)
                filestats = os.stat(filepath)
                os.utime(outfilepath, (filestats.st_atime, float(row['last_modification_timestamp'])))
                # Counter...
                copiedcount += 1
        ptee.write("----------------------------------------------------")
        ptee.write("All files processed: Total: %i - Recovered: %i.\n\n" % (filescount, copiedcount))

    # -- Check mode: check the files using a database file
    elif not update and not generate and not filescraping:
        ptee.write("====================================")
        ptee.write("RIFGC Check started on %s" % datetime.datetime.now().isoformat())
        ptee.write("====================================")

        # Open the errors file if supplied (where we will store every error in a formatted csv so that it can later be easily processed by other software, such as file repair software)
        if errors_file is not None:
            efile = open(errors_file, 'wb')
            e_writer = csv.writer(efile, delimiter='|', lineterminator='\n', quotechar='"')

        # Precompute the total number of lines to process (this should be fairly quick)
        filestodocount = 0
        with open(database, 'rb') as dbf:
            for row in csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"'):
                filestodocount = filestodocount + 1

            # Processing the files using the database list
            ptee.write("Checking for files corruption based on database %s on input path %s, please wait..." % (database, inputpath))
            dbf.seek(0)
            dbfile = csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"') # we need to reopen the file to put the reading cursor (the generator position) back to the beginning
            errorscount = 0
            filescount = 0
            for row in tqdm.tqdm(dbfile, file=ptee, total=filestodocount, leave=True):
                filescount = filescount + 1
                filepath = os.path.join(rootfolderpath, row['path'])

                # Single-file mode: skip if this is not the file we are looking for
                if inputpath != rootfolderpath and inputpath != filepath: continue

                if verbose: ptee.write("\n- Processing file %s" % row['path'])
                errors = []
                if not os.path.isfile(filepath):
                    if not skip_missing: errors.append('file is missing')
                # First generate the current file's metadata given the filepath from the CSV, and then we will check the differences against the database
                else:
                    try: # Try to be resilient to various file access errors
                        # Generate hashes
                        if not skip_hash:
                            md5hash, sha1hash = generate_hashes(filepath)
                        else:
                            md5hash = sha1hash = 0
                        # Check structure integrity if enabled
                        if structure_check:
                            struct_result = check_structure(filepath)
                            if struct_result:
                                errors.append("structure error (%s)" % struct_result)
                        # Compute other metadata
                        with open(filepath) as thisfile:
                            ext = os.path.splitext(filepath)[1]
                            statinfos = os.stat(filepath)
                            size = statinfos.st_size
                            lastmodif = statinfos.st_mtime
                            lastmodif_readable = datetime.datetime.fromtimestamp(lastmodif).strftime("%Y-%m-%d %H:%M:%S")

                            # CHECK THE DIFFERENCES
                            if not skip_hash and md5hash != row['md5'] and sha1hash != row['sha1']:
                                errors.append('both md5 and sha1 hashes failed')
                            elif not skip_hash and ((md5hash == row['md5'] and sha1hash != row['sha1']) or (md5hash != row['md5'] and sha1hash == row['sha1'])):
                                errors.append('one of the hashes failed but not the other (which may indicate that the database file is corrupted)')
                            if ext != row['ext']:
                                errors.append('extension has changed')
                            if size != int(row['size']):
                                errors.append("size has changed (before: %s - now: %s)" % (row['size'], size))
                            if not disable_modification_date_checking and (lastmodif != float(row['last_modification_timestamp']) and round(lastmodif, 0) != round(float(row['last_modification_timestamp']), 0)): # for usage with PyPy: last modification time is managed differently (rounded), thus we need to round here manually to compare against PyPy.
                                errors.append("modification date has changed (before: %s - now: %s)" % (row['last_modification_date'], lastmodif_readable))
                    except IOError as e: # Catch IOError as a file error
                        errors.append('file can\'t be read, IOError (inaccessible, maybe bad sector?)')
                    except Exception as e: # Any other exception when accessing the file will also be caught as a file error
                        errors.append('file can\'t be accessed: %s' % e)
                # Print/Log all errors for this file if any happened
                if errors:
                    errorscount = errorscount + 1
                    ptee.write("\n- Error for file %s: %s." % (row['path'], ', '.join(errors)))
                    if errors_file is not None: # Write the error in a csv file if supplied (for easy processing later by other software such as file repair software)
                        e_writer.writerow( [row['path'], ', '.join(errors)] )
        # END OF CHECKING: show some stats
        ptee.write("----------------------------------------------------")
        ptee.write("All files checked: Total: %i - Files with errors: %i.\n\n" % (filescount, errorscount))
        retval = (errorscount > 0)

    del ptee
    return retval # return error code if any
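
# Illustrative sketch of what generate_hashes() is expected to do (a
# hypothetical reimplementation shown for clarity only; the real helper is
# defined elsewhere in this project):
#
#   import hashlib
#
#   def _generate_hashes_sketch(filepath, blocksize=65536):
#       '''Compute the MD5 and SHA1 hashes in a single pass over the file.'''
#       md5, sha1 = hashlib.md5(), hashlib.sha1()
#       with open(filepath, 'rb') as f:
#           for block in iter(lambda: f.read(blocksize), ''):
#               md5.update(block)
#               sha1.update(block)
#       return md5.hexdigest(), sha1.hexdigest()
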
def main(argv=None):
    if argv is None:  # if argv is empty, fetch from the commandline
        argv = sys.argv[1:]
    elif isinstance(argv, basestring):  # else if argv is supplied as a plain string, parse it into a list of arguments before handing it to argparse (or any other argument parser)
        argv = shlex.split(argv)  # Parse the string just like argv, using shlex

    #==== COMMANDLINE PARSER ====

    #== Commandline description
    desc = '''Recursive/Relative Files Integrity Generator and Checker
Description: Recursively generate or check the integrity of files by MD5 and SHA1 hashes, size, modification date or by data structure integrity (only for images).

This script is originally meant to be used for data archival, by allowing an easy way to check for silent file corruption. Thus, this script uses relative paths so that you can easily compute and check the same redundant data copied on different mediums (hard drives, optical discs, etc.). This script is not meant for system files corruption notification, but is rather meant to be used from time to time to check up on your data archives' integrity.
'''
    ep = '''Example usage:
- To generate the database (only needed once):
  python rfigc.py -i "folderimages" -d "dbhash.csv" -g
- To check:
  python rfigc.py -i "folderimages" -d "dbhash.csv" -l log.txt -s
- To update your database by appending new files:
  python rfigc.py -i "folderimages" -d "dbhash.csv" -u -a
- To update your database by appending new files AND removing inexistent files:
  python rfigc.py -i "folderimages" -d "dbhash.csv" -u -a -r
- To use with a gui:
  python rfigc.py --gui

Note that the script is in check mode by default, to avoid wrong manipulations. It will also alert you if you generate over an already existing database file.
Note2: you can use PyPy to speed up the generation, but you should avoid using PyPy in checking mode (from our tests, it will slow things down a lot).
'''

    #== Commandline arguments
    #-- Constructing the parser
    # Use GooeyParser if we want the GUI, because it will provide better widgets
    if len(argv) > 0 and (argv[0] == '--gui' and not '--ignore-gooey' in argv):  # pragma: no cover
        # Initialize the Gooey parser
        main_parser = gooey.GooeyParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter)
        # Define Gooey widget types explicitly (because type auto-detection doesn't work quite well)
        widget_dir = {"widget": "DirChooser"}
        widget_filesave = {"widget": "FileSaver"}
        widget_file = {"widget": "FileChooser"}
        widget_text = {"widget": "TextField"}
    else:  # Else, in command-line usage, use the standard argparse
        # Delete the special argument to avoid an unrecognized argument error in argparse
        if '--ignore-gooey' in argv:  # this argument is automatically fed by Gooey when the user clicks on Start
            argv.remove('--ignore-gooey')
        # Initialize the normal argparse parser
        main_parser = argparse.ArgumentParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter)
        # Define dummy dicts to stay compatible with command-line usage
        widget_dir = {}
        widget_filesave = {}
        widget_file = {}
        widget_text = {}

    # Required arguments
    main_parser.add_argument('-i', '--input', metavar='/path/to/root/folder', type=is_dir_or_file, nargs=1, required=True,
                             help='Path to the root folder (or a single file) from where the scanning will occur.', **widget_dir)
    main_parser.add_argument('-d', '--database', metavar='/some/folder/databasefile.csv', type=str, nargs=1, required=True,  #type=argparse.FileType('rt')
                             help='Path to the csv file containing the hash information.', **widget_filesave)

    # Optional general arguments
    main_parser.add_argument('-l', '--log', metavar='/some/folder/filename.log', type=str, nargs=1, required=False,
                             help='Path to the log file. (Output will be piped to both the stdout and the log file)', **widget_filesave)
    main_parser.add_argument('--skip_hash', action='store_true', required=False, default=False,
                             help='Skip hash computation/checking (checks only the other metadata, this is a lot quicker).')
    main_parser.add_argument('-v', '--verbose', action='store_true', required=False, default=False,
                             help='Verbose mode (show more output).')
    main_parser.add_argument('--silent', action='store_true', required=False, default=False,
                             help='No console output (but if --log is specified, the log will still be saved in the specified file).')

    # Checking mode arguments
    main_parser.add_argument('-s', '--structure_check', action='store_true', required=False, default=False,
                             help='Check image structures for corruption?')
    main_parser.add_argument('-e', '--errors_file', metavar='/some/folder/errorsfile.csv', type=str, nargs=1, required=False,  #type=argparse.FileType('rt')
                             help='Path to the error file, where errors found at checking will be stored in CSV for further processing by other software (such as file repair software).', **widget_filesave)
    main_parser.add_argument('-m', '--disable_modification_date_checking', action='store_true', required=False, default=False,
                             help='Disable modification date checking.')
    main_parser.add_argument('--skip_missing', action='store_true', required=False, default=False,
                             help='Skip missing files when checking (useful if you split your files across several mediums, for example on optical discs with limited capacity).')

    # Generate mode arguments
    main_parser.add_argument('-g', '--generate', action='store_true', required=False, default=False,
                             help='Generate the database? (omit this parameter to check instead of generating).')
    main_parser.add_argument('-f', '--force', action='store_true', required=False, default=False,
                             help='Force overwriting the database file even if it already exists (if --generate).')

    # Update mode arguments
    main_parser.add_argument('-u', '--update', action='store_true', required=False, default=False,
                             help='Update database (you must also specify --append or --remove).')
    main_parser.add_argument('-a', '--append', action='store_true', required=False, default=False,
                             help='Append new files (if --update).')
    main_parser.add_argument('-r', '--remove', action='store_true', required=False, default=False,
                             help='Remove missing files (if --update).')

    # Recover from file scraping
    main_parser.add_argument('--filescraping_recovery', action='store_true', required=False, default=False,
                             help='Given a folder of unorganized files, compare to the database and restore the filename and directory structure into the output folder.')
    main_parser.add_argument('-o', '--output', metavar='/path/to/root/folder', type=is_dir, nargs=1, required=False,
                             help='Path to the output folder where to output (copy) the files reorganized after --filescraping_recovery.', **widget_dir)

    #== Parsing the arguments
    args = main_parser.parse_args(argv)  # Storing all arguments to args

    #-- Set variables from arguments
    inputpath = fullpath(args.input[0])  # path to the files to protect (either a folder or a single file)
    rootfolderpath = inputpath  # path to the root folder (to compute relative paths)
    #database = os.path.basename(fullpath(args.database[0]))  # Take only the filename.
    database = fullpath(args.database[0])
    generate = args.generate
    structure_check = args.structure_check
    force = args.force
    disable_modification_date_checking = args.disable_modification_date_checking
    skip_missing = args.skip_missing
    skip_hash = args.skip_hash
    update = args.update
    append = args.append
    remove = args.remove
    outputpath = None
    if args.output: outputpath = fullpath(args.output[0])
    filescraping = args.filescraping_recovery
    verbose = args.verbose
    silent = args.silent

    if os.path.isfile(inputpath):  # if inputpath is a single file (instead of a folder), then define the rootfolderpath as the parent directory (for correct relative path generation, else it would also truncate the filename!)
        rootfolderpath = os.path.dirname(inputpath)

    errors_file = None
    if args.errors_file: errors_file = fullpath(args.errors_file[0])

    # -- Checking arguments
    if structure_check and not structure_check_import:
        raise ImportError('PIL (Python Imaging Library) could not be imported. PIL is needed to do structure checks, please install PIL (or you can disable structure checks to continue).')

    if update and (not append and not remove):
        raise ValueError('--update specified but neither --append nor --remove. You must specify at least one of these modes when using --update!')

    if filescraping and not outputpath:
        raise ValueError('Output path needed when using --filescraping_recovery.')

    # -- Configure the log file if enabled (ptee.write() will write to both stdout/console and to the log file)
    if args.log:
        ptee = Tee(args.log[0], 'a', nostdout=silent)
        #sys.stdout = Tee(args.log[0], 'a')
        sys.stderr = Tee(args.log[0], 'a', nostdout=silent)
    else:
        ptee = Tee(nostdout=silent)

    # == PROCESSING BRANCHING == #
    retval = 0  # Returned value: 0 OK, 1 KO (files in error), -1 Error

    # -- Update the database file by removing missing files
    if update and remove:
        if not os.path.isfile(database):
            raise NameError('Specified database file does not exist, can\'t update!')

        ptee.write("====================================")
        ptee.write("RIFGC Database Update Removal of missing files, started on %s" % datetime.datetime.now().isoformat())
        ptee.write("====================================")

        # Precompute the total number of lines to process (this should be fairly quick)
        filestodocount = 0
        with open(database, 'rb') as dbf:
            for row in csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"'):
                filestodocount = filestodocount + 1

            # Prepare the CSV writer for the temporary file that will hold the remaining (non-removed) lines
            with open(database + '.rem', 'wb') as dbfilerem:
                csv_writer = csv.writer(dbfilerem, lineterminator='\n', delimiter='|', quotechar='"')
                # Print the CSV headers
                csv_headers = ['path', 'md5', 'sha1', 'last_modification_timestamp', 'last_modification_date', 'size', 'ext']
                csv_writer.writerow(csv_headers)

                dbf.seek(0)
                dbfile = csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"')  # we need to reset the reading cursor (the generator position) back to the beginning of the file
                delcount = 0
                filescount = 0
                for row in tqdm.tqdm(dbfile, file=ptee, total=filestodocount, leave=True):
                    filescount = filescount + 1
                    filepath = os.path.join(rootfolderpath, row['path'])  # Build the absolute file path

                    # Single-file mode: skip if this is not the file we are looking for
                    if inputpath != rootfolderpath and inputpath != filepath: continue

                    if verbose: ptee.write("\n- Processing file %s" % row['path'])
                    errors = []
                    if not os.path.isfile(filepath):
                        delcount = delcount + 1
                        ptee.write("\n- File %s is missing, removed from database." % row['path'])
                    else:
                        csv_writer.writerow([path2unix(row['path']), row['md5'], row['sha1'], row['last_modification_timestamp'], row['last_modification_date'], row['size'], row['ext']])

        # REMOVE UPDATE DONE: we remove the old database file and replace it with the new one
        os.remove(database)  # delete old database
        os.rename(database + '.rem', database)  # rename new database to match old name

        # Show some stats
        ptee.write("----------------------------------------------------")
        ptee.write("All files processed: Total: %i - Removed/Missing: %i.\n\n" % (filescount, delcount))
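    # For reference, the database is a '|'-delimited CSV (quotechar '"') with one row per file,
    # following the header written above; the values below are purely illustrative:
    #   path|md5|sha1|last_modification_timestamp|last_modification_date|size|ext
    #   img/photo1.jpg|<md5 hexdigest>|<sha1 hexdigest>|1420070400.0|2015-01-01 00:00:00|123456|.jpg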
    # -- Generate the database file, or update/append to it (both will walk through the filesystem to discover new files, contrary to the other branches which walk through the database csv)
    if generate or (update and append):
        if not force and os.path.isfile(database) and not update:
            raise NameError('Database file already exists. Please choose another name to generate your database file.')

        if generate:
            dbmode = 'wb'
        elif (update and append):
            dbmode = 'ab'
        with open(database, dbmode) as dbfile:  # Must open in write + binary mode, because on Windows it will do weird things otherwise (at least with Python 2.7)
            ptee.write("====================================")
            if generate:
                ptee.write("RIFGC Database Generation started on %s" % datetime.datetime.now().isoformat())
            elif update and append:
                ptee.write("RIFGC Database Update Append new files, started on %s" % datetime.datetime.now().isoformat())
            ptee.write("====================================")

            # Preparing the CSV writer
            csv_writer = csv.writer(dbfile, lineterminator='\n', delimiter='|', quotechar='"')

            if generate:
                # Printing the CSV headers
                csv_headers = ['path', 'md5', 'sha1', 'last_modification_timestamp', 'last_modification_date', 'size', 'ext']
                csv_writer.writerow(csv_headers)

            if (update and append):
                # Extract all paths already stored in the database, to avoid re-adding them
                db_paths = {}
                with open(database, 'rb') as dbf:
                    for row in csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"'):
                        db_paths[row['path']] = True

            # Counting the total number of files that we will have to process
            ptee.write("Counting total number of files to process, please wait...")
            filestodocount = 0
            for _ in tqdm.tqdm(recwalk(inputpath), file=ptee):
                filestodocount = filestodocount + 1
            ptee.write("Counting done.")

            # Recursively traverse the root directory and save the metadata in the db for each file
            ptee.write("Processing files to compute metadata to store in database, please wait...")
            filescount = 0
            addcount = 0
            for (dirpath, filename) in tqdm.tqdm(recwalk(inputpath), file=ptee, total=filestodocount, leave=True):
                filescount = filescount + 1
                # Get full absolute filepath
                filepath = os.path.join(dirpath, filename)
                # Get database relative path (from scanning root folder)
                relfilepath = path2unix(os.path.relpath(filepath, rootfolderpath))  # File relative path from the root (so that we can easily check the files later, even if the absolute path is different)
                if verbose: ptee.write("\n- Processing file %s" % relfilepath)

                # If in update + append mode, skip the file if it is already in the database (we compute metadata only for new files)
                if update and append and relfilepath in db_paths:
                    if verbose: ptee.write("... skipped")
                    continue
                else:
                    addcount = addcount + 1

                # Compute the hashes (leave this outside the with statement because generate_hashes() opens the file by itself, so that both hashes can be computed in a single sweep of the file at the same time)
                if not skip_hash:
                    md5hash, sha1hash = generate_hashes(filepath)
                else:
                    md5hash = sha1hash = 0

                # Compute other metadata
                with open(filepath) as thisfile:
                    # Check file structure if the option is enabled
                    if structure_check:
                        struct_result = check_structure(filepath)
                        # Print/Log an error only if there's one (else we won't say anything)
                        if struct_result:
                            ptee.write("\n- Structure error with file " + filepath + ": " + struct_result)
                    ext = os.path.splitext(filepath)[1]  # File's extension
                    statinfos = os.stat(filepath)  # Various OS filesystem infos about the file
                    size = statinfos.st_size  # File size
                    lastmodif = statinfos.st_mtime  # File last modified date (as a timestamp)
                    lastmodif_readable = datetime.datetime.fromtimestamp(lastmodif).strftime("%Y-%m-%d %H:%M:%S")  # File last modified date as a human readable date (ISO universal time)

                    csv_row = [path2unix(relfilepath), md5hash, sha1hash, lastmodif, lastmodif_readable, size, ext]  # Prepare the CSV row
                    csv_writer.writerow(csv_row)  # Save it to the database file

        ptee.write("----------------------------------------------------")
        ptee.write("All files processed: Total: %i - Added: %i.\n\n" % (filescount, addcount))
    # -- Filescraping recovery mode
    # We will compare all files from the input path and reorganize the ones that are recognized into the output path
    elif filescraping:
        import shutil
        ptee.write("====================================")
        ptee.write("RIFGC File Scraping Recovery started on %s" % datetime.datetime.now().isoformat())
        ptee.write("====================================")

        ptee.write("Loading the database into memory, please wait...")
        md5list = {}
        sha1list = {}
        dbrows = {}  # TODO: instead of memorizing everything in memory, store just the reading cursor position at the beginning of each line along with its size, and then read from the db file directly only when necessary
        id = 0
        with open(database, 'rb') as db:
            for row in csv.DictReader(db, lineterminator='\n', delimiter='|', quotechar='"'):
                id += 1
                if (len(row['md5']) > 0 and len(row['sha1']) > 0):
                    md5list[row['md5']] = id
                    sha1list[row['sha1']] = id
                    dbrows[id] = row
        ptee.write("Loading done.")

        if len(dbrows) == 0:
            ptee.write("Nothing to do, there are no md5 nor sha1 hashes in the database file!")
            del ptee
            return 1  # return with an error

        # Counting the total number of files that we will have to process
        ptee.write("Counting total number of files to process, please wait...")
        filestodocount = 0
        for _ in tqdm.tqdm(recwalk(inputpath), file=ptee):
            filestodocount = filestodocount + 1
        ptee.write("Counting done.")

        # Recursively traverse the root directory and restore each recognized file into the output folder
        ptee.write("Processing file scraping recovery, walking through all files from input folder...")
        filescount = 0
        copiedcount = 0
        for (dirpath, filename) in tqdm.tqdm(recwalk(inputpath), file=ptee, total=filestodocount, leave=True):
            filescount = filescount + 1
            # Get full absolute filepath
            filepath = os.path.join(dirpath, filename)
            # Get database relative path (from scanning root folder)
            relfilepath = path2unix(os.path.relpath(filepath, rootfolderpath))  # File relative path from the root (we truncate the rootfolderpath so that we can easily check the files later, even if the absolute path is different)
            if verbose: ptee.write("\n- Processing file %s" % relfilepath)

            # Generate the hashes from the currently inspected file
            md5hash, sha1hash = generate_hashes(filepath)
            # If it matches a file in the database, we will copy it over with the correct name, directory structure, file extension and last modification date
            if md5hash in md5list and sha1hash in sha1list and md5list[md5hash] == sha1list[sha1hash]:
                # Load the db infos for this file
                row = dbrows[md5list[md5hash]]
                ptee.write("- Found: %s --> %s.\n" % (filepath, row['path']))
                # Generate the full absolute filepath of the output file
                outfilepath = os.path.join(outputpath, row['path'])
                # Recursively create the directory tree structure
                outfiledir = os.path.dirname(outfilepath)
                if not os.path.isdir(outfiledir): os.makedirs(outfiledir)  # if the target directory does not exist, create it (and recursively create all parent directories too)
                # Copy over and set attributes
                shutil.copy2(filepath, outfilepath)
                filestats = os.stat(filepath)
                os.utime(outfilepath, (filestats.st_atime, float(row['last_modification_timestamp'])))
                # Counter...
                copiedcount += 1
        ptee.write("----------------------------------------------------")
        ptee.write("All files processed: Total: %i - Recovered: %i.\n\n" % (filescount, copiedcount))

    # -- Check mode: check the files using a database file
    elif not update and not generate and not filescraping:
        ptee.write("====================================")
        ptee.write("RIFGC Check started on %s" % datetime.datetime.now().isoformat())
        ptee.write("====================================")

        # Open the errors file if supplied (where we will store every error in a formatted csv, so that it can later be easily processed by other software, such as file repair software)
        if errors_file is not None:
            efile = open(errors_file, 'wb')
            e_writer = csv.writer(efile, delimiter='|', lineterminator='\n', quotechar='"')

        # Precompute the total number of lines to process (this should be fairly quick)
        filestodocount = 0
        with open(database, 'rb') as dbf:
            for row in csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"'):
                filestodocount = filestodocount + 1

            # Processing the files using the database list
            ptee.write("Checking for files corruption based on database %s on input path %s, please wait..." % (database, inputpath))
            dbf.seek(0)
            dbfile = csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"')  # we need to reset the reading cursor (the generator position) back to the beginning of the file
            errorscount = 0
            filescount = 0
            for row in tqdm.tqdm(dbfile, file=ptee, total=filestodocount, leave=True):
                filescount = filescount + 1
                filepath = os.path.join(rootfolderpath, row['path'])

                # Single-file mode: skip if this is not the file we are looking for
                if inputpath != rootfolderpath and inputpath != filepath: continue

                if verbose: ptee.write("\n- Processing file %s" % row['path'])
                errors = []
                if not os.path.isfile(filepath):
                    if not skip_missing:
                        errors.append('file is missing')
                # First generate the current file's metadata given the filepath from the CSV, then check the differences against the database
                else:
                    try:  # Try to be resilient to various file access errors
                        # Generate the hashes
                        if not skip_hash:
                            md5hash, sha1hash = generate_hashes(filepath)
                        else:
                            md5hash = sha1hash = 0
                        # Check structure integrity if enabled
                        if structure_check:
                            struct_result = check_structure(filepath)
                            if struct_result:
                                errors.append("structure error (%s)" % struct_result)
                        # Compute other metadata
                        with open(filepath) as thisfile:
                            ext = os.path.splitext(filepath)[1]
                            statinfos = os.stat(filepath)
                            size = statinfos.st_size
                            lastmodif = statinfos.st_mtime
                            lastmodif_readable = datetime.datetime.fromtimestamp(lastmodif).strftime("%Y-%m-%d %H:%M:%S")

                        # CHECK THE DIFFERENCES
                        if not skip_hash and md5hash != row['md5'] and sha1hash != row['sha1']:
                            errors.append('both md5 and sha1 hashes failed')
                        elif not skip_hash and ((md5hash == row['md5'] and sha1hash != row['sha1']) or (md5hash != row['md5'] and sha1hash == row['sha1'])):
                            errors.append('one of the hashes failed but not the other (which may indicate that the database file is corrupted)')
                        if ext != row['ext']:
                            errors.append('extension has changed')
                        if size != int(row['size']):
                            errors.append("size has changed (before: %s - now: %s)" % (row['size'], size))
                        if not disable_modification_date_checking and (lastmodif != float(row['last_modification_timestamp']) and round(lastmodif, 0) != round(float(row['last_modification_timestamp']), 0)):  # for usage with PyPy: the last modification time is managed differently (rounded), thus we need to round here manually to be able to compare against PyPy
                            errors.append("modification date has changed (before: %s - now: %s)" % (row['last_modification_date'], lastmodif_readable))
                    except IOError as e:  # Catch IOError as a file error
                        errors.append('file can\'t be read, IOError (inaccessible, maybe bad sector?)')
                    except Exception as e:  # Any other exception when accessing the file will also be caught as a file error
                        errors.append('file can\'t be accessed: %s' % e)
                # Print/Log all errors for this file if any happened
                if errors:
                    errorscount = errorscount + 1
                    ptee.write("\n- Error for file %s: %s." % (row['path'], ', '.join(errors)))
                    if errors_file is not None:  # Write the error in a csv file if supplied (for easy processing later by other software, such as file repair software)
                        e_writer.writerow([row['path'], ', '.join(errors)])

        # END OF CHECKING: show some stats
        ptee.write("----------------------------------------------------")
        ptee.write("All files checked: Total: %i - Files with errors: %i.\n\n" % (filescount, errorscount))
        retval = (errorscount > 0)

    # End of processing: cleanup and return the error code (if any), whatever branch was taken
    del ptee
    return retval
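# The structure check above relies on the check_structure() helper (and the structure_check_import
# flag) used throughout this script. As a rough sketch of what such a PIL-based check typically
# looks like (an illustration under that assumption, not this script's actual implementation; the
# function name below is hypothetical):
def _example_check_structure(filepath):  # hypothetical name, for illustration only
    '''Return None if PIL can parse the image cleanly, else a string describing the problem.'''
    from PIL import Image  # local import to keep this sketch self-contained
    try:
        with open(filepath, 'rb') as fh:
            im = Image.open(fh)
            im.verify()  # verify() parses the file's structure without decoding the full image
    except Exception as e:
        return str(e)
    return None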