Example no. 1
0
def relpath_posix(recwalk_result, pardir, fromwinpath=False):
    ''' Helper function to convert all paths to relative POSIX-like paths (to ease comparison) '''
    return recwalk_result[0], path2unix(os.path.join(os.path.relpath(recwalk_result[0], pardir),recwalk_result[1]), nojoin=True, fromwinpath=fromwinpath)
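
A quick, hedged illustration of what the helper above is expected to return. The project's path2unix() and recwalk() helpers are not part of this listing, so the sketch below is an assumption for illustration only; the later call os.path.join(*result[1]) in synchronize_files() suggests that the second element is a tuple of path components rather than a joined string.

import os

def relpath_posix_sketch(recwalk_result, pardir):
    # recwalk_result is assumed to be a (dirpath, filename) tuple, as yielded
    # by the project's recwalk() generator (assumption, not shown here).
    dirpath, filename = recwalk_result
    rel = os.path.relpath(dirpath, pardir)  # path relative to the scanned root
    parts = [] if rel == os.curdir else rel.split(os.sep)
    return dirpath, tuple(parts + [filename])

# Illustrative usage (hypothetical paths):
# relpath_posix_sketch(('/data/copy1/sub', 'a.txt'), '/data/copy1')
# -> ('/data/copy1/sub', ('sub', 'a.txt'))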
Example no. 2
0
def synchronize_files(inputpaths, outpath, database=None, tqdm_bar=None, report_file=None, ptee=None, verbose=False):
    ''' Main function to synchronize file contents by majority vote
    The main job of this function is to walk through the input folders and align the files, so that we can compare every file across all folders, one by one.
    The whole trick here is to align the files so that we don't need to keep all of them in memory while still comparing all equivalent files together: to do that, we walk through the input directories in alphabetical order and always pick the relative filepath at the top of the alphabetical order. This ensures the alignment of files between different folders, without memorizing the whole tree structures.
    '''
    # (Generator) Files Synchronization Algorithm:
    # Needs a function stable_dir_walking, which walks through directories recursively, always in the same order on all platforms (same order for files but also for folders); whatever that order is, as long as it is stable.
    # Until there's no file in any of the input folders to be processed:
    # - curfiles <- load first file for each folder by using stable_dir_walking on each input folder.
    # - curfiles_grouped <- group curfiles_ordered:
    #    * curfiles_ordered <- order curfiles alphabetically (need to separate the relative parent directory and the filename, to account for both without ambiguity)
    #    * curfiles_grouped <- empty list
    #    * curfiles_grouped[0] = add first element in curfiles_ordered
    #    * last_group = 0
    #    * for every subsequent element nextelt in curfiles_ordered:
    #        . if nextelt == curfiles_grouped[last_group][0]: add nextelt into curfiles_grouped[last_group] (the latest group in curfiles_grouped)
    #        . else: create a new group in curfiles_grouped (last_group += 1) and add nextelt into curfiles_grouped[last_group]
    # At this stage, curfiles_grouped[0] should contain a group of files with the same relative filepath from different input folders, and since we used stable_dir_walking, we are guaranteed that this file is the next to be processed in alphabetical order.
    # - Majority vote byte-by-byte for each of curfiles_grouped[0], and output winning byte to the output file.
    # - Update files list alignment: we will now ditch files in curfiles_grouped[0] from curfiles, and replace them with the next file from each respective folder. Since we processed in alphabetical (or whatever stable) order, the next loaded files will match the files in other curfiles_grouped groups that we could not process before.
    # At this point (after the loop), all input files have been processed in order, without maintaining the whole files list in memory, just one file per input folder.

    # Init a files walking generator for each input path
    recgen = [recwalk(path, sorting=True) for path in inputpaths]
    curfiles = {}
    recgen_exhausted = {}
    recgen_exhausted_count = 0
    nbpaths = len(inputpaths)
    retcode = 0

    if not ptee: ptee = sys.stdout

    # Open report file and write header
    if report_file is not None:
        rfile = open(report_file, 'wb')
        r_writer = csv.writer(rfile, delimiter='|', lineterminator='\n', quotechar='"')
        r_header = ["filepath"] + ["dir%i" % (i+1) for i in xrange(nbpaths)] + ["hash-correct", "error_code", "errors"]
        r_length = len(r_header)
        r_writer.writerow(r_header)

    # Initialization: load the first batch of files, one for each folder
    for i in xrange(len(recgen)):
        recgen_exhausted[i] = False
        try:
            if curfiles.get(i, None) is None:
                curfiles[i] = relpath_posix(recgen[i].next(), inputpaths[i])[1]
        except StopIteration:
            recgen_exhausted[i] = True
            recgen_exhausted_count += 1

    # Files lists alignment loop
    while recgen_exhausted_count < nbpaths:
        errcode = 0
        errmsg = None

        # Init a new report's row
        if report_file: r_row = ["-"] * r_length

        # -- Group equivalent relative filepaths together
        #print curfiles # debug
        curfiles_grouped = sort_group(curfiles, True)

        # -- Extract first group of equivalent filepaths (this allows us to process with the same alphabetical order on all platforms)
        # Note that the remaining files in other groups will be processed later, because their alphabetical order is higher than the first group's; this means the first group is the one to be processed now
        to_process = curfiles_grouped[0]
        #print to_process # debug

        # -- Byte-by-byte majority vote on the first group of files
        # Need the relative filepath also (note that there's only one since it's a group of equivalent relative filepaths, only the absolute path is different between files of the same group)
        relfilepath = path2unix(os.path.join(*to_process[0][1]))
        if report_file: r_row[0] = relfilepath
        if verbose: ptee.write("- Processing file %s." % relfilepath)
        # Generate output path
        outpathfull = os.path.join(outpath, relfilepath)
        create_dir_if_not_exist(os.path.dirname(outpathfull))
        # Initialize the list of absolute filepaths
        fileslist = []
        for elt in to_process:
            i = elt[0]
            fileslist.append(os.path.join(inputpaths[i], os.path.join(*elt[1])))
            if report_file: r_row[i+1] = 'X' # put an X in the report file below each folder that contains this file
        # If there's only one file, just copy it over
        if len(to_process) == 1:
            shutil.copyfile(fileslist[0], outpathfull)
            id = to_process[0][0]
            if report_file: r_row[id+1] = 'O'
        # Else, merge by majority vote
        else:
            # Before-merge check using rfigc database, if provided
            # If one of the files in the input folders is already correct, just copy it over
            correct_file = None
            if database:
                for id, filepath in enumerate(fileslist):
                    if rfigc.main("-i \"%s\" -d \"%s\" -m --silent" % (filepath, database)) == 0:
                        correct_file = filepath
                        correct_id = to_process[id][0]
                        break

            # If one correct file was found, copy it over
            if correct_file:
                create_dir_if_not_exist(os.path.dirname(outpathfull))
                shutil.copyfile(correct_file, outpathfull)
                if report_file:
                    r_row[correct_id+1] = "O"
                    r_row[-3] = "OK"
            # Else, we need to do the majority vote merge
            else:
                # Do the majority vote merge
                errcode, errmsg = majority_vote_byte_scan(relfilepath, fileslist, outpath)

        # After-merge/move check using rfigc database, if provided
        if database:
            if rfigc.main("-i \"%s\" -d \"%s\" -m --silent" % (outpathfull, database)) == 1:
                errcode = 1
                r_row[-3] = "KO"
                if not errmsg: errmsg = ''
                errmsg += " File could not be totally repaired according to rfigc database."
            else:
                if report_file:
                    r_row[-3] = "OK"
                    if errmsg: errmsg += " But merged file is correct according to rfigc database."

        # Display errors if any
        if errcode:
            if report_file:
                r_row[-2] = "KO"
                r_row[-1] = errmsg
            ptee.write(errmsg)
            retcode = 1
        else:
            if report_file: r_row[-2] = "OK"

        # Save current report's row
        if report_file:
            r_writer.writerow(r_row)

        # -- Update files lists alignment (ie, retrieve the next files while trying to keep the alignment)
        for elt in to_process:  # for files of the first group (the ones we processed)
            i = elt[0]
            # Walk their respective folders and load up the next file
            try:
                if not recgen_exhausted.get(i, False):
                    curfiles[i] = relpath_posix(recgen[i].next(), inputpaths[i])[1]
            # If there's no file left in this folder, mark this input folder as exhausted and continue with the others
            except StopIteration:
                curfiles[i] = None
                recgen_exhausted[i] = True
                recgen_exhausted_count += 1
        if tqdm_bar: tqdm_bar.update()
    if tqdm_bar: tqdm_bar.close()

    # Closing report file
    if report_file:
        # Write list of directories and legend
        rfile.write("\n=> Input directories:")
        for id, ipath in enumerate(inputpaths):
            rfile.write("\n\t- dir%i = %s" % ((id+1), ipath))
        rfile.write("\n=> Output directory: %s" % outpath)
        rfile.write("\n=> Legend: X=existing/selected for majority vote, O=only used this file, - = not existing, OK = check correct, KO = check incorrect (file was not recovered)\n")
        # Close the report file handle
        rfile.close()

    return retcode
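
The grouping step described in the comment block of synchronize_files() above relies on a sort_group() helper that is not included in this listing. The sketch below is a hedged reconstruction of that behaviour, assuming curfiles maps a folder index to a tuple of relative path components (or None once that folder is exhausted); the real helper's signature and return shape may differ.

from itertools import groupby

def group_equivalent_files(curfiles):
    # Keep only folders that still have a pending file, sort their current
    # relative filepaths alphabetically, then group consecutive equal paths.
    items = sorted(((i, rel) for i, rel in curfiles.items() if rel is not None),
                   key=lambda item: item[1])
    return [list(group) for _, group in groupby(items, key=lambda item: item[1])]

# Folders 0 and 2 currently point at the same relative file, so they form the
# first group and would be merged together; folder 1 waits for a later pass.
# group_equivalent_files({0: ('a.txt',), 1: ('b.txt',), 2: ('a.txt',)})
# -> [[(0, ('a.txt',)), (2, ('a.txt',))], [(1, ('b.txt',))]]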
Example no. 3
0
def main(argv=None):
    if argv is None: # if argv is empty, fetch from the commandline
        argv = sys.argv[1:]
    elif isinstance(argv, basestring): # else if argv is supplied but it's a simple string, we need to parse it to a list of arguments before handing to argparse or any other argument parser
        argv = shlex.split(argv) # Parse string just like argv using shlex

    #==== COMMANDLINE PARSER ====

    #== Commandline description
    desc = '''Replication Repair
Description: Given a set of directories (or files), try to repair your files by scanning each byte, casting a majority vote among all copies, and then outputting the winning byte. This process is usually called triple-modular redundancy (but here it should be called n-modular redundancy since you can use as many copies as you have).
It is recommended for long term storage to store several copies of your files on different storage mediums. Everything's fine until all your copies are partially corrupted. In this case, this script can help you by taking advantage of your multiple copies, without requiring a pregenerated ecc file. Just specify the path to every copy, and the script will try to recover them.
Replication can repair exactly r-2 errors using majority vote (you need at least 2 blocks for majority vote to work), where r is the number of replications: if r=3, you get a redundancy rate of 1/3, if r=4, the rate is 2/4, etc.
This script can also take advantage of a database generated by rfigc.py to make sure that the recovered files are correct, or to select files that are already correct.

Note: in case the end result is not what you expected, you can try a different order of input directories: in case of ambiguity, the first input folder has precedence over subsequent folders.
Note2: in case some files with the same name are of different lengths, the merging will continue until the longest file is exhausted.
Note3: last modification date is not (yet) accounted for.
    '''
    ep = '''Use --gui as the first argument to use with a GUI (via Gooey).
'''

    #== Commandline arguments
    #-- Constructing the parser
    # Use GooeyParser if we want the GUI because it will provide better widgets
    if len(argv) > 0 and (argv[0] == '--gui' and not '--ignore-gooey' in argv):  # pragma: no cover
        # Initialize the Gooey parser
        main_parser = gooey.GooeyParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter)
        # Define Gooey widget types explicitly (because type auto-detection doesn't work quite well)
        widget_dir = {"widget": "DirChooser"}
        widget_filesave = {"widget": "FileSaver"}
        widget_file = {"widget": "FileChooser"}
        widget_text = {"widget": "TextField"}
        widget_multidir = {"widget": "MultiDirChooser"}
    else: # Else in command-line usage, use the standard argparse
        # Delete the special argument to avoid unrecognized argument error in argparse
        if len(argv) > 0 and '--ignore-gooey' in argv: argv.remove('--ignore-gooey') # this argument is automatically fed by Gooey when the user clicks on Start
        # Initialize the normal argparse parser
        main_parser = argparse.ArgumentParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter)
        # Define dummy dicts to stay compatible with command-line usage
        widget_dir = {}
        widget_filesave = {}
        widget_file = {}
        widget_text = {}
        widget_multidir = {}

    # Required arguments
    main_parser.add_argument('-i', '--input', metavar='"/path/to/copy1/" "/path/to/copy2/" "etc."', type=is_dir_or_file, nargs='+', required=True,
                        help='Specify the paths to every copy you have (minimum 3 copies, else it won\'t work!). Can be folders or files (if you want to repair only one file). Order matters: in case of ambiguity, the first folder where the file exists will be chosen.', **widget_multidir)
    main_parser.add_argument('-o', '--output', metavar='/output/folder/', nargs=1, required=True,
                        help='Where the recovered files will be stored.', **widget_dir)

    # Optional general arguments
    main_parser.add_argument('-d', '--database', metavar='database.csv', type=is_file, required=False,
                        help='Path to a previously generated rfigc.py database. If provided, this will be used to check that the repaired files are correct (and also to find already correct files in copies).', **widget_file)
    main_parser.add_argument('-r', '--report', metavar='/some/folder/report.csv', type=str, required=False,
                        help='Save all results of the repair process in a report file, with detailed descriptions of ambiguous repairs (ie, when majority vote came to a draw).', **widget_filesave)
    main_parser.add_argument('-l', '--log', metavar='/some/folder/filename.log', type=str, nargs=1, required=False,
                        help='Path to the log file. (Output will be piped to both the stdout and the log file)', **widget_filesave)
    main_parser.add_argument('-f', '--force', action='store_true', required=False, default=False,
                        help='Force overwriting the output folder even if it already exists.')
    main_parser.add_argument('-v', '--verbose', action='store_true', required=False, default=False,
                        help='Verbose mode (show more output).')
    main_parser.add_argument('--silent', action='store_true', required=False, default=False,
                        help='No console output (but if --log specified, the log will still be saved in the specified file).')

    #== Parsing the arguments
    args = main_parser.parse_args(argv) # Storing all arguments to args

    #-- Set variables from arguments
    inputpaths = [fullpath(x) for x in args.input] # path to the files to repair (ie, paths to all the different copies the user has)
    outputpath = fullpath(args.output[0])
    force = args.force
    verbose = args.verbose
    silent = args.silent

    if len(inputpaths) < 3:
        raise Exception('Need at least 3 copies to do a replication repair/majority vote!')

    #if os.path.isfile(inputpath): # if inputpath is a single file (instead of a folder), then define the rootfolderpath as the parent directory (for correct relative path generation, else it will also truncate the filename!)
        #rootfolderpath = os.path.dirname(inputpath)

    report_file = None
    if args.report: report_file = os.path.basename(fullpath(args.report))
    database = None
    if args.database: database = args.database

    # -- Checking arguments
    if os.path.exists(outputpath) and not force:
        raise NameError('Specified output path %s already exists! Use --force if you want to overwrite.' % outputpath)

    if database and not os.path.isfile(database):
        raise NameError('Specified rfigc database file %s does not exist!' % database)

    # -- Configure the log file if enabled (ptee.write() will write to both stdout/console and to the log file)
    if args.log:
        ptee = Tee(args.log[0], 'a', nostdout=silent)
        #sys.stdout = Tee(args.log[0], 'a')
        sys.stderr = Tee(args.log[0], 'a', nostdout=silent)
    else:
        ptee = Tee(nostdout=silent)


    # == PROCESSING BRANCHING == #

    # == Precomputation of ecc file size
    # Precomputing is important so that the user can know what size to expect before starting (and how much time it will take...).
    filescount = 0
    sizetotal = 0
    sizeheaders = 0
    visitedfiles = {}
    ptee.write("Precomputing list of files and predicted statistics...")
    prebar = tqdm.tqdm(file=ptee, disable=silent)
    for inputpath in inputpaths:
        for (dirpath, filename) in recwalk(inputpath):
            # Get full absolute filepath
            filepath = os.path.join(dirpath, filename)
            relfilepath = path2unix(os.path.relpath(filepath, inputpath)) # File relative path from the root (we truncate the rootfolderpath so that we can easily check the files later even if the absolute path is different)

            # Only increase the files count if we didn't see this file before
            if not visitedfiles.get(relfilepath, None):
                # Counting the total number of files we will process (so that we can show a progress bar with ETA)
                filescount = filescount + 1
                # Add the file to the list of already visited files
                visitedfiles[relfilepath] = True
                # Get the current file's size
                size = os.stat(filepath).st_size
                # Compute total size of all files
                sizetotal = sizetotal + size
            prebar.update()
    prebar.close()
    ptee.write("Precomputing done.")

    # == Majority vote repair
    # For each folder, align the files lists and then majority vote over each byte to repair
    ptee.write("====================================")
    ptee.write("Replication repair, started on %s" % datetime.datetime.now().isoformat())
    ptee.write("====================================")

    # Prepare progress bar if necessary
    if silent:
        tqdm_bar = None
    else:
        tqdm_bar = tqdm.tqdm(total=filescount, file=ptee, leave=True, unit="files")
    # Call the main function to synchronize files using majority vote
    errcode = synchronize_files(inputpaths, outputpath, database=database, tqdm_bar=tqdm_bar, report_file=report_file, ptee=ptee, verbose=verbose)
    #ptee.write("All done! Stats:\n- Total files processed: %i\n- Total files corrupted: %i\n- Total files repaired completely: %i\n- Total files repaired partially: %i\n- Total files corrupted but not repaired at all: %i\n- Total files skipped: %i" % (files_count, files_corrupted, files_repaired_completely, files_repaired_partially, files_corrupted - (files_repaired_partially + files_repaired_completely), files_skipped) )
    if tqdm_bar: tqdm_bar.close()
    ptee.write("All done!")
    if report_file: ptee.write("Saved replication repair results in report file: %s" % report_file)
    del ptee
    return errcode
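
The byte-by-byte majority vote that the description above refers to lives in majority_vote_byte_scan(), which is not part of this listing. The snippet below is only a minimal in-memory sketch of the idea, assuming whole copies fit in memory and that a draw keeps the first copy's byte (matching the "first input folder has precedence" note); the real function streams the files and returns an (errcode, errmsg) pair instead.

from collections import Counter

def majority_vote_bytes_sketch(copies):
    # copies: list of byte strings holding the same intended content with
    # possible corruption. Returns the merged content and the number of
    # ambiguous positions (draws) where the first copy was kept.
    merged = bytearray()
    ambiguous = 0
    longest = max(len(c) for c in copies)
    for pos in range(longest):
        column = [c[pos:pos + 1] for c in copies if pos < len(c)]
        winner, votes = Counter(column).most_common(1)[0]
        if votes <= len(column) // 2:  # no strict majority -> draw
            ambiguous += 1
            winner = column[0]
        merged += winner
    return bytes(merged), ambiguous

# One corrupted copy is outvoted by the two intact ones:
# majority_vote_bytes_sketch([b'hello', b'hxllo', b'hello']) -> (b'hello', 0)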
def relpath_posix(recwalk_result, pardir, fromwinpath=False):
    ''' Helper function to convert all paths to relative POSIX-like paths (to ease comparison) '''
    return recwalk_result[0], path2unix(os.path.join(
        os.path.relpath(recwalk_result[0], pardir), recwalk_result[1]),
                                        nojoin=True,
                                        fromwinpath=fromwinpath)
def main(argv=None):
    if argv is None:  # if argv is empty, fetch from the commandline
        argv = sys.argv[1:]
    elif isinstance(
            argv, basestring
    ):  # else if argv is supplied but it's a simple string, we need to parse it to a list of arguments before handing to argparse or any other argument parser
        argv = shlex.split(argv)  # Parse string just like argv using shlex

    #==== COMMANDLINE PARSER ====

    #== Commandline description
    desc = '''Replication Repair
Description: Given a set of directories (or files), try to repair your files by scanning each byte, casting a majority vote among all copies, and then outputting the winning byte. This process is usually called triple-modular redundancy (but here it should be called n-modular redundancy since you can use as many copies as you have).
It is recommended for long term storage to store several copies of your files on different storage mediums. Everything's fine until all your copies are partially corrupted. In this case, this script can help you by taking advantage of your multiple copies, without requiring a pregenerated ecc file. Just specify the path to every copy, and the script will try to recover them.
Replication can repair exactly r-2 errors using majority vote (you need at least 2 blocks for majority vote to work), where r is the number of replications: if r=3, you get a redundancy rate of 1/3, if r=4, the rate is 2/4, etc.
This script can also take advantage of a database generated by rfigc.py to make sure that the recovered files are correct, or to select files that are already correct.

Note: in case the end result is not what you expected, you can try a different order of input directories: in case of ambiguity, the first input folder has precedence over subsequent folders.
Note2: in case some files with the same name are of different lengths, the merging will continue until the longest file is exhausted.
Note3: last modification date is not (yet) accounted for.
    '''
    ep = '''Use --gui as the first argument to use with a GUI (via Gooey).
'''

    #== Commandline arguments
    #-- Constructing the parser
    # Use GooeyParser if we want the GUI because it will provide better widgets
    if len(argv) > 0 and (argv[0] == '--gui' and
                          not '--ignore-gooey' in argv):  # pragma: no cover
        # Initialize the Gooey parser
        main_parser = gooey.GooeyParser(
            add_help=True,
            description=desc,
            epilog=ep,
            formatter_class=argparse.RawTextHelpFormatter)
        # Define Gooey widget types explicitly (because type auto-detection doesn't work quite well)
        widget_dir = {"widget": "DirChooser"}
        widget_filesave = {"widget": "FileSaver"}
        widget_file = {"widget": "FileChooser"}
        widget_text = {"widget": "TextField"}
        widget_multidir = {"widget": "MultiDirChooser"}
    else:  # Else in command-line usage, use the standard argparse
        # Delete the special argument to avoid unrecognized argument error in argparse
        if len(argv) > 0 and '--ignore-gooey' in argv:
            argv.remove(
                '--ignore-gooey'
            )  # this argument is automatically fed by Gooey when the user clicks on Start
        # Initialize the normal argparse parser
        main_parser = argparse.ArgumentParser(
            add_help=True,
            description=desc,
            epilog=ep,
            formatter_class=argparse.RawTextHelpFormatter)
        # Define dummy dicts to stay compatible with command-line usage
        widget_dir = {}
        widget_filesave = {}
        widget_file = {}
        widget_text = {}
        widget_multidir = {}

    # Required arguments
    main_parser.add_argument(
        '-i',
        '--input',
        metavar='"/path/to/copy1/" "/path/to/copy2/" "etc."',
        type=is_dir_or_file,
        nargs='+',
        required=True,
        help=
        'Specify the paths to every copy you have (minimum 3 copies, else it won\'t work!). Can be folders or files (if you want to repair only one file). Order matters: in case of ambiguity, the first folder where the file exists will be chosen.',
        **widget_multidir)
    main_parser.add_argument('-o',
                             '--output',
                             metavar='/output/folder/',
                             nargs=1,
                             required=True,
                             help='Where the recovered files will be stored.',
                             **widget_dir)

    # Optional general arguments
    main_parser.add_argument(
        '-d',
        '--database',
        metavar='database.csv',
        type=is_file,
        required=False,
        help=
        'Path to a previously generated rfigc.py database. If provided, this will be used to check that the repaired files are correct (and also to find already correct files in copies).',
        **widget_file)
    main_parser.add_argument(
        '-r',
        '--report',
        metavar='/some/folder/report.csv',
        type=str,
        required=False,
        help=
        'Save all results of the repair process in a report file, with detailed descriptions of ambiguous repairs (ie, when majority vote came to a draw).',
        **widget_filesave)
    main_parser.add_argument(
        '-l',
        '--log',
        metavar='/some/folder/filename.log',
        type=str,
        nargs=1,
        required=False,
        help=
        'Path to the log file. (Output will be piped to both the stdout and the log file)',
        **widget_filesave)
    main_parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        required=False,
        default=False,
        help='Force overwriting the output folder even if it already exists.')
    main_parser.add_argument('-v',
                             '--verbose',
                             action='store_true',
                             required=False,
                             default=False,
                             help='Verbose mode (show more output).')
    main_parser.add_argument(
        '--silent',
        action='store_true',
        required=False,
        default=False,
        help=
        'No console output (but if --log specified, the log will still be saved in the specified file).'
    )

    #== Parsing the arguments
    args = main_parser.parse_args(argv)  # Storing all arguments to args

    #-- Set variables from arguments
    inputpaths = [
        fullpath(x) for x in args.input
    ]  # path to the files to repair (ie, paths to all the different copies the user has)
    outputpath = fullpath(args.output[0])
    force = args.force
    verbose = args.verbose
    silent = args.silent

    if len(inputpaths) < 3:
        raise Exception(
            'Need at least 3 copies to do a replication repair/majority vote!')

    #if os.path.isfile(inputpath): # if inputpath is a single file (instead of a folder), then define the rootfolderpath as the parent directory (for correct relative path generation, else it will also truncate the filename!)
    #rootfolderpath = os.path.dirname(inputpath)

    report_file = None
    if args.report: report_file = os.path.basename(fullpath(args.report))
    database = None
    if args.database: database = args.database

    # -- Checking arguments
    if os.path.exists(outputpath) and not force:
        raise NameError(
            'Specified output path %s already exists! Use --force if you want to overwrite.'
            % outputpath)

    if database and not os.path.isfile(database):
        raise NameError('Specified rfigc database file %s does not exist!' %
                        database)

    # -- Configure the log file if enabled (ptee.write() will write to both stdout/console and to the log file)
    if args.log:
        ptee = Tee(args.log[0], 'a', nostdout=silent)
        #sys.stdout = Tee(args.log[0], 'a')
        sys.stderr = Tee(args.log[0], 'a', nostdout=silent)
    else:
        ptee = Tee(nostdout=silent)

    # == PROCESSING BRANCHING == #

    # == Precomputation of ecc file size
    # Precomputing is important so that the user can know what size to expect before starting (and how much time it will take...).
    filescount = 0
    sizetotal = 0
    sizeheaders = 0
    visitedfiles = {}
    ptee.write("Precomputing list of files and predicted statistics...")
    prebar = tqdm.tqdm(file=ptee, disable=silent)
    for inputpath in inputpaths:
        for (dirpath, filename) in recwalk(inputpath):
            # Get full absolute filepath
            filepath = os.path.join(dirpath, filename)
            relfilepath = path2unix(
                os.path.relpath(filepath, inputpath)
            )  # File relative path from the root (we truncate the rootfolderpath so that we can easily check the files later even if the absolute path is different)

            # Only increase the files count if we didn't see this file before
            if not visitedfiles.get(relfilepath, None):
                # Counting the total number of files we will process (so that we can show a progress bar with ETA)
                filescount = filescount + 1
                # Add the file to the list of already visited files
                visitedfiles[relfilepath] = True
                # Get the current file's size
                size = os.stat(filepath).st_size
                # Compute total size of all files
                sizetotal = sizetotal + size
            prebar.update()
    prebar.close()
    ptee.write("Precomputing done.")

    # == Majority vote repair
    # For each folder, align the files lists and then majority vote over each byte to repair
    ptee.write("====================================")
    ptee.write("Replication repair, started on %s" %
               datetime.datetime.now().isoformat())
    ptee.write("====================================")

    # Prepare progress bar if necessary
    if silent:
        tqdm_bar = None
    else:
        tqdm_bar = tqdm.tqdm(total=filescount,
                             file=ptee,
                             leave=True,
                             unit="files")
    # Call the main function to synchronize files using majority vote
    errcode = synchronize_files(inputpaths,
                                outputpath,
                                database=database,
                                tqdm_bar=tqdm_bar,
                                report_file=report_file,
                                ptee=ptee,
                                verbose=verbose)
    #ptee.write("All done! Stats:\n- Total files processed: %i\n- Total files corrupted: %i\n- Total files repaired completely: %i\n- Total files repaired partially: %i\n- Total files corrupted but not repaired at all: %i\n- Total files skipped: %i" % (files_count, files_corrupted, files_repaired_completely, files_repaired_partially, files_corrupted - (files_repaired_partially + files_repaired_completely), files_skipped) )
    if tqdm_bar: tqdm_bar.close()
    ptee.write("All done!")
    if report_file:
        ptee.write("Saved replication repair results in report file: %s" %
                   report_file)
    del ptee
    return errcode
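
The Tee object used above (ptee) is only described in the comments as writing to both the console and the log file; the class itself is not included in this listing. The stand-in below is a hedged sketch that matches the calls shown (Tee(logpath, 'a', nostdout=silent) and Tee(nostdout=silent)) and the fact that it is handed to tqdm via file=, so it needs write() and flush(); the real class may also append newlines or do more.

import sys

class TeeSketch(object):
    # Hedged stand-in for the project's Tee class (assumption, not shown here).
    def __init__(self, logpath=None, mode='a', nostdout=False):
        self.logfile = open(logpath, mode) if logpath else None
        self.nostdout = nostdout

    def write(self, message):
        # Every message goes to the console (unless silenced) and, when a log
        # path was given, is also appended to that file.
        if not self.nostdout:
            sys.stdout.write(message)
        if self.logfile:
            self.logfile.write(message)
            self.logfile.flush()

    def flush(self):
        if not self.nostdout:
            sys.stdout.flush()

# ptee = TeeSketch('repair.log', 'a', nostdout=False)  # hypothetical usage
# ptee.write("Replication repair started\n")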
def synchronize_files(inputpaths,
                      outpath,
                      database=None,
                      tqdm_bar=None,
                      report_file=None,
                      ptee=None,
                      verbose=False):
    ''' Main function to synchronize file contents by majority vote
    The main job of this function is to walk through the input folders and align the files, so that we can compare every file across all folders, one by one.
    The whole trick here is to align the files so that we don't need to keep all of them in memory while still comparing all equivalent files together: to do that, we walk through the input directories in alphabetical order and always pick the relative filepath at the top of the alphabetical order. This ensures the alignment of files between different folders, without memorizing the whole tree structures.
    '''
    # (Generator) Files Synchronization Algorithm:
    # Needs a function stable_dir_walking, which walks through directories recursively, always in the same order on all platforms (same order for files but also for folders); whatever that order is, as long as it is stable.
    # Until there's no file in any of the input folders to be processed:
    # - curfiles <- load first file for each folder by using stable_dir_walking on each input folder.
    # - curfiles_grouped <- group curfiles_ordered:
    #    * curfiles_ordered <- order curfiles alphabetically (need to separate the relative parent directory and the filename, to account for both without ambiguity)
    #    * curfiles_grouped <- empty list
    #    * curfiles_grouped[0] = add first element in curfiles_ordered
    #    * last_group = 0
    #    * for every subsequent element nextelt in curfiles_ordered:
    #        . if nextelt == curfiles_grouped[last_group][0]: add nextelt into curfiles_grouped[last_group] (the latest group in curfiles_grouped)
    #        . else: create a new group in curfiles_grouped (last_group += 1) and add nextelt into curfiles_grouped[last_group]
    # At this stage, curfiles_grouped[0] should contain a group of files with the same relative filepath from different input folders, and since we used stable_dir_walking, we are guaranteed that this file is the next to be processed in alphabetical order.
    # - Majority vote byte-by-byte for each of curfiles_grouped[0], and output winning byte to the output file.
    # - Update files list alignment: we will now ditch files in curfiles_grouped[0] from curfiles, and replace them with the next file from each respective folder. Since we processed in alphabetical (or whatever stable) order, the next loaded files will match the files in other curfiles_grouped groups that we could not process before.
    # At this point (after the loop), all input files have been processed in order, without maintaining the whole files list in memory, just one file per input folder.

    # Init a files walking generator for each input path
    recgen = [recwalk(path, sorting=True) for path in inputpaths]
    curfiles = {}
    recgen_exhausted = {}
    recgen_exhausted_count = 0
    nbpaths = len(inputpaths)
    retcode = 0

    if not ptee: ptee = sys.stdout

    # Open report file and write header
    if report_file is not None:
        rfile = open(report_file, 'wb')
        r_writer = csv.writer(rfile,
                              delimiter='|',
                              lineterminator='\n',
                              quotechar='"')
        r_header = ["filepath"] + [
            "dir%i" % (i + 1) for i in xrange(nbpaths)
        ] + ["hash-correct", "error_code", "errors"]
        r_length = len(r_header)
        r_writer.writerow(r_header)

    # Initialization: load the first batch of files, one for each folder
    for i in xrange(len(recgen)):
        recgen_exhausted[i] = False
        try:
            if curfiles.get(i, None) is None:
                curfiles[i] = relpath_posix(recgen[i].next(), inputpaths[i])[1]
        except StopIteration:
            recgen_exhausted[i] = True
            recgen_exhausted_count += 1

    # Files lists alignment loop
    while recgen_exhausted_count < nbpaths:
        errcode = 0
        errmsg = None

        # Init a new report's row
        if report_file: r_row = ["-"] * r_length

        # -- Group equivalent relative filepaths together
        #print curfiles # debug
        curfiles_grouped = sort_group(curfiles, True)

        # -- Extract first group of equivalent filepaths (this allows us to process with the same alphabetical order on all platforms)
        # Note that the remaining files in other groups will be processed later, because their alphabetical order is higher than the first group's; this means the first group is the one to be processed now
        to_process = curfiles_grouped[0]
        #print to_process # debug

        # -- Byte-by-byte majority vote on the first group of files
        # Need the relative filepath also (note that there's only one since it's a group of equivalent relative filepaths, only the absolute path is different between files of the same group)
        relfilepath = path2unix(os.path.join(*to_process[0][1]))
        if report_file: r_row[0] = relfilepath
        if verbose: ptee.write("- Processing file %s." % relfilepath)
        # Generate output path
        outpathfull = os.path.join(outpath, relfilepath)
        create_dir_if_not_exist(os.path.dirname(outpathfull))
        # Initialize the list of absolute filepaths
        fileslist = []
        for elt in to_process:
            i = elt[0]
            fileslist.append(os.path.join(inputpaths[i],
                                          os.path.join(*elt[1])))
            if report_file:
                r_row[
                    i +
                    1] = 'X'  # put an X in the report file below each folder that contains this file
        # If there's only one file, just copy it over
        if len(to_process) == 1:
            shutil.copyfile(fileslist[0], outpathfull)
            id = to_process[0][0]
            if report_file: r_row[id + 1] = 'O'
        # Else, merge by majority vote
        else:
            # Before-merge check using rfigc database, if provided
            # If one of the files in the input folders is already correct, just copy it over
            correct_file = None
            if database:
                for id, filepath in enumerate(fileslist):
                    if rfigc.main("-i \"%s\" -d \"%s\" -m --silent" %
                                  (filepath, database)) == 0:
                        correct_file = filepath
                        correct_id = to_process[id][0]
                        break

            # If one correct file was found, copy it over
            if correct_file:
                create_dir_if_not_exist(os.path.dirname(outpathfull))
                shutil.copyfile(correct_file, outpathfull)
                if report_file:
                    r_row[correct_id + 1] = "O"
                    r_row[-3] = "OK"
            # Else, we need to do the majority vote merge
            else:
                # Do the majority vote merge
                errcode, errmsg = majority_vote_byte_scan(
                    relfilepath, fileslist, outpath)

        # After-merge/move check using rfigc database, if provided
        if database:
            if rfigc.main("-i \"%s\" -d \"%s\" -m --silent" %
                          (outpathfull, database)) == 1:
                errcode = 1
                r_row[-3] = "KO"
                if not errmsg: errmsg = ''
                errmsg += " File could not be totally repaired according to rfigc database."
            else:
                if report_file:
                    r_row[-3] = "OK"
                    if errmsg:
                        errmsg += " But merged file is correct according to rfigc database."

        # Display errors if any
        if errcode:
            if report_file:
                r_row[-2] = "KO"
                r_row[-1] = errmsg
            ptee.write(errmsg)
            retcode = 1
        else:
            if report_file: r_row[-2] = "OK"

        # Save current report's row
        if report_file:
            r_writer.writerow(r_row)

        # -- Update files lists alignment (ie, retrieve the next files while trying to keep the alignment)
        for elt in to_process:  # for files of the first group (the ones we processed)
            i = elt[0]
            # Walk their respective folders and load up the next file
            try:
                if not recgen_exhausted.get(i, False):
                    curfiles[i] = relpath_posix(recgen[i].next(),
                                                inputpaths[i])[1]
            # If there's no file left in this folder, mark this input folder as exhausted and continue with the others
            except StopIteration:
                curfiles[i] = None
                recgen_exhausted[i] = True
                recgen_exhausted_count += 1
        if tqdm_bar: tqdm_bar.update()
    if tqdm_bar: tqdm_bar.close()

    # Closing report file
    if report_file:
        # Write list of directories and legend
        rfile.write("\n=> Input directories:")
        for id, ipath in enumerate(inputpaths):
            rfile.write("\n\t- dir%i = %s" % ((id + 1), ipath))
        rfile.write("\n=> Output directory: %s" % outpath)
        rfile.write(
            "\n=> Legend: X=existing/selected for majority vote, O=only used this file, - = not existing, OK = check correct, KO = check incorrect (file was not recovered)\n"
        )
        # Close the report file handle
        rfile.close()

    return retcode
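
Both copies of the synchronization code above lean on recwalk(path, sorting=True) being a stable, platform-independent walk, as the "stable_dir_walking" comments explain, but the walker itself is not included in this listing. The sketch below shows one common way to obtain that guarantee with os.walk(); it is an assumption about what recwalk() provides, not the project's actual implementation.

import os

def stable_recwalk_sketch(rootdir):
    # os.walk() with both the directory and file lists sorted in place, so the
    # traversal order is the same on every platform and every run. Sorting
    # dirs in place also fixes the order in which subdirectories are visited.
    for dirpath, dirs, files in os.walk(rootdir):
        dirs.sort()
        files.sort()
        for filename in files:
            yield dirpath, filename

# for dirpath, filename in stable_recwalk_sketch('/data/copy1'):  # hypothetical path
#     print(os.path.join(dirpath, filename))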
Example no. 7
0
def main(argv=None):
    if argv is None: # if argv is empty, fetch from the commandline
        argv = sys.argv[1:]
    elif isinstance(argv, basestring): # else if argv is supplied but it's a simple string, we need to parse it to a list of arguments before handing to argparse or any other argument parser
        argv = shlex.split(argv) # Parse string just like argv using shlex

    #==== COMMANDLINE PARSER ====

    #== Commandline description
    desc = '''Recursive/Relative Files Integrity Generator and Checker
Description: Recursively generate or check the integrity of files by MD5 and SHA1 hashes, size, modification date or by data structure integrity (only for images).

This script is originally meant to be used for data archival, by allowing an easy way to check for silent file corruption. Thus, this script uses relative paths so that you can easily compute and check the same redundant data copied on different mediums (hard drives, optical discs, etc.). This script is not meant for system file corruption notification, but rather to be used from time to time to check up on your data archives' integrity.
    '''
    ep = '''Example usage:
- To generate the database (only needed once):
python rfigc.py -i "folderimages" -d "dbhash.csv" -g
- To check:
python rfigc.py -i "folderimages" -d "dbhash.csv" -l log.txt -s
- To update your database by appending new files:
python rfigc.py -i "folderimages" -d "dbhash.csv" -u -a 
- To update your database by appending new files AND removing nonexistent files:
python rfigc.py -i "folderimages" -d "dbhash.csv" -u -a -r
- To use with a gui:
python rfigc.py --gui

Note that the script is in check mode by default, to avoid wrong manipulations. It will also alert you if you try to generate over an already existing database file.
Note2: you can use PyPy to speed up the generation, but you should avoid using PyPy when in checking mode (from our tests, it will slow things down a lot).
'''

    #== Commandline arguments
    #-- Constructing the parser
    # Use GooeyParser if we want the GUI because it will provide better widgets
    if len(argv) > 0 and (argv[0] == '--gui' and not '--ignore-gooey' in argv):  # pragma: no cover
        # Initialize the Gooey parser
        main_parser = gooey.GooeyParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter)
        # Define Gooey widget types explicitly (because type auto-detection doesn't work quite well)
        widget_dir = {"widget": "DirChooser"}
        widget_filesave = {"widget": "FileSaver"}
        widget_file = {"widget": "FileChooser"}
        widget_text = {"widget": "TextField"}
    else: # Else in command-line usage, use the standard argparse
        # Delete the special argument to avoid unrecognized argument error in argparse
        if len(argv) > 0 and '--ignore-gooey' in argv: argv.remove('--ignore-gooey') # this argument is automatically fed by Gooey when the user clicks on Start
        # Initialize the normal argparse parser
        main_parser = argparse.ArgumentParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter)
        # Define dummy dicts to stay compatible with command-line usage
        widget_dir = {}
        widget_filesave = {}
        widget_file = {}
        widget_text = {}
    # Required arguments
    main_parser.add_argument('-i', '--input', metavar='/path/to/root/folder', type=is_dir_or_file, nargs=1, required=True,
                        help='Path to the root folder (or a single file) from where the scanning will occur.', **widget_dir)
    main_parser.add_argument('-d', '--database', metavar='/some/folder/databasefile.csv', type=str, nargs=1, required=True, #type=argparse.FileType('rt')
                        help='Path to the csv file containing the hash information.', **widget_filesave)

    # Optional general arguments
    main_parser.add_argument('-l', '--log', metavar='/some/folder/filename.log', type=str, nargs=1, required=False,
                        help='Path to the log file. (Output will be piped to both the stdout and the log file)', **widget_filesave)
    main_parser.add_argument('--skip_hash', action='store_true', required=False, default=False,
                        help='Skip hash computation/checking (checks only the other metadata, this is a lot quicker).')
    main_parser.add_argument('-v', '--verbose', action='store_true', required=False, default=False,
                        help='Verbose mode (show more output).')
    main_parser.add_argument('--silent', action='store_true', required=False, default=False,
                        help='No console output (but if --log specified, the log will still be saved in the specified file).')

    # Checking mode arguments
    main_parser.add_argument('-s', '--structure_check', action='store_true', required=False, default=False,
                        help='Check image structures for corruption?')
    main_parser.add_argument('-e', '--errors_file', metavar='/some/folder/errorsfile.csv', type=str, nargs=1, required=False, #type=argparse.FileType('rt')
                        help='Path to the error file, where errors found during checking will be stored in CSV for further processing by other software (such as file repair tools).', **widget_filesave)
    main_parser.add_argument('-m', '--disable_modification_date_checking', action='store_true', required=False, default=False,
                        help='Disable modification date checking.')
    main_parser.add_argument('--skip_missing', action='store_true', required=False, default=False,
                        help='Skip missing files when checking (useful if you split your files into several mediums, for example on optical discs with limited capacity).')

    # Generate mode arguments
    main_parser.add_argument('-g', '--generate', action='store_true', required=False, default=False,
                        help='Generate the database? (omit this parameter to check instead of generating).')
    main_parser.add_argument('-f', '--force', action='store_true', required=False, default=False,
                        help='Force overwriting the database file even if it already exists (if --generate).')

    # Update mode arguments
    main_parser.add_argument('-u', '--update', action='store_true', required=False, default=False,
                        help='Update database (you must also specify --append or --remove).')
    main_parser.add_argument('-a', '--append', action='store_true', required=False, default=False,
                        help='Append new files (if --update).')
    main_parser.add_argument('-r', '--remove', action='store_true', required=False, default=False,
                        help='Remove missing files (if --update).')

    # Recover from file scraping
    main_parser.add_argument('--filescraping_recovery', action='store_true', required=False, default=False,
                        help='Given a folder of unorganized files, compare to the database and restore the filename and directory structure into the output folder.')
    main_parser.add_argument('-o', '--output', metavar='/path/to/root/folder', type=is_dir, nargs=1, required=False,
                        help='Path to the output folder where to output (copy) the files reorganized after --filescraping_recovery.', **widget_dir)

    #== Parsing the arguments
    args = main_parser.parse_args(argv) # Storing all arguments to args

    #-- Set variables from arguments
    inputpath = fullpath(args.input[0]) # path to the files to protect (either a folder or a single file)
    rootfolderpath = inputpath # path to the root folder (to compute relative paths)
    #database = os.path.basename(fullpath(args.database[0])) # Take only the filename.
    database = fullpath(args.database[0])
    generate = args.generate
    structure_check = args.structure_check
    force = args.force
    disable_modification_date_checking = args.disable_modification_date_checking
    skip_missing = args.skip_missing
    skip_hash = args.skip_hash
    update = args.update
    append = args.append
    remove = args.remove
    outputpath = None
    if args.output: outputpath = fullpath(args.output[0])
    filescraping = args.filescraping_recovery
    verbose = args.verbose
    silent = args.silent

    if os.path.isfile(inputpath): # if inputpath is a single file (instead of a folder), then define the rootfolderpath as the parent directory (for correct relative path generation, else it will also truncate the filename!)
        rootfolderpath = os.path.dirname(inputpath)

    errors_file = None
    if args.errors_file: errors_file = fullpath(args.errors_file[0])

    # -- Checking arguments
    if structure_check and not structure_check_import:
        raise ImportError('PIL (Python Imaging Library) could not be imported. PIL is needed to do structure check, please install PIL (or you can disable structure check to continue).')

    if update and (not append and not remove):
        raise ValueError('--update specified but neither --append nor --remove. You must specify at least one of these modes when using --update!')

    if filescraping and not outputpath:
        raise ValueError('Output path needed when using --filescraping_recovery.')

    # -- Configure the log file if enabled (ptee.write() will write to both stdout/console and to the log file)
    if args.log:
        ptee = Tee(args.log[0], 'a', nostdout=silent)
        #sys.stdout = Tee(args.log[0], 'a')
        sys.stderr = Tee(args.log[0], 'a', nostdout=silent)
    else:
        ptee = Tee(nostdout=silent)


    # == PROCESSING BRANCHING == #
    retval = 0 # Returned value: 0 OK, 1 KO (files in error), -1 Error

    # -- Update the database file by removing missing files
    if update and remove:
        if not os.path.isfile(database):
            raise NameError('Specified database file does not exist, can\'t update!')

        ptee.write("====================================")
        ptee.write("RIFGC Database Update Removal of missing files, started on %s" % datetime.datetime.now().isoformat())
        ptee.write("====================================")

        # Precompute the total number of lines to process (this should be fairly quick)
        filestodocount = 0
        with open(database, 'rb') as dbf:
            for row in csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"'):
                filestodocount = filestodocount + 1

            # Preparing CSV writer for the temporary file that will have the lines removed
            with open(database+'.rem', 'wb') as dbfilerem:
                csv_writer = csv.writer(dbfilerem, lineterminator='\n', delimiter='|', quotechar='"')

                # Printing CSV headers
                csv_headers = ['path', 'md5', 'sha1', 'last_modification_timestamp', 'last_modification_date', 'size', 'ext']
                csv_writer.writerow(csv_headers)

                dbf.seek(0)
                dbfile = csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"') # we need to reopen the file to put the reading cursor (the generator position) back to the beginning
                delcount = 0
                filescount = 0
                for row in tqdm.tqdm(dbfile, file=ptee, total=filestodocount, leave=True):
                    filescount = filescount + 1
                    filepath = os.path.join(rootfolderpath, row['path']) # Build the absolute file path

                    # Single-file mode: skip if this is not the file we are looking for
                    if inputpath != rootfolderpath and inputpath != filepath: continue

                    if verbose: ptee.write("\n- Processing file %s" % row['path'])
                    errors = []
                    if not os.path.isfile(filepath):
                        delcount = delcount + 1
                        ptee.write("\n- File %s is missing, removed from database." % row['path'])
                    else:
                        csv_writer.writerow( [ path2unix(row['path']), row['md5'], row['sha1'], row['last_modification_timestamp'], row['last_modification_date'], row['size'], row['ext'] ] )

        # REMOVE UPDATE DONE, we remove the old database file and replace it with the new
        os.remove(database) # delete old database
        os.rename(database+'.rem', database) # rename new database to match old name
        # Show some stats
        ptee.write("----------------------------------------------------")
        ptee.write("All files processed: Total: %i - Removed/Missing: %i.\n\n" % (filescount, delcount))

    # -- Generate the database file or update/append (both will walk through the filesystem to get new files, contrary to the other branches which walk through the database csv)
    if generate or (update and append):
        if not force and os.path.isfile(database) and not update:
            raise NameError('Database file already exists. Please choose another name to generate your database file.')

        if generate:
            dbmode = 'wb'
        elif (update and append):
            dbmode = 'ab'
        with open(database, dbmode) as dbfile: # Must open in write + binary, because on Windows it will do weird things otherwise (at least with Python 2.7)
            ptee.write("====================================")
            if generate:
                ptee.write("RIFGC Database Generation started on %s" % datetime.datetime.now().isoformat())
            elif update and append:
                ptee.write("RIFGC Database Update Append new files, started on %s" % datetime.datetime.now().isoformat())
            ptee.write("====================================")

            # Preparing CSV writer
            csv_writer = csv.writer(dbfile, lineterminator='\n', delimiter='|', quotechar='"')

            if generate:
                # Printing CSV headers
                csv_headers = ['path', 'md5', 'sha1', 'last_modification_timestamp', 'last_modification_date', 'size', 'ext']
                csv_writer.writerow(csv_headers)

            if (update and append):
                # Extract all paths already stored in the database to avoid re-adding them
                db_paths = {}
                with open(database, 'rb') as dbf:
                    for row in csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"'):
                        db_paths[row['path']] = True

            # Counting the total number of files that we will have to process
            ptee.write("Counting total number of files to process, please wait...")
            filestodocount = 0
            for _ in tqdm.tqdm(recwalk(inputpath), file=ptee):
                filestodocount = filestodocount + 1
            ptee.write("Counting done.")

            # Recursively traversing the root directory and save the metadata in the db for each file
            ptee.write("Processing files to compute metadata to store in database, please wait...")
            filescount = 0
            addcount = 0
            for (dirpath, filename) in tqdm.tqdm(recwalk(inputpath), file=ptee, total=filestodocount, leave=True):
                    filescount = filescount + 1
                    # Get full absolute filepath
                    filepath = os.path.join(dirpath, filename)
                    # Get database relative path (from scanning root folder)
                    relfilepath = path2unix(os.path.relpath(filepath, rootfolderpath)) # File relative path from the root (so that we can easily check the files later even if the absolute path is different)
                    if verbose: ptee.write("\n- Processing file %s" % relfilepath)

                    # If update + append mode, then if the file is already in the database we skip it (we continue computing metadata only for new files)
                    if update and append and relfilepath in db_paths:
                        if verbose: ptee.write("... skipped")
                        continue
                    else:
                        addcount = addcount + 1

                    # Compute the hashes (kept outside the with statement because generate_hashes() opens the file itself, so that both hashes can be computed in a single sweep of the file)
                    if not skip_hash:
                        md5hash, sha1hash = generate_hashes(filepath)
                    else:
                        md5hash = sha1hash = 0
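                        # 0 is written to the csv as a placeholder; a database generated with --skip_hash should also be checked with --skip_hash, since check mode only skips the hash comparison when that flag is set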
                    # Compute other metadata
                    with open(filepath) as thisfile:
                        # Check file structure if option is enabled
                        if structure_check:
                            struct_result = check_structure(filepath)
                            # Print/Log an error only if there's one (else we won't say anything)
                            if struct_result:
                                ptee.write("\n- Structure error with file "+filepath+": "+struct_result)
                        ext = os.path.splitext(filepath)[1] # File's extension
                        statinfos = os.stat(filepath) # Various OS filesystem infos about the file
                        size = statinfos.st_size # File size
                        lastmodif = statinfos.st_mtime # File last modified date (as a timestamp)
                        lastmodif_readable = datetime.datetime.fromtimestamp(lastmodif).strftime("%Y-%m-%d %H:%M:%S") # File last modified date as a human readable date (ISO universal time)

                        csv_row = [path2unix(relfilepath), md5hash, sha1hash, lastmodif, lastmodif_readable, size, ext] # Prepare the CSV row
                        csv_writer.writerow(csv_row) # Save to the file
        ptee.write("----------------------------------------------------")
        ptee.write("All files processed: Total: %i - Added: %i.\n\n" % (filescount, addcount))

    # -- Filescraping recovery mode
    # We will compare all files from the input path and reorganize the ones that are recognized into the output path
    elif filescraping:
        import shutil
        ptee.write("====================================")
        ptee.write("RIFGC File Scraping Recovery started on %s" % datetime.datetime.now().isoformat())
        ptee.write("====================================")

        ptee.write("Loading the database into memory, please wait...")
        md5list = {}
        sha1list = {}
        dbrows = {} # TODO: instead of keeping everything in memory, store just the reading cursor position at the beginning of each line (plus its length) and read from the db file directly when necessary
        id = 0
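        # id serves as a 1-based row counter mapping each hash back to its csv row (note: it shadows the builtin id(), which is not used in this block)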
        with open(database, 'rb') as db:
            for row in csv.DictReader(db, lineterminator='\n', delimiter='|', quotechar='"'):
                id += 1
                if (len(row['md5']) > 0 and len(row['sha1']) > 0):
                    md5list[row['md5']] = id
                    sha1list[row['sha1']] = id
                    dbrows[id] = row
        ptee.write("Loading done.")

        if len(dbrows) == 0:
            ptee.write("Nothing to do, there's no md5 nor sha1 hashes in the database file!")
            del ptee
            return 1 # return with an error

        # Counting the total number of files that we will have to process
        ptee.write("Counting total number of files to process, please wait...")
        filestodocount = 0
        for _ in tqdm.tqdm(recwalk(inputpath), file=ptee):
            filestodocount = filestodocount + 1
        ptee.write("Counting done.")
        
        # Recursively traversing the root directory and save the metadata in the db for each file
        ptee.write("Processing file scraping recovery, walking through all files from input folder...")
        filescount = 0
        copiedcount = 0
        for (dirpath, filename) in tqdm.tqdm(recwalk(inputpath), file=ptee, total=filestodocount, leave=True):
                filescount = filescount + 1
                # Get full absolute filepath
                filepath = os.path.join(dirpath,filename)
                # Get database relative path (from scanning root folder)
                relfilepath = path2unix(os.path.relpath(filepath, rootfolderpath)) # File relative path from the root (we truncate the rootfolderpath so that we can easily check the files later even if the absolute path is different)
                if verbose: ptee.write("\n- Processing file %s" % relfilepath)

                # Generate the hashes from the currently inspected file
                md5hash, sha1hash = generate_hashes(filepath)
                # If it matches a file in the database, we will copy it over with the correct name, directory structure, file extension and last modification date
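                # Requiring both hashes to resolve to the same database row means a false match would need a simultaneous md5 and sha1 collision, which is extremely unlikely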
                if md5hash in md5list and sha1hash in sha1list and md5list[md5hash] == sha1list[sha1hash]:
                    # Load the db infos for this file
                    row = dbrows[md5list[md5hash]]
                    ptee.write("- Found: %s --> %s.\n" % (filepath, row['path']))
                    # Generate full absolute filepath of the output file
                    outfilepath = os.path.join(outputpath, row['path'])
                    # Recursively create the directory tree structure
                    outfiledir = os.path.dirname(outfilepath)
                    if not os.path.isdir(outfiledir): os.makedirs(outfiledir) # if the target directory does not exist, create it (and create recursively all parent directories too)
                    # Copy over and set attributes
                    shutil.copy2(filepath, outfilepath)
                    filestats = os.stat(filepath)
                    os.utime(outfilepath, (filestats.st_atime, float(row['last_modification_timestamp'])))
                    # Counter...
                    copiedcount += 1
        ptee.write("----------------------------------------------------")
        ptee.write("All files processed: Total: %i - Recovered: %i.\n\n" % (filescount, copiedcount))

    # -- Check mode: check the files using a database file
    elif not update and not generate and not filescraping:
        ptee.write("====================================")
        ptee.write("RIFGC Check started on %s" % datetime.datetime.now().isoformat())
        ptee.write("====================================")

        # Open errors file if supplied (where we will store every error in a formatted csv so that it can later be easily processed by other software, such as repair tools)
        if errors_file is not None:
            efile = open(errors_file, 'wb')
            e_writer = csv.writer(efile, delimiter='|', lineterminator='\n', quotechar='"')

        # Precompute the total number of lines to process (this should be fairly quick)
        filestodocount = 0
        with open(database, 'rb') as dbf:
            for row in csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"'):
                filestodocount = filestodocount + 1

            # Processing the files using the database list
            ptee.write("Checking for files corruption based on database %s on input path %s, please wait..." % (database, inputpath))
            dbf.seek(0)
            dbfile = csv.DictReader(dbf, lineterminator='\n', delimiter='|', quotechar='"') # we need to seek back and recreate the reader to put the reading cursor (the generator position) back to the beginning
            errorscount = 0
            filescount = 0
            for row in tqdm.tqdm(dbfile, file=ptee, total=filestodocount, leave=True):
                filescount = filescount + 1
                filepath = os.path.join(rootfolderpath, row['path'])

                # Single-file mode: skip if this is not the file we are looking for
                if inputpath != rootfolderpath and inputpath != filepath: continue

                if verbose: ptee.write("\n- Processing file %s" % row['path'])
                errors = []
                if not os.path.isfile(filepath):
                    if not skip_missing: errors.append('file is missing')
                # First generate the current file's metadata given the filepath from the CSV, and then we will check the differences from database
                else:
                    try: # Try to be resilient to various file access errors
                        # Generate hash
                        if not skip_hash:
                            md5hash, sha1hash = generate_hashes(filepath)
                        else:
                            md5hash = sha1hash = 0
                        # Check structure integrity if enabled
                        if structure_check:
                            struct_result = check_structure(filepath)
                            if struct_result:
                                errors.append("structure error (%s)" % struct_result)
                        # Compute other metadata
                        with open(filepath) as thisfile:
                            ext = os.path.splitext(filepath)[1]
                            statinfos = os.stat(filepath)
                            size = statinfos.st_size
                            lastmodif = statinfos.st_mtime
                            lastmodif_readable = datetime.datetime.fromtimestamp(lastmodif).strftime("%Y-%m-%d %H:%M:%S")

                            # CHECK THE DIFFERENCES
                            if not skip_hash and md5hash != row['md5'] and sha1hash != row['sha1']:
                                errors.append('both md5 and sha1 hash failed')
                            elif not skip_hash and ((md5hash == row['md5'] and sha1hash != row['sha1']) or (md5hash != row['md5'] and sha1hash == row['sha1'])):
                                errors.append('one of the hashes failed but not the other (which may indicate that the database file is corrupted)')
                            if ext != row['ext']:
                                errors.append('extension has changed')
                            if size != int(row['size']):
                                errors.append("size has changed (before: %s - now: %s)" % (row['size'], size))
                            if not disable_modification_date_checking and (lastmodif != float(row['last_modification_timestamp']) and round(lastmodif,0) != round(float(row['last_modification_timestamp']),0)): # for usage with PyPy: last modification time is differently managed (rounded), thus we need to round here manually to compare against PyPy.
                                errors.append("modification date has changed (before: %s - now: %s)" % (row['last_modification_date'], lastmodif_readable))
                    except IOError as e: # Catch IOError as a file error
                        errors.append('file can\'t be read, IOError (inaccessible, maybe bad sector?)')
                    except Exception as e: # Any other exception when accessing the file will also be caught as a file error
                        errors.append('file can\'t be accessed: %s' % e)
                # Print/Log all errors for this file if any happened
                if errors:
                    errorscount = errorscount + 1
                    ptee.write("\n- Error for file %s: %s." % (row['path'], ', '.join(errors)))
                    if errors_file is not None: # Write error in a csv file if supplied (for easy processing later by other software such as file repair tools)
                        e_writer.writerow( [row['path'], ', '.join(errors)] )
        # END OF CHECKING: show some stats
        ptee.write("----------------------------------------------------")
        ptee.write("All files checked: Total: %i - Files with errors: %i.\n\n" % (filescount, errorscount))
        retval = (errorscount > 0)

    del ptee
    return retval # return error code if any
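
A side note on the errors file written by the check branch above: each row is simply the file's relative path followed by the comma-separated list of errors, using the same '|'-delimited CSV dialect as the database. A minimal sketch of how such a report could be post-processed afterwards, assuming it was produced with -e errors.csv (the filename is only an example):

import csv

with open('errors.csv', 'rb') as f:  # binary mode, matching how the script writes it (Python 2)
    for path, errors in csv.reader(f, delimiter='|', lineterminator='\n', quotechar='"'):
        print('%s -> %s' % (path, errors))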
Esempio n. 8
0
def main(argv=None):
    if argv is None:  # if argv is empty, fetch from the commandline
        argv = sys.argv[1:]
    elif isinstance(
            argv, basestring
    ):  # else if argv is supplied but it's a simple string, we need to parse it to a list of arguments before handing to argparse or any other argument parser
        argv = shlex.split(argv)  # Parse string just like argv using shlex

    #==== COMMANDLINE PARSER ====

    #== Commandline description
    desc = '''Recursive/Relative Files Integrity Generator and Checker
Description: Recursively generate or check the integrity of files by MD5 and SHA1 hashes, size, modification date or by data structure integrity (only for images).

This script is originally meant for data archival, providing an easy way to check for silent file corruption. Thus, this script uses relative paths so that you can easily compute and check the same redundant data copied onto different media (hard drives, optical discs, etc.). This script is not meant for system file corruption notification, but rather to be run from time to time to check up on your data archives' integrity.
    '''
    ep = '''Example usage:
- To generate the database (only needed once):
python rfigc.py -i "folderimages" -d "dbhash.csv" -g
- To check:
python rfigc.py -i "folderimages" -d "dbhash.csv" -l log.txt -s
- To update your database by appending new files:
python rfigc.py -i "folderimages" -d "dbhash.csv" -u -a 
- To update your database by appending new files AND removing inexistent files:
python rfigc.py -i "folderimages" -d "dbhash.csv" -u -a -r
- To use with a gui:
python rfigc.py --gui

Note that the script is in check mode by default, to avoid accidental misuse. It will also alert you if you try to generate over an already existing database file.
Note2: you can use PyPy to speed up generation, but you should avoid using PyPy in check mode (from our tests, it slows things down a lot).
'''

    #== Commandline arguments
    #-- Constructing the parser
    # Use GooeyParser if we want the GUI because it will provide better widgets
    if len(argv) > 0 and (argv[0] == '--gui' and
                          not '--ignore-gooey' in argv):  # pragma: no cover
        # Initialize the Gooey parser
        main_parser = gooey.GooeyParser(
            add_help=True,
            description=desc,
            epilog=ep,
            formatter_class=argparse.RawTextHelpFormatter)
        # Define Gooey widget types explicitly (because type auto-detection doesn't work quite well)
        widget_dir = {"widget": "DirChooser"}
        widget_filesave = {"widget": "FileSaver"}
        widget_file = {"widget": "FileChooser"}
        widget_text = {"widget": "TextField"}
    else:  # Else in command-line usage, use the standard argparse
        # Delete the special argument to avoid unrecognized argument error in argparse
        if argv and '--ignore-gooey' in argv[0]:  # guard against an empty argv
            argv.remove(
                '--ignore-gooey'
            )  # this argument is automatically fed by Gooey when the user clicks on Start
        # Initialize the normal argparse parser
        main_parser = argparse.ArgumentParser(
            add_help=True,
            description=desc,
            epilog=ep,
            formatter_class=argparse.RawTextHelpFormatter)
        # Define dummy dicts to keep compatibility with command-line usage
        widget_dir = {}
        widget_filesave = {}
        widget_file = {}
        widget_text = {}
    # Required arguments
    main_parser.add_argument(
        '-i',
        '--input',
        metavar='/path/to/root/folder',
        type=is_dir_or_file,
        nargs=1,
        required=True,
        help=
        'Path to the root folder (or a single file) from where the scanning will occur.',
        **widget_dir)
    main_parser.add_argument(
        '-d',
        '--database',
        metavar='/some/folder/databasefile.csv',
        type=str,
        nargs=1,
        required=True,  #type=argparse.FileType('rt')
        help='Path to the csv file containing the hash information.',
        **widget_filesave)

    # Optional general arguments
    main_parser.add_argument(
        '-l',
        '--log',
        metavar='/some/folder/filename.log',
        type=str,
        nargs=1,
        required=False,
        help=
        'Path to the log file. (Output will be piped to both stdout and the log file)',
        **widget_filesave)
    main_parser.add_argument(
        '--skip_hash',
        action='store_true',
        required=False,
        default=False,
        help=
        'Skip hash computation/checking (checks only the other metadata, this is a lot quicker).'
    )
    main_parser.add_argument('-v',
                             '--verbose',
                             action='store_true',
                             required=False,
                             default=False,
                             help='Verbose mode (show more output).')
    main_parser.add_argument(
        '--silent',
        action='store_true',
        required=False,
        default=False,
        help=
        'No console output (but if --log specified, the log will still be saved in the specified file).'
    )

    # Checking mode arguments
    main_parser.add_argument('-s',
                             '--structure_check',
                             action='store_true',
                             required=False,
                             default=False,
                             help='Check images structures for corruption?')
    main_parser.add_argument(
        '-e',
        '--errors_file',
        metavar='/some/folder/errorsfile.csv',
        type=str,
        nargs=1,
        required=False,  #type=argparse.FileType('rt')
        help=
        'Path to the errors file, where errors found at check time will be stored in CSV for further processing by other software (such as file repair tools).',
        **widget_filesave)
    main_parser.add_argument('-m',
                             '--disable_modification_date_checking',
                             action='store_true',
                             required=False,
                             default=False,
                             help='Disable modification date checking.')
    main_parser.add_argument(
        '--skip_missing',
        action='store_true',
        required=False,
        default=False,
        help=
        'Skip missing files when checking (useful if you split your files across several media, for example optical discs with limited capacity).'
    )

    # Generate mode arguments
    main_parser.add_argument(
        '-g',
        '--generate',
        action='store_true',
        required=False,
        default=False,
        help=
        'Generate the database? (omit this parameter to check instead of generating).'
    )
    main_parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        required=False,
        default=False,
        help=
        'Force overwriting the database file even if it already exists (if --generate).'
    )

    # Update mode arguments
    main_parser.add_argument(
        '-u',
        '--update',
        action='store_true',
        required=False,
        default=False,
        help='Update database (you must also specify --append or --remove).')
    main_parser.add_argument('-a',
                             '--append',
                             action='store_true',
                             required=False,
                             default=False,
                             help='Append new files (if --update).')
    main_parser.add_argument('-r',
                             '--remove',
                             action='store_true',
                             required=False,
                             default=False,
                             help='Remove missing files (if --update).')

    # Recover from file scraping
    main_parser.add_argument(
        '--filescraping_recovery',
        action='store_true',
        required=False,
        default=False,
        help=
        'Given a folder of unorganized files, compare to the database and restore the filename and directory structure into the output folder.'
    )
    main_parser.add_argument(
        '-o',
        '--output',
        metavar='/path/to/root/folder',
        type=is_dir,
        nargs=1,
        required=False,
        help=
        'Path to the output folder where the files reorganized by --filescraping_recovery will be copied.',
        **widget_dir)

    #== Parsing the arguments
    args = main_parser.parse_args(argv)  # Storing all arguments to args

    #-- Set variables from arguments
    inputpath = fullpath(
        args.input[0]
    )  # path to the files to protect (either a folder or a single file)
    rootfolderpath = inputpath  # path to the root folder (to compute relative paths)
    #database = os.path.basename(fullpath(args.database[0])) # Take only the filename.
    database = fullpath(args.database[0])
    generate = args.generate
    structure_check = args.structure_check
    force = args.force
    disable_modification_date_checking = args.disable_modification_date_checking
    skip_missing = args.skip_missing
    skip_hash = args.skip_hash
    update = args.update
    append = args.append
    remove = args.remove
    outputpath = None
    if args.output: outputpath = fullpath(args.output[0])
    filescraping = args.filescraping_recovery
    verbose = args.verbose
    silent = args.silent

    if os.path.isfile(
            inputpath
    ):  # if inputpath is a single file (instead of a folder), then define the rootfolderpath as the parent directory (for correct relative path generation, else it will also truncate the filename!)
        rootfolderpath = os.path.dirname(inputpath)

    errors_file = None
    if args.errors_file: errors_file = fullpath(args.errors_file[0])

    # -- Checking arguments
    if structure_check and not structure_check_import:
        raise ImportError(
            'PIL (Python Imaging Library) could not be imported. PIL is needed to perform the structure check; please install PIL (or disable the structure check to continue).'
        )

    if update and (not append and not remove):
        raise ValueError(
            '--update specified but neither --append nor --remove. You must specify at least one of these modes when using --update!'
        )

    if filescraping and not outputpath:
        raise ValueError(
            'Output path (--output) is needed when --filescraping_recovery is used.')

    # -- Configure the log file if enabled (ptee.write() will write to both stdout/console and to the log file)
    if args.log:
        ptee = Tee(args.log[0], 'a', nostdout=silent)
        #sys.stdout = Tee(args.log[0], 'a')
        sys.stderr = Tee(args.log[0], 'a', nostdout=silent)
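        # stderr is also teed so that tracebacks end up in the log file as well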
    else:
        ptee = Tee(nostdout=silent)

    # == PROCESSING BRANCHING == #
    retval = 0  # Returned value: 0 OK, 1 KO (files in error), -1 Error

    # -- Update the database file by removing missing files
    if update and remove:
        if not os.path.isfile(database):
            raise NameError(
                'Specified database file does not exist, can\'t update!')

        ptee.write("====================================")
        ptee.write(
            "RIFGC Database Update Removal of missing files, started on %s" %
            datetime.datetime.now().isoformat())
        ptee.write("====================================")

        # Precompute the total number of lines to process (this should be fairly quick)
        filestodocount = 0
        with open(database, 'rb') as dbf:
            for row in csv.DictReader(dbf,
                                      lineterminator='\n',
                                      delimiter='|',
                                      quotechar='"'):
                filestodocount = filestodocount + 1

            # Preparing CSV writer for the temporary file that will have the lines removed
            with open(database + '.rem', 'wb') as dbfilerem:
                csv_writer = csv.writer(dbfilerem,
                                        lineterminator='\n',
                                        delimiter='|',
                                        quotechar='"')

                # Printing CSV headers
                csv_headers = [
                    'path', 'md5', 'sha1', 'last_modification_timestamp',
                    'last_modification_date', 'size', 'ext'
                ]
                csv_writer.writerow(csv_headers)

                dbf.seek(0)
                dbfile = csv.DictReader(
                    dbf, lineterminator='\n', delimiter='|', quotechar='"'
                )  # we need to seek back and recreate the reader to put the reading cursor (the generator position) back to the beginning
                delcount = 0
                filescount = 0
                for row in tqdm.tqdm(dbfile,
                                     file=ptee,
                                     total=filestodocount,
                                     leave=True):
                    filescount = filescount + 1
                    filepath = os.path.join(
                        rootfolderpath,
                        row['path'])  # Build the absolute file path

                    # Single-file mode: skip if this is not the file we are looking for
                    if inputpath != rootfolderpath and inputpath != filepath:
                        continue

                    if verbose:
                        ptee.write("\n- Processing file %s" % row['path'])
                    errors = []
                    if not os.path.isfile(filepath):
                        delcount = delcount + 1
                        ptee.write(
                            "\n- File %s is missing, removed from database." %
                            row['path'])
                    else:
                        csv_writer.writerow([
                            path2unix(row['path']), row['md5'], row['sha1'],
                            row['last_modification_timestamp'],
                            row['last_modification_date'], row['size'],
                            row['ext']
                        ])

        # REMOVE UPDATE DONE, we remove the old database file and replace it with the new
        os.remove(database)  # delete old database
        os.rename(database + '.rem',
                  database)  # rename new database to match old name
        # Show some stats
        ptee.write("----------------------------------------------------")
        ptee.write(
            "All files processed: Total: %i - Removed/Missing: %i.\n\n" %
            (filescount, delcount))

    # -- Generate the database file or update/append (both will walk through the filesystem to get new files, unlike the other branches which walk through the database csv)
    if generate or (update and append):
        if not force and os.path.isfile(database) and not update:
            raise NameError(
                'Database file already exists. Please choose another name to generate your database file.'
            )

        if generate:
            dbmode = 'wb'
        elif (update and append):
            dbmode = 'ab'
        with open(
                database, dbmode
        ) as dbfile:  # Must open in write + binary, because on Windows it will do weird things otherwise (at least with Python 2.7)
            ptee.write("====================================")
            if generate:
                ptee.write("RIFGC Database Generation started on %s" %
                           datetime.datetime.now().isoformat())
            elif update and append:
                ptee.write(
                    "RIFGC Database Update Append new files, started on %s" %
                    datetime.datetime.now().isoformat())
            ptee.write("====================================")

            # Preparing CSV writer
            csv_writer = csv.writer(dbfile,
                                    lineterminator='\n',
                                    delimiter='|',
                                    quotechar='"')

            if generate:
                # Printing CSV headers
                csv_headers = [
                    'path', 'md5', 'sha1', 'last_modification_timestamp',
                    'last_modification_date', 'size', 'ext'
                ]
                csv_writer.writerow(csv_headers)

            if (update and append):
                # Extract all paths already stored in the database to avoid re-adding them
                db_paths = {}
                with open(database, 'rb') as dbf:
                    for row in csv.DictReader(dbf,
                                              lineterminator='\n',
                                              delimiter='|',
                                              quotechar='"'):
                        db_paths[row['path']] = True

            # Counting the total number of files that we will have to process
            ptee.write(
                "Counting total number of files to process, please wait...")
            filestodocount = 0
            for _ in tqdm.tqdm(recwalk(inputpath), file=ptee):
                filestodocount = filestodocount + 1
            ptee.write("Counting done.")

            # Recursively traversing the root directory and save the metadata in the db for each file
            ptee.write(
                "Processing files to compute metadata to store in database, please wait..."
            )
            filescount = 0
            addcount = 0
            for (dirpath, filename) in tqdm.tqdm(recwalk(inputpath),
                                                 file=ptee,
                                                 total=filestodocount,
                                                 leave=True):
                filescount = filescount + 1
                # Get full absolute filepath
                filepath = os.path.join(dirpath, filename)
                # Get database relative path (from scanning root folder)
                relfilepath = path2unix(
                    os.path.relpath(filepath, rootfolderpath)
                )  # File relative path from the root (so that we can easily check the files later even if the absolute path is different)
                if verbose: ptee.write("\n- Processing file %s" % relfilepath)

                # If update + append mode, then if the file is already in the database we skip it (we continue computing metadata only for new files)
                if update and append and relfilepath in db_paths:
                    if verbose: ptee.write("... skipped")
                    continue
                else:
                    addcount = addcount + 1

                # Compute the hashes (kept outside the with statement because generate_hashes() opens the file itself, so that both hashes can be computed in a single sweep of the file)
                if not skip_hash:
                    md5hash, sha1hash = generate_hashes(filepath)
                else:
                    md5hash = sha1hash = 0
                # Compute other metadata
                with open(filepath) as thisfile:
                    # Check file structure if option is enabled
                    if structure_check:
                        struct_result = check_structure(filepath)
                        # Print/Log an error only if there's one (else we won't say anything)
                        if struct_result:
                            ptee.write("\n- Structure error with file " +
                                       filepath + ": " + struct_result)
                    ext = os.path.splitext(filepath)[1]  # File's extension
                    statinfos = os.stat(
                        filepath)  # Various OS filesystem infos about the file
                    size = statinfos.st_size  # File size
                    lastmodif = statinfos.st_mtime  # File last modified date (as a timestamp)
                    lastmodif_readable = datetime.datetime.fromtimestamp(
                        lastmodif
                    ).strftime(
                        "%Y-%m-%d %H:%M:%S"
                    )  # File last modified date as a human readable date (ISO universal time)

                    csv_row = [
                        path2unix(relfilepath), md5hash, sha1hash, lastmodif,
                        lastmodif_readable, size, ext
                    ]  # Prepare the CSV row
                    csv_writer.writerow(csv_row)  # Save to the file
        ptee.write("----------------------------------------------------")
        ptee.write("All files processed: Total: %i - Added: %i.\n\n" %
                   (filescount, addcount))

    # -- Filescraping recovery mode
    # We will compare all files from the input path and reorganize the ones that are recognized into the output path
    elif filescraping:
        import shutil
        ptee.write("====================================")
        ptee.write("RIFGC File Scraping Recovery started on %s" %
                   datetime.datetime.now().isoformat())
        ptee.write("====================================")

        ptee.write("Loading the database into memory, please wait...")
        md5list = {}
        sha1list = {}
        dbrows = {}  # TODO: instead of keeping everything in memory, store just the reading cursor position at the beginning of each line (plus its length) and read from the db file directly when necessary
        id = 0
        with open(database, 'rb') as db:
            for row in csv.DictReader(db,
                                      lineterminator='\n',
                                      delimiter='|',
                                      quotechar='"'):
                id += 1
                if (len(row['md5']) > 0 and len(row['sha1']) > 0):
                    md5list[row['md5']] = id
                    sha1list[row['sha1']] = id
                    dbrows[id] = row
        ptee.write("Loading done.")

        if len(dbrows) == 0:
            ptee.write(
                "Nothing to do, there's no md5 nor sha1 hashes in the database file!"
            )
            del ptee
            return 1  # return with an error

        # Counting the total number of files that we will have to process
        ptee.write("Counting total number of files to process, please wait...")
        filestodocount = 0
        for _ in tqdm.tqdm(recwalk(inputpath), file=ptee):
            filestodocount = filestodocount + 1
        ptee.write("Counting done.")

        # Recursively traversing the root directory and save the metadata in the db for each file
        ptee.write(
            "Processing file scraping recovery, walking through all files from input folder..."
        )
        filescount = 0
        copiedcount = 0
        for (dirpath, filename) in tqdm.tqdm(recwalk(inputpath),
                                             file=ptee,
                                             total=filestodocount,
                                             leave=True):
            filescount = filescount + 1
            # Get full absolute filepath
            filepath = os.path.join(dirpath, filename)
            # Get database relative path (from scanning root folder)
            relfilepath = path2unix(
                os.path.relpath(filepath, rootfolderpath)
            )  # File relative path from the root (we truncate the rootfolderpath so that we can easily check the files later even if the absolute path is different)
            if verbose: ptee.write("\n- Processing file %s" % relfilepath)

            # Generate the hashes from the currently inspected file
            md5hash, sha1hash = generate_hashes(filepath)
            # If it matches a file in the database, we will copy it over with the correct name, directory structure, file extension and last modification date
            if md5hash in md5list and sha1hash in sha1list and md5list[
                    md5hash] == sha1list[sha1hash]:
                # Load the db infos for this file
                row = dbrows[md5list[md5hash]]
                ptee.write("- Found: %s --> %s.\n" % (filepath, row['path']))
                # Generate full absolute filepath of the output file
                outfilepath = os.path.join(outputpath, row['path'])
                # Recursively create the directory tree structure
                outfiledir = os.path.dirname(outfilepath)
                if not os.path.isdir(outfiledir):
                    os.makedirs(
                        outfiledir
                    )  # if the target directory does not exist, create it (and create recursively all parent directories too)
                # Copy over and set attributes
                shutil.copy2(filepath, outfilepath)
                filestats = os.stat(filepath)
                os.utime(outfilepath,
                         (filestats.st_atime,
                          float(row['last_modification_timestamp'])))
                # Counter...
                copiedcount += 1
        ptee.write("----------------------------------------------------")
        ptee.write("All files processed: Total: %i - Recovered: %i.\n\n" %
                   (filescount, copiedcount))

    # -- Check mode: check the files using a database file
    elif not update and not generate and not filescraping:
        ptee.write("====================================")
        ptee.write("RIFGC Check started on %s" %
                   datetime.datetime.now().isoformat())
        ptee.write("====================================")

        # Open errors file if supplied (where we will store every error in a formatted csv so that it can later be easily processed by other software, such as repair tools)
        if errors_file is not None:
            efile = open(errors_file, 'wb')
            e_writer = csv.writer(efile,
                                  delimiter='|',
                                  lineterminator='\n',
                                  quotechar='"')

        # Precompute the total number of lines to process (this should be fairly quick)
        filestodocount = 0
        with open(database, 'rb') as dbf:
            for row in csv.DictReader(dbf,
                                      lineterminator='\n',
                                      delimiter='|',
                                      quotechar='"'):
                filestodocount = filestodocount + 1

            # Processing the files using the database list
            ptee.write(
                "Checking for files corruption based on database %s on input path %s, please wait..."
                % (database, inputpath))
            dbf.seek(0)
            dbfile = csv.DictReader(
                dbf, lineterminator='\n', delimiter='|', quotechar='"'
            )  # we need to seek back and recreate the reader to put the reading cursor (the generator position) back to the beginning
            errorscount = 0
            filescount = 0
            for row in tqdm.tqdm(dbfile,
                                 file=ptee,
                                 total=filestodocount,
                                 leave=True):
                filescount = filescount + 1
                filepath = os.path.join(rootfolderpath, row['path'])

                # Single-file mode: skip if this is not the file we are looking for
                if inputpath != rootfolderpath and inputpath != filepath:
                    continue

                if verbose: ptee.write("\n- Processing file %s" % row['path'])
                errors = []
                if not os.path.isfile(filepath):
                    if not skip_missing: errors.append('file is missing')
                # First generate the current file's metadata given the filepath from the CSV, and then we will check the differences from database
                else:
                    try:  # Try to be resilient to various file access errors
                        # Generate hash
                        if not skip_hash:
                            md5hash, sha1hash = generate_hashes(filepath)
                        else:
                            md5hash = sha1hash = 0
                        # Check structure integrity if enabled
                        if structure_check:
                            struct_result = check_structure(filepath)
                            if struct_result:
                                errors.append("structure error (%s)" %
                                              struct_result)
                        # Compute other metadata
                        with open(filepath) as thisfile:
                            ext = os.path.splitext(filepath)[1]
                            statinfos = os.stat(filepath)
                            size = statinfos.st_size
                            lastmodif = statinfos.st_mtime
                            lastmodif_readable = datetime.datetime.fromtimestamp(
                                lastmodif).strftime("%Y-%m-%d %H:%M:%S")

                            # CHECK THE DIFFERENCES
                            if not skip_hash and md5hash != row[
                                    'md5'] and sha1hash != row['sha1']:
                                errors.append('both md5 and sha1 hash failed')
                            elif not skip_hash and (
                                (md5hash == row['md5']
                                 and sha1hash != row['sha1']) or
                                (md5hash != row['md5']
                                 and sha1hash == row['sha1'])):
                                errors.append(
                                    'one of the hashes failed but not the other (which may indicate that the database file is corrupted)'
                                )
                            if ext != row['ext']:
                                errors.append('extension has changed')
                            if size != int(row['size']):
                                errors.append(
                                    "size has changed (before: %s - now: %s)" %
                                    (row['size'], size))
                            if not disable_modification_date_checking and (
                                    lastmodif != float(
                                        row['last_modification_timestamp'])
                                    and round(lastmodif, 0) != round(
                                        float(row[
                                            'last_modification_timestamp']), 0)
                            ):  # for usage with PyPy: last modification time is differently managed (rounded), thus we need to round here manually to compare against PyPy.
                                errors.append(
                                    "modification date has changed (before: %s - now: %s)"
                                    % (row['last_modification_date'],
                                       lastmodif_readable))
                    except IOError as e:  # Catch IOError as a file error
                        errors.append(
                            'file can\'t be read, IOError (inaccessible, maybe bad sector?)'
                        )
                    except Exception as e:  # Any other exception when accessing the file will also be caught as a file error
                        errors.append('file can\'t be accessed: %s' % e)
                # Print/Log all errors for this file if any happened
                if errors:
                    errorscount = errorscount + 1
                    ptee.write("\n- Error for file %s: %s." %
                               (row['path'], ', '.join(errors)))
                    if errors_file is not None:  # Write error in a csv file if supplied (for easy processing later by other software such as file repair tools)
                        e_writer.writerow([row['path'], ', '.join(errors)])
        # END OF CHECKING: show some stats
        ptee.write("----------------------------------------------------")
        ptee.write(
            "All files checked: Total: %i - Files with errors: %i.\n\n" %
            (filescount, errorscount))
        retval = (errorscount > 0)

    del ptee
    return retval  # return error code if any
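
A minimal usage sketch, assuming the script above is importable as rfigc (a hypothetical module name) and that the example paths from the epilog exist. main() accepts either an argv list or a plain string, which it tokenizes with shlex.split(), so it can be driven programmatically:

import rfigc  # hypothetical module name for the script above

rfigc.main('-i "folderimages" -d "dbhash.csv" -g')     # generate the database once
ret = rfigc.main('-i "folderimages" -d "dbhash.csv"')  # later, check the folder against it
print('corruption detected' if ret else 'all files ok')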