Example #1
0
def main(argv=None):
    """Repair the structural markers of an ecc file, working on a copy.

    The input ecc file is copied to the output path, then the copy is patched
    in place in up to two stages:

    1. Index backup repair (only when ``--index`` is given): the index file is
       read in fixed-size blocks, each block is checked against its own ecc
       (and corrected if possible), and the marker it points to is rewritten
       in the ecc file when it differs from the expected marker.
    2. Heuristic repair (always run): the ecc file is scanned for substrings
       whose Hamming distance to one of the markers is non-zero but within
       ``--threshold``; those positions are overwritten with the marker.

    Args:
        argv: Command-line arguments. ``None`` means ``sys.argv[1:]``; a plain
            string is split with ``shlex.split``; otherwise a list of
            arguments. Note that ``--ignore-gooey`` is removed from the list
            in place.

    Returns:
        0 once the repair has been written.

    Raises:
        NameError: If the input ecc file or the index file does not exist, or
            if the output file already exists and ``--force`` was not given.
        ValueError: If the hard-coded RS max block size is outside 2..255.
    """
    if argv is None: # no argv supplied: fetch from the commandline
        argv = sys.argv[1:]
    elif isinstance(argv, basestring): # a simple string must be parsed into a list of arguments before handing it to the argument parser
        argv = shlex.split(argv) # Parse string just like argv using shlex

    #==== COMMANDLINE PARSER ====

    #== Commandline description
    desc = '''ECC file repairer
Description: Repair the structure of an ecc file, mainly the ecc markers, so that at least the ecc correction can align correctly the ecc entries and fields.
Note: An ecc structure repair does NOT allow to recover from more errors on your files, it only allows to repair an ecc file so that its structure is valid and can be read correctly.
    '''
    ep = ''' '''

    #== Commandline arguments
    #-- Constructing the parser
    # Use GooeyParser if we want the GUI because it will provide better widgets
    if len(argv) > 0 and (argv[0] == '--gui' and '--ignore-gooey' not in argv):  # pragma: no cover
        # Initialize the Gooey parser
        main_parser = gooey.GooeyParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter)
        # Define Gooey widget types explicitly (because type auto-detection doesn't work quite well)
        widget_dir = {"widget": "DirChooser"}
        widget_filesave = {"widget": "FileSaver"}
        widget_file = {"widget": "FileChooser"}
        widget_text = {"widget": "TextField"}
    else: # Else in command-line usage, use the standard argparse
        # Delete the special argument to avoid an unrecognized argument error in argparse.
        # Membership is tested on the whole list: indexing argv[0] would raise IndexError on an empty
        # argument list, and a substring test on argv[0] could make remove() raise ValueError.
        if '--ignore-gooey' in argv: argv.remove('--ignore-gooey') # this argument is automatically fed by Gooey when the user clicks on Start
        # Initialize the normal argparse parser
        main_parser = argparse.ArgumentParser(add_help=True, description=desc, epilog=ep, formatter_class=argparse.RawTextHelpFormatter)
        # Define dummy dicts to stay compatible with command-line usage
        widget_dir = {}
        widget_filesave = {}
        widget_file = {}
        widget_text = {}
    # Required arguments
    main_parser.add_argument('-i', '--input', metavar='eccfile.txt', type=str, required=True,
                        help='Path to the ecc file to repair.', **widget_file)
    main_parser.add_argument('-o', '--output', metavar='eccfile_repaired.txt', type=str, required=True, #type=argparse.FileType('rt')
                        help='Output path where to save the repaired ecc file.', **widget_filesave)
    # NB: argparse %-formats help strings, so a literal percent sign must be written '%%'
    main_parser.add_argument('-t', '--threshold', type=float, default=0.3, required=False,
                        help='Distance threshold for the heuristic hamming distance repair. This must be a float, eg, 0.2 means that if there are 20%% characters different between an ecc marker and a substring in the ecc file, it will be detected as a marker and corrected.', **widget_text)

    # Optional general arguments
    main_parser.add_argument('--index', metavar='eccfile.txt.idx', type=str, required=False,
                        help='Path to the index backup file corresponding to the ecc file (optional but helps a lot).', **widget_file)
    main_parser.add_argument('--ecc_algo', type=int, default=1, required=False,
                        help='What algorithm use to generate and verify the ECC? Values possible: 1-4. 1 is the formal, fully verified Reed-Solomon in base 3 ; 2 is a faster implementation but still based on the formal base 3 ; 3 is an even faster implementation but based on another library which may not be correct ; 4 is the fastest implementation supporting US FAA ADSB UAT RS FEC standard but is totally incompatible with the other three (a text encoded with any of 1-3 modes will be decodable with any one of them).', **widget_text)
    main_parser.add_argument('-l', '--log', metavar='/some/folder/filename.log', type=str, required=False,
                        help='Path to the log file. (Output will be piped to both the stdout and the log file)', **widget_filesave)
    main_parser.add_argument('-v', '--verbose', action='store_true', required=False, default=False,
                        help='Verbose mode (show more output).')
    main_parser.add_argument('--silent', action='store_true', required=False, default=False,
                        help='No console output (but if --log specified, the log will still be saved in the specified file).')

    main_parser.add_argument('-f', '--force', action='store_true', required=False, default=False,
                        help='Force overwriting the ecc file even if it already exists (if --generate).')


    #== Parsing the arguments
    args = main_parser.parse_args(argv) # Storing all arguments to args

    #-- Set hard-coded variables
    # Marker signalling the beginning of an ecc entry. An alternating pattern of several characters is used
    # on purpose: with a repeated character (eg "AAA"), the tail of the previous ecc block could be mistaken
    # for the start of the marker and that block would lose characters usable for repair.
    entrymarker = "\xFE\xFF\xFE\xFF\xFE\xFF\xFE\xFF\xFE\xFF"
    field_delim = "\xFA\xFF\xFA\xFF\xFA" # delimiter between fields (filepath, filesize, hash+ecc blocks) inside an ecc entry
    markers = [entrymarker, field_delim] # put them in a list for easy reference
    max_marker_len = max(len(entrymarker), len(field_delim))
    max_block_size = 27
    resilience_rate = 1

    #-- Set variables from arguments
    inputpath = fullpath(args.input)
    outputpath = fullpath(args.output)
    distance_threshold = args.threshold
    indexpath = None
    if args.index: indexpath = fullpath(args.index)
    force = args.force
    ecc_algo = args.ecc_algo
    verbose = args.verbose
    silent = args.silent

    # -- Checking arguments
    if not os.path.isfile(inputpath):
        raise NameError('Specified database ecc file %s does not exist!' % inputpath)
    if os.path.isfile(outputpath) and not force:
        raise NameError('Specified output path for the repaired ecc file %s already exists! Use --force if you want to overwrite.' % outputpath)
    if indexpath and not os.path.isfile(indexpath):
        raise NameError('Specified index backup file %s does not exist!' % indexpath)

    if max_block_size < 2 or max_block_size > 255:
        raise ValueError('RS max block size must be between 2 and 255.')

    # -- Configure the log file if enabled (ptee.write() will write to both stdout/console and to the log file)
    if args.log:
        ptee = Tee(args.log, 'a', nostdout=silent)
        #sys.stdout = Tee(args.log, 'a')
        sys.stderr = Tee(args.log, 'a', nostdout=silent)
    else:
        ptee = Tee(nostdout=silent)


    # == PROCESSING BRANCHING == #

    # Precompute some parameters and load up ecc manager objects (big optimization as g_exp and g_log tables calculation is done only once)
    hasher_none = Hasher('none') # for index ecc we don't use any hash
    ecc_params_idx = compute_ecc_params(max_block_size, resilience_rate, hasher_none)
    ecc_manager_idx = ECCMan(max_block_size, ecc_params_idx["message_size"], algo=ecc_algo)

    # == Main loop
    ptee.write("====================================")
    ptee.write("ECC repair, started on %s" % datetime.datetime.now().isoformat())
    ptee.write("====================================")
    ptee.write("Please note that this tool may not know if it found all the markers, so it may miss too much corrupted markers but it will repair the ones it finds (except if you have a fully valid index file, then you are guaranteed to always find all markers).")

    ecc_size = os.stat(inputpath).st_size
    if indexpath: idx_size = os.stat(indexpath).st_size
    # All repairs are applied to a copy, the input file is never modified
    shutil.copy2(inputpath, outputpath)
    blocksize = 65535
    with open(outputpath, 'r+b') as db:

        # == Index backup repair
        # This repair needs an index backup file which is normally generated at the same time as the ecc file. The index backup file stores the position of all ecc markers in the corresponding ecc file, and protects those positions using ecc.
        if indexpath:
            ptee.write("Using the index backup file %s to repair ecc markers, please wait..." % args.index)
            db.seek(0) # seek to the beginning of the database file
            idx_corrupted = 0
            idx_corrected = 0
            idx_total = 0
            markers_repaired = [0] * len(markers)
            idx_msg_size = ecc_params_idx["message_size"]
            bardisp = tqdm.tqdm(total=idx_size, file=ptee, leave=True, desc='IDXREAD', unit='B', unit_scale=True) # progress bar based on the bytes read from the index file
            with open(indexpath, 'rb') as dbidx:
                buf = 1
                while buf:
                    # Format of the index backup file: for each entrymarker or field_delim, one fixed-size block is appended, made of
                    # the marker type on one byte (1 for entrymarker, 2 for field_delim), the marker's position in the ecc file as an
                    # unsigned long long (fixed 8 bytes), and finally an ecc of fixed size covering both type and position.
                    # The constant block size is on purpose: the structure of the index file is implicit (no markers needed),
                    # whereas the structure of the ecc file is explicit (it needs uncorrupted markers).
                    # eg of two blocks: 10000008Aecceccecc2000000F2ecceccecc
                    #
                    # Read one index block
                    curpos = dbidx.tell() # backup current position for error messages
                    buf = dbidx.read(max_block_size)
                    # Update progress bar
                    bardisp.update(dbidx.tell()-bardisp.n)
                    # If we have reached EOF, then we stop here
                    if not buf: break

                    # Else it's ok we have an index block, we process it
                    idx_total += 1
                    # Extract the marker's infos and the ecc
                    marker_str = buf[:idx_msg_size]
                    ecc = buf[idx_msg_size:]
                    # Check if the marker's infos are corrupted, if yes, then we will try to fix that using the ecc
                    if not ecc_manager_idx.check(marker_str, ecc):
                        # Trying to fix the marker's infos using the ecc
                        idx_corrupted += 1
                        marker_repaired, repaired_ecc = ecc_manager_idx.decode(marker_str, ecc)
                        # Repaired the marker's infos, all is good!
                        if ecc_manager_idx.check(marker_repaired, repaired_ecc):
                            marker_str = marker_repaired
                            idx_corrected += 1
                        # Else it's corrupted beyond repair, just skip
                        else:
                            ptee.write("\n- Index backup file: error on block starting at %i, corrupted and could not fix it. Skipping." % curpos)
                            marker_str = None
                            continue
                    if not marker_str: continue

                    # Repair ecc file's marker using our correct (or repaired) marker's infos
                    marker_type = int(marker_str[0]) # marker's type is always stored on the first byte/character
                    marker_pos = struct.unpack('>Q', marker_str[1:])[0] # marker's position is encoded as a big-endian unsigned long long, in a 8 bytes/chars string
                    expected_marker = markers[marker_type-1]
                    db.seek(marker_pos) # move the ecc reading cursor to the beginning of the marker
                    current_marker = db.read(len(expected_marker)) # read the current marker (potentially corrupted)
                    db.seek(marker_pos)
                    if verbose:
                        # Go through ptee (not print) so that --silent and --log are honored
                        ptee.write("- Found marker by index file: type=%i content=" % (marker_type))
                        ptee.write(db.read(len(expected_marker)+4))
                        db.seek(marker_pos) # replace the reading cursor back in place before the marker
                    if current_marker != expected_marker: # check if we really need to repair this marker
                        # Rewrite the marker over the ecc file
                        db.write(expected_marker)
                        markers_repaired[marker_type-1] += 1
                    elif verbose:
                        ptee.write("skipped, no need to repair")
            # Done the index backup repair
            if bardisp.n > bardisp.total: bardisp.n = bardisp.total # just a workaround in case there's one byte more than the predicted total
            bardisp.close()
            ptee.write("Done. Total: %i/%i markers repaired (%i entrymarkers and %i field_delim), %i indexes corrupted and %i indexes repaired (%i indexes lost).\n" % (markers_repaired[0]+markers_repaired[1], idx_total, markers_repaired[0], markers_repaired[1], idx_corrupted, idx_corrected, idx_corrupted-idx_corrected) )

        # == Heuristical Greedy Hamming distance repair
        # This heuristic needs no other file than the ecc file. It works in two steps: first scan the whole file to collect the positions of
        # corrupted markers, then rewrite the pristine marker at each collected position.
        # The scan is greedy with a form of backtracking: every window is compared to each marker with the Hamming distance. A window whose
        # distance is non-zero but within the threshold is recorded as a corrupted marker. If a following window close to the last recorded
        # one (within one marker length) has a smaller distance, it replaces the recorded position (better alignment on the same marker);
        # and if an exact marker is found that close, the recorded detection is dropped as a false positive.
        # This may not work on markers that are too much tampered, and if the detection threshold is too low or the markers are too small,
        # there may be lots of false positives. So use long markers (preferably an alternating pattern different than the null byte \x00)
        # and a high enough detection threshold.
        ptee.write("Using heuristics (Hamming distance) to fix markers with a threshold of %i%%, please wait..." % (round(distance_threshold*100, 0)) )

        # Main loop for heuristical repair, try to find the substrings that minimize the hamming distance to one of the ecc markers
        markers_repaired = [0] * len(markers) # stat counter
        already_valid = 0 # stat counter
        db.seek(0) # seek to the beginning of the database file
        buf = 1 # init the buffer to 1 to initiate the while loop
        markers_pos = [[] for i in xrange(len(markers))] # per marker type, list of [position, distance] where a corrupted marker has been detected (valid markers are not stored)
        distance_thresholds = [round(len(x)*distance_threshold, 0) for x in markers] # maximum number of differing characters per marker type
        # Absolute file offset before which scanning is skipped. It is set right after a valid (non corrupted) marker, to avoid detecting
        # shifted partial views of that same marker. It is kept as an absolute offset (not an index into buf) because it must stay
        # meaningful when the scan moves on to the next buffer.
        skip_until = -1
        bardisp = tqdm.tqdm(total=ecc_size, file=ptee, leave=True, desc='DBREAD', unit='B', unit_scale=True) # progress bar based on the bytes read from the ecc file
        while buf: # until we have walked through the whole ecc file
            # Read a part of the ecc file into a buffer, this allows to process more quickly than just loading the size of a marker
            curpos = db.tell() # absolute position of the start of this buffer
            buf = db.read(blocksize)
            # Update progress bar
            bardisp.update(db.tell()-bardisp.n)
            if not buf: break # reached EOF? quitting here

            # Scan the buffer, by splitting the buffer into substrings the length of the ecc markers
            for i in xrange(len(buf)-max_marker_len):
                mcurpos = curpos+i # absolute position of the current window
                # If we just came accross a non corrupted ecc marker, we skip until we are after this ecc marker (to avoid misdetections)
                if mcurpos < skip_until: continue
                # Compare each ecc marker type to this substring and compute the Hamming distance
                for m in xrange(len(markers)):
                    d = hamming(buf[i:i+len(markers[m])], markers[m]) # Compute the Hamming distance (simply the number of different characters)

                    # If there's no difference, then it's a valid, non-corrupted ecc marker
                    if d == 0:
                        already_valid += 1 # stats...
                        # If a corrupted marker of the same type was recorded at most one marker length before here, it was a partial
                        # view of this valid marker (a misdetection), so we remove it from our list of markers to repair
                        if len(markers_pos[m]) > 0 and (mcurpos - markers_pos[m][-1][0]) <= len(markers[m]):
                            del markers_pos[m][-1]
                        # Skip scanning until we are after the current marker to avoid misdetections
                        su = mcurpos+len(markers[m])
                        if su > skip_until: skip_until = su # update with the biggest marker (because both markers can be detected here if the pattern is similar)
                        break
                    # Else there's a difference/distance but it's below the threshold: we have a corrupted marker!
                    elif d > 0 and d <= distance_thresholds[m]:
                        # Updating case: the latest marker to repair is quite close to the current one
                        if len(markers_pos[m]) > 0 and (mcurpos - markers_pos[m][-1][0]) <= len(markers[m]):
                            if d < markers_pos[m][-1][1]: # lower distance: same marker but better positioned, update the recorded position
                                markers_pos[m][-1] = [mcurpos, d]
                            else: # same marker seen through a window that moved too far: do not record it (else we would overwrite real content)
                                continue
                        # Adding case: Else we just add this marker as a new one to repair by appending to the list
                        else:
                            markers_pos[m].append([mcurpos, d])
                    # Else the distance is too great for the threshold, it's not a marker at all, we go on to the next substring
            # Rewind by one marker length so that a marker straddling two buffers is not missed
            if db.tell() < ecc_size: db.seek(db.tell()-max_marker_len)
        if bardisp.n > bardisp.total: bardisp.n = bardisp.total # just a workaround in case there's one byte more than the predicted total
        bardisp.close()

        # Committing the repair into the ecc file
        for m in xrange(len(markers)): # for each type of markers
            marker = markers[m]
            for pos in markers_pos[m]: # for each detected marker to repair, we rewrite it over into the file at the detected position
                if verbose: ptee.write("- Detected marker type %i at position %i with distance %i (%i%%): repairing." % (m+1, pos[0], pos[1], (float(pos[1])/len(markers[m]))*100) )
                db.seek(pos[0])
                db.write(marker)

        #print(markers_pos)
        ptee.write("Done. Hamming heuristic with threshold %i%% repaired %i entrymarkers and %i field_delim (%i total) and %i were already valid.\n" % (round(distance_threshold*100, 0), len(markers_pos[0]), len(markers_pos[1]), len(markers_pos[0])+len(markers_pos[1]), already_valid) )
        del ptee
        return 0
Example #2
0
def main(argv=None):
    """Benchmark the speed of ECC encoding (and optionally decoding).

    All settings are hard-coded at the top of the function. Random messages
    are produced by ``gen_random_string`` and only the time spent inside
    ``ecc_manager.encode`` / ``ecc_manager.decode`` (plus the post-decode
    check) is accumulated. The decoding pass only runs when ``no_decoding``
    is set to False.

    Args:
        argv: Unused; kept so the signature matches the other entry points.

    Returns:
        0 on completion.
    """
    # Setup configuration variables. Change here if you want to.
    max_block_size = 255
    resilience_rate = 0.2
    ecc_algo = 3
    msg_nb = 1000000
    tamper_rate = 0.4 # tamper rate is relative to the number of ecc bytes, not the whole message (not like the resilience_rate)
    tamper_mode = 'noise' # noise or erasure
    no_decoding = True

    # Precompute some parameters and load up ecc manager objects (big optimization as g_exp and g_log tables calculation is done only once)
    hasher_none = Hasher('none') # for index ecc we don't use any hash
    ecc_params = compute_ecc_params(max_block_size, resilience_rate, hasher_none)
    ecc_manager = ECCMan(max_block_size, ecc_params["message_size"], algo=ecc_algo)

    # == Main loop
    print("====================================")
    print("ECC Speed Test, started on %s" % datetime.datetime.now().isoformat())
    print("====================================")
    print("ECC algorithm: %i." % ecc_algo)

    # NOTE(review): time.clock() is used for timing below; it was deprecated in Python 3.3 and removed in 3.8, so a Python 3 port needs another timer.

    # -- Encoding test
    # IMPORTANT: we do NOT check the correctness of encoding, only the speed! It's up to you to verify that you are computing the ecc correctly.
    total_time = 0
    total_size = msg_nb*max_block_size
    bardisp = tqdm.tqdm(total=total_size, leave=True, desc='ENC', unit='B', unit_scale=True, ncols=79, mininterval=0.5) # display progress bar based on the number of bytes encoded
    k = ecc_params["message_size"]
    # Generate a random string and encode it
    for msg in gen_random_string(msg_nb, k):
        start = time.clock()
        ecc_manager.encode(msg)
        total_time += time.clock() - start
        bardisp.update(max_block_size)
    bardisp.close()
    print("Encoding: total time elapsed: %f sec for %s of data. Real Speed (only encoding, no other computation): %s." % (total_time, format_sizeof(total_size, 'B'), format_sizeof(total_size/total_time, 'B/sec') ))
    
    # -- Decoding test
    if not no_decoding:
        total_time = 0
        total_size = msg_nb*max_block_size
        bardisp = tqdm.tqdm(total=total_size, leave=True, desc='DEC', unit='B', unit_scale=True) # display progress bar based on the number of bytes decoded
        # Generate a random string, encode it, tamper it, then time its decoding
        for msg in gen_random_string(msg_nb, k):
            # Computing the ecc first (not timed)
            ecc = ecc_manager.encode(msg)

            # Then tamper it randomly
            # First generate a list of random indices where we will tamper
            tamper_idx = random.sample(xrange(k), int(math.floor(ecc_params["ecc_size"] * tamper_rate)))
            # Convert to bytearray to easily modify characters in the message
            msg_tampered = bytearray(msg)
            # Tamper the characters
            for pos in tamper_idx:
                if tamper_mode == 'n' or tamper_mode == 'noise': # Noising the character (set a random byte value in 0..255)
                    msg_tampered[pos] = random.randint(0,255)
                elif tamper_mode == 'e' or tamper_mode == 'erasure': # Erase the character (set a null byte)
                    msg_tampered[pos] = 0
            # Convert back to a string
            msg_tampered = str(msg_tampered)
            ecc = str(ecc)

            # Decode the tampered message with ecc
            start = time.clock()
            try:
                msg_repaired, ecc_repaired = ecc_manager.decode(msg_tampered, ecc)
                # Check if the decoding was successful, else there's a problem, the decoding may be buggy
                if not ecc_manager.check(msg_repaired, ecc_repaired): raise ReedSolomonError
            except ReedSolomonError:
                # A failed decode is only reported: the benchmark goes on with the next message
                print("Warning, there was an error while decoding. Please check your parameters (tamper_rate not too high) or the decoding procedure.")
            total_time += time.clock() - start
            bardisp.update(max_block_size)
        bardisp.close()
        print("Decoding: total time elapsed: %f sec for %s of data. Real Speed (only decoding, no other computation): %s." % (total_time, format_sizeof(total_size, 'B'), format_sizeof(total_size/total_time, 'B/sec') ))

    return 0
def main(argv=None):
    """Benchmark the speed of ECC encoding (and optionally decoding).

    All settings are hard-coded at the top of the function. Random messages
    are produced by ``gen_random_string`` and only the time spent inside
    ``ecc_manager.encode`` / ``ecc_manager.decode`` (plus the post-decode
    check) is accumulated. The decoding pass only runs when ``no_decoding``
    is set to False.

    Args:
        argv: Unused; kept so the signature matches the other entry points.

    Returns:
        0 on completion.
    """
    # Setup configuration variables. Change here if you want to.
    max_block_size = 255
    resilience_rate = 0.2
    ecc_algo = 3
    msg_nb = 1000000
    tamper_rate = 0.4  # tamper rate is relative to the number of ecc bytes, not the whole message (not like the resilience_rate)
    tamper_mode = 'noise'  # noise or erasure
    no_decoding = True

    # Precompute some parameters and load up ecc manager objects (big optimization as g_exp and g_log tables calculation is done only once)
    hasher_none = Hasher('none')  # for index ecc we don't use any hash
    ecc_params = compute_ecc_params(max_block_size, resilience_rate,
                                    hasher_none)
    ecc_manager = ECCMan(max_block_size,
                         ecc_params["message_size"],
                         algo=ecc_algo)

    # == Main loop
    print("====================================")
    print("ECC Speed Test, started on %s" %
          datetime.datetime.now().isoformat())
    print("====================================")
    print("ECC algorithm: %i." % ecc_algo)

    # NOTE(review): time.clock() is used for timing below; it was deprecated in Python 3.3 and removed in 3.8, so a Python 3 port needs another timer.

    # -- Encoding test
    # IMPORTANT: we do NOT check the correctness of encoding, only the speed! It's up to you to verify that you are computing the ecc correctly.
    total_time = 0
    total_size = msg_nb * max_block_size
    bardisp = tqdm.tqdm(
        total=total_size,
        leave=True,
        desc='ENC',
        unit='B',
        unit_scale=True,
        ncols=79,
        mininterval=0.5
    )  # display progress bar based on the number of bytes encoded
    k = ecc_params["message_size"]
    # Generate a random string and encode it
    for msg in gen_random_string(msg_nb, k):
        start = time.clock()
        ecc_manager.encode(msg)
        total_time += time.clock() - start
        bardisp.update(max_block_size)
    bardisp.close()
    print(
        "Encoding: total time elapsed: %f sec for %s of data. Real Speed (only encoding, no other computation): %s."
        % (total_time, format_sizeof(
            total_size, 'B'), format_sizeof(total_size / total_time, 'B/sec')))

    # -- Decoding test
    if not no_decoding:
        total_time = 0
        total_size = msg_nb * max_block_size
        bardisp = tqdm.tqdm(
            total=total_size,
            leave=True,
            desc='DEC',
            unit='B',
            unit_scale=True
        )  # display progress bar based on the number of bytes decoded
        # Generate a random string, encode it, tamper it, then time its decoding
        for msg in gen_random_string(msg_nb, k):
            # Computing the ecc first (not timed)
            ecc = ecc_manager.encode(msg)

            # Then tamper it randomly
            # First generate a list of random indices where we will tamper
            tamper_idx = random.sample(
                xrange(k),
                int(math.floor(ecc_params["ecc_size"] * tamper_rate)))
            # Convert to bytearray to easily modify characters in the message
            msg_tampered = bytearray(msg)
            # Tamper the characters
            for pos in tamper_idx:
                if tamper_mode == 'n' or tamper_mode == 'noise':  # Noising the character (set a random byte value in 0..255)
                    msg_tampered[pos] = random.randint(0, 255)
                elif tamper_mode == 'e' or tamper_mode == 'erasure':  # Erase the character (set a null byte)
                    msg_tampered[pos] = 0
            # Convert back to a string
            msg_tampered = str(msg_tampered)
            ecc = str(ecc)

            # Decode the tampered message with ecc
            start = time.clock()
            try:
                msg_repaired, ecc_repaired = ecc_manager.decode(
                    msg_tampered, ecc)
                # Check if the decoding was successful, else there's a problem, the decoding may be buggy
                if not ecc_manager.check(msg_repaired, ecc_repaired):
                    raise ReedSolomonError
            except ReedSolomonError:
                # A failed decode is only reported: the benchmark goes on with the next message
                print(
                    "Warning, there was an error while decoding. Please check your parameters (tamper_rate not too high) or the decoding procedure."
                )
            total_time += time.clock() - start
            bardisp.update(max_block_size)
        bardisp.close()
        print(
            "Decoding: total time elapsed: %f sec for %s of data. Real Speed (only decoding, no other computation): %s."
            % (total_time, format_sizeof(total_size, 'B'),
               format_sizeof(total_size / total_time, 'B/sec')))

    return 0
Example #4
0
def main(argv=None):
    """Repair the structural markers of an ecc file.

    The input ecc file is copied to the output path, then the copy is patched
    in place in two passes:

    1. Index backup repair (only if ``--index`` is given): each block of the
       index file gives the type and position of one marker, protected by its
       own ecc; the marker is rewritten at that position if it differs.
    2. Heuristic repair: the file is scanned for substrings whose Hamming
       distance to a marker is within ``--threshold``, and those substrings
       are overwritten by the pristine marker.

    Args:
        argv: ``None`` to read ``sys.argv[1:]``, a single command-line string
            (split with ``shlex``), or a sequence of argument strings.

    Returns:
        0 on completion.

    Raises:
        NameError: the input file does not exist, the output file exists and
            ``--force`` was not given, or the index file does not exist.
        ValueError: the (hard-coded) Reed-Solomon block size is out of range.
        SystemExit: raised by argparse on invalid arguments or ``-h``.
    """
    if argv is None:  # nothing supplied: use the process command line
        argv = sys.argv[1:]
    elif isinstance(argv, basestring):  # a plain string: split it like a shell would
        argv = shlex.split(argv)

    #==== COMMANDLINE PARSER ====

    #== Commandline description
    desc = '''ECC file repairer
Description: Repair the structure of an ecc file, mainly the ecc markers, so that at least the ecc correction can align correctly the ecc entries and fields.
Note: An ecc structure repair does NOT allow to recover from more errors on your files, it only allows to repair an ecc file so that its structure is valid and can be read correctly.
    '''
    ep = ''' '''

    #== Commandline arguments
    #-- Constructing the parser
    # GooeyParser is used only when the GUI is explicitly requested, because it provides richer widgets
    use_gui = (len(argv) > 0 and argv[0] == '--gui'
               and '--ignore-gooey' not in argv)
    if use_gui:  # pragma: no cover
        main_parser = gooey.GooeyParser(
            add_help=True,
            description=desc,
            epilog=ep,
            formatter_class=argparse.RawTextHelpFormatter)
        # Gooey widget types are given explicitly instead of relying on auto-detection
        widget_filesave = {"widget": "FileSaver"}
        widget_file = {"widget": "FileChooser"}
        widget_text = {"widget": "TextField"}
    else:
        # Drop the flag Gooey injects when the user clicks Start, wherever it
        # appears, so argparse does not reject it. A new list is built so that
        # an empty argv is safe and the caller's list is never mutated.
        argv = [arg for arg in argv if arg != '--ignore-gooey']
        main_parser = argparse.ArgumentParser(
            add_help=True,
            description=desc,
            epilog=ep,
            formatter_class=argparse.RawTextHelpFormatter)
        # Empty widget dicts keep the add_argument() calls identical in both modes
        widget_filesave = {}
        widget_file = {}
        widget_text = {}

    # Required arguments
    main_parser.add_argument(
        '-i',
        '--input',
        metavar='eccfile.txt',
        type=str,
        required=True,
        help='Path to the ecc file to repair.',
        **widget_file)
    main_parser.add_argument(
        '-o',
        '--output',
        metavar='eccfile_repaired.txt',
        type=str,
        required=True,
        help='Output path where to save the repaired ecc file.',
        **widget_filesave)
    # NOTE: the percent sign is doubled because argparse %-formats help strings;
    # a lone '%' makes -h fail instead of printing the help.
    main_parser.add_argument(
        '-t',
        '--threshold',
        type=float,
        default=0.3,
        required=False,
        help='Distance threshold for the heuristic hamming distance repair. This must be a float, eg, 0.2 means that if there are 20%% characters different between an ecc marker and a substring in the ecc file, it will be detected as a marker and corrected.',
        **widget_text)

    # Optional general arguments
    main_parser.add_argument(
        '--index',
        metavar='eccfile.txt.idx',
        type=str,
        required=False,
        help='Path to the index backup file corresponding to the ecc file (optional but helps a lot).',
        **widget_file)
    main_parser.add_argument(
        '--ecc_algo',
        type=int,
        default=1,
        required=False,
        help='What algorithm use to generate and verify the ECC? Values possible: 1-4. 1 is the formal, fully verified Reed-Solomon in base 3 ; 2 is a faster implementation but still based on the formal base 3 ; 3 is an even faster implementation but based on another library which may not be correct ; 4 is the fastest implementation supporting US FAA ADSB UAT RS FEC standard but is totally incompatible with the other three (a text encoded with any of 1-3 modes will be decodable with any one of them).',
        **widget_text)
    main_parser.add_argument(
        '-l',
        '--log',
        metavar='/some/folder/filename.log',
        type=str,
        required=False,
        help='Path to the log file. (Output will be piped to both the stdout and the log file)',
        **widget_filesave)
    main_parser.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        required=False,
        default=False,
        help='Verbose mode (show more output).')
    main_parser.add_argument(
        '--silent',
        action='store_true',
        required=False,
        default=False,
        help='No console output (but if --log specified, the log will still be saved in the specified file).')
    main_parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        required=False,
        default=False,
        help='Force overwriting the ecc file even if it already exists (if --generate).')

    #== Parsing the arguments
    args = main_parser.parse_args(argv)

    #-- Set hard-coded variables
    # Marker signalling the start of an ecc entry. An alternating pattern is
    # used on purpose: a repeated single character (eg "AAA") could merge with
    # the tail of the previous ecc block and be detected one character early.
    entrymarker = "\xFE\xFF\xFE\xFF\xFE\xFF\xFE\xFF\xFE\xFF"
    # Delimiter between the fields (filepath, filesize, hash+ecc blocks) of an entry
    field_delim = "\xFA\xFF\xFA\xFF\xFA"
    markers = [entrymarker, field_delim]  # indexed by (marker type - 1)
    max_marker_len = max(len(entrymarker), len(field_delim))
    max_block_size = 27
    resilience_rate = 1

    #-- Set variables from arguments
    inputpath = fullpath(args.input)
    outputpath = fullpath(args.output)
    distance_threshold = args.threshold
    indexpath = fullpath(args.index) if args.index else None
    force = args.force
    ecc_algo = args.ecc_algo
    verbose = args.verbose
    silent = args.silent

    # -- Checking arguments
    if not os.path.isfile(inputpath):
        raise NameError('Specified database ecc file %s does not exist!' %
                        inputpath)
    if os.path.isfile(outputpath) and not force:
        raise NameError(
            'Specified output path for the repaired ecc file %s already exists! Use --force if you want to overwrite.'
            % outputpath)
    if indexpath and not os.path.isfile(indexpath):
        raise NameError('Specified index backup file %s does not exist!' %
                        indexpath)

    if max_block_size < 2 or max_block_size > 255:
        raise ValueError('RS max block size must be between 2 and 255.')

    # -- Configure the output sink. With --log, messages go to the log file as
    # well, and stderr is redirected through a second Tee on the same file.
    if args.log:
        ptee = Tee(args.log, 'a', nostdout=silent)
        sys.stderr = Tee(args.log, 'a', nostdout=silent)
    else:
        ptee = Tee(nostdout=silent)

    # == PROCESSING BRANCHING == #

    # The ecc manager for index blocks is built once up front; the index ecc uses no hash
    hasher_none = Hasher('none')
    ecc_params_idx = compute_ecc_params(max_block_size, resilience_rate,
                                        hasher_none)
    idx_message_size = ecc_params_idx["message_size"]
    ecc_manager_idx = ECCMan(max_block_size, idx_message_size, algo=ecc_algo)

    # == Main loop
    ptee.write("====================================")
    ptee.write("ECC repair, started on %s" %
               datetime.datetime.now().isoformat())
    ptee.write("====================================")
    ptee.write(
        "Please note that this tool may not know if it found all the markers, so it may miss too much corrupted markers but it will repair the ones it finds (except if you have a fully valid index file, then you are guaranteed to always find all markers)."
    )

    ecc_size = os.stat(inputpath).st_size
    if indexpath:
        idx_size = os.stat(indexpath).st_size
    # All repairs are applied to a copy; the input file is never modified
    shutil.copy2(inputpath, outputpath)
    blocksize = 65535
    with open(outputpath, 'r+b') as db:

        # == Index backup repair
        # The index backup file stores the position of every marker of the ecc
        # file, each protected by its own ecc, in fixed-size blocks of
        # max_block_size bytes: one byte of marker type, the marker position as
        # a big-endian unsigned 64-bit integer, then the ecc of those 9 bytes.
        if indexpath:
            ptee.write(
                "Using the index backup file %s to repair ecc markers, please wait..."
                % args.index)
            db.seek(0)
            idx_corrupted = 0
            idx_corrected = 0
            idx_total = 0
            markers_repaired = [0] * len(markers)
            # Progress is measured in bytes of index file read, since the number of entries is not known beforehand
            bardisp = tqdm.tqdm(
                total=idx_size,
                file=ptee,
                leave=True,
                desc='IDXREAD',
                unit='B',
                unit_scale=True)
            with open(indexpath, 'rb') as dbidx:
                while True:
                    curpos = dbidx.tell()  # kept for error messages
                    buf = dbidx.read(max_block_size)
                    bardisp.update(dbidx.tell() - bardisp.n)
                    if not buf:  # EOF
                        break

                    idx_total += 1
                    # Split the block into the marker's infos and their ecc
                    marker_str = buf[:idx_message_size]
                    ecc = buf[idx_message_size:]
                    if not ecc_manager_idx.check(marker_str, ecc):
                        # Corrupted block: attempt an ecc repair and re-check the result
                        idx_corrupted += 1
                        marker_repaired, repaired_ecc = ecc_manager_idx.decode(
                            marker_str, ecc)
                        if not ecc_manager_idx.check(marker_repaired,
                                                     repaired_ecc):
                            ptee.write(
                                "\n- Index backup file: error on block starting at %i, corrupted and could not fix it. Skipping."
                                % curpos)
                            continue
                        marker_str = marker_repaired
                        idx_corrected += 1
                    if not marker_str:
                        continue

                    # Decode the (valid or repaired) marker infos
                    marker_type = int(marker_str[0])
                    marker_pos = struct.unpack('>Q', marker_str[1:])[0]
                    expected_marker = markers[marker_type - 1]

                    # Read what is currently stored at the marker's position
                    db.seek(marker_pos)
                    current_marker = db.read(len(expected_marker))
                    db.seek(marker_pos)
                    if verbose:
                        # Routed through ptee so that --silent and --log are honoured
                        ptee.write(
                            "- Found marker by index file: type=%i content=" %
                            (marker_type))
                        ptee.write(db.read(len(expected_marker) + 4))
                        db.seek(marker_pos)  # back in front of the marker
                    if current_marker != expected_marker:
                        # Overwrite the damaged marker with the pristine one
                        db.write(expected_marker)
                        markers_repaired[marker_type - 1] += 1
                    elif verbose:
                        ptee.write("skipped, no need to repair")
            # Done the index backup repair
            if bardisp.n > bardisp.total:
                bardisp.n = bardisp.total  # workaround in case the counter overshoots the predicted total
            bardisp.close()
            ptee.write(
                "Done. Total: %i/%i markers repaired (%i entrymarkers and %i field_delim), %i indexes corrupted and %i indexes repaired (%i indexes lost).\n"
                % (markers_repaired[0] + markers_repaired[1], idx_total,
                   markers_repaired[0], markers_repaired[1], idx_corrupted,
                   idx_corrected, idx_corrupted - idx_corrected))

        # == Heuristical Greedy Hamming distance repair
        # Two passes, needing only the ecc file itself. The first pass slides
        # over the file and compares each substring with every marker: a
        # distance within the threshold flags a corrupted marker. Because a
        # partially overlapping window also yields a small distance, the latest
        # candidate is replaced when a nearby window has a smaller distance and
        # dropped when a nearby window is an exact match. The second pass
        # rewrites the pristine marker over every remaining candidate.
        # Heavily damaged markers may be missed, and a threshold that is too
        # permissive relative to the marker length may cause false positives.
        ptee.write(
            "Using heuristics (Hamming distance) to fix markers with a threshold of %i%%, please wait..."
            % (round(distance_threshold * 100, 0)))

        already_valid = 0  # stat counter
        db.seek(0)
        # Per marker type, the list of [absolute position, distance] of corrupted markers to rewrite
        markers_pos = [[] for _ in xrange(len(markers))]
        # Per marker type, the maximum number of differing characters tolerated
        distance_thresholds = [
            round(len(x) * distance_threshold, 0) for x in markers
        ]
        # Absolute file offset before which scanning is suppressed, set after
        # an exact marker match. It must be absolute: a buffer-relative index
        # would carry over and hide the beginning of the following buffers.
        skip_until = -1
        bardisp = tqdm.tqdm(
            total=ecc_size,
            file=ptee,
            leave=True,
            desc='DBREAD',
            unit='B',
            unit_scale=True)
        while True:
            curpos = db.tell()  # absolute offset of the start of this buffer
            buf = db.read(blocksize)
            bardisp.update(db.tell() - bardisp.n)
            if not buf:  # EOF
                break

            # Slide a marker-sized window over the buffer
            for i in xrange(len(buf) - max_marker_len):
                mcurpos = curpos + i  # absolute position of the window
                if mcurpos < skip_until:
                    continue  # still inside a marker already found valid
                for m in xrange(len(markers)):
                    marker = markers[m]
                    candidates = markers_pos[m]
                    d = hamming(buf[i:i + len(marker)], marker)
                    # True when the latest candidate of this type lies within one marker length behind this window
                    near_last = len(candidates) > 0 and (
                        mcurpos - candidates[-1][0]) <= len(marker)

                    if d == 0:
                        # Exact match: the marker is intact
                        already_valid += 1
                        if near_last:
                            # The nearby candidate was a partial view of this valid marker
                            del candidates[-1]
                        skip_until = max(skip_until, mcurpos + len(marker))
                        break
                    elif d > 0 and d <= distance_thresholds[m]:
                        if not near_last:
                            # New corrupted marker
                            candidates.append([mcurpos, d])
                        elif d < candidates[-1][1]:
                            # Same marker seen from a better aligned window
                            candidates[-1] = [mcurpos, d]
                        # Otherwise the window drifted past the marker: keep the previous, better position
                    # Else the distance exceeds the threshold: not a marker
            # The last max_marker_len positions were not scanned; step back so the next buffer covers them
            if db.tell() < ecc_size:
                db.seek(db.tell() - max_marker_len)
        if bardisp.n > bardisp.total:
            bardisp.n = bardisp.total  # workaround in case the counter overshoots the predicted total
        bardisp.close()

        # Committing the repair into the ecc file
        for m, marker in enumerate(markers):
            for pos in markers_pos[m]:
                if verbose:
                    ptee.write(
                        "- Detected marker type %i at position %i with distance %i (%i%%): repairing."
                        % (m + 1, pos[0], pos[1],
                           (float(pos[1]) / len(marker)) * 100))
                db.seek(pos[0])
                db.write(marker)

        ptee.write(
            "Done. Hamming heuristic with threshold %i%% repaired %i entrymarkers and %i field_delim (%i total) and %i were already valid.\n"
            % (round(distance_threshold * 100,
                     0), len(markers_pos[0]), len(markers_pos[1]),
               len(markers_pos[0]) + len(markers_pos[1]), already_valid))
        del ptee
        return 0