def get_offsets_from_texts(file_names, test_files, k):
    """Find the k-character substrings that survive in every file and locate them.

    The files are processed in the order given by <file_names>. The substrings
    that survive filtering for one file are handed to get_substrings() as the
    allowed candidates for the next file (presumably restricting the search to
    them -- confirm against get_substrings()), so the result of the last file
    is the set that survived all files.

    Args:
        file_names: names of the files to process; each must be a key of
            <test_files>.
        test_files: mapping of file name to a record with at least the keys
            'text' and 'repeats'.
        k: length in characters of the substrings to look for.

    Returns:
        A list with one dict per entry of <file_names>, in the same order.
        Each dict maps a surviving substring to the value returned by
        common.get_substring_offsets() for that file's text.
        None if some file leaves no candidate substrings.
    """
    common.note_time('get_offsets_from_texts k=%d' % k)

    allowed_substrings = None
    for name in file_names:
        x = test_files[name]
        substrings = get_substrings(x['text'], k, allowed_substrings)
        substrings = filter_repeats(substrings, x['repeats'])
        substrings = filter_junk_strings(substrings)
        if not substrings:
            print('No %d character string works!' % k)
            return None
        allowed_substrings = substrings.keys()

    # No pruning pass is needed here: <substrings> is the result for the last
    # file, so its keys are exactly <allowed_substrings>.
    common.note_time('got substrings')

    # From now on work with offsets
    # offsets_dict[<filename>][<substring>] = offsets of <substring> in the
    # file with name <filename>
    offsets_dict = {}
    for name in file_names:
        text = test_files[name]['text']
        offsets_dict[name] = dict(
            (key, common.get_substring_offsets(text, key))
            for key in substrings.keys())

    return [offsets_dict[name] for name in file_names]
def compare_string_subsets(mask, subset_fraction, num_tests):
    """Find the repeated substrings in several random subsets of the files
        matched by <mask> and check the offsets reported for each subset.

        The filename is said to encode the number of repeats (see
        name_to_repeats()). The results from each subset should be the same.

        Args:
            mask: file mask passed to get_test_files().
            subset_fraction: fraction of the matched files to put in each
                subset. The resulting size is clamped to the range
                1..number of files.
            num_tests: number of random subsets to test.

        The random generator is seeded with a fixed value so that the same
        subsets are chosen on every run. Results are printed; nothing is
        returned.
    """
    # Read the files matched by <mask>
    test_files = get_test_files(mask)
    print('%d files in mask "%s"' % (len(test_files), mask))
    if not test_files:
        print('no test files')
        return

    # Smallest text first
    file_names = sorted(test_files.keys(), key=lambda x: len(test_files[x]['text']))

    for i, name in enumerate(file_names):
        x = test_files[name]
        print('%2d: size=%7d, repeats=%3d, name="%s"' % (i, len(x['text']), x['repeats'], name))
    print('=' * 60)

    # Clamp so that a small fraction cannot produce an empty subset
    subset_size = int(len(test_files) * subset_fraction)
    subset_size = max(1, min(subset_size, len(test_files)))

    print('Testing %d subsets of size %d from total of %d' % (num_tests, subset_size, len(test_files)))

    random.seed(111)

    # NOTE(review): the per-subset results are collected here but nothing
    # compares them yet.
    subset_substring_list_list = []
    for test in range(num_tests):
        shuffled_names = file_names[:]
        random.shuffle(shuffled_names)
        subset_names = shuffled_names[:subset_size]
        if not common.is_quiet():
            for i, name in enumerate(subset_names):
                x = test_files[name]
                print('%2d: size=%7d, repeats=%3d, name="%s"' % (i, len(x['text']), x['repeats'], name))
            print('-' * 60)

        # Analyse only the files in this subset, not the whole set
        subset_files = dict((name, test_files[name]) for name in subset_names)
        offsets_dict = FRS.find_repeated_substrings(subset_files)
        substring_list = offsets_dict[subset_names[0]].keys()
        subset_substring_list_list.append(substring_list)
        print('Found %d substrings' % len(substring_list))
        for i, substring in enumerate(substring_list):
            print('%2d: len=%3d, substring="%s"' % (i, len(substring), H(substring)))
            for name in subset_names:
                x = test_files[name]
                offsets = sorted(offsets_dict[name][substring])
                print('%40s: needed=%-2d,num=%-2d,offsets=%s' % (os.path.basename(name),
                    x['repeats'], len(offsets), offsets))
                # Cross-check the reported offsets against a direct search of the text
                offsets2 = sorted(common.get_substring_offsets(x['text'], substring))
                assert offsets2 == offsets
def find_and_show_substrings(mask):
    """Find the repeated substrings in the files matched by <mask> and print
        them together with their offsets in every file.

        The filename is said to encode the number of repeats (see
        name_to_repeats()).

        Args:
            mask: file mask passed to get_test_files().

        Results are printed; nothing is returned. The offsets reported by
        FRS.find_repeated_substrings() are cross-checked against
        common.get_substring_offsets() with an assert.
    """

    # Read the files matched by <mask>
    test_files = get_test_files(mask)
    print('%d files in mask "%s"' % (len(test_files), mask))
    if not test_files:
        print('no test files')
        return

    # Smallest text first
    file_names = sorted(test_files.keys(), key=lambda x: len(test_files[x]['text']))

    for i, name in enumerate(file_names):
        x = test_files[name]
        print('%2d: size=%7d, repeats=%3d, name="%s"' % (i, len(x['text']), x['repeats'], name))
    print('-' * 60)

    offsets_dict = FRS.find_repeated_substrings(test_files)

    # Print out the results

    substring_list = offsets_dict[file_names[0]].keys()

    print('Substrings summary:')
    print('Found %d substrings' % len(substring_list))
    for i, substring in enumerate(substring_list):
        print('%2d: len=%3d, substring="%s"' % (i, len(substring), H(substring)))

    print('=' * 80)
    print('Substrings in detail:')
    # Enumerate here so that the heading shows the same index as the summary
    for i, substring in enumerate(substring_list):
        print('Substring %2d' % i + ' ' + '-' * 60)
        print('len=%3d, substring="%s"' % (len(substring), substring))
        print('hex = ' + str(H(substring)))
        print('Offsets of substring in test files:')
        for name in file_names:
            x = test_files[name]
            offsets = sorted(offsets_dict[name][substring])
            print('%40s: needed=%-2d,num=%-2d,offsets=%s' % (os.path.basename(name),
                x['repeats'], len(offsets), offsets))
            # Cross-check the reported offsets against a direct search of the text
            offsets2 = sorted(common.get_substring_offsets(x['text'], substring))
            assert offsets2 == offsets