def get_offsets_from_texts(file_names, test_files, k):
    """Pure-Python reference implementation.
        Return a list of {substring: offsets} dicts, one entry per name in
        <file_names>, for the length <k> substrings that satisfy the repeat
        counts in <test_files>, or None if no such substring exists.
    """
    common.note_time('get_offsets_from_texts k=%d' % k)
    
    allowed_substrings = None
    for name in file_names:
        x = test_files[name]
        substrings = get_substrings(x['text'], k, allowed_substrings)
        substrings = filter_repeats(substrings, x['repeats'])
        substrings = filter_junk_strings(substrings)
        if not substrings:
            print 'No %d character string works!' % k
            return None 
        allowed_substrings = substrings.keys() 
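    # Each file after the first only re-tests the substrings that survived all
    # previous files, so allowed_substrings shrinks monotonically; this is why
    # callers benefit from passing shorter files first.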

    # Keep only the substrings that survived the filtering for every file
    for key in substrings.keys():
        if key not in allowed_substrings:
            del substrings[key]
    #common.report('k=%d:\nsubstrings=%d:%s' % (k, len(allowed_substrings), sorted(allowed_substrings)))
    common.note_time('got substrings')

    # From now on work with offsets  
    # offsets_dict[<filename>][<substring>] = list of offsets of <substring> in file with name <filename>
    offsets_dict = {}
    for name in file_names:
        x = test_files[name]
        offsets_dict[name] = {}
        for key in substrings.keys():
            offsets_dict[name][key] = common.get_substring_offsets(x['text'], key) 

    return [offsets_dict[name] for name in file_names] 
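

# For reference: common.get_substring_offsets(text, key) is assumed to return
# the set of offsets of all (possibly overlapping) occurrences of <key> in
# <text>. A minimal pure-Python sketch of that assumed behavior (illustrative
# only, not the actual common module code):
def _example_get_substring_offsets(text, key):
    """Return the offsets of every occurrence of <key> in <text>,
        e.g. ('ababa', 'aba') => set([0, 2])
    """
    offsets = set()
    ofs = text.find(key)
    while ofs >= 0:
        offsets.add(ofs)
        ofs = text.find(key, ofs + 1)
    return offsets
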
def find_repeated_substrings(test_files):
    """Return the longest substring(s) s that is repeated in <test_files>
        according to rule:
            For each x in test_files:
                s occurs at least x['repeats'] times in x['text']
        test_files[name] = {'text':text, 'repeats':repeats}
    """ 
    common.note_time('start searching strings')
    common.report('find_repeated_substrings(%d,%d,%d)' % (len(test_files), _MIN_K, _MAX_K))
    if not test_files:
        print 'no test files'
        return

    # Find the substrings repeated >= x['repeats'] times in each file x
    # Test shorter files first: they produce fewer candidate substrings, which
    # narrows allowed_substrings before the longer files are scanned
    file_names = sorted(test_files.keys(), key=lambda name: len(test_files[name]['text']))

    common.report('file_names:\n%s' % '\n'.join(['%8d:%3d: %s' %
            (len(test_files[name]['text']), test_files[name]['repeats'], name) for name in file_names]))

    # Start by finding all substrings of length _MIN_K which is typically 4
    k = _MIN_K
    
    use_rolling_hash = True  # set to False for the pure-Python reference version
    if not use_rolling_hash:
        print 'Pure Python'
        pattern_offsets_list = get_offsets_from_texts(file_names, test_files, k)
    else:
        print 'Cython rolling hash'
        text_list, repeats_list = test_files_to_text_repeats(file_names, test_files)
        # Get rid of expensive references to big strings
        for name in file_names:
            test_files[name]['text'] = None
        
        pattern_offsets_list = rolling_hash.get_offsets_from_texts(text_list, repeats_list, k, 
            _JUNK_KEY_THRESHOLD)

        text_repeats_to_test_files(file_names, test_files, text_list, repeats_list)    
        text_list = None
        repeats_list = None
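        # rolling_hash is assumed to slide a k-character window over each
        # text, updating a Rabin-Karp style hash in O(1) per step instead of
        # rehashing all k characters. The window update, for some base B
        # (illustrative sketch, not the Cython module's actual code):
        #     h(s[i+1:i+1+k]) = (h(s[i:i+k]) - ord(s[i]) * B**(k-1)) * B + ord(s[i+k])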
        
    if False:
        # Fallback: halve k until some pattern is found.
        # Does not work. TODO: find out why. Note that text_list has already
        # been cleared above, so this block would have to run before that cleanup.
        while k >= _MIN_K:
            pattern_offsets_list = rolling_hash.get_offsets_from_texts(text_list, repeats_list, k,
                _JUNK_KEY_THRESHOLD)
            if pattern_offsets_list[0]:
                break
            print 'reducing k %d=>%d' % (k, k // 2)
            k = k // 2

    common.note_time('got substrings')

    offsets_dict = dict(zip(file_names, pattern_offsets_list))

    # Work in increasing length of substrings, +1 per round    
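    # e.g. if 'abcd' survives round k=4, round k=5 only needs to consider one
    # character extensions of its occurrences such as 'abcde' (illustrative;
    # the extensions are computed by get_child_offsets below)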
    
    offsets_dict_dict = {}
    k_list = []
    for k in range(_MIN_K, _MAX_K):
    
        # Sparsify each text so later rounds only scan near the surviving offsets
        for name in file_names:
            test_files[name]['text'] = common.sparsify_by_offsets(test_files[name]['text'], 
                offsets_dict[name], k)
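        # sparsify_by_offsets is assumed to blank out the regions of text that
        # are not within k (plus the one character needed to grow a substring)
        # of a surviving offset, so each round scans much less text. E.g. with
        # offsets set([10, 50]) and k=4, only the bytes around 10..14 and
        # 50..54 are kept (illustrative values).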

        offsets_dict_dict[k] = offsets_dict
        k_list.append(k)
        
        common.note_time('found %3d substrings of length >= %3d' % (len(offsets_dict[file_names[0]]), k)) 
        child_offsets_dict = get_child_offsets(file_names, test_files, offsets_dict, k)
        if not child_offsets_dict:
            break
        offsets_dict = child_offsets_dict 
    
    # The offsets dict may have too many repeats 
    # Walk back through the offsets list to find the first one without excess repeats
    k_list.reverse()
    
    print '$' * 60
    print 'k_list', k_list
    
    for k in k_list:
        print '-' * 60
        offsets_dict = offsets_dict_dict[k]
        for key in sorted(offsets_dict[file_names[0]].keys()):
            print '%s:' % H(key), 
            for name in file_names:
                if len(offsets_dict[name][key]) != test_files[name]['repeats']:
                    print '"%s":%d,%d' % (name, len(offsets_dict[name][key]), test_files[name]['repeats']),
            print
        # Remove substrings with excess repeats in any file. Collect the keys
        # first so a key with excess repeats in two files is not deleted twice
        excess_keys = set()
        for name in file_names:
            for key, ofs_set in offsets_dict[name].items():
                if len(ofs_set) > test_files[name]['repeats']:
                    excess_keys.add(key)
        for key in excess_keys:
            for n in file_names:
                del offsets_dict[n][key]
        #for name in file_names:                
        #    if not offsets_dict[name]:
        #        del(offsets_dict[name])
        
        print 'k=%d, offsets_dict=' % (k)
        for name in file_names:
            print ' ', name, ['"%s":%d'%(key, len(val)) for (key,val) in offsets_dict[name].items()]
            
        if all(offsets_dict.values()):
            break
   
            
    # Return the last dict of offsets that is non-empty for every file
    return offsets_dict
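

if __name__ == '__main__':
    # Minimal smoke test (illustrative). Assumes the rest of this module
    # (common, rolling_hash, _MIN_K, _MAX_K, get_child_offsets, ...) is
    # defined; the texts and repeat counts below are made-up values.
    _demo_files = {
        'a.txt': {'text': 'xx abcd yy abcd zz', 'repeats': 2},
        'b.txt': {'text': 'abcd .. abcd .. abcd', 'repeats': 3},
    }
    print find_repeated_substrings(_demo_files)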