def get_offsets_from_texts(file_names, test_files, k):
    """Find all substrings of length <k> that satisfy the repeat counts in every file in
        <file_names>, then return a list of {substring: offsets of substring} dicts,
        one per file, in the order of <file_names>.
    """
    common.note_time('get_offsets_from_texts k=%d' % k)
    allowed_substrings = None
    for name in file_names:
        x = test_files[name]
        substrings = get_substrings(x['text'], k, allowed_substrings)
        substrings = filter_repeats(substrings, x['repeats'])
        substrings = filter_junk_strings(substrings)
        if not substrings:
            print 'No %d character string works!' % k
            return None
        allowed_substrings = substrings.keys()

    # Remove all the substrings that are no longer used
    for key in substrings.keys():
        if key not in allowed_substrings:
            del substrings[key]

    #common.report('k=%d:\nsubstrings=%d:%s' % (k, len(allowed_substrings), sorted(allowed_substrings)))
    common.note_time('got substrings')

    # From now on work with offsets
    # offsets_dict[<filename>][<substring>] = list of offsets of <substring> in file with name <filename>
    offsets_dict = {}
    for name in file_names:
        x = test_files[name]
        offsets_dict[name] = {}
        for key in substrings.keys():
            offsets_dict[name][key] = common.get_substring_offsets(x['text'], key)

    return [offsets_dict[name] for name in file_names]
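# A minimal sketch, for reference only, of the kind of helper get_offsets_from_texts()
# assumes: common.get_substring_offsets(text, pattern) is taken here to return the set
# of offsets at which <pattern> occurs in <text>. The real common module is not shown
# in this file, so the function below is an illustrative stand-in, not its implementation.
def _get_substring_offsets_sketch(text, pattern):
    """Return a set of all (possibly overlapping) offsets of <pattern> in <text>."""
    offsets = set()
    ofs = text.find(pattern)
    while ofs >= 0:
        offsets.add(ofs)
        ofs = text.find(pattern, ofs + 1)
    return offsets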
def find_repeated_substrings(test_files):
    """Return the longest substring(s) s that is repeated in <test_files> according to rule:
            For each x in test_files: s occurs at least x['repeats'] times in x['text']
        test_files[name] = {'text': text, 'repeats': repeats}
    """
    common.note_time('start searching strings')
    common.report('find_repeated_substrings(%d,%d,%d)' % (len(test_files.keys()), _MIN_K, _MAX_K))

    if not test_files:
        print 'no test files'
        return

    # Find the substrings that are repeated >= k times in files with k repeats
    # It is important to test shorter files first
    file_names = [x for x in test_files.keys()]
    file_names.sort(key=lambda x: len(test_files[x]['text']))

    common.report('file_names:\n%s' % '\n'.join(['%8d:%3d: %s' % (len(test_files[name]['text']),
        test_files[name]['repeats'], name) for name in file_names]))

    # Start by finding all substrings of length _MIN_K which is typically 4
    k = _MIN_K
    if False:
        print 'Pure Python'
        pattern_offsets_list = get_offsets_from_texts(file_names, test_files, k)
    else:
        print 'Cython rolling hash'
        text_list, repeats_list = test_files_to_text_repeats(file_names, test_files)
        # Get rid of expensive references to big strings
        for name in file_names:
            test_files[name]['text'] = None
        pattern_offsets_list = rolling_hash.get_offsets_from_texts(text_list, repeats_list, k,
            _JUNK_KEY_THRESHOLD)
        text_repeats_to_test_files(file_names, test_files, text_list, repeats_list)
        text_list = None
        min_repeats_list = None
        if False:
            # Does not work. !@#$ Find out why
            while k >= _MIN_K:
                pattern_offsets_list = rolling_hash.get_offsets_from_texts(text_list, repeats_list,
                    k, _JUNK_KEY_THRESHOLD)
                if pattern_offsets_list[0]:
                    break
                print 'reducing k %d=>%d' % (k, k // 2)
                k = k // 2

    common.note_time('got substrings')

    offsets_dict = dict(zip(file_names, pattern_offsets_list))

    # Work in increasing length of substrings, +1 per round
    offsets_dict_dict = {}
    k_list = []
    for k in range(_MIN_K, _MAX_K):
        # Sparsify the text
        for name in file_names:
            test_files[name]['text'] = common.sparsify_by_offsets(test_files[name]['text'],
                offsets_dict[name], k)
        offsets_dict_dict[k] = offsets_dict
        k_list.append(k)
        common.note_time('found %3d substrings of length >= %3d' % (len(offsets_dict[file_names[0]]), k))
        child_offsets_dict = get_child_offsets(file_names, test_files, offsets_dict, k)
        if not child_offsets_dict:
            break
        offsets_dict = child_offsets_dict

    # The offsets dict may have too many repeats
    # Walk back through the offsets list to find the first one without excess repeats
    k_list.reverse()
    print '$' * 60
    print 'k_list', k_list
    for k in k_list:
        print '-' * 60
        offsets_dict = offsets_dict_dict[k]
        for key in sorted(offsets_dict[file_names[0]].keys()):
            print '%s:' % H(key),
            for name in file_names:
                if len(offsets_dict[name][key]) != test_files[name]['repeats']:
                    print '"%s":%d,%d' % (name, len(offsets_dict[name][key]), test_files[name]['repeats']),
            print
        for name in file_names:
            for key, ofs_set in offsets_dict[name].items():
                if len(ofs_set) > test_files[name]['repeats']:
                    for n in file_names:
                        del offsets_dict[n][key]
        #for name in file_names:
        #    if not offsets_dict[name]:
        #        del offsets_dict[name]
        print 'k=%d, offsets_dict=' % k
        for name in file_names:
            print ' ', name, ['"%s":%d' % (key, len(val)) for (key, val) in offsets_dict[name].items()]
        if all(offsets_dict.values()):
            break

    # Return last non-empty dict of offsets
    return offsets_dict
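# Usage sketch. The (path, repeats) pairs are illustrative assumptions; in practice
# <repeats> is the number of times the sought string is known to occur in each file.
def _find_repeats_in_files_sketch(path_repeats_list):
    """Build a <test_files> dict from (path, repeats) pairs and run the search.
        path_repeats_list: list of (file path, expected repeat count) tuples.
    """
    test_files = {}
    for path, repeats in path_repeats_list:
        with open(path, 'rb') as f:
            test_files[path] = {'text': f.read(), 'repeats': repeats}
    return find_repeated_substrings(test_files)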