def get_offsets_from_texts(file_names, test_files, k):
    """Return, for each file, the offsets of the <k> character substrings
        that survive filtering in every file.

        The files are processed in the order given by <file_names>. For each
        file get_substrings() is called with the text, <k> and the substrings
        that survived the previous file (None for the first file), and the
        result is passed through filter_repeats() and filter_junk_strings().

        file_names: names of the files to process; keys of <test_files>
        test_files: test_files[<filename>] has a 'text' and a 'repeats' entry
        k: substring length (an integer; it is formatted with %d)

        Returns: a list with one dict per entry of <file_names>, in the same
            order, mapping <substring> to
            common.get_substring_offsets(<text of that file>, <substring>),
            or None if no substring survives the filtering of some file.
    """
    common.note_time('get_offsets_from_texts k=%d' % k)

    allowed_substrings = None
    for name in file_names:
        entry = test_files[name]
        substrings = get_substrings(entry['text'], k, allowed_substrings)
        substrings = filter_repeats(substrings, entry['repeats'])
        substrings = filter_junk_strings(substrings)
        if not substrings:
            print('No %d character string works!' % k)
            return None
        # The survivors of this file are the only candidates for the next one
        allowed_substrings = substrings.keys()

    # A pruning pass used to follow the loop above. It could never delete
    # anything because allowed_substrings is exactly the key list of the final
    # <substrings> dict, so it has been removed.
    # common.note_time is used here to match the call at the top of the function.
    common.note_time('got substrings')

    # From now on work with offsets
    # offsets_dict[<filename>][<substring>] = list of offsets of <substring>
    #   in file with name <filename>
    offsets_dict = {}
    for name in file_names:
        text = test_files[name]['text']
        offsets_dict[name] = dict(
            (key, common.get_substring_offsets(text, key))
            for key in substrings.keys())

    return [offsets_dict[name] for name in file_names]
def compare_string_subsets(mask, subset_fraction, num_tests):
    """Find the substrings that are repeated in several random subsets of the
        list of files matched by <mask>, in which the filename encodes the
        number of repeats as defined in name_to_repeats() above, and check the
        offsets reported for every file of every subset.

        mask: passed to get_test_files() to select the test files
        subset_fraction: each subset holds int(<number of files> * subset_fraction)
            files
        num_tests: number of random subsets to test

        The random number generator is seeded with a fixed value so the same
        subsets are chosen on every run.

        Returns: None. Results are printed. An AssertionError is raised if the
            offsets returned by FRS.find_repeated_substrings() differ from
            those found by common.get_substring_offsets().

        NOTE(review): the substring list of each subset is collected in
        subset_substring_list_list but the lists are not compared with each
        other yet.
    """
    file_line = '%2d: size=%7d, repeats=%3d, name="%s"'

    # Read the files matched by <mask>
    test_files = get_test_files(mask)
    print('%d files in mask "%s"' % (len(test_files), mask))
    if not test_files:
        print('no test files')
        return

    # Smallest text first
    file_names = sorted(test_files.keys(),
                        key=lambda name: len(test_files[name]['text']))

    for i, name in enumerate(file_names):
        entry = test_files[name]
        print(file_line % (i, len(entry['text']), entry['repeats'], name))
    print('=' * 60)

    subset_size = int(len(test_files) * subset_fraction)
    print('Testing %d subsets of size %d from total of %d'
          % (num_tests, subset_size, len(test_files)))

    random.seed(111)

    # Every subset has the same size, so an empty one means there is nothing
    # to search in any of the tests.
    if not file_names[:subset_size]:
        print('no test files')
        return

    subset_substring_list_list = []
    for _ in range(num_tests):
        shuffled_names = file_names[:]
        random.shuffle(shuffled_names)
        subset_names = shuffled_names[:subset_size]

        if not common.is_quiet():
            for i, name in enumerate(subset_names):
                entry = test_files[name]
                print(file_line % (i, len(entry['text']), entry['repeats'], name))
            print('-' * 60)

        # Search the subset only. The full set used to be searched here, which
        # made every test identical and left the subset unused.
        # Assumes find_repeated_substrings() accepts any mapping shaped like
        # test_files and keys its result by the same names - TODO confirm.
        subset_files = dict((name, test_files[name]) for name in subset_names)
        offsets_dict = FRS.find_repeated_substrings(subset_files)
        substring_list = list(offsets_dict[subset_names[0]].keys())
        subset_substring_list_list.append(substring_list)

        print('Found %d substrings' % len(substring_list))
        for i, substring in enumerate(substring_list):
            print('%2d: len=%3d, substring="%s"'
                  % (i, len(substring), H(substring)))
            for name in subset_names:
                entry = test_files[name]
                offsets = sorted(offsets_dict[name][substring])
                print('%40s: needed=%-2d,num=%-2d,offsets=%s'
                      % (os.path.basename(name), entry['repeats'],
                         len(offsets), offsets))
                # Cross-check against a direct search of the text
                offsets2 = sorted(
                    common.get_substring_offsets(entry['text'], substring))
                assert offsets2 == offsets
def find_and_show_substrings(mask):
    """Find the substrings that are repeated in the list of files matched by
        <mask>, in which the filename encodes the number of repeats as defined
        in name_to_repeats() above, and print them.

        A summary line is printed for each substring, followed by a detailed
        section per substring that lists its offsets in every test file.

        mask: passed to get_test_files() to select the test files

        Returns: None. Results are printed. An AssertionError is raised if the
            offsets returned by FRS.find_repeated_substrings() differ from
            those found by common.get_substring_offsets().
    """
    # Read the files matched by <mask>
    test_files = get_test_files(mask)
    print('%d files in mask "%s"' % (len(test_files), mask))
    if not test_files:
        print('no test files')
        return

    # Smallest text first
    file_names = sorted(test_files.keys(),
                        key=lambda name: len(test_files[name]['text']))

    for i, name in enumerate(file_names):
        entry = test_files[name]
        print('%2d: size=%7d, repeats=%3d, name="%s"'
              % (i, len(entry['text']), entry['repeats'], name))
    print('-' * 60)

    offsets_dict = FRS.find_repeated_substrings(test_files)

    # Print out the results
    substring_list = list(offsets_dict[file_names[0]].keys())
    print('Substrings summary:')
    print('Found %d substrings' % len(substring_list))
    for i, substring in enumerate(substring_list):
        print('%2d: len=%3d, substring="%s"' % (i, len(substring), H(substring)))

    print('=' * 80)
    print('Substrings in detail:')
    # enumerate() supplies the substring number for the heading. The heading
    # used to print a stale index left over from other loops.
    for i, substring in enumerate(substring_list):
        print('Substring %2d %s' % (i, '-' * 60))
        print('len=%3d, substring="%s"' % (len(substring), substring))
        print('hex = %s' % (H(substring),))
        print('Offsets of substring in test files:')
        for name in file_names:
            entry = test_files[name]
            offsets = sorted(offsets_dict[name][substring])
            print('%40s: needed=%-2d,num=%-2d,offsets=%s'
                  % (os.path.basename(name), entry['repeats'],
                     len(offsets), offsets))
            # Cross-check against a direct search of the text
            offsets2 = sorted(
                common.get_substring_offsets(entry['text'], substring))
            assert offsets2 == offsets