def preserve(self, in_file, out_file, *args, **kwargs):
    """
    Run the generator over the letters of `in_file` and write the result to
    `out_file` with the input's punctuation and letter case put back.

    Non-letter characters of the input are copied through unchanged, and
    each output letter takes the case of the input letter whose place it
    fills. If the generator yields more letters than the input contained,
    the surplus letters are written exactly as produced, back to back.

    Once the generator is exhausted, every non-letter character still left
    in the input is written out (so, for instance, the trailing newline of
    a text file survives); any leftover input letters are dropped.

    Extra positional and keyword arguments are forwarded to `self.func`.
    """
    # One copy of the character stream feeds the generator, the other is
    # walked in step with the output to recover punctuation and case.
    letter_source, layout_source = itertools.tee(file_chars(in_file))

    for produced in self.func(strip_punc(letter_source), *args, **kwargs):
        # Stays a non-letter if the layout stream has already run dry.
        template = ' '
        for template in layout_source:
            if template.isalpha():
                break
            out_file.write(template)

        if template.isalpha():
            # Mirror the case of the letter being replaced.
            if template.isupper():
                produced = produced.upper()
            else:
                produced = produced.lower()
        out_file.write(produced)

    # Flush whatever punctuation follows the last letter we emitted.
    for leftover in layout_source:
        if leftover.isalpha():
            continue
        out_file.write(leftover)
def __call__(self, text, *args, **kwargs):
    """
    Evaluate the wrapped measure on `text`.

    The text is passed through `strip_punc` before it reaches
    `self.measure` (per the original author's note, this removes
    punctuation and uppercases the text), which is the least surprising
    behaviour: you would not expect e.g. ioc(BEE_MOVIE) to take spaces
    into account. Any further arguments are handed to `self.measure`
    untouched.
    """
    cleaned = strip_punc(text)
    return self.measure(cleaned, *args, **kwargs)
def permutation_from_key(key):
    """
    Build a low-level permutation of the alphabet from a keyword.

    Repeated letters in the key are dropped (the first occurrence wins) and
    the de-duplicated key is laid under the start of the alphabet. The
    letters the key did not use then fill the remaining positions in
    alphabetical order, beginning at the first unused letter that does not
    come before the key's last letter and wrapping around. Eg
    "linustorvalds" as key becomes
        ABCDEFGHIJKLMNOPQRSTUVWXYZ
        LINUSTORVADEFGHJKMPQWXYZBC

    This is deliberately /not/ the textbook construction, which simply
    continues with the unused letters from the start of the alphabet: that
    approach tends to leave the tail of the alphabet mapped to itself
    (xyz -> xyz) and so leaks information, whereas here the tail ends up
    offset by an amount that depends on the key.

    The key is first passed through `strip_punc`, so (as the original
    author notes) punctuation is discarded and case is ignored. An empty
    key maps every letter to itself.
    """
    plain_letters = iter(string.ascii_uppercase)
    # use an OrderedDict so first-seen order is kept under the 3.6 spec too
    deduped_key = "".join(collections.OrderedDict.fromkeys(strip_punc(key)))

    mapping = {}
    unused = set(string.ascii_uppercase)
    # fallback for an empty key (although that's not a good idea)
    last_key_letter = 'A'
    for last_key_letter, plain in zip(deduped_key, plain_letters):
        mapping[plain] = last_key_letter
        unused.remove(last_key_letter)

    leftovers = sorted(unused)
    # leftovers is sorted, so the number of entries below the key's last
    # letter is also the index of the first entry at or above it.
    pivot = sum(1 for letter in leftovers if letter < last_key_letter)
    rotated = leftovers[pivot:] + leftovers[:pivot]
    # plain_letters has exactly as many letters left as there are leftovers
    mapping.update(zip(plain_letters, rotated))
    return Perm(mapping)
def word_count(self, file_name):
    """
    Load `file_name` through `self.sc.textFile` and prepare word counts.

    Resets `self.index_by_word` to an empty dict, stores the loaded text in
    `self.text`, and stores in `self.counts` the result of splitting each
    line with `util.strip_punc`, lower-casing every piece and summing a
    count of 1 per occurrence with `reduceByKey`.
    """
    self.index_by_word = {}
    self.text = self.sc.textFile(file_name)

    # NOTE(review): self.sc is presumably a SparkContext - confirm.
    pieces = self.text.flatMap(lambda line: util.strip_punc(line))
    tagged = pieces.map(lambda word: (word.lower(), 1))
    self.counts = tagged.reduceByKey(lambda a, b: a + b)
def make_index(self):
    """
    Fill `self.index_by_word` with the line numbers each word occurs on.

    Every non-blank key found in `self.counts` first gets an empty list.
    The collected text is then walked line by line (numbered from 1); each
    line is split with `util.strip_punc` and the line number is appended
    under the lower-cased form of every non-blank piece.
    """
    for entry in self.counts.collect():
        key = entry[0]
        if key.strip() == '':
            continue
        self.index_by_word[key] = []

    for line_no, raw_line in enumerate(self.text.collect(), start=1):
        for word in util.strip_punc(raw_line):
            if word.strip() == '':
                continue
            self.index_by_word[word.lower()].append(line_no)
def strip(self, in_file, out_file, *args, compare=False, lowercase=False,
          block=BLOCK_DEFAULT, width=WIDTH_DEFAULT, **kwargs):
    """
    Write the generator's output as bare letters, grouped and wrapped.

    The input is stripped of punctuation before it is fed to `self.func`;
    the output is upper-cased and laid out by `get_lines` in blocks of
    `block` letters on lines of at most `width` characters. A width <= 0
    turns wrapping off and a block <= 0 turns the spacing off. (Either
    setting could emulate the other, but supporting both is a courtesy.)

    With `compare=True` the stripped input and the output are printed
    alternately, one line each (prefixed "i:" and "o:") followed by a
    blank line, so they can be compared by eye. The two-character prefix
    is taken out of `width`. If one side runs out first, it is padded
    with empty lines until the other is exhausted.

    With `lowercase=True` everything is written in lowercase instead.

    Raises ValueError in compare mode when 0 < width <= 2, since no room
    would be left for text after the prefix.
    """
    if compare and 0 < width <= 2:
        raise ValueError("width should be > 2 in compare mode")

    def finish(text):
        # Apply the requested case and terminate the line.
        cased = text.lower() if lowercase else text.upper()
        return "{}\n".format(cased)

    stripped = strip_punc(file_chars(in_file))

    if not compare:
        produced = self.func(stripped, *args, **kwargs)
        for row in get_lines(produced, block, width):
            out_file.write(finish(row))
        return

    # Compare mode: the input is needed twice, once for the generator and
    # once to be shown next to its output.
    stripped, plaintext = itertools.tee(stripped)
    produced = self.func(stripped, *args, **kwargs)
    inner_width = width - 2
    output_rows = get_lines(produced, block, inner_width)
    plain_rows = get_lines(plaintext, block, inner_width)
    paired = itertools.zip_longest(output_rows, plain_rows, fillvalue="")
    for out_row, plain_row in paired:
        out_file.write("i:{}".format(finish(plain_row)))
        out_file.write("o:{}".format(finish(out_row)))
        out_file.write("\n")
def make_index(self):
    """
    Fill `self.index_by_word` with the line numbers each word occurs on,
    and keep the collected lines in `self.lines`.

    Every non-blank key found in `self.counts` first gets an empty list.
    The collected text is then walked line by line (numbered from 1); each
    original line is appended to `self.lines`, split with
    `util.strip_punc`, and the line number is appended under the
    lower-cased form of every non-blank piece.
    """
    for entry in self.counts.collect():
        key = entry[0]
        if key.strip() == '':
            continue
        self.index_by_word[key] = []

    self.lines = []
    for line_no, raw_line in enumerate(self.text.collect(), start=1):
        pieces = util.strip_punc(raw_line)
        self.lines.append(raw_line)
        for word in pieces:
            if word.strip() == '':
                continue
            self.index_by_word[word.lower()].append(line_no)
# NOTE(review): this chunk opens partway through a generator whose `def`
# line is not visible here; the three yields below are its tail.
    # First candidate: a 2-tuple built from two Perm.random(...) calls.
    yield tuple(Perm.random(string.ascii_uppercase) for _ in range(2))
    # try modifying just one of sigma or tau: first sigma with tau held
    # fixed, then tau with sigma held fixed. Each pass goes through every
    # entry of MOD_PERMUTATIONS in its own freshly sampled order.
    yield from (
        (self.sigma * p, self.tau)
        for p in random.sample(MOD_PERMUTATIONS, k=len(MOD_PERMUTATIONS)))
    yield from (
        (self.sigma, self.tau * p)
        for p in random.sample(MOD_PERMUTATIONS, k=len(MOD_PERMUTATIONS)))


def get_score(self, state):
    """
    Score a candidate state: run autoperm_decipher.func over self.text
    with the state's elements as the remaining positional arguments, and
    return quadgram_score.no_strip of the result.
    """
    return quadgram_score.no_strip(
        autoperm_decipher.func(self.text, *state))


def set_state(self, state):
    """Unpack a two-element state into self.sigma and self.tau."""
    self.sigma, self.tau = state


def get_state(self):
    """Return the current state as the tuple (self.sigma, self.tau)."""
    return self.sigma, self.tau


if __name__ == "__main__":
    # Demo run: encipher BEE_MOVIE with two keyword-derived permutations,
    # print the ciphertext, then hill-climb on it.
    from metric import BEE_MOVIE
    from util import permutation_from_key, strip_punc

    plaintext = "".join(strip_punc(BEE_MOVIE))
    sigma = permutation_from_key("richardstallman")
    tau = permutation_from_key("linustorvalds")
    ciphertext = "".join(autoperm_encipher.func(plaintext, sigma, tau))
    print("ciphertext: {}".format(ciphertext))
    # NOTE(review): the meaning of the second argument (1) is not visible
    # in this chunk - check AutopermHillClimber's constructor.
    hill_climber = AutopermHillClimber(ciphertext, 1)
    hill_climber.hill_climb()
def modify_state(self):
    """
    Yield candidate keys: the current key multiplied by each entry of
    MOD_PERMUTATIONS in turn, visited in a freshly shuffled order.
    """
    # The shuffle is not in a bottleneck, and it hopefully keeps the search
    # path from becoming too homogeneous.
    shuffled = random.sample(MOD_PERMUTATIONS, k=len(MOD_PERMUTATIONS))
    for tweak in shuffled:
        yield self.key * tweak


def get_score(self, state):
    """
    Score a candidate key: apply substitution.func to self.text with it
    and return quadgram_score.no_strip of the result.
    """
    deciphered = substitution.func(self.text, state)
    return quadgram_score.no_strip(deciphered)


def set_state(self, state):
    """Adopt `state` as the current key."""
    self.key = state


def get_state(self):
    """Return the current key."""
    return self.key


if __name__ == "__main__":
    from metric import BEE_MOVIE
    from util import permutation_from_key, strip_punc, file_chars

    if not sys.stdin.isatty():
        # Something was piped in: treat it as the ciphertext to attack.
        ciphertext = "".join(strip_punc(file_chars(sys.stdin)))
    else:
        # Interactive run: make up a ciphertext from BEE_MOVIE to attack.
        plaintext = "".join(strip_punc(BEE_MOVIE))
        key = permutation_from_key("linustorvalds")
        ciphertext = "".join(substitution.func(plaintext, key))

    hill_climber = SubstitutionHillClimber(ciphertext, 20)
    hill_climber.hill_climb()