Ejemplo n.º 1
0
    def preserve(self, in_file, out_file, *args, **kwargs):
        """
        Run the wrapped generator and re-apply the input's layout to its output.

        The characters of `in_file` are duplicated: one copy is passed through
        `strip_punc` and handed to `self.func`, the other is kept as a
        template. Each output letter takes the place of the next letter in the
        template and copies its case; every non-letter met on the way is
        written to `out_file` unchanged.

        If the generator yields more letters than the input contained, the
        surplus letters are written exactly as produced, back to back.

        Once the generator is exhausted, the non-letter characters still left
        in the template are written out as well, so that for instance a
        trailing newline at the end of the input survives.
        """
        for_generator, template = itertools.tee(file_chars(in_file))
        generated = self.func(strip_punc(for_generator), *args, **kwargs)
        for letter in generated:
            # Copy non-letters through until the input letter that this
            # output letter replaces is found (None: the input ran out).
            model = None
            for ch in template:
                if ch.isalpha():
                    model = ch
                    break
                out_file.write(ch)
            if model is None:
                # no input letter left to take the case from
                out_file.write(letter)
            elif model.isupper():
                out_file.write(letter.upper())
            else:
                out_file.write(letter.lower())
        # Flush trailing punctuation; leftover input letters are dropped.
        for ch in template:
            if ch.isalpha():
                continue
            out_file.write(ch)
Ejemplo n.º 2
0
 def __call__(self, text, *args, **kwargs):
     """
     Evaluate the wrapped measure on `text`.

     The text is first passed through `strip_punc` (removing punctuation
     and uppercasing), which seems the least astonishing behaviour: you
     would expect e.g. ioc(BEE_MOVIE) to ignore spaces. Any further
     positional and keyword arguments are forwarded to `self.measure`.
     """
     cleaned = strip_punc(text)
     return self.measure(cleaned, *args, **kwargs)
Ejemplo n.º 3
0
def permutation_from_key(key):
    """
    Build a low-level permutation from a keyword made of letters.

    The distinct letters of the key (first occurrence wins) are assigned to
    A, B, C, ... in order. The letters that do not occur in the key then fill
    the remaining slots in alphabetical order, starting just after the last
    letter of the key and wrapping round. Eg "linustorvalds" as key becomes
    ABCDEFGHIJKLMNOPQRSTUVWXYZ
    LINUSTORVADEFGHJKMPQWXYZBC
    This is /not/ the completely standard construction. The usual recipe
    (as on Wikipedia) continues with the unused letters from the start of the
    alphabet, but that leaks a lot of information: xyz will often map to xyz.
    Starting from the key's last letter offsets the tail more or less
    randomly. (Wikipedia's example happens to have a z in the key, which hides
    the problem.)

    The key is passed through `strip_punc` first, which strips punctuation and
    uppercases, so most inputs are accepted. An empty key gives the identity
    mapping.
    """
    plain = iter(string.ascii_uppercase)
    unused = set(string.ascii_uppercase)
    # OrderedDict keeps first-seen order while de-duplicating, and does so
    # under the 3.6 language spec too
    distinct = "".join(collections.OrderedDict.fromkeys(strip_punc(key)))
    table = {}
    # pivot used when the key is empty (not that an empty key is a good idea)
    last = 'A'
    for last, src in zip(distinct, plain):
        table[src] = last
        unused.remove(last)
    leftovers = sorted(unused)
    # leftovers is sorted, so the count of letters before the pivot is also
    # the index of the first letter at or after it
    offset = sum(1 for letter in leftovers if letter < last)
    rotated = leftovers[offset:] + leftovers[:offset]
    # `plain` now holds exactly the alphabet positions not yet assigned
    table.update(zip(plain, rotated))
    return Perm(table)
 def word_count(self, file_name):
     """
     Load `file_name` through `self.sc` and count the words in it.

     Resets `self.index_by_word` to an empty dict, stores the loaded text
     in `self.text` and stores in `self.counts` the (lowercased word,
     number of occurrences) pairs. Lines are split into words by
     `util.strip_punc`.
     """
     self.index_by_word = {}
     self.text = self.sc.textFile(file_name)
     words = self.text.flatMap(lambda line: util.strip_punc(line))
     ones = words.map(lambda word: (word.lower(), 1))
     self.counts = ones.reduceByKey(lambda a, b: a + b)
 def make_index(self):
     """
     Fill `self.index_by_word` with the line numbers each word occurs on.

     Every non-blank key of `self.counts` first gets an empty list. The
     lines of `self.text` are then numbered from 1 and, for each non-blank
     word that `util.strip_punc` yields for a line, the line number is
     appended under the lowercased word (once per occurrence).
     """
     for entry in self.counts.collect():
         key = entry[0]
         if key.strip() == '':
             continue
         self.index_by_word[key] = []
     for line_no, line in enumerate(self.text.collect(), start=1):
         for word in util.strip_punc(line):
             if word.strip() == '':
                 continue
             self.index_by_word[word.lower()].append(line_no)
Ejemplo n.º 6
0
    def strip(self,
              in_file,
              out_file,
              *args,
              compare=False,
              lowercase=False,
              block=BLOCK_DEFAULT,
              width=WIDTH_DEFAULT,
              **kwargs):
        """
        Run the wrapped generator on the letters of `in_file` and write the
        result to `out_file` as bare blocks of letters.

        All punctuation is stripped, the output is converted to uppercase and
        it is formatted into blocks of size `block`, on lines wrapped to
        length `width`. A `width` <= 0 disables wrapping and a `block` <= 0
        disables the spaces between blocks. (Either case could be simulated
        with the other, but supporting both is a nice courtesy.)

        With `compare=True` the original text and the processed text are
        printed on alternating lines, prefixed "i:" and "o:" and followed by
        a blank line, so that they can be compared. Lines are laid out 2
        characters narrower to make room for the prefix. If one of the two
        runs out first, empty lines are printed for it until the other is
        exhausted. A `width` of 1 or 2 raises ValueError in this mode.

        With `lowercase=True` the output is lowercase rather than uppercase.
        """
        if compare and 0 < width <= 2:
            raise ValueError("width should be > 2 in compare mode")

        def render(line):
            # apply the requested case and terminate the line
            cased = line.lower() if lowercase else line.upper()
            return "{}\n".format(cased)

        chars = strip_punc(file_chars(in_file))
        if not compare:
            processed = self.func(chars, *args, **kwargs)
            for line in get_lines(processed, block, width):
                out_file.write(render(line))
            return

        # compare mode: keep a second copy of the input to print alongside
        chars, plaintext = itertools.tee(chars)
        processed = self.func(chars, *args, **kwargs)
        narrow = width - 2
        processed_lines = get_lines(processed, block, narrow)
        plain_lines = get_lines(plaintext, block, narrow)
        paired = itertools.zip_longest(processed_lines,
                                       plain_lines,
                                       fillvalue="")
        for line, plain in paired:
            out_file.write("i:{}".format(render(plain)))
            out_file.write("o:{}".format(render(line)))
            out_file.write("\n")
Ejemplo n.º 7
0
 def make_index(self):
     """
     Fill `self.index_by_word` and keep a copy of the text lines.

     Every non-blank key of `self.counts` first gets an empty list. The
     lines of `self.text` are then numbered from 1; each line is stored in
     `self.lines` and, for each non-blank word that `util.strip_punc`
     yields for it, the line number is appended under the lowercased word
     (once per occurrence).
     """
     for entry in self.counts.collect():
         key = entry[0]
         if key.strip() == '':
             continue
         self.index_by_word[key] = []
     self.lines = []
     for line_no, line in enumerate(self.text.collect(), start=1):
         words = util.strip_punc(line)
         self.lines.append(line)
         for word in words:
             if word.strip() == '':
                 continue
             self.index_by_word[word.lower()].append(line_no)
Ejemplo n.º 8
0
        yield tuple(Perm.random(string.ascii_uppercase) for _ in range(2))
        # try modifying just one of sigma or tau
        yield from (
            (self.sigma * p, self.tau)
            for p in random.sample(MOD_PERMUTATIONS, k=len(MOD_PERMUTATIONS)))
        yield from (
            (self.sigma, self.tau * p)
            for p in random.sample(MOD_PERMUTATIONS, k=len(MOD_PERMUTATIONS)))

    def get_score(self, state):
        """
        Score a candidate state.

        `self.text` is run through `autoperm_decipher.func` with the
        components of `state` as extra arguments, and the result is passed
        to `quadgram_score.no_strip`.
        """
        deciphered = autoperm_decipher.func(self.text, *state)
        return quadgram_score.no_strip(deciphered)

    def set_state(self, state):
        """Unpack the pair `state` into `self.sigma` and `self.tau`."""
        sigma, tau = state
        self.sigma = sigma
        self.tau = tau

    def get_state(self):
        """Return the current state as the tuple (sigma, tau)."""
        state = (self.sigma, self.tau)
        return state


if __name__ == "__main__":
    # Demo run: encipher a known text under two keyword-derived
    # permutations, print the ciphertext and run the hill climber on it.
    from metric import BEE_MOVIE
    from util import permutation_from_key, strip_punc

    sigma = permutation_from_key("richardstallman")
    tau = permutation_from_key("linustorvalds")
    plaintext = "".join(strip_punc(BEE_MOVIE))

    ciphertext = "".join(autoperm_encipher.func(plaintext, sigma, tau))
    print("ciphertext: {}".format(ciphertext))

    hill_climber = AutopermHillClimber(ciphertext, 1)
    hill_climber.hill_climb()
Ejemplo n.º 9
0
    def modify_state(self):
        """
        Yield candidate states: `self.key` multiplied by each element of
        MOD_PERMUTATIONS, visited in a fresh random order.
        """
        # Shuffling is not in a bottleneck here, and it hopefully keeps the
        # search path from becoming too homogeneous.
        shuffled = random.sample(MOD_PERMUTATIONS, k=len(MOD_PERMUTATIONS))
        for p in shuffled:
            yield self.key * p

    def get_score(self, state):
        """
        Score a candidate state.

        `self.text` is run through `substitution.func` with `state` as the
        key, and the result is passed to `quadgram_score.no_strip`.
        """
        deciphered = substitution.func(self.text, state)
        return quadgram_score.no_strip(deciphered)

    def set_state(self, state):
        """Store `state` as the current key (`self.key`)."""
        key = state
        self.key = key

    def get_state(self):
        """Return the current key (`self.key`)."""
        key = self.key
        return key


if __name__ == "__main__":
    from metric import BEE_MOVIE
    from util import permutation_from_key, strip_punc, file_chars

    if not sys.stdin.isatty():
        # stdin is not a terminal: take the ciphertext from it
        ciphertext = "".join(strip_punc(file_chars(sys.stdin)))
    else:
        # stdin is a terminal: make a demo ciphertext from a known text
        plaintext = "".join(strip_punc(BEE_MOVIE))
        key = permutation_from_key("linustorvalds")
        ciphertext = "".join(substitution.func(plaintext, key))

    hill_climber = SubstitutionHillClimber(ciphertext, 20)
    hill_climber.hill_climb()