Example 1
def run(args):
    data = read_data(args.corpus_file)
    chars = list(set(data))
    data_size, vocab_size = len(data), len(chars)
    print("Data has {} characters; {} unique.".format(data_size, vocab_size))
    e = Encoding(chars)
    model = VanillaRNN(encoding=e,
                       input_size=vocab_size,
                       hidden_size=args.hidden_size,
                       sequence_length=args.sequence_length,
                       learning_rate=args.learning_rate)
    model.fit(data, num_iterations=args.num_iterations)
    with open(get_output_file(args), 'w') as f:
        for _ in range(args.num_samples):
            seed = np.random.randint(low=0, high=vocab_size)
            seq = model.generate_sequence(seed, args.sample_size)
            f.write(e.decode(seq))
            f.write('\n\n')
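
A minimal sketch of how `run` could be hooked up to a command line, assuming an argparse-based entry point. The flag names mirror the `args` attributes used above, while the defaults are illustrative assumptions; the snippet also assumes `import numpy as np`, the helper functions, and `VanillaRNN` are in scope, and `get_output_file(args)` may rely on further flags not shown here.

# Hypothetical CLI wrapper for the `run` function above; flag names follow
# the attributes accessed on `args`, defaults are illustrative only.
import argparse

def main():
    parser = argparse.ArgumentParser(description="Train a character-level RNN.")
    parser.add_argument("--corpus-file", required=True)
    parser.add_argument("--hidden-size", type=int, default=100)
    parser.add_argument("--sequence-length", type=int, default=25)
    parser.add_argument("--learning-rate", type=float, default=0.1)
    parser.add_argument("--num-iterations", type=int, default=10000)
    parser.add_argument("--num-samples", type=int, default=5)
    parser.add_argument("--sample-size", type=int, default=200)
    # get_output_file(args) presumably derives the output path from
    # additional arguments not shown in this sketch.
    run(parser.parse_args())

if __name__ == "__main__":
    main()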
Example 2
class LangRec(Module):
    def __init__(self, data_set: DataSet, ngram_size: int, emb_size: int,
                 hid_size: int):
        """Initialize the language recognition module.

        Args:
            data_set: the dataset from which the set of input symbols
                and output classes (languages) can be extracted
            ngram_size: size of n-gram features (e.g., use 1 for unigrams,
                2 for bigrams, etc.)
            emb_size: size of the character embedding vectors
            hid_size: size of the hidden layer of the FFN used for scoring
        """
        # Keep the size of the ngrams
        self.ngram_size = ngram_size
        # Calculate the embedding alphabet and create the embedding sub-module
        feat_set = self.alphabet(data_set)
        self.register("emb", Embedding(feat_set, emb_size))
        # Encoding (mapping between langs and ints)
        lang_set = set(lang for (_, lang) in data_set)
        self.enc = Encoding(lang_set)
        # Scoring FFN sub-module
        self.register("ffn",
                      FFN(idim=emb_size, hdim=hid_size, odim=len(lang_set)))
        # Additional check to verify that all the registered
        # parameters actually require gradients.  This helps to
        # identify the "bug" in the embedding module.
        assert all(param.requires_grad for param in self.params())

    def preprocess(self, name: Name) -> Name:
        """Name preprocessing."""
        # Currently no preprocessing, but we could think of something
        # in the future.
        return name

    def features(self, name: Name) -> Iterator[str]:
        """Retrieve the list of features in the given name."""
        return ngrams(self.preprocess(name), self.ngram_size)

    def alphabet(self, data_set: DataSet) -> Set[str]:
        """Retrieve the embedding alphabet from the dataset.

        Retrieve the set of all features that we want to embed from
        the given dataset.
        """
        return set(feat for (name, _) in data_set
                   for feat in self.features(name))

    def encode(self, lang: Lang) -> int:
        """Encode the given language as an integer."""
        return self.enc.encode(lang)

    def forward(self, name: Name) -> TT:
        """The forward calculation of the name's language recognition model.

        Args:
            name: a person name

        Returns:
            score vector corresponding to the name, with its individual
            elements corresponding to the scores of different languages
        """
        embeddings = [self.emb.forward(feat) for feat in self.features(name)]
        cbow = sum(embeddings)
        scores = self.ffn.forward(cbow)
        return scores

    def classify(self, name: Name) -> Dict[Lang, float]:
        """Classify the given person name.

        Args:
            name: person name, sequence of characters

        Returns:
            the mapping from languages to their probabilities
            for the given name.
        """
        # We don't want Pytorch to calculate the gradients
        with torch.no_grad():
            # The vector of scores for the given name
            scores = self.forward(name)
            # We map the vector of scores to the vector of probabilities.
            probs = torch.softmax(scores, dim=0)
            # Result dictionary
            res = {}
            # `ix` ranges over the indices of the scores/probs vector
            for ix in range(len(probs)):
                lang = self.enc.decode(ix)
                # Store a plain float rather than a 0-dim tensor
                res[lang] = probs[ix].item()
            return res

    def classify_one(self, name: Name) -> Lang:
        """A simplified version of `classify` which returns the
        language with the highest score."""
        prob_map = self.classify(name)
        # Pick the language with the highest probability; use a fresh
        # variable so as not to shadow the `name` argument.
        (lang, _prob) = max(prob_map.items(), key=lambda pair: pair[1])
        return lang
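
The `ngrams` helper used by `features` is not defined in this snippet. Below is a minimal sketch of what it is assumed to do, namely yield the contiguous character n-grams of a name; the actual course helper may differ (for instance by adding boundary markers).

from typing import Iterator

def ngrams(text: str, n: int) -> Iterator[str]:
    """Yield the contiguous character n-grams of length `n` in `text`.

    A stand-in for the helper assumed above; the real implementation may
    differ (e.g. by padding with boundary symbols).
    """
    for i in range(len(text) - n + 1):
        yield text[i:i + n]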
Example 3
class Test_Encoding(unittest.TestCase):
    def setUp(self):
        self.encoder = Encoding()

    def tearDown(self):
        self.encoder = None

    def test_single_character(self):
        decimal = self.encoder.encode_decimal("A")
        self.assertEqual(decimal, 16777217)
        hex_value = self.encoder.encode_hex("A")
        self.assertEqual(int(hex_value, 16), int('0x01000001', 16))

    def test_full_bundle(self):
        decimal = self.encoder.encode_decimal("FRED")
        self.assertEqual(decimal, 251792692)
        hex_value = self.encoder.encode_hex("FRED")
        self.assertEqual(int(hex_value, 16), int('0x0F020d34', 16))

    def test_non_alphanumerics(self):
        decimal = self.encoder.encode_decimal(" :^)")
        self.assertEqual(decimal, 79094888)
        hex_value = self.encoder.encode_hex(" :^)")
        self.assertEqual(int(hex_value, 16), int('0x04B6E468', 16))

    def test_foo(self):
        decimal = self.encoder.encode_decimal("foo")
        self.assertEqual(decimal, 124807030)

    def test_foo_with_space(self):
        decimal = self.encoder.encode_decimal(" foo")
        self.assertEqual(decimal, 250662636)

    def test_foot(self):
        decimal = self.encoder.encode_decimal("foot")
        self.assertEqual(decimal, 267939702)

    def test_BIRD(self):
        decimal = self.encoder.encode_decimal("BIRD")
        self.assertEqual(decimal, 251930706)

    def test_periods(self):
        decimal = self.encoder.encode_decimal("....")
        self.assertEqual(decimal, 15794160)

    def test_carrots(self):
        decimal = self.encoder.encode_decimal("^^^^")
        self.assertEqual(decimal, 252706800)

    def test_Woot(self):
        decimal = self.encoder.encode_decimal("Woot")
        self.assertEqual(decimal, 266956663)

    def test_no(self):
        decimal = self.encoder.encode_decimal("no")
        self.assertEqual(decimal, 53490482)

    def test_email(self):
        decimal = self.encoder.encode_decimal("a@b.")
        self.assertEqual(decimal, 131107009)

    def test_my_email(self):
        decimal = self.encoder.encode_decimal("me@a")
        self.assertEqual(decimal, 263197451)

    # ----------- Part 2 ----------------------------

    def test_encode_array_tacocat(self):
        encoded = self.encoder.encode("tacocat")
        self.assertEqual(encoded, [267487694, 125043731])

    def test_decode_FRED(self):
        decoded = self.encoder.decode_decimal(251792692)
        self.assertEqual(decoded, "FRED")

    def test_decode_array_tacocat(self):
        decoded = self.encoder.decode([267487694, 125043731])
        self.assertEqual(decoded, "tacocat")

    def test_decode_array_never_odd(self):
        decoded = self.encoder.decode(
            [267657050, 233917524, 234374596, 250875466, 17830160])
        self.assertEqual(decoded, "never odd or even")

    def test_decode_array_lager(self):
        decoded = self.encoder.decode(
            [267394382, 167322264, 66212897, 200937635, 267422503])
        self.assertEqual(decoded, "lager, sir, is regal")

    def test_decode_array_go_hang(self):
        decoded = self.encoder.decode([
            200319795, 133178981, 234094669, 267441422, 78666124, 99619077,
            267653454, 133178165, 124794470
        ])
        self.assertEqual(decoded, "go hang a salami, I'm a lasagna hog")

    def test_decode_array_egad(self):
        decoded = self.encoder.decode([
            267389735, 82841860, 267651166, 250793668, 233835785, 267665210,
            99680277, 133170194, 124782119
        ])
        self.assertEqual(decoded, "egad, a base tone denotes a bad age")

    def test_bothways(self):
        self.assertEqual("bothways",
                         self.encoder.decode(self.encoder.encode("bothways")))
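
The `Encoding` class exercised by these tests is not shown. Working backwards from the expected values, one implementation that satisfies them packs each group of four 7-bit ASCII characters into a single 28-bit integer, storing bit j of character i (characters counted left to right from 0) at bit position 4*j + i and padding short groups with NUL characters. The sketch below is that reconstruction, offered as an illustration rather than as the original class:

from typing import List


class Encoding:
    """A reverse-engineered sketch consistent with the expected test values
    above (not necessarily the original class under test).

    Each group of four 7-bit ASCII characters is packed into one 28-bit
    integer: bit j of character i (characters counted left to right from 0)
    is stored at bit position 4 * j + i.  Short groups are padded with NUL
    characters, which decode back to nothing.
    """

    def encode_decimal(self, chunk: str) -> int:
        """Pack up to four characters into a single integer."""
        result = 0
        for i, char in enumerate(chunk):
            for j in range(7):
                if ord(char) >> j & 1:
                    result |= 1 << (4 * j + i)
        return result

    def encode_hex(self, chunk: str) -> str:
        """Hexadecimal rendering of `encode_decimal`."""
        return "0x{:08X}".format(self.encode_decimal(chunk))

    def encode(self, text: str) -> List[int]:
        """Encode a string of any length as a list of packed integers."""
        return [self.encode_decimal(text[i:i + 4])
                for i in range(0, len(text), 4)]

    def decode_decimal(self, number: int) -> str:
        """Invert `encode_decimal`, dropping NUL padding."""
        chars = []
        for i in range(4):
            code = 0
            for j in range(7):
                if number >> (4 * j + i) & 1:
                    code |= 1 << j
            if code:
                chars.append(chr(code))
        return "".join(chars)

    def decode(self, numbers: List[int]) -> str:
        """Invert `encode` for a list of packed integers."""
        return "".join(self.decode_decimal(n) for n in numbers)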
Example 4
class LangRec(Module):

    # TODO: Implement this method.
    def __init__(self, data_set: DataSet, emb_size: int, hid_size: int):
        """Initialize the language recognition module.

        Args:
            data_set: the dataset from which the set of input symbols
                and output classes (languages) can be extracted
            emb_size: size of the character embedding vectors
            hid_size: size of the hidden layer of the FFN used for scoring
        """
        char_set = char_set_in(data_set)
        # Embedding
        self.register("emb", Embedding(char_set, emb_size))
        lang_set = set(lang for (name, lang) in data_set)
        lang_num = len(lang_set)
        # Encoding (mapping between langs and ints)
        self.enc = Encoding(lang_set)
        # FFN
        self.register("ffn", FFN(idim=emb_size, hdim=hid_size, odim=lang_num))

    # TODO: Implement this method.
    def encode(self, lang: Lang) -> int:
        """Encode the given language as an integer."""
        return self.enc.encode(lang)

    # TODO: Implement this method.
    def forward(self, name: Name) -> TT:
        """The forward calculation of the name's language recognition model.

        Args:
            name: a person name

        Returns:
            score vector corresponding to the name, with its individual
            elements corresponding to the scores of different languages
        """
        embeddings = [self.emb.forward(char) for char in name]
        cbow = sum(embeddings)
        scores = self.ffn.forward(cbow)
        return scores

    # TODO: Implement this method.
    def classify(self, name: Name) -> Dict[Lang, float]:
        """Classify the given person name.

        Args:
            name: person name, sequence of characters

        Returns:
            the mapping from languages to their probabilities
            for the given name.
        """
        # We don't want Pytorch to calculate the gradients
        with torch.no_grad():
            # The vector of scores for the given name
            scores = self.forward(name)
            # We map the vector of scores to the vector of probabilities.
            probs = torch.softmax(scores, dim=0)
            # Result dictionary
            res = {}
            # `ix` ranges over the indices of the scores/probs vector
            for ix in range(len(probs)):
                lang = self.enc.decode(ix)
                # Store a plain float rather than a 0-dim tensor
                res[lang] = probs[ix].item()
            return res
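
The `char_set_in` helper called in `__init__` is not defined in this snippet. A minimal sketch of what it is assumed to compute, namely the set of characters occurring in the dataset's names, treating the dataset as an iterable of (name, language) pairs as elsewhere in the class:

def char_set_in(data_set):
    """Collect the set of characters occurring in the names of the dataset.

    A stand-in for the helper assumed above; `data_set` is taken to be an
    iterable of (name, language) pairs, as in the rest of the class.
    """
    return set(char for (name, _lang) in data_set for char in name)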
Example 5
class LangRec(Module):
    def __init__(self, data_set: DataSet, ngram_size: int, emb_size: int,
                 hid_size: int):
        """Initialize the language recognition module.

        Args:
            data_set: the dataset from which the set of input symbols
                and output classes (languages) can be extracted
            ngram_size: size of n-gram features (e.g., use 1 for unigrams,
                2 for bigrams, etc.)
            emb_size: size of the character embedding vectors
            hid_size: size of the hidden layer of the FFN used for scoring
        """
        # Keep the size of the ngrams
        self.ngram_size = ngram_size
        # Calculate the embedding alphabet and create the embedding sub-module
        feat_set = self.alphabet(data_set)
        self.register("emb", EmbeddingSum(feat_set, emb_size))
        # Encoding (mapping between langs and ints)
        lang_set = set(lang for (_, lang) in data_set)
        self.enc = Encoding(lang_set)
        # Scoring FFN sub-module
        self.register("ffn",
                      FFN(idim=emb_size, hdim=hid_size, odim=len(lang_set)))
        # Additional check to verify that all the registered
        # parameters actually require gradients.
        assert all(param.requires_grad for param in self.params())

    def preprocess(self, name: Name) -> Name:
        """Name preprocessing."""
        # Currently no preprocessing, but we could think of something
        # in the future.
        return name

    def features(self, name: Name) -> Iterator[str]:
        """Retrieve the list of features in the given name."""
        return ngrams(self.preprocess(name), self.ngram_size)

    def alphabet(self, data_set: DataSet) -> Set[str]:
        """Retrieve the embedding alphabet from the dataset.

        Retrieve the set of all features that we want to embed from
        the given dataset.
        """
        return set(feat for (name, _) in data_set
                   for feat in self.features(name))

    def encode(self, lang: Lang) -> int:
        """Encode the given language as an integer."""
        return self.enc.encode(lang)

    def forward(self, names: Iterator[Name]) -> TT:
        """The forward calculation of the name's language recognition model.

        Args:
            names: a sequence of person names; calculating the scores for
                several names at the same time is faster thanks to better
                parallelization

        Returns:
            score matrix in which each row corresponds to a single name, with
            its individual elements corresponding to the scores of different
            languages
        """
        # TODO EX2 (a): the following lines need to be adapted to the EmbeddingSum,
        # which processes features in groups.  You will also need to make
        # trivial modifications in the code in two or three other places
        # (imports, initialization).
        # TODO EX2 (b): you can further try to modify the EmbeddingSum class so
        # that it works over batches of feature groups.
        embeddings = [
            # [self.emb.forward(feat) for feat in self.features(name)]
            self.emb.forward(self.features(name)) for name in names
        ]
        # cbow = utils.from_rows(map(sum, embeddings))
        cbow = utils.stack(embeddings)
        scores = self.ffn.forward(cbow)
        return scores

    def classify(self, name: Name) -> Dict[Lang, float]:
        """Classify the given person name.

        Args:
            name: person name, sequence of characters

        Returns:
            the mapping from languages to their probabilities
            for the given name.
        """
        # We don't want Pytorch to calculate the gradients
        with torch.no_grad():
            # The vector of scores for the given name
            scores = self.forward([name])[0]
            # We map the vector of scores to the vector of probabilities.
            probs = torch.softmax(scores, dim=0)
            # Result dictionary
            res = {}
            # `ix` ranges over the indices of the scores/probs vector
            for ix in range(len(probs)):
                lang = self.enc.decode(ix)
                # Store a plain float rather than a 0-dim tensor
                res[lang] = probs[ix].item()
            return res

    def classify_one(self, name: Name) -> Lang:
        """A simplified version of `classify` which returns the
        language with the highest score."""
        prob_map = self.classify(name)
        # Pick the language with the highest probability; use a fresh
        # variable so as not to shadow the `name` argument.
        (lang, _prob) = max(prob_map.items(), key=lambda pair: pair[1])
        return lang
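
The `EmbeddingSum` sub-module referenced in `__init__` and `forward` is not shown either. The sketch below is a guess at its behaviour written against plain PyTorch: it embeds a group of features (for example, one name's n-grams) and sums the resulting vectors, roughly what `torch.nn.EmbeddingBag` offers in 'sum' mode. The base class and the index mapping are illustrative assumptions, not the course framework's actual API:

# A guess at the EmbeddingSum interface used above, written against plain
# PyTorch; the course framework's own Module/registration machinery is not
# reproduced here, so the names below are illustrative only.
import torch
import torch.nn as nn


class EmbeddingSum(nn.Module):
    """Embed a group of features (e.g. one name's n-grams) and sum the vectors.

    Handles a single feature group per call; batching feature groups, as in
    TODO EX2 (b), would stack several such sums into a matrix.
    """

    def __init__(self, feat_set, emb_size: int):
        super().__init__()
        # Fixed mapping from feature symbols to embedding rows
        self.feat_ix = {feat: ix for ix, feat in enumerate(sorted(feat_set))}
        self.emb = nn.Embedding(len(self.feat_ix), emb_size)
        self.emb_size = emb_size

    def forward(self, feats) -> torch.Tensor:
        """Return the sum of the embeddings of the given features.

        Features not seen at construction time are simply skipped.
        """
        ixs = [self.feat_ix[feat] for feat in feats if feat in self.feat_ix]
        if not ixs:
            # No known features: fall back to a zero vector
            return torch.zeros(self.emb_size)
        return self.emb(torch.tensor(ixs)).sum(dim=0)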