import numpy as np

# read_data, Encoding, VanillaRNN and get_output_file are provided by the
# project's own modules.


def run(args):
    data = read_data(args.corpus_file)
    chars = list(set(data))
    data_size, vocab_size = len(data), len(chars)
    print("Data has {} characters; {} unique.".format(data_size, vocab_size))

    e = Encoding(chars)
    model = VanillaRNN(encoding=e,
                       input_size=vocab_size,
                       hidden_size=args.hidden_size,
                       sequence_length=args.sequence_length,
                       learning_rate=args.learning_rate)
    model.fit(data, num_iterations=args.num_iterations)

    # Sample a few sequences from the trained model and write them out.
    with open(get_output_file(args), 'w') as f:
        for _ in range(args.num_samples):
            seed = np.random.randint(low=0, high=vocab_size)
            seq = model.generate_sequence(seed, args.sample_size)
            f.write(e.decode(seq))
            f.write('\n\n')
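# A minimal sketch of how `run` might be wired to a command line, assuming an
# argparse-based entry point.  The flag names below are guesses derived from
# the attributes that `run` reads off `args`; the real script may define them
# differently (and `get_output_file` may rely on flags not shown here).
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Train a character-level RNN and sample text from it.")
    parser.add_argument("--corpus-file", dest="corpus_file", required=True)
    parser.add_argument("--hidden-size", dest="hidden_size", type=int, default=100)
    parser.add_argument("--sequence-length", dest="sequence_length", type=int, default=25)
    parser.add_argument("--learning-rate", dest="learning_rate", type=float, default=0.1)
    parser.add_argument("--num-iterations", dest="num_iterations", type=int, default=10000)
    parser.add_argument("--num-samples", dest="num_samples", type=int, default=5)
    parser.add_argument("--sample-size", dest="sample_size", type=int, default=200)
    run(parser.parse_args())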
import torch
from typing import Dict, Iterator, Set

# Module, Embedding, Encoding, FFN, ngrams and the type aliases DataSet,
# Name, Lang, TT are provided by the accompanying modules of the exercise.


class LangRec(Module):

    def __init__(self, data_set: DataSet, ngram_size: int, emb_size: int,
                 hid_size: int):
        """Initialize the language recognition module.

        Args:
            data_set: the dataset from which the set of input symbols
                and output classes (languages) can be extracted
            ngram_size: size of n-gram features (e.g., use 1 for unigrams,
                2 for bigrams, etc.)
            emb_size: size of the character embedding vectors
            hid_size: size of the hidden layer of the FFN used for scoring
        """
        # Keep the size of the ngrams
        self.ngram_size = ngram_size
        # Calculate the embedding alphabet and create the embedding sub-module
        feat_set = self.alphabet(data_set)
        self.register("emb", Embedding(feat_set, emb_size))
        # Encoding (mapping between langs and ints)
        lang_set = set(lang for (_, lang) in data_set)
        self.enc = Encoding(lang_set)
        # Scoring FFN sub-module
        self.register("ffn",
                      FFN(idim=emb_size, hdim=hid_size, odim=len(lang_set)))
        # Additional check to verify that all the registered parameters
        # actually require gradients.  This makes it possible to identify
        # the "bug" in the embedding module.
        assert all([param.requires_grad is True for param in self.params()])

    def preprocess(self, name: Name) -> Name:
        """Name preprocessing."""
        # Currently no preprocessing, but we could think of something
        # in the future.
        return name

    def features(self, name: Name) -> Iterator[str]:
        """Retrieve the iterator of features in the given name."""
        return ngrams(self.preprocess(name), self.ngram_size)

    def alphabet(self, data_set: DataSet) -> Set[str]:
        """Retrieve the embedding alphabet from the dataset.

        Retrieve the set of all features that we want to embed from
        the given dataset.
        """
        return set(feat for (name, _) in data_set
                   for feat in self.features(name))

    def encode(self, lang: Lang) -> int:
        """Encode the given language as an integer."""
        return self.enc.encode(lang)

    def forward(self, name: Name) -> TT:
        """The forward calculation of the name's language recognition model.

        Args:
            name: a person name

        Returns:
            score vector corresponding to the name, with its individual
            elements corresponding to the scores of different languages
        """
        # Embed each n-gram feature and sum the embeddings (CBOW).
        embeddings = [self.emb.forward(feat) for feat in self.features(name)]
        cbow = sum(embeddings)
        scores = self.ffn.forward(cbow)
        return scores

    def classify(self, name: Name) -> Dict[Lang, float]:
        """Classify the given person name.

        Args:
            name: person name, sequence of characters

        Returns:
            the mapping from languages to their probabilities
            for the given name.
        """
        # We don't want PyTorch to calculate the gradients
        with torch.no_grad():
            # The vector of scores for the given name
            scores = self.forward(name)
            # We map the vector of scores to the vector of probabilities.
            probs = torch.softmax(scores, dim=0)
            # Result dictionary
            res = {}
            # `ix` should be an index in the scores vector
            for ix in range(len(probs)):
                lang = self.enc.decode(ix)
                # Convert the 0-dim tensor to a plain float.
                res[lang] = probs[ix].item()
            return res

    def classify_one(self, name: Name) -> Lang:
        """A simplified version of `classify` which returns the language
        with the highest score."""
        prob_map = self.classify(name)
        preds = sorted(prob_map.items(), key=lambda pair: pair[1])
        (lang, _prob) = preds[-1]
        return lang
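# A hedged usage sketch for the n-gram LangRec above.  The toy dataset and the
# hyper-parameter values are made up for illustration; a real DataSet comes
# from the exercise's data-loading code, and the printed probabilities are
# only meaningful after training.
toy_data = [("smith", "English"), ("kowalski", "Polish"), ("dupont", "French")]
model = LangRec(toy_data, ngram_size=2, emb_size=10, hid_size=20)
print(model.classify("smith"))      # e.g. {'English': 0.4, 'Polish': 0.3, 'French': 0.3}
print(model.classify_one("smith"))  # the single most probable language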
import unittest

# The Encoding class under test is assumed to be importable from the
# accompanying module.


class Test_Encoding(unittest.TestCase):

    def setUp(self):
        self.encoder = Encoding()

    def tearDown(self):
        self.encoder = None

    def test_single_character(self):
        decimal = self.encoder.encode_decimal("A")
        self.assertEqual(decimal, 16777217)
        hex_value = self.encoder.encode_hex("A")
        self.assertEqual(int(hex_value, 16), int('0x01000001', 16))

    def test_full_bundle(self):
        decimal = self.encoder.encode_decimal("FRED")
        self.assertEqual(decimal, 251792692)
        hex_value = self.encoder.encode_hex("FRED")
        self.assertEqual(int(hex_value, 16), int('0x0F020d34', 16))

    def test_non_alphanumerics(self):
        decimal = self.encoder.encode_decimal(" :^)")
        self.assertEqual(decimal, 79094888)
        hex_value = self.encoder.encode_hex(" :^)")
        self.assertEqual(int(hex_value, 16), int('0x04B6E468', 16))

    def test_foo(self):
        decimal = self.encoder.encode_decimal("foo")
        self.assertEqual(decimal, 124807030)

    def test_foo_with_space(self):
        decimal = self.encoder.encode_decimal(" foo")
        self.assertEqual(decimal, 250662636)

    def test_foot(self):
        decimal = self.encoder.encode_decimal("foot")
        self.assertEqual(decimal, 267939702)

    def test_BIRD(self):
        decimal = self.encoder.encode_decimal("BIRD")
        self.assertEqual(decimal, 251930706)

    def test_periods(self):
        decimal = self.encoder.encode_decimal("....")
        self.assertEqual(decimal, 15794160)

    def test_carets(self):
        decimal = self.encoder.encode_decimal("^^^^")
        self.assertEqual(decimal, 252706800)

    def test_Woot(self):
        decimal = self.encoder.encode_decimal("Woot")
        self.assertEqual(decimal, 266956663)

    def test_no(self):
        decimal = self.encoder.encode_decimal("no")
        self.assertEqual(decimal, 53490482)

    def test_email(self):
        decimal = self.encoder.encode_decimal("a@b.")
        self.assertEqual(decimal, 131107009)

    def test_my_email(self):
        decimal = self.encoder.encode_decimal("me@a")
        self.assertEqual(decimal, 263197451)

    # ----------- Part 2 ----------------------------

    def test_encode_array_tacocat(self):
        encoded = self.encoder.encode("tacocat")
        self.assertEqual(encoded, [267487694, 125043731])

    def test_decode_FRED(self):
        decoded = self.encoder.decode_decimal(251792692)
        self.assertEqual(decoded, "FRED")

    def test_decode_array_tacocat(self):
        decoded = self.encoder.decode([267487694, 125043731])
        self.assertEqual(decoded, "tacocat")

    def test_decode_array_never_odd(self):
        decoded = self.encoder.decode(
            [267657050, 233917524, 234374596, 250875466, 17830160])
        self.assertEqual(decoded, "never odd or even")

    def test_decode_array_lager(self):
        decoded = self.encoder.decode(
            [267394382, 167322264, 66212897, 200937635, 267422503])
        self.assertEqual(decoded, "lager, sir, is regal")

    def test_decode_array_go_hang(self):
        decoded = self.encoder.decode([
            200319795, 133178981, 234094669, 267441422, 78666124,
            99619077, 267653454, 133178165, 124794470
        ])
        self.assertEqual(decoded, "go hang a salami, I'm a lasagna hog")

    def test_decode_array_egad(self):
        decoded = self.encoder.decode([
            267389735, 82841860, 267651166, 250793668, 233835785,
            267665210, 99680277, 133170194, 124782119
        ])
        self.assertEqual(decoded, "egad, a base tone denotes a bad age")

    def test_bothways(self):
        self.assertEqual(
            "bothways",
            self.encoder.decode(self.encoder.encode("bothways")))
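# The Encoding class exercised by these tests is not reproduced here.  Judging
# purely by how the tests call it, its public interface can be sketched as
# below; the bodies are placeholders, not the actual packing algorithm (which,
# per the expected values above, bundles up to four characters into one
# 32-bit integer).
from typing import List


class EncodingInterfaceSketch:

    def encode_decimal(self, chunk: str) -> int:
        """Pack up to four characters into a single integer."""
        raise NotImplementedError

    def encode_hex(self, chunk: str) -> str:
        """Like `encode_decimal`, but return the value as a hex string."""
        raise NotImplementedError

    def encode(self, text: str) -> List[int]:
        """Encode an arbitrary string as a list of packed integers."""
        raise NotImplementedError

    def decode_decimal(self, value: int) -> str:
        """Inverse of `encode_decimal`."""
        raise NotImplementedError

    def decode(self, values: List[int]) -> str:
        """Inverse of `encode`."""
        raise NotImplementedError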
import torch
from typing import Dict

# Module, Embedding, Encoding, FFN, char_set_in and the type aliases DataSet,
# Name, Lang, TT are provided by the accompanying modules of the exercise.


class LangRec(Module):

    # TODO: Implement this method.
    def __init__(self, data_set: DataSet, emb_size: int, hid_size: int):
        """Initialize the language recognition module.

        Args:
            data_set: the dataset from which the set of input symbols
                and output classes (languages) can be extracted
            emb_size: size of the character embedding vectors
            hid_size: size of the hidden layer of the FFN used for scoring
        """
        char_set = char_set_in(data_set)
        # Embedding
        self.register("emb", Embedding(char_set, emb_size))
        lang_set = set(lang for (_, lang) in data_set)
        lang_num = len(lang_set)
        # Encoding (mapping between langs and ints)
        self.enc = Encoding(lang_set)
        # FFN
        self.register("ffn", FFN(idim=emb_size, hdim=hid_size, odim=lang_num))

    # TODO: Implement this method.
    def encode(self, lang: Lang) -> int:
        """Encode the given language as an integer."""
        return self.enc.encode(lang)

    # TODO: Implement this method.
    def forward(self, name: Name) -> TT:
        """The forward calculation of the name's language recognition model.

        Args:
            name: a person name

        Returns:
            score vector corresponding to the name, with its individual
            elements corresponding to the scores of different languages
        """
        # Embed each character of the name and sum the embeddings (CBOW).
        embeddings = [self.emb.forward(char) for char in name]
        cbow = sum(embeddings)
        scores = self.ffn.forward(cbow)
        return scores

    # TODO: Implement this method.
    def classify(self, name: Name) -> Dict[Lang, float]:
        """Classify the given person name.

        Args:
            name: person name, sequence of characters

        Returns:
            the mapping from languages to their probabilities
            for the given name.
        """
        # We don't want PyTorch to calculate the gradients
        with torch.no_grad():
            # The vector of scores for the given name
            scores = self.forward(name)
            # We map the vector of scores to the vector of probabilities.
            probs = torch.softmax(scores, dim=0)
            # Result dictionary
            res = {}
            # `ix` should be an index in the scores vector
            for ix in range(len(probs)):
                lang = self.enc.decode(ix)
                # Convert the 0-dim tensor to a plain float.
                res[lang] = probs[ix].item()
            return res
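# A rough training-loop sketch for the character-level LangRec above.  The
# choice of loss (cross entropy over the score vector), the optimiser and its
# settings, and the assumption that `model.params()` yields the trainable
# tensors are all illustrative; the exercise's own training utilities may do
# this differently.
def train_sketch(model, data_set, epochs: int = 5, lr: float = 0.01):
    optim = torch.optim.Adam(model.params(), lr=lr)
    loss_fn = torch.nn.CrossEntropyLoss()
    for _ in range(epochs):
        for name, lang in data_set:
            optim.zero_grad()
            scores = model.forward(name)                 # language scores for one name
            target = torch.tensor([model.encode(lang)])  # gold language index
            loss = loss_fn(scores.unsqueeze(0), target)
            loss.backward()
            optim.step()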
import torch
from typing import Dict, Iterator, Set

# Module, EmbeddingSum, Encoding, FFN, ngrams, utils and the type aliases
# DataSet, Name, Lang, TT are provided by the accompanying modules of the
# exercise.


class LangRec(Module):

    def __init__(self, data_set: DataSet, ngram_size: int, emb_size: int,
                 hid_size: int):
        """Initialize the language recognition module.

        Args:
            data_set: the dataset from which the set of input symbols
                and output classes (languages) can be extracted
            ngram_size: size of n-gram features (e.g., use 1 for unigrams,
                2 for bigrams, etc.)
            emb_size: size of the character embedding vectors
            hid_size: size of the hidden layer of the FFN used for scoring
        """
        # Keep the size of the ngrams
        self.ngram_size = ngram_size
        # Calculate the embedding alphabet and create the embedding sub-module
        feat_set = self.alphabet(data_set)
        self.register("emb", EmbeddingSum(feat_set, emb_size))
        # Encoding (mapping between langs and ints)
        lang_set = set(lang for (_, lang) in data_set)
        self.enc = Encoding(lang_set)
        # Scoring FFN sub-module
        self.register("ffn",
                      FFN(idim=emb_size, hdim=hid_size, odim=len(lang_set)))
        # Additional check to verify that all the registered
        # parameters actually require gradients.
        assert all([param.requires_grad is True for param in self.params()])

    def preprocess(self, name: Name) -> Name:
        """Name preprocessing."""
        # Currently no preprocessing, but we could think of something
        # in the future.
        return name

    def features(self, name: Name) -> Iterator[str]:
        """Retrieve the iterator of features in the given name."""
        return ngrams(self.preprocess(name), self.ngram_size)

    def alphabet(self, data_set: DataSet) -> Set[str]:
        """Retrieve the embedding alphabet from the dataset.

        Retrieve the set of all features that we want to embed from
        the given dataset.
        """
        return set(feat for (name, _) in data_set
                   for feat in self.features(name))

    def encode(self, lang: Lang) -> int:
        """Encode the given language as an integer."""
        return self.enc.encode(lang)

    def forward(self, names: Iterator[Name]) -> TT:
        """The forward calculation of the language recognition model over
        a batch of names.

        Args:
            names: a sequence of person names; calculating the scores for
                several names at the same time is faster thanks to better
                parallelization

        Returns:
            score matrix in which each row corresponds to a single name,
            with its individual elements corresponding to the scores of
            different languages
        """
        # TODO EX2 (a): the following lines need to be adapted to the
        # EmbeddingSum, which processes features in groups.  You will also
        # need to make trivial modifications in the code in two or three
        # other places (imports, initialization).
        # TODO EX2 (b): you can further try to modify the EmbeddingSum class
        # so that it works over batches of feature groups.
        embeddings = [
            # [self.emb.forward(feat) for feat in self.features(name)]
            self.emb.forward(self.features(name))
            for name in names
        ]
        # cbow = utils.from_rows(map(sum, embeddings))
        cbow = utils.stack(embeddings)
        scores = self.ffn.forward(cbow)
        return scores

    def classify(self, name: Name) -> Dict[Lang, float]:
        """Classify the given person name.

        Args:
            name: person name, sequence of characters

        Returns:
            the mapping from languages to their probabilities
            for the given name.
        """
        # We don't want PyTorch to calculate the gradients
        with torch.no_grad():
            # The vector of scores for the given name
            scores = self.forward([name])[0]
            # We map the vector of scores to the vector of probabilities.
            probs = torch.softmax(scores, dim=0)
            # Result dictionary
            res = {}
            # `ix` should be an index in the scores vector
            for ix in range(len(probs)):
                lang = self.enc.decode(ix)
                # Convert the 0-dim tensor to a plain float.
                res[lang] = probs[ix].item()
            return res

    def classify_one(self, name: Name) -> Lang:
        """A simplified version of `classify` which returns the language
        with the highest score."""
        prob_map = self.classify(name)
        preds = sorted(prob_map.items(), key=lambda pair: pair[1])
        (lang, _prob) = preds[-1]
        return lang
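# The EmbeddingSum sub-module referenced above belongs to the exercise code
# and is not reproduced here.  As a rough illustration of the underlying idea
# (embedding a whole group of features and summing them in one call), a
# minimal sketch built on torch.nn.EmbeddingBag could look like this; the
# real class registers its parameters through the custom Module API instead.
class EmbeddingSumSketch:

    def __init__(self, alphabet, emb_size):
        # Map each feature (e.g. a character n-gram) to a row index.
        self.ix = {feat: i for i, feat in enumerate(sorted(alphabet))}
        self.bag = torch.nn.EmbeddingBag(len(self.ix), emb_size, mode="sum")

    def forward(self, feats):
        # Look up and sum the embeddings of all features in the group.
        ixs = torch.tensor([self.ix[f] for f in feats], dtype=torch.long)
        return self.bag(ixs.unsqueeze(0)).squeeze(0)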