class RickAndMortyData(Dataset):
    """Sliding-window dataset over a tokenized text corpus.

    Each item is an ``(x, y)`` pair where ``x`` holds ``seq_length``
    consecutive token ids and ``y`` holds the id of the single token
    that immediately follows the window.
    """

    def __init__(self, text, seq_length, vocab=None):
        """Tokenize *text* and prepare the id lookup.

        Args:
            text: raw corpus string.
            seq_length: number of input tokens per sample.
            vocab: optional pre-built Vocabulary; when omitted a new
                one is built from *text*.
        """
        self.text = text
        self.seq_length = seq_length
        # Reuse the supplied vocabulary, or build one from the corpus.
        if vocab is not None:
            self.vocab = vocab
        else:
            self.vocab = Vocabulary()
            self.vocab.add_text(self.text)
        # Normalize the corpus, then split it into tokens.
        self.text = self.vocab.clean_text(text)
        self.tokens = self.vocab.tokenize(self.text)

    def __len__(self):
        # The final window must still leave one token to serve as target.
        return len(self.tokens) - self.seq_length

    def __getitem__(self, idx):
        # Input window: seq_length token ids starting at idx.
        window = self.tokens[idx:idx + self.seq_length]
        inputs = torch.LongTensor([self.vocab[tok] for tok in window])
        # Target: the id of the token right after the window.
        target = torch.LongTensor([self.vocab[self.tokens[idx + self.seq_length]]])
        return inputs, target
# Model and training hyperparameters.
batch_size = 128
lstm_size = 128
seq_length = 64
num_layers = 2
bidirectional = True
embeddings_size = 300
dropout = 0.5
learning_rate = 0.001

# Load the raw training corpus.
with open(data_path, 'r') as f:
    text = f.read()

# Build the vocabulary from the corpus, or load a pre-built one.
vocab = Vocabulary()
if args.vocab_path is None:
    vocab.add_text(text)
    vocab.save('data/vocab.pkl')
else:
    # BUG FIX: the original called vocab.load(args.load_vocab), which is
    # inconsistent with the attribute tested in the condition above
    # (args.vocab_path); load from the path that was actually checked.
    vocab.load(args.vocab_path)
print(vocab)

model = MortyFire(vocab_size=len(vocab),
                  lstm_size=lstm_size,
                  embed_size=embeddings_size,
                  seq_length=seq_length,
                  num_layers=num_layers,
                  dropout=dropout,
                  bidirectional=bidirectional,
                  train_on_gpu=train_on_gpu)