def __getitem__(self, index):
    """Return one sample as paired LM views of the same sentence.

    Produces a source view (``x``) and an SOS-prefixed reconstruction view
    (``xhat``), each with its next-token targets ending in EOS, plus the
    two input lengths. When ``self.return_oov`` is set, the OOV map from
    the source-side vectorization is appended to the tuple.
    """
    tokens = self.data[index][:self.seq_len]

    # source view: inputs and one-step-shifted targets terminated by EOS
    inp_x = tokens
    out_x = inp_x[1:] + [self.vocab.EOS]

    # reconstruction view: same tokens prefixed with SOS, shifted targets
    inp_xhat = [self.vocab.SOS] + tokens
    out_xhat = inp_xhat[1:] + [self.vocab.EOS]

    if self.subword:
        # subword vectorization is not supported yet
        raise NotImplementedError

    # map tokens to ids; only the source inputs need the OOV map
    inp_x, oov_map = vectorize(inp_x, self.vocab, self.oovs)
    out_x, _ = vectorize(out_x, self.vocab, self.oovs)
    inp_xhat, _ = vectorize(inp_xhat, self.vocab, self.oovs)
    out_xhat, _ = vectorize(out_xhat, self.vocab, self.oovs)

    # inject noise via token swaps — deliberately AFTER the OOV replacements
    inp_x = token_swaps(inp_x, self.swaps)

    sample = (inp_x, out_x, inp_xhat, out_xhat, len(inp_x), len(inp_xhat))
    if self.return_oov:
        sample += (oov_map,)
    return sample
def dataitem(self, i):
    """Convert the i-th raw entry into a vector of token ids.

    Pipeline: tokenize the raw text, wrap it with the dataset's special
    tokens (e.g. <BOS>/<EOS>), then map tokens to vocabulary ids.
    """
    # raw text -> tokens
    tokens = self.tokenize(self.data[i])
    # add boundary markers such as <BOS>/<EOS>
    tokens = self.add_special_tokens(tokens)
    # tokens -> ids
    return vectorize(tokens, self.vocab)
def __getitem__(self, index):
    """Return an ``(inputs, targets, length)`` triple for next-token LM training.

    The sentence is terminated with EOS (and optionally prefixed with SOS),
    truncated to ``self.seq_len``, then split into inputs and one-step-shifted
    targets. Vectorization goes through the OOV-aware path when
    ``self.oovs > 0``.
    """
    sent = self.data[index] + [self.vocab.EOS]
    if self.sos:
        sent = [self.vocab.SOS] + sent
    sent = sent[:self.seq_len]

    # shift by one position: token t predicts token t+1
    inputs, targets = sent[:-1], sent[1:]
    length = len(inputs)

    if self.oovs > 0:
        inputs_vec, _ = vectorize(inputs, self.vocab, self.oovs)
        targets_vec, _ = vectorize(targets, self.vocab, self.oovs)
    else:
        inputs_vec = vectorize(inputs, self.vocab)
        targets_vec = vectorize(targets, self.vocab)

    # inputs and targets must stay aligned one-to-one
    assert len(inputs_vec) == len(targets_vec)
    return inputs_vec, targets_vec, length
def read_sample(self, index):
    """Return the index-th sample round-tripped through the vocabulary.

    Truncates to ``self.seq_len``, wraps with SOS/EOS, vectorizes (with OOV
    handling), and maps the ids back to token strings via ``id2tok`` — useful
    for inspecting what the model actually sees after OOV replacement.
    """
    tokens = [self.vocab.SOS] + self.data[index][:self.seq_len] + [self.vocab.EOS]
    ids, _ = vectorize(tokens, self.vocab, self.oovs)
    # ids not present in id2tok come back as None (dict.get default)
    return [self.vocab.id2tok.get(i) for i in ids]