def __getitem__(self, index):
        src = self.src[index]
        # Teacher forcing: the decoder input drops the last token, the labels drop the first.
        tgt_inp = self.tgt[index][:-1]
        tgt_lbl = self.tgt[index][1:]

        if self.use_mask:
            # Apply token masking to the encoder and decoder inputs only; the labels stay intact.
            src = self.mask_item(src, self.pad_value, self.mask_value,
                                 self.src_vocab_len)
            tgt_inp = self.mask_item(tgt_inp, self.pad_value, self.mask_value,
                                     self.tgt_vocab_len)

        # pad_sequences works on batches, so wrap each sequence in a list
        # and take the single padded row back out.
        src = pad_sequences([src],
                            maxlen=self.src_pad_len,
                            value=self.tokenizer.pad,
                            padding='post')[0]
        tgt_inp = pad_sequences([tgt_inp],
                                maxlen=self.tgt_pad_len,
                                value=self.tokenizer.pad,
                                padding='post')[0]
        tgt_lbl = pad_sequences([tgt_lbl],
                                maxlen=self.tgt_pad_len,
                                value=self.tokenizer.pad,
                                padding='post')[0]

        return src, tgt_inp, tgt_lbl
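
The `[:-1]` / `[1:]` slices above implement the usual teacher-forcing shift. A minimal sketch of what the three returned sequences look like for one item; the token ids and the <bos>/<eos> markers are made up for illustration:

# Hypothetical target sequence; 2 and 3 stand in for <bos> and <eos>.
tgt = [2, 15, 37, 8, 3]
tgt_inp = tgt[:-1]   # [2, 15, 37, 8]   fed to the decoder
tgt_lbl = tgt[1:]    # [15, 37, 8, 3]   labels, shifted one step left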
Example #2
    def __init__(self, file_path, tokenizer, src_pad_len=200, tgt_pad_len=50):
        self.tokenizer = tokenizer

        # Each line of the file is a tab-separated (source, target) pair.
        df = pd.read_csv(file_path, sep='\t', names=['src', 'tgt'])

        # Tokenize every pair up front; tokenize() returns (src_ids, tgt_ids).
        tokens = [tokenizer.tokenize(str(x.src), str(x.tgt)) for _, x in tqdm(df.iterrows())]
        self.src = [x[0] for x in tokens]
        self.tgt = [x[1] for x in tokens]

        # Pad (or truncate) the whole corpus to fixed lengths once, at load time.
        self.src = pad_sequences(self.src, maxlen=src_pad_len, value=tokenizer.pad, padding='post')
        self.tgt = pad_sequences(self.tgt, maxlen=tgt_pad_len, value=tokenizer.pad, padding='post')
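
The constructor pads the whole corpus with Keras' pad_sequences. A small self-contained check of what padding='post' does; the tensorflow.keras import path is an assumption about the environment:

from tensorflow.keras.preprocessing.sequence import pad_sequences

seqs = [[5, 7, 2], [9]]
print(pad_sequences(seqs, maxlen=4, value=0, padding='post'))
# [[5 7 2 0]
#  [9 0 0 0]]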
Example #3
    def __getitem__(self, index):
        src = self.src[index]
        tgt_inp = self.tgt[index][:-1]
        tgt_lbl = self.tgt[index][1:]

        if self.use_mask:
            # Randomly mask tokens in the decoder input only; the labels are left untouched.
            tgt_inp = self.random_mask_tgt(tgt_inp)

        src = pad_sequences([src], maxlen=self.src_pad_len, value=self.tokenizer.pad, padding='post')[0]
        tgt_inp = pad_sequences([tgt_inp], maxlen=self.tgt_pad_len, value=self.tokenizer.pad, padding='post')[0]
        tgt_lbl = pad_sequences([tgt_lbl], maxlen=self.tgt_pad_len, value=self.tokenizer.pad, padding='post')[0]

        return src, tgt_inp, tgt_lbl
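
random_mask_tgt itself is not shown in this example. A plausible sketch under the assumption that the class keeps mask_value and pad_value attributes as in Example #1; the 15% masking probability is also an assumption:

import random

def random_mask_tgt(self, tokens, prob=0.15):
    # Hypothetical: replace each non-pad token with the mask id with probability `prob`.
    return [self.mask_value if (t != self.pad_value and random.random() < prob) else t
            for t in tokens]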
Example #4
    def __init__(self,
                 file_path,
                 tokenizer,
                 src_pad_len=200,
                 tgt_pad_len=50,
                 seed=None):
        self.tokenizer = tokenizer

        self.src_pad_len = src_pad_len
        self.tgt_pad_len = tgt_pad_len

        # One example per line; drop the empty element left by the trailing newline.
        with open(file_path) as f:
            self.corpus = f.read().split('\n')[:-1]

        # This dataset has no source side; each line is tokenized into target ids.
        self.src = None
        self.tgt = [tokenizer.process_word(x) for x in tqdm(self.corpus)]
        self.tgt = pad_sequences(self.tgt,
                                 maxlen=self.tgt_pad_len,
                                 value=self.tokenizer.pad,
                                 padding='post')
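
These classes look like PyTorch map-style datasets (__init__ builds the arrays, __getitem__ returns one item). Assuming that, a hedged usage sketch; the class name LMDataset and the file path are placeholders, and it presumes the (unshown) __getitem__ returns one padded target row:

from torch.utils.data import DataLoader

dataset = LMDataset('corpus.txt', tokenizer, tgt_pad_len=50)   # placeholder names
loader = DataLoader(dataset, batch_size=32, shuffle=True)
for tgt_batch in loader:
    ...  # tgt_batch: (batch_size, tgt_pad_len) batch of token ids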
Example #5
    def tokenize_char(self, sent):
        # Split into words, encode each word as character ids, then pad
        # every word to the same character length.
        words = self._word_tokenize(sent)
        words = [self.process_char(w) for w in words]
        words = pad_sequences(words, self.char_pad_len, padding='post')
        return words
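
process_char and char_pad_len are defined elsewhere in the class. The snippet below mimics the same output shape with a stand-in character encoding (raw ord codes) and char_pad_len=6, purely for illustration:

from tensorflow.keras.preprocessing.sequence import pad_sequences

words = ['cat', 'sat']
char_ids = [[ord(c) for c in w] for w in words]   # stand-in for process_char
print(pad_sequences(char_ids, 6, padding='post'))
# [[ 99  97 116   0   0   0]
#  [115  97 116   0   0   0]]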