class BPEmbVaeSampler(VAESampler):
    """VAE sampler that encodes/decodes text using BPEmb subword units.

    NOTE(review): this class is defined a second time later in this file;
    in Python the later definition shadows this one at import time —
    confirm which version is intended to survive.
    """

    def __init__(self, lang, vs, dim, decode_from, params, cuda=False):
        # Pretrained byte-pair-embedding codec selected by language,
        # vocabulary size, and embedding dimension.
        self.bp = BPEmb(lang=lang, vs=vs, dim=dim)
        super().__init__(decode_from, params, cuda)

    def to_s(self, decoded):
        """Turn each id sequence in ``decoded`` into a cleaned-up string.

        For every item: decode with BPEmb, replace the '▁' word-boundary
        markers with spaces, capitalize the first character, uppercase
        standalone 'i', and capitalize the first letter following
        sentence-ending punctuation ('.', '!', '?').
        """
        out = []
        for item in decoded:
            s = self.bp.decode(item).replace('▁', ' ').strip()
            # Guard against an empty decode result: the original
            # `s[0].upper() + s[1:]` raised IndexError on "".
            if s:
                s = s[0].upper() + s[1:]
                s = re.sub(r'\bi\b', 'I', s)
                s = re.sub(r'[.!?]\s+(\w)',
                           lambda m: m.group()[:-1] + m.group()[-1].upper(),
                           s)
            out.append(s)
        return out

    def str2ids(self, s):
        """
        Encode string s with BPEmb. BPEmb has a fixed vocabulary size,
        but the model only has outputs for vocab items that are used in
        the training data, so this function replaces any BPEmb ids *not*
        in the training vocabulary with the model's "unknown" id.
        """
        encoded = self.bp.encode(s)
        ids = [self.vocab.word2id.get(item, self.vocab.unk_id)
               for item in encoded]
        return ids
class BPEmbVaeSampler(VAESampler):
    """VAE sampler that encodes/decodes text using BPEmb subword units.

    This (second) definition additionally pads the embedding table
    (``add_pad_emb=True``) and loads the training data with a vocabulary
    derived from the BPEmb embedding model.
    """

    def __init__(self, lang, vs, dim, decode_from, params, cuda=False):
        # Pretrained byte-pair-embedding codec; add_pad_emb appends an
        # all-zero embedding usable as a padding vector.
        self.bp = BPEmb(lang=lang, vs=vs, dim=dim, add_pad_emb=True)
        super().__init__(decode_from, params, cuda)

    def _load_train_data(self):
        """Load training data with a word->index map built from the
        BPEmb embedding vocabulary; unknown words map to index 0."""

        class Defaulter(dict):
            # Any token absent from the BPEmb vocabulary falls back to 0.
            def __missing__(self, item):
                return 0

        word2idx = Defaulter(
            **{item: self.bp.emb.vocab[item].index
               for item in self.bp.emb.vocab})
        train_data = MonoTextData(self.params.train_data, label=False,
                                  vocab=word2idx)
        return train_data

    def to_s(self, decoded):
        """Turn each id sequence in ``decoded`` into a cleaned-up string.

        For every item: decode with BPEmb, replace the '▁' word-boundary
        markers with spaces, capitalize the first character, uppercase
        standalone 'i', and capitalize the first letter following
        sentence-ending punctuation ('.', '!', '?').
        """
        out = []
        for item in decoded:
            s = self.bp.decode(item).replace('▁', ' ').strip()
            # Guard against an empty decode result: the original
            # `s[0].upper() + s[1:]` raised IndexError on "".
            if s:
                s = s[0].upper() + s[1:]
                s = re.sub(r'\bi\b', 'I', s)
                s = re.sub(r'[.!?]\s+(\w)',
                           lambda m: m.group()[:-1] + m.group()[-1].upper(),
                           s)
            out.append(s)
        return out

    def str2ids(self, s):
        """
        Encode string s with BPEmb. BPEmb has a fixed vocabulary size,
        but the model only has outputs for vocab items that are used in
        the training data, so this function replaces any BPEmb ids *not*
        in the training vocabulary with the model's "unknown" id.
        """
        encoded = self.bp.encode(s)
        ids = [self.vocab.word2id.get(item, self.vocab.unk_id)
               for item in encoded]
        return ids