from bpemb import BPEmb


def test_multi_language():
    text = ["This is Stratford", "Kitap okuyordu."]
    bpemb_multi = BPEmb(lang="multi", add_pad_emb=True)
    print(bpemb_multi.encode_ids_with_bos_eos(text))
    print(bpemb_multi.decode_ids([[1, 5496, 200, 23866, 3927, 2],
                                  [1, 45350, 44934, 67191, 94777, 2]]))

def test_encoding():
    text = ["This is Stratford", "<pad>"]
    bpemb_en = BPEmb(lang="en", add_pad_emb=True)
    # BPEmb can add and encode start/end tokens automatically, but the encoder
    # cannot handle "<pad>" directly: padding has to be done outside, using the
    # index of the extra embedding appended when add_pad_emb=True.
    print(bpemb_en.encode(text))
    print(bpemb_en.encode_with_eos(text))
    print(bpemb_en.encode_with_bos_eos(text))
    print(bpemb_en.encode_ids(text))
    print(bpemb_en.encode_ids_with_eos(text))
    print(bpemb_en.encode_ids_with_bos_eos(text))

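# A minimal sketch of the padding strategy described in the comments above
# (illustrative, not part of the original tests): with add_pad_emb=True, BPEmb
# appends one extra embedding row, so encoded sequences can be padded
# afterwards with bpemb_en.vocab_size, the index of that last row.
def test_pad_outside_encoder():
    bpemb_en = BPEmb(lang="en", add_pad_emb=True)
    ids = bpemb_en.encode_ids_with_bos_eos(
        ["This is Stratford", "A somewhat longer example sentence"])
    pad_id = bpemb_en.vocab_size  # index of the appended pad embedding
    max_len = max(len(seq) for seq in ids)
    padded = [seq + [pad_id] * (max_len - len(seq)) for seq in ids]
    print(padded)
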
from typing import List, Union


class BPETokenizer:
    """Use byte pair encoding to transform text."""

    def __init__(self, lang='ru', pretrained=True, vocab_size=100000, dim=300):
        self.lang = lang
        self.pretrained = pretrained
        self.bpe = BPEmb(lang=self.lang, vs=vocab_size, dim=dim, vs_fallback=True)

    def fit(self, text):
        raise NotImplementedError('fit is not supported')

    def transform(self, text: Union[str, List[str]], get_ids=True):
        if get_ids:
            return self.bpe.encode_ids_with_bos_eos(text)
        return self.bpe.encode_with_bos_eos(text)

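# A brief usage sketch for BPETokenizer (illustrative; the pretrained model is
# downloaded on first use, and vs_fallback picks the closest available
# vocabulary size):
def bpe_tokenizer_example():
    tokenizer = BPETokenizer(lang='en')
    print(tokenizer.transform("This is Stratford"))                 # subword ids with BOS/EOS
    print(tokenizer.transform("This is Stratford", get_ids=False))  # subword pieces
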
class CaptionDataset(Dataset):
    def __init__(self, paths, caption_dict, transform):
        self.paths = paths
        self.caption_dict = caption_dict
        self.bpe = BPEmb(lang="en", vs=10000, dim=300)
        self.transform = transform

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        path = self.paths[idx]
        try:
            img = Image.open(path).convert('RGB')
            caption = self.caption_dict[path]
            img_tensor = self.transform(img)
            caption_label = torch.tensor(
                self.bpe.encode_ids_with_bos_eos(caption))
        except Exception:
            # Signal unreadable images or missing captions with -1 sentinels;
            # these samples have to be filtered out when batching.
            img_tensor = -1
            caption_label = -1
        return img_tensor, caption_label

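# CaptionDataset returns variable-length caption tensors (and -1 sentinels on
# failure), so the default DataLoader collation cannot batch it directly. A
# minimal collate sketch, assuming failed samples are dropped and captions are
# padded to the longest one in the batch with a hypothetical PAD_ID:
import torch

PAD_ID = 0  # placeholder pad index; choose one consistent with the model


def caption_collate(batch):
    # Drop samples where __getitem__ signalled a failure with -1.
    batch = [(img, cap) for img, cap in batch if not isinstance(img, int)]
    imgs = torch.stack([img for img, _ in batch])
    max_len = max(cap.size(0) for _, cap in batch)
    caps = torch.full((len(batch), max_len), PAD_ID, dtype=torch.long)
    for i, (_, cap) in enumerate(batch):
        caps[i, :cap.size(0)] = cap
    return imgs, caps


# Used as: DataLoader(dataset, batch_size=32, collate_fn=caption_collate)
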
def my_main(epochs, drop_mult, exp_id, bs, epochs_start, model_id):
    ex.info['path'] = EX_PA
    layer_factor = 2

    bpemb_de = BPEmb(lang="de", vs=25000, dim=300)
    itos = dict(enumerate(bpemb_de.words + ['xxpad']))
    voc = Vocab(itos)

    df_all = pd.read_pickle('/mnt/data/group07/johannes/ompc/unla.pkl')
    df_all['text_cat'] = df_all.apply(
        lambda x: (x['Headline'] if x['Headline'] is not None else '') + ' ' +
                  (x['Body'] if x['Body'] is not None else '') + ' xxp ' +
                  ('xxa' if pd.isna(x['ID_Parent_Post']) else 'xxb'),
        axis=1)
    df_all['text_ids'] = df_all['text_cat'].apply(
        lambda x: bpemb_de.encode_ids_with_bos_eos(
            news_utils.clean.german.clean_german(x)))

    df_all_train = df_all[df_all['ID_Article'] < 11500]
    df_all_val = df_all[df_all['ID_Article'] >= 11500]

    data_lm_ft = TextLMDataBunch.from_ids(bs=bs, path=EX_PA, vocab=voc,
                                          train_ids=df_all_train['text_ids'],
                                          valid_ids=df_all_val['text_ids'])

    learn = language_model_learner(data_lm_ft, drop_mult=drop_mult)
    learn.load_pretrained(
        Path('/mnt/data/group07/johannes/germanlm/exp_1/models/' + model_id + '.pth'),
        Path('/mnt/data/group07/johannes/germanlm/exp_1/tmp/itos.pkl'))

    lr = news_utils.fastai.get_optimal_lr(learn, runs=1)
    # lr = 0.001

    learn.callbacks += [
        news_utils.fastai.SacredLogger(learn, ex),
        SaveModelCallback(learn, name=exp_id),
        EarlyStoppingCallback(learn, patience=1),
    ]

    if epochs_start > 0:
        learn.fit_one_cycle(epochs_start, lr)
    learn.unfreeze()
    if epochs > 0:
        # learn.fit_one_cycle(epochs, [lr / (layer_factor * (3 - x)) for x in range(3)] + [lr])
        learn.fit_one_cycle(epochs, lr)

def main(args):
    result_dir = setup_run(args.run_name, create_dirs=['checkpoints', 'samples'])
    setup_logging(result_dir / 'log.txt')
    logging.info(args)

    device = get_default_device(args.device)
    sample_dir = result_dir / 'samples'
    checkpoint_dir = result_dir / 'checkpoints'

    seq_length = 32

    from bpemb import BPEmb
    lines = Path(args.dataset).read_text().split('\n')[:2_500_000]
    bpe = BPEmb(lang='de', vs=50000, dim=100, add_pad_emb=True)
    data = torch.full((len(lines), seq_length), bpe.vocab_size, dtype=torch.long)
    for i, encoded_sample in enumerate(bpe.encode_ids_with_bos_eos(lines)):
        l = min(seq_length, len(encoded_sample))
        data[i, :l] = torch.tensor(encoded_sample, dtype=torch.long)[:l]
    # dataset = ByteLevelTextDataset(args.dataset, seq_length)

    depth = math.log2(seq_length)
    assert int(depth) == depth
    depth = int(depth)

    vocab_size = bpe.vocab_size + 1
    batches = DataLoader(data, args.batch_size, shuffle=True, pin_memory=True,
                         num_workers=args.num_workers)

    inter_dim = bpe.dim
    embedding = nn.Embedding(vocab_size, inter_dim,
                             _weight=torch.tensor(bpe.vectors, dtype=torch.float)).to(device)
    embedding.weight.requires_grad = False
    # embedding = nn.Embedding(vocab_size, inter_dim, max_norm=1.0).to(device)

    # spiegel model
    G = Generator(args.latent_size, [256, 256, 128, 64, 64], out_dim=inter_dim).to(device)
    D = UnetDiscriminator(64, max_channel=256, depth=5, in_dim=inter_dim).to(device)
    # G = Generator(args.latent_size, inter_dim, 256).to(device)
    # D = Discriminator(inter_dim, 256).to(device)

    G.apply(apply_spectral_norm)
    D.apply(apply_spectral_norm)
    G.apply(init_weights)
    D.apply(init_weights)
    G.train()
    D.train()

    (result_dir / 'G.txt').write_text(str(G))
    (result_dir / 'D.txt').write_text(str(D))

    if args.use_ema:
        G_shadow = copy.deepcopy(G)
        G_sample = G_shadow
        update_average(G_shadow, G, beta=0.0)
    else:
        G_sample = G

    G_orig = G
    D_orig = D
    if args.data_parallel:
        G = nn.DataParallel(G)
        D = nn.DataParallel(D)

    D_params = list(D.parameters())
    # D_params += list(embedding.parameters())
    G_opt = torch.optim.Adam(G.parameters(), lr=args.g_lr, betas=(0.5, 0.999))
    D_opt = torch.optim.Adam(D_params, lr=args.d_lr, betas=(0.5, 0.999))

    z_sample = torch.randn(seq_length, args.batch_size, args.latent_size).to(device)

    # loss_f = RelativisticAverageHingeLoss(D)
    # loss_f = GANLoss(D)
    loss_f = WGAN_GP(D)

    def decode(embeds):
        flatten = embeds.transpose(1, 2)
        flatten = flatten.reshape(-1, flatten.size(-1))
        dist = (flatten.pow(2).sum(1, keepdim=True)
                - 2 * flatten @ embedding.weight.T
                + embedding.weight.T.pow(2).sum(0, keepdim=True))
        _, ids = (-dist).max(1)
        ids = ids.view(embeds.size(0), -1)

        decoded = []
        for seq in ids:
            seq = list(seq.detach().cpu().numpy())
            seq = list(filter(lambda x: x != vocab_size - 1, seq))
            dec = bpe.decode_ids(np.array(seq))
            decoded.append(dec or '')
        return decoded

    try:
        global_step = 0
        for epoch in range(args.epochs):
            g_loss_sum = 0
            d_loss_sum = 0
            p_fake_sum = 0
            p_real_sum = 0
            start_time = time.time()
            cur_step = 0

            for step, reals in enumerate(batches):
                reals = reals.to(device)
                reals_embed = embedding(reals).permute(1, 0, 2)
                # reals_embed += torch.normal(0, 0.05, size=reals_embed.shape, device=device)
                batch_size = reals.size(0)

                z = torch.randn(seq_length, batch_size, args.latent_size).to(device)

                # Optimize the discriminator
                fake_out = G(z)
                D_opt.zero_grad()
                d_loss, p_real, p_fake = loss_f.loss_d(reals_embed, fake_out.detach())
                d_loss.backward()
                D_opt.step()

                # Optimize the generator
                fake_out = G(z)
                G_opt.zero_grad()
                g_loss = loss_f.loss_g(reals_embed, fake_out)
                g_loss.backward()
                G_opt.step()

                if args.use_ema:
                    update_average(G_shadow, G_orig, beta=0.999)

                g_loss_sum += float(g_loss)
                d_loss_sum += float(d_loss)
                p_fake_sum += float(p_fake)
                p_real_sum += float(p_real)

                if global_step % args.log_every == 0:
                    cur_step = min(step + 1, args.log_every)
                    batches_per_sec = cur_step / (time.time() - start_time)
                    logging.info(
                        f'[EPOCH {epoch + 1:03d}] [{step:05d} / {len(batches):05d}] ' +
                        # f'grow_index: {current_grow_index}/{depth - 1}, ' +
                        f'loss_d: {d_loss_sum / cur_step:.5f}, loss_g: {g_loss_sum / cur_step:.5f}, ' +
                        f'p_fake_g: {p_fake_sum / cur_step:.5f}, p_fake_l: {p_real_sum / cur_step:.5f}, ' +
                        # f'G_attn_gamma: {G_attn_sum / cur_step:.2f}, D_attn_gamma: {D_attn_sum / cur_step:.2f}, '
                        f'batches/s: {batches_per_sec:02.2f}')
                    g_loss_sum = d_loss_sum = 0
                    p_fake_sum = 0
                    p_real_sum = 0
                    start_time = time.time()

                if global_step % args.sample_every == 0:
                    samples_embeds = G_sample(z_sample).permute(1, 2, 0)
                    samples = decode(samples_embeds)
                    reals_decode = decode(reals_embed.permute(1, 2, 0))
                    (sample_dir / f'fakes_{global_step:06d}.txt').write_text(
                        '\n'.join(samples))
                    (sample_dir / f'reals_{global_step:06d}.txt').write_text(
                        '\n'.join(reals_decode))
                    # (sample_dir / f'fakes_{global_step:06d}.txt').write_text('\n'.join(dataset.seq_to_text(samples)))
                    # (sample_dir / f'reals_{global_step:06d}.txt').write_text('\n'.join(dataset.seq_to_text(reals_decode)))

                cur_step += 1
                global_step += 1

            torch.save(G, str(checkpoint_dir / f'G_{global_step:06d}.pth'))
            torch.save(D, str(checkpoint_dir / f'D_{global_step:06d}.pth'))
    except KeyboardInterrupt:
        pass

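# update_average is not shown in this excerpt; a minimal sketch of what a call
# like update_average(G_shadow, G_orig, beta=0.999) is assumed to do (an
# exponential moving average over the generator's parameters):
import torch


def update_average(model_tgt, model_src, beta):
    with torch.no_grad():
        for p_tgt, p_src in zip(model_tgt.parameters(), model_src.parameters()):
            # beta=0.0 copies the source outright; beta close to 1 tracks it slowly.
            p_tgt.copy_(beta * p_tgt + (1.0 - beta) * p_src)
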
class LanguagePeripheral(base_peripheral):
    def __init__(self, output_dim, vocab_size=10000, embed_dim=50, lang='en',
                 embedding_preload=True, gpu_id=-1, dropout=0):
        super(LanguagePeripheral, self).__init__()
        self.gpu_id = gpu_id
        self.pad_char = vocab_size
        self.bpe_encoder = BPEmb(lang=lang, vs=vocab_size, dim=embed_dim, add_pad_emb=True)
        # Add an extra padding character
        self.embed_layer = nn.Embedding(vocab_size + 1, embed_dim, padding_idx=self.pad_char)
        if embedding_preload:
            print("Loading pretrained word embeddings.")
            self.embed_layer.load_state_dict(
                {'weight': torch.tensor(self.bpe_encoder.emb.vectors)})
        self.enc_dropout = nn.Dropout(dropout)
        self.output = nn.Linear(embed_dim, output_dim)

    def forward(self, tokens):
        pad_mask = tokens.eq(self.id_PAD)
        embeddings = self.embed_layer(tokens)
        embeddings = self.enc_dropout(embeddings)
        output = self.output(embeddings)
        return output.unsqueeze(2)

    def embed_sentences(self, sentences):
        # Generate the tokens using BPEmb
        tokens, pad_mask = self.tokenize_sentences(sentences)
        return self.forward(tokens), pad_mask

    def decode_tokens(self, tokens):
        if isinstance(tokens, torch.Tensor):
            tokens = tokens.cpu().numpy().astype(int).tolist()
        elif isinstance(tokens, np.ndarray):
            tokens = tokens.astype(int).tolist()
        # Drop everything after EOS and all ids that are not regular subwords
        # (values >= the padding id), then decode the remaining ids.
        filtered_tokens = []
        for t in tokens:
            values = []
            for i in t:
                if i == self.id_EOS:
                    break
                elif i < self.id_PAD:
                    values.append(i)
            filtered_tokens.append(values)
        return self.bpe_encoder.decode_ids(filtered_tokens)

    def tokenize_sentences(self, sentences):
        tokens = self.bpe_encoder.encode_ids_with_bos_eos(sentences)
        # Pad the token sequences with pad_char up to the longest sequence
        max_len = 0
        for t in tokens:
            max_len = max(max_len, len(t))
        for i in range(len(tokens)):
            tok_len = len(tokens[i])
            tokens[i].extend([self.pad_char] * (max_len - tok_len))
        tokens = torch.tensor(np.array(tokens))
        if self.gpu_id > -1:
            tokens = tokens.cuda(self.gpu_id)
        pad_mask = tokens.eq(self.id_PAD)
        return tokens, pad_mask

    @property
    def id_PAD(self):
        return self.pad_char

    @property
    def id_GO(self):
        return 1

    @property
    def id_EOS(self):
        return 2

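# A short usage sketch for LanguagePeripheral (illustrative values; downloads
# the English BPEmb model with vs=10000 and dim=50 on first use):
def language_peripheral_example():
    periph = LanguagePeripheral(output_dim=512, vocab_size=10000, embed_dim=50, lang='en')
    out, pad_mask = periph.embed_sentences(["This is Stratford", "Kitap okuyordu."])
    print(out.shape, pad_mask.shape)
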
class SubWordProcessor(DecodingCompatibleProcessorABC):
    def __init__(self, bpe_info, padding_info):
        super().__init__()
        self._bpe_info = bpe_info
        self._padding_info = padding_info
        self._shared_bpe = None
        self._encoder_bpe = None
        self._decoder_bpe = None
        if "shared_bpe" in self._bpe_info:
            self._shared_bpe = BPEmb(**self._bpe_info["shared_bpe"])
            self._encoder_bpe = self._shared_bpe
            self._decoder_bpe = self._shared_bpe
        else:
            self._encoder_bpe = BPEmb(**self._bpe_info["encoder_bpe"])
            self._decoder_bpe = BPEmb(**self._bpe_info["decoder_bpe"])

    def process(self, data, **kwargs):
        # data: (encoder_input, decoder_input), both lists of strings
        # (possibly lists of lists). Text-based preprocessing is assumed to be
        # done beforehand.
        # Start/end tokens are added to both encoder_input and decoder_input.
        # decoder_input and target must have the same length after
        # preprocessing, so the target gets one more pad element.
        encoder_input = self._encoder_bpe.encode_ids_with_bos_eos(data[0])
        decoder_input = self._decoder_bpe.encode_ids_with_bos_eos(data[1])
        target = self._decoder_bpe.encode_ids_with_eos(data[1])
        # The BPE vocab size does not account for the pad word, so the weight
        # matrix has vocab_size + 1 rows. Since indices start from 0, the pad
        # index is vocab_size. If the BPE model is shared, the pad token has
        # the same index for both encoder and decoder.
        padded_enc_input = pad_sequences(encoder_input,
                                         maxlen=self._padding_info["enc_max_seq_len"],
                                         value=self._encoder_bpe.vocab_size,
                                         padding="post")
        padded_dec_input = pad_sequences(decoder_input,
                                         maxlen=self._padding_info["dec_max_seq_len"],
                                         value=self._decoder_bpe.vocab_size,
                                         padding="post")
        padded_target = pad_sequences(target,
                                      maxlen=self._padding_info["dec_max_seq_len"],
                                      value=self._decoder_bpe.vocab_size,
                                      padding="post")
        return [padded_enc_input, padded_dec_input], padded_target

    def encode(self, data, usage="encoder", **kwargs):
        # data is a list of strings (possibly a list of lists)
        cur_bpe = self._encoder_bpe
        max_seq_len = self._padding_info["enc_max_seq_len"]
        pad_value = self._encoder_bpe.vocab_size
        if usage != "encoder":
            cur_bpe = self._decoder_bpe
            max_seq_len = self._padding_info["dec_max_seq_len"]
            pad_value = self._decoder_bpe.vocab_size
        encoded = cur_bpe.encode_ids_with_bos_eos(data)
        padded = pad_sequences(encoded, maxlen=max_seq_len, value=pad_value,
                               padding="post")
        return padded

    def decode(self, data, usage="decoder", **kwargs):
        # data is a list of ids (possibly a list of lists).
        # Designed for mapping decoder id lists back to sentences, but enabled
        # for the encoder as well.
        cur_bpe = self._decoder_bpe
        if usage != "decoder":
            cur_bpe = self._encoder_bpe
        # BPE cannot handle padding when decoding, so the padding is removed
        # first.
        pad_id = cur_bpe.vocab_size
        if any(isinstance(el, list) for el in data):
            pad_removed = []
            for elem in data:
                pad_removed.append(self.remove_padding(elem, pad_id))
            return cur_bpe.decode_ids(pad_removed)
        return cur_bpe.decode_ids(self.remove_padding(data, pad_id))

    def get_tag_ids(self, usage="decoder", **kwargs):
        # Start, end and pad tag ids (of the decoder by default).
        # Re-consider handling of the unknown token.
        cur_bpe = self._decoder_bpe
        if usage != "decoder":
            cur_bpe = self._encoder_bpe
        # Since the pad embedding is the last element, its id is vocab_size.
        tag_ids = {
            "start": cur_bpe.BOS,
            "end": cur_bpe.EOS,
            "pad": cur_bpe.vocab_size
        }
        return tag_ids

    def get_max_seq_length(self, usage="decoder", **kwargs):
        if usage == "decoder":
            return self._padding_info["dec_max_seq_len"]
        return self._padding_info["enc_max_seq_len"]

    @staticmethod
    def remove_padding(list_of_ids, pad_value):
        return [int(i) for i in list_of_ids if i != pad_value]

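# A configuration sketch for SubWordProcessor (illustrative: a shared English
# BPE model with add_pad_emb=True and short padding lengths):
def subword_processor_example():
    processor = SubWordProcessor(
        bpe_info={"shared_bpe": {"lang": "en", "vs": 10000, "dim": 100,
                                 "add_pad_emb": True}},
        padding_info={"enc_max_seq_len": 16, "dec_max_seq_len": 16})
    (enc_in, dec_in), target = processor.process(
        (["This is Stratford"], ["It is a lovely town."]))
    print(enc_in.shape, dec_in.shape, target.shape)
    print(processor.get_tag_ids())
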