Example 1
def test_multi_language():
    text = ["This is Stratford", "Kitap okuyordu."]
    bpemb_multi = BPEmb(lang="multi", add_pad_emb=True)
    print(bpemb_multi.encode_ids_with_bos_eos(text))
    print(
        bpemb_multi.decode_ids([[1, 5496, 200, 23866, 3927, 2],
                                [1, 45350, 44934, 67191, 94777, 2]]))
Example 2
def test_encoding():
    text = ["This is Stratford", "<pad>"]

    bpemb_en = BPEmb(lang="en", add_pad_emb=True)

    # Start/end tokens can be added and encoded automatically. However, the encoder
    # can't handle <pad> directly: pad outside the encoder with the corresponding
    # index (the index of the last embedding when add_pad_emb=True).
    print(bpemb_en.encode(text))
    print(bpemb_en.encode_with_eos(text))
    print(bpemb_en.encode_with_bos_eos(text))
    print(bpemb_en.encode_ids(text))
    print(bpemb_en.encode_ids_with_eos(text))
    print(bpemb_en.encode_ids_with_bos_eos(text))
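A minimal sketch of the padding approach described in the comments above, assuming `add_pad_emb=True` so that the pad vector is the last embedding and its id equals `vocab_size`; the helper `pad_to` and the target length 16 are illustrative, not part of bpemb:

from bpemb import BPEmb

bpemb_en = BPEmb(lang="en", add_pad_emb=True)
pad_id = bpemb_en.vocab_size  # index of the appended <pad> embedding

def pad_to(ids, length, pad_id):
    # Right-pad (or truncate) a single id sequence to a fixed length.
    return ids[:length] + [pad_id] * max(0, length - len(ids))

batch = [pad_to(ids, 16, pad_id)
         for ids in bpemb_en.encode_ids_with_bos_eos(["This is Stratford",
                                                      "Kitap okuyordu."])]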
Example 3
class BPETokenizer:
    """Use byte pair encoding to transform text"""
    def __init__(self, lang='ru', pretrained=True, vocab_size=100000, dim=300):
        self.lang = lang
        self.pretrained = pretrained
        self.bpe = BPEmb(lang=self.lang,
                         vs=vocab_size,
                         dim=dim,
                         vs_fallback=True)

    def fit(self, text):
        raise NotImplementedError('fit is not supported')

    def transform(self, text: Union[str, List[str]], get_ids=True):
        if get_ids:
            return self.bpe.encode_ids_with_bos_eos(text)
        else:
            return self.bpe.encode_with_bos_eos(text)
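A possible way to use the tokenizer above, assuming the class is in scope together with `from bpemb import BPEmb` and `from typing import List, Union`; the sample sentence is illustrative. With `vs_fallback=True`, bpemb falls back to the closest published vocabulary size if the requested one is unavailable.

tokenizer = BPETokenizer(lang='en', vocab_size=10000, dim=100)
ids = tokenizer.transform("This is Stratford")                     # subword ids with BOS/EOS
pieces = tokenizer.transform("This is Stratford", get_ids=False)   # subword strings with <s>/</s>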
Example 4
class CaptionDataset(Dataset):
    def __init__(self, paths, caption_dict, transform):
        self.paths = paths
        self.caption_dict = caption_dict
        self.bpe = BPEmb(lang="en", vs=10000, dim=300)
        self.transform = transform

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        path = self.paths[idx]
        try:
            img = Image.open(path).convert('RGB')
            caption = self.caption_dict[path]
            img_tensor = self.transform(img)
            caption_label = torch.tensor(
                self.bpe.encode_ids_with_bos_eos(caption))
        except Exception:
            # Mark the sample as failed so it can be filtered out downstream
            # (e.g. in a custom collate_fn).
            img_tensor = -1
            caption_label = -1
        return img_tensor, caption_label
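Because each caption above is encoded to a different length, batching needs its own padding step. A minimal collate sketch using `torch.nn.utils.rnn.pad_sequence`; the padding value 0 is an assumption here (with `add_pad_emb=True` one would pad with `bpe.vocab_size` instead), and the sketch also drops the -1 sentinels returned for failed samples:

import torch
from torch.nn.utils.rnn import pad_sequence

def collate_captions(batch):
    # Keep only samples whose caption was successfully encoded above.
    batch = [(img, cap) for img, cap in batch if isinstance(cap, torch.Tensor)]
    images = torch.stack([img for img, _ in batch])
    captions = pad_sequence([cap for _, cap in batch],
                            batch_first=True, padding_value=0)
    return images, captions

# loader = DataLoader(dataset, batch_size=32, collate_fn=collate_captions)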
Example 5
def my_main(epochs, drop_mult, exp_id, bs, epochs_start, model_id):    
    ex.info['path'] = EX_PA
    layer_factor = 2
    
    bpemb_de = BPEmb(lang="de", vs=25000, dim=300)
    itos = dict(enumerate(bpemb_de.words + ['xxpad']))
    voc = Vocab(itos)    
    
    df_all = pd.read_pickle('/mnt/data/group07/johannes/ompc/unla.pkl')
    df_all['text_cat'] = df_all.apply(
        lambda x: (x['Headline'] if x['Headline'] is not None else '') + ' '
                  + (x['Body'] if x['Body'] is not None else '') + ' xxp '
                  + ('xxa' if pd.isna(x['ID_Parent_Post']) else 'xxb'),
        axis=1)

    df_all['text_ids'] = df_all['text_cat'].apply(
        lambda x: bpemb_de.encode_ids_with_bos_eos(
            news_utils.clean.german.clean_german(x)))

    df_all_train = df_all[df_all['ID_Article'] < 11500]
    df_all_val = df_all[df_all['ID_Article'] >= 11500]

    data_lm_ft = TextLMDataBunch.from_ids(bs=bs, path=EX_PA, vocab=voc,
                                          train_ids=df_all_train['text_ids'],
                                          valid_ids=df_all_val['text_ids'])
    
    learn = language_model_learner(data_lm_ft, drop_mult=drop_mult)
    learn.load_pretrained(
        Path('/mnt/data/group07/johannes/germanlm/exp_1/models/' + model_id + '.pth'),
        Path('/mnt/data/group07/johannes/germanlm/exp_1/tmp/itos.pkl'))
    
    
    lr = news_utils.fastai.get_optimal_lr(learn, runs=1)
    #lr = 0.001
    
    learn.callbacks += [
        news_utils.fastai.SacredLogger(learn, ex),
        SaveModelCallback(learn, name=exp_id),
        EarlyStoppingCallback(learn, patience=1)
    ]
    
    if epochs_start > 0:
        learn.fit_one_cycle(epochs_start, lr)
    
    learn.unfreeze()
    if epochs > 0: 
        #learn.fit_one_cycle(epochs, [lr / (layer_factor * (3 - x)) for x in range(3)] + [lr])
        learn.fit_one_cycle(epochs, lr)
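A small sketch of the vocabulary convention used in my_main above: 'xxpad' is appended after the BPE subwords, so its index is expected to equal the BPE vocabulary size (25000 here); the sizes are taken from the snippet and serve only as an illustration.

from bpemb import BPEmb

bpemb_de = BPEmb(lang="de", vs=25000, dim=300)
itos = bpemb_de.words + ['xxpad']   # subword strings with the pad token appended last
pad_id = len(bpemb_de.words)        # expected to equal bpemb_de.vocab_size (25000)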
Example 6
def main(args):
    result_dir = setup_run(args.run_name,
                           create_dirs=['checkpoints', 'samples'])
    setup_logging(result_dir / 'log.txt')

    logging.info(args)

    device = get_default_device(args.device)

    sample_dir = result_dir / 'samples'
    checkpoint_dir = result_dir / 'checkpoints'

    seq_length = 32

    from bpemb import BPEmb

    lines = Path(args.dataset).read_text().split('\n')[:2_500_000]

    bpe = BPEmb(lang='de', vs=50000, dim=100, add_pad_emb=True)

    data = torch.full((len(lines), seq_length),
                      bpe.vocab_size,
                      dtype=torch.long)

    for i, encoded_sample in enumerate(bpe.encode_ids_with_bos_eos(lines)):
        l = min(seq_length, len(encoded_sample))
        data[i, :l] = torch.tensor(encoded_sample, dtype=torch.long)[:l]

    #dataset = ByteLevelTextDataset(args.dataset, seq_length)

    depth = math.log2(seq_length)

    assert int(depth) == depth

    depth = int(depth)

    vocab_size = bpe.vocab_size + 1

    batches = DataLoader(data,
                         args.batch_size,
                         shuffle=True,
                         pin_memory=True,
                         num_workers=args.num_workers)

    inter_dim = bpe.dim

    embedding = nn.Embedding(vocab_size,
                             inter_dim,
                             _weight=torch.tensor(
                                 bpe.vectors, dtype=torch.float)).to(device)
    embedding.weight.requires_grad = False
    # embedding = nn.Embedding(vocab_size, inter_dim, max_norm=1.0).to(device)

    # spiegel model
    G = Generator(args.latent_size, [256, 256, 128, 64, 64],
                  out_dim=inter_dim).to(device)
    D = UnetDiscriminator(64, max_channel=256, depth=5,
                          in_dim=inter_dim).to(device)

    # G = Generator(args.latent_size, inter_dim, 256).to(device)
    # D = Discriminator(inter_dim, 256).to(device)

    G.apply(apply_spectral_norm)
    D.apply(apply_spectral_norm)

    G.apply(init_weights)
    D.apply(init_weights)

    G.train()
    D.train()

    (result_dir / 'G.txt').write_text(str(G))
    (result_dir / 'D.txt').write_text(str(D))

    if args.use_ema:
        G_shadow = copy.deepcopy(G)
        G_sample = G_shadow
        update_average(G_shadow, G, beta=0.0)
    else:
        G_sample = G

    G_orig = G
    D_orig = D

    if args.data_parallel:
        G = nn.DataParallel(G)
        D = nn.DataParallel(D)

    D_params = list(D.parameters())
    #D_params += list(embedding.parameters())

    G_opt = torch.optim.Adam(G.parameters(), lr=args.g_lr, betas=(0.5, 0.999))
    D_opt = torch.optim.Adam(D_params, lr=args.d_lr, betas=(0.5, 0.999))

    z_sample = torch.randn(seq_length, args.batch_size,
                           args.latent_size).to(device)

    #loss_f = RelativisticAverageHingeLoss(D)
    #loss_f = GANLoss(D)
    loss_f = WGAN_GP(D)

    def decode(embeds):
        flatten = embeds.transpose(1, 2)
        flatten = flatten.reshape(-1, flatten.size(-1))

        dist = (flatten.pow(2).sum(1, keepdim=True) -
                2 * flatten @ embedding.weight.T +
                embedding.weight.T.pow(2).sum(0, keepdim=True))

        _, ids = (-dist).max(1)
        ids = ids.view(embeds.size(0), -1)

        decoded = []
        for seq in ids:
            seq = list(seq.detach().cpu().numpy())
            seq = list(filter(lambda x: x != vocab_size - 1, seq))
            dec = bpe.decode_ids(np.array(seq))
            decoded.append(dec or '')

        return decoded

    try:
        global_step = 0
        for epoch in range(args.epochs):
            g_loss_sum = 0
            d_loss_sum = 0

            p_fake_sum = 0
            p_real_sum = 0

            start_time = time.time()

            cur_step = 0

            for step, reals in enumerate(batches):
                reals = reals.to(device)
                reals_embed = embedding(reals).permute(1, 0, 2)
                #reals_embed += torch.normal(0, 0.05, size=reals_embed.shape, device=device)

                batch_size = reals.size(0)

                z = torch.randn(seq_length, batch_size,
                                args.latent_size).to(device)

                # Optimize the discriminator
                fake_out = G(z)

                D_opt.zero_grad()

                d_loss, p_real, p_fake = loss_f.loss_d(reals_embed,
                                                       fake_out.detach())
                d_loss.backward()

                D_opt.step()

                # Optimize generator
                fake_out = G(z)

                G_opt.zero_grad()

                g_loss = loss_f.loss_g(reals_embed, fake_out)
                g_loss.backward()

                G_opt.step()

                if args.use_ema:
                    update_average(G_shadow, G_orig, beta=0.999)

                g_loss_sum += float(g_loss)
                d_loss_sum += float(d_loss)

                p_fake_sum += float(p_fake)
                p_real_sum += float(p_real)

                if global_step % args.log_every == 0:
                    cur_step = min(step + 1, args.log_every)
                    batches_per_sec = cur_step / (time.time() - start_time)

                    logging.info(
                        f'[EPOCH {epoch + 1:03d}] [{step:05d} / {len(batches):05d}] '
                        +
                        #f'grow_index: {current_grow_index}/{depth - 1}, ' +
                        f'loss_d: {d_loss_sum / cur_step:.5f}, loss_g: {g_loss_sum / cur_step:.5f}, '
                        +
                        f'p_fake_g: {p_fake_sum / cur_step:.5f}, p_fake_l: {p_real_sum / cur_step:.5f}, '
                        +
                        #f'G_attn_gamma: {G_attn_sum / cur_step:.2f}, D_attn_gamma: {D_attn_sum / cur_step:.2f}, '
                        f'batches/s: {batches_per_sec:02.2f}')

                    g_loss_sum = d_loss_sum = 0

                    p_fake_sum = 0
                    p_real_sum = 0

                    start_time = time.time()

                if global_step % args.sample_every == 0:
                    samples_embeds = G_sample(z_sample).permute(1, 2, 0)
                    samples = decode(samples_embeds)

                    reals_decode = decode(reals_embed.permute(1, 2, 0))

                    (sample_dir / f'fakes_{global_step:06d}.txt').write_text(
                        '\n'.join(samples))
                    (sample_dir / f'reals_{global_step:06d}.txt').write_text(
                        '\n'.join(reals_decode))

                    # (sample_dir / f'fakes_{global_step:06d}.txt').write_text('\n'.join(dataset.seq_to_text(samples)))
                    # (sample_dir / f'reals_{global_step:06d}.txt').write_text('\n'.join(dataset.seq_to_text(reals_decode)))

                cur_step += 1
                global_step += 1

            torch.save(G, str(checkpoint_dir / f'G_{global_step:06d}.pth'))
            torch.save(D, str(checkpoint_dir / f'D_{global_step:06d}.pth'))
    except KeyboardInterrupt:
        pass
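The pad handling in the training script above relies on `add_pad_emb=True` appending one extra (zero) vector to the embedding table, so the pad id is `bpe.vocab_size` and the weight matrix has `vocab_size + 1` rows. A small sketch of that invariant; the shapes are what the bpemb download is expected to provide:

import torch
import torch.nn as nn
from bpemb import BPEmb

bpe = BPEmb(lang='de', vs=50000, dim=100, add_pad_emb=True)
pad_id = bpe.vocab_size                               # 50000, the appended <pad> row
weights = torch.tensor(bpe.vectors, dtype=torch.float)
assert weights.shape[0] == bpe.vocab_size + 1         # pad row included

embedding = nn.Embedding(bpe.vocab_size + 1, bpe.dim, _weight=weights)
embedding.weight.requires_grad = False                # frozen, as in the script above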
Example 7
class LanguagePeripheral(base_peripheral):
    def __init__(self,
                 output_dim,
                 vocab_size=10000,
                 embed_dim=50,
                 lang='en',
                 embedding_preload=True,
                 gpu_id=-1,
                 dropout=0):
        super(LanguagePeripheral, self).__init__()
        self.gpu_id = gpu_id
        self.pad_char = vocab_size
        self.bpe_encoder = BPEmb(lang=lang,
                                 vs=vocab_size,
                                 dim=embed_dim,
                                 add_pad_emb=True)
        # Add an extra padding character
        self.embed_layer = nn.Embedding(vocab_size + 1,
                                        embed_dim,
                                        padding_idx=self.pad_char)
        if embedding_preload:
            # Initialize the embedding layer with the pretrained BPEmb vectors
            # (the extra <pad> row comes from add_pad_emb=True).
            self.embed_layer.load_state_dict(
                {'weight': torch.tensor(self.bpe_encoder.emb.vectors)})
            print("Loaded pretrained word embeddings.")
        self.enc_dropout = nn.Dropout(dropout)
        self.output = nn.Linear(embed_dim, output_dim)

    def forward(self, tokens):
        pad_mask = tokens.eq(self.id_PAD)
        embeddings = self.embed_layer(tokens)
        embeddings = self.enc_dropout(embeddings)
        output = self.output(embeddings)
        return output.unsqueeze(2)

    def embed_sentences(self, sentences):
        # Generate the tokens using BPEmb
        tokens, pad_mask = self.tokenize_sentences(sentences)
        return self.forward(tokens), pad_mask

    def decode_tokens(self, tokens):
        if isinstance(tokens, torch.Tensor):
            tokens = tokens.cpu().numpy().astype(int).tolist()
        elif isinstance(tokens, np.ndarray):
            tokens = tokens.astype(int).tolist()
        # Drop ids at or above the pad id and everything from the first EOS onwards.
        filtered_tokens = []
        for t in tokens:
            values = []
            for i in t:
                if i == self.id_EOS:
                    break
                elif i < self.id_PAD:
                    values.append(i)
            filtered_tokens.append(values)
        # Padding has already been stripped above; decode the remaining ids back to text.
        return self.bpe_encoder.decode_ids(filtered_tokens)

    def tokenize_sentences(self, sentences):
        tokens = self.bpe_encoder.encode_ids_with_bos_eos(sentences)
        # Pad the tokens with the pad_char
        max_len = max((len(t) for t in tokens), default=0)
        for t in tokens:
            t.extend([self.pad_char] * (max_len - len(t)))
        tokens = torch.tensor(np.array(tokens))
        if self.gpu_id > -1:
            tokens = tokens.cuda(self.gpu_id)
        pad_mask = tokens.eq(self.id_PAD)
        return tokens, pad_mask

    @property
    def id_PAD(self):
        return self.pad_char

    @property
    def id_GO(self):
        return 1

    @property
    def id_EOS(self):
        return 2
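A possible round trip through the peripheral above, assuming the class and its base_peripheral parent are in scope; the sentences and dimensions are illustrative. tokenize_sentences right-pads every sequence with the pad id (here 10000) and decode_tokens drops everything from EOS onwards before decoding:

peripheral = LanguagePeripheral(output_dim=512, vocab_size=10000,
                                embed_dim=50, lang='en')
tokens, pad_mask = peripheral.tokenize_sentences(["This is Stratford",
                                                  "Kitap okuyordu."])
print(tokens.shape)                      # (2, max_len), padded with id 10000
print(peripheral.decode_tokens(tokens))  # back to (approximately) the input text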
Example 8
class SubWordProcessor(DecodingCompatibleProcessorABC):
    def __init__(self, bpe_info, padding_info):
        super().__init__()
        self._bpe_info = bpe_info
        self._padding_info = padding_info

        self._shared_bpe = None
        self._encoder_bpe = None
        self._decoder_bpe = None
        if "shared_bpe" in self._bpe_info:
            self._shared_bpe = BPEmb(**self._bpe_info["shared_bpe"])
            self._encoder_bpe = self._shared_bpe
            self._decoder_bpe = self._shared_bpe
        else:
            self._encoder_bpe = BPEmb(**self._bpe_info["encoder_bpe"])
            self._decoder_bpe = BPEmb(**self._bpe_info["decoder_bpe"])

    def process(self, data, **kwargs):
        # data: (encoder_input, decoder_input), each a list of strings (or a nested list).
        # Text-based preprocessing is assumed to have been done beforehand.
        # Start/end tokens are added to both encoder_input and decoder_input.
        # decoder_input and target must have the same length after preprocessing;
        # since target has no start token, it ends up with one more pad element.

        encoder_input = self._encoder_bpe.encode_ids_with_bos_eos(data[0])
        decoder_input = self._decoder_bpe.encode_ids_with_bos_eos(data[1])
        target = self._decoder_bpe.encode_ids_with_eos(data[1])

        # The bpe vocab_size does not account for the pad token, so the weight matrix has vocab_size + 1 rows.
        # Since indices start from 0, the pad index is vocab_size.
        # Note that if the bpe is shared, the pad token has the same index for encoder and decoder.
        padded_enc_input = pad_sequences(
            encoder_input,
            maxlen=self._padding_info["enc_max_seq_len"],
            value=self._encoder_bpe.vocab_size,
            padding="post")
        padded_dec_input = pad_sequences(
            decoder_input,
            maxlen=self._padding_info["dec_max_seq_len"],
            value=self._decoder_bpe.vocab_size,
            padding="post")
        padded_target = pad_sequences(
            target,
            maxlen=self._padding_info["dec_max_seq_len"],
            value=self._decoder_bpe.vocab_size,
            padding="post")

        return [padded_enc_input, padded_dec_input], padded_target

    def encode(self, data, usage="encoder", **kwargs):
        # data is a list of strings (or a nested list)
        cur_bpe = self._encoder_bpe
        max_seq_len = self._padding_info["enc_max_seq_len"]
        pad_value = self._encoder_bpe.vocab_size
        if usage != "encoder":
            cur_bpe = self._decoder_bpe
            max_seq_len = self._padding_info["dec_max_seq_len"]
            pad_value = self._decoder_bpe.vocab_size

        encoded = cur_bpe.encode_ids_with_bos_eos(data)
        padded = pad_sequences(encoded,
                               maxlen=max_seq_len,
                               value=pad_value,
                               padding="post")

        return padded

    def decode(self, data, usage="decoder", **kwargs):
        # data is a list of ids (or a nested list).
        # Designed for mapping decoder id lists back to sentences, but usable for the encoder as well.
        cur_bpe = self._decoder_bpe
        if usage != "decoder":
            cur_bpe = self._encoder_bpe

        # When decoding, bpe can't handle padding. Hence, we need to remove the padding first.
        pad_id = cur_bpe.vocab_size
        if any(isinstance(el, list) for el in data):
            pad_removed = []
            for elem in data:
                pad_removed.append(self.remove_padding(elem, pad_id))
            return cur_bpe.decode_ids(pad_removed)
        else:
            return cur_bpe.decode_ids(self.remove_padding(data, pad_id))

    def get_tag_ids(self, usage="decoder", **kwargs):
        # Specifically: the start, end, and pad tag ids of the decoder (or encoder).
        # TODO: reconsider handling of the unknown token.
        cur_bpe = self._decoder_bpe
        if usage != "decoder":
            cur_bpe = self._encoder_bpe

        # The pad token is the last element, so its id is vocab_size.
        tag_ids = {
            "start": cur_bpe.BOS,
            "end": cur_bpe.EOS,
            "pad": cur_bpe.vocab_size
        }
        return tag_ids

    def get_max_seq_length(self, usage="decoder", **kwargs):
        if usage == "decoder":
            return self._padding_info["dec_max_seq_len"]
        else:
            return self._padding_info["enc_max_seq_len"]

    @staticmethod
    def remove_padding(list_of_ids, pad_value):
        return [int(i) for i in list_of_ids if i != pad_value]
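A possible configuration for the processor above, assuming the class, its DecodingCompatibleProcessorABC base, and a Keras-style pad_sequences are in scope; the vocabulary size, sequence lengths, and sentences are illustrative. With a "shared_bpe" entry, encoder and decoder share one vocabulary and therefore one pad id:

processor = SubWordProcessor(
    bpe_info={"shared_bpe": {"lang": "en", "vs": 10000, "dim": 100,
                             "add_pad_emb": True}},
    padding_info={"enc_max_seq_len": 32, "dec_max_seq_len": 32})

(enc_in, dec_in), target = processor.process((["This is Stratford"],
                                               ["Kitap okuyordu."]))
print(processor.get_tag_ids())            # e.g. {'start': 1, 'end': 2, 'pad': 10000}
print(processor.decode(dec_in.tolist()))  # pads are stripped before decoding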