Esempio n. 1
0
def go(arg):
    """Train a GTransformer language model and track validation bits-per-byte.

    Each step samples a batch of random subsequences from the training data
    (there is no epoch loop). Every ``arg.test_every`` steps the model is
    evaluated on (a subset of) the validation data, the best checkpoint by
    bits-per-byte is saved, and sample text is generated to monitor progress.

    :param arg: parsed command-line namespace (seed, paths, model and
                optimizer hyperparameters, logging/evaluation settings).
    """
    if arg.seed < 0:
        seed = random.randint(0, 1000000)
        print('random seed: ', seed)
        # Fix: the generated seed was printed but never applied, so runs
        # could not actually be reproduced from the printed value.
        torch.manual_seed(seed)
    else:
        torch.manual_seed(arg.seed)

    tbw = SummaryWriter(log_dir=arg.tb_dir)  # Tensorboard logging

    # load the data
    arg.path = here('data') if arg.path is None else arg.path
    data_train, data_val, data_test = read_dataset(arg.path, arg.dataset)

    # create the model
    model = GTransformer(emb=arg.embedding_size,
                         heads=arg.num_heads,
                         depth=arg.depth,
                         seq_length=arg.context,
                         num_tokens=NUM_TOKENS,
                         wide=arg.wide)

    if torch.cuda.is_available():
        model.cuda()

    print("Model parameters = %d" % sum(p.numel() for p in model.parameters()))

    if not arg.radam:
        opt = torch.optim.Adam(lr=arg.lr, params=model.parameters())
        # linear learning rate warmup over the first lr_warmup instances
        sch = torch.optim.lr_scheduler.LambdaLR(
            opt, lambda i: min(i / (arg.lr_warmup / arg.batch_size), 1.0))
    else:
        # RAdam does its own implicit warmup, so no scheduler is attached.
        opt = RAdam(model.parameters(), lr=arg.lr)

    if USE_APEX:
        model, opt = amp.initialize(model, opt, opt_level="O1", verbosity=0)

    best_bpb = np.inf  # best validation bits-per-byte seen so far
    best_step = 0      # training step at which best_bpb was reached

    # training loop
    # - note: we don't loop over the data, instead we sample a batch of random subsequences each time.
    for i in tqdm.trange(arg.num_batches):

        opt.zero_grad()

        # sample a batch of random subsequences
        starts = torch.randint(size=(arg.batch_size, ),
                               low=0,
                               high=data_train.size(0) - arg.context - 1)
        seqs_source = [
            data_train[start:start + arg.context] for start in starts
        ]
        seqs_target = [
            data_train[start + 1:start + arg.context + 1] for start in starts
        ]
        source = torch.cat([s[None, :] for s in seqs_source],
                           dim=0).to(torch.long)
        target = torch.cat([s[None, :] for s in seqs_target],
                           dim=0).to(torch.long)
        # - target is the same sequence as source, except one character ahead

        if torch.cuda.is_available():
            source, target = source.cuda(), target.cuda()
        source, target = Variable(source), Variable(target)

        output = model(source)

        # model output is assumed to be log-probabilities over tokens,
        # shape (batch, seq, num_tokens); nll_loss wants (batch, C, seq).
        loss = F.nll_loss(output.transpose(2, 1), target, reduction='mean')

        if not USE_APEX:
            loss.backward()
        else:
            # apex amp scales the loss to avoid fp16 gradient underflow
            with amp.scale_loss(loss, opt) as scaled_loss:
                scaled_loss.backward()

        # clip gradients
        # - If the total gradient vector has a length > 1, we clip it back down to 1.
        if arg.gradient_clipping > 0.0:
            nn.utils.clip_grad_norm_(model.parameters(), arg.gradient_clipping)

        opt.step()

        if not arg.radam:
            sch.step()

        # - validate every {arg.test_every} steps. First we compute the
        #   compression on the validation (or a subset)
        #   then we generate some random text to monitor progress
        if i != 0 and (i % arg.test_every == 0 or i == arg.num_batches - 1):

            upto = arg.test_subset if arg.test_subset else data_val.size(0)
            data_sub = data_val[:upto]

            bits_per_byte = calculate_bpb(arg, model, data_sub)

            # print validation performance. 1 bit per byte is (currently) state of the art.
            print(f'epoch{i}: {bits_per_byte:.4} bits per byte')

            tag_scalar_dict = {
                'train-loss': float(loss.item()) * LOG2E,
                'eval-loss': bits_per_byte
            }
            tbw.add_scalars('transformer/loss', tag_scalar_dict,
                            i * arg.batch_size)

            # keep only the best-performing checkpoint
            if bits_per_byte < best_bpb:
                best_bpb = bits_per_byte
                best_step = i
                torch.save(model.state_dict(),
                           os.path.join(arg.tb_dir, 'best_model.pt'))

            print(f'best step {best_step}: {best_bpb:.4} bits per byte')

            generate_sequence(arg, model, data_val)

    # load the best model, calculate bpb of the test data and generate some random text
    finalize(arg, model, data_test)
Esempio n. 2
0
def go(arg):
    """Train a GTransformer on enwik8 and periodically evaluate compression.

    Each step samples a batch of random subsequences from the training data.
    Every ``arg.test_every`` steps, a random continuation is sampled from the
    model and validation (or test, when ``arg.final``) bits-per-byte is
    computed and logged.

    :param arg: parsed command-line namespace (seed, data path, model and
                optimizer hyperparameters, logging/evaluation settings).
    """
    if arg.seed < 0:
        seed = random.randint(0, 1000000)
        print('random seed: ', seed)
        # Fix: the generated seed was printed but never applied, so runs
        # could not actually be reproduced from the printed value.
        torch.manual_seed(seed)
    else:
        torch.manual_seed(arg.seed)

    tbw = SummaryWriter(log_dir=arg.tb_dir)  # Tensorboard logging

    # load the data (validation unless arg.final is true, then test)
    arg.data = here('data/enwik8.gz') if arg.data is None else arg.data

    data_train, data_val, data_test = enwik8(arg.data)
    data_train, data_test = (torch.cat([data_train, data_val], dim=0), data_test) \
                            if arg.final else (data_train, data_val)

    # create the model
    model = GTransformer(emb=arg.embedding_size,
                         heads=arg.num_heads,
                         depth=arg.depth,
                         seq_length=arg.context,
                         num_tokens=NUM_TOKENS,
                         attention_type=arg.attention_type)
    if torch.cuda.is_available():
        model.cuda()

    opt = torch.optim.Adam(lr=arg.lr, params=model.parameters())

    # Linear learning rate warmup
    sch = torch.optim.lr_scheduler.LambdaLR(
        opt, lambda i: min(i / (arg.lr_warmup / arg.batch_size), 1.0))

    # Training loop
    # -- We don't loop over the data, instead we sample a batch of random subsequences each time. This is not strictly
    #    better or worse as a training method, it's just a little simpler.
    #
    instances_seen = 0
    for i in tqdm.trange(arg.num_batches):

        opt.zero_grad()

        source, target = sample_batch(data_train,
                                      length=arg.context,
                                      batch_size=arg.batch_size)
        instances_seen += source.size(0)

        if torch.cuda.is_available():
            source, target = source.cuda(), target.cuda()

        tic()
        output = model(source)  # forward pass
        t = toc()

        # Compute the loss
        loss = F.nll_loss(output.transpose(2, 1), target, reduction='mean')

        # Fix: the extra `i * arg.batch_size` positional argument pushed
        # `instances_seen` into add_scalar's `walltime` parameter, corrupting
        # the wall-clock axis. Log against instances seen, as below.
        tbw.add_scalar('transformer/train-loss',
                       float(loss.item()) * LOG2E, instances_seen)
        tbw.add_scalar('transformer/time-forward', t, instances_seen)

        loss.backward()  # backward pass

        # clip gradients
        # -- If the total gradient vector has a length > x, we clip it back down to x.
        if arg.gradient_clipping > 0.0:
            nn.utils.clip_grad_norm_(model.parameters(), arg.gradient_clipping)

        opt.step()  # stochastic gradient descent step
        sch.step()  # update the learning rate

        # Validate every `arg.test_every` steps. First we compute the
        # compression on the validation data (or a subset),
        # then we generate some random text to monitor progress.
        if i != 0 and (i % arg.test_every == 0 or i == arg.num_batches - 1):
            with torch.no_grad():

                ## Sample and print a random sequence

                # Slice a random seed from the test data, and sample a continuation from the model.
                seedfr = random.randint(0, data_test.size(0) - arg.context)
                seed = data_test[seedfr:seedfr + arg.context].to(torch.long)

                if torch.cuda.is_available():
                    seed = seed.cuda()

                sample_sequence(model,
                                seed=seed,
                                max_context=arg.context,
                                verbose=True,
                                length=arg.sample_length)

                ## Compute validation bits per byte

                # On the final batch, evaluate on the full split rather than
                # the configured subset.
                upto = data_test.size(
                    0) if i == arg.num_batches - 1 else arg.test_subset
                data_sub = data_test[:upto]

                bits_per_byte = compute_compression(
                    model,
                    data_sub,
                    context=arg.context,
                    batch_size=arg.test_batchsize)
                # -- Since we're not computing gradients, we can increase the batch size a little from what we used in
                #    training.

                print(f'epoch{i}: {bits_per_byte:.4} bits per byte')
                # Fix: same walltime misuse as the train-loss logging above.
                tbw.add_scalar('transformer/eval-loss', bits_per_byte,
                               instances_seen)
Esempio n. 3
0
def go(arg):
    """Train a GTransformer on enwik8 with inline validation and sampling.

    Each step samples a batch of random subsequences. Every ``arg.test_every``
    steps, bits-per-byte is computed over (a subset of) the held-out split by
    predicting every byte from its preceding context, and a short text sample
    is generated greedily from a random seed sequence.

    :param arg: parsed command-line namespace (seed, data path, model and
                optimizer hyperparameters, logging/evaluation settings).
    """
    if arg.seed < 0:
        seed = random.randint(0, 1000000)
        print('random seed: ', seed)
        # Fix: the generated seed was printed but never applied, so runs
        # could not actually be reproduced from the printed value.
        torch.manual_seed(seed)
    else:
        torch.manual_seed(arg.seed)

    tbw = SummaryWriter(log_dir=arg.tb_dir)  # Tensorboard logging

    # load the data (validation unless arg.final is true, then test)
    arg.data = here('data/enwik8.gz') if arg.data is None else arg.data

    data_train, data_val, data_test = enwik8(arg.data)
    data_train, data_test = (torch.cat([data_train, data_val], dim=0), data_test) \
                            if arg.final else (data_train, data_val)

    # create the model
    model = GTransformer(emb=arg.embedding_size,
                         heads=arg.num_heads,
                         depth=arg.depth,
                         seq_length=arg.context,
                         num_tokens=NUM_TOKENS)
    if torch.cuda.is_available():
        model.cuda()

    opt = torch.optim.Adam(lr=arg.lr, params=model.parameters())

    # training loop
    # - note: we don't loop over the data, instead we sample a batch of random subsequences each time.
    for i in tqdm.trange(arg.num_batches):

        # learning rate warmup
        # - we linearly increase the learning rate from 10e-10 to arg.lr over the first
        #   few thousand batches
        if arg.lr_warmup > 0 and i < arg.lr_warmup:
            lr = max((arg.lr / arg.lr_warmup) * i, 1e-10)
            # Fix: `opt.lr = lr` only set an unused attribute on the
            # optimizer object; the effective learning rate lives in
            # param_groups, so the warmup never actually applied.
            for group in opt.param_groups:
                group['lr'] = lr

        opt.zero_grad()

        # sample a batch of random subsequences
        starts = torch.randint(size=(arg.batch_size, ),
                               low=0,
                               high=data_train.size(0) - arg.context - 1)
        seqs_source = [
            data_train[start:start + arg.context] for start in starts
        ]
        seqs_target = [
            data_train[start + 1:start + arg.context + 1] for start in starts
        ]
        source = torch.cat([s[None, :] for s in seqs_source],
                           dim=0).to(torch.long)
        target = torch.cat([s[None, :] for s in seqs_target],
                           dim=0).to(torch.long)
        # - target is the same sequence as source, except one character ahead

        if torch.cuda.is_available():
            source, target = source.cuda(), target.cuda()
        source, target = Variable(source), Variable(target)

        output = model(source)

        loss = F.nll_loss(output.transpose(2, 1), target, reduction='mean')
        tbw.add_scalar('transformer/train-loss',
                       float(loss.item()) * LOG2E, i * arg.batch_size)

        loss.backward()

        # clip gradients
        # - If the total gradient vector has a length > 1, we clip it back down to 1.
        if arg.gradient_clipping > 0.0:
            nn.utils.clip_grad_norm_(model.parameters(), arg.gradient_clipping)

        opt.step()

        # - validate every {arg.test_every} steps. First we compute the
        #   compression on the validation (or a subset)
        #   then we generate some random text to monitor progress
        if i != 0 and (i % arg.test_every == 0 or i == arg.num_batches - 1):

            # on the final batch, evaluate on the full split
            upto = data_test.size(
                0) if i == arg.num_batches - 1 else arg.test_subset
            data_sub = data_test[:upto]

            with torch.no_grad():
                bits = 0.0
                # buffer of contexts; every time it fills up, we run it
                # through the model in one batched forward pass
                batch = []

                for current in range(data_sub.size(0)):

                    fr = max(0, current - arg.context)
                    to = current + 1

                    context = data_sub[fr:to].to(torch.long)
                    # left-pad with zeros so every context has a fixed length
                    if context.size(0) < arg.context + 1:
                        pad = torch.zeros(size=(arg.context + 1 -
                                                context.size(0), ),
                                          dtype=torch.long)
                        context = torch.cat([pad, context], dim=0)

                        assert context.size(0) == arg.context + 1

                    if torch.cuda.is_available():
                        context = context.cuda()

                    batch.append(context[None, :])

                    if len(
                            batch
                    ) == arg.test_batchsize or current == data_sub.size(0) - 1:

                        # batch is full (or data exhausted), run it through the model
                        b = len(batch)

                        contexts = torch.cat(batch, dim=0)
                        source = contexts[:, :-1]  # input
                        target = contexts[:, -1]  # target values

                        output = model(source)

                        # log-prob the model assigned to the true next byte
                        lnprobs = output[torch.arange(b, device=d()), -1,
                                         target]
                        log2probs = lnprobs * LOG2E  # convert from nats to bits

                        bits += -log2probs.sum()
                        batch = []  # empty buffer

                bits_per_byte = bits / data_sub.size(0)

                # print validation performance. 1 bit per byte is (currently) state of the art.
                print(f'epoch{i}: {bits_per_byte:.4} bits per byte')
                tbw.add_scalar('transformer/eval-loss', bits_per_byte,
                               i * arg.batch_size)

                # generate some random text greedily from a random seed slice
                GENSIZE = 600
                TEMP = 0.5
                seedfr = random.randint(0, data_test.size(0) - arg.context)
                seq = data_test[seedfr:seedfr + arg.context].to(torch.long)

                if torch.cuda.is_available():
                    seq = seq.cuda()

                seq = Variable(seq)

                # print the seed context in brackets, then the continuation
                print('[', end='', flush=True)
                for c in seq:
                    print(str(chr(c)), end='', flush=True)
                print(']', end='', flush=True)

                for _ in range(GENSIZE):
                    output = model(seq[None, :])
                    c = sample(output[0, -1, :], TEMP)
                    # clamp to printable range before displaying
                    print(str(chr(max(32, c))), end='', flush=True)

                    # slide the context window one character forward
                    seq = torch.cat([seq[1:], c[None]], dim=0)

                print()
Esempio n. 4
0
def go(arg):
    """Train a GTransformer on Ukrainian Wikipedia text, optionally with a
    masked-character objective, and monitor progress on fixed test phrases.

    When ``arg.masked`` is set, source sequences have ``arg.error_count``
    random positions replaced by the '$' mask character and the target is the
    unmasked sequence; otherwise it is standard next-character prediction.
    Every ``arg.test_every`` steps the model's reconstruction of three fixed
    masked phrases is printed. The model checkpoint is saved every batch.

    :param arg: parsed command-line namespace (seed, data path, model and
                optimizer hyperparameters, masking/evaluation settings).
    """
    if arg.seed < 0:
        seed = random.randint(0, 1000000)
        print('random seed: ', seed)
        # Fix: the generated seed was printed but never applied, so runs
        # could not actually be reproduced from the printed value.
        torch.manual_seed(seed)
    else:
        torch.manual_seed(arg.seed)

    tbw = SummaryWriter(log_dir=arg.tb_dir)  # Tensorboard logging

    # load the data (validation unless arg.final is true, then test)
    arg.data = here('../wiki_uk.txt') if arg.data is None else arg.data

    data_train, data_val, data_test = ukwiki(arg.data)
    data_train, data_test = (torch.cat([data_train, data_val], dim=0), data_test) \
        if arg.final else (data_train, data_val)

    # create the model
    model = GTransformer(emb=arg.embedding_size,
                         heads=arg.num_heads,
                         depth=arg.depth,
                         seq_length=arg.context,
                         num_tokens=NUM_TOKENS,
                         wide=arg.wide)
    # resume from a previous checkpoint if one exists
    if os.path.exists(MODEL_PATH):
        model.load_state_dict(torch.load(MODEL_PATH))
    if torch.cuda.is_available():
        model.cuda()

    opt = torch.optim.Adam(lr=arg.lr, params=model.parameters())
    # linear learning rate warmup
    sch = torch.optim.lr_scheduler.LambdaLR(
        opt, lambda i: min(i / (arg.lr_warmup / arg.batch_size), 1.0))

    # training loop
    # - note: we don't loop over the data, instead we sample a batch of random subsequences each time.
    for i in tqdm.trange(arg.num_batches):

        opt.zero_grad()

        # sample a batch of random subsequences
        starts = torch.randint(size=(arg.batch_size, ),
                               low=0,
                               high=data_train.size(0) - arg.context - 1)
        if arg.masked:
            # denoising objective: corrupt the source with '$' masks, keep
            # the target as the clean sequence (same positions, not shifted)
            seqs_source = [
                data_train.detach().clone()[start:start + arg.context, ]
                for start in starts
            ]
            seqs_target = [
                data_train.detach().clone()[start:start + arg.context]
                for start in starts
            ]
            for ss, st in zip(seqs_source, seqs_target):
                mask_indexes = torch.randint(1, arg.context,
                                             (arg.error_count, ))
                for ind in mask_indexes:
                    ss[ind] = torch.tensor(char_to_id['$'])
        else:
            # standard language-modelling objective: target is the source
            # shifted one character ahead
            seqs_source = [
                data_train[start:start + arg.context] for start in starts
            ]
            seqs_target = [
                data_train[start + 1:start + arg.context + 1]
                for start in starts
            ]

        source = torch.cat([s[None, :] for s in seqs_source],
                           dim=0).to(torch.long)
        target = torch.cat([s[None, :] for s in seqs_target],
                           dim=0).to(torch.long)

        if torch.cuda.is_available():
            source, target = source.cuda(), target.cuda()
        source, target = Variable(source), Variable(target)

        output = model(source)

        loss = F.nll_loss(output.transpose(2, 1), target, reduction='mean')
        tbw.add_scalar('transformer/train-loss',
                       float(loss.item()) * LOG2E, i * arg.batch_size)

        loss.backward()

        # clip gradients
        # - If the total gradient vector has a length > 1, we clip it back down to 1.
        if arg.gradient_clipping > 0.0:
            nn.utils.clip_grad_norm_(model.parameters(), arg.gradient_clipping)

        opt.step()
        sch.step()

        # - every {arg.test_every} steps, print the model's reconstruction of
        #   three fixed masked test phrases to monitor progress
        if i != 0 and (i % arg.test_every == 0 or i == arg.num_batches - 1):

            with torch.no_grad():
                # each phrase contains one '$' mask; the model should fill it in
                test_msgs = [
                    "купила м$ма коника, а коник і шо",
                    "як тебе не лю$ити Києве мій коли",
                    "у л$сі лісі темному де ходить як"
                ]
                for test_msg in test_msgs:
                    # pad the encoded message to the full context length
                    # (110 is the filler id — presumably a padding/neutral
                    # character; verify against the vocabulary)
                    test_data = np.zeros(arg.context)
                    test_data.fill(110)
                    test_data[0:len(test_msg)] = np.array(
                        [char_to_id[ch] for ch in test_msg])
                    seq = torch.from_numpy(test_data).to(torch.long)

                    if torch.cuda.is_available():
                        seq = seq.cuda()

                    seq = Variable(seq)

                    # print the input phrase in brackets, then the prediction
                    print('[', end='', flush=True)
                    for c in seq:
                        print(str(id_to_char[c.item()]), end='', flush=True)
                    print(']', end='', flush=True)

                    output = model(seq[None, :])
                    # greedy decode: argmax over the vocabulary at each position
                    out_string = ''.join([
                        id_to_char[ind.item()]
                        for ind in output[0].max(axis=1).indices
                    ])
                    print("Foo1")
                    print("PRED: " + out_string)

                    print("Foo2")
                    print()

        # Save model (every batch, so training can be resumed after a crash)
        torch.save(model.state_dict(), MODEL_PATH)