Example #1
import sys

import tqdm
from bpe import Encoder


def run_bpe(params):
    bpe_encoder = Encoder(vocab_size=params.vocab_size,
                          pct_bpe=params.pct_bpe,
                          silent=not params.verbose)
    if params.encoder_load_file:
        sys.stdout.write('Using pre-computed BPE encoder\n')
        sys.stdout.flush()
        bpe_encoder = Encoder.load(params.encoder_load_file)
    else:
        sys.stdout.write('Generating new BPE encoder\n')
        sys.stdout.flush()
        # Fit a fresh encoder on the source text, one line per entry.
        with open(params.source_file) as f:
            text = f.read().split('\n')
        bpe_encoder.fit(text)
        bpe_encoder.save(params.encoder_save_file)

    # BPE-encode the source file line by line, skipping empty results.
    with open(params.source_file) as f_src, \
            open(params.destination_file, 'w') as f_dst:
        for line in tqdm.tqdm(f_src.readlines()):
            tokens = bpe_encoder.tokenize(line.strip())
            encoded_line = ' '.join(tokens).strip()
            if encoded_line != '':
                f_dst.write(encoded_line + '\n')
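
For context, a minimal sketch of how run_bpe might be driven from the command line. The flag names and defaults are illustrative assumptions, not the original project's CLI:

# Hypothetical argparse driver for run_bpe; flags and defaults are
# illustrative only.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--source-file', dest='source_file', required=True)
parser.add_argument('--destination-file', dest='destination_file', required=True)
parser.add_argument('--encoder-load-file', dest='encoder_load_file', default=None)
parser.add_argument('--encoder-save-file', dest='encoder_save_file', default='bpe.json')
parser.add_argument('--vocab-size', dest='vocab_size', type=int, default=10000)
parser.add_argument('--pct-bpe', dest='pct_bpe', type=float, default=0.2)
parser.add_argument('--verbose', action='store_true')
run_bpe(parser.parse_args())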
Example #2
def generate_repr(params):
    import sys

    import torch
    from tripod.io_utils.io import Encodings

    # TripodModel2 and _to_tensor are defined elsewhere in the same module.
    encodings = Encodings()
    encodings.load(params.output + '.encodings')
    bpe_encoder = None
    if params.bpe_encoder is not None:
        from bpe import Encoder as BPEEncoder
        bpe_encoder = BPEEncoder.load(params.bpe_encoder)
    # Restore the trained model and put it in inference mode.
    model = TripodModel2(encodings)
    model.load(params.output + '.bestGST')
    model.to(params.device)
    model.eval()

    with torch.no_grad():
        with open(params.input_file) as f:
            data = f.read()
        if bpe_encoder is not None:
            data = bpe_encoder.tokenize(data)
        # Encode the whole document as a batch of one sequence.
        batch_x = _to_tensor([data], encodings, params.device)
        representation = model.compute_repr(batch_x)
        sys.stdout.write(str(representation.cpu().numpy()) + '\n')
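
A minimal invocation sketch for generate_repr; the attribute names mirror what the function reads, but every value is a placeholder:

# Illustrative call; paths and device are placeholders.
from types import SimpleNamespace

params = SimpleNamespace(
    output='models/tripod',   # prefix for the .encodings / .bestGST files
    bpe_encoder=None,         # or a path to a saved bpe.Encoder
    input_file='document.txt',
    device='cpu')
generate_repr(params)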
Example #3
import gzip
from os.path import basename

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm import tqdm

from bpe import Encoder


# load_config, TimeBufferedCSVReader, GPTModel, CSVDataset and opt_map are
# defined elsewhere in the surrounding project.
def train(data_file, config, blocks, attn_dim, num_heads, nn_dim, dropout,
          tied_weights, optimizer, lr, mb, scale_residuals, block_norm, cpu):
    config = load_config(config)

    context_size = config['dataset']['maxlen']

    if 'vocab' in config['dataset']:
        vocab = Encoder.load(config['dataset']['vocab'])
        config['dataset']['vocab'] = vocab
        vocab_size = config['dataset']['vocab'].vocab_size
        pad_idx = vocab.word_vocab[vocab.PAD]
    else:
        # No stored vocabulary: fall back to a fixed-size vocab and treat
        # index 0 as padding.
        vocab_size = 255
        pad_idx = 0

    window_batches = TimeBufferedCSVReader(data_file, **config['reader'])

    device = torch.device(
        'cuda' if torch.cuda.is_available() and not cpu else 'cpu')

    model = GPTModel(attn_dim,
                     num_heads,
                     nn_dim,
                     blocks,
                     vocab_size,
                     context_size,
                     dropout=dropout,
                     scale_res=scale_residuals,
                     block_norm=block_norm,
                     tied_weights=tied_weights,
                     device=device).to(device)

    opt = opt_map[optimizer](model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss(reduction='none', ignore_index=pad_idx)

    # Per-line evaluation losses are streamed to a gzipped score file.
    scores = gzip.open(basename(data_file) + '.scores.gz', 'wt')

    # Sliding-window scheme: train on the current time window, score the
    # next one, then advance both windows.
    train_window = CSVDataset(next(window_batches), **config['dataset'])
    prev_window = window_batches.cur_window
    for eval_window in window_batches:
        eval_window = CSVDataset(eval_window, **config['dataset'])
        train_data = DataLoader(train_window,
                                shuffle=False,
                                batch_size=mb,
                                num_workers=8)
        eval_data = DataLoader(eval_window,
                               shuffle=False,
                               batch_size=mb,
                               num_workers=8)

        cur_window = window_batches.cur_window

        # train on window
        model.train()
        avg_train_loss = 0.
        batches = 0
        train_iter = tqdm(train_data)
        for b in train_iter:
            opt.zero_grad()
            _, _, seqs, _ = b
            # Next-token prediction: inputs are tokens 0..n-1, targets 1..n.
            x = seqs[:, :-1].to(device)
            y = seqs[:, 1:].to(device)
            y_mask = (y != pad_idx).float().unsqueeze(2).to(device)

            preds = model(x, mask=True, pad_key=pad_idx)

            # Average the per-token losses over each sequence's non-padding
            # positions.
            loss = criterion(preds.transpose(1, 2), y)
            loss = loss.sum(dim=1) / y_mask.sum(dim=1).squeeze()

            loss = loss.mean()
            loss.backward()

            opt.step()

            avg_train_loss += loss.cpu().item()
            batches += 1
            train_iter.set_description(
                f'[TRAIN] window={prev_window} loss={avg_train_loss / batches:.8f}'
            )

        # evaluate on next window
        model.eval()
        avg_eval_loss = 0.
        batches = 0
        eval_iter = tqdm(eval_data)
        for b in eval_iter:
            line_nums, meta, seqs, _ = b
            x = seqs[:, :-1].to(device)
            y = seqs[:, 1:].to(device)
            y_mask = (y != pad_idx).float().unsqueeze(2).to(device)

            preds = model(x, mask=True, pad_key=pad_idx)

            loss = criterion(preds.transpose(1, 2), y)
            loss = loss.sum(dim=1) / y_mask.sum(dim=1).squeeze()

            # Record each line's loss alongside its line number and metadata.
            for line_no, line_meta, line_score in zip(line_nums, meta, loss):
                scores.write(f'{line_no},{line_meta},{line_score}\n')

            loss = loss.mean()

            avg_eval_loss += loss.cpu().item()
            batches += 1
            eval_iter.set_description(
                f'[EVAL]  window={cur_window} loss={avg_eval_loss / batches:.8f}'
            )

        # The window just scored becomes the next training window.
        train_window = eval_window
        prev_window = cur_window

    scores.close()
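
For orientation, a direct call might look like the sketch below. Every value is a placeholder assumption (including the optimizer key and the block_norm setting), and the project helpers named above must be importable:

# Illustrative call; hyperparameters and paths are placeholders.
train(data_file='events.csv', config='config.yaml',
      blocks=6, attn_dim=256, num_heads=8, nn_dim=1024, dropout=0.1,
      tied_weights=True, optimizer='adam', lr=3e-4, mb=64,
      scale_residuals=False, block_norm=True, cpu=False)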
Example #4
def run_tripod(params):
    import sys

    import torch
    from tripod.io_utils.io import Dataset
    from tripod.io_utils.io import Encodings

    # _get_batches, _bpe_decode, _to_tensor and TripodModel2 are defined
    # elsewhere in the same module.
    dataset = Dataset(params.input_file)
    encodings = Encodings()
    encodings.load(params.output + '.encodings')
    model = TripodModel2(encodings)
    model.load(params.output + '.bestGST')
    model.to(params.device)
    model.eval()
    bpe_encoder = None
    if params.bpe_encoder is not None:
        # Re-tokenize the input with the saved BPE encoder instead of the
        # dataset's default tokenization.
        dataset.sequences = []
        dataset.tokens = None
        from bpe import Encoder as BPEEncoder
        bpe_encoder = BPEEncoder.load(params.bpe_encoder)
        with open(params.input_file) as f:
            for line in f:
                dataset.sequences.append(bpe_encoder.tokenize(line))

    batches = _get_batches(dataset, params)
    token_list = ''
    with torch.no_grad():
        for batch in batches:
            for seq in batch:

                # Copy the input token sequence, skip its first token, and
                # truncate at the first '<PAD>'.
                batch_x = list(seq[0])
                tmp = batch_x[1:]
                for ii in range(len(tmp)):
                    if tmp[ii] == '<PAD>':
                        tmp = tmp[:ii]
                        break
                if bpe_encoder is not None:
                    orig = _bpe_decode(tmp, bpe_encoder)
                else:
                    orig = tmp
                batch_x = _to_tensor([batch_x], encodings, params.device)

                pred_sum = model.generate(batch_x)

                val_sum = pred_sum.cpu().numpy()

                for seq_id in range(pred_sum.shape[0]):
                    if bpe_encoder is not None:
                        # Map predicted ids back to tokens, dropping '<UNK>'.
                        token_list_sum = [
                            encodings.token_list[zz] for zz in val_sum[seq_id]
                            if zz != encodings.token2int['<UNK>']
                        ]
                        sys.stdout.write('ORIG: ' + orig + '\n\n')
                        sys.stdout.write(
                            'SUM: ' +
                            _bpe_decode(token_list_sum, bpe_encoder) + '\n\n')
                        # Keep the most recent summary for the output file.
                        token_list = token_list_sum
                        sys.stdout.write('=' * 20)
                        sys.stdout.write('\n\n\n')
                    else:
                        for t_id in range(pred_sum.shape[1]):
                            token_list += encodings.token_list[val_sum[seq_id]
                                                               [t_id]]
                            sys.stdout.write(
                                encodings.token_list[val_sum[seq_id][t_id]])
                            sys.stdout.flush()

                        sys.stdout.write('\n')

    # Write the last generated output; only BPE-decode when an encoder was
    # used (otherwise token_list is already plain text).
    with open(params.output_file, 'w') as f:
        if bpe_encoder is not None:
            f.write(_bpe_decode(token_list, bpe_encoder) + '\n')
        else:
            f.write(token_list + '\n')
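
A sketch of how run_tripod might be invoked; the attribute names mirror what the function reads, the paths are placeholders, and _get_batches may expect further fields on params:

# Illustrative invocation; all paths are placeholders.
from types import SimpleNamespace

params = SimpleNamespace(
    input_file='articles.txt',
    output='models/tripod',          # prefix for .encodings / .bestGST
    output_file='summaries.txt',
    bpe_encoder='models/bpe.json',   # or None to skip BPE decoding
    device='cuda')
run_tripod(params)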
Example #5
def load_vocab(self, vocab_path):
    # Thin wrapper: restore a previously saved bpe.Encoder from disk.
    return Encoder.load(vocab_path)
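
This helper only makes sense with the matching save step. A round-trip sketch using the bpe package's fit/save/load API (corpus and path are placeholders):

# Hypothetical round trip: fit, save, then restore via Encoder.load,
# which is exactly what load_vocab wraps.
from bpe import Encoder

encoder = Encoder(vocab_size=8192, pct_bpe=0.2)
encoder.fit(['one training sentence per list entry',
             'byte pair encoding learns frequent subwords'])
encoder.save('vocab.json')

restored = Encoder.load('vocab.json')
print(restored.tokenize('frequent subwords'))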