def run_bpe(params):
    bpe_encoder = Encoder(vocab_size=params.vocab_size,
                          pct_bpe=params.pct_bpe,
                          silent=not params.verbose)
    if params.encoder_load_file:
        sys.stdout.write('Using pre-computed BPE encoder\n')
        sys.stdout.flush()
        bpe_encoder = Encoder.load(params.encoder_load_file)
    else:
        sys.stdout.write('Generating new BPE encoder\n')
        sys.stdout.flush()
        # fit a fresh encoder on the source corpus and persist it for reuse
        text = open(params.source_file).read().split('\n')
        bpe_encoder.fit(text)
        bpe_encoder.save(params.encoder_save_file)

    # BPE-encode the source file line by line and write the tokenized output
    f_src = open(params.source_file)
    f_dst = open(params.destination_file, 'w')
    for line in tqdm.tqdm(f_src.readlines()):
        line = line.strip()
        tokens = bpe_encoder.tokenize(line)
        encoded_line = ' '.join(tokens).strip()
        if encoded_line != '':
            f_dst.write(encoded_line + '\n')
    f_src.close()
    f_dst.close()
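# Usage sketch for run_bpe (illustrative, not part of the original code): `params`
# can be any object exposing the attributes the function reads, e.g. an
# argparse.Namespace. The attribute names come from the function body; the values
# and file names below are placeholder assumptions.
#
#   from argparse import Namespace
#   params = Namespace(vocab_size=8000, pct_bpe=0.9, verbose=True,
#                      encoder_load_file=None,          # or a path to a saved encoder
#                      encoder_save_file='bpe.model',
#                      source_file='corpus.txt',
#                      destination_file='corpus.bpe.txt')
#   run_bpe(params)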
def generate_repr(params):
    import numpy as np
    from tripod.io_utils.io import Encodings

    encodings = Encodings()
    encodings.load(params.output + '.encodings')

    bpe_encoder = None
    if params.bpe_encoder is not None:
        from bpe import Encoder as BPEEncoder
        bpe_encoder = BPEEncoder.load(params.bpe_encoder)

    model = TripodModel2(encodings)
    model.load(params.output + '.bestGST')
    model.to(params.device)
    model.eval()

    with torch.no_grad():
        with open(params.input_file) as f:
            data = f.read()
        if bpe_encoder is not None:
            data = bpe_encoder.tokenize(data)
        batch_x = [data]
        batch_x = _to_tensor(batch_x, encodings, params.device)
        representation = model.compute_repr(batch_x)
        sys.stdout.write(str(np.asarray(representation.cpu().numpy())) + '\n')
def train(data_file, config, blocks, attn_dim, num_heads, nn_dim, dropout,
          tied_weights, optimizer, lr, mb, scale_residuals, block_norm, cpu):
    config = load_config(config)
    context_size = config['dataset']['maxlen']
    if 'vocab' in config['dataset']:
        vocab = Encoder.load(config['dataset']['vocab'])
        config['dataset']['vocab'] = vocab
        vocab_size = config['dataset']['vocab'].vocab_size
        pad_idx = vocab.word_vocab[vocab.PAD]
    else:
        vocab_size = 255
        pad_idx = 0

    window_batches = TimeBufferedCSVReader(data_file, **config['reader'])
    device = torch.device('cuda' if torch.cuda.is_available() and not cpu else 'cpu')
    model = GPTModel(attn_dim, num_heads, nn_dim, blocks, vocab_size,
                     context_size, dropout=dropout, scale_res=scale_residuals,
                     block_norm=block_norm, tied_weights=tied_weights,
                     device=device).to(device)
    opt = opt_map[optimizer](model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss(reduction='none', ignore_index=pad_idx)
    scores = gzip.open(basename(data_file) + '.scores.gz', 'wt')

    # rolling scheme: fit on the current time window, then score the next one
    train_window = CSVDataset(next(window_batches), **config['dataset'])
    prev_window = window_batches.cur_window
    for eval_window in window_batches:
        eval_window = CSVDataset(eval_window, **config['dataset'])
        train_data = DataLoader(train_window, shuffle=False, batch_size=mb,
                                num_workers=8)
        eval_data = DataLoader(eval_window, shuffle=False, batch_size=mb,
                               num_workers=8)
        cur_window = window_batches.cur_window

        # train on window
        model.train()
        avg_train_loss = 0.
        batches = 0
        train_iter = tqdm(train_data)
        for b in train_iter:
            opt.zero_grad()
            _, _, seqs, _ = b
            x = seqs[:, :-1].to(device)
            y = seqs[:, 1:].to(device)
            y_mask = (y != pad_idx).float().unsqueeze(2).to(device)
            preds = model(x, mask=True, pad_key=pad_idx)
            loss = criterion(preds.transpose(1, 2), y)
            # average the per-token loss over the non-padded positions of each sequence
            loss = loss.sum(dim=1) / y_mask.sum(dim=1).squeeze()
            loss = loss.mean()
            loss.backward()
            opt.step()
            avg_train_loss += loss.cpu().item()
            batches += 1
            train_iter.set_description(
                f'[TRAIN] window={prev_window} loss={avg_train_loss / batches:.8f}')

        # evaluate on next window
        model.eval()
        avg_eval_loss = 0.
        batches = 0
        eval_iter = tqdm(eval_data)
        for b in eval_iter:
            line_nums, meta, seqs, _ = b
            x = seqs[:, :-1].to(device)
            y = seqs[:, 1:].to(device)
            y_mask = (y != pad_idx).float().unsqueeze(2).to(device)
            preds = model(x, mask=True, pad_key=pad_idx)
            loss = criterion(preds.transpose(1, 2), y)
            loss = loss.sum(dim=1) / y_mask.sum(dim=1).squeeze()
            # stream the per-line losses out as scores
            for line_no, line_meta, line_score in zip(line_nums, meta, loss):
                scores.write(f'{line_no},{line_meta},{line_score}\n')
            loss = loss.mean()
            avg_eval_loss += loss.cpu().item()
            batches += 1
            eval_iter.set_description(
                f'[EVAL] window={cur_window} loss={avg_eval_loss / batches:.8f}')

        # the window just scored becomes the next training window
        train_window = eval_window
        prev_window = cur_window
    scores.close()
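# Reading back the per-line scores that train() streams to
# `<basename(data_file)>.scores.gz` (a sketch, not part of the original code):
# each row is written above as "line_no,meta,score", so the parsing below
# assumes the metadata field itself contains no commas.
#
#   import gzip
#   with gzip.open('data.csv.scores.gz', 'rt') as f:   # placeholder file name
#       for row in f:
#           line_no, meta, score = row.rstrip('\n').split(',')
#           print(line_no, float(score))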
def run_tripod(params):
    from tripod.io_utils.io import Dataset
    from tripod.io_utils.io import Encodings

    dataset = Dataset(params.input_file)
    encodings = Encodings()
    encodings.load(params.output + '.encodings')
    model = TripodModel2(encodings)
    model.load(params.output + '.bestGST')
    model.to(params.device)
    model.eval()

    bpe_encoder = None
    if params.bpe_encoder is not None:
        # replace the default tokenization with BPE tokens
        dataset.sequences = []
        dataset.tokens = None
        from bpe import Encoder as BPEEncoder
        bpe_encoder = BPEEncoder.load(params.bpe_encoder)
        for line in open(params.input_file).readlines():
            dataset.sequences.append(bpe_encoder.tokenize(line))

    batches = _get_batches(dataset, params)
    token_list = ''
    with torch.no_grad():
        for batch in batches:
            for seq in batch:
                batch_x = []
                for x in seq[0]:
                    batch_x.append(x)
                # drop the leading token and truncate at the first padding symbol
                tmp = batch_x[1:]
                for ii in range(len(tmp)):
                    if tmp[ii] == '<PAD>':
                        tmp = tmp[:ii]
                        break
                if bpe_encoder is not None:
                    orig = _bpe_decode(tmp, bpe_encoder)
                else:
                    orig = tmp
                batch_x = _to_tensor([batch_x], encodings, params.device)
                pred_sum = model.generate(batch_x)
                val_sum = pred_sum.cpu().numpy()
                for seq_id in range(pred_sum.shape[0]):
                    if bpe_encoder is not None:
                        token_list_sum = [encodings.token_list[zz]
                                          for zz in val_sum[seq_id]
                                          if zz != encodings.token2int['<UNK>']]
                        sys.stdout.write('ORIG: ' + orig + '\n\n')
                        sys.stdout.write('SUM: ' +
                                         _bpe_decode(token_list_sum, bpe_encoder) + '\n\n')
                        token_list = token_list_sum
                        sys.stdout.write('=' * 20)
                        sys.stdout.write('\n\n\n')
                    else:
                        for t_id in range(pred_sum.shape[1]):
                            token_list += encodings.token_list[val_sum[seq_id][t_id]]
                            sys.stdout.write(encodings.token_list[val_sum[seq_id][t_id]])
                            sys.stdout.flush()
                        sys.stdout.write('\n')

    with open(params.output_file, 'w') as f:
        f.write(_bpe_decode(token_list, bpe_encoder) + '\n')
def load_vocab(self, vocab_path):
    return Encoder.load(vocab_path)
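# Usage sketch (assumption: load_vocab belongs to a dataset/reader class; the
# literal path below is only illustrative). Loading the vocabulary this way
# yields the same Encoder object train() uses to look up the padding index:
#
#   vocab = Encoder.load('bpe.vocab')          # equivalent to self.load_vocab('bpe.vocab')
#   pad_idx = vocab.word_vocab[vocab.PAD]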