def train(data_file, config, blocks, attn_dim, num_heads, nn_dim, dropout,
          tied_weights, optimizer, lr, mb, scale_residuals, block_norm, cpu):
    config = load_config(config)
    context_size = config['dataset']['maxlen']
    if 'vocab' in config['dataset']:
        vocab = Encoder.load(config['dataset']['vocab'])
        config['dataset']['vocab'] = vocab
        vocab_size = vocab.vocab_size
        pad_idx = vocab.word_vocab[vocab.PAD]
    else:
        # no vocabulary: fall back to byte-level inputs
        vocab_size = 255
        pad_idx = 0
    window_batches = TimeBufferedCSVReader(data_file, **config['reader'])
    device = torch.device(
        'cuda' if torch.cuda.is_available() and not cpu else 'cpu')
    model = GPTModel(attn_dim, num_heads, nn_dim, blocks, vocab_size,
                     context_size, dropout=dropout, scale_res=scale_residuals,
                     block_norm=block_norm, tied_weights=tied_weights,
                     device=device).to(device)
    opt = opt_map[optimizer](model.parameters(), lr=lr)
    # per-token losses; padding positions are ignored
    criterion = nn.CrossEntropyLoss(reduction='none', ignore_index=pad_idx)
    scores = gzip.open(basename(data_file) + '.scores.gz', 'wt')

    # train on each time window, then evaluate on the window that follows it
    train_window = CSVDataset(next(window_batches), **config['dataset'])
    prev_window = window_batches.cur_window
    for eval_window in window_batches:
        eval_window = CSVDataset(eval_window, **config['dataset'])
        train_data = DataLoader(train_window, shuffle=False, batch_size=mb,
                                num_workers=8)
        eval_data = DataLoader(eval_window, shuffle=False, batch_size=mb,
                               num_workers=8)
        cur_window = window_batches.cur_window

        # train on window
        model.train()
        avg_train_loss = 0.
        batches = 0
        train_iter = tqdm(train_data)
        for b in train_iter:
            opt.zero_grad()
            _, _, seqs, _ = b
            # next-token prediction: inputs are targets shifted by one step
            x = seqs[:, :-1].to(device)
            y = seqs[:, 1:].to(device)
            y_mask = (y != pad_idx).float().unsqueeze(2).to(device)
            preds = model(x, mask=True, pad_key=pad_idx)
            loss = criterion(preds.transpose(1, 2), y)
            # mean loss per sequence over its non-padding tokens
            loss = loss.sum(dim=1) / y_mask.sum(dim=1).squeeze()
            loss = loss.mean()
            loss.backward()
            opt.step()
            avg_train_loss += loss.cpu().item()
            batches += 1
            train_iter.set_description(
                f'[TRAIN] window={prev_window} loss={avg_train_loss / batches:.8f}')

        # evaluate on next window
        model.eval()
        avg_eval_loss = 0.
        batches = 0
        eval_iter = tqdm(eval_data)
        with torch.no_grad():  # no gradients needed while scoring
            for b in eval_iter:
                line_nums, meta, seqs, _ = b
                x = seqs[:, :-1].to(device)
                y = seqs[:, 1:].to(device)
                y_mask = (y != pad_idx).float().unsqueeze(2).to(device)
                preds = model(x, mask=True, pad_key=pad_idx)
                loss = criterion(preds.transpose(1, 2), y)
                loss = loss.sum(dim=1) / y_mask.sum(dim=1).squeeze()
                # write one score per input line
                for line_no, line_meta, line_score in zip(line_nums, meta, loss):
                    scores.write(f'{line_no},{line_meta},{line_score}\n')
                loss = loss.mean()
                avg_eval_loss += loss.cpu().item()
                batches += 1
                eval_iter.set_description(
                    f'[EVAL] window={cur_window} loss={avg_eval_loss / batches:.8f}')

        # the window just scored becomes the next training window
        train_window = eval_window
        prev_window = cur_window
    scores.close()
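
# A usage sketch for train() above (all hyperparameter values hypothetical;
# load_config, GPTModel, TimeBufferedCSVReader, CSVDataset and opt_map come
# from the surrounding project):
#
#   train('auth_events.csv', 'config.yml', blocks=4, attn_dim=256, num_heads=4,
#         nn_dim=1024, dropout=0.1, tied_weights=True, optimizer='adam',
#         lr=3e-4, mb=64, scale_residuals=False, block_norm=True, cpu=False)
#
# Because each window is scored before it is trained on, every score written
# to <data_file>.scores.gz is out-of-sample.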
def run_tripod(params):
    from tripod.io_utils.io import Dataset
    from tripod.io_utils.io import Encodings

    dataset = Dataset(params.input_file)
    encodings = Encodings()
    encodings.load(params.output + '.encodings')
    model = TripodModel2(encodings)
    model.load(params.output + '.bestGST')
    model.to(params.device)
    model.eval()

    # optionally re-tokenize the input with a trained BPE encoder
    bpe_encoder = None
    if params.bpe_encoder is not None:
        dataset.sequences = []
        dataset.tokens = None
        from bpe import Encoder as BPEEncoder
        bpe_encoder = BPEEncoder.load(params.bpe_encoder)
        for line in open(params.input_file).readlines():
            dataset.sequences.append(bpe_encoder.tokenize(line))

    batches = _get_batches(dataset, params)
    # token_list holds the generated summary: a token list in the BPE branch,
    # a plain string otherwise
    token_list = ''
    with torch.no_grad():
        for batch in batches:
            for seq in batch:
                batch_x = list(seq[0])
                # strip the leading token and everything from the first <PAD>
                tmp = batch_x[1:]
                for ii in range(len(tmp)):
                    if tmp[ii] == '<PAD>':
                        tmp = tmp[:ii]
                        break
                if bpe_encoder is not None:
                    orig = _bpe_decode(tmp, bpe_encoder)
                else:
                    orig = tmp
                batch_x = _to_tensor([batch_x], encodings, params.device)
                pred_sum = model.generate(batch_x)
                val_sum = pred_sum.cpu().numpy()
                for seq_id in range(pred_sum.shape[0]):
                    if bpe_encoder is not None:
                        token_list_sum = [
                            encodings.token_list[zz] for zz in val_sum[seq_id]
                            if zz != encodings.token2int['<UNK>']
                        ]
                        sys.stdout.write('ORIG: ' + orig + '\n\n')
                        sys.stdout.write(
                            'SUM: ' + _bpe_decode(token_list_sum, bpe_encoder) +
                            '\n\n')
                        token_list = token_list_sum
                        sys.stdout.write('=' * 20)
                        sys.stdout.write('\n\n\n')
                    else:
                        for t_id in range(pred_sum.shape[1]):
                            token_list += encodings.token_list[val_sum[seq_id][t_id]]
                            sys.stdout.write(
                                encodings.token_list[val_sum[seq_id][t_id]])
                        sys.stdout.flush()
                        sys.stdout.write('\n')

    with open(params.output_file, 'w') as f:
        f.write(_bpe_decode(token_list, bpe_encoder) + '\n')
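
# run_tripod() expects an argparse-style namespace; a minimal sketch, assuming
# only the attribute names read above (all paths hypothetical; _get_batches()
# may read further fields such as a batch size):
#
#   from types import SimpleNamespace
#   params = SimpleNamespace(
#       input_file='input.txt',     # text to summarize
#       output='model/tripod',      # prefix for the .encodings / .bestGST files
#       output_file='summary.txt',  # where the decoded summary is written
#       bpe_encoder='bpe.json',     # or None to use the dataset's own tokens
#       device='cuda')
#   run_tripod(params)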
def prepare_data(data_path, freq_dist_path, embedding_path,
                 vocabulary_size=10000, embedding_size=200, predict=False,
                 max_length=None, use_bpe=False):
    max_length_provided = max_length is not None
    separator = ","
    if data_path.endswith("tsv"):
        separator = "\t"

    # construct vocabulary
    vocabulary = None
    if not use_bpe:
        with open(freq_dist_path, "rb") as freq_dist_file:
            freq_dist = pickle.load(freq_dist_file)
        vocabulary = {"<pad>": 0, "<unk>": 1, "<user>": 2, "<url>": 3}
        offset = len(vocabulary)
        most_common = freq_dist.most_common(vocabulary_size - offset)
        # start indices after the special tokens so <user> and <url>
        # keep their reserved slots
        vocabulary.update({w[0]: i + offset for i, w in enumerate(most_common)})
        print("Constructed vocabulary of size {}.".format(vocabulary_size))

    # load data and convert it to indices
    data = []
    labels = []
    if not max_length_provided:
        max_length = 0
    with open(data_path, "r") as data_file:
        lines = data_file.readlines()
        for i, line in enumerate(lines):
            if not predict:
                tweet_id, sentiment, tweet = line.split(separator)
            else:
                tweet_id, tweet = line.split(separator)
            data.append(tweet.strip())
            if not predict:
                labels.append(int(sentiment))
    print("Loaded data ({} tweets).".format(len(data)))

    if not use_bpe:
        new_data = []
        for tweet in data:
            words = tweet.split()
            indices = []
            for w_idx, w in enumerate(words):
                if max_length_provided and w_idx == max_length:
                    break
                index = vocabulary.get(w)
                if index is not None:
                    indices.append(index)
                else:
                    indices.append(vocabulary.get("<unk>"))
            if not max_length_provided and len(indices) > max_length:
                max_length = len(indices)
            new_data.append(indices)
        data = new_data
        pad_value = vocabulary.get("<pad>")
    else:
        print("Training BPE encoder...")
        encoder = Encoder(vocab_size=vocabulary_size,
                          required_tokens=["<user>", "<url>"],
                          UNK="<unk>", PAD="<pad>")
        encoder.fit(data)
        vocabulary = encoder.vocabs_to_dict()
        print("Constructed BPE vocabulary of size {}.".format(vocabulary_size))
        new_data = []
        for tweet in data:
            indices = list(next(encoder.transform([tweet])))
            if not max_length_provided and len(indices) > max_length:
                max_length = len(indices)
            new_data.append(indices)
        data = new_data
        pad_value = encoder.word_vocab[encoder.PAD]

    # load embedding vectors
    embedding_vectors = {}
    if not use_bpe:
        with open(embedding_path, "r") as glove_file:
            for i, line in enumerate(glove_file):
                tokens = line.split()
                word = tokens[0]
                # compare against None explicitly: index 0 (<pad>) is falsy
                if vocabulary.get(word) is not None:
                    vector = [float(e) for e in tokens[1:]]
                    embedding_vectors[word] = np.array(vector)
        print("Found {} GLOVE vectors for vocabulary of size {}.".format(
            len(embedding_vectors), len(vocabulary)))
        print("Loaded embedding vectors ({} dimensions).".format(embedding_size))

    # construct embedding matrix
    embedding_matrix = np.random.randn(vocabulary_size, embedding_size) * 0.01
    if not use_bpe:
        for word, i in list(vocabulary.items()):
            embedding_vector = embedding_vectors.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
        print("Constructed embedding matrix.")

    # pad data (might want to change max_length to be CLI argument)
    data = pad_sequences(data, maxlen=max_length, padding="post",
                         value=pad_value)
    if not predict:
        labels = np.array(labels)
    print("Padded sequences to length {}.".format(max_length))

    if not predict:
        return vocabulary, data, labels, embedding_matrix
    return vocabulary, data, embedding_matrix
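
# A usage sketch for prepare_data() (file names hypothetical). With
# predict=False it also returns labels; passing the training max_length at
# predict time keeps both splits padded to the same width:
#
#   vocab, x, y, emb = prepare_data('train.tsv', 'freq_dist.pkl',
#                                   'glove.twitter.27B.200d.txt')
#   vocab_p, x_p, emb_p = prepare_data('test.tsv', 'freq_dist.pkl',
#                                      'glove.twitter.27B.200d.txt',
#                                      predict=True, max_length=x.shape[1])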
def load_vocab(self, vocab_path):
    return Encoder.load(vocab_path)
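
# Round-trip sketch for the vocabulary consumed by load_vocab(), assuming the
# bpe package's JSON save()/load() helpers (path hypothetical):
#
#   encoder = Encoder(vocab_size=8192, pct_bpe=0.8)
#   encoder.fit(corpus_lines)               # corpus_lines: list of strings
#   encoder.save('vocab.bpe.json')          # persists vocab + constructor kwargs
#   encoder = Encoder.load('vocab.bpe.json')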
def construct_vocabulary(data: Union[str, List[Union[List[str], str]]],
                         vocabulary_size: int = 10000,
                         use_bpe: bool = False,
                         bpe_percentage: float = 0.2,
                         vocabulary_save_file: str = None) -> dict:
    counts = None
    if type(data) == str and ".pkl" in data:
        with open(data, "rb") as f:
            counts = pickle.load(f)
        if type(counts) != nltk.FreqDist:
            logger.info("Loaded vocabulary from file.")
            return counts
        elif use_bpe:
            logger.error("Cannot construct BPE vocabulary from frequency distribution file.")
            raise ValueError("Cannot construct BPE vocabulary from frequency distribution file.")
        else:
            logger.info("Constructing vocabulary from frequency distribution file.")
    elif not use_bpe:
        logger.info("Constructing vocabulary from data.")
        if type(data) == str:
            separator = ","
            if data.endswith("tsv"):
                separator = "\t"
            # load data from file
            new_data = []
            with open(data, "r") as data_file:
                lines = data_file.readlines()
                for i, line in enumerate(lines):
                    _, _, tweet = line.split(separator)
                    new_data.append(TOKENIZER.tokenize(tweet))
            data = new_data
        elif type(data[0]) != list:
            data = [TOKENIZER.tokenize(t) for t in data]
        all_words = []
        for tweet in data:
            all_words.extend(tweet)
        counts = nltk.FreqDist(all_words)

    if use_bpe:
        logger.info("Training BPE encoder...")
        encoder = Encoder(vocab_size=vocabulary_size,
                          pct_bpe=bpe_percentage,
                          word_tokenizer=lambda x: TOKENIZER.tokenize(x),
                          required_tokens=["<start>", "<extract>", "<user>", "<url>"],
                          UNK="<unk>", PAD="<pad>")
        encoder.fit(data)
        vocabulary = encoder.vocabs_to_dict()
        logger.info("Constructed BPE vocabulary of size {}.".format(vocabulary_size))
    else:
        vocabulary = {"<pad>": 0, "<unk>": 1, "<start>": 2, "<extract>": 3}
        initial_vocab_length = len(vocabulary)
        most_common = counts.most_common(vocabulary_size - initial_vocab_length)
        vocabulary.update({w[0]: i + initial_vocab_length
                           for i, w in enumerate(most_common)})
        logger.info("Constructed embedding vocabulary of size {}.".format(len(vocabulary)))

    if vocabulary_save_file:
        if not vocabulary_save_file.endswith(".pkl"):
            vocabulary_save_file += ".pkl"
        with open(vocabulary_save_file, "wb") as f:
            pickle.dump(vocabulary, f)
        logger.info("Saved vocabulary to \"{}\".".format(vocabulary_save_file))
    return vocabulary
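
# A usage sketch for construct_vocabulary(): it accepts a CSV/TSV path, raw
# strings, or pre-tokenized token lists, and caches the result when a save
# file is given (file names hypothetical):
#
#   vocab = construct_vocabulary('train.tsv', vocabulary_size=10000,
#                                vocabulary_save_file='vocab.pkl')
#   bpe_vocab = construct_vocabulary(['some tweet', 'another tweet'],
#                                    use_bpe=True, bpe_percentage=0.5)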
from bpe import Encoder
from tqdm import tqdm

sequences = []
with open("../../datasets/multi30k/train.en") as f:
    for line in f:
        sequences.append(line.strip())

# reference word-level lengths, for comparison with BPE lengths
ref = [x.split() for x in sequences]
ref_len = [len(x) for x in ref]
print("REF:", max(ref_len))


def parse(x):
    return x.split()


enc = Encoder(4096, ngram_min=1, ngram_max=2, pct_bpe=0.8, silent=True,
              word_tokenizer=parse)
enc.fit(sequences)

# check that the word-level and BPE-level vocabularies do not collide
base = enc.vocabs_to_dict()
duplicate_keys = []
for key in base['byte_pairs']:
    if key in base['words']:
        duplicate_keys.append(key)

if len(duplicate_keys) > 0:
    print("got duplicates:")
    print(duplicate_keys)
else:
    print("NO DUPLICATES! :)")

keybase = {**base['words'], **base['byte_pairs']}
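
# Sanity-check sketch: keybase maps both word-level and BPE-level tokens to
# ids, so a sentence should round-trip through the fitted encoder (assumes
# the duplicate check above passed):
#
#   ids = next(enc.transform([sequences[0]]))
#   print(next(enc.inverse_transform([ids])))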
from bpe import Encoder

# Generated with http://pythonpsum.com
test_corpus = '''
    Object raspberrypi functools dict kwargs. Gevent raspberrypi functools. Dunder raspberrypi decorator dict didn't lambda zip import pyramid, she lambda iterate?
    Kwargs raspberrypi diversity unit object gevent. Import fall integration decorator unit django yield functools twisted. Dunder integration decorator he she future. Python raspberrypi community pypy. Kwargs integration beautiful test reduce gil python closure. Gevent he integration generator fall test kwargs raise didn't visor he itertools...
    Reduce integration coroutine bdfl he python. Cython didn't integration while beautiful list python didn't nit!
    Object fall diversity 2to3 dunder script. Python fall for: integration exception dict kwargs dunder pycon. Import raspberrypi beautiful test import six web. Future integration mercurial self script web. Return raspberrypi community test she stable.
    Django raspberrypi mercurial unit import yield raspberrypi visual rocksdahouse. Dunder raspberrypi mercurial list reduce class test scipy helmet zip?
'''

encoder = Encoder(200, pct_bpe=0.88)  # params chosen for demonstration purposes
encoder.fit(test_corpus.split('\n'))

example = "Vizzini: He didn't fall? INCONCEIVABLE!"
print(encoder.tokenize(example))
# ['__sow', 'vi', 'z', 'zi', 'ni', '__eow', '__sow', ':', '__eow', 'he', 'didn', "'", 't', 'fall', '__sow', '?', '__eow', '__sow', 'in', 'co', 'n', 'ce', 'iv', 'ab', 'le', '__eow', '__sow', '!', '__eow']
print(next(encoder.transform([example])))
# [26, 108, 79, 104, 72, 24, 26, 117, 24, 9, 11, 8, 12, 10, 26, 90, 24, 26, 154, 56, 37, 149, 80, 169, 84, 24, 26, 156, 24]
print(next(encoder.inverse_transform(encoder.transform([example]))))
# vizzini : he didn ' t fall ? inconceivable !
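
# The fitted encoder can also be serialized and rebuilt without refitting, via
# the dict round-trip used elsewhere in this file (a sketch; custom word
# tokenizers are not restored by from_dict and must be reattached by hand):
#
#   state = encoder.vocabs_to_dict()
#   restored = Encoder.from_dict(state)
#   assert restored.tokenize(example) == encoder.tokenize(example)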
def read_text_data_bpe(num_samples=3000, data_path="data/mt_corpus_ts.txt",
                       vocab_path='data/vocab_bpe.txt',
                       word_dropout_ratio=0.75, simple_subword=False):
    """Read a BPE-tokenized corpus and build masked seq2seq training arrays.

    :param num_samples: number of lines to keep
    :param data_path: path to the raw corpus
    :param vocab_path: path to the BPE vocabulary (with a companion .dict file)
    :param word_dropout_ratio: probability of masking each input token
    :param simple_subword: passed through to remove_seow()
    :return: max_encoder_seq_length, char2idx, idx2char, encoder_input_data,
             decoder_input_data, decoder_output_data
    """
    rng = random.Random(88)
    # vectorize the data
    timesteps_max = 100
    input_texts = []
    char2idx, idx2char = load_vocab_bpe(vocab_path)
    with open(vocab_path + '.dict', encoding='utf-8') as fin:
        bpe_dict = json.loads(fin.read())
    encoder = Encoder.from_dict(bpe_dict)
    # from_dict does not restore the tokenizer, so reattach one
    encoder.word_tokenizer = WhitespaceTokenizer().tokenize

    lines = []
    print('read data from ', data_path)
    line_num = 0
    with open(data_path, encoding='utf-8') as fin:
        for line in fin:
            line_num += 1
            if line_num == 1:  # skip the header line
                continue
            if line_num % 100000 == 0:
                print(line_num)
            if line_num > num_samples + 200:
                break
            zh = line
            terms = zh.split()
            if len(terms) <= timesteps_max - 2:
                terms = encoder.tokenize(zh)
                # drop the __sow/__eow marker tokens
                terms = remove_seow(terms, encoder, simple_subword)
                if len(terms) <= timesteps_max - 2:
                    lines.append(terms)

    for line in lines[:min(num_samples, len(lines) - 1)]:
        input_text = line
        input_text.append("<eos>")
        input_texts.append(input_text)

    num_encoder_tokens = max(idx2char.keys()) + 1
    max_encoder_seq_length = timesteps_max
    print("Number of samples:", len(input_texts))
    print("Number of unique input tokens:", num_encoder_tokens)
    print("Max sequence length for inputs:", max_encoder_seq_length)

    encoder_input_data = np.zeros((len(input_texts), max_encoder_seq_length), dtype="int32")
    decoder_input_data = np.zeros((len(input_texts), max_encoder_seq_length), dtype="int32")
    decoder_output_data = np.zeros((len(input_texts), max_encoder_seq_length), dtype="int32")
    for i, input_text in enumerate(input_texts):
        decoder_input_data[i, 0] = char2idx["<sos>"]
        for t, char in enumerate(input_text):
            idx = char2idx[char] if char in char2idx else char2idx["<unk>"]
            idx_mask = idx
            if rng.random() < word_dropout_ratio:
                # TODO: add a dedicated <mask> token instead of reusing <unk>
                if rng.random() < 0.9:
                    idx_mask = char2idx["<unk>"]
                else:
                    # 10% of the time, replace with a random word
                    idx_mask = rng.randint(0, num_encoder_tokens - 1)
            encoder_input_data[i, t] = idx_mask
            decoder_output_data[i, t] = idx
            decoder_input_data[i, t + 1] = idx_mask
    return (max_encoder_seq_length, char2idx, idx2char,
            encoder_input_data, decoder_input_data, decoder_output_data)
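
# A usage sketch for read_text_data_bpe() (paths are the signature defaults).
# The three arrays line up as (masked encoder input, shifted masked decoder
# input, clean targets) for teacher-forced denoising training:
#
#   (maxlen, char2idx, idx2char,
#    enc_in, dec_in, dec_out) = read_text_data_bpe(
#        num_samples=3000,
#        data_path='data/mt_corpus_ts.txt',
#        vocab_path='data/vocab_bpe.txt',
#        word_dropout_ratio=0.75)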