def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)
  output_files = FLAGS.output_file.split(",")
  writers = [tf.python_io.TFRecordWriter(out) for out in output_files]
  rng = random.Random(FLAGS.random_seed)
  tokenizer = tokenization.WordpieceTokenizer(
      vocab=tokenization.load_vocab(FLAGS.vocab_file))
  estimator = get_embedding_estimator()
  sample = get_sample(FLAGS.input_sentence_file, FLAGS.input_mapping_file, rng,
                      FLAGS.sample_size)
  # Process the sample in slices of 3000 sentences so instances are created
  # and written incrementally rather than all at once.
  batches = list(range(0, len(sample), 3000)) + [len(sample)]
  for brange in zip(batches, batches[1:]):
    batch_sample = sample[brange[0]:brange[1]]
    instances = create_training_instances(
        FLAGS.input_sentence_file, FLAGS.input_mapping_file, tokenizer,
        FLAGS.max_seq_length, rng, FLAGS.do_lower_case, batch_sample)
    write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
                                    writers, estimator)
  for writer in writers:
    writer.close()
def test_wordpiece_tokenizer(self):
  vocab_tokens = [
      "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
      "##ing",
  ]
  vocab = {}
  for (i, token) in enumerate(vocab_tokens):
    vocab[token] = i
  tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)

  self.assertAllEqual(tokenizer.tokenize(""), [])

  self.assertAllEqual(
      tokenizer.tokenize("unwanted running"),
      ["un", "##want", "##ed", "runn", "##ing"],
  )

  self.assertAllEqual(tokenizer.tokenize("unwantedX running"),
                      ["[UNK]", "runn", "##ing"])
def test_wordpiece_tokenizer(self):
  vocab_tokens = [
      "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
      "##ing"
  ]
  vocab = {token: i for i, token in enumerate(vocab_tokens)}
  tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)

  self.assertAllEqual(tokenizer.tokenize(""), [])
  self.assertAllEqual(tokenizer.tokenize("unwanted running"),
                      ["un", "##want", "##ed", "runn", "##ing"])
  self.assertAllEqual(tokenizer.tokenize("unwantedX running"),
                      ["[UNK]", "runn", "##ing"])
  print('test_wordpiece_tokenizer',
        tokenizer.tokenize("unwa wanted warunn runned wawant want"))
def test_wordpiece_tokenizer(self):
  vocab_tokens = [
      '[UNK]', '[CLS]', '[SEP]', 'want', '##want', '##ed', 'wa', 'un', 'runn',
      '##ing'
  ]
  vocab = {}
  for (i, token) in enumerate(vocab_tokens):
    vocab[token] = i
  tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)

  self.assertAllEqual(tokenizer.tokenize(''), [])
  self.assertAllEqual(tokenizer.tokenize('unwanted running'),
                      ['un', '##want', '##ed', 'runn', '##ing'])
  self.assertAllEqual(tokenizer.tokenize('unwantedX running'),
                      ['[UNK]', 'runn', '##ing'])
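# A standalone sketch of the greedy longest-match-first behaviour the tests
# above exercise. It assumes BERT's tokenization.py is importable as
# `tokenization`; the toy vocabulary below is illustrative only.
import tokenization

vocab_tokens = ["[UNK]", "want", "##want", "##ed", "un", "runn", "##ing"]
vocab = {token: i for i, token in enumerate(vocab_tokens)}
tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)

# "unwanted" is consumed greedily from the left: "un" + "##want" + "##ed".
print(tokenizer.tokenize("unwanted running"))
# ['un', '##want', '##ed', 'runn', '##ing']

# A word that cannot be fully covered by the vocabulary collapses to [UNK].
print(tokenizer.tokenize("unwantedX"))
# ['[UNK]']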
def __init__(self, vocab_file, do_lower_case=False):
  self.vocab = tokenization.load_vocab(vocab_file)
  self.inv_vocab = {v: k for k, v in self.vocab.items()}
  self.wordpiece_tokenizer = tokenization.WordpieceTokenizer(vocab=self.vocab)
  self.do_lower_case = do_lower_case
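# A minimal round-trip sketch of what the wrapper above enables. The file
# name 'vocab.txt' and the free-standing usage (outside the class) are
# assumptions; only load_vocab, WordpieceTokenizer and the inv_vocab idiom
# come from the snippet itself.
import tokenization

vocab = tokenization.load_vocab('vocab.txt')
inv_vocab = {v: k for k, v in vocab.items()}
wordpiece_tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)

pieces = wordpiece_tokenizer.tokenize("unwanted running")
ids = [vocab[p] for p in pieces]        # tokens -> ids via the forward vocab
restored = [inv_vocab[i] for i in ids]  # ids -> tokens via inv_vocab
assert restored == pieces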
# counts = pair_counts(lines, maps)
# most_frequent_pairs(counts)

# English-Portuguese word pairs whose sentence contexts will be embedded.
pairs = [
    ('year', 'ano'),
    ('wanted', 'queria'),
    ('question', 'questão'),
    ('I', 'eu'),
    ('opportunity', 'oportunidade'),
    ('problem', 'problema'),
    ('love', 'amor'),
]
K = 10
tokenizer = tokenization.WordpieceTokenizer(vocab=tokenization.load_vocab(
    "/home/arthur/Projects/bert/models/multi_aligned_cased_L-12_H-768_A-12/vocab.txt"
))
rng = random.Random(1234)
sample = set(get_sample(sent_path, map_path, rng, 50000))
lines = [l for i, l in enumerate(lines) if i in sample]
sents = get_sentences(lines, pairs, tokenizer, k=K)
embs = get_embeddings(sents)

from sklearn.manifold import TSNE

# Flatten the embeddings to (n_points, hidden_size) and project to 2-D.
X = embs.reshape((-1, embs.shape[-1]))
X_embedded = TSNE(n_components=2, perplexity=20,
                  metric='cosine').fit_transform(X)
X_embedded.shape
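# A possible follow-up to the t-SNE snippet above (the original only inspects
# X_embedded.shape): scatter-plot the 2-D projection. matplotlib is an
# assumption and is not used in the original.
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 8))
plt.scatter(X_embedded[:, 0], X_embedded[:, 1], s=5)
plt.title('t-SNE of sentence embeddings')
plt.show()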
def __init__(self):
    vocab_file = 'vocab.txt'
    vocab = tokenization.load_vocab(vocab_file=vocab_file)
    tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)

    # --- Training data ---
    path = 'train_processed.txt'
    train_file = open(path, 'r', encoding='utf-8')
    lines = train_file.read().split('\n')

    # max_length is measured in characters of the raw sentence, which
    # over-allocates relative to the number of WordPiece ids per example.
    max_length = 0
    for i in range(len(lines)):
        TK = lines[i].split(' \t')
        if max_length < len(TK[0]):
            max_length = len(TK[0])
    max_length += 1

    self.input_ids = np.zeros(shape=[len(lines), max_length], dtype=np.int32)
    self.input_mask = np.zeros(shape=[len(lines), max_length], dtype=np.int32)
    self.label = np.zeros(shape=[len(lines)], dtype=np.int32)

    for i in range(len(lines) - 1):
        TK = lines[i].split(' \t')
        if len(TK) != 2:
            TK = lines[i].split('\t')
        sentence = TK[0]
        token = tokenizer.tokenize(sentence)
        tk_ids = tokenization.convert_tokens_to_ids(vocab=vocab, tokens=token)
        # Position 0 is reserved for [CLS]; WordPiece ids fill positions 1..n.
        for j in range(len(tk_ids)):
            self.input_ids[i, j + 1] = tk_ids[j]
            self.input_mask[i, j + 1] = 1
        self.input_ids[i, 0] = tokenization.convert_tokens_to_ids(
            vocab=vocab, tokens=['[CLS]'])[0]
        self.input_mask[i, 0] = 1
        self.label[i] = int(TK[1])

    # --- Test data (same procedure) ---
    path = 'test_processed.txt'
    test_file = open(path, 'r', encoding='utf-8')
    lines = test_file.read().split('\n')

    max_length = 0
    for i in range(len(lines)):
        TK = lines[i].split(' \t')
        if max_length < len(TK[0]):
            max_length = len(TK[0])
    print(max_length)
    max_length += 1

    self.test_input_ids = np.zeros(shape=[len(lines), max_length],
                                   dtype=np.int32)
    self.test_input_ids_masking = np.zeros(shape=[len(lines), max_length],
                                           dtype=np.int32)
    self.test_label = np.zeros(shape=[len(lines)], dtype=np.int32)

    for i in range(len(lines) - 1):
        TK = lines[i].split(' \t')
        if len(TK) != 2:
            TK = lines[i].split('\t')
        sentence = TK[0]
        token = tokenizer.tokenize(sentence)
        tk_ids = tokenization.convert_tokens_to_ids(vocab=vocab, tokens=token)
        for j in range(len(tk_ids)):
            self.test_input_ids[i, j + 1] = tk_ids[j]
            self.test_input_ids_masking[i, j + 1] = 1
        self.test_input_ids[i, 0] = tokenization.convert_tokens_to_ids(
            vocab=vocab, tokens=['[CLS]'])[0]
        self.test_input_ids_masking[i, 0] = 1
        self.test_label[i] = int(TK[1])

    # Batching state: a pre-shuffled index array and running batch cursors.
    self.Batch_Size = 8
    self.random_idx = np.array(range(self.label.shape[0]), dtype=np.int32)
    np.random.shuffle(self.random_idx)
    self.Batch_Idx = 0
    self.Test_Batch_Idx = 0
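# A hypothetical next_batch method for the loader above; the original
# constructor only initializes Batch_Size, random_idx and Batch_Idx, so this
# sketch of how batches might be drawn is an assumption, not the original
# project's method.
def next_batch(self):
    start = self.Batch_Idx * self.Batch_Size
    end = start + self.Batch_Size
    idx = self.random_idx[start:end]
    if end >= self.random_idx.shape[0]:
        # Epoch exhausted: reshuffle and restart on the next call.
        np.random.shuffle(self.random_idx)
        self.Batch_Idx = 0
    else:
        self.Batch_Idx += 1
    return self.input_ids[idx], self.input_mask[idx], self.label[idx]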