def tune(n_hidden_in, dropout_in):
    """Train a Model with the given hyperparameters and save a loss curve.

    Trains on MNIST for a fixed number of epochs, evaluates on 1000 test
    examples, and writes a smoothed per-step loss plot whose filename encodes
    the hyperparameters and the final test accuracy.
    """
    print("Tuning with Hidden Size = {}, Dropout = {}".format(
        n_hidden_in, dropout_in))
    model = Model(n_hidden=n_hidden_in, dropout=dropout_in)

    # Fixed training configuration.
    dataset = get_mnist_dataset()
    batch_size = 32
    epochs = 15
    examples = 60000
    shuffled_train = dataset['train'].shuffle(examples,
                                              reshuffle_each_iteration=False)
    loss_history = []

    # Optimization loop: one pass over the shuffled data per epoch.
    for epoch in range(epochs):
        num_correct = 0
        batches = shuffled_train.batch(batch_size).take(
            int(examples / batch_size))
        for step, batch in enumerate(batches):
            x, y = process_batch(batch)
            loss, accuracy = model.train_step(x, y)
            num_correct += accuracy * batch_size
            loss_history.append(loss)
            seen = (step + 1) * batch_size
            # In-place progress line; '\r' rewinds to the start of the line.
            print("Epoch {}, {}/{}".format(epoch, seen, examples) +
                  " " * 10 +
                  "Loss: {}, Accuracy {}".format(loss, num_correct / seen),
                  end='\r',
                  flush=True)
        # Per-epoch checkpointing is intentionally disabled here.
        print()  # Move past the carriage-return progress line.

    # Accuracy on a 1000-example sample of the test set.
    outcomes = []
    for example in dataset['test'].take(1000):
        x, _ = process_example(example)
        prediction = model.predict_class(x)
        outcomes.append(1 if example['label'] == prediction else 0)

    # Low-pass the losses (12-tap moving average) to smoothen the graph.
    smoothed = np.convolve(np.array(loss_history), np.ones((12, )) / 12,
                           mode='valid')
    plt.plot(smoothed)
    plt.savefig("figure-hidden{}-dropout{}-accuracy{}.png".format(
        n_hidden_in, dropout_in, sum(outcomes) / len(outcomes)), dpi=400)
    plt.clf()
def main(_):
    """Extract BERT activations for every token and write them to an HDF5 file.

    Runs the BERT estimator in predict mode over windowed documents and
    stores, per sentence, an array of shape
    (num_sentence_tokens, hidden_size, len(layer_indexes)) under the HDF5 key
    "<doc_key>/<sentence_index>" in FLAGS.output_file.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    # Comma-separated list of transformer layer indices to export,
    # e.g. "--layers=-1,-2".
    layer_indexes = [int(x) for x in FLAGS.layers.split(",")]
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        master=FLAGS.master,
        tpu_config=tf.contrib.tpu.TPUConfig(
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    # FLAGS.input_file may name several jsonlines files joined with ';';
    # each line of each file is one JSON-encoded example.
    json_examples = []
    for file in FLAGS.input_file.split(';'):
        with open(file) as f:
            json_examples.extend(
                (json.loads(jsonline) for jsonline in f.readlines()))

    orig_examples = []
    bert_examples = []
    for i, json_e in enumerate(json_examples):
        e = process_example(json_e, i)
        orig_examples.append(e)
        # bertify() presumably re-tokenizes the example into BERT wordpieces
        # -- verify against the example class's definition.
        bert_examples.append(e.bertify(tokenizer))

    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        layer_indexes=layer_indexes,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_one_hot_embeddings,
    )

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        predict_batch_size=FLAGS.batch_size)

    input_fn = input_fn_builder(examples=bert_examples,
                                window_size=FLAGS.window_size,
                                stride=FLAGS.stride,
                                tokenizer=tokenizer)

    writer = h5py.File(FLAGS.output_file, 'w')
    # Progress bar counts original (pre-wordpiece) tokens across all examples.
    with tqdm(total=sum(len(e.tokens) for e in orig_examples)) as t:
        for result in estimator.predict(input_fn, yield_single_examples=True):
            # "unique_ids" identifies the source document for this window.
            document_index = int(result["unique_ids"])
            bert_example = bert_examples[document_index]
            orig_example = orig_examples[document_index]
            # '/' is the HDF5 group separator, so strip it from doc keys.
            file_key = bert_example.doc_key.replace('/', ':')
            t.update(n=(result['extract_indices'] >= 0).sum())
            for output_index, bert_token_index in enumerate(
                    result['extract_indices']):
                # Negative entries are skipped -- presumably padding/special
                # positions with no aligned original token; verify in
                # input_fn_builder.
                if bert_token_index < 0:
                    continue
                token_index = bert_example.bert_to_orig_map[bert_token_index]
                sentence_index, token_index = orig_example.unravel_token_index(
                    token_index)
                dataset_key = "{}/{}".format(file_key, sentence_index)
                # Lazily create one dataset per sentence, sized to hold every
                # token's vector for each exported layer.
                if dataset_key not in writer:
                    writer.create_dataset(
                        dataset_key,
                        (len(orig_example.sentence_tokens[sentence_index]),
                         bert_config.hidden_size, len(layer_indexes)),
                        dtype=np.float32)
                dset = writer[dataset_key]
                for j, layer_index in enumerate(layer_indexes):
                    # NOTE(review): outputs are fetched by position j, not by
                    # layer_index; layer_index itself is unused here.
                    layer_output = result["layer_output_%d" % j]
                    dset[token_index, :, j] = layer_output[output_index]
    writer.close()
# NOTE(review): this chunk begins mid-way through a training loop -- the
# enclosing function and loop headers (and the definitions of m, data, loss,
# accuracy, batch_size, examples, loss_logs, i, j) are outside this view, so
# the statements below are documented as-is without restructuring.
num_correct += accuracy * batch_size
loss_logs.append(loss)
# In-place progress line; end='\r' rewinds to the start of the line.
print("Epoch {}, {}/{}".format(i, (j + 1) * batch_size, examples) +
      " " * 10 +
      "Loss: {}, Accuracy {}".format(loss,
                                     num_correct / (batch_size * (j + 1))),
      end='\r',
      flush=True)
# Save the model after every epoch
m.save("epoch-{}.pkl".format(i))
print()  # Print empty newline

# Evaluate the model on the test set
results = []
for ex in data['test'].take(1000):
    x, _ = process_example(ex)
    true = ex['label']
    pred = m.predict_class(x)
    results.append(1 if true == pred else 0)
print("Final accuracy on test set: {}".format(sum(results) / len(results)))

# Display a few images with predictions because it's fun
print("\n" * 10)
for ex in data['test'].shuffle(100).take(10):
    x, _ = process_example(ex)
    pred = m.predict_class(x)
    print("Predicted: {}".format(pred))
    # assumes x holds a flattened 28x28 MNIST image -- TODO confirm upstream
    X = x.reshape((28, 28))
    plt.gray()
    # NOTE(review): no plt.show()/savefig follows in this view; display
    # presumably relies on an interactive matplotlib backend -- confirm.
    plt.imshow(X)
# Sanity-check script: load the XLNet SentencePiece model and inspect how it
# tokenizes the CoNLL-style jsonlines corpus.
import sentencepiece as spm
import json
import os
from data import process_example

input_file = ''

sp = spm.SentencePieceProcessor()
sp.Load('./xlnet_cased_L-12_H-768_A-12/spiece.model')

# Report the vocabulary size and a smoke-test encoding.
print(sp.get_piece_size())
print(sp.encode('this is a test'))

# Gather every example from all three corpus splits.
json_examples = []
for split in ['test', 'train', 'dev']:
    with open(os.path.join(input_file, split + '.english.jsonlines')) as f:
        json_examples.extend(json.loads(line) for line in f.readlines())

# Encode each processed example's tokens and print the piece ids.
orig_examples = []
for index, json_example in enumerate(json_examples):
    example = process_example(json_example, index,
                              should_filter_embedded_mentions=True)
    orig_examples.append(example)
    print(sp.encode(' '.join(example.tokens)))
def main(_):
    """Convert coref jsonlines examples to BERT features stored in HDF5.

    Reads {test,train,dev}.english.jsonlines from FLAGS.input_file, tokenizes
    each example with a cased BERT vocabulary, windows the documents, verifies
    that the windowed token indices reassemble into the original sentences,
    and writes the feature arrays to FLAGS.output_file keyed by doc_key.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    # Cased vocabulary: do_lower_case is intentionally False.
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=False)

    # One JSON example per line, across all three corpus splits.
    json_examples = []
    for x in ['test', 'train', 'dev']:
        with open(os.path.join(FLAGS.input_file,
                               x + '.english.jsonlines')) as f:
            json_examples.extend(json.loads(jsonline) for jsonline in f)

    orig_examples = []
    bert_examples = []
    for i, json_e in enumerate(json_examples):
        e = process_example(json_e, i, should_filter_embedded_mentions=True)
        orig_examples.append(e)
        bert_examples.append(e.bertify(tokenizer))

    writer = h5py.File(FLAGS.output_file, 'w')
    for data in tqdm(convert_examples_to_features(bert_examples,
                                                  orig_examples,
                                                  FLAGS.window_size,
                                                  FLAGS.stride, tokenizer),
                     total=len(json_examples)):
        document_index = int(data["doc_index"][0])
        bert_example = bert_examples[document_index]
        # '/' is the HDF5 group separator, so strip it from doc keys.
        dataset_key = bert_example.doc_key.replace('/', ':')

        # FIX: the flattened token list depends only on data['tokens'], but
        # the original rebuilt it inside the innermost per-index loop --
        # quadratic work per document. Build it once per document instead.
        tokens_flattened = sum([list(ts) for ts in data['tokens']], [])

        # Reassemble sentences from the extract indices. Indices appear to be
        # 1-based with 0 as padding (inferred from the `i > 0` guard and the
        # `i - 1` lookup) -- verify against convert_examples_to_features.
        sentences = []
        for sentence_indices in data['extract_sentences']:
            cur_sentence = []
            for i in sentence_indices:
                if i > 0:
                    cur_sentence.append(tokens_flattened[i - 1])
            sentences.append(cur_sentence)

        # Sanity check 1: per-sentence lengths match the original example.
        assert [len(s) for s in sentences] == [
            len(s) for s in orig_examples[document_index].sentence_tokens
        ]
        # Sanity check 2: the reassembled tokens equal exactly those BERT
        # tokens that map back to an original token.
        sentences_flattened = sum(sentences, [])
        expected = [
            t for i, t in enumerate(bert_example.tokens)
            if bert_example.bert_to_orig_map[i] >= 0
        ]
        assert sentences_flattened == expected

        # Persist the feature arrays for this document.
        writer.create_dataset('{}/input_ids'.format(dataset_key),
                              data=data['input_ids'])
        writer.create_dataset('{}/input_mask'.format(dataset_key),
                              data=data['input_mask'])
        writer.create_dataset('{}/segment_ids'.format(dataset_key),
                              data=data['segment_ids'])
        writer.create_dataset('{}/extract_mask'.format(dataset_key),
                              data=data['extract_mask'])
        writer.create_dataset('{}/extract_sentences'.format(dataset_key),
                              data=data['extract_sentences'])
    writer.close()