def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    permute_number = options.get('permute_number', 4)

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    elif options.get('multidirectional'):
        data = MultidirectionalLMDataset(test_prefix, vocab, permute_number,
                                         **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    test(options, ckpt_file, data, batch_size=args.batch_size,
         permute_number=permute_number)
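# A minimal sketch of a CLI entry point for the `main` above, assuming
# bilm-tf-style flag names; the exact arguments and defaults are an
# assumption, not the project's actual driver.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Compute test perplexity')
    parser.add_argument('--save_dir', help='Location of checkpoint files')
    parser.add_argument('--vocab_file', help='Vocabulary file')
    parser.add_argument('--test_prefix', help='Prefix for test files')
    parser.add_argument('--batch_size', type=int, default=256,
                        help='Batch size')
    main(parser.parse_args())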
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    # vocab = load_vocab(args.vocab_file, max_word_length)
    vocab = load_vocab(args.vocab_file,
                       args.stroke_vocab_file,
                       50)  # Winfred stroke_vocab

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    test(options, ckpt_file, data, batch_size=args.batch_size)
def main(args):
    if args.gpu is not None:
        n_gpus = len(args.gpu)
        set_gpu(args.gpu)
    else:
        n_gpus = 0

    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_files, max_word_length=max_word_length,
                       polyglot=True)

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    data = BidirectionalPolyglotLMDataset(test_prefix, vocab, **kwargs)

    test(options, ckpt_file, data, batch_size=args.batch_size)
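# A minimal sketch of the `set_gpu` helper assumed above; presumably it pins
# the visible CUDA devices before TensorFlow initializes. This is an
# assumption, not the project's actual implementation.
import os

def set_gpu(gpu_ids):
    # gpu_ids: list of integer device ids, e.g. [0, 1]
    os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(g) for g in gpu_ids)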
def test_train_bilm_chars(self):
    vocab, data, options = self._get_vocab_data_options(True, True)
    train(options, data, 1, self.tmp_dir, self.tmp_dir)

    # now test
    tf.reset_default_graph()
    options, ckpt_file = load_options_latest_checkpoint(self.tmp_dir)
    data_test, vocab_test = self._get_data(True, True, True)
    perplexity = test(options, ckpt_file, data_test, batch_size=1)
    self.assertTrue(perplexity < 20.0)
def test_train_skip_connections(self):
    bidirectional = True
    use_chars = False
    vocab, data, options = self._get_vocab_data_options(
        bidirectional, use_chars)
    options['lstm']['use_skip_connections'] = True
    train(options, data, 1, self.tmp_dir, self.tmp_dir)

    # now test
    tf.reset_default_graph()
    options, ckpt_file = load_options_latest_checkpoint(self.tmp_dir)
    data_test, vocab_test = self._get_data(
        bidirectional, use_chars, test=True)
    perplexity = test(options, ckpt_file, data_test, batch_size=1)
    self.assertTrue(perplexity < 20.0)
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    test(options, ckpt_file, data, batch_size=args.batch_size)
def test_train_shared_softmax_embedding(self):
    bidirectional = True
    use_chars = False
    vocab, data, options = self._get_vocab_data_options(
        bidirectional, use_chars, share_embedding_softmax=True)
    train(options, data, 1, self.tmp_dir, self.tmp_dir)

    # now test
    tf.reset_default_graph()
    options, ckpt_file = load_options_latest_checkpoint(self.tmp_dir)
    data_test, vocab_test = self._get_data(
        bidirectional, use_chars, test=True)
    perplexity = test(options, ckpt_file, data_test, batch_size=1)
    self.assertTrue(perplexity < 20.0)
def top_level(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)
    vocab_file = os.path.join(args.save_dir, 'vocabs.txt')

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(vocab_file, max_word_length)

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    test(options, ckpt_file, data, batch_size=args.batch_size)
def main(args):
    ent_num = 14541  # number of entities in FB15K237

    with open(
            "/home/why2011btv/research/OpenKE/benchmarks/FB15K237/test2id.txt",
            'r') as f:
        lines = f.readlines()

    triplet_num = len(lines) - 1  # first line holds the triple count
    print("triplet_num:", triplet_num)

    test_set = np.zeros([triplet_num, 3], np.int32)
    i = 0
    for line in lines:
        a = line.split(' ')
        if len(a) > 1 and i < triplet_num:
            # each line is "head tail relation"; store (head, relation, tail)
            # with relation ids shifted past the entity id range
            test_set[i][0] = int(a[0])
            test_set[i][1] = int(a[2]) + ent_num
            test_set[i][2] = int(a[1])
            i += 1

    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)
    data = MYDataset(test_set)
    perplexity = test(options, ckpt_file, data, batch_size=2)
    return perplexity
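# Hypothetical excerpt of an OpenKE-style test2id.txt as parsed above: the
# first line is the triple count, each following line a "head tail relation"
# id triple (the ids shown here are made up for illustration):
#
#   20466
#   453 1347 37
#   3097 286 12
#
# Shifting relation ids by ent_num keeps entity and relation ids in disjoint
# ranges when both are fed to the same embedding table.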
def main(args):
    is_load, load_path, save_path, budget = cuhk_prototype_tuner_v2.preprocess(
        t_id, params, args.save_dir)

    vocab = load_vocab(args.vocab_file, 50)

    batch_size = int(params['batch_size'])
    gpus_index_list = list(
        map(int, os.environ["CUDA_VISIBLE_DEVICES"].split(',')))
    n_gpus = len(os.environ["CUDA_VISIBLE_DEVICES"].split(','))

    # number of tokens in training data
    n_train_tokens = 768648884

    sess_config = tf.compat.v1.ConfigProto(
        allow_soft_placement=True,
        inter_op_parallelism_threads=int(
            params['inter_op_parallelism_threads']),
        intra_op_parallelism_threads=int(
            params['intra_op_parallelism_threads']),
        graph_options=tf.compat.v1.GraphOptions(
            infer_shapes=params['infer_shapes'],
            place_pruned_graph=params['place_pruned_graph'],
            enable_bfloat16_sendrecv=params['enable_bfloat16_sendrecv'],
            optimizer_options=tf.compat.v1.OptimizerOptions(
                do_common_subexpression_elimination=params[
                    'do_common_subexpression_elimination'],
                max_folded_constant_in_bytes=int(
                    params['max_folded_constant']),
                do_function_inlining=params['do_function_inlining'],
                global_jit_level=params['global_jit_level'])))

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256],
                        [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2,
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True,
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': int(budget),  # NNI modification
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False,
                                  shuffle_on_load=True)

    tf_save_dir = save_path
    tf_log_dir = save_path
    if not os.path.exists(tf_save_dir):
        os.makedirs(tf_save_dir)

    if params['tf_gpu_thread_mode'] in ["global", "gpu_private", "gpu_shared"]:
        os.environ['TF_GPU_THREAD_MODE'] = params['tf_gpu_thread_mode']

    if is_load:
        load_file = os.path.join(load_path, 'model.ckpt')
        start = time.time()
        final_perplexity = train(options, data, n_gpus, gpus_index_list,
                                 tf_save_dir, tf_log_dir, sess_config,
                                 restart_ckpt_file=load_file)
        end = time.time()
        shutil.rmtree(load_path)
    else:
        start = time.time()
        final_perplexity = train(options, data, n_gpus, gpus_index_list,
                                 tf_save_dir, tf_log_dir, sess_config)
        end = time.time()

    spent_time = (end - start) / 3600.0

    if args.test_prefix != '':
        options, ckpt_file = load_options_latest_checkpoint(tf_save_dir)
        kwargs = {
            'test': True,
            'shuffle_on_load': False,
        }
        test_data = BidirectionalLMDataset(args.test_prefix, vocab, **kwargs)
        final_perplexity = test(options, ckpt_file, test_data, batch_size=128)

    report_dict = {'runtime': spent_time, 'default': final_perplexity}
    nni.report_final_result(report_dict)
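# A minimal sketch of how the module-level `t_id` and `params` used above
# might be obtained in an NNI trial; the surrounding setup is an assumption,
# but nni.get_next_parameter() and nni.get_trial_id() are the standard trial
# APIs that pair with the nni.report_final_result() call above.
import nni

params = nni.get_next_parameter()  # hyperparameters sampled by the tuner
t_id = nni.get_trial_id()          # unique id of the current trial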
        res_perplexities2.append(res2)
        # tally in- and out-of-vocabulary token counts
        count_in += IN
        count_oov += OOV
        res_perplexities0.append(res0)
        res_perplexities1.append(res1)
    if args.model == 'elmo':
        filepath = subdir + os.sep
        if options.get('bidirectional'):
            data = BidirectionalLMDataset(filepath, vocab, **kwargs)
            # print(data)
        else:
            data = LMDataset(filepath, vocab, **kwargs)
        res2 = test(options, ckpt_file, data, batch_size=args.batch_size)
        res_perplexities2.append(res2)
        outfile.write(file + '\t' + label + '\t' + str(res2) + '\n')
    if count % 5 == 0:
        print('I have calculated perplexities for %s files' % count,
              file=sys.stderr)

print('=== Just a sanity check on the perplexity calculations:')
print(labels[:5], fns[:5], res_perplexities2[:5])

print('Texts with the most extreme text-level perplexities:')
df = pd.DataFrame(list(zip(fns, labels, res_perplexities2)),
                  columns=['files', 'label', 'perpl'])
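# One way to surface the "most extreme" texts from the DataFrame built above
# (a sketch; the original script's exact reporting may differ):
print(df.sort_values('perpl', ascending=False).head(10))  # highest perplexity
print(df.sort_values('perpl', ascending=True).head(10))   # lowest perplexity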