def get_envs(env_files=None):
    # Collect examples from the given files, or from the 30 default
    # training shards if none are specified.
    dataset = []
    if env_files is None:
        fns = [get_train_shard_path(i) for i in range(0, 30)]
    else:
        fns = env_files
    for fn in fns:
        dataset += load_jsonl(fn)

    # `table_file`, `vocab_file`, `embedding_file` and `en_vocab_file` are
    # expected to be defined at module level.
    tables = load_jsonl(table_file)
    table_dict = dict([(table['name'], table) for table in tables])

    # Load pretrained embeddings.
    embedding_model = word_embeddings.EmbeddingModel(vocab_file, embedding_file)

    with open(en_vocab_file, 'r') as f:
        vocab = json.load(f)
    en_vocab = data_utils.Vocab([])
    en_vocab.load_vocab(vocab)

    # Create environments.
    envs = create_envs(table_dict, dataset, en_vocab, embedding_model)
    return envs
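# Illustrative usage sketch (not part of the original source): `get_envs`
# relies on the module-level path globals noted above being set before the
# call; the dev-split file name below is hypothetical.
#
#   train_envs = get_envs()                                  # all 30 default shards
#   dev_envs = get_envs(env_files=['data/dev_split.jsonl'])  # explicit files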
def build(cls, config, params=None):
    # A dummy knowledge graph is enough to instantiate the executor and
    # enumerate its builtin functions for the operator vocabulary.
    dummy_kg = {
        'kg': None,
        'num_props': [],
        'datetime_props': [],
        'props': [],
        'row_ents': []
    }
    executor = nsm.execution.worlds.wikitablequestions.WikiTableExecutor(
        dummy_kg)
    api = executor.get_api()
    op_vocab = data_utils.Vocab(
        [f['name'] for f in api['func_dict'].values()] +
        ['all_rows'] +
        SPECIAL_TKS)

    config['builtin_func_num'] = op_vocab.size

    encoder = BertEncoder.build(config)

    # FIXME: hacky!
    if config.get('use_trainable_sketch_manager', False):
        sketch_predictor = SketchPredictor.build(config, encoder=encoder)
        sketch_encoder = SketchEncoder.build(config, sketch_predictor)
    else:
        sketch_predictor = sketch_encoder = None

    decoder = SketchGuidedDecoder.build(config, encoder, sketch_encoder)

    return cls(
        encoder, decoder,
        sketch_predictor=sketch_predictor,
        sketch_encoder=sketch_encoder,
        config=config)
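# Illustrative call-site sketch (an assumption, not from the original source):
# `build` is a classmethod on a parser model class, so a typical invocation
# would look like the following. `ParserModel` and the config file name are
# hypothetical, and `config` must carry whatever keys BertEncoder.build and
# SketchGuidedDecoder.build expect in this codebase.
#
#   with open('model_config.json') as f:
#       config = json.load(f)
#   model = ParserModel.build(config)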
def get_vocab(self):
    # Memory-slot tokens v0..v{max_mem-1} name intermediate results.
    mem_tokens = []
    for i in range(self.max_mem):
        mem_tokens.append('v{}'.format(i))
    vocab = data_utils.Vocab(
        list(self.namespace.get_all_names()) + SPECIAL_TKS + mem_tokens)
    return vocab
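# Example (for illustration): with `self.max_mem == 3`, the resulting decoder
# vocabulary is the interpreter namespace's names (functions and constants),
# the special tokens, and the memory-slot tokens ['v0', 'v1', 'v2'].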
def json_to_envs(dataset):
    tables = load_jsonl(FLAGS.table_file)
    table_dict = dict([(table['name'], table) for table in tables])

    # Load pretrained embeddings.
    embedding_model = word_embeddings.EmbeddingModel(FLAGS.vocab_file,
                                                     FLAGS.embedding_file)

    with open(FLAGS.en_vocab_file, 'r') as f:
        vocab = json.load(f)
    en_vocab = data_utils.Vocab([])
    en_vocab.load_vocab(vocab)

    # Create environments.
    envs = create_envs(table_dict, dataset, en_vocab, embedding_model)
    return envs
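# Usage sketch (file name hypothetical; assumes the FLAGS used above are
# defined by this binary):
#
#   dataset = load_jsonl('train_examples.jsonl')
#   envs = json_to_envs(dataset)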
def init_world_config() -> Dict:
    # A dummy knowledge graph is enough to enumerate the executor's builtin
    # functions. (Assumes `from typing import Dict` at module top.)
    dummy_kg = {
        'kg': None,
        'num_props': [],
        'datetime_props': [],
        'props': [],
        'row_ents': []
    }
    api = WikiTableExecutor(dummy_kg).get_api()
    op_vocab = data_utils.Vocab(
        [f['name'] for f in api['func_dict'].values()] +
        ['all_rows'] +
        INTERPRETER_SPECIAL_TOKENS
    )

    config = {
        'interpreter_builtin_func_num': op_vocab.size,
        'executor_api': api
    }

    return config
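# Sketch of the returned dict, based on the keys assigned above (the exact
# contents of 'executor_api' depend on WikiTableExecutor.get_api()):
#
#   config = init_world_config()
#   config['interpreter_builtin_func_num']  # size of the builtin operator vocab
#   config['executor_api']                  # dict with at least 'func_dict'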
def main():
    '''
    1. Call get_examples_to_annotate() to get an ordered list of examples
       to annotate.
    2. Call annotate() on these examples and get the VERIFIED annotation.
    3. Call sync_results() to save the results.

    :return: None; all results are saved locally in files.
    '''
    # Load the tables.
    tables = load_jsonl(FLAGS.table_file)
    table_dict = dict([(table['name'], table) for table in tables])

    # Load pre-trained embeddings.
    embedding_model = word_embeddings.EmbeddingModel(FLAGS.vocab_file,
                                                     FLAGS.embedding_file)

    with open(FLAGS.en_vocab_file, 'r') as f:
        vocab = json.load(f)
    en_vocab = data_utils.Vocab([])
    en_vocab.load_vocab(vocab)

    annotation_result_list = get_examples_to_annotate()
    for i in range(len(annotation_result_list)):
        # Only annotate examples that do not have an annotation yet.
        if annotation_result_list[i][2] is None:
            # Create a real environment.
            env = create_envs(table_dict, [annotation_result_list[i][0]],
                              en_vocab, embedding_model)[0]
            # Get the annotation.
            annotation = annotate(
                env, table_dict[env.question_annotation['context']])
            annotation_result_list[i] = (annotation_result_list[i][0],
                                         annotation_result_list[i][1],
                                         annotation)
    sync_results(annotation_result_list)
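# Note on the data layout (inferred from the indexing above, not stated in the
# original): each entry of `annotation_result_list` is a triple whose first
# element is the example, whose third element is the annotation (None until
# the example has been annotated), and whose second element is carried
# through unchanged.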
def init_experiment(fns, use_gpu=False, gpu_id='0'):
    dataset = []
    for fn in fns:
        dataset += load_jsonl(fn)
    tf.logging.info('{} examples in dataset.'.format(len(dataset)))

    tables = load_jsonl(FLAGS.table_file)
    table_dict = dict([(table['name'], table) for table in tables])
    tf.logging.info('{} tables.'.format(len(table_dict)))

    # Load pretrained embeddings.
    embedding_model = word_embeddings.EmbeddingModel(
        FLAGS.vocab_file, FLAGS.embedding_file)

    with open(FLAGS.en_vocab_file, 'r') as f:
        vocab = json.load(f)
    en_vocab = data_utils.Vocab([])
    en_vocab.load_vocab(vocab)
    tf.logging.info('{} unique tokens in encoder vocab'.format(
        len(en_vocab.vocab)))
    tf.logging.info('{} examples in the dataset'.format(len(dataset)))

    # Create environments.
    envs = create_envs(table_dict, dataset, en_vocab, embedding_model)
    if FLAGS.unittest:
        envs = envs[:25]
    tf.logging.info('{} environments in total'.format(len(envs)))

    graph_config = get_saved_graph_config()
    if graph_config:
        # If evaluating a saved model, just load its graph config.
        agent = create_agent(graph_config, get_init_model_path())
    else:
        if FLAGS.use_pretrained_embeddings:
            tf.logging.info('Using pretrained embeddings!')
            pretrained_embeddings = []
            for i in xrange(len(en_vocab.special_tks), en_vocab.size):
                pretrained_embeddings.append(
                    utils.average_token_embedding(
                        utils.find_tk_in_model(
                            en_vocab.lookup(i, reverse=True), embedding_model),
                        embedding_model,
                        embedding_size=FLAGS.pretrained_embedding_size))
            pretrained_embeddings = np.vstack(pretrained_embeddings)
        else:
            pretrained_embeddings = None

        # Model configuration and initialization.
        de_vocab = envs[0].de_vocab
        n_mem = FLAGS.max_n_mem
        n_builtin = de_vocab.size - n_mem
        en_pretrained_vocab_size = en_vocab.size - len(en_vocab.special_tks)
        graph_config = {}
        graph_config['core_config'] = dict(
            max_n_valid_indices=FLAGS.max_n_valid_indices,
            n_mem=n_mem,
            n_builtin=n_builtin,
            use_attn=True,
            attn_size=FLAGS.attn_size,
            attn_vec_size=FLAGS.attn_vec_size,
            input_vocab_size=de_vocab.size,
            en_input_vocab_size=en_vocab.size,
            hidden_size=FLAGS.hidden_size,
            n_layers=FLAGS.n_layers,
            en_hidden_size=FLAGS.hidden_size,
            en_n_layers=FLAGS.en_n_layers,
            en_use_embeddings=True,
            en_embedding_size=FLAGS.en_embedding_size,
            value_embedding_size=FLAGS.value_embedding_size,
            en_pretrained_vocab_size=en_pretrained_vocab_size,
            en_pretrained_embedding_size=FLAGS.pretrained_embedding_size,
            add_lm_loss=FLAGS.lm_loss_coeff > 0.0,
            en_bidirectional=FLAGS.en_bidirectional,
            en_attn_on_constants=FLAGS.en_attn_on_constants)
        graph_config['use_gpu'] = use_gpu
        graph_config['gpu_id'] = gpu_id

        graph_config['output_type'] = 'softmax'
        graph_config['output_config'] = dict(
            output_vocab_size=de_vocab.size, use_logits=True)

        aux_loss_list = [('ent_reg', FLAGS.entropy_reg_coeff)]
        if FLAGS.lm_loss_coeff > 0.0:
            aux_loss_list.append(('en_lm_loss', FLAGS.lm_loss_coeff))

        graph_config['train_config'] = dict(
            aux_loss_list=aux_loss_list,
            learning_rate=FLAGS.learning_rate,
            max_grad_norm=FLAGS.max_grad_norm,
            adam_beta1=FLAGS.adam_beta1,
            l2_coeff=FLAGS.l2_coeff,
            optimizer=FLAGS.optimizer,
            avg_loss_by_n=False)

        agent = create_agent(
            graph_config, get_init_model_path(),
            pretrained_embeddings=pretrained_embeddings)

    with open(os.path.join(get_experiment_dir(), 'graph_config.json'),
              'w') as f:
        json.dump(graph_config, f, sort_keys=True, indent=2)

    return agent, envs
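# Usage sketch (assumes `get_train_shard_path` from this codebase, as used in
# `get_envs` above):
#
#   agent, envs = init_experiment(
#       [get_train_shard_path(i) for i in range(3)],
#       use_gpu=True, gpu_id='0')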
def run_random_exploration(shard_id):
    experiment_dir = get_experiment_dir()
    if not tf.gfile.Exists(experiment_dir):
        tf.gfile.MkDir(experiment_dir)

    if FLAGS.trigger_word_file:
        with open(FLAGS.trigger_word_file, 'r') as f:
            trigger_dict = json.load(f)
        print('use trigger words in {}'.format(FLAGS.trigger_word_file))
    else:
        trigger_dict = None

    # Load dataset.
    train_set = []
    with open(FLAGS.train_file_tmpl.format(shard_id), 'r') as f:
        for line in f:
            example = json.loads(line)
            train_set.append(example)
    tf.logging.info('{} examples in training set.'.format(len(train_set)))

    table_dict = {}
    with open(FLAGS.table_file) as f:
        for line in f:
            table = json.loads(line)
            table_dict[table['name']] = table
    tf.logging.info('{} tables.'.format(len(table_dict)))

    # Pick the scoring and answer-processing functions for the dataset.
    if FLAGS.executor == 'wtq':
        score_fn = utils.wtq_score
        process_answer_fn = lambda x: x
        executor_fn = executor_factory.WikiTableExecutor
    elif FLAGS.executor == 'wikisql':
        score_fn = utils.wikisql_score
        process_answer_fn = utils.wikisql_process_answer
        executor_fn = executor_factory.WikiSQLExecutor
    else:
        raise ValueError('Unknown executor {}'.format(FLAGS.executor))

    # Build one environment (executor + interpreter) per training example.
    all_envs = []
    t1 = time.time()
    for i, example in enumerate(train_set):
        if i % 100 == 0:
            tf.logging.info('creating environment #{}'.format(i))
        kg_info = table_dict[example['context']]
        executor = executor_fn(kg_info)
        api = executor.get_api()
        type_hierarchy = api['type_hierarchy']
        func_dict = api['func_dict']
        constant_dict = api['constant_dict']
        interpreter = computer_factory.LispInterpreter(
            type_hierarchy=type_hierarchy,
            max_mem=FLAGS.max_n_mem,
            max_n_exp=FLAGS.max_n_exp,
            assisted=True)
        for v in func_dict.values():
            interpreter.add_function(**v)
        interpreter.add_constant(
            value=kg_info['row_ents'], type='entity_list', name='all_rows')
        de_vocab = interpreter.get_vocab()
        env = env_factory.QAProgrammingEnv(
            data_utils.Vocab([]), de_vocab,
            question_annotation=example,
            answer=process_answer_fn(example['answer']),
            constants=constant_dict.values(),
            interpreter=interpreter,
            constant_value_embedding_fn=lambda x: None,
            score_fn=score_fn,
            max_cache_size=FLAGS.n_explore_samples * FLAGS.n_epoch * 10,
            name=example['id'])
        all_envs.append(env)

    program_dict = dict([(env.name, []) for env in all_envs])
    for i in xrange(1, FLAGS.n_epoch + 1):
        tf.logging.info('iteration {}'.format(i))
        t1 = time.time()
        # Sample random programs in each environment; keep the ones that
        # execute to the correct answer.
        for env in all_envs:
            for _ in xrange(FLAGS.n_explore_samples):
                program = random_explore(env, trigger_dict=trigger_dict)
                if program is not None:
                    program_dict[env.name].append(program)
        t2 = time.time()
        tf.logging.info('{} sec used in iteration {}'.format(t2 - t1, i))

        if i % FLAGS.save_every_n == 0:
            tf.logging.info(
                'saving programs and cache in iteration {}'.format(i))
            t1 = time.time()
            with open(
                    os.path.join(
                        get_experiment_dir(),
                        'program_shard_{}-{}.json'.format(shard_id, i)),
                    'w') as f:
                program_str_dict = dict([(k, [' '.join(p) for p in v])
                                         for k, v in program_dict.iteritems()])
                json.dump(program_str_dict, f, sort_keys=True, indent=2)
            # cache_dict = dict([(env.name, list(env.cache._set)) for env in all_envs])
            t2 = time.time()
            tf.logging.info(
                '{} sec used saving programs and cache in iteration {}'.format(
                    t2 - t1, i))

        n = len(all_envs)
        solution_ratio = len(
            [env for env in all_envs if program_dict[env.name]]) * 1.0 / n
        tf.logging.info(
            'At least one solution found ratio: {}'.format(solution_ratio))
        n_programs_per_env = np.array(
            [len(program_dict[env.name]) for env in all_envs])
        tf.logging.info(
            'number of solutions found per example: '
            'max {}, min {}, avg {}, std {}'.format(
                n_programs_per_env.max(), n_programs_per_env.min(),
                n_programs_per_env.mean(), n_programs_per_env.std()))

        # Macro average length.
        mean_length = np.mean([
            np.mean([len(p) for p in program_dict[env.name]])
            for env in all_envs if program_dict[env.name]
        ])
        tf.logging.info('macro average program length: {}'.format(mean_length))
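# Shape of the saved artifact (illustrative values, not real output):
# `program_shard_<shard_id>-<epoch>.json` maps each example id to the list of
# programs found so far, each serialized as a space-joined token string, e.g.
#
#   {"nt-123": ["( count all_rows )"], "nt-456": []}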