Example #1
def get_envs(env_files=None):
    dataset = []
    if env_files is None:
        fns = [get_train_shard_path(i) for i in range(0, 30)]
    else:
        fns = env_files

    for fn in fns:
        dataset += load_jsonl(fn)
    tables = load_jsonl(table_file)

    table_dict = {table['name']: table for table in tables}

    # Load pretrained embeddings.
    embedding_model = word_embeddings.EmbeddingModel(vocab_file, embedding_file)

    with open(en_vocab_file, 'r') as f:
        vocab = json.load(f)
    en_vocab = data_utils.Vocab([])
    en_vocab.load_vocab(vocab)

    # Create environments.
    envs = create_envs(table_dict, dataset, en_vocab, embedding_model)

    return envs
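
load_jsonl appears in nearly every example below but is never shown. A minimal sketch, assuming the standard JSON Lines layout of one JSON object per non-empty line:

import json

def load_jsonl(filename):
    # Read a JSON Lines file: one JSON object per non-empty line.
    examples = []
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                examples.append(json.loads(line))
    return examples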
Example #2
    def build(cls, config, params=None):
        dummy_kg = {
            'kg': None,
            'num_props': [],
            'datetime_props': [],
            'props': [],
            'row_ents': []
        }

        executor = nsm.execution.worlds.wikitablequestions.WikiTableExecutor(
            dummy_kg)
        api = executor.get_api()
        op_vocab = data_utils.Vocab(
            [f['name']
             for f in api['func_dict'].values()] + ['all_rows'] + SPECIAL_TKS)
        config['builtin_func_num'] = op_vocab.size

        encoder = BertEncoder.build(config)

        # FIXME: hacky!
        if config.get('use_trainable_sketch_manager', False):
            sketch_predictor = SketchPredictor.build(config, encoder=encoder)
            sketch_encoder = SketchEncoder.build(config, sketch_predictor)
        else:
            sketch_predictor = sketch_encoder = None

        decoder = SketchGuidedDecoder.build(config, encoder, sketch_encoder)

        return cls(encoder,
                   decoder,
                   sketch_predictor=sketch_predictor,
                   sketch_encoder=sketch_encoder,
                   config=config)
Example #3
    def get_vocab(self):
        mem_tokens = []
        for i in range(self.max_mem):
            mem_tokens.append('v{}'.format(i))
        vocab = data_utils.Vocab(
            list(self.namespace.get_all_names()) + SPECIAL_TKS + mem_tokens)
        return vocab
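
data_utils.Vocab is used in every example here but never defined. A minimal stand-in with the interface these snippets rely on (size, vocab, special_tks, load_vocab, lookup) might look like the sketch below; the layout of the serialized vocab file is an assumption, and the real class in the NSM codebase will differ in detail.

class Vocab(object):
    # Minimal, hypothetical stand-in for data_utils.Vocab.

    def __init__(self, tokens, special_tks=()):
        self.special_tks = list(special_tks)
        self.vocab = {}      # token -> id
        self.rev_vocab = {}  # id -> token
        for tk in self.special_tks + list(tokens):
            if tk not in self.vocab:
                self.rev_vocab[len(self.vocab)] = tk
                self.vocab[tk] = len(self.vocab)

    @property
    def size(self):
        return len(self.vocab)

    def load_vocab(self, vocab_dict):
        # Assumes the JSON file holds a plain token -> id mapping.
        self.vocab = dict(vocab_dict)
        self.rev_vocab = {i: tk for tk, i in self.vocab.items()}

    def lookup(self, key, reverse=False):
        # lookup(token) -> id; lookup(id, reverse=True) -> token.
        return self.rev_vocab[key] if reverse else self.vocab[key]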
Example #4
def json_to_envs(dataset):
    tables = load_jsonl(FLAGS.table_file)
    table_dict = {table['name']: table for table in tables}
    embedding_model = word_embeddings.EmbeddingModel(FLAGS.vocab_file,
                                                     FLAGS.embedding_file)

    with open(FLAGS.en_vocab_file, 'r') as f:
        vocab = json.load(f)
    en_vocab = data_utils.Vocab([])
    en_vocab.load_vocab(vocab)

    # Create environments.
    envs = create_envs(table_dict, dataset, en_vocab, embedding_model)

    return envs
Example #5
def init_world_config() -> Dict:
    dummy_kg = {
        'kg': None,
        'num_props': [],
        'datetime_props': [],
        'props': [],
        'row_ents': []
    }

    api = WikiTableExecutor(dummy_kg).get_api()
    op_vocab = data_utils.Vocab(
        [f['name'] for f in api['func_dict'].values()] +
        ['all_rows'] +
        INTERPRETER_SPECIAL_TOKENS
    )

    config = {
        'interpreter_builtin_func_num': op_vocab.size,
        'executor_api': api
    }

    return config
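
A hypothetical usage sketch; the two config keys are the ones the function returns, and the api keys named in the comment are the ones Example #7 reads from get_api():

config = init_world_config()
print(config['interpreter_builtin_func_num'])  # size of the builtin-op vocab
print(sorted(config['executor_api']))          # includes 'constant_dict', 'func_dict', 'type_hierarchy'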
Example #6
def main():
    '''
    1. Call get_examples_to_annotate() to get an ordered list of examples to annotate.
    2. Call annotate() on these examples and get the VERIFIED annotation.
    3. Call sync_results() to save the results.
    :return: None; all results are saved locally in files.
    '''

    # load the tables
    tables = load_jsonl(FLAGS.table_file)
    table_dict = {table['name']: table for table in tables}

    # Load pre-trained embeddings.
    embedding_model = word_embeddings.EmbeddingModel(FLAGS.vocab_file,
                                                     FLAGS.embedding_file)

    with open(FLAGS.en_vocab_file, 'r') as f:
        vocab = json.load(f)
    en_vocab = data_utils.Vocab([])
    en_vocab.load_vocab(vocab)

    annotation_result_list = get_examples_to_annotate()
    for i in range(len(annotation_result_list)):
        if annotation_result_list[i][2] is None:
            # create a real environment
            env = create_envs(table_dict, [annotation_result_list[i][0]],
                              en_vocab, embedding_model)[0]

            # get the annotation
            annotation = annotate(
                env, table_dict[env.question_annotation['context']])
            annotation_result_list[i] = (annotation_result_list[i][0],
                                         annotation_result_list[i][1],
                                         annotation)

            sync_results(annotation_result_list)
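
sync_results() is not defined in this example. A minimal sketch, assuming the triples are JSON-serializable and that a local annotation_results.json output path is acceptable (both are assumptions):

import json

def sync_results(annotation_result_list, path='annotation_results.json'):
    # Hypothetical: persist the (example, metadata, annotation) triples
    # after each new annotation so partial progress is never lost.
    with open(path, 'w') as f:
        json.dump(annotation_result_list, f, indent=2)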
Example #7
def init_experiment(fns, use_gpu=False, gpu_id='0'):
  dataset = []
  for fn in fns:
    dataset += load_jsonl(fn)
  tf.logging.info('{} examples in dataset.'.format(len(dataset)))
  tables = load_jsonl(FLAGS.table_file)
  table_dict = {table['name']: table for table in tables}
  tf.logging.info('{} tables.'.format(len(table_dict)))

  # Load pretrained embeddings.
  embedding_model = word_embeddings.EmbeddingModel(
    FLAGS.vocab_file, FLAGS.embedding_file)

  with open(FLAGS.en_vocab_file, 'r') as f:
    vocab = json.load(f)
  en_vocab = data_utils.Vocab([])
  en_vocab.load_vocab(vocab)
  tf.logging.info('{} unique tokens in encoder vocab'.format(
    len(en_vocab.vocab)))
  tf.logging.info('{} examples in the dataset'.format(len(dataset)))
  
  # Create environments.   
  envs = create_envs(table_dict, dataset, en_vocab, embedding_model)
  if FLAGS.unittest:
    envs = envs[:25]
  tf.logging.info('{} environments in total'.format(len(envs)))

  graph_config = get_saved_graph_config()
  if graph_config:
    # If evaluating a saved model, just load its graph config.
    agent = create_agent(graph_config, get_init_model_path())
  else:
    if FLAGS.use_pretrained_embeddings:
      tf.logging.info('Using pretrained embeddings!')
      pretrained_embeddings = []
      for i in range(len(en_vocab.special_tks), en_vocab.size):
        pretrained_embeddings.append(
          utils.average_token_embedding(
            utils.find_tk_in_model(
              en_vocab.lookup(i, reverse=True), embedding_model),
            embedding_model,
            embedding_size=FLAGS.pretrained_embedding_size))
      pretrained_embeddings = np.vstack(pretrained_embeddings)
    else:
      pretrained_embeddings = None

    # Model configuration and initialization.
    de_vocab = envs[0].de_vocab
    n_mem = FLAGS.max_n_mem
    n_builtin = de_vocab.size - n_mem
    en_pretrained_vocab_size = en_vocab.size - len(en_vocab.special_tks)

    graph_config = {}
    graph_config['core_config'] = dict(
      max_n_valid_indices=FLAGS.max_n_valid_indices,
      n_mem=n_mem,
      n_builtin=n_builtin,
      use_attn=True, 
      attn_size=FLAGS.attn_size,
      attn_vec_size=FLAGS.attn_vec_size,
      input_vocab_size=de_vocab.size,
      en_input_vocab_size=en_vocab.size,
      hidden_size=FLAGS.hidden_size, n_layers=FLAGS.n_layers,
      en_hidden_size=FLAGS.hidden_size, en_n_layers=FLAGS.en_n_layers,
      en_use_embeddings=True,
      en_embedding_size=FLAGS.en_embedding_size,
      value_embedding_size=FLAGS.value_embedding_size,
      en_pretrained_vocab_size=en_pretrained_vocab_size,
      en_pretrained_embedding_size=FLAGS.pretrained_embedding_size,
      add_lm_loss=FLAGS.lm_loss_coeff > 0.0,
      en_bidirectional=FLAGS.en_bidirectional,
      en_attn_on_constants=FLAGS.en_attn_on_constants)

    graph_config['use_gpu'] = use_gpu
    graph_config['gpu_id'] = gpu_id

    graph_config['output_type'] = 'softmax'
    graph_config['output_config'] = dict(
      output_vocab_size=de_vocab.size, use_logits=True)
    aux_loss_list = [('ent_reg', FLAGS.entropy_reg_coeff),]

    if FLAGS.lm_loss_coeff > 0.0:
      aux_loss_list.append(('en_lm_loss', FLAGS.lm_loss_coeff))
    graph_config['train_config'] = dict(
      aux_loss_list=aux_loss_list,
      learning_rate=FLAGS.learning_rate,
      max_grad_norm=FLAGS.max_grad_norm,
      adam_beta1=FLAGS.adam_beta1,
      l2_coeff=FLAGS.l2_coeff,
      optimizer=FLAGS.optimizer, avg_loss_by_n=False)

    agent = create_agent(
      graph_config, get_init_model_path(),
      pretrained_embeddings=pretrained_embeddings)

  with open(os.path.join(get_experiment_dir(), 'graph_config.json'), 'w') as f:
    json.dump(graph_config, f, sort_keys=True, indent=2)
    
  return agent, envs
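
init_experiment() ends by writing graph_config.json into the experiment directory; the get_saved_graph_config() it calls earlier is not shown. A plausible counterpart (hypothetical; the real helper presumably reads the directory from FLAGS rather than taking a parameter):

import json
import os

def get_saved_graph_config(experiment_dir):
    # Hypothetical: reload the graph_config.json written by init_experiment,
    # returning None when no saved configuration exists yet.
    path = os.path.join(experiment_dir, 'graph_config.json')
    if not os.path.exists(path):
        return None
    with open(path, 'r') as f:
        return json.load(f)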
def run_random_exploration(shard_id):
    experiment_dir = get_experiment_dir()
    if not tf.gfile.Exists(experiment_dir):
        tf.gfile.MkDir(experiment_dir)

    if FLAGS.trigger_word_file:
        with open(FLAGS.trigger_word_file, 'r') as f:
            trigger_dict = json.load(f)
            print('use trigger words in {}'.format(FLAGS.trigger_word_file))
    else:
        trigger_dict = None

    # Load dataset.
    train_set = []
    with open(FLAGS.train_file_tmpl.format(shard_id), 'r') as f:
        for line in f:
            example = json.loads(line)
            train_set.append(example)
    tf.logging.info('{} examples in training set.'.format(len(train_set)))

    table_dict = {}
    with open(FLAGS.table_file) as f:
        for line in f:
            table = json.loads(line)
            table_dict[table['name']] = table
    tf.logging.info('{} tables.'.format(len(table_dict)))

    if FLAGS.executor == 'wtq':
        score_fn = utils.wtq_score
        process_answer_fn = lambda x: x
        executor_fn = executor_factory.WikiTableExecutor
    elif FLAGS.executor == 'wikisql':
        score_fn = utils.wikisql_score
        process_answer_fn = utils.wikisql_process_answer
        executor_fn = executor_factory.WikiSQLExecutor
    else:
        raise ValueError('Unknown executor {}'.format(FLAGS.executor))

    all_envs = []
    t1 = time.time()
    for i, example in enumerate(train_set):
        if i % 100 == 0:
            tf.logging.info('creating environment #{}'.format(i))
        kg_info = table_dict[example['context']]
        executor = executor_fn(kg_info)
        api = executor.get_api()
        type_hierarchy = api['type_hierarchy']
        func_dict = api['func_dict']
        constant_dict = api['constant_dict']
        interpreter = computer_factory.LispInterpreter(
            type_hierarchy=type_hierarchy,
            max_mem=FLAGS.max_n_mem,
            max_n_exp=FLAGS.max_n_exp,
            assisted=True)
        for v in func_dict.values():
            interpreter.add_function(**v)

        interpreter.add_constant(value=kg_info['row_ents'],
                                 type='entity_list',
                                 name='all_rows')

        de_vocab = interpreter.get_vocab()
        env = env_factory.QAProgrammingEnv(
            data_utils.Vocab([]),
            de_vocab,
            question_annotation=example,
            answer=process_answer_fn(example['answer']),
            constants=constant_dict.values(),
            interpreter=interpreter,
            constant_value_embedding_fn=lambda x: None,
            score_fn=score_fn,
            max_cache_size=FLAGS.n_explore_samples * FLAGS.n_epoch * 10,
            name=example['id'])
        all_envs.append(env)

    program_dict = {env.name: [] for env in all_envs}
    for i in range(1, FLAGS.n_epoch + 1):
        tf.logging.info('iteration {}'.format(i))
        t1 = time.time()
        for env in all_envs:
            for _ in range(FLAGS.n_explore_samples):
                program = random_explore(env, trigger_dict=trigger_dict)
                if program is not None:
                    program_dict[env.name].append(program)
        t2 = time.time()
        tf.logging.info('{} sec used in iteration {}'.format(t2 - t1, i))

        if i % FLAGS.save_every_n == 0:
            tf.logging.info(
                'saving programs and cache in iteration {}'.format(i))
            t1 = time.time()
            with open(
                    os.path.join(
                        get_experiment_dir(),
                        'program_shard_{}-{}.json'.format(shard_id, i)),
                    'w') as f:
                program_str_dict = {k: [' '.join(p) for p in v]
                                    for k, v in program_dict.items()}
                json.dump(program_str_dict, f, sort_keys=True, indent=2)

            # cache_dict = dict([(env.name, list(env.cache._set)) for env in all_envs])
            t2 = time.time()
            tf.logging.info(
                '{} sec used saving programs and cache in iteration {}'.format(
                    t2 - t1, i))

        n = len(all_envs)
        solution_ratio = len(
            [env for env in all_envs if program_dict[env.name]]) * 1.0 / n
        tf.logging.info(
            'At least one solution found ratio: {}'.format(solution_ratio))
        n_programs_per_env = np.array(
            [len(program_dict[env.name]) for env in all_envs])
        tf.logging.info(
            'number of solutions found per example: max {}, min {}, avg {}, std {}'
            .format(n_programs_per_env.max(), n_programs_per_env.min(),
                    n_programs_per_env.mean(), n_programs_per_env.std()))

        # Macro average length.
        mean_length = np.mean([
            np.mean([len(p) for p in program_dict[env.name]])
            for env in all_envs if program_dict[env.name]
        ])
        tf.logging.info('macro average program length: {}'.format(mean_length))
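
random_explore() is the one helper this example leans on but never defines. The sketch below is purely illustrative: it assumes a gym-style interface on QAProgrammingEnv (done, valid_actions, step, rewards, and program are all assumed names), and it ignores trigger_dict, which the real helper presumably uses to prune which tokens get sampled.

import random

def random_explore(env, trigger_dict=None):
    # Hypothetical: sample uniformly among legal next tokens until the
    # program terminates; keep it only if it earns full reward.
    env.reset()
    while not env.done:                        # assumed attribute
        valid_actions = env.valid_actions      # assumed attribute
        if not valid_actions:                  # dead end, abandon this sample
            return None
        env.step(random.choice(valid_actions))
    if env.rewards and env.rewards[-1] == 1.0:  # assumed reward convention
        return env.program                      # assumed attribute
    return None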