Example #1
0
def get_envs(env_files=None):
    """Build environments from example files, the table file and the vocabularies.

    Args:
        env_files: optional iterable of JSONL example files. When None, the
            training shards 0..29 returned by get_train_shard_path are used.

    Returns:
        The result of create_envs for the loaded examples.
    """
    if env_files is None:
        shard_paths = [get_train_shard_path(shard_id)
                       for shard_id in range(30)]
    else:
        shard_paths = env_files

    # Concatenate the examples of every file, in file order.
    examples = []
    for shard_path in shard_paths:
        examples.extend(load_jsonl(shard_path))

    # Index the tables by their 'name' field.
    table_dict = {table['name']: table for table in load_jsonl(table_file)}

    # Pretrained embeddings.
    embedding_model = word_embeddings.EmbeddingModel(vocab_file,
                                                     embedding_file)

    # Encoder vocabulary, read from its JSON file.
    with open(en_vocab_file, 'r') as vocab_fp:
        vocab_spec = json.load(vocab_fp)
    en_vocab = data_utils.Vocab([])
    en_vocab.load_vocab(vocab_spec)

    return create_envs(table_dict, examples, en_vocab, embedding_model)
Example #2
0
def json_to_envs(dataset):
    """Turn already-loaded examples into environments.

    Args:
        dataset: the examples handed to create_envs.

    Returns:
        The result of create_envs for `dataset`, using the tables, embeddings
        and encoder vocabulary configured through FLAGS.
    """
    # Index the tables by their 'name' field.
    table_dict = {
        table['name']: table for table in load_jsonl(FLAGS.table_file)
    }

    embedding_model = word_embeddings.EmbeddingModel(
        FLAGS.vocab_file, FLAGS.embedding_file)

    # Encoder vocabulary, read from its JSON file.
    with open(FLAGS.en_vocab_file, 'r') as vocab_fp:
        vocab_spec = json.load(vocab_fp)
    en_vocab = data_utils.Vocab([])
    en_vocab.load_vocab(vocab_spec)

    return create_envs(table_dict, dataset, en_vocab, embedding_model)
Example #3
0
def main():
    '''
    Annotate every example that does not have an annotation yet.

    1. call get_examples_to_annotate() to get the ordered list of entries
    2. for each entry whose third field is None, build its environment and
       call annotate() on it together with the table of its context
    3. call sync_results() right after each new annotation
    :return: None, results are handed to sync_results()
    '''
    # Tables indexed by their 'name' field.
    table_dict = {
        table['name']: table for table in load_jsonl(FLAGS.table_file)
    }

    # Pre-trained embeddings.
    embedding_model = word_embeddings.EmbeddingModel(
        FLAGS.vocab_file, FLAGS.embedding_file)

    # Encoder vocabulary, read from its JSON file.
    with open(FLAGS.en_vocab_file, 'r') as vocab_fp:
        vocab_spec = json.load(vocab_fp)
    en_vocab = data_utils.Vocab([])
    en_vocab.load_vocab(vocab_spec)

    annotation_result_list = get_examples_to_annotate()
    for idx, entry in enumerate(annotation_result_list):
        # Entries that already carry an annotation are left untouched.
        if entry[2] is not None:
            continue

        # Build a real environment for this single example.
        env = create_envs(table_dict, [entry[0]], en_vocab,
                          embedding_model)[0]

        context_table = table_dict[env.question_annotation['context']]
        annotation = annotate(env, context_table)
        annotation_result_list[idx] = (entry[0], entry[1], annotation)

        # Sync after every annotation, not once at the end.
        sync_results(annotation_result_list)
def init_experiment(fns, use_gpu=False, gpu_id='0'):
  """Load the data, build the environments and create the agent.

  Args:
    fns: iterable of JSONL example files.
    use_gpu: stored under 'use_gpu' when a new graph config is built.
    gpu_id: stored under 'gpu_id' when a new graph config is built.

  Returns:
    An (agent, envs) tuple. The graph config in use is also written to
    'graph_config.json' inside the experiment directory.
  """
  examples = []
  for path in fns:
    examples.extend(load_jsonl(path))
  tf.logging.info('{} examples in dataset.'.format(len(examples)))

  table_dict = {
    table['name']: table for table in load_jsonl(FLAGS.table_file)}
  tf.logging.info('{} tables.'.format(len(table_dict)))

  # Pretrained embeddings.
  embedding_model = word_embeddings.EmbeddingModel(
    FLAGS.vocab_file, FLAGS.embedding_file)

  # Encoder vocabulary, read from its JSON file.
  with open(FLAGS.en_vocab_file, 'r') as vocab_fp:
    vocab_spec = json.load(vocab_fp)
  en_vocab = data_utils.Vocab([])
  en_vocab.load_vocab(vocab_spec)
  tf.logging.info('{} unique tokens in encoder vocab'.format(
    len(en_vocab.vocab)))
  tf.logging.info('{} examples in the dataset'.format(len(examples)))

  # Environments; unit-test runs keep only the first 25.
  envs = create_envs(table_dict, examples, en_vocab, embedding_model)
  if FLAGS.unittest:
    envs = envs[:25]
  tf.logging.info('{} environments in total'.format(len(envs)))

  graph_config = get_saved_graph_config()
  if graph_config:
    # A saved graph config exists: reuse it as is.
    agent = create_agent(graph_config, get_init_model_path())
  else:
    n_special_tks = len(en_vocab.special_tks)

    if not FLAGS.use_pretrained_embeddings:
      pretrained_embeddings = None
    else:
      tf.logging.info('Using pretrained embeddings!')
      # One row per vocabulary entry after the special tokens.
      token_vectors = [
        utils.average_token_embedding(
          utils.find_tk_in_model(
            en_vocab.lookup(tk_id, reverse=True), embedding_model),
          embedding_model,
          embedding_size=FLAGS.pretrained_embedding_size)
        for tk_id in xrange(n_special_tks, en_vocab.size)]
      pretrained_embeddings = np.vstack(token_vectors)

    # Model configuration and initialization.
    de_vocab = envs[0].de_vocab
    n_mem = FLAGS.max_n_mem
    lm_loss_enabled = FLAGS.lm_loss_coeff > 0.0

    aux_loss_list = [('ent_reg', FLAGS.entropy_reg_coeff)]
    if lm_loss_enabled:
      aux_loss_list.append(('en_lm_loss', FLAGS.lm_loss_coeff))

    graph_config = {
      'core_config': dict(
        max_n_valid_indices=FLAGS.max_n_valid_indices,
        n_mem=n_mem,
        n_builtin=de_vocab.size - n_mem,
        use_attn=True,
        attn_size=FLAGS.attn_size,
        attn_vec_size=FLAGS.attn_vec_size,
        input_vocab_size=de_vocab.size,
        en_input_vocab_size=en_vocab.size,
        hidden_size=FLAGS.hidden_size,
        n_layers=FLAGS.n_layers,
        en_hidden_size=FLAGS.hidden_size,
        en_n_layers=FLAGS.en_n_layers,
        en_use_embeddings=True,
        en_embedding_size=FLAGS.en_embedding_size,
        value_embedding_size=FLAGS.value_embedding_size,
        en_pretrained_vocab_size=en_vocab.size - n_special_tks,
        en_pretrained_embedding_size=FLAGS.pretrained_embedding_size,
        add_lm_loss=lm_loss_enabled,
        en_bidirectional=FLAGS.en_bidirectional,
        en_attn_on_constants=FLAGS.en_attn_on_constants),
      'use_gpu': use_gpu,
      'gpu_id': gpu_id,
      'output_type': 'softmax',
      'output_config': dict(
        output_vocab_size=de_vocab.size, use_logits=True),
      'train_config': dict(
        aux_loss_list=aux_loss_list,
        learning_rate=FLAGS.learning_rate,
        max_grad_norm=FLAGS.max_grad_norm,
        adam_beta1=FLAGS.adam_beta1,
        l2_coeff=FLAGS.l2_coeff,
        optimizer=FLAGS.optimizer,
        avg_loss_by_n=False),
    }

    agent = create_agent(
      graph_config, get_init_model_path(),
      pretrained_embeddings=pretrained_embeddings)

  # Written in both branches, so the file always reflects the config in use.
  config_path = os.path.join(get_experiment_dir(), 'graph_config.json')
  with open(config_path, 'w') as config_fp:
    json.dump(graph_config, config_fp, sort_keys=True, indent=2)

  return agent, envs
Example #5
0
def main(unused_argv):
    """Preprocess the WikiTableQuestions data found in FLAGS.raw_input_dir.

    Files written under FLAGS.processed_input_dir:
      * tables.jsonl: one preprocessed table per line
      * test_table.json: the single table 't_203_375'
      * train_examples.jsonl and test_split.jsonl
      * data_split_{1..5}/train_split.jsonl, dev_split.jsonl and
        train_split_shard_{n}-{i}.jsonl
      * en_vocab_min_count_{1..10}.json

    Args:
        unused_argv: ignored.

    Raises:
        ValueError: if a file in a tagged table folder does not match the
            expected '<first>-tagged/<second>.tagged' layout, or if an
            example id belongs to neither the train nor the dev split.
    """
    assert tf.gfile.Exists(FLAGS.raw_input_dir)
    if not tf.gfile.Exists(FLAGS.processed_input_dir):
        tf.gfile.MkDir(FLAGS.processed_input_dir)

    data_folder = os.path.join(FLAGS.raw_input_dir,
                               'WikiTableQuestions/tagged')
    table_file = os.path.join(FLAGS.processed_input_dir, 'tables.jsonl')
    test_table_file = os.path.join(FLAGS.processed_input_dir,
                                   'test_table.json')
    stop_words_file = os.path.join(FLAGS.raw_input_dir, 'stop_words.json')
    train_file = os.path.join(FLAGS.processed_input_dir,
                              'train_examples.jsonl')

    train_tagged = os.path.join(
        FLAGS.raw_input_dir, 'WikiTableQuestions/tagged/data/training.tagged')
    test_tagged = os.path.join(
        FLAGS.raw_input_dir,
        'WikiTableQuestions/tagged/data/pristine-unseen-tables.tagged')

    # Every entry of the tagged folder except 'data' is scanned for tables.
    subdirs = os.listdir(data_folder)
    subdirs.remove('data')

    # Preprocess the tables.
    table_path_pattern = re.compile(
        r'.*/(?P<first>[0-9]*)-tagged/(?P<second>[0-9]*)\.tagged')
    table_dict = {}
    t1 = time.time()
    for d in subdirs:
        for fn in os.listdir(os.path.join(data_folder, d)):
            full_path = os.path.join(data_folder, d, fn)
            m = table_path_pattern.match(full_path)
            if m is None:
                # Fail with the offending path instead of an AttributeError
                # on the missing match object.
                raise ValueError(
                    'unexpected file in tagged table folder: {}'.format(
                        full_path))
            table_name = 't_{}_{}'.format(m.group('first'), m.group('second'))
            kg = table2kg(
                table_name,
                data_folder,
                max_n_tokens_for_num_prop=FLAGS.max_n_tokens_for_num_prop,
                min_frac_for_ordered_prop=FLAGS.min_frac_for_ordered_prop)
            kg['name'] = table_name
            table_dict[table_name] = kg
    t2 = time.time()
    print('{} sec used processing the tables.'.format(t2 - t1))
    # NOTE(review): n_total_num, n_filtered_num and n_date_and_num are not
    # defined in this function; presumably module-level counters -- confirm.
    print('total number of number cells: {}'.format(n_total_num))
    print('total number of filtered number cells: {}'.format(n_filtered_num))
    print('filtered ration: {}'.format(n_filtered_num * 1.0 / n_total_num))
    print('date and number ratio: {}'.format(n_date_and_num * 1.0 /
                                             n_total_num))

    # Save the preprocessed test table.
    with open(test_table_file, 'w') as f:
        json.dump({'t_203_375': table_dict['t_203_375']}, f)

    # Save the preprocessed tables, one JSON object per line.
    t1 = time.time()
    with open(table_file, 'w') as f:
        for i, (k, v) in enumerate(table_dict.iteritems()):
            if i % 1000 == 0:
                print('number {}'.format(i))
            f.write(json.dumps(v))
            f.write('\n')
    t2 = time.time()
    print('{} sec used dumping tables'.format(t2 - t1))

    df = create_df_from_wtq_questions(train_tagged)

    with open(stop_words_file, 'r') as f:
        stop_words = set(json.load(f))

    t1 = time.time()
    examples = collect_examples_from_df(df, table_dict, stop_words)
    t2 = time.time()
    print('{} sec used collecting train examples.'.format(t2 - t1))

    dump_examples(examples, train_file)

    for split_id in xrange(1, 6):
        processed_input_dir = os.path.join(FLAGS.processed_input_dir,
                                           'data_split_{}'.format(split_id))
        if not tf.gfile.Exists(processed_input_dir):
            tf.gfile.MkDir(processed_input_dir)

        train_split_tsv = os.path.join(
            FLAGS.raw_input_dir,
            'WikiTableQuestions/data/random-split-{}-train.tsv'.format(
                split_id))
        dev_split_tsv = os.path.join(
            FLAGS.raw_input_dir,
            'WikiTableQuestions/data/random-split-{}-dev.tsv'.format(split_id))

        # Create all the splitted datasets.
        train_df = create_df_from_wtq_questions(train_split_tsv)
        dev_df = create_df_from_wtq_questions(dev_split_tsv)

        assert len(train_df) + len(dev_df) == len(df)

        # Route every collected example to its split by id.
        train_ids = set(train_df['id'])
        dev_ids = set(dev_df['id'])
        train_examples = []
        dev_examples = []
        for e in examples:
            if e['id'] in train_ids:
                train_examples.append(e)
            elif e['id'] in dev_ids:
                dev_examples.append(e)
            else:
                raise ValueError('id {} not found'.format(e['id']))
        assert len(train_examples) + len(dev_examples) == len(df)

        train_split_jsonl = os.path.join(processed_input_dir,
                                         'train_split.jsonl')
        dev_split_jsonl = os.path.join(processed_input_dir, 'dev_split.jsonl')

        dump_examples(train_examples, train_split_jsonl)
        dump_examples(dev_examples, dev_split_jsonl)

        # Distribute the train examples round-robin over the shards.
        train_shards = [[] for _ in range(FLAGS.n_train_shard)]
        for i, e in enumerate(train_examples):
            train_shards[i % FLAGS.n_train_shard].append(e)

        for i, sh in enumerate(train_shards):
            train_shard_jsonl = os.path.join(
                processed_input_dir,
                'train_split_shard_{}-{}.jsonl'.format(FLAGS.n_train_shard, i))
            dump_examples(sh, train_shard_jsonl)

    test_df = create_df_from_wtq_questions(test_tagged)
    t1 = time.time()
    test_examples = collect_examples_from_df(test_df, table_dict, stop_words)
    t2 = time.time()
    print('{} sec used collecting test examples.'.format(t2 - t1))

    test_split_jsonl = os.path.join(FLAGS.processed_input_dir,
                                    'test_split.jsonl')
    dump_examples(test_examples, test_split_jsonl)

    # Load pretrained embeddings.
    vocab_file = os.path.join(FLAGS.raw_input_dir,
                              "wikitable_glove_vocab.json")
    embedding_file = os.path.join(FLAGS.raw_input_dir,
                                  "wikitable_glove_embedding_mat.npy")
    embedding_model = word_embeddings.EmbeddingModel(vocab_file,
                                                     embedding_file)

    def create_vocab(examples, embedding_model, min_count):
        """Build a vocab from the tokens of `examples` found in the model."""
        token_count = {}
        for e in examples:
            for tk in e['tokens']:
                # Only tokens found in the embedding model are counted;
                # min_count is applied by generate_vocab_from_token_count.
                if find_tk_in_model(tk, embedding_model):
                    token_count[tk] = token_count.get(tk, 0) + 1
        return data_utils.generate_vocab_from_token_count(
            token_count, min_count=min_count)

    # The lists left over from the last split (split 5) are reused here; the
    # asserts above guarantee they hold len(df) examples between them.
    vocab_examples = train_examples + dev_examples
    for i in xrange(1, 11):
        en_vocab = create_vocab(vocab_examples, embedding_model, i)
        en_vocab_file = os.path.join(FLAGS.processed_input_dir,
                                     "en_vocab_min_count_{}.json".format(i))
        with open(en_vocab_file, 'w') as f:
            json.dump(en_vocab.vocab, f, sort_keys=True, indent=2)
        print('min_tk_count: {}, vocab size: {}'.format(
            i, len(en_vocab.vocab)))
def _read_jsonl_file(path):
    """Return the parsed JSON object of every line of the file at `path`."""
    with open(path, 'r') as f:
        return [json.loads(line) for line in f]


def _read_tables_by_id(path):
    """Map each table's 'id' to the table, for the JSONL file at `path`."""
    return {table['id']: table for table in _read_jsonl_file(path)}


def _attach_gold_answers(questions, path):
    """Store the answers from the JSON file at `path` as q['answer'].

    Questions and answers are paired positionally with zip, so surplus items
    on either side are ignored.
    """
    with open(path, 'r') as f:
        answers = json.load(f)
    for q, ans in zip(questions, answers):
        q['answer'] = ans


def _annotate_split(questions, split_name, kg_dict, stop_words, output_path):
    """Annotate `questions`, dump them to `output_path` and return them.

    Each example gets the name '<split_name>-<index>'.
    """
    t1 = time.time()
    examples = []
    print('start annotating {} examples.'.format(split_name))
    for i, q in enumerate(questions):
        if i % 500 == 0:
            print(i)
        e = annotate_question(
            q, '{}-{}'.format(split_name, i), kg_dict, stop_words)
        expand_entities(e, kg_dict)
        examples.append(e)
    t2 = time.time()
    print('{} sec used annotating {} examples.'.format(t2 - t1, split_name))
    dump_examples(examples, output_path)
    return examples


def main(unused_argv):
    """Preprocess the WikiSQL data found in FLAGS.raw_input_dir.

    Files written under FLAGS.processed_input_dir:
      * tables.jsonl: one converted table per line
      * train_split.jsonl, dev_split.jsonl and test_split.jsonl
      * train_split_shard_{n}-{i}.jsonl
      * en_vocab_min_count_{1..10}.json

    Args:
        unused_argv: ignored.

    Raises:
        ValueError: if a '-number' property holds a value that is neither int
            nor float, or a '-string' property holds a non-unicode value.
    """
    assert tf.gfile.Exists(FLAGS.raw_input_dir)
    if not tf.gfile.Exists(FLAGS.processed_input_dir):
        tf.gfile.MkDir(FLAGS.processed_input_dir)

    raw_dir = FLAGS.raw_input_dir
    table_file = os.path.join(FLAGS.processed_input_dir, 'tables.jsonl')
    stop_words_file = os.path.join(raw_dir, 'stop_words.json')

    with open(stop_words_file, 'r') as f:
        stop_words = json.load(f)

    # Load datasets.
    train_set = _read_jsonl_file(os.path.join(raw_dir, 'train.jsonl'))
    dev_set = _read_jsonl_file(os.path.join(raw_dir, 'dev.jsonl'))
    test_set = _read_jsonl_file(os.path.join(raw_dir, 'test.jsonl'))

    # Load tables.
    train_table_dict = _read_tables_by_id(
        os.path.join(raw_dir, 'train.tables.jsonl'))
    dev_table_dict = _read_tables_by_id(
        os.path.join(raw_dir, 'dev.tables.jsonl'))
    test_table_dict = _read_tables_by_id(
        os.path.join(raw_dir, 'test.tables.jsonl'))

    # Collect all the tables; for an id present in several dicts, the entry
    # from the last dict in the list wins.
    print('Start collecting all the tables.')
    kg_dict = {}
    for tb_dict in [dev_table_dict, train_table_dict, test_table_dict]:
        for i, (k, v) in enumerate(tb_dict.iteritems()):
            if i % 1000 == 0:
                print(i)
            kg_dict[k] = table2kg(v)

    # Check if the string or number value has the correct type.
    for kg in kg_dict.values():
        for props in kg['kg'].itervalues():
            for prop, val in props.iteritems():
                if prop[-7:] == '-number':
                    for num in val:
                        if not isinstance(num, (int, float)):
                            print(kg)
                            raise ValueError(
                                'non-numeric value {!r} in property '
                                '{!r}'.format(num, prop))
                if prop[-7:] == '-string':
                    for num in val:
                        if not isinstance(num, unicode):
                            print(kg)
                            raise ValueError(
                                'non-unicode value {!r} in property '
                                '{!r}'.format(num, prop))

    # Save the tables.
    with open(table_file, 'w') as f:
        for v in kg_dict.itervalues():
            f.write(json.dumps(v) + '\n')

    # Load the gold answers.
    _attach_gold_answers(dev_set, os.path.join(raw_dir, 'dev_gold.json'))
    _attach_gold_answers(train_set, os.path.join(raw_dir, 'train_gold.json'))
    _attach_gold_answers(test_set, os.path.join(raw_dir, 'test_gold.json'))

    # Annotate the examples and dump to files.
    train_split_jsonl = os.path.join(FLAGS.processed_input_dir,
                                     'train_split.jsonl')
    dev_split_jsonl = os.path.join(FLAGS.processed_input_dir,
                                   'dev_split.jsonl')
    test_split_jsonl = os.path.join(FLAGS.processed_input_dir,
                                    'test_split.jsonl')

    dev_examples = _annotate_split(
        dev_set, 'dev', kg_dict, stop_words, dev_split_jsonl)
    train_examples = _annotate_split(
        train_set, 'train', kg_dict, stop_words, train_split_jsonl)
    # Only the dumped test file is needed afterwards, so the returned list
    # is not kept.
    _annotate_split(test_set, 'test', kg_dict, stop_words, test_split_jsonl)

    # Distribute the train examples round-robin over the shards.
    train_shards = [[] for _ in range(FLAGS.n_train_shard)]
    for i, e in enumerate(train_examples):
        train_shards[i % FLAGS.n_train_shard].append(e)

    for i, sh in enumerate(train_shards):
        train_shard_jsonl = os.path.join(
            FLAGS.processed_input_dir,
            'train_split_shard_{}-{}.jsonl'.format(FLAGS.n_train_shard, i))
        dump_examples(sh, train_shard_jsonl)

    # Load pretrained embeddings.
    vocab_file = os.path.join(raw_dir, "wikisql_glove_vocab.json")
    embedding_file = os.path.join(raw_dir,
                                  "wikisql_glove_embedding_mat.npy")
    embedding_model = word_embeddings.EmbeddingModel(vocab_file,
                                                     embedding_file)

    # One encoder vocabulary per minimum token count in 1..10, built from
    # the train and dev examples.
    vocab_examples = train_examples + dev_examples
    for i in xrange(1, 11):
        en_vocab = create_vocab(vocab_examples, embedding_model, i)
        en_vocab_file = os.path.join(FLAGS.processed_input_dir,
                                     "en_vocab_min_count_{}.json".format(i))
        with open(en_vocab_file, 'w') as f:
            json.dump(en_vocab.vocab, f, sort_keys=True, indent=2)
        print('min_tk_count: {}, vocab size: {}'.format(
            i, len(en_vocab.vocab)))
Example #7
0
    def eval_examples(self, envs):
        """Pick examples by sampling from k-means clusters of embeddings.

        The environments built from `envs` are embedded with
        average_token_embedding over the last element of their context, the
        normalized embeddings are clustered with k-means, and a fixed number
        of environment names is drawn without replacement from each of the
        largest clusters.

        Args:
            envs: sequence of example dicts, each with an 'id' key. It is
                iterated more than once, so it must not be a one-shot
                iterator.

        Returns:
            A list with one (score, id) tuple per element of `envs`, where
            score is 1.0 if the id was chosen and 0.0 otherwise.
        """
        # declare some constant params
        CLUSTER_NUM = 500
        CLUSTER_SAMPLE_SIZE = 15
        FIRST_N_CLUSTERS = 200
        assert (FIRST_N_CLUSTERS * CLUSTER_SAMPLE_SIZE == 3 *
                FLAGS.al_budget_n)

        # pure clustering, no failed information: every example is a
        # candidate, so the filter below keeps all of `envs`.
        failed_env_names_set = {env['id'] for env in envs}

        # get the questions embedding for every environment
        embedding_model = word_embeddings.EmbeddingModel(
            FLAGS.vocab_file, FLAGS.embedding_file)
        failed_envs = json_to_envs(
            [env_json for env_json in envs
             if env_json['id'] in failed_env_names_set])

        # Real lists (not map/filter results) so len() and np.vstack() do
        # not depend on Python 2's list-returning map/filter.
        failed_env_names = [env.name for env in failed_envs]
        embedding_matrix = preprocessing.normalize(
            np.vstack([
                average_token_embedding(env.context[-1], embedding_model)
                for env in failed_envs
            ]),
            copy=False)

        # run a k-means algorithm on this to get clusters
        print('##################################')
        print(
            'Start running k-means algorithm on %d examples... (this could take a while)'
            % len(failed_env_names))
        print('##################################')
        labels = KMeans(n_clusters=CLUSTER_NUM,
                        random_state=0).fit(embedding_matrix).labels_
        print('##################################')
        print('K-means running done!')
        print('##################################')

        # put env into clusters and index by name, largest cluster first
        name_array = np.array(failed_env_names)
        env_name_clusters = sorted(
            (name_array[labels == i].tolist() for i in range(CLUSTER_NUM)),
            key=len,
            reverse=True)
        # The FIRST_N_CLUSTERS-th largest cluster must be big enough to
        # sample from without replacement.
        assert (len(env_name_clusters[FIRST_N_CLUSTERS - 1]) >=
                CLUSTER_SAMPLE_SIZE)

        # choose CLUSTER_SAMPLE_SIZE examples from the first FIRST_N_CLUSTERS
        # clusters; clusters are visited in sorted order so the sequence of
        # np.random.choice calls is unchanged.
        chosen_env_names = set()
        for cluster in env_name_clusters[:FIRST_N_CLUSTERS]:
            chosen_env_names.update(
                np.random.choice(
                    cluster, CLUSTER_SAMPLE_SIZE, replace=False).tolist())

        return [(1.0 if env['id'] in chosen_env_names else 0.0, env['id'])
                for env in envs]