Example #1
0
def main(args):
  utility = Utility()
  train_name = "random-split-1-train.examples"
  dev_name = "random-split-1-dev.examples"
  test_name = "pristine-unseen-tables.examples"
  #load data
  dat = wiki_data.WikiQuestionGenerator(train_name, dev_name, test_name, FLAGS.data_dir)
  train_data, dev_data, test_data = dat.load()
  utility.words = []
  utility.word_ids = {}
  utility.reverse_word_ids = {}
  #construct vocabulary
  data_utils.construct_vocab(train_data, utility)
  data_utils.construct_vocab(dev_data, utility, True)
  data_utils.construct_vocab(test_data, utility, True)
  data_utils.add_special_words(utility)
  data_utils.perform_word_cutoff(utility)
  #convert data to int format and pad the inputs
  train_data = data_utils.complete_wiki_processing(train_data, utility, True)
  dev_data = data_utils.complete_wiki_processing(dev_data, utility, False)
  test_data = data_utils.complete_wiki_processing(test_data, utility, False)
  print "# train examples ", len(train_data)
  print "# dev examples ", len(dev_data)
  print "# test examples ", len(test_data)
  print "running open source"
  #construct TF graph and train or evaluate
  master(train_data, dev_data, utility)
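For context, main() relies on module-level pieces this excerpt does not show: the wiki_data and data_utils modules, the Utility container class, the master() training/evaluation routine, and a FLAGS object providing data_dir. A minimal entry point could look like the sketch below; the flag name, default path, and use of the TF 1.x tf.app helpers are assumptions for illustration, not lines from the original file.
import tensorflow as tf

# Hypothetical flag definition and entry point (TF 1.x style); point data_dir
# at the directory holding the WikiTableQuestions .examples files.
tf.app.flags.DEFINE_string("data_dir", "data/",
                           "Directory with the .examples files.")
FLAGS = tf.app.flags.FLAGS

if __name__ == "__main__":
  tf.app.run(main=main)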
Example #2
0
def main(args):
    utility = Utility()
    train_name = "random-split-1-train.examples"
    dev_name = "random-split-1-dev.examples"
    test_name = "pristine-unseen-tables.examples"
    #load data
    dat = wiki_data.WikiQuestionGenerator(train_name, dev_name, test_name,
                                          FLAGS.data_dir)
    train_data, dev_data, test_data = dat.load()
    utility.words = []
    utility.word_ids = {}
    utility.reverse_word_ids = {}
    #construct vocabulary
    data_utils.construct_vocab(train_data, utility)
    data_utils.construct_vocab(dev_data, utility, True)
    data_utils.construct_vocab(test_data, utility, True)
    data_utils.add_special_words(utility)
    data_utils.perform_word_cutoff(utility)
    #convert data to int format and pad the inputs
    train_data = data_utils.complete_wiki_processing(train_data, utility, True)
    dev_data = data_utils.complete_wiki_processing(dev_data, utility, False)
    test_data = data_utils.complete_wiki_processing(test_data, utility, False)
    print "# train examples ", len(train_data)
    print "# dev examples ", len(dev_data)
    print "# test examples ", len(test_data)
    print "running open source"
    #construct TF graph and train or evaluate
    master(train_data, dev_data, utility)
def init_data(
    data_dir,
    preserve_vocab=False,
    split_filenames={
        'train': 'random-split-1-train.examples',
        'dev': 'random-split-1-dev.examples',
        'test': 'pristine-unseen-tables.examples'
    },
    annotated_filenames={
        'train': 'training.annotated',
        'test': 'pristine-unseen-tables.annotated'
    }):
    """ Load WikiTableQuestions data. 
    preserve_vocab is used when perturbed data is loaded, 
    in which case special words are given hard-coded ids
    to match that of the unperturbed data case
    """
    utility = Utility()
    train_name = split_filenames['train']
    dev_name = split_filenames['dev']
    test_name = split_filenames['test']
    # load data
    dat = wiki_data.WikiQuestionGenerator(train_name, dev_name, test_name,
                                          data_dir)
    train_data, dev_data, test_data = dat.load(annotated_filenames)
    utility.words = []
    utility.word_ids = {}
    utility.reverse_word_ids = {}
    # construct vocabulary
    data_utils.construct_vocab(train_data, utility)
    data_utils.construct_vocab(dev_data, utility, True)
    data_utils.construct_vocab(test_data, utility, True)
    data_utils.add_special_words(utility)
    # set absolute word_ids for special words
    if preserve_vocab:
        print("hardcoded ids for special words")
        # Swap each special token into a fixed vocabulary id so that runs on
        # perturbed data line up with the ids used by the unperturbed baseline.
        for fixed_id, token in [(9133, utility.entry_match_token),
                                (9134, utility.column_match_token),
                                (9135, utility.dummy_token),
                                (9136, utility.unk_token)]:
            word_to_swap = utility.reverse_word_ids[fixed_id]
            word_id_to_swap = utility.word_ids[token]
            # Move the word currently holding fixed_id onto the token's old id,
            # then pin the token at fixed_id and update the reverse map.
            utility.word_ids[word_to_swap] = word_id_to_swap
            utility.word_ids[token] = fixed_id
            utility.reverse_word_ids[word_id_to_swap] = word_to_swap
            utility.reverse_word_ids[fixed_id] = token
        utility.entry_match_token_id = utility.word_ids[
            utility.entry_match_token]
        utility.column_match_token_id = utility.word_ids[
            utility.column_match_token]
        utility.dummy_token_id = utility.word_ids[utility.dummy_token]
        utility.unk_token_id = utility.word_ids[utility.unk_token]

        print(utility.entry_match_token_id, utility.column_match_token_id,
              utility.dummy_token_id, utility.unk_token_id)

    data_utils.perform_word_cutoff(utility)
    unprocessed_dev_data = copy.deepcopy(dev_data)
    # convert data to int format and pad the inputs
    train_data = data_utils.complete_wiki_processing(train_data, utility, True)
    dev_data = data_utils.complete_wiki_processing(dev_data, utility, False)
    test_data = data_utils.complete_wiki_processing(test_data, utility, False)
    print(("# train examples ", len(train_data)))
    print(("# dev examples ", len(dev_data)))
    print(("# test examples ", len(test_data)))
    return train_data, dev_data, test_data, utility, unprocessed_dev_data
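When only the data pipeline is needed, init_data can stand in for the loading half of main(). The call below is a hypothetical usage sketch: the directory path is a placeholder, and preserve_vocab=True illustrates the perturbed-data case where the special-token ids must match a baseline run.
# Hypothetical usage; the path is a placeholder and the split/annotated file
# names fall back to the defaults baked into init_data's signature.
train_data, dev_data, test_data, utility, raw_dev_data = init_data(
    "data/wikitable-perturbed/", preserve_vocab=True)
print("dev examples:", len(dev_data),
      "unprocessed dev examples:", len(raw_dev_data))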