import os
import pickle as pkl

import numpy as np
from os import path

# PrepareData, get_params and the symbol/index constants used below
# (start_symbol_index, end_symbol_index, unk_symbol_index, pad_symbol_index,
# kb_pad_idx, nkb) are assumed to come from the surrounding project; they are
# referenced here but not defined in these snippets.


def prepare_data(root_path):
    param = get_params(root_path,
                       os.path.join(root_path, 'text_task_resnet'),
                       test_state=None)

    test_dir_loc = os.path.join(root_path, "history")
    dump_dir_loc = param['dump_dir_loc']
    vocab_file = param['vocab_file']
    vocab_stats_file = param['vocab_stats_file']
    vocab_freq_cutoff = param['vocab_freq_cutoff']
    test_data_file = param['test_data_file']

    max_utter = param['max_utter']
    max_len = param['max_len']
    max_images = param['max_images']
    preparedata = PrepareData(max_utter,
                              max_len,
                              max_images,
                              start_symbol_index,
                              end_symbol_index,
                              unk_symbol_index,
                              pad_symbol_index,
                              "text",
                              cutoff=vocab_freq_cutoff)
    if os.path.isfile(vocab_file):
        print('found existing vocab file in ' + str(vocab_file) +
              ', ... reading from there')
    preparedata.prepare_data(test_dir_loc,
                             vocab_file,
                             vocab_stats_file,
                             os.path.join(dump_dir_loc, "test_smallest"),
                             test_data_file,
                             isTrain=False,
                             isTest=True)
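

# Usage sketch for prepare_data (hypothetical root path): the directory is
# expected to contain a "history" dialogue folder and the params file under
# <root>/text_task_resnet that supplies vocab_file, dump_dir_loc, etc.
def _demo_prepare_data():
    prepare_data('/path/to/mmd_root')
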
def get_dialog_dict(param, is_test=False):
    train_dir_loc = param['train_dir_loc']
    valid_dir_loc = param['valid_dir_loc']
    test_dir_loc = param['test_dir_loc']
    dump_dir_loc = param['dump_dir_loc']
    vocab_file = param['vocab_file']
    vocab_stats_file = param['vocab_stats_file']
    vocab_freq_cutoff = param['vocab_freq_cutoff']
    train_data_file = param['train_data_file']
    valid_data_file = param['valid_data_file']
    test_data_file = param['test_data_file']
    max_utter = param['max_utter']
    max_len = param['max_len']
    max_images = param['max_images']
    max_negs = param['max_negs']
    if 'test_state' in param:
        test_state = param['test_state']
    else:
        test_state = None
    preparedata = PrepareData(max_utter, max_len, max_images, max_negs, start_symbol_index, end_symbol_index,
                              unk_symbol_index, pad_symbol_index, "image", cutoff=vocab_freq_cutoff)
    if os.path.isfile(vocab_file):
        print('found existing vocab file in ' + str(vocab_file) + ', ... reading from there')
    if not is_test:
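        # the positional args after the data file are isTrain, isTest and
        # test_state (cf. the keyword call in prepare_data above)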
        preparedata.prepare_data(train_dir_loc, vocab_file, vocab_stats_file, os.path.join(dump_dir_loc, "train"),
                                 train_data_file, True, False, None)
        preparedata.prepare_data(valid_dir_loc, vocab_file, vocab_stats_file, os.path.join(dump_dir_loc, "valid"),
                                 valid_data_file, False, False, None)
    if test_state is not None:
        preparedata.prepare_data(test_dir_loc, vocab_file, vocab_stats_file,
                                 os.path.join(dump_dir_loc + "/test_data_file_state/", "test_" + test_state),
                                 test_data_file, False, True, test_state)
    else:
        preparedata.prepare_data(test_dir_loc, vocab_file, vocab_stats_file, os.path.join(dump_dir_loc, "test"),
                                 test_data_file, False, True, test_state)
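

# Usage sketch for get_dialog_dict: the dict mirrors the keys read at the top
# of the function; every path and size below is a placeholder, not a value
# from the original configuration.
def _demo_get_dialog_dict():
    example_param = {
        'train_dir_loc': 'data/train', 'valid_dir_loc': 'data/valid',
        'test_dir_loc': 'data/test', 'dump_dir_loc': 'data/dump',
        'vocab_file': 'data/vocab.pkl',
        'vocab_stats_file': 'data/vocab_stats.txt',
        'vocab_freq_cutoff': 5,
        'train_data_file': 'data/dump/train.pkl',
        'valid_data_file': 'data/dump/valid.pkl',
        'test_data_file': 'data/dump/test.pkl',
        'max_utter': 2, 'max_len': 20, 'max_images': 5, 'max_negs': 5,
        # 'test_state' is optional; without it the full test split is prepared
    }
    get_dialog_dict(example_param, is_test=False)
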
def get_dialog_dict_for_test(param):
    test_dir_loc = param['test_dir_loc']
    dump_dir_loc = param['dump_dir_loc']
    vocab_file = param['vocab_file']
    vocab_stats_file = param['vocab_stats_file']
    response_vocab_file = param['response_vocab_file']
    vocab_freq_cutoff = param['vocab_freq_cutoff']
    test_data_file = param['test_data_file']
    max_utter = param['max_utter']
    max_len = param['max_len']
    stopwords = param['stopwords']
    stopwords_histogram = param['stopwords_histogram']
    max_mem_size = param['memory_size']
    max_target_size = param['gold_target_size']
    ques_type_id = param['ques_type_id']
    ques_type_name = param['ques_type_name']
    vocab_max_len = param['vocab_max_len']
    wikidata_dir = param['wikidata_dir']
    lucene_dir = param['lucene_dir']
    transe_dir = param['transe_dir']
    glove_dir = param['glove_dir']
    preparedata = PrepareData(max_utter, max_len, start_symbol_index, end_symbol_index, unk_symbol_index,
                              pad_symbol_index, kb_pad_idx, nkb, stopwords, stopwords_histogram, lucene_dir, transe_dir,
                              wikidata_dir, glove_dir, max_mem_size, max_target_size, vocab_max_len, True,
                              cutoff=vocab_freq_cutoff)
    if os.path.isfile(vocab_file):
        print('found existing vocab file in ' + str(vocab_file) + ', ... reading from there')
    print('to delete later ', os.path.join(dump_dir_loc, "train"))
    preparedata.prepare_data(test_dir_loc, vocab_file, vocab_stats_file, response_vocab_file,
                             os.path.join(dump_dir_loc, "test"), test_data_file, ques_type_id, ques_type_name)
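

# Usage sketch for get_dialog_dict_for_test: keys match the ones read above;
# all values are placeholders (the real config points at the wikidata /
# lucene / transe / glove resources shipped with the project).
def _demo_get_dialog_dict_for_test():
    example_param = {
        'test_dir_loc': 'data/test', 'dump_dir_loc': 'data/dump',
        'vocab_file': 'data/vocab.pkl',
        'vocab_stats_file': 'data/vocab_stats.txt',
        'response_vocab_file': 'data/response_vocab.pkl',
        'vocab_freq_cutoff': 5, 'test_data_file': 'data/dump/test.pkl',
        'max_utter': 2, 'max_len': 20,
        'stopwords': 'data/stopwords.pkl',
        'stopwords_histogram': 'data/stopwords_histogram.txt',
        'memory_size': 10, 'gold_target_size': 10,
        'ques_type_id': 1, 'ques_type_name': 'simple',
        'vocab_max_len': 30000,
        'wikidata_dir': 'data/wikidata', 'lucene_dir': 'data/lucene',
        'transe_dir': 'data/transe', 'glove_dir': 'data/glove',
    }
    get_dialog_dict_for_test(example_param)
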
def read_data(root_path):
    # pickled dict mapping image URLs to rows of the feature matrix
    ImageUrlToIndex = pkl.load(
        open(path.join(root_path, 'data', 'Img_Fea_Dic.pkl'), 'rb'))
    ImageFea = np.load(path.join(root_path, 'data', 'Img_Fea.npy'))
    param = get_params(root_path)
    train_dir_loc = param['train_dir_loc']
    valid_dir_loc = param['valid_dir_loc']
    test_dir_loc = param['test_dir_loc'].replace('test', 'test_smallest')
    dump_dir_loc = param['dump_dir_loc']
    vocab_file = param['vocab_file']
    vocab_stats_file = param['vocab_stats_file']
    vocab_freq_cutoff = param['vocab_freq_cutoff']
    train_data_file = param['train_data_file']
    valid_data_file = param['valid_data_file']
    test_data_file = param['test_data_file'].replace('test', 'test_smallest')
    # print test_data_file
    # sys.exit(1)
    max_utter = param['max_utter']
    max_len = param['max_len']
    max_images = param['max_images']
    preparedata = PrepareData(max_utter,
                              max_len,
                              max_images,
                              start_symbol_index,
                              end_symbol_index,
                              unk_symbol_index,
                              pad_symbol_index,
                              "text",
                              cutoff=vocab_freq_cutoff)
    if os.path.isfile(vocab_file):
        print('found existing vocab file in ' + str(vocab_file) +
              ', ... reading from there')
    preparedata.prepare_data(test_dir_loc, vocab_file, vocab_stats_file,
                             os.path.join(dump_dir_loc, "test_smallest"),
                             test_data_file)
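

# Usage sketch for read_data (hypothetical root path): the directory is
# expected to hold data/Img_Fea_Dic.pkl and data/Img_Fea.npy alongside the
# params file consumed by get_params; only the "test_smallest" split is
# prepared here.
def _demo_read_data():
    read_data('/path/to/mmd_root')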