def create_complete_dataset():
    """Build the train, val and test record datasets and persist the embeddings.

    Side effects only: writes record files to the configured paths and saves
    the embedding matrix/vocab under data_conf.EMBEDDING_DIR.
    """
    emb = load_and_create_vocab()
    # NOTE(review): 'train' and 'test' both read movie.cfg.QA_JSON while 'val'
    # reads data_conf.EVAL_FILE — confirm this asymmetry is intentional.
    create_data_set(movie.cfg.QA_JSON, 'train', data_conf.TRAIN_RECORD_PATH, emb)
    create_data_set(data_conf.EVAL_FILE, 'val', data_conf.EVAL_RECORD_PATH, emb)
    create_data_set(movie.cfg.QA_JSON, 'test', data_conf.TEST_RECORD_PATH, emb)
    print("saving embeddings")
    util.save_embeddings(data_conf.EMBEDDING_DIR, data_conf.EMBEDDING_SIZE)
def create_200_random_validation_dataset(qa_ids_file):
    """Build a validation record set restricted to the QA ids in *qa_ids_file*.

    Args:
        qa_ids_file: path to a file listing the 200 randomly chosen QA ids.

    Side effects: creates RECORD_DIR/val_random_200 (if missing), writes the
    records there, and saves the embedding matrix/vocab.
    """
    embeddings = load_and_create_vocab()
    outfolder = os.path.join(data_conf.RECORD_DIR, 'val_random_200')
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(outfolder, exist_ok=True)
    qa_ids = read_qa_ids(qa_ids_file)
    create_movieqa_data(movie.cfg.QA_JSON, 'val', outfolder, embeddings, qa_ids)
    print("saving embeddings")
    util.save_embeddings(data_conf.EMBEDDING_DIR, data_conf.EMBEDDING_SIZE)
def create_validation_dataset(split):
    """Create eval records for *split* and return the updated vocabulary size.

    Args:
        split: dataset split name passed through to create_movieqa_data.

    Returns:
        The vocabulary size after saving (vocab may have grown with new words).
    """
    print("Prepare embeddings for modified input ...")
    vocab_embeddings = load_and_create_vocab()
    create_movieqa_data(data_conf.EVAL_FILE, split, data_conf.EVAL_RECORD_PATH, vocab_embeddings)
    # Persist the vocab, which now includes any words first seen above.
    return util.save_embeddings(data_conf.EMBEDDING_DIR, data_conf.EMBEDDING_SIZE)
import os
import sys

# Make the repository root importable when run as a script.
present_path = os.path.dirname(os.path.realpath(sys.argv[0]))
sys.path.append(os.path.join(present_path, '../../'))

import core.util as util
import movieqa.data_conf as data_conf

# Load pretrained GloVe vectors and restore the previously saved vocabulary.
glove = util.loadGloveModel(data_conf.PRETRAINED_EMBEDDINGS_PATH)
util.restore_vocab(data_conf.EMBEDDING_DIR)
print("Restored vocab")

# Look up a vector for every common English word so the vocabulary covers them.
filename = "adversarial_addAny/common_english.txt"
# 'with' guarantees the file handle is closed (the original leaked it).
with open(filename, encoding="utf8") as fin:
    for line in fin:
        word = line.rstrip('\n')
        print("get word vector for %s" % word)
        # NOTE(review): return value is discarded (it was unused in the
        # original too) — presumably get_word_vector registers the word in
        # the restored vocab as a side effect; confirm in core.util.
        util.get_word_vector(glove, word, data_conf.EMBEDDING_SIZE)

vsize = util.save_embeddings(data_conf.EMBEDDING_DIR, data_conf.EMBEDDING_SIZE)
print("New vocabulary size %d" % vsize)