def test_oracle(sess, testset, tokenizer, oracle, cpu_pool, batch_size, logger):
    oracle_dataset = OracleDataset(testset)
    oracle_sources = oracle.get_sources(sess)
    oracle_evaluator = Evaluator(oracle_sources, oracle.scope_name, network=oracle, tokenizer=tokenizer)
    oracle_batchifier = OracleBatchifier(tokenizer, oracle_sources, status=('success',))
    oracle_iterator = Iterator(oracle_dataset, pool=cpu_pool, batch_size=batch_size, batchifier=oracle_batchifier)

    oracle_loss, oracle_error = oracle_evaluator.process(sess, oracle_iterator, [oracle.loss, oracle.error])

    logger.info("Oracle test loss: {}".format(oracle_loss))
    logger.info("Oracle test error: {}".format(oracle_error))
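# Hedged usage sketch (not part of the function above): test_oracle is meant to be
# called once a checkpoint has been restored into an open TensorFlow session. Every
# name below (sess, testset, tokenizer, oracle, cpu_pool, logger) is assumed to
# already exist in the calling script, and batch_size=64 is an arbitrary choice.
#
#     test_oracle(sess, testset, tokenizer, oracle,
#                 cpu_pool=cpu_pool, batch_size=64, logger=logger)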
# Zip the data files back up
cmd.append("gzip " + big_gw_data)
cmd.append("gzip " + gw_data)
for c in cmd:
    print(c)
    os.system(c)

print()
print("----------------")
print("Printing the image files")

# Collect the image filenames referenced by each split and dump one index file per split
img_dict = {}
for (n, s) in options:
    game_set = OracleDataset.load(args.data_dir, s)
    for game in game_set.games:
        img_dict[str(game.image.id) + ".jpg"] = 1

    with open(path + s + "_img_index.txt", "w") as f:
        for key in img_dict:
            print(key, file=f)

print()
print(img_dict)

# path of the raw image data
raw_dir = path + "img/raw"
# Load image
image_builder, crop_builder = None, None
use_resnet = False
if config['inputs'].get('image', False):
    logger.info('Loading images..')
    image_builder = get_img_builder(config['model']['image'], args.img_dir)
    use_resnet = image_builder.is_raw_image()

if config['inputs'].get('crop', False):
    logger.info('Loading crops..')
    crop_builder = get_img_builder(config['model']['crop'], args.crop_dir, is_crop=True)
    use_resnet = crop_builder.is_raw_image()

# Load data
logger.info('Loading data..')
trainset = OracleDataset.load(args.data_dir, "train", image_builder, crop_builder)
validset = OracleDataset.load(args.data_dir, "valid", image_builder, crop_builder)
testset = OracleDataset.load(args.data_dir, "test", image_builder, crop_builder)

# Load dictionary
logger.info('Loading dictionary..')
tokenizer = GWTokenizer(os.path.join(args.data_dir, args.dict_file))

# Build Network
logger.info('Building network..')
network = OracleNetwork(config, num_words=tokenizer.no_words)

# Build Optimizer
logger.info('Building optimizer..')
optimizer, outputs = create_optimizer(network, config, finetune=finetune)
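# For illustration only: the block above assumes a config dict with (at least) the
# nesting below, mirroring the accesses config['inputs']['image'/'crop'] and
# config['model']['image'/'crop']. The concrete values are assumptions, not the
# project's actual defaults.
#
#     config = {
#         "inputs": {"image": True, "crop": False},
#         "model": {"image": {...}, "crop": {...}},
#     }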
# CPU/GPU option
cpu_pool = Pool(args.no_thread, maxtasksperchild=1000)
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_ratio)

with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True)) as sess:
    saver = tf.train.Saver()
    saver.restore(sess, args.ckpt)

    features = dict()
    for one_set in args.set_type:
        print("Load dataset -> set: {}".format(one_set))
        dataset = OracleDataset.load(args.data_dir, one_set, image_loader=image_loader, crop_loader=crop_loader)
        batchifier = OracleBatchifier(tokenizer=None, sources=[source])
        iterator = Iterator(dataset, batch_size=args.batch_size, pool=cpu_pool, batchifier=batchifier)

        for batch in tqdm(iterator):
            feat = sess.run(end_points[feature_name], feed_dict={images: numpy.array(batch[source])})
            for f, game in zip(feat, batch["raw"]):
                f = f.squeeze()
                if args.mode == "crop":
                    id = game.object_id
                else:
                    id = game.picture.id
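                # Sketch of the likely continuation (not included in this excerpt): the
                # squeezed feature would typically be stored under `id`, e.g.
                # features[id] = f, and written to disk once the split has been processed.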
    '<padding>': 0,
    '<start>': 1,
    '<stop>': 2,
    '<stop_dialogue>': 3,
    '<unk>': 4,
    '<yes>': 5,
    '<no>': 6,
    '<n/a>': 7,
}

word2occ = collections.defaultdict(int)
tknzr = TweetTokenizer(preserve_case=False)

print("Processing train dataset...")
trainset = OracleDataset.load(args.data_dir, "train")
for game in trainset.games:
    question = game.questions[0]
    tokens = tknzr.tokenize(question)
    for tok in tokens:
        word2occ[tok] += 1

print("Filtering words...")
for word, occ in word2occ.items():
    if occ >= args.min_occ and word.count('.') <= 1:
        word2i[word] = len(word2i)

print("Number of words (occ >= 1): {}".format(len(word2occ)))
print("Number of words (occ >= {}): {}".format(args.min_occ, len(word2i)))

dict_path = os.path.join(args.data_dir, 'dict.json')
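# Minimal sketch (not part of the excerpt above) of how the vocabulary could be
# written out to dict_path. The {'word2i': word2i} layout is an assumption about the
# on-disk format, not necessarily what the rest of the codebase expects.
import json

with open(dict_path, 'w') as f_dict:
    json.dump({'word2i': word2i}, f_dict)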
args = parser.parse_args()

all_question = []
word2occ = collections.defaultdict(int)
tknzr = TweetTokenizer(preserve_case=False)

# model = FastText()
# model.load_model("data/Embedding/wiki-simple.vec")
# print(model.nearest_neighbors('teacher'))
# exit()

print("Processing train/valid dataset...")
trainset = OracleDataset.load("data", "train")
validset = OracleDataset.load("data", "valid")

print("Collecting questions...")
for game in trainset.games:
    question = game.questions[0]
    tokens = tknzr.tokenize(question)
    all_question.append(tokens)

for game in validset.games:
    question = game.questions[0]
    tokens = tknzr.tokenize(question)
    all_question.append(tokens)
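# Hypothetical illustration of what the collected questions could feed into: the
# commented-out FastText lines above suggest embedding training, but the actual
# downstream step is not shown in this excerpt. gensim's Word2Vec (>= 4.0 API) is
# used here purely as an example, and the output path is an assumption.
from gensim.models import Word2Vec

embedding = Word2Vec(sentences=all_question, vector_size=100, window=5, min_count=1)
embedding.wv.save_word2vec_format("data/Embedding/questions.vec")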