Example #1
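A data-preparation fragment: it re-gzips the game data files, then loads each split with OracleDataset.load and writes a per-split image index file.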
        # Zip the game data files back up
        cmd.append("gzip " + big_gw_data)
        cmd.append("gzip " + gw_data)

        for c in cmd:
            print(c)
            os.system(c)
        print()
        print("----------------")

    print("Writing the image index files")
    for (n, s) in options:
        img_dict = {}  # reset per split so each index only lists its own images
        game_set = OracleDataset.load(args.data_dir, s)
        for game in game_set.games:
            img_dict[str(game.image.id) + ".jpg"] = 1
        with open(path + s + "_img_index.txt", "w") as f:
            for key in img_dict:
                print(key, file=f)
        print()

    print(img_dict)

    # path of the raw image data
    raw_dir = path + "img/raw"
Example #2
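Training-script setup: optional image and crop builders are created, OracleDataset.load brings in the train/valid/test splits, and the tokenizer, oracle network, and optimizer are built on top.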
    # Load image
    image_builder, crop_builder = None, None
    use_resnet = False
    if config['inputs'].get('image', False):
        logger.info('Loading images..')
        image_builder = get_img_builder(config['model']['image'], args.img_dir)
        use_resnet = image_builder.is_raw_image()

    if config['inputs'].get('crop', False):
        logger.info('Loading crops..')
        crop_builder = get_img_builder(config['model']['crop'], args.crop_dir, is_crop=True)
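        # Note: if both image and crop inputs are enabled, this overwrites the
        # use_resnet value set by the image builder above.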
        use_resnet = crop_builder.is_raw_image()

    # Load data
    logger.info('Loading data..')
    trainset = OracleDataset.load(args.data_dir, "train", image_builder, crop_builder)
    validset = OracleDataset.load(args.data_dir, "valid", image_builder, crop_builder)
    testset = OracleDataset.load(args.data_dir, "test", image_builder, crop_builder)

    # Load dictionary
    logger.info('Loading dictionary..')
    tokenizer = GWTokenizer(os.path.join(args.data_dir, args.dict_file))

    # Build Network
    logger.info('Building network..')
    network = OracleNetwork(config, num_words=tokenizer.no_words)

    # Build Optimizer
    logger.info('Building optimizer..')
    optimizer, outputs = create_optimizer(network, config, finetune=finetune)
Example #3
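Dictionary construction: starting from a table of special tokens, the script counts word occurrences over the tokenized training questions and keeps the words above the min_occ threshold.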
    word2i = {
        '<padding>': 0,
        '<start>': 1,
        '<stop>': 2,
        '<stop_dialogue>': 3,
        '<unk>': 4,
        '<yes>': 5,
        '<no>': 6,
        '<n/a>': 7,
    }

    word2occ = collections.defaultdict(int)

    tknzr = TweetTokenizer(preserve_case=False)

    print("Processing train dataset...")
    trainset = OracleDataset.load(args.data_dir, "train")
    for game in trainset.games:
        question = game.questions[0]
        tokens = tknzr.tokenize(question)
        for tok in tokens:
            word2occ[tok] += 1

    print("Filtering words...")
    for word, occ in word2occ.items():
        if occ >= args.min_occ and word.count('.') <= 1:
            word2i[word] = len(word2i)

    print("Number of words (occ >= 1): {}".format(len(word2occ)))
    print("Number of words (occ >= {}): {}".format(args.min_occ, len(word2i)))

    dict_path = os.path.join(args.data_dir, 'dict.json')
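The fragment is truncated after computing dict_path; below is a minimal sketch of the save step that presumably follows (plain json.dump and the 'word2i' key name are assumptions, not taken from the original):

    # Hypothetical continuation; assumes `import json` at the top of the script.
    with open(dict_path, 'w') as f:
        json.dump({'word2i': word2i}, f)  # the key name is an assumption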
Example #4
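Feature extraction: each split listed in args.set_type is loaded with OracleDataset.load, batched through a TensorFlow session, and run through a model restored from args.ckpt to produce per-image or per-crop features.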


# CPU/GPU option
cpu_pool = Pool(args.no_thread, maxtasksperchild=1000)
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_ratio)

with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True)) as sess:
    saver = tf.train.Saver()
    saver.restore(sess, args.ckpt)

    features = dict()
    for one_set in args.set_type:

        print("Load dataset -> set: {}".format(one_set))
        dataset = OracleDataset.load(args.data_dir, one_set, image_loader=image_loader, crop_loader=crop_loader)
        batchifier = OracleBatchifier(tokenizer=None, sources=[source])
        iterator = Iterator(dataset,
                            batch_size=args.batch_size,
                            pool=cpu_pool,
                            batchifier=batchifier)

        for batch in tqdm(iterator):
            feat = sess.run(end_points[feature_name], feed_dict={images: numpy.array(batch[source])})
            for f, game in zip(feat, batch["raw"]):
                f = f.squeeze()

                if args.mode == "crop":
                    id = game.object_id
                else:
                    id = game.picture.id
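                # Hypothetical continuation (the fragment is truncated here):
                # store the computed feature under its id.
                features[id] = f

    # After all sets are processed, persist the accumulated features.
    # (The pickle import and the file name are assumptions, not from the original.)
    with open(os.path.join(args.data_dir, "features.pkl"), "wb") as f_out:
        pickle.dump(features, f_out)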
Example #5
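Question collection: the train and valid splits are loaded with OracleDataset.load and each game's first question is tokenized with TweetTokenizer into all_question.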
    args = parser.parse_args()

    all_question = []
    word2occ = collections.defaultdict(int)

    tknzr = TweetTokenizer(preserve_case=False)

    # model = FastText()
    # model.load_model("data/Embedding/wiki-simple.vec")

    # print(model.nearest_neighbors('teacher'))
    # exit()
    print("Processing train/valid dataset...")

    trainset = OracleDataset.load("data", "train")
    validset = OracleDataset.load("data", "valid")

    print("Collecting question tokens ...")

    for game in trainset.games:
        question = game.questions[0]
        tokens = tknzr.tokenize(question)
        all_question.append(tokens)

    for game in validset.games:
        question = game.questions[0]
        tokens = tknzr.tokenize(question)
        all_question.append(tokens)
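The script cuts off here; word2occ is initialized above but never filled in the visible fragment. A minimal sketch of the counting step that presumably comes next, mirroring Example #3:

    # Hypothetical continuation: count token occurrences across both splits.
    for tokens in all_question:
        for tok in tokens:
            word2occ[tok] += 1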