コード例 #1
0
ファイル: dataset.py プロジェクト: zisang0210/flickr8k-im2txt
def prepare_train_data(config):
    """Prepare the data for training the model.

    Loads the saved vocabulary, the annotation CSV and the cached
    word-index/mask arrays named by ``config`` and wraps them in a
    ``DataSet``. Nothing is rebuilt here: all three files must already exist.

    Args:
        config: object providing ``vocabulary_size``, ``vocabulary_file``,
            ``temp_annotation_file``, ``temp_data_file`` and ``batch_size``.

    Returns:
        The ``DataSet`` constructed from the cached annotations.
    """
    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Processing the captions...")
    annotations = pd.read_csv(config.temp_annotation_file)
    captions = annotations['caption'].values
    image_ids = annotations['image_id'].values
    image_files = annotations['image_file'].values

    # The cache is read back with .item(), i.e. it is a dict stored in a 0-d
    # object array. NumPy >= 1.16.3 refuses to unpickle object arrays unless
    # allow_pickle=True is passed. Only load cache files this project wrote:
    # unpickling untrusted data can execute arbitrary code.
    data = np.load(config.temp_data_file, allow_pickle=True).item()
    word_idxs = data['word_idxs']
    masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size, word_idxs,
                      masks, True, True)
    print("Dataset built.")
    return dataset
コード例 #2
0
def prepare_train_data(config):
    """Prepare the VQA data for training the model.

    Reads the training questions/answers, builds (or loads) the vocabulary,
    converts questions and answers to index arrays (cached on disk) and wraps
    everything in a ``DataSet``.

    Args:
        config: object providing the train answer/question file paths,
            ``max_question_length``, ``train_image_dir``,
            ``temp_train_annotation_file``, ``vocabulary_file``,
            ``temp_train_data_file`` and ``batch_size``. Its
            ``vocabulary_size`` attribute is overwritten with the size of the
            vocabulary that was built or loaded.

    Returns:
        A ``(dataset, config)`` tuple.
    """
    vqa = VQA(config.train_answer_file, config.train_question_file)
    vqa.filter_by_ques_len(config.max_question_length)
    vqa.filter_by_ans_len(1)

    print("Reading the questions and answers...")
    annotations = process_vqa(vqa, 'COCO_train2014', config.train_image_dir,
                              config.temp_train_annotation_file)

    image_files = annotations['image_file'].values
    questions = annotations['question'].values
    question_ids = annotations['question_id'].values
    answers = annotations['answer'].values
    print("Questions and answers read.")
    print("Number of questions = %d" % (len(question_ids)))

    print("Building the vocabulary...")
    vocabulary = Vocabulary()
    if not os.path.exists(config.vocabulary_file):
        # First run: collect words from both questions and answers.
        for question in tqdm(questions):
            vocabulary.add_words(word_tokenize(question))
        for answer in tqdm(answers):
            vocabulary.add_words(word_tokenize(answer))
        vocabulary.compute_frequency()
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))
    config.vocabulary_size = vocabulary.size

    print("Processing the questions and answers...")
    if not os.path.exists(config.temp_train_data_file):
        question_word_idxs, question_lens = process_questions(
            questions, vocabulary, config)
        answer_idxs = process_answers(answers, vocabulary)
        data = {
            'question_word_idxs': question_word_idxs,
            'question_lens': question_lens,
            'answer_idxs': answer_idxs
        }
        np.save(config.temp_train_data_file, data)
    else:
        # np.save stored the dict as a pickled 0-d object array; NumPy >=
        # 1.16.3 needs allow_pickle=True to read it back. Only load cache
        # files written by this function (pickle is unsafe on untrusted data).
        data = np.load(config.temp_train_data_file, allow_pickle=True).item()
        question_word_idxs = data['question_word_idxs']
        question_lens = data['question_lens']
        answer_idxs = data['answer_idxs']
    print("Questions and answers processed.")

    print("Building the dataset...")
    dataset = DataSet(image_files, question_word_idxs, question_lens,
                      question_ids, config.batch_size, answer_idxs, True, True)
    print("Dataset built.")
    return dataset, config
コード例 #3
0
ファイル: text_process.py プロジェクト: audreycui/Texygen
def build_vocabulary(config, captions, oracle_file):
    """Build the vocabulary from ``captions`` and encode every caption.

    The vocabulary is always rebuilt and saved to ``config.vocabulary_file``.
    Captions are converted to fixed-length index sequences
    (``_START_`` + words + ``_END_`` + ``_PAD_`` padding) which are cached in
    ``config.temp_data_file``; if that cache exists it is loaded instead.

    Args:
        config: object providing ``vocabulary_size``, ``ctrl_symbols``,
            ``vocabulary_file``, ``temp_data_file``, ``max_caption_length``
            and the ``_START_`` / ``_END_`` / ``_PAD_`` indices.
        captions: sized iterable of caption strings.
        oracle_file: path to write the encoded captions to (one caption per
            line, indices separated by spaces), or ``None`` to skip writing.

    Returns:
        The built ``Vocabulary``.
    """
    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size, config.ctrl_symbols)
    # The original guarded this with `if True:` (the existence check was
    # commented out), so the load-from-disk branch was unreachable.
    vocabulary.build(captions)
    vocabulary.save(config.vocabulary_file)
    print("Number of words = %d" % (vocabulary.size))

    print("NUM CAPTIONS: " + str(len(captions)))
    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        sent_lens = []
        for caption in captions:
            current_word_idxs, current_length = vocabulary.process_sentence(
                caption)
            # Leave room for the start and end symbols.
            current_num_words = min(config.max_caption_length - 2,
                                    current_length)

            # Negative for over-long captions; list * negative gives [], so
            # truncated captions simply get no padding.
            pad_length = config.max_caption_length - current_length - 2
            current_word_idxs = ([config._START_]
                                 + current_word_idxs[:current_num_words]
                                 + [config._END_]
                                 + [config._PAD_] * pad_length)

            word_idxs.append(current_word_idxs)
            sent_lens.append(current_num_words + 2)
        word_idxs = np.array(word_idxs)
        data = {'word_idxs': word_idxs, 'sentence_len': sent_lens}
        np.save(config.temp_data_file, data)
    else:
        # The cache is a pickled dict; NumPy >= 1.16.3 requires
        # allow_pickle=True. Only load caches written by this function.
        data = np.load(config.temp_data_file, allow_pickle=True).item()
        word_idxs = data['word_idxs']
        sent_lens = data['sentence_len']

    if oracle_file is not None:
        with open(oracle_file, 'w') as outfile:
            # Write each caption exactly once. The previous version appended
            # to one growing buffer and re-wrote the whole buffer on every
            # iteration, duplicating all earlier lines.
            for line in word_idxs:
                outfile.write(''.join(str(word) + ' ' for word in line) + '\n')

    return vocabulary
コード例 #4
0
def main():
  """Convert each tokenized file named on stdin into a saved NumPy array.

  Reads file names from standard input (one per line), converts every
  non-empty line of each file with the selected ``line2arr`` function, and
  saves the result under ``args.output_dir`` with ``.tok`` replaced by
  ``.npy`` in the base name. Progress is printed every 100 files.
  """
  args = get_args()

  vocab = Vocabulary.load(args.vocab_prefix.strip())
  output_dir = path.realpath(args.output_dir.strip())
  if args.soseos:
    line2arr = partial(line2arr_with_soseos, vocab)
    print("sos-eos!!!")
  else:
    line2arr = partial(line2arr_no_soseos, vocab)

  for counter, line in enumerate(sys.stdin.readlines()):
    if counter % 100 == 0:
      print(counter)
      sys.stdout.flush()
    fname = line.strip()
    # Close the input file deterministically; the original leaked the handle.
    with open(fname, 'r') as in_f:
      stripped = [raw.strip() for raw in in_f]
    file_arr = [line2arr(text) for text in stripped if text != ""]
    np_arr = np.array(file_arr)
    try:
      new_fname = path.join(output_dir, path.split(fname)[1].replace(".tok", ".npy"))
      np.save(new_fname, np_arr)
    except Exception:
      # Best-effort: report the failing file and continue. `Exception`
      # (not a bare except) lets Ctrl-C / SystemExit stop the run.
      print("errored out on: {0}".format(fname))
コード例 #5
0
def get_huffman_tree(params):
    """Return a Huffman tree, loaded from disk or built from vocabulary counts.

    If ``params`` contains ``"huff_tree_loc"``, the tree is unpickled from
    that path and returned. Otherwise the tree is built with chainer's
    ``BinaryHierarchicalSoftmax.create_huffman_tree`` from: two fixed count
    estimates, the ``params["n_vocab"]`` largest vocabulary counts, and one
    bucket summing all remaining counts. In that case the count list is also
    stored in ``params["vocab_counts"]``.

    Args:
        params: dict with either ``"huff_tree_loc"`` (path to a pickled tree)
            or ``"n_vocab"`` (number of vocabulary entries to keep).

    Returns:
        The loaded or newly created Huffman tree.
    """
    if "huff_tree_loc" in params:
        # NOTE: pickle.load can execute arbitrary code; only point this at
        # tree files produced by a trusted run.
        with open(params["huff_tree_loc"], 'rb') as f:
            return pickle.load(f)

    vocab_size = params["n_vocab"]
    # Two fixed count estimates placed ahead of the real vocabulary
    # (presumably for the SOS/EOS symbols, judging by the name - confirm).
    soseos_counts_estim = [40114695] * 2
    vocab = Vocabulary.load(
        "/hdd/data/nlp/raw/unzipped/ff15_book/vocabs/final_vocab")
    # Only the counts matter for the tree, most frequent first.
    sorted_counts = sorted((item[1] for item in vocab.items()), reverse=True)
    cutoff_counts = sorted_counts[:vocab_size]
    # Everything past the cutoff is pooled into one out-of-vocabulary bucket.
    oov_counts = [sum(sorted_counts[vocab_size:])]
    all_counts = soseos_counts_estim + cutoff_counts + oov_counts
    params["vocab_counts"] = all_counts
    as_hash = dict(enumerate(all_counts))
    huff_tree = chainer.links.BinaryHierarchicalSoftmax.create_huffman_tree(
        as_hash)
    print("loaded huffman tree")
    return huff_tree
コード例 #6
0
ファイル: estimate_vocab.py プロジェクト: ryanai3/TooMuchData
def main2():
    """Merge the eight vocabulary shards and save the combined vocabulary.

    Loads ``./vocabs/v0`` .. ``./vocabs/v7``, merges them with
    ``Vocabulary.merge_vocabularies``, writes the result to
    ``./vocabs/final_vocab`` and then drops into the debugger.
    """
    vocabs = []
    for shard in range(8):
        vocabs.append(Vocabulary.load("./vocabs/v{0}".format(shard)))
    print("loaded vocabs!")

    merged = Vocabulary.merge_vocabularies(vocabs)
    merged.save("./vocabs/final_vocab")

    # Interactive inspection point left in by the author.
    import pdb
    pdb.set_trace()
コード例 #7
0
ファイル: estimate_vocab.py プロジェクト: ryanai3/TooMuchData
def main3():
    """Compute the offsets of the master vocabulary and pickle them.

    Loads ``./vocabs/final_vocab``, writes the result of ``get_offsets()`` to
    ``./vocabs/offsets.pkl``, then drops into the debugger.
    """
    offsets = Vocabulary.load("./vocabs/final_vocab").get_offsets()

    with open("./vocabs/offsets.pkl", 'wb') as sink:
        pickle.dump(offsets, sink)

    # Interactive inspection point left in by the author.
    import pdb
    pdb.set_trace()
    print(32)
コード例 #8
0
 def get_vocabulary(cls, vocab_path, captions_path, tokenized_captions, threshold=1):
     """Return the vocabulary stored at ``vocab_path``, building it if absent.

     When ``vocab_path`` exists it is loaded with ``Vocabulary.load``.
     Otherwise the vocabulary is built via ``cls.build_vocab`` from the given
     captions and threshold, saved to ``vocab_path``, and returned.
     """
     # Fast path: a previously saved vocabulary is reused as-is.
     if os.path.exists(vocab_path):
         return Vocabulary.load(vocab_path)

     built = cls.build_vocab(captions_path, tokenized_captions, threshold)
     #TODO: check if saving is safe
     Vocabulary.save(built, vocab_path)
     print("Saved the vocabulary to '%s'" %vocab_path)
     return built
コード例 #9
0
def prepare_train_data(config):
    """Prepare the vocabulary and caption annotations for training.

    Creates ``config.prepare_annotation_dir`` if needed, builds or loads the
    vocabulary, and builds or loads the annotation table
    (``image_id`` / ``image_file`` / ``caption``) stored in
    ``config.train_csv_file``.

    NOTE(review): as visible here the function ends after the annotations are
    prepared and returns nothing - presumably the snippet is an excerpt.
    """
    annotation_dir = config.prepare_annotation_dir
    if not os.path.exists(annotation_dir):
        os.mkdir(annotation_dir)
    coco = COCO(config, config.train_caption_file, config.val_caption_file)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if os.path.exists(config.vocabulary_file):
        vocabulary.load(config.vocabulary_file)
    else:
        # Captions are length-filtered only when the vocabulary is rebuilt.
        coco.filter_by_cap_len(config.max_caption_length)
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
        vocabulary.save_counts(config.word_count_file)
    print("Vocabulary built.")
    print("Number of words = %d" %(vocabulary.size))

    print("Processing the captions...")
    if os.path.exists(config.train_csv_file):
        annotations = pd.read_csv(config.train_csv_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values
    else:
        coco.filter_by_words(set(vocabulary.words))
        captions = []
        image_ids = []
        for ann_id in coco.anns:
            captions.append(coco.anns[ann_id]['caption'])
        for ann_id in coco.anns:
            image_ids.append(coco.anns[ann_id]['image_id'])

        image_files = []
        for image_id in image_ids:
            file_name = coco.imgs[image_id]['file_name']
            # The sub-directory is chosen from the split named in the file.
            if file_name.find('train2014') >= 0:
                split_dir = 'train'
            else:
                split_dir = 'val'
            image_files.append(
                os.path.join(config.dataset_image_dir, split_dir, file_name))

        annotations = pd.DataFrame({'image_id': image_ids,
                                    'image_file': image_files,
                                    'caption': captions})
        annotations.to_csv(config.train_csv_file)
コード例 #10
0
    def test(self):
        """Load the trained model and write generated hypotheses to ``self.target``.

        Loads the target vocabulary, model spec and weights from the
        ``model/`` directory (prefix ``self.model``), runs the image and word
        forward passes once, and writes each hypothesis (cut at the first
        ``'</s>'``) as one space-joined line to ``self.target``.
        """
        trace('loading model ...')
        self.trg_vocab = Vocabulary.load("model/" + self.model + '.trgvocab')
        # The original read the bare name `trg_batch`, which is not defined in
        # this method; the batch is referenced as self.trg_batch below.
        self.batch_size = len(self.trg_batch)
        encdec = EncoderDecoder.load_spec("model/" + self.model + '.spec')
        serializers.load_hdf5("model/" + self.model + '.weights', encdec)

        trace('generating translation ...')
        generated = 0

        with open(self.target, 'w') as fp:
            self.__forward_img()
            trace('sample %8d ...' % (generated + 1))
            hyp_batch = self.__forward_word(self.trg_batch, encdec, False, self.generation_limit)

            for hyp in hyp_batch:
                # Append a sentinel so index() always finds an end marker,
                # then keep only the tokens before the first '</s>'.
                hyp.append('</s>')
                hyp = hyp[:hyp.index('</s>')]
                print('hyp : ' +''.join(hyp))
                print(' '.join(hyp), file=fp)

        trace('finished.')
    def test(self):
        """Load the trained attention model and write hypotheses to ``self.target``.

        Loads the target vocabulary, model spec and weights (prefix
        ``self.model``), runs ``self.forward`` once, and writes each
        hypothesis (cut at the first ``'</s>'``) as one space-joined line.
        """
        trace('loading model ...')
        trg_vocab = Vocabulary.load(self.model + '.trgvocab')
        self.encdec = EncoderDecoderAttention.load_spec(self.model + '.spec')
        serializers.load_hdf5(self.model + '.weights', self.encdec)

        trace('generating translation ...')
        generated = 0

        trace('sample %8d - %8d ...' % (generated + 1, generated))
        hyp_batch = self.forward(trg_vocab, False, self.generation_limit)

        with open(self.target, 'w') as fp:
            for hyp in hyp_batch:
                # Append a sentinel so index() always finds an end marker,
                # then keep only the tokens before the first '</s>'.
                hyp.append('</s>')
                hyp = hyp[:hyp.index('</s>')]
                print('hyp : ' + ''.join(hyp))
                # One hypothesis per line; without the newline all outputs
                # were concatenated into a single unseparated line.
                fp.write(' '.join(hyp) + '\n')

        trace('finished.')
コード例 #12
0
    def test(self):
        """Load the trained attention model and write hypotheses to ``self.target``.

        Loads the target vocabulary, model spec and weights (prefix
        ``self.model``), runs ``self.forward`` once, and writes each
        hypothesis (cut at the first ``'</s>'``) as one space-joined line.
        """
        trace('loading model ...')
        trg_vocab = Vocabulary.load(self.model + '.trgvocab')
        self.encdec = EncoderDecoderAttention.load_spec(self.model + '.spec')
        serializers.load_hdf5(self.model + '.weights', self.encdec)

        trace('generating translation ...')
        generated = 0

        trace('sample %8d - %8d ...' % (generated + 1, generated))
        hyp_batch = self.forward(trg_vocab, False, self.generation_limit)

        with open(self.target, 'w') as fp:
            for hyp in hyp_batch:
                # Append a sentinel so index() always finds an end marker,
                # then keep only the tokens before the first '</s>'.
                hyp.append('</s>')
                hyp = hyp[:hyp.index('</s>')]
                print('hyp : ' + ''.join(hyp))
                # One hypothesis per line; without the newline all outputs
                # were concatenated into a single unseparated line.
                fp.write(' '.join(hyp) + '\n')

        trace('finished.')
コード例 #13
0
    def test(self):
        """Load the trained model and write generated hypotheses to ``self.target``.

        Loads the target vocabulary, model spec and weights from the
        ``model/`` directory (prefix ``self.model``), runs the image and word
        forward passes once, and writes each hypothesis (cut at the first
        ``'</s>'``) as one space-joined line to ``self.target``.
        """
        trace('loading model ...')
        self.trg_vocab = Vocabulary.load("model/" + self.model + '.trgvocab')
        # The original read the bare name `trg_batch`, which is not defined in
        # this method; the batch is referenced as self.trg_batch below.
        self.batch_size = len(self.trg_batch)
        encdec = EncoderDecoder.load_spec("model/" + self.model + '.spec')
        serializers.load_hdf5("model/" + self.model + '.weights', encdec)

        trace('generating translation ...')
        generated = 0

        with open(self.target, 'w') as fp:
            self.__forward_img()
            trace('sample %8d ...' % (generated + 1))
            hyp_batch = self.__forward_word(self.trg_batch, encdec, False,
                                            self.generation_limit)

            for hyp in hyp_batch:
                # Append a sentinel so index() always finds an end marker,
                # then keep only the tokens before the first '</s>'.
                hyp.append('</s>')
                hyp = hyp[:hyp.index('</s>')]
                print('hyp : ' + ''.join(hyp))
                print(' '.join(hyp), file=fp)

        trace('finished.')
コード例 #14
0
def prepare_train_data(config):
    """Prepare the data for training the model.

    Builds or loads the vocabulary, the annotation table and the padded
    word-index/mask arrays (each cached on disk), optionally restricts the
    training set to the first ``config.train_data_count_limit`` entries, and
    wraps the result in a ``DataSet``.

    Args:
        config: object providing the caption/image paths, cache file paths,
            ``vocabulary_size``, ``max_caption_length``, ``batch_size`` and
            ``train_data_count_limit`` (``<= 0`` means no restriction).

    Returns:
        The ``DataSet`` built from the (possibly restricted) training data.
    """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    coco.filter_by_words(set(vocabulary.words))

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'caption': captions
        })
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            # Zero-padded index row plus a mask marking the real words.
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    else:
        # The cache is a pickled dict; NumPy >= 1.16.3 requires
        # allow_pickle=True. Only load caches written by this function.
        data = np.load(config.temp_data_file, allow_pickle=True,
                       encoding='latin1').item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    if (config.train_data_count_limit > 0):
        print("-----------------------------------------------")
        print("Restricting Sz:\t", config.train_data_count_limit)
        print("Batch Sz:\t", config.batch_size)
        image_ids = image_ids[0:config.train_data_count_limit]
        image_files = image_files[0:config.train_data_count_limit]
        word_idxs = word_idxs[0:config.train_data_count_limit]
        masks = masks[0:config.train_data_count_limit]
        # Dump the image paths to a file. Iterate over the sliced list rather
        # than range(limit): the limit may exceed the number of available
        # images, which previously raised IndexError.
        filepath = 'train_images.csv'
        with open(filepath, 'w') as file_handler:
            for image_file in image_files:
                file_handler.write("{}\n".format(image_file))
        print("-----------------------------------------------")

    dataset = DataSet(image_ids, image_files, config.batch_size, word_idxs,
                      masks, True, True)
    print("Dataset built.")
    return dataset
コード例 #15
0
def main(argv):
    """Generate captions for the test set, save them as JSON and evaluate them.

    Restores the captioning model from ``config.caption_checkpoint_dir``,
    runs beam search over every test batch, optionally saves each image with
    its caption as a PNG, writes all results to ``config.test_result_file``
    and scores them with ``COCOEvalCap``.

    Args:
        argv: command-line arguments (unused in this function).
    """
    print("Testing the model ...")
    config = Config()
    config.beam_size = FLAGS.beam_size
    config.phase = 'test'

    if not os.path.exists(config.test_result_dir):
        os.mkdir(config.test_result_dir)
    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    test_data = DataProvider(config)
    test_gt_coco = test_data.returncoco()
    model = ShowAttendTell(config)
    model.build()

    with tf.Session() as sess:
        model.setup_graph_from_checkpoint(sess, config.caption_checkpoint_dir)
        tf.get_default_graph().finalize()

        captiongen = CaptionGenerator(model, vocabulary, config.beam_size,
                                      config.max_caption_length,
                                      config.batch_size)

        # Generate the captions for the images
        results = []
        idx = 0
        for k in tqdm(list(range(test_data.num_batches)), desc='batch'):
            batch, images = test_data.next_batch_and_images()
            caption_data = captiongen.beam_search(sess, images, vocabulary)

            # Only the final batch skips its last `fake_count` entries
            # (presumably filler items used to complete the batch - confirm).
            fake_cnt = 0 if k<test_data.num_batches-1 \
                         else test_data.fake_count
            for l in range(test_data.batch_size - fake_cnt):
                # Use the first beam-search result for each image.
                word_idxs = caption_data[l][0].sentence
                caption = vocabulary.get_sentence(word_idxs)
                results.append({
                    'image_id': test_data.image_ids[idx],
                    'caption': caption
                })
                idx += 1

                # Save the result in an image file, if requested
                if config.save_test_result_as_image:
                    image_file = batch[l]
                    image_name = image_file.split(os.sep)[-1]
                    image_name = os.path.splitext(image_name)[0]
                    img = plt.imread(image_file)
                    plt.switch_backend('agg')
                    plt.imshow(img)
                    plt.axis('off')
                    plt.title(caption)
                    plt.savefig(
                        os.path.join(config.test_result_dir,
                                     image_name + '_result.png'))

        # json.dump writes str, so the file must be opened in text mode
        # ('wb' raises TypeError on Python 3). The context manager also
        # guarantees the file is flushed and closed before loadRes reads it.
        with open(config.test_result_file, 'w') as fp:
            json.dump(results, fp)

        # Evaluate these captions
        test_result_coco = test_gt_coco.loadRes(config.test_result_file)
        scorer = COCOEvalCap(test_gt_coco, test_result_coco)
        scorer.evaluate()
    print("Evaluation complete.")
コード例 #16
0
def prepare_train_data(config):
    """Prepare the data for training the model.

    Builds or loads the annotation table, the vocabulary and the padded
    word-index/mask arrays (each cached on disk) and wraps them in a
    ``DataSet``.

    NOTE(review): ``coco`` is not defined inside this function; it must come
    from the enclosing module - confirm. Also note that ``vocabulary.size``
    and ``vocabulary.words`` are read before the vocabulary is built or
    loaded further down.
    """
    vocabulary = Vocabulary(config.vocabulary_size)

    print("Vocabulary complete.")
    print("Number of words = %d" % (vocabulary.size))

    print("Processing the captions...")
    if os.path.exists(config.temp_annotation_file):
        print("Loading the captions from ", config.temp_annotation_file)
        annotations = pd.read_csv(config.temp_annotation_file,
                                  encoding='latin-1')
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values
    else:
        coco.filter_by_words(set(vocabulary.words))
        print("Filtering the captions to those that exist")
        captions = []
        image_ids = []
        # Only the first 999 annotations are considered.
        for position, ann_id in enumerate(coco.anns, start=1):
            if position >= 1000:
                break
            entry = coco.anns[ann_id]
            # Keep an annotation only if its image is known and on disk.
            if entry['image_id'] not in coco.imgs:
                continue
            candidate = os.path.join(
                config.train_image_dir,
                coco.imgs[entry['image_id']]['file_name'])
            if not os.path.exists(candidate):
                continue
            captions.append(entry['caption'])
            image_ids.append(entry['image_id'])

        image_files = []
        for image_id in image_ids:
            image_files.append(
                os.path.join(config.train_image_dir,
                             coco.imgs[image_id]['file_name']))
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'caption': captions
        })
        annotations.to_csv(config.temp_annotation_file)

    if os.path.exists(config.vocabulary_file):
        print("Loading the vocabulary from ", config.vocabulary_file)
        vocabulary.load(config.vocabulary_file)
    else:
        print("Building the vocabulary...")
        vocabulary.build(captions)
        vocabulary.save(config.vocabulary_file)

    if os.path.exists(config.temp_data_file):
        loaded = np.load(config.temp_data_file, allow_pickle=True)
        data = loaded.item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    else:
        idx_rows = []
        mask_rows = []
        for caption in tqdm(captions):
            token_ids = vocabulary.process_sentence(caption)
            n_tokens = len(token_ids)
            # Zero-padded index row plus a mask marking the real words.
            padded = np.zeros(config.max_caption_length, dtype=np.int32)
            mask = np.zeros(config.max_caption_length)
            padded[:n_tokens] = np.array(token_ids)
            mask[:n_tokens] = 1.0
            idx_rows.append(padded)
            mask_rows.append(mask)
        word_idxs = np.array(idx_rows)
        masks = np.array(mask_rows)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size, word_idxs,
                      masks, True, True)
    print("Dataset built.")
    return dataset
コード例 #17
0
def prepare_train_data(config):
    """Prepare the data for training the model.

    Builds or loads the vocabulary, the annotation table and the padded
    word-index/mask arrays (each cached on disk) and wraps them in a
    ``DataSet``, which is returned.
    """
    coco = COCO(config.train_caption_file)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if os.path.exists(config.vocabulary_file):
        vocabulary.load(config.vocabulary_file)
    else:
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % vocabulary.size)

    coco.filter_by_words(set(vocabulary.words))

    print("Processing the captions...")
    if os.path.exists(config.temp_annotation_file):
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values
    else:
        captions = []
        for ann_id in coco.anns:
            captions.append(coco.anns[ann_id]['caption'])
        image_ids = []
        for ann_id in coco.anns:
            image_ids.append(coco.anns[ann_id]['image_id'])
        image_files = []
        for image_id in image_ids:
            image_files.append(
                os.path.join(config.train_image_dir,
                             coco.imgs[image_id]['file_name']))
        annotations = pd.DataFrame({'image_id': image_ids,
                                    'image_file': image_files,
                                    'caption': captions})
        annotations.to_csv(config.temp_annotation_file)

    if os.path.exists(config.temp_data_file):
        # The cache is a pickled dict, hence allow_pickle=True and .item().
        cached = np.load(config.temp_data_file, allow_pickle=True, encoding="latin1")
        data = cached.item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    else:
        idx_rows = []
        mask_rows = []
        for caption in tqdm(captions):
            token_ids = vocabulary.process_sentence(caption)
            n_tokens = len(token_ids)
            # Zero-padded index row plus a mask marking the real words.
            padded = np.zeros(config.max_caption_length, dtype=np.int32)
            mask = np.zeros(config.max_caption_length)
            padded[:n_tokens] = np.array(token_ids)
            mask[:n_tokens] = 1.0
            idx_rows.append(padded)
            mask_rows.append(mask)
        word_idxs = np.array(idx_rows)
        masks = np.array(mask_rows)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    print("Captions processed.")
    caption_count_line = "Number of captions = %d" % (len(captions))
    print(caption_count_line)

    # The caption count is reported a second time, as in the original output.
    print(caption_count_line)

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size,
                      word_idxs, masks, True, True)
    print("Dataset built.")
    return dataset
コード例 #18
0
def prepare_train_data(config):
    """Prepare the data for training the model.

    Builds or loads the vocabulary, the annotation table and the encoded
    captions (``_START_`` + words + ``_END_`` + ``_PAD_`` padding, each
    cached on disk) and wraps them in a ``DataSet``.

    Args:
        config: object providing the caption/ignore/image paths, cache file
            paths, ``vocabulary_size``, ``ctrl_symbols``,
            ``max_caption_length``, ``batch_size`` and the ``_START_`` /
            ``_END_`` / ``_PAD_`` indices.

    Returns:
        The constructed ``DataSet``. When the encoded captions come from the
        cache, ``masks`` is passed as ``None``.
    """
    coco = COCO(config.train_caption_file, config.ignore_file)

    vocabulary = Vocabulary(config.vocabulary_size, config.ctrl_symbols)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)

    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [os.path.join(config.train_image_dir,
                                    coco.imgs[image_id]['file_name'])
                       for image_id in image_ids]
        annotations = pd.DataFrame({'image_id': image_ids,
                                    'image_file': image_files,
                                    'caption': captions})
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)

        captions = []
        image_ids = []
        image_files = []

        # Rows are unpacked by position, so the CSV must have exactly four
        # columns. NOTE(review): a CSV written by the branch above would read
        # back as (row index, image_id, image_file, caption), in which case
        # the first value is the row index, not the image id - confirm the
        # expected file layout.
        for row_id, _second, feat, cap in annotations.values:
            image_ids.append(row_id)
            image_files.append(feat)
            captions.append(cap)

    print("NUM CAPTIONS: " + str(len(captions)))
    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        sent_lens = []
        for caption in tqdm(captions):
            current_word_idxs, current_length = vocabulary.process_sentence(caption)
            # Leave room for the start and end symbols.
            current_num_words = min(config.max_caption_length-2, current_length)

            current_word_idxs = [config._START_] + current_word_idxs[:current_num_words] + [config._END_]
            pad_length = config.max_caption_length - current_num_words -2
            if pad_length > 0:
                current_word_idxs += [config._PAD_] * (pad_length)
            # NOTE(review): the mask covers current_num_words positions while
            # the sequence holds current_num_words + 2 real tokens - confirm
            # this is intended.
            current_masks = np.zeros(config.max_caption_length)
            current_masks[:current_num_words] = 1.0

            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
            sent_lens.append(current_num_words+2)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks, 'sentence_len': sent_lens}
        np.save(config.temp_data_file, data)
    else:
        # The cache is a pickled dict; NumPy >= 1.16.3 requires
        # allow_pickle=True. Only load caches written by this function.
        data = np.load(config.temp_data_file, allow_pickle=True).item()
        word_idxs = data['word_idxs']
        # Masks are deliberately not restored from the cache.
        masks = None #data['masks']
        sent_lens = data['sentence_len']
    dataset = DataSet(coco,
                      vocabulary,
                      image_ids,
                      image_files,
                      config.batch_size,
                      word_idxs,
                      masks,
                      sent_lens,
                      True,
                      True)
    return dataset
コード例 #19
0
ファイル: dataset.py プロジェクト: noandrea/image_captioning
def prepare_train_data(config):
    """ Prepare the data for training the model.

    Loads the COCO caption annotations, builds (or loads) the vocabulary,
    converts every caption into a fixed-length row of word indices plus a
    matching mask, and wraps the result in a DataSet.

    Three intermediate artifacts are cached on disk and reused when they
    already exist: config.vocabulary_file, config.temp_annotation_file
    (CSV) and config.temp_data_file (NumPy-saved dict).

    Args:
        config: configuration object; this function reads
            train_caption_file, max_caption_length, vocabulary_size,
            vocabulary_file, temp_annotation_file, train_image_dir,
            temp_data_file and batch_size from it.

    Returns:
        The DataSet built from the image ids, image files, word indices
        and masks.
    """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    coco.filter_by_words(set(vocabulary.words))

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'caption': captions
        })
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            # Rows are zero-padded up to max_caption_length; the mask is 1.0
            # on the positions that hold real words and 0.0 on the padding.
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    else:
        # The cache is a dict, which np.save stores as a pickled object
        # array. NumPy >= 1.16.3 refuses to load those unless allow_pickle
        # is enabled explicitly.
        # NOTE: unpickling executes arbitrary code; this is only acceptable
        # because the file is a local cache written by this function. Do
        # not point temp_data_file at untrusted data.
        data = np.load(config.temp_data_file, encoding="latin1",
                       allow_pickle=True).item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    # NOTE(review): the two trailing True flags are positional DataSet
    # options whose meaning is not visible here -- confirm against DataSet.
    dataset = DataSet(image_ids, image_files, config.batch_size, word_idxs,
                      masks, True, True)
    print("Dataset built.")
    return dataset
コード例 #20
0
def prepare_train_data(config):
    """ Prepare the data for training the model.

    Loads the COCO caption annotations, builds (or loads) the vocabulary,
    converts every caption into a fixed-length row of word indices plus a
    matching mask, and wraps the result in a DataSet.

    When config.distributed is set, only the images actually present in
    config.train_image_dir are used and no temp annotation/data cache
    files are written.

    Args:
        config: configuration object; this function reads
            train_caption_file, max_caption_length, distributed,
            train_image_dir, vocabulary_size, vocabulary_file,
            temp_annotation_file, temp_data_file and batch_size from it.

    Returns:
        The DataSet built from the image ids, image files, word indices
        and masks.
    """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)

    ids = None
    if config.distributed:
        # Image ids are parsed from characters 15..26 of each file name.
        # NOTE(review): presumably COCO 2014-style names such as
        # "COCO_train2014_000000000009.jpg" -- confirm against the data;
        # any other entry in the directory makes int() raise ValueError.
        images = os.listdir(config.train_image_dir)
        ids = [int(x[15:27]) for x in images]
        # Written as a call so the module parses on Python 3 as well as
        # Python 2; the printed text is unchanged.
        print('Input Path: ' + config.train_image_dir +
              ' Number of files in input path: ' + str(len(ids)))

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    coco.filter_by_words(set(vocabulary.words))

    if config.distributed:
        print('Filter captions by images')
        coco.filter_by_images(ids)

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'caption': captions
        })
        if config.distributed:
            # `ids` only exists in distributed mode, so the restriction to
            # locally available images must be guarded. isin() is used
            # instead of .loc[ids] because local images whose captions were
            # all filtered out would make .loc raise KeyError.
            annotations = annotations[annotations['image_id'].isin(ids)]
            # Re-derive the arrays so they stay aligned with the filtered
            # annotations.
            captions = annotations['caption'].values
            image_ids = annotations['image_id'].values
            image_files = annotations['image_file'].values
        else:
            annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            # Rows are zero-padded up to max_caption_length; the mask is 1.0
            # on the positions that hold real words and 0.0 on the padding.
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        if not config.distributed:
            np.save(config.temp_data_file, data)
    else:
        # The cache is a dict, which np.save stores as a pickled object
        # array. NumPy >= 1.16.3 refuses to load those unless allow_pickle
        # is enabled explicitly.
        # NOTE: unpickling executes arbitrary code; this is only acceptable
        # because the file is a local cache written by this function. Do
        # not point temp_data_file at untrusted data.
        data = np.load(config.temp_data_file, allow_pickle=True).item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    # NOTE(review): the two trailing True flags are positional DataSet
    # options whose meaning is not visible here -- confirm against DataSet.
    dataset = DataSet(image_ids, image_files, config.batch_size, word_idxs,
                      masks, True, True)
    print("Dataset built.")
    return dataset