Code Example #1
    def get_datasets(args, device):
        lm_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        reward_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        images_feature_reader = ImageFeaturesH5Reader(args.features_path,
                                                      False)  # second arg: in_memory flag (assumed meaning)
        question_tokenizer = VQATokenizer(lm_tokenizer=lm_tokenizer)

        if args.min_data:
            vocab_path = os.path.join(args.data_path, 'cache/vocab_min.json')
            train_split = "mintrain"
            val_split = "mintrain" if device.type == "cpu" else "minval"
        else:
            vocab_path = os.path.join(args.data_path, 'cache/vocab.json')
            train_split = "mintrain" if device.type == "cpu" else "train"
            val_split = "mintrain" if device.type == "cpu" else "val"

        train_dataset = VQADataset(split=train_split,
                                   dataroot=args.data_path,
                                   question_tokenizer=question_tokenizer,
                                   image_features_reader=images_feature_reader,
                                   reward_tokenizer=reward_tokenizer,
                                   clean_datasets=True,
                                   max_seq_length=23,
                                   num_images=args.max_samples,
                                   vocab_path=vocab_path,
                                   filter_entries=True,
                                   rl=False)
        val_dataset = VQADataset(split=val_split,
                                 dataroot=args.data_path,
                                 question_tokenizer=question_tokenizer,
                                 image_features_reader=images_feature_reader,
                                 reward_tokenizer=reward_tokenizer,
                                 clean_datasets=True,
                                 max_seq_length=23,
                                 num_images=args.max_samples,
                                 vocab_path=vocab_path,
                                 filter_entries=True,
                                 rl=False)
        test_dataset = val_dataset

        return train_dataset, val_dataset, test_dataset
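
For orientation, a minimal, hedged usage sketch of the helper above. The Namespace fields and data paths are assumptions inferred from the snippet, and VQADataset, VQATokenizer and ImageFeaturesH5Reader are project-local classes, not library APIs; get_datasets is assumed to be in scope.

    # Hedged usage sketch: the Namespace mirrors the attributes the helper
    # reads (data_path, features_path, min_data, max_samples).
    from argparse import Namespace

    import torch
    from torch.utils.data import DataLoader

    args = Namespace(data_path="data/vqa-v2",                        # assumed layout
                     features_path="data/vqa-v2/coco_trainval.lmdb", # assumed path
                     min_data=1,
                     max_samples=None)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_dataset, val_dataset, test_dataset = get_datasets(args, device)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)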
Code Example #2
class VQAEnv(GenericEnv):
    """VQA Env"""
    metadata = {'render.modes': ['human']}

    def __init__(self, data_path, features_h5path, max_len=10,
                 reward_type="levenshtein",
                 debug=None,
                 reward_path=None, mode="train", diff_reward=False,
                 condition_answer=True, reward_vocab=None, mask_answers=False, max_seq_length=23,
                 num_answers=1, device=torch.device("cuda" if torch.cuda.is_available() else "cpu"), min_data=0,
                 reduced_answers=False, answer_sampl="uniform", params=None, filter_numbers=False):
        super(VQAEnv, self).__init__(data_path, max_len, reward_type=reward_type,
                                     reward_path=reward_path, debug=debug, mode=mode, diff_reward=diff_reward,
                                     condition_answer=condition_answer, reward_vocab=reward_vocab,
                                     mask_answers=mask_answers, device=device, reduced_answers=reduced_answers,
                                     params=params, filter_numbers=filter_numbers)

        # Loading VQA Dataset.
        num_images = int(self.debug[1]) if self.debug is not None else None
        if self.mode == "test_images":
            num_images = None
        lm_tokenizer = GPT2Tokenizer.from_pretrained("cache/gpt-2")
        question_tokenizer = VQATokenizer(lm_tokenizer=lm_tokenizer)
        reward_tokenizer = BertTokenizer.from_pretrained("cache/bert")
        images_feature_reader = ImageFeaturesH5Reader(features_h5path, False)
        modes = self.get_modes(device=device, min_data=min_data)
        self.min_data = min_data
        vocab_path = os.path.join(data_path, "cache",
                                  "vocab_min.json" if min_data else "vocab.json")
        num_answers_ = None if answer_sampl == "img_sampling" else num_answers
        self.dataset = VQADataset(split=modes[self.mode], dataroot=data_path,
                                  image_features_reader=images_feature_reader, question_tokenizer=question_tokenizer,
                                  reward_tokenizer=reward_tokenizer, clean_datasets=True,
                                  max_seq_length=max_seq_length,
                                  num_answers=num_answers_, num_images=num_images, filter_entries=True,
                                  vocab_path=vocab_path, filter_numbers=filter_numbers)
        duplicate_entries = (answer_sampl == "img_sampling")
        self.dataset.split_entries(duplicate_entries)
        self.set_special_tokens()
        self.set_reward_function(reward_type=reward_type, reward_path=reward_path, reward_vocab=reward_vocab,
                                 diff_reward=diff_reward)
        self.answer_sampling = answer_sampl
        self.inv_freq_answers = self.dataset.get_answers_frequency()

    def update_mode(self, mode, answer_sampl):
        self.mode = mode
        self.answer_sampling = answer_sampl

    def get_modes(self, device, min_data):
        if min_data or device.type == "cpu":
            modes = {"train": "mintrain", "test_images": "minval", "test_text": "mintrain"}
        else:
            modes = {"train": "train", "test_images": "val", "test_text": "train"}
        return modes

    def get_env_idx(self, i_episode, entries):
        if i_episode is not None and i_episode < len(entries):
            env_idx = i_episode
        else:
            env_idx = np.random.randint(0, len(entries))
        return env_idx

    def decode_inv_frequency(self):
        inv_freq_answers_decoded = {self.dataset.answer_tokenizer.decode([k]): round(v, 4) for k, v in
                                    self.inv_freq_answers.items()}
        return inv_freq_answers_decoded

    def sample_answer_from_inv_freq_distrib(self):
        tensor_distrib = torch.tensor(list(self.inv_freq_answers.values()))
        ind_sampled = torch.multinomial(tensor_distrib, num_samples=1)
        prob_sampled = tensor_distrib[ind_sampled].item()
        possible_answers = [k for k, v in self.inv_freq_answers.items() if round(v, 4) == round(prob_sampled, 4)]
        return random.choice(possible_answers)

    def sample_entry_from_answer(self, answer):
        entries_answer = {i: entry for i, entry in enumerate(self.dataset.filtered_entries) if
                          entry["answer"]["labels"] == answer}
        env_idx = random.choice(list(entries_answer.keys()))
        entry = entries_answer[env_idx]
        return entry, env_idx

    def sample_entry(self, entries, i_episode=None):
        if self.answer_sampling == "random":
            env_idx = self.get_env_idx(i_episode, entries)
            entry = entries[env_idx]
        elif self.answer_sampling == "uniform":
            answer = random.choice(self.dataset.reduced_answers.cpu().numpy())
            entry, env_idx = self.sample_entry_from_answer(answer)
        elif self.answer_sampling == "inv_frequency":
            answer = self.sample_answer_from_inv_freq_distrib()
            entry, env_idx = self.sample_entry_from_answer(answer)
        elif self.answer_sampling == "img_sampling":
            img_idx = random.choice(self.dataset.images_idx)
            answer = random.choice(self.dataset.answer_img_map[img_idx])
            entries = [(idx, ent) for idx, ent in enumerate(self.dataset.filtered_entries) if ent["image_id"] == img_idx and answer in ent["answer"]["labels"]]
            env_idx, entry = random.choice(entries)
        return entry, env_idx

    def reset(self, seed=None, i_episode=None):
        if seed is not None:
            np.random.seed(seed)
        entries = self.dataset.test_entries if self.mode == "test_text" else self.dataset.filtered_entries
        self.entry, self.env_idx = self.sample_entry(entries, i_episode)
        (features, image_mask, spatials) = self.dataset.get_img_data(self.entry)
        labels, _ = self.dataset.get_answer_data(self.entry)
        self.ref_question_idx = self.entry["question_id"]
        self.ref_question = self.entry["q_token"][:self.max_len]
        self.ref_questions = self.ref_question.view(1, -1)
        self.ref_question_decoded = self.dataset.question_tokenizer.decode(self.entry["q_token"][:self.max_len].numpy())
        self.ref_questions_decoded = [self.ref_question_decoded]
        self.ref_answer = labels
        self.img_idx = self.entry["image_id"]
        self.img_feats = features
        self.img = (features, image_mask, spatials)

        # initializing the state.
        state_question = [self.special_tokens.SOS_idx]
        self.state = self.State(torch.LongTensor(state_question).view(1, len(state_question)),
                                self.img_feats.unsqueeze(0), self.ref_answer)
        self.step_idx = 0
        self.dialog = None

        return self.state
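
A short, hedged sketch of driving the environment above; the constructor paths are assumptions, and only reset() is exercised since the rest of the step API lives in GenericEnv, which is not shown here.

    # Hedged sketch: instantiate the env and draw an initial state.
    env = VQAEnv(data_path="data/vqa-v2",                           # assumed path
                 features_h5path="data/vqa-v2/coco_trainval.lmdb",  # assumed path
                 max_len=10,
                 mode="train",
                 answer_sampl="uniform")
    state = env.reset(seed=0)
    # reset() returns the State namedtuple built above: state.text starts
    # out as a (1, 1) LongTensor holding only the SOS token.
    print(state.text.shape)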
Code Example #3
                        help='Number of answers to keep')
    parser.add_argument(
        "-ans_preprocess",
        type=lambda x: bool(strtobool(x)),
        default="False",
        help='preprocess answers (higher accuracy but slow start)')
    parser.add_argument("-merge_val",
                        type=lambda x: bool(strtobool(x)),
                        default="False",
                        help='Fuse train/val dataset')

    args = parser.parse_args()

    print("Loading dataset...")
    train_dataset = VQADataset(args.data_dir,
                               args.year,
                               "train",
                               preprocess_answers=args.ans_preprocess)
    answer_counters = train_dataset.answer_counter.most_common()
    games = train_dataset.games

    if args.merge_val:
        valid_dataset = VQADataset(args.data_dir,
                                   args.year,
                                   "val",
                                   preprocess_answers=args.ans_preprocess)
        answer_counters += valid_dataset.answer_counter.most_common()
        games += valid_dataset.games

    word2i = {'<unk>': 0, '<start>': 1, '<stop>': 2, '<padding>': 3}

    answer2i = {'<unk>': 0}
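
The type=lambda x: bool(strtobool(x)) idiom above is the usual workaround for argparse's handling of booleans (a plain type=bool would treat any non-empty string, including "False", as True). A self-contained sketch of the pattern:

    # Self-contained sketch of the boolean-flag pattern used above.
    # argparse applies `type` to string defaults too, so default="False"
    # is converted the same way as a command-line value.
    import argparse
    from distutils.util import strtobool  # matches the snippet; removed in Python 3.12

    parser = argparse.ArgumentParser()
    parser.add_argument("-merge_val",
                        type=lambda x: bool(strtobool(x)),
                        default="False")
    print(parser.parse_args(["-merge_val", "true"]).merge_val)  # True
    print(parser.parse_args([]).merge_val)                      # False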
Code Example #4
                        default='../../data/vqa-v2/coco_trainval.lmdb')
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    lm_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    reward_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    images_feature_reader = ImageFeaturesH5Reader(args.features_path, False)
    question_tokenizer = VQATokenizer(lm_tokenizer=lm_tokenizer)

    train_dataset = VQADataset(split="mintrain",
                               dataroot=args.data_path,
                               question_tokenizer=question_tokenizer,
                               image_features_reader=images_feature_reader,
                               reward_tokenizer=reward_tokenizer,
                               clean_datasets=True,
                               max_seq_length=23,
                               num_images=None,
                               vocab_path=os.path.join(args.data_path,
                                                       'cache/vocab.json'),
                               filter_entries=True,
                               rl=False)
    sf_ids = [0, 1, 2, 3, 4, 5, 7]  # NLTK smoothing-function ids; method0 is no smoothing.
    print(
        "------------------------------------------------------------------------------------------------"
    )
    print("simple test")
    for sf_id in sf_ids:
        bleu_fn = Bleu(sf_id=sf_id)
        print("<-------------------------------------------------->")
        print("BLEU scores for smoothing function: {}".format(sf_id))
        simple_test(bleu_fn)
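
The sf_id values map onto NLTK's SmoothingFunction methods (method0 through method7, with method0 meaning no smoothing). The Bleu wrapper itself is project-local, so the sketch below only shows the underlying NLTK call it presumably delegates to:

    # Sketch of the NLTK call a Bleu(sf_id=...) wrapper like this likely makes.
    from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

    sf = SmoothingFunction()
    reference = "what color is the cat".split()
    hypothesis = "what color is the dog".split()
    for sf_id in [0, 1, 2, 3, 4, 5, 7]:
        smooth = getattr(sf, "method{}".format(sf_id))
        score = sentence_bleu([reference], hypothesis, smoothing_function=smooth)
        print(sf_id, round(score, 4))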
Code Example #5
    State = namedtuple('State', ('text', 'img', "answer"))
    features_h5path = "../../../data/vqa-v2/coco_trainval.lmdb"
    data_path = "../../../data/vqa-v2"
    lm_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    question_tokenizer = VQATokenizer(lm_tokenizer=lm_tokenizer)
    reward_tokenizer = BertTokenizer.from_pretrained('bert-base-cased',
                                                     do_lower_case=True)

    images_feature_reader = ImageFeaturesH5Reader(features_h5path, False)
    vocab_path = os.path.join(data_path, "cache", "vocab_min.json")
    dataset = VQADataset(split="mintrain",
                         dataroot=data_path,
                         image_features_reader=images_feature_reader,
                         question_tokenizer=question_tokenizer,
                         reward_tokenizer=reward_tokenizer,
                         clean_datasets=True,
                         max_seq_length=23,
                         min_len_questions=0,
                         num_answers=1,
                         num_images=None,
                         filter_entries=True,
                         vocab_path=vocab_path)

    language_score = LanguageScore(dataset=dataset)

    def get_state_actions(test_sentence):
        # Prepend the SOS token (id 1) and drop the final id: each prefix of
        # state_encoded is a decoder state, and sequence_actions[i] is the
        # token that follows the prefix ending at position i.
        state_encoded = dataset.question_tokenizer.encode(test_sentence)
        state_encoded = [1] + state_encoded
        state_encoded = state_encoded[:-1]
        sequence_actions = state_encoded[1:]
        return state_encoded, sequence_actions
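
To make the prefix/action shift concrete, a pure-Python trace on a toy encoding (id 1 plays the SOS role, as in the function):

    # Trace of get_state_actions on a toy encoding [5, 8, 2]:
    #   state_encoded: [1, 5, 8, 2] -> [1, 5, 8]  (SOS prepended, last id dropped)
    #   sequence_actions: [5, 8]                  (the token following each prefix)
    encoded = [5, 8, 2]
    state_encoded = ([1] + encoded)[:-1]
    sequence_actions = state_encoded[1:]
    assert state_encoded == [1, 5, 8]
    assert sequence_actions == [5, 8]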
Code Example #6
    parser.add_argument("-test", type=int, default=1)
    args = parser.parse_args()

    lm_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    reward_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    features_h5path = args.features_path
    images_feature_reader = ImageFeaturesH5Reader(features_h5path, False)
    question_tokenizer = VQATokenizer(lm_tokenizer=lm_tokenizer)

    if args.vocab_path == "none":
        split = "trainval" if not args.min_split else "mintrainval"
        vqa_dataset = VQADataset(split=split,
                                 dataroot=args.data_path,
                                 vocab_path=args.vocab_path,
                                 question_tokenizer=question_tokenizer,
                                 image_features_reader=images_feature_reader,
                                 reward_tokenizer=reward_tokenizer,
                                 clean_datasets=True,
                                 max_seq_length=23,
                                 num_images=None)

    else:
        split = args.split if not args.min_split else "min" + args.split
        print("Building {} dataset from {}...".format(split, args.vocab_path))
        vqa_dataset = VQADataset(split=split,
                                 dataroot=args.data_path,
                                 question_tokenizer=question_tokenizer,
                                 image_features_reader=images_feature_reader,
                                 reward_tokenizer=reward_tokenizer,
                                 clean_datasets=True,
                                 max_seq_length=23,
                                 num_images=None,
                                 vocab_path=args.vocab_path)  # assumed: closes the call as in the branch above
Code Example #7
File: launch_train.py, Project: AMDonati/RL-NLP
    def get_datasets(args, device):
        if args.dataset == "clevr":
            if args.dataset_ext == 0:
                train_questions_path = os.path.join(args.data_path,
                                                    "train_questions.h5")
                val_questions_path = os.path.join(args.data_path,
                                                  "val_questions.h5")
                test_questions_path = os.path.join(args.data_path,
                                                   "test_questions.h5")
                train_feats_path = os.path.join(args.data_path,
                                                'train_features.h5')
                val_feats_path = os.path.join(args.data_path,
                                              'val_features.h5')
                vocab_path = os.path.join(args.data_path, "vocab.json")

                if args.task == "lm":
                    train_dataset = QuestionsDataset(
                        h5_questions_path=train_questions_path,
                        vocab_path=vocab_path,
                        range_samples=args.range_samples)
                    val_dataset = QuestionsDataset(
                        h5_questions_path=val_questions_path,
                        vocab_path=vocab_path)
                    test_dataset = QuestionsDataset(
                        h5_questions_path=test_questions_path,
                        vocab_path=vocab_path)
                elif args.task == "policy":
                    train_dataset = CLEVR_Dataset(
                        h5_questions_path=train_questions_path,
                        h5_feats_path=train_feats_path,
                        vocab_path=vocab_path,
                        max_samples=args.max_samples)
                    val_dataset = CLEVR_Dataset(
                        h5_questions_path=val_questions_path,
                        h5_feats_path=val_feats_path,
                        vocab_path=vocab_path,
                        max_samples=args.max_samples)
                    test_dataset = val_dataset

            else:
                vocab_path = os.path.join(args.data_path, "vocab.json")
                data_path = os.path.join(args.data_path, "clevr_ext")
                full_dataset = QuestionsDataset(
                    h5_questions_path=data_path,
                    vocab_path=vocab_path,
                    range_samples=args.range_samples,
                    dataset_ext=1)
                train_size = int(0.9 * len(full_dataset))
                test_size = len(full_dataset) - train_size
                train_dataset, test_dataset = torch.utils.data.random_split(
                    full_dataset, [train_size, test_size])
                train_dataset = copy_attributes(train_dataset,
                                                train_dataset.dataset)
                test_dataset = copy_attributes(test_dataset,
                                               test_dataset.dataset)
                val_dataset = copy.deepcopy(test_dataset)

        elif args.dataset == "vqa":
            lm_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
            reward_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
            images_feature_reader = ImageFeaturesH5Reader(
                args.features_path, False)
            question_tokenizer = VQATokenizer(lm_tokenizer=lm_tokenizer)

            if args.min_data:
                vocab_path = os.path.join(args.data_path,
                                          'cache/vocab_min.json')
                train_split = "mintrain"
                val_split = "mintrain" if device.type == "cpu" else "minval"
            else:
                vocab_path = os.path.join(args.data_path, 'cache/vocab.json')
                train_split = "mintrain" if device.type == "cpu" else "train"
                val_split = "mintrain" if device.type == "cpu" else "val"

            train_dataset = VQADataset(
                split=train_split,
                dataroot=args.data_path,
                question_tokenizer=question_tokenizer,
                image_features_reader=images_feature_reader,
                reward_tokenizer=reward_tokenizer,
                clean_datasets=True,
                max_seq_length=23,
                num_images=None,
                vocab_path=vocab_path,
                filter_entries=True,
                rl=False)
            val_dataset = VQADataset(
                split=val_split,
                dataroot=args.data_path,
                question_tokenizer=question_tokenizer,
                image_features_reader=images_feature_reader,
                reward_tokenizer=reward_tokenizer,
                clean_datasets=True,
                max_seq_length=23,
                num_images=None,
                vocab_path=vocab_path,
                filter_entries=True,
                rl=False,
                filter_numbers=args.filter_numbers)
            test_dataset = val_dataset

        return train_dataset, val_dataset, test_dataset
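
torch.utils.data.random_split returns Subset wrappers that do not expose the wrapped dataset's attributes, which is why the snippet copies them back. The real copy_attributes is project-local; a plausible stand-in, offered only as an assumption:

    # Hypothetical stand-in for the project-local copy_attributes helper:
    # expose the wrapped dataset's attributes on the Subset so downstream
    # code written against the full dataset keeps working.
    def copy_attributes(subset, dataset):
        for name, value in vars(dataset).items():
            if not hasattr(subset, name):
                setattr(subset, name, value)
        return subset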
Code Example #8
                        type=str,
                        default="glove.42B.300d.zip",
                        help="Name of the stanford glove file")
    parser.add_argument("-glove_out",
                        type=str,
                        default="glove_dict.pkl",
                        help="Name of the output glove file")
    parser.add_argument("-year",
                        type=int,
                        default=2014,
                        help="VQA dataset year (2014/2017)")

    args = parser.parse_args()

    print("Loading dataset...")
    trainset = VQADataset(args.data_dir, year=args.year, which_set="train")
    validset = VQADataset(args.data_dir, year=args.year, which_set="val")
    testdevset = VQADataset(args.data_dir,
                            year=args.year,
                            which_set="test-dev")
    testset = VQADataset(args.data_dir, year=args.year, which_set="test")

    tokenizer = TweetTokenizer(preserve_case=False)

    print("Loading glove...")
    with io.open(args.glove_in, 'r', encoding="utf-8") as f:
        vectors = {}
        for line in f:
            vals = line.rstrip().split(' ')
            vectors[vals[0]] = [float(x) for x in vals[1:]]
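
The loop above parses the Stanford GloVe text format into a {word: vector} dict (300 floats per word for glove.42B.300d). The snippet is cut off, but the -glove_out flag suggests a pickle dump along these lines; treat it as an assumption:

    # Assumed continuation: persist the dict under the -glove_out name.
    import pickle

    with open(args.glove_out, "wb") as f:
        pickle.dump(vectors, f)
    # Lookup example: vectors["the"] is a 300-element list of floats.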