def __init__(self, dir_data, split='train', batch_size=4,
             shuffle=False, pin_memory=False, nb_threads=4,
             *args, **kwargs):
    self.dir_data = dir_data
    self.split = split
    self.batch_size = batch_size
    self.shuffle = shuffle
    self.pin_memory = pin_memory
    self.nb_threads = nb_threads
    self.sampler = None
    self.collate_fn = btf.Compose([
        btf.ListDictsToDictLists(),
        btf.StackTensors()
    ])
    self.nb_items = kwargs['nb_items']
    self.data = torch.randn(self.nb_items, 10)
    # synthetic binary labels: the first half of the items are positives
    self.target = torch.zeros(self.nb_items)
    self.target[:self.nb_items // 2].fill_(1)
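# A quick check of the synthetic labels built above (a sketch; nb_items is
# whatever was passed through **kwargs):
#
# >>> import torch
# >>> nb_items = 6
# >>> target = torch.zeros(nb_items)
# >>> target[:nb_items // 2].fill_(1)
# >>> target
# tensor([1., 1., 1., 0., 0., 0.])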
def __init__(self, dir_data='data/mnist', split='train', batch_size=100,
             nb_threads=1, pin_memory=True):
    self.dir_data = dir_data
    self.split = split
    self.batch_size = batch_size
    self.nb_threads = nb_threads
    self.pin_memory = pin_memory
    if self.split == 'train':
        is_train = True
        self.shuffle = True
    elif self.split == 'val':
        is_train = False
        self.shuffle = False
    else:
        raise ValueError(f'Unknown split: {self.split}')
    self.item_tf = transforms.Compose([
        transforms.ToTensor(),
        # canonical MNIST mean and std
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    download = (not os.path.isdir(self.dir_data))
    self.dataset = datasets.MNIST(self.dir_data,
                                  train=is_train,
                                  download=download,
                                  transform=self.item_tf)
    # the usual collate function for bootstrap,
    # to handle the (potentially nested) dict item format below
    self.collate_fn = bootstrap_tf.Compose([
        bootstrap_tf.ListDictsToDictLists(),
        bootstrap_tf.StackTensors()
    ])
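# The "dict item format below" refers to a __getitem__ that wraps each MNIST
# (image, class) pair into a dict. A minimal sketch of what such a method
# could look like (hypothetical, not the actual implementation):
#
# def __getitem__(self, index):
#     data, class_id = self.dataset[index]
#     return {
#         'index': index,
#         'data': data,  # 1x28x28 normalized tensor
#         'class_id': torch.tensor(class_id),
#     }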
def stack():
    return transforms.Compose([
        transforms.ListDictsToDictLists(),
        transforms.StackTensors()
    ])
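# A minimal usage sketch, assuming the usual bootstrap.pytorch transform
# semantics (ListDictsToDictLists turns a list of dict items into a dict of
# lists; StackTensors then stacks each list of same-shaped tensors):
#
# >>> import torch
# >>> items = [{'data': torch.randn(10), 'class_id': torch.tensor(0)},
# ...          {'data': torch.randn(10), 'class_id': torch.tensor(1)}]
# >>> batch = stack()(items)
# >>> batch['data'].shape, batch['class_id'].shape
# (torch.Size([2, 10]), torch.Size([2]))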
def __init__(self, dir_data='/local/cadene/data/vqa', split='train',
             batch_size=80, nb_threads=4, pin_memory=False, shuffle=False,
             nans=1000, minwcount=10, nlp='mcb', proc_split='train',
             samplingans=False, has_valset=True, has_testset=True,
             has_testset_anno=False, has_testdevset=True,
             has_answers_occurence=True, do_tokenize_answers=False):
    super(AbstractVQA, self).__init__(
        dir_data=dir_data,
        split=split,
        batch_size=batch_size,
        nb_threads=nb_threads,
        pin_memory=pin_memory,
        shuffle=shuffle)
    self.nans = nans
    self.minwcount = minwcount
    self.nlp = nlp
    self.proc_split = proc_split
    self.samplingans = samplingans
    # preprocessing options
    self.has_valset = has_valset
    self.has_testset = has_testset
    self.has_testset_anno = has_testset_anno
    self.has_testdevset = has_testdevset
    self.has_answers_occurence = has_answers_occurence
    self.do_tokenize_answers = do_tokenize_answers
    # sanity check: sampling answers by occurrence only makes sense at train time
    if self.split in ['test', 'val'] and self.samplingans:
        raise ValueError('samplingans=True is only valid for the train split')
    self.dir_raw = os.path.join(self.dir_data, 'raw')
    if not os.path.exists(self.dir_raw):
        self.download()
    self.dir_processed = os.path.join(self.dir_data, 'processed')
    self.subdir_processed = self.get_subdir_processed()
    self.path_wid_to_word = osp.join(self.subdir_processed, 'wid_to_word.pth')
    self.path_word_to_wid = osp.join(self.subdir_processed, 'word_to_wid.pth')
    self.path_aid_to_ans = osp.join(self.subdir_processed, 'aid_to_ans.pth')
    self.path_ans_to_aid = osp.join(self.subdir_processed, 'ans_to_aid.pth')
    self.path_trainset = osp.join(self.subdir_processed, 'trainset.pth')
    self.path_valset = osp.join(self.subdir_processed, 'valset.pth')
    self.path_is_qid_testdev = osp.join(self.subdir_processed, 'is_qid_testdev.pth')
    self.path_testset = osp.join(self.subdir_processed, 'testset.pth')
    if not os.path.exists(self.subdir_processed):
        self.process()
    self.wid_to_word = torch.load(self.path_wid_to_word)
    self.word_to_wid = torch.load(self.path_word_to_wid)
    self.aid_to_ans = torch.load(self.path_aid_to_ans)
    self.ans_to_aid = torch.load(self.path_ans_to_aid)
    if 'train' in self.split:
        self.dataset = torch.load(self.path_trainset)
    elif self.split == 'val':
        if self.proc_split == 'train':
            self.dataset = torch.load(self.path_valset)
        elif self.proc_split == 'trainval':
            self.dataset = torch.load(self.path_trainset)
    elif self.split == 'test':
        self.dataset = torch.load(self.path_testset)
        if self.has_testdevset:
            self.is_qid_testdev = torch.load(self.path_is_qid_testdev)
    self.collate_fn = bootstrap_tf.Compose([
        bootstrap_tf.ListDictsToDictLists(),
        bootstrap_tf.PadTensors(use_keys=[
            'question', 'pooled_feat', 'cls_scores',
            'rois', 'cls', 'cls_oh', 'norm_rois'
        ]),
        # bootstrap_tf.SortByKey(key='lengths'),  # not needed in the current implementation
        bootstrap_tf.StackTensors()
    ])
    if self.proc_split == 'trainval' and self.split in ['train', 'val']:
        self.bootstrapping()
    self.qid_to_idx = {
        item['question_id']: idx
        for idx, item in enumerate(self.dataset['questions'])
    }
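# Why PadTensors precedes StackTensors in the collate above: questions and
# region features vary in length across items, so they must be padded to a
# common size before stacking. A sketch, assuming the usual bootstrap.pytorch
# PadTensors semantics (right-padding the keys listed in use_keys):
#
# >>> import torch
# >>> items = [{'question': torch.tensor([4, 8, 15])},
# ...          {'question': torch.tensor([16, 23])}]
# after ListDictsToDictLists + PadTensors + StackTensors, batch['question']
# would be a 2x3 tensor with the shorter question zero-padded on the right.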
def __init__(self, dir_data, split, win_size, im_size,
             layer,  # "goal" or "cause"
             frame_position, traintest_mode,
             fps=10,
             horizon=2,  # in seconds
             extract_mode=False, batch_size=2, debug=False,
             shuffle=False, pin_memory=False, nb_threads=0):
    self.win_size = win_size
    self.frame_position = frame_position
    super(HDDClassif, self).__init__(dir_data, split, im_size, fps,
                                     horizon,  # in seconds
                                     batch_size, debug, shuffle,
                                     pin_memory, nb_threads)
    self.layer = layer
    if self.layer == "cause":
        self.layer_id = '1'
        self.classid_to_ix = [-1, 16, 17, 18, 19, 20, 22]
    elif self.layer == "goal":
        self.layer_id = '0'
        self.classid_to_ix = [-1, 0, 1, 2, 3, 4, 5, 7, 8, 10, 11, 12]
    else:
        raise ValueError(self.layer)
    # The classid 0 is the background class
    self.ix_to_classid = dict((ix, classid)
                              for classid, ix in enumerate(self.classid_to_ix))
    self.class_freq = self.get_class_freq()
    self.collate_fn = bootstrap_tf.Compose([
        bootstrap_tf.ListDictsToDictLists(),
        bootstrap_tf.StackTensors()
    ])
    self.dir_navig_features = self.dir_processed_annot
    self.im_transform = transforms.Compose([
        transforms.Resize((self.im_h, self.im_w)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.43216, 0.394666, 0.37645],
                             std=[0.22803, 0.22145, 0.216989])
    ])
    self.traintest_mode = traintest_mode
    if self.traintest_mode:
        self.make_batch_loader = self._make_batch_loader_traintest
    else:
        self.make_batch_loader = self._make_batch_loader
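# Sanity sketch of the label mapping above: enumerate() yields
# (classid, ix) pairs, so ix_to_classid maps a raw HDD annotation id back to
# a contiguous class id, with -1 becoming class 0, the background:
#
# >>> classid_to_ix = [-1, 16, 17]
# >>> dict((ix, classid) for classid, ix in enumerate(classid_to_ix))
# {-1: 0, 16: 1, 17: 2}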
def default_items_tf():
    return transforms.Compose([
        transforms.ListDictsToDictLists(),
        transforms.PadTensors(value=0),
        transforms.StackTensors()
    ])
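# Unlike a plain stack, this default pipeline first zero-pads variable-length
# tensors so items with different sequence lengths can be batched (assuming
# the usual bootstrap.pytorch PadTensors semantics):
#
# >>> import torch
# >>> items = [{'ids': torch.tensor([1, 2, 3])}, {'ids': torch.tensor([4])}]
# >>> default_items_tf()(items)['ids']
# tensor([[1, 2, 3],
#         [4, 0, 0]])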
def __init__(self, dir_data, split, neg_ratio=0., batch_size=100,
             nb_threads=0, seed=1234, shuffle=True, pin_memory=True,
             mode='predicate'):
    super(VRD, self).__init__(dir_data=dir_data,
                              split=split,
                              batch_size=batch_size,
                              nb_threads=nb_threads,
                              pin_memory=pin_memory,
                              shuffle=shuffle)
    assert split in ['train', 'val', 'test', 'trainval']
    self.neg_ratio = neg_ratio
    self.seed = seed
    assert mode in ['predicate', 'rel_phrase']
    self.mode = mode
    self.dir_raw_json = osp.join(self.dir_data, 'annotations', 'raw')
    self.dir_images = osp.join(self.dir_data, 'images')
    self.dir_processed = osp.join(self.dir_data, 'annotations', 'processed')
    if not osp.exists(self.dir_raw_json):
        self.download_json()
    if not osp.exists(self.dir_images):
        self.download_images()
    self.vocabs = self.load_vocabs()
    if not osp.exists(self.dir_processed):
        self.process_json()
    self.json = self.load_json()
    self.ids = sorted(list(self.json.keys()))
    self.ids = self.remove_no_bboxes_images()
    if self.mode == 'predicate':
        if self.split in ['train', 'val']:
            self.make_train_val_split()
        if self.split in ['train', 'val', 'trainval']:
            self.dir_features = osp.join(self.dir_data, 'features', 'gt_boxes', 'train')
        else:
            self.dir_features = osp.join(self.dir_data, 'features', 'gt_boxes', 'test')
    elif self.mode == 'rel_phrase':
        assert self.split == 'test'
        self.dir_features = osp.join(self.dir_data, 'features', 'pred_boxes', 'test')
        path_jraw = osp.join(self.dir_raw_json, 'annotations_test.json')
        with open(path_jraw, 'r') as f:
            self.json_raw = json.load(f)
    if not osp.exists(self.dir_features):
        self.download_features()
    if self.split in ['train', 'trainval']:
        # shuffling is delegated to the (uniform) weighted sampler,
        # so the loader itself must not shuffle
        self.shuffle = False
        self.sampler = WeightedRandomSampler(weights=[1] * len(self),
                                             num_samples=len(self),
                                             replacement=True)
    else:
        self.sampler = None
    self.collate_fn = transforms.Compose([
        transforms.ListDictsToDictLists(),
        transforms.CatTensors()
    ])
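# Note on the collate choice (an assumption based on the usual
# bootstrap.pytorch transform semantics): CatTensors concatenates the
# per-item tensors along dim 0 rather than stacking them on a new batch
# dimension, which suits VRD items that each carry a variable number of
# subject-object pairs.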
def __init__(
    self,
    dir_data,
    dir_coco,
    dir_vg,
    split,
    val_size=0.05,
    image_features="default",
    background_coco=None,
    background_vg=None,
    background=False,
    background_merge=2,
    proportion_opposite=0.0,  # not used
    train_selection=None,  # not used
    no_features=False,
    path_questions=None,
    sampling=None,
    shuffle=None,
    batch_size=None,
):
    super().__init__()
    self.dir_data = dir_data
    self.dir_coco = dir_coco
    self.dir_vg = dir_vg
    self.split = split
    self.image_features = image_features
    self.dir_coco_lvis = "data/vqa/coco/extract_rcnn/lvis"
    self.dir_vg_lvis = "data/vqa/vgenome/extract_rcnn/lvis"
    self.background_coco = background_coco
    self.background_vg = background_vg
    self.background = background
    self.background_merge = background_merge
    self.no_features = no_features
    self.val_size = val_size
    self.path_questions = path_questions  # overrides the path to questions (default dir_data/split.json)
    self.sampling = sampling
    self.shuffle = shuffle
    self.batch_size = batch_size
    if self.dir_coco.endswith(".zip"):
        self.zip_coco = None  # lazy loading of zipfile.ZipFile(self.dir_coco)
    if self.dir_vg.endswith(".zip"):
        self.zip_vg = None  # lazy loading of zipfile.ZipFile(self.dir_vg)
    if self.background_coco is not None and self.background_coco.endswith(".zip"):
        self.zip_bg_coco = None  # lazy loading of zipfile.ZipFile(self.background_coco)
    if self.background_vg is not None and self.background_vg.endswith(".zip"):
        self.zip_bg_vg = None  # lazy loading of zipfile.ZipFile(self.background_vg)
    if self.dir_coco.endswith(".lmdb"):
        self.lmdb_coco = None  # lazy loading as well
    if self.split not in ["train", "test"]:
        self.process_split()
    # path = os.path.join(self.dir_data, "processed", "questions.json")
    q_path = self.get_path_questions()  # train or test
    Logger()("Loading questions")
    with open(q_path) as f:
        self.questions = json.load(f)
    self.path_wid_to_word = os.path.join(
        self.dir_data, "processed", "wid_to_word.pth"
    )
    if os.path.exists(self.path_wid_to_word):
        self.wid_to_word = torch.load(self.path_wid_to_word)
    else:
        os.makedirs(os.path.join(self.dir_data, "processed"), exist_ok=True)
        word_list = self.get_token_list()
        self.wid_to_word = {wid + 1: word for wid, word in enumerate(word_list)}
        torch.save(self.wid_to_word, self.path_wid_to_word)
    self.word_to_wid = {word: wid for wid, word in self.wid_to_word.items()}
    self.aid_to_ans = [str(a) for a in range(16)]
    self.ans_to_aid = {ans: i for i, ans in enumerate(self.aid_to_ans)}
    self.collate_fn = bootstrap_tf.Compose(
        [
            bootstrap_tf.ListDictsToDictLists(),
            bootstrap_tf.PadTensors(
                use_keys=[
                    "question",
                    "pooled_feat",
                    "cls_scores",
                    "rois",
                    "cls",
                    "cls_oh",
                    "norm_rois",
                ]
            ),
            # bootstrap_tf.SortByKey(key='lengths'),  # not needed in the current implementation
            bootstrap_tf.StackTensors(),
        ]
    )
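# The zip/lmdb attributes above are set to None so the archives can be opened
# lazily, typically so each DataLoader worker holds its own file handle. A
# typical accessor would look like this (hypothetical helper, not from the
# source):
#
# def _get_zip_coco(self):
#     if self.zip_coco is None:
#         self.zip_coco = zipfile.ZipFile(self.dir_coco)
#     return self.zip_coco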