def get_train_val_loaders(args, batch_size=32, dev_mode=False, train_shuffle=True, val_num=4000):
    """Build train/val DataLoaders for the multi-label classification task.

    Training images are re-sampled with label-frequency weights
    (``get_weighted_sample``) so that rare labels appear more often.

    Args:
        args: namespace providing ``cls_type``, ``start_index``, ``end_index``.
        batch_size: batch size for both loaders.
        dev_mode: accepted for interface compatibility; currently has no
            effect in this loader (NOTE(review): the original dev-mode
            truncation was commented out — confirm before re-enabling).
        train_shuffle: whether to shuffle the training loader.
        val_num: cap on validation rows; ``None`` keeps all rows.

    Returns:
        ``(train_loader, val_loader)``; each loader carries a ``.num``
        attribute holding the underlying dataset size.
    """
    classes, stoi = get_classes(args.cls_type, args.start_index, args.end_index)
    train_meta, val_meta = get_train_val_meta(args.cls_type, args.start_index, args.end_index)

    # Keep only images with at most 10 annotated objects.
    train_meta = train_meta[train_meta['obj_num'] <= 10]
    val_meta = val_meta[val_meta['obj_num'] <= 10]

    # Re-sample training image ids by label weights, then look up the
    # label strings for exactly the sampled ids (duplicates included).
    train_img_ids = get_weighted_sample(train_meta, 1024*100)
    df_sampled = train_meta.set_index('ImageID').loc[train_img_ids]

    if val_num is not None:
        val_meta = val_meta.iloc[:val_num]

    img_dir = settings.TRAIN_IMG_DIR
    train_set = ImageDataset(True, train_img_ids, img_dir, classes, stoi,
                             df_sampled['LabelName'].values.tolist())
    val_set = ImageDataset(False, val_meta['ImageID'].values.tolist(), img_dir,
                           classes, stoi, val_meta['LabelName'].values.tolist())

    train_loader = data.DataLoader(train_set, batch_size=batch_size, shuffle=train_shuffle,
                                   num_workers=4, collate_fn=train_set.collate_fn, drop_last=True)
    train_loader.num = train_set.num
    val_loader = data.DataLoader(val_set, batch_size=batch_size, shuffle=False,
                                 num_workers=4, collate_fn=val_set.collate_fn, drop_last=False)
    val_loader.num = val_set.num

    return train_loader, val_loader
def get_val_loader(args, val_index, batch_size=32, dev_mode=False, val_num=3000):
    """Build a validation DataLoader for the given class subset.

    Args:
        args: namespace providing ``cls_type``, ``start_index``,
            ``end_index`` and ``max_labels``.
        val_index: forwarded to ``ImageDataset`` (index of the class
            subset; semantics defined by ``ImageDataset``).
        batch_size: loader batch size.
        dev_mode: if True, keep only 10 rows for quick iteration.
        val_num: number of validation rows kept after shuffling.

    Returns:
        A ``DataLoader`` with a ``.num`` attribute holding the dataset size.
    """
    classes, stoi = get_classes(args.cls_type, args.start_index, args.end_index)
    _, val_meta = get_train_val_meta(args.cls_type, args.start_index, args.end_index)

    # Filter: keep images whose object count is <= args.max_labels.
    val_meta = val_meta[val_meta['obj_num'] <= args.max_labels]

    if len(classes) < 7172:
        # Training on a class subset: keep only images that carry at least
        # one selected class. Use a boolean mask instead of writing a
        # 'tmp_label_count' column onto a filtered slice — the original
        # triggered pandas' SettingWithCopyWarning and leaked the temp
        # column downstream. Same rows are kept.
        classes_set = set(classes)
        has_selected = val_meta['LabelName'].map(
            lambda x: len(set(x.split()) & classes_set) > 0)
        val_meta = val_meta[has_selected]

    # Deterministic shuffle so the validation subset is stable across runs.
    val_meta = shuffle(val_meta, random_state=1234).iloc[:val_num]

    if dev_mode:
        val_meta = val_meta.iloc[:10]

    img_dir = settings.TRAIN_IMG_DIR
    val_set = ImageDataset(False, val_meta['ImageID'].values.tolist(), img_dir,
                           classes, stoi, val_index,
                           val_meta['LabelName'].values.tolist())
    val_loader = data.DataLoader(val_set, batch_size=batch_size, shuffle=False,
                                 num_workers=4, drop_last=False)
    val_loader.num = val_set.num

    return val_loader
def get_train_loader(args, batch_size=32, dev_mode=False, train_shuffle=True):
    """Build a weighted-resampled training DataLoader.

    Args:
        args: namespace providing ``cls_type``, ``start_index``,
            ``end_index`` and ``max_labels``.
        batch_size: loader batch size.
        dev_mode: if True, keep only 10 sampled images and disable shuffling.
        train_shuffle: whether to shuffle the loader (forced off in dev mode).

    Returns:
        A ``DataLoader`` with a ``.num`` attribute holding the dataset size.
    """
    classes, stoi = get_classes(args.cls_type, args.start_index, args.end_index)
    train_meta, _ = get_train_val_meta(args.cls_type, args.start_index, args.end_index)

    # Filter: keep images whose object count is <= args.max_labels.
    train_meta = train_meta[train_meta['obj_num'] <= args.max_labels]
    print(train_meta.shape)

    if len(classes) < 7172:
        # Training on a class subset: keep only images carrying at least one
        # selected class. Boolean mask instead of mutating the filtered
        # slice (avoids SettingWithCopyWarning and a leftover temp column).
        classes_set = set(classes)
        has_selected = train_meta['LabelName'].map(
            lambda x: len(set(x.split()) & classes_set) > 0)
        train_meta = train_meta[has_selected]

    # Re-sample training image ids by label weights, then look up the
    # label strings for exactly the sampled ids.
    train_img_ids = get_weighted_sample(train_meta, 1024*100)
    df_sampled = train_meta.set_index('ImageID').loc[train_img_ids]

    if dev_mode:
        # BUG FIX: the original truncated train_meta here, which had no
        # effect because the dataset is built from the already-sampled ids.
        # Truncate the sampled ids/labels so dev mode actually shrinks it.
        train_img_ids = train_img_ids[:10]
        df_sampled = df_sampled.iloc[:10]
        train_shuffle = False

    img_dir = settings.TRAIN_IMG_DIR
    train_set = ImageDataset(True, train_img_ids, img_dir, classes, stoi, None,
                             df_sampled['LabelName'].values.tolist())
    train_loader = data.DataLoader(train_set, batch_size=batch_size,
                                   shuffle=train_shuffle, num_workers=4,
                                   drop_last=True)
    train_loader.num = train_set.num

    return train_loader
def get_train_val_loaders(batch_size=8, dev_mode=False, drop_empty=False, img_sz=384):
    """Create segmentation train/val loaders over ``settings.TRAIN_IMG_DIR``.

    Args:
        batch_size: batch size for both loaders.
        dev_mode: if True, keep only 10 rows per split, disable shuffling
            and joint image/mask augmentation, and attach ground-truth
            masks to the train loader as well.
        drop_empty: forwarded to ``get_train_val_meta``.
        img_sz: side length used by the image/mask transforms.

    Returns:
        ``(train_loader, val_loader)``; both carry ``.num`` (dataset size),
        and the val loader additionally carries ``.y_true`` and ``.meta``.
    """
    train_meta, val_meta = get_train_val_meta(drop_empty=drop_empty)

    joint_augment = get_img_mask_transforms(img_sz)  # ImgAug(aug.get_affine_seq('edge'))
    shuffle_train = True
    if dev_mode:
        shuffle_train = False
        joint_augment = None
        train_meta = train_meta.iloc[:10]
        val_meta = val_meta.iloc[:10]
    print(train_meta.shape, val_meta.shape)

    train_set = ImageDataset(
        True,
        train_meta,
        img_dir=settings.TRAIN_IMG_DIR,
        augment_with_target=joint_augment,
        image_augment=transforms.ColorJitter(0.2, 0.2, 0.2, 0.2),  # ImgAug(aug.brightness_seq)
        image_transform=get_img_transforms(img_sz),
        mask_transform=get_mask_transforms(img_sz))
    train_loader = data.DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=shuffle_train,
        num_workers=16,
        collate_fn=train_set.collate_fn,
        drop_last=True)
    train_loader.num = len(train_set)
    if dev_mode:
        train_loader.y_true = read_masks(train_meta['ImageId'].values, settings.TRAIN_MASK_DIR)

    val_set = ImageDataset(
        True,
        val_meta,
        img_dir=settings.TRAIN_IMG_DIR,
        augment_with_target=None,
        image_augment=None,  # ImgAug(aug.pad_to_fit_net(64, 'reflect'))
        image_transform=get_img_transforms(img_sz),
        mask_transform=get_mask_transforms(img_sz))
    val_loader = data.DataLoader(
        val_set,
        batch_size=batch_size,
        shuffle=False,
        num_workers=16,
        collate_fn=val_set.collate_fn)
    val_loader.num = len(val_set)
    val_loader.y_true = read_masks(val_meta['ImageId'].values, settings.TRAIN_MASK_DIR)
    val_loader.meta = val_meta

    return train_loader, val_loader
def test_sampling():
    """Smoke-test the weighted-sampling pipeline and print diagnostics.

    Loads the full 'trainable' metadata, prints weight statistics, draws a
    50k weighted sample of image ids, and shows the most frequent ids plus
    the rows selected from the metadata. Output-only; returns nothing.
    """
    # get_classes/get_train_val_meta are still called so any loading side
    # effects are exercised, but their unused results are discarded
    # (the original bound classes/stoi/val_meta and never used them).
    get_classes('trainable', 0, 7172)
    train_meta, _ = get_train_val_meta('trainable', 0, 7172)
    print(train_meta.shape)
    print(train_meta.head())

    w = get_weights_by_counts(train_meta['rare_counts'].values, max_weight=500)
    print('weights:', [int(x) for x in w.tolist()])
    print(w.shape)

    sample = get_weighted_sample(train_meta, 50000)
    counts = Counter()
    print(sample[:10])
    counts.update(sample)
    print(counts.most_common(100))
    # Removed: a dead `common_ids = [x[0] for x in counts.most_common(50000)]`
    # computation whose result was never used.

    df_selected = train_meta.set_index('ImageID')
    df_selected = df_selected.loc[sample]
    print(df_selected.head(20))
# NOTE(review): the lines below are the tail of a BalancedSammpler method —
# the class header, the enclosing loops, and the branch condition are outside
# this view, so the nesting here is reconstructed and must be confirmed
# against the full file.
                        self.img_ids.append(row[0])
                    else:
                        # Row rejected by the (unseen) condition above; no-op.
                        pass
                # Stop once every target class has been covered.
                if len(self.full_classes) >= self.n_classes:
                    break
                # Stop if a full pass added no new images (no progress).
                if len(self.img_ids) == last_image_nums:
                    break
                rounds += 1
                # Safety cap on the number of sampling passes.
                if rounds > max_rounds:
                    break


if __name__ == '__main__':
    # Ad-hoc smoke test: balanced-sample the first 500 'trainable' classes
    # and print the selected image ids / per-class counts.
    classes, stoi = get_classes('trainable', 0, 500)
    meta, _ = get_train_val_meta('trainable', 0, 500)
    #print(meta.head())
    #print(meta.shape)
    sampler = BalancedSammpler(meta, classes, stoi, balanced=True)
    #print(len(sampler.img_ids))
    print(sampler.img_ids[:10])
    print(sampler.class_counts)
    # Re-index metadata by image id to fetch the sampled rows.
    df1 = meta.set_index('ImageID')
    print(df1.head())
    selected = df1.loc[sampler.img_ids]
    print('selected:')
    print(selected.shape)