def test_db_num_workers():
    """db_num_workers is forced to 0 on Windows; elsewhere it defaults to 16
    and honors the non_windows_num_workers override."""
    on_windows = is_windows()
    expected_default = 0 if on_windows else 16
    expected_override = 0 if on_windows else 7
    assert db_num_workers() == expected_default
    assert db_num_workers(non_windows_num_workers=7) == expected_override
def __init__(
    self,
    root: Union[str, Path],
    batch_size: int = 2,
    transforms: object = None,
    train_pct: float = 0.5,
    anno_dir: str = "annotations",
    im_dir: str = "images",
):
    """ Initialize the dataset.

    This class assumes that the data is formatted in two folders:
        - annotation folder which contains the Pascal VOC formatted
          annotations
        - image folder which contains the images

    Args:
        root: the root path of the dataset containing the image and
            annotation folders
        batch_size: batch size for dataloaders
        transforms: the transformations to apply. If None (the default),
            get_transform(train=True) is used.
        train_pct: the ratio of training to testing data
        anno_dir: the name of the annotation subfolder under the root
            directory
        im_dir: the name of the image subfolder under the root directory.
            If set to 'None' then infers image location from annotation
            .xml files
    """
    # Resolve the transform default lazily. Putting get_transform(train=True)
    # in the signature would evaluate it once at import time and share one
    # transform object across every instance of this class.
    if transforms is None:
        transforms = get_transform(train=True)

    self.root = Path(root)
    # TODO think about how transforms are working...
    self.transforms = transforms
    self.im_dir = im_dir
    self.anno_dir = anno_dir
    self.batch_size = batch_size
    self.train_pct = train_pct

    # read annotations
    self._read_annos()

    # create training and validation datasets
    self.train_ds, self.test_ds = self.split_train_test(
        train_pct=train_pct
    )

    # create training and validation data loaders
    self.train_dl = DataLoader(
        self.train_ds,
        batch_size=self.batch_size,
        shuffle=True,
        num_workers=db_num_workers(),
        collate_fn=collate_fn,
    )
    self.test_dl = DataLoader(
        self.test_ds,
        batch_size=self.batch_size,
        shuffle=False,
        num_workers=db_num_workers(),
        collate_fn=collate_fn,
    )
def test_set_random_seed(tiny_ic_data_path):
    """Seeding twice with the same value must produce identical data batches."""

    def build_batch():
        # Build a fresh databunch and draw one batch from it.
        bunch = (
            ImageList.from_folder(tiny_ic_data_path)
            .split_by_rand_pct()
            .label_from_folder()
            .transform()
            .databunch(bs=5, num_workers=db_num_workers())
            .normalize()
        )
        return bunch.one_batch()

    set_random_seed(1)
    first_batch = build_batch()
    set_random_seed(1)
    second_batch = build_batch()
    assert first_batch[1].tolist() == second_batch[1].tolist()
def _get_data_bunch_segmentationitemlist(
    path: Union[Path, str],
    transform: bool,
    im_size: int,
    bs: int,
    classes: List[str],
) -> ImageDataBunch:
    """ Create ImageDataBunch for segmentation and return it.

    TODO in future version is to allow users to pass in their own image bunch
    or their own Transformation objects (instead of using fastai's
    <get_transforms>)

    Args:
        path (Union[Path, str]): path to data to create databunch with
        transform (bool): a flag to set fastai default transformations
            (get_transforms())
        im_size (int): image size of databunch
        bs (int): batch size of databunch
        classes (List[str]): segmentation class names passed to
            label_from_func

    Returns:
        ImageDataBunch
    """
    # Path(Path(...)) is a no-op, so this normalizes both str and Path input.
    path = Path(path)
    tfms = get_transforms() if transform else None
    im_path = path / "images"
    anno_path = path / "segmentation-masks"

    def get_gt_filename(x):
        # Ground-truth mask shares the image's stem, stored as a .png file.
        return anno_path / f"{x.stem}.png"

    # Load data
    return (
        SegmentationItemList.from_folder(im_path)
        .split_by_rand_pct(valid_pct=0.33)
        .label_from_func(get_gt_filename, classes=classes)
        .transform(tfms=tfms, size=im_size, tfm_y=True)
        .databunch(bs=bs, num_workers=db_num_workers())
        .normalize(imagenet_stats)
    )
def init_data_loaders(self):
    """ Create training and validation data loaders """
    # Both loaders share everything except shuffling: the training loader
    # reshuffles every epoch, the test loader keeps a fixed order.
    shared_kwargs = dict(
        batch_size=self.batch_size,
        num_workers=db_num_workers(),
        collate_fn=collate_fn,
    )
    self.train_dl = DataLoader(self.train_ds, shuffle=True, **shared_kwargs)
    self.test_dl = DataLoader(self.test_ds, shuffle=False, **shared_kwargs)
def tiny_seg_databunch(tiny_seg_data_path, seg_classes):
    """ Returns a databunch object for the segmentation tiny fridge objects
    dataset. """

    def get_gt_filename(x):
        # Mask file lives next to the images, keyed by image stem.
        return f"{tiny_seg_data_path}/segmentation-masks/{x.stem}.png"

    items = SegmentationItemList.from_folder(tiny_seg_data_path)
    split = items.split_by_rand_pct(valid_pct=0.1, seed=10)
    labeled = split.label_from_func(get_gt_filename, classes=seg_classes)
    transformed = labeled.transform(get_transforms(), tfm_y=True, size=50)
    bunch = transformed.databunch(bs=8, num_workers=db_num_workers())
    return bunch.normalize(imagenet_stats)
def tiny_ic_databunch(tmp_session):
    """ Returns a databunch object for the tiny fridge objects dataset. """
    im_paths = unzip_url(
        ic_urls.fridge_objects_tiny_path,
        fpath=tmp_session,
        dest=tmp_session,
        exist_ok=True,
    )
    # Build the databunch step by step instead of one long call chain.
    images = ImageList.from_folder(im_paths)
    split = images.split_by_rand_pct(valid_pct=0.1, seed=20)
    labeled = split.label_from_folder().transform(size=50)
    bunch = labeled.databunch(bs=16, num_workers=db_num_workers())
    return bunch.normalize(imagenet_stats)
def _get_data_bunch_imagelist(
    path: Union[Path, str], transform: bool, im_size: int, bs: int
) -> ImageDataBunch:
    """ Create ImageDataBunch and return it.

    TODO in future version is to allow users to pass in their own image bunch
    or their own Transformation objects (instead of using fastai's
    <get_transforms>)

    Args:
        path (Union[Path, str]): path to data to create databunch with
        transform (bool): a flag to set fastai default transformations
            (get_transforms())
        im_size (int): image size of databunch
        bs (int): batch size of databunch

    Returns:
        ImageDataBunch
    """
    # Path(Path(...)) is a no-op, so this normalizes both str and Path input
    # without the fragile `type(path) is Path` check.
    path = Path(path)
    tfms = get_transforms() if transform else None
    return (
        ImageList.from_folder(path)
        .split_by_rand_pct(valid_pct=0.33)
        .label_from_folder()
        .transform(tfms=tfms, size=im_size)
        .databunch(bs=bs, num_workers=db_num_workers())
        .normalize(imagenet_stats)
    )
def testing_databunch(tmp_session):
    """ Builds a databunch from the Fridge Objects and returns its validation
    component that is used to test comparative_set_builder"""
    im_paths = unzip_url(
        ic_urls.fridge_objects_tiny_path,
        fpath=tmp_session,
        dest=tmp_session,
        exist_ok=True,
    )
    # NOTE(review): a list of the first five "can" image paths used to be
    # computed here (os.listdir + slice) but was never read afterwards;
    # removed as dead code.
    random.seed(642)
    data = (
        ImageList.from_folder(im_paths)
        .split_by_rand_pct(valid_pct=0.2, seed=20)
        .label_from_folder()
        .transform(size=300)
        .databunch(bs=16, num_workers=db_num_workers())
        .normalize(imagenet_stats)
    )
    return data.valid_ds
# Image-classification training script: fine-tunes a ResNet-18 on the images
# under /app/classifier_data/, exports the trained model, and reports
# validation accuracy.
print(f"Fast.ai version = {fastai.__version__}")
which_processor()

# Training hyperparameters.
EPOCHS = 10
LEARNING_RATE = 1e-4
IM_SIZE = 300
BATCH_SIZE = 16
ARCHITECTURE = models.resnet18

# Load the dataset: labels come from folder names, 20% held out for
# validation (fixed seed for reproducibility).
path = Path('/app/classifier_data/')
data = (ImageList.from_folder(path).split_by_rand_pct(
    valid_pct=0.2,
    seed=10).label_from_folder().transform(size=IM_SIZE).databunch(
        bs=BATCH_SIZE,
        num_workers=db_num_workers()).normalize(imagenet_stats))
print(f'number of classes: {data.c}')
print(data.classes)

# Build the learner with a live training-metrics graph callback.
learn = cnn_learner(
    data,
    ARCHITECTURE,
    metrics=[accuracy],
    callback_fns=[partial(TrainMetricsRecorder, show_graph=True)])
# Unfreeze all layers so the whole backbone is fine-tuned, not just the head.
learn.unfreeze()
learn.fit(EPOCHS, LEARNING_RATE)

# Persist the trained model for later inference.
learn.export(file=Path("/app/classifier_model.pkl"))

# Evaluate on the validation split and report accuracy as a percentage.
_, validation_accuracy = learn.validate(learn.data.valid_dl,
                                        metrics=[accuracy])
print(f'Accuracy on validation set: {100*float(validation_accuracy):3.2f}')
# Image-similarity fine-tuning setup: loads the fine-tuning dataset and
# builds a ResNet-50 learner whose pooling layer will serve as the
# image-embedding output.
BATCH_SIZE = 32
IM_SIZE = 224
DROPOUT = 0
ARCHITECTURE = models.resnet50

# Desired embedding dimension. Higher dimensions slow down retrieval but often provide better accuracy.
EMBEDDING_DIM = 4096
# Only 4096 (keep the ResNet-50 pooling layer) or <= 2048 are supported.
assert EMBEDDING_DIM == 4096 or EMBEDDING_DIM <= 2048

# Load images into fast.ai's ImageDataBunch object
random.seed(642)
data_finetune = (ImageList.from_folder(DATA_FINETUNE_PATH).split_by_rand_pct(
    valid_pct=0.05, seed=20).label_from_folder().transform(
        tfms=fastai.vision.transform.get_transforms(),
        size=IM_SIZE).databunch(
            bs=BATCH_SIZE,
            num_workers=db_num_workers()).normalize(imagenet_stats))
print(
    f"Data for fine-tuning: {len(data_finetune.train_ds.x)} training images and {len(data_finetune.valid_ds.x)} validation images."
)

learn = cnn_learner(data_finetune, ARCHITECTURE, metrics=[], ps=DROPOUT)
print(learn.model[1])

# By default uses the 2048 dimensional pooling layer as implemented in the paper.
# Optionally can instead keep the 4096-dimensional pooling layer from the ResNet-50 model.
# NOTE(review): the else-branch body continues past this chunk.
if EMBEDDING_DIM != 4096:
    modules = []
    pooling_dim = 2048
else: