def __init__(self, root=MyPath.db_root_dir('bird'), split='train', transform=None):
    """Index the CUB-200-2011 images that belong to the requested split.

    Args:
        root: Kept for interface compatibility. It is not used here: the
            data location is whatever ``untar_data(URLs.CUB_200_2011)``
            returns.
        split: ``'train'`` selects the rows flagged ``1`` in
            ``train_test_split.txt``; any other value selects the rows
            flagged ``0``.
        transform: Stored unchanged on ``self.transform``.
    """
    super(Birds, self).__init__()
    # The split selection below reads `self.split`, so the argument has to be
    # stored; it was previously never assigned.
    self.split = split
    self.transform = transform
    self.resize = tf.Resize(256)

    path = untar_data(URLs.CUB_200_2011)
    self.files = get_image_files(path/"images")

    # index -> class label. The labels are sorted *before* numbering so that
    # a class gets the same index on every run; numbering the raw set made
    # the ids depend on set iteration order.
    self.label = dict(enumerate(sorted(set(self.files.map(self.label_func)))))
    # class label -> index (inverse of `self.label`).
    self.labels = {value: key for key, value in self.label.items()}

    # No `header`/`names` is passed, so pandas takes the first line of the
    # file as column names; the code addresses the columns as '1' and '0'.
    # NOTE(review): that first line is therefore not part of the data rows -
    # confirm this is intended.
    self.df = pd.read_csv(path/'train_test_split.txt', delimiter=' ')

    # Column '0' carries the split flag, column '1' the file identifier.
    wanted_flag = 1 if self.split == 'train' else 0
    self.file_index = self.df.loc[self.df['0'] == wanted_flag, '1'].tolist()

    # Set membership keeps the filter linear in the number of files.
    wanted_ids = set(self.file_index)
    self.Files = [f for f in self.files if self.splitter(f) in wanted_ids]
def setup_data(vocab_size, min_frequency):
    """Load the WikiText training documents and build a vocabulary from them.

    Args:
        vocab_size: Upper bound on the number of tokens considered; only the
            ``vocab_size`` most frequent tokens are candidates.
        min_frequency: Minimum corpus-wide count a candidate token needs in
            order to be kept.

    Returns:
        A ``(df, vocab, vocab_strings)`` tuple: the documents (first column
        of ``train.csv``), a token -> index mapping that yields ``-1`` for
        unknown tokens, and the kept tokens ordered by decreasing count.
    """

    def _create_vocab(df, vocab_size, min_frequency=1):
        """Count alphabetic, non-stop-word tokens in *df* and index the top ones."""
        docs = df.values.tolist()
        print(f"Starting parsing docs, in total {len(docs)}")

        counter = Counter()
        # Iterating the list directly lets tqdm report the total, and
        # `update` adds the tokens in place instead of re-scanning the whole
        # accumulated counter for every document as `counter += ...` did.
        for doc in tqdm(docs):
            counter.update(
                token.text
                for token in tokenizer(doc)
                if not token.is_stop and token.is_alpha
            )

        vocab_strings = [
            token
            for token, count in counter.most_common(vocab_size)
            if count >= min_frequency
        ]
        # create a dictionary with a default of -1 for word not existing in our vocab
        # (being a defaultdict, looking up an unknown word with `vocab[word]`
        # also stores that word with the value -1)
        vocab = defaultdict(
            lambda: -1,
            {value: key for key, value in enumerate(vocab_strings)})
        print(
            f"Created vocab of size {len(vocab)}. Most common words are {vocab_strings[:10]}"
        )
        return vocab, vocab_strings

    path = untar_data(URLs.WIKITEXT)
    # Keep only the first column of every row: one document per entry.
    df = pd.read_csv(path / 'train.csv', header=None).apply(lambda x: x[0], axis=1)
    vocab, vocab_strings = _create_vocab(df, vocab_size, min_frequency)
    return df, vocab, vocab_strings
def read_mnist():
    """Load the 3s and 7s of the MNIST sample for the train and valid splits.

    For each digit/split pair the sorted listing of the image folder is
    handed to ``load_lazy`` together with a ``.pt`` file path under ``/tmp``
    (presumably a cache location - see ``load_lazy``).

    Returns:
        ``(threes_t, seven_t, threes_t_v, seven_t_v)``: training 3s,
        training 7s, validation 3s and validation 7s, as returned by
        ``load_lazy``.
    """
    path = untar_data(URLs.MNIST_SAMPLE)

    # (file passed to load_lazy, split folder, digit folder), listed in the
    # order of the returned tuple.
    sources = (
        ('/tmp/mnist_sample_stacked3.pt', 'train', '3'),
        ('/tmp/mnist_sample_stacked7.pt', 'train', '7'),
        ('/tmp/mnist_sample_stacked3_valid.pt', 'valid', '3'),
        ('/tmp/mnist_sample_stacked7_valid.pt', 'valid', '7'),
    )
    threes_t, seven_t, threes_t_v, seven_t_v = [
        load_lazy(pt_file, (path / split_dir / digit_dir).ls().sorted())
        for pt_file, split_dir, digit_dir in sources
    ]
    return threes_t, seven_t, threes_t_v, seven_t_v
def get_paths(n_samples=10_000, train_fraction=0.8, seed=123):
    '''
    Download the COCO sample dataset, draw a random subset of its images
    and split that subset into a training and a validation part.

    With the default arguments this samples 10k images and splits them
    80/20 (8000 train / 2000 val), exactly as the parameterless version did.

    Args:
        n_samples (int): number of images drawn (without replacement)
        train_fraction (float): share of the drawn images used for training
        seed (int): value passed to np.random.seed; note that this reseeds
            NumPy's global random state

    Return:
        train_paths, val_paths: NumPy arrays of image path strings

    Raises:
        ValueError: if fewer than n_samples images are available
    '''
    coco_path = untar_data(URLs.COCO_SAMPLE)
    coco_path = str(coco_path) + "/train_sample"
    paths = glob.glob(coco_path + "/*.jpg")  # Grabbing all the image file names

    if len(paths) < n_samples:
        # np.random.choice would raise ValueError as well; this message says why.
        raise ValueError(
            f"Requested {n_samples} images but only {len(paths)} found in {coco_path}"
        )

    np.random.seed(seed)
    # choosing n_samples images randomly
    paths_subset = np.random.choice(paths, n_samples, replace=False)
    rand_idxs = np.random.permutation(n_samples)

    n_train = round(n_samples * train_fraction)
    train_idxs = rand_idxs[:n_train]  # first part of the shuffle: training set
    val_idxs = rand_idxs[n_train:]  # remainder: validation set
    return paths_subset[train_idxs], paths_subset[val_idxs]
# - how to pass along a custom `splitter` to `Learner` to take advantage of transfer learning # ## Preparing the data # To make our data ready for training a model, we need to create a `DataLoaders` object in fastai. It is just a wrapper around a training `DataLoader` and a validation `DataLoader`, so if you already have your own PyTorch dataloaders, you can create such an object directly. # # Here we don't have anything ready yet. Usually, when using PyTorch, the first step is to create a `Dataset` that is then wrapped inside a `DataLoader`. We will do this first, then see how to change this `Dataset` into a `Transform` that will let us take advantage of fastai's functionality for showing a batch or using data augmentation on the GPU. Lastly we will see how we can customize the data block API and create our own new `TransformBlock`. # ### Purely in PyTorch # To begin with, we will only use PyTorch and PIL to create a `Dataset` and see how to get this inside fastai. The only helper functions from fastai we will use are `untar_data` (to download and untar the dataset) and `get_image_files` (that looks for all images in a folder recursively). Here, we will use the [Oxford-IIIT Pet Dataset](https://www.robots.ox.ac.uk/~vgg/data/pets/). # `untar_data` returns a `pathlib.Path` object with the location of the decompressed dataset, and in this case, all the images are in an images subfolder: path = untar_data(URLs.PETS) files = get_image_files(path / "images") files[0] # We can open the first image with PIL and have a look at it: img = PIL.Image.open(files[0]) img # Let's wrap all the standard preprocessing (resize, conversion to tensor, dividing by 255 and reordering of the channels) in one helper function: def open_image(fname, size=224): img = PIL.Image.open(fname).convert('RGB')
img = Image.open(self.paths[idx]).convert("RGB") img = self.transforms(img) img = np.array(img) lab_img = rgb2lab(img).astype("float32") lab_img = transforms.ToTensor()(lab_img) L = lab_img[[0], ...] / 50. - 1. ab = lab_img[[1, 2], ...] / 110. return {"L": L, "ab": ab} def __len__(self): return len(self.paths) root = str(untar_data(URLs.COCO_SAMPLE)) + "/train_sample" paths = glob.glob(root + "/*.jpg") np.random.seed(42) paths_subset = np.random.choice(paths, 12_000, replace=False) rand_idxs = np.random.permutation(12_000) train_idxs = rand_idxs[:10_000] val_idxs = rand_idxs[10_000:] train_paths = paths_subset[train_idxs] val_paths = paths_subset[val_idxs] train_dset = TrainingDataset(train_paths) val_dset = ValidationDataset(val_paths)
"""
Downloads test data used in CI
"""
from IPython import get_ipython
from fastai.data.external import untar_data, URLs
from fastai.torch_core import parallel
import pickle

# Names of the `URLs` attributes whose datasets are downloaded.
urls = [
    'ADULT_SAMPLE',
    'BIWI_SAMPLE',
    'CAMVID_TINY',
    'CIFAR',
    'COCO_TINY',
    'IMDB',
    'IMDB_SAMPLE',
    'ML_SAMPLE',
    'MNIST',
    'MNIST_SAMPLE',
    'MNIST_TINY',
    'PETS'
]

# Resolve every name to the value stored under it on the `URLs` class.
url_list = [vars(URLs)[k] for k in urls]


def _download(url):
    """Announce *url*, download/extract it and return a ``(None, path)`` pair."""
    print(f'Downloading {url}')
    # The first slot is None because the entries used to be built as
    # (print(...), untar_data(...)) and print() returns None.
    return (None, untar_data(url))


files = [_download(u) for u in url_list]
#     splitter=RandomSplitter(0.1)
# )
# dls_lm = dls_lm.dataloaders(df_all, bs=64, seq_len=72)
# print(dls_lm.show_batch(max_n=3))
# learn = language_model_learner(
#     dls_lm, AWD_LSTM,
#     metrics=[accuracy, Perplexity()]).to_fp16()
# print(learn.model)
# print(learn.lr_find())
# learn.fine_tune(5, 1e-2, cbd=TensorBoardCallback(PATH_TENSORBOARD, trace_model=True))

#%%
# Prepare IMDB data
path = untar_data(URLs.IMDB)
bs = 32

# Fine-tune pretrained language model (based on wikitext) to the IMDB corpus
get_imdb = partial(get_text_files, folders=["train", "test", "unsup"])

# The DataBlock gets its own name so that `dls_lm` only ever refers to the
# resulting DataLoaders (it used to be bound to the block first).
imdb_lm_block = DataBlock(
    blocks=TextBlock.from_folder(path, is_lm=True, n_workers=4),
    get_items=get_imdb,
    splitter=RandomSplitter(0.1),
)
dls_lm = imdb_lm_block.dataloaders(path, path=path, bs=bs, seq_len=80)

# show_batch renders the batch itself; wrapping it in print() only echoed
# the call's return value (None).
dls_lm.show_batch(max_n=3)

# #%%
# learn = language_model_learner(
#     dls_lm, AWD_LSTM, drop_mult=0.3,
from fastai.learner import Learner
from fastai.metrics import Precision, Recall, accuracy
from fastai.vision.augment import aug_transforms
from fastai.vision.core import PILImageBW
from fastai.vision.data import ImageBlock, ImageDataLoaders
from pytorch_model_summary import summary

from ml_for_programmers.config import Config

# %%
# Folder below the project's data directory that receives the MNIST files;
# it is created up front, together with any missing parents.
config = Config()
mnist_root = config.data_dir_path / "external/mnist_fastai"
mnist_root.mkdir(parents=True, exist_ok=True)

# %%
# Download/extract MNIST into that folder; the bare expression shows the
# resulting path when the cell is run interactively.
mnist_dir = untar_data(URLs.MNIST, dest=mnist_root)
mnist_dir

# %%
# NOTE(review): presumably BASE_PATH makes printed paths relative to the
# dataset root - confirm against fastcore.
Path.BASE_PATH = mnist_dir
print(mnist_dir.ls(), "\n")
pprint(sorted((mnist_dir / "training").ls()))

# %%
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# %%