def remove(ds_name, data_path=None):
    """Remove a downloaded coco dataset."""
    path = Path(URLs.path(c_key='data'))/ds_name if data_path is None else Path(data_path)/ds_name
    if path.is_dir():
        rmtree(path)
        print(f"{path} removed.")
    else:
        print(f"No dataset '{path}' found.")
def get_path_df(ds_name, data_path=None):
    """Get path and dataframe of a downloaded coco dataset."""
    path = Path(URLs.path(c_key='data'))/ds_name if data_path is None else Path(data_path)/ds_name
    if path.is_dir():
        if (path/"df_train.csv").is_file():
            return (path, pd.read_csv(path/"df_train.csv"))
        else:
            print(f"No dataframe found in {path}")
    else:
        print(f"No dataset '{path}' found.")
        print("Create the dataset first with CocoData.create(ds_name, cat_list) "
              "or list available datasets with CocoData.ls()")
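# Example usage (a sketch; the dataset name "coco_person_car" is hypothetical
# and assumes the dataset was created with CocoData.create beforehand):
# path, df = CocoData.get_path_df("coco_person_car")
# df.head()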
def preprocess_audio_folder(path, folders=None, output_dir=None, **kwargs):
    "Preprocess audio files in `path`, caching the results to `output_dir`"
    path = Path(path)
    fnames = get_audio_files(path, recurse=True, folders=folders)
    output_dir = Path(ifnone(output_dir, path.parent / f"{path.name}_cached"))
    output_dir.mkdir(exist_ok=True)
    pp = PreprocessAudio(**kwargs)
    for fname in fnames:
        out = output_dir / fname.relative_to(path)
        out.parent.mkdir(parents=True, exist_ok=True)  # mirror the source folder structure
        aud = pp(fname)
        save_audio(str(out), aud, aud.sr)
    return output_dir
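# Example usage (a sketch; the folder layout and keyword arguments are
# hypothetical and assume `PreprocessAudio` accepts them, e.g. a target
# sample rate):
# cached_dir = preprocess_audio_folder("data/audio", folders=["train", "valid"],
#                                      sample_rate=16000)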
def _create_dataframe(path, cat_list, with_mask):
    print("Creating Dataframe...")
    path_images = path/"images"
    path_masks = path/"masks"
    dfs = []
    img_id2fn = {int(Path(fn).stem): fn for fn in path_images.ls()}
    img_ids = list(img_id2fn)
    idx2cat = {e['id']: e['name'] for e in CocoData.coco.loadCats(CocoData.coco.getCatIds())}
    for img_id in progress_bar(img_ids):
        annos = CocoData.coco.loadAnns(CocoData.coco.getAnnIds(imgIds=img_id))
        # remove annotations of other labels
        annos = [a for a in annos if idx2cat[a["category_id"]] in cat_list]
        # sort by area, largest first (annotations with identical areas collapse here)
        area_dict = {a["area"]: a for a in annos}
        annos = [area_dict[k] for k in sorted(area_dict, reverse=True)]
        n_objs = len(annos)
        df_x_mins = [a["bbox"][0] for a in annos]
        df_y_mins = [a["bbox"][1] for a in annos]
        widths = [a["bbox"][2] for a in annos]
        heights = [a["bbox"][3] for a in annos]
        df_x_maxs = [x+w for x, w in zip(df_x_mins, widths)]
        df_y_maxs = [y+h for y, h in zip(df_y_mins, heights)]
        df_class_names = [idx2cat[a["category_id"]] for a in annos]
        df_img_id = [img_id] * n_objs
        img_path = img_id2fn[img_id]
        df_img_path = [str(img_path)] * n_objs
        if with_mask:
            df_mask_path = []
            df_obj_ids = list(range(n_objs))
            for o_id in df_obj_ids:
                mask = CocoData.coco.annToMask(annos[o_id])
                # save mask always as png
                mask_path = path_masks/f"{img_path.stem}_{o_id}.png"
                Image.fromarray(mask).save(mask_path)
                df_mask_path.append(str(mask_path))
            df = pd.DataFrame({"image_id": df_img_id, "image_path": df_img_path,
                               "mask_path": df_mask_path, "object_id": df_obj_ids,
                               "x_min": df_x_mins, "y_min": df_y_mins,
                               "x_max": df_x_maxs, "y_max": df_y_maxs,
                               "class_name": df_class_names})
        else:
            df = pd.DataFrame({"image_id": df_img_id, "image_path": df_img_path,
                               "x_min": df_x_mins, "y_min": df_y_mins,
                               "x_max": df_x_maxs, "y_max": df_y_maxs,
                               "class_name": df_class_names})
        dfs.append(df)
    # DataFrame.append is deprecated, so concatenate once at the end instead
    df_train = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
    df_train.to_csv(str(path/"df_train.csv"), index=False)
    return df_train
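# df_train holds one row per annotated object. A minimal sketch of consuming
# it for object detection (the per-image grouping is illustrative):
# df = pd.read_csv(path/"df_train.csv")
# bboxes = df.groupby("image_path").apply(
#     lambda g: list(zip(g.x_min, g.y_min, g.x_max, g.y_max)))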
def _download_annotation_file(path):
    print("Downloading annotation files...")
    url = 'http://images.cocodataset.org/annotations/annotations_trainval2017.zip'
    zip_fn = path/'annotations_trainval2017.zip'
    with urlopen(url) as zipresp, open(zip_fn, 'wb') as f:  # avoid shadowing the builtin `zip`
        f.write(zipresp.read())
    with ZipFile(zip_fn) as zf:
        zf.extractall(path=str(path))
    zip_fn.unlink()
def create(cls, ds_name, cat_list, data_path=None, with_mask=False, max_images=1000, remove_crowded=True):
    """Create a new coco dataset with the categories defined in `cat_list`,
    optionally with segmentation masks. The storage path can be set via
    `data_path`; by default fastai's data path is used, like `untar_data`."""
    path = Path(URLs.path(c_key='data'))/ds_name if data_path is None else Path(data_path)/ds_name
    path_images = path/"images"
    path_masks = path/"masks"
    if path.is_dir():
        print(f"Dataset {ds_name} already exists: {path}")
        return cls.get_path_df(ds_name, data_path=data_path)
    # create folders
    print("Creating folders.")
    path.mkdir(exist_ok=False, parents=True)
    path_images.mkdir()
    if with_mask:
        path_masks.mkdir()
    # download annotation files
    annotations = 'annotations/instances_train2017.json'
    if not (path/annotations).is_file():
        cls._download_annotation_file(path)
    if not (path/annotations).is_file():
        print("Download was not successful. No annotation file found.")
        return
    cls.coco = COCO(annotation_file=str(path/annotations))
    # download images
    cls._download_images(cat_list, path_images, max_images, remove_crowded)
    # create dataframe
    df = cls._create_dataframe(path, cat_list, with_mask)
    return path, df
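# A minimal end-to-end sketch (the dataset name is hypothetical; "person" and
# "car" are valid COCO category names):
# path, df = CocoData.create("coco_person_car", ["person", "car"],
#                            with_mask=True, max_images=500)
# CocoData.ls()                       # list all downloaded datasets
# CocoData.remove("coco_person_car")  # delete images, masks and dataframe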
def tar_extract_at_filename(fname, dest):
    "Extract `fname` to `dest`/`fname.name` folder using `tarfile`"
    dest = Path(dest) / Path(fname).with_suffix("").name
    with tarfile.open(fname, "r:gz") as tar:  # close the archive after extracting
        tar.extractall(dest)
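# Example usage (a sketch with a hypothetical archive name): extracting
# "downloads/foo.tgz" unpacks the archive into the folder "data/foo".
# tar_extract_at_filename("downloads/foo.tgz", "data")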
def ls(data_path=None):
    """List all available datasets."""
    path = Path(URLs.path(c_key='data')) if data_path is None else Path(data_path)
    if path.is_dir():
        return list(path.ls())
    else:
        print(f"Path {path} does not exist.")
warnings.filterwarnings("ignore", category=UserWarning, module="torch.nn.functional")
# https://github.com/dmlc/xgboost/issues/1715
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"
# training is very, very slow
os.environ["OMP_NUM_THREADS"] = "1"

## helpful way to initially get folders
# import split_folders
# split_folders.ratio('<path>', output='<path>/split', seed=1337, ratio=(.8, .2))  # uses default values
# sys.exit()

path = Path("data/CNN/-released/split")

################################################################################
# fastai uses databunches
################################################################################
data = (
    ImageList.from_folder(path / "train")
    .split_by_rand_pct(0.1, seed=33)
    .label_from_folder()
    # .add_test_folder('..'/path/'test')
    .transform(
        get_transforms(do_flip=True, flip_vert=True),
        size=150,
        resize_method=ResizeMethod.SQUISH,
        padding_mode="zeros",
    )
    .databunch(bs=64)
    .normalize(imagenet_stats)
)
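# A minimal sketch of training on this databunch with fastai v1; the
# architecture, metric and epoch count are illustrative, not tuned.
learn = cnn_learner(data, models.resnet34, metrics=accuracy)
learn.fit_one_cycle(4)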