def svhn(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the SVHN dataset to
    the location specified on the file system.

    Parameters
    ----------
    args : argparse.Namespace
        List of arguments necessary to run this routine. In particular it's
        necessary to provide output_folder as String containing the path
        where the dataset will be downloaded.

    Returns
    -------
    None
    """
    # Use torchvision to download the dataset
    torchvision.datasets.SVHN(root=args.output_folder, split='train', download=True)
    torchvision.datasets.SVHN(root=args.output_folder, split='test', download=True)

    # Load the data into memory
    train = _loadmat(os.path.join(args.output_folder, 'train_32x32.mat'))
    train_data, train_labels = train['X'], train['y'].astype(np.int64).squeeze()
    # SVHN encodes the digit 0 as label 10; remap it back to 0
    np.place(train_labels, train_labels == 10, 0)
    # Reorder from (H, W, C, N) to (N, H, W, C)
    train_data = np.transpose(train_data, (3, 0, 1, 2))

    test = _loadmat(os.path.join(args.output_folder, 'test_32x32.mat'))
    test_data, test_labels = test['X'], test['y'].astype(np.int64).squeeze()
    np.place(test_labels, test_labels == 10, 0)
    test_data = np.transpose(test_data, (3, 0, 1, 2))

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'SVHN')
    train_folder = os.path.join(dataset_root, 'train')
    test_folder = os.path.join(dataset_root, 'test')

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(arr, labels, folder):
        for i, (img, label) in enumerate(zip(arr, labels)):
            dest = os.path.join(folder, str(label))
            make_folder_if_not_exists(dest)
            Image.fromarray(img).save(os.path.join(dest, str(i) + '.png'))

    # Write the images to the folders
    _write_data_to_folder(train_data, train_labels, train_folder)
    _write_data_to_folder(test_data, test_labels, test_folder)

    # Clean up the raw .mat files and carve a validation split off train
    os.remove(os.path.join(args.output_folder, 'train_32x32.mat'))
    os.remove(os.path.join(args.output_folder, 'test_32x32.mat'))

    split_dataset(dataset_folder=dataset_root, split=0.2, symbolic=False)
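# A minimal usage sketch for svhn() above, assuming the module-level imports
# it relies on (torchvision, numpy as np, os, PIL.Image, _loadmat bound to
# scipy.io.loadmat, make_folder_if_not_exists, split_dataset). The output
# path is a hypothetical example, and SimpleNamespace stands in for the
# argparse.Namespace the CLI would normally provide.
from types import SimpleNamespace

args = SimpleNamespace(output_folder='/tmp/datasets')
svhn(args)  # writes /tmp/datasets/SVHN/train/<label>/<i>.png and .../test/...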
def loadmat(matf, **kwargs):
    # Thin wrapper around scipy.io.loadmat with convenient defaults; any
    # keyword argument can still be overridden by the caller.
    from scipy.io import loadmat as _loadmat
    kwargs.setdefault('verify_compressed_data_integrity', True)
    kwargs.setdefault('squeeze_me', True)
    verbose = kwargs.pop('verbose', True)
    if verbose:
        print('loading', matf)
    return _loadmat(matf, **kwargs)
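# Usage sketch for the loadmat wrapper above; 'scan01.mat' is a hypothetical
# file name. The defaults apply unless explicitly overridden:
data = loadmat('scan01.mat')                   # prints 'loading scan01.mat'
quiet = loadmat('scan01.mat', verbose=False)   # suppress the print
raw = loadmat('scan01.mat', squeeze_me=False)  # keep singleton dimensions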
def load(self):
    """
    Load all Matlab files from paths.

    Returns:
        Generator of lists of head positions - (X, Y) tuples.
    """
    for path in self.gt_paths:
        yield self.getter(_loadmat(path))
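# Hedged usage sketch for load() above: it assumes the enclosing object
# carries `gt_paths` (an iterable of .mat paths) and `getter` (a callable
# that pulls the head-position array out of a loaded mat dict). Consumption
# is lazy, one file per iteration; `reader` is a hypothetical instance:
# for positions in reader.load():
#     for x, y in positions:
#         ...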
def _make_STMView_colormap(fileName, name='my_cmap'):
    if fileName.endswith('.mat'):
        matFile = _loadmat(_path + fileName)
        for key in matFile:
            # Skip the metadata entries scipy adds to every loaded .mat dict
            if key not in ['__version__', '__header__', '__globals__']:
                return _LSC.from_list(name, matFile[key])
    elif fileName.endswith('.txt'):
        txtFile = _np.loadtxt(_path + fileName)
        return _LSC.from_list(name, txtFile)
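# Usage sketch for the colormap loader above, assuming matplotlib is
# importable and reusing this module's numpy alias _np; 'STMView.mat' is a
# hypothetical file living under the module's _path directory.
import matplotlib.pyplot as plt

cmap = _make_STMView_colormap('STMView.mat', name='stm')
plt.imshow(_np.random.rand(64, 64), cmap=cmap)
plt.colorbar()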
def importPower(path, filename=''):
    '''
    Import a powers file (.mat or .csv) and return the time and power
    arrays as a (t, p) tuple.
    '''
    fullPath = path + filename
    if fullPath[-4:] == '.mat':
        rawDict = _loadmat(fullPath)
        t = rawDict['timelist'].reshape(-1)
        p = rawDict['powerlist'].reshape(-1)
    elif fullPath[-4:] == '.csv':
        raw = _np.loadtxt(fullPath, delimiter=',', skiprows=1)
        t = raw[:, 0].reshape(-1)
        p = raw[:, 1].reshape(-1)
    else:
        print('Could not identify power data type')
        return
    return t, p
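# Usage sketch for importPower() above; both file names are hypothetical.
# The function dispatches on the extension, so .mat and .csv inputs are
# interchangeable sources for the same (time, power) pair:
t, p = importPower('/data/run01/', 'power.mat')
t, p = importPower('/data/run01/', 'power.csv')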
def importPower(path, filename=""): """ import powers file """ fullPath = path + filename if fullPath[-4:] == ".mat": rawDict = _loadmat(fullPath) t = rawDict["timelist"].reshape(-1) p = rawDict["powerlist"].reshape(-1) elif fullPath[-4:] == ".csv": raw = _np.loadtxt(fullPath, delimiter=",", skiprows=1) t = raw[:, 0].reshape(-1) p = raw[:, 1].reshape(-1) else: print("Could not identify power data type") return return t, p
def __allprofs_init(self, filepath, floatID, verbose):
    # Loaded data is a dictionary.
    data = _loadmat(filepath, squeeze_me=True)
    isFloat = data.pop('flid') == floatID
    del data['ar']
    self.hpid = data.pop('hpid')[isFloat]

    if self.hpid.size == 0:
        raise RuntimeError('There appear to be no profiles for float {} in'
                           ' {}'.format(floatID, filepath))

    # Load the data!
    for key in data.keys():
        d = np.ndim(data[key])
        if d < 1 or d > 2 or '__' in key:
            if verbose:
                print("* Skipping: {}.".format(key))
            continue
        elif d == 1:
            setattr(self, key, data[key][isFloat])
        elif d == 2:
            setattr(self, key, data[key][:, isFloat])
        else:
            if verbose:
                print("* Don't know what to do with {}, skipping".format(key))
            continue

        if verbose:
            print(" Loaded: {}.".format(key))

    print("All numerical data appears to have been loaded successfully.\n")
    print("Creating array of half profiles.\n")

    self.Profiles = np.array([Profile(self, h) for h in self.hpid])
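# Hedged note on the allprofs##.mat layout __allprofs_init expects, inferred
# from the accesses above rather than from documentation: 'flid' and 'hpid'
# are 1-D per-half-profile arrays used to select this float's columns, 'ar'
# is discarded, and every other numeric variable is either 1-D (per profile)
# or 2-D (levels x profiles).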
def miml(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the Multi-Instance
    Multi-Label Image Dataset on the file system. Dataset available at:
    http://lamda.nju.edu.cn/data_MIMLimage.ashx

    Parameters
    ----------
    args : argparse.Namespace
        List of arguments necessary to run this routine. In particular it's
        necessary to provide output_folder as String containing the path
        where the dataset will be downloaded.

    Returns
    -------
    None
    """
    # Download the files
    url = 'http://lamda.nju.edu.cn/files/miml-image-data.rar'
    if not os.path.exists(os.path.join(args.output_folder, 'miml-image-data.rar')):
        print('Downloading file!')
        filename = wget.download(url, out=args.output_folder)
    else:
        print('File already downloaded!')
        filename = os.path.join(args.output_folder, 'miml-image-data.rar')

    # Extract the files
    path_to_rar = filename
    path_to_output = os.path.join(args.output_folder, 'tmp_miml')
    rarfile.RarFile(path_to_rar).extractall(path_to_output)
    path_to_rar = os.path.join(path_to_output, 'original.rar')
    rarfile.RarFile(path_to_rar).extractall(path_to_output)
    path_to_rar = os.path.join(path_to_output, 'processed.rar')
    rarfile.RarFile(path_to_rar).extractall(path_to_output)
    print('Extracted files...')

    # Load the mat file
    mat = _loadmat(os.path.join(path_to_output, 'miml data.mat'))
    targets = mat['targets'].T
    classes = [item[0][0] for item in mat['class_name']]
    # Add filename at 0-index to correctly format the CSV headers
    classes.insert(0, 'filename')

    # Get list of all image files in the folder
    images = [item for item in
              _get_all_files_in_folders_and_subfolders(path_to_output)
              if item.endswith('jpg')]
    images = sorted(images, key=lambda e: int(os.path.basename(e).split('.')[0]))

    # Make splits
    train_data, test_data, train_labels, test_labels = _train_test_split(
        images, targets, test_size=0.2, random_state=42)
    train_data, val_data, train_labels, val_labels = _train_test_split(
        train_data, train_labels, test_size=0.2, random_state=42)

    # print('Size of splits\ntrain:{}\nval:{}\ntest:{}'.format(len(train_data),
    #                                                          len(val_data),
    #                                                          len(test_data)))

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'MIML')
    train_folder = os.path.join(dataset_root, 'train')
    val_folder = os.path.join(dataset_root, 'val')
    test_folder = os.path.join(dataset_root, 'test')

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(val_folder)
    make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(data, labels, folder, classes):
        dest = os.path.join(folder, 'images')
        make_folder_if_not_exists(dest)
        for image, label in zip(data, labels):
            shutil.copy(image, dest)
        rows = np.column_stack(
            ([os.path.join('images', os.path.basename(item)) for item in data],
             labels))
        rows = sorted(rows, key=lambda e: int(e[0].split('/')[1].split('.')[0]))
        output_csv = pd.DataFrame(rows)
        output_csv.to_csv(os.path.join(folder, 'labels.csv'),
                          header=classes, index=False)
        return

    # Write the images to the correct folders
    print('Writing the data to the filesystem')
    _write_data_to_folder(train_data, train_labels, train_folder, classes)
    _write_data_to_folder(val_data, val_labels, val_folder, classes)
    _write_data_to_folder(test_data, test_labels, test_folder, classes)

    os.remove(filename)
    shutil.rmtree(path_to_output)
    print('All done!')
    return
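# Minimal usage sketch mirroring svhn() above; assumes wget, rarfile,
# pandas as pd, shutil, _train_test_split bound to sklearn's
# train_test_split, and the module's helper functions. The path is a
# hypothetical example.
from types import SimpleNamespace

args = SimpleNamespace(output_folder='/tmp/datasets')
miml(args)  # writes /tmp/datasets/MIML/{train,val,test}/images + labels.csv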
def __dir_init(self, dirpath, floatID, verbose):
    # This block searches the directory tree for all the relevant files and
    # puts them in a dictionary organised by hpid number.
    # ctd, efp, gps, mis, vel, vit
    filesdict = {}
    mis_file = None
    gps_file = None
    single_mis_file = False
    single_gps_file = False

    searchstr = '*{}*vel.mat'.format(floatID)
    for root, dirnames, filenames in _os.walk(dirpath):
        for filename in _fnmatch.filter(filenames, searchstr):
            nameparts = filename.split('-')
            try:
                hpid = int(nameparts[2])
            except ValueError:
                if nameparts[2] == 'mis.mat':
                    single_mis_file = True
                    mis_file = _os.path.join(root, filename)
                    continue
                elif nameparts[2] == 'gps.mat':
                    single_gps_file = True
                    gps_file = _os.path.join(root, filename)
                    continue

            filetype = nameparts[3].split('.')[0]
            fullname = _os.path.join(root, filename)
            if hpid in filesdict:
                filesdict[hpid][filetype] = fullname
            else:
                filesdict[hpid] = {filetype: fullname}

    # list() is required here: in Python 3, np.array(dict.keys()) would
    # produce a 0-d object array rather than an array of hpid numbers.
    self.hpid = np.array(list(filesdict.keys()))
    Nprofiles = self.hpid.size

    # Work out size of arrays required.
    pad_ctd = 0
    pad_ef = 0
    for hp in self.hpid:
        velfile = filesdict[hp]['vel']
        veldata = _loadmat(velfile, squeeze_me=True,
                           variable_names=['ctd_mlt', 'efp_mlt'])
        pad_ctd = max(pad_ctd, np.asarray(veldata['ctd_mlt']).size)
        pad_ef = max(pad_ef, np.asarray(veldata['efp_mlt']).size)

    # CTD attributes.
    ctd_keys = ['Pctd', 'T', 'S', 'ctd_mlt', 'pc_ctd']
    ctd_attrs = ['P', 'T', 'S', 'UTC', 'ppos']
    # ef attributes.
    ef_keys = ['U1', 'U2', 'V1', 'V2', 'Pef', 'efp_mlt']
    ef_attrs = ['U1', 'U2', 'V1', 'V2', 'Pef', 'UTCef']
    # Singleton attributes.
    s_keys = ['lon', 'lat', 'LON', 'LAT', 'MLT_GPS']
    s_attrs = ['lon', 'lat', 'lon_gps', 'lat_gps', 'utc_gps']
    names = ctd_keys + ef_keys + s_keys

    # Initialise arrays.
    for ctd_attr in ctd_attrs:
        setattr(self, ctd_attr, np.nan * np.zeros((pad_ctd, Nprofiles)))
    for ef_attr in ef_attrs:
        setattr(self, ef_attr, np.nan * np.zeros((pad_ef, Nprofiles)))
    for s_attr in s_attrs:
        setattr(self, s_attr, np.nan * np.zeros(Nprofiles))

    # Load vel data.
    for i, hp in enumerate(self.hpid):
        velfile = filesdict[hp]['vel']
        veldata = _loadmat(velfile, squeeze_me=True, variable_names=names)
        Nctd = np.asarray(veldata['ctd_mlt']).size
        Nef = np.asarray(veldata['efp_mlt']).size

        for ctd_key, ctd_attr in zip(ctd_keys, ctd_attrs):
            if Nctd < 2:
                continue
            getattr(self, ctd_attr)[:Nctd, i] = veldata[ctd_key]

        for ef_key, ef_attr in zip(ef_keys, ef_attrs):
            if Nef < 2:
                continue
            getattr(self, ef_attr)[:Nef, i] = veldata[ef_key]

        for s_key, s_attr in zip(s_keys, s_attrs):
            getattr(self, s_attr)[i] = veldata[s_key]

    print("All numerical data appears to have been loaded successfully.\n")
    print("Creating array of half profiles.\n")

    self.Profiles = np.array([Profile(self, h) for h in self.hpid])
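# Hedged note on the file naming __dir_init assumes, inferred from the
# split('-') indexing above rather than from documentation: per-profile
# files look like
#   <prefix>-<floatID>-<hpid>-<filetype>.mat   (filetype: ctd, efp, vel, ...)
# while the single mission and GPS files look like
#   <prefix>-<floatID>-mis.mat and <prefix>-<floatID>-gps.mat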
def what_floats_are_in_here(fname):
    """Finds all unique float ID numbers from a given allprofs##.mat file."""
    fs = _loadmat(fname, squeeze_me=True, variable_names='flid')['flid']
    return np.unique(fs[~np.isnan(fs)])
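# Usage sketch; 'allprofs11.mat' is a hypothetical file name.
float_ids = what_floats_are_in_here('allprofs11.mat')
print(float_ids)  # unique, NaN-free float IDs present in the file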
def _make_STMView_colormap(fileName):
    matFile = _loadmat(_path + fileName)
    for key in matFile:
        if key not in ['__version__', '__header__', '__globals__']:
            return _ListedColormap(matFile[key])
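# Note the contrast with the earlier variant: _ListedColormap treats the
# loaded array as a discrete list of colours, while _LSC.from_list builds a
# LinearSegmentedColormap that interpolates between them. Usage matches the
# sketch after the first variant above.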