def download(self):
    """Download and preprocess the person-activity data.

    Each raw text line is ``record_id,tag_id,time,date,x,y,z,label``.  Rows are
    grouped per record, quantized in time, accumulated into per-timestep
    value/mask/label tensors, split into windows of ``self.max_seq_length``
    (stride = half a window), and finally saved to ``processed_folder/data.pt``.
    """
    if self._check_exists():
        return
    # NOTE(review): preprocessing tensors are placed on GPU when available;
    # the saved data therefore inherits this device — confirm downstream loaders expect that.
    self.device = torch.device(
        "cuda:0" if torch.cuda.is_available() else "cpu")
    os.makedirs(self.raw_folder, exist_ok=True)
    os.makedirs(self.processed_folder, exist_ok=True)

    def save_record(records, record_id, tt, vals, mask, labels):
        # Stack the per-timestep lists into tensors for one record.
        tt = torch.tensor(tt).to(self.device)
        vals = torch.stack(vals)
        mask = torch.stack(mask)
        labels = torch.stack(labels)
        # flatten the measurements for different tags
        vals = vals.reshape(vals.size(0), -1)
        mask = mask.reshape(mask.size(0), -1)
        assert (len(tt) == vals.size(0))
        assert (mask.size(0) == vals.size(0))
        assert (labels.size(0) == vals.size(0))
        #records.append((record_id, tt, vals, mask, labels))
        seq_length = len(tt)
        # Split the long time series into smaller overlapping windows
        # (stride is half a window).  NOTE(review): a record shorter than
        # max_seq_length produces no window at all and is silently dropped.
        offset = 0
        slide = self.max_seq_length // 2
        while (offset + self.max_seq_length < seq_length):
            idx = range(offset, offset + self.max_seq_length)
            # Re-base each window's time axis at zero.
            first_tp = tt[idx][0]
            records.append((record_id, tt[idx] - first_tp, vals[idx],
                            mask[idx], labels[idx]))
            offset += slide

    for url in self.urls:
        filename = url.rpartition('/')[2]
        download_url(url, self.raw_folder, filename, None)

        print('Processing {}...'.format(filename))

        dirname = os.path.join(self.raw_folder)
        records = []
        first_tp = None

        for txtfile in os.listdir(dirname):
            with open(os.path.join(dirname, txtfile)) as f:
                lines = f.readlines()
                prev_time = -1
                tt = []

                record_id = None
                for l in lines:
                    cur_record_id, tag_id, time, date, val1, val2, val3, label = l.strip(
                    ).split(',')
                    value_vec = torch.Tensor(
                        (float(val1), float(val2),
                         float(val3))).to(self.device)
                    time = float(time)

                    # A new record id starts a fresh accumulation; flush the
                    # previous record first.
                    if cur_record_id != record_id:
                        if record_id is not None:
                            save_record(records, record_id, tt, vals, mask,
                                        labels)
                        tt, vals, mask, nobs, labels = [], [], [], [], []
                        record_id = cur_record_id

                        tt = [torch.zeros(1).to(self.device)]
                        vals = [
                            torch.zeros(len(self.tag_ids),
                                        3).to(self.device)
                        ]
                        mask = [
                            torch.zeros(len(self.tag_ids),
                                        3).to(self.device)
                        ]
                        nobs = [torch.zeros(len(self.tag_ids)).to(self.device)]
                        labels = [
                            torch.zeros(len(self.label_names)).to(self.device)
                        ]

                        first_tp = time
                        time = round((time - first_tp) / 10**5)
                        prev_time = time
                    else:
                        # for speed -- we actually don't need to quantize it in Latent ODE
                        time = round(
                            (time - first_tp) / 10**5
                        )  # quantizing by 100 ms. 10,000 is one millisecond, 10,000,000 is one second
                        # A new quantized timestamp opens a new (empty)
                        # accumulation slot for values/mask/counts/labels.
                        if time != prev_time:
                            tt.append(time)
                            vals.append(
                                torch.zeros(len(self.tag_ids),
                                            3).to(self.device))
                            mask.append(
                                torch.zeros(len(self.tag_ids),
                                            3).to(self.device))
                            nobs.append(
                                torch.zeros(len(self.tag_ids)).to(self.device))
                            labels.append(
                                torch.zeros(len(self.label_names)).to(
                                    self.device))
                            prev_time = time

                    if tag_id in self.tag_ids:
                        n_observations = nobs[-1][self.tag_dict[tag_id]]
                        # Multiple observations of the same tag at one
                        # timestamp: either running-average them or overwrite.
                        if (self.reduce == 'average') and (n_observations > 0):
                            prev_val = vals[-1][self.tag_dict[tag_id]]
                            new_val = (prev_val * n_observations +
                                       value_vec) / (n_observations + 1)
                            vals[-1][self.tag_dict[tag_id]] = new_val
                        else:
                            vals[-1][self.tag_dict[tag_id]] = value_vec
                        mask[-1][self.tag_dict[tag_id]] = 1
                        nobs[-1][self.tag_dict[tag_id]] += 1

                        # Activity labels are one-hot per timestep; only the
                        # first label seen for a slot is kept.
                        if label in self.label_names:
                            if torch.sum(
                                    labels[-1][self.label_dict[label]]) == 0:
                                labels[-1][self.label_dict[label]] = 1
                    else:
                        assert tag_id == 'RecordID', 'Read unexpected tag id {}'.format(
                            tag_id)

                # Flush the last record of this file.
                save_record(records, record_id, tt, vals, mask, labels)

    torch.save(records, os.path.join(self.processed_folder, 'data.pt'))

    print('Done!')
def __init__(self, root, image_set='train', mode='segmentation',
             download=False, transforms=None, mean=[0.485, 0.456, 0.406],
             std=[0.229, 0.224, 0.225], size_img=(520, 520),
             size_crop=(480, 480), scale_factor=(0.5, 1.2), p=0.5,
             p_rotate=0.25, rotate=False, scale=True):
    """Initialize the SBD dataset wrapper.

    Args:
        root: dataset root directory.
        image_set: one of 'train', 'val', 'train_noval'.
        mode: 'segmentation' or 'boundaries' (selects the target loader).
        download: if True, fetch and unpack the benchmark archive plus the
            VOC split file.
        transforms: joint image/target transforms.
        mean/std, size_img, size_crop, scale_factor, p, p_rotate, rotate,
            scale: augmentation/normalization parameters stored on the
            instance for later use.

    Raises:
        RuntimeError: if scipy is missing or the dataset directory is absent.
    """
    # NOTE(review): `mean` and `std` use mutable (list) default arguments —
    # harmless while they are never mutated, but worth confirming.
    try:
        from scipy.io import loadmat
        # loadmat is stored so .mat masks can be read lazily later.
        self._loadmat = loadmat
    except ImportError:
        raise RuntimeError(
            "Scipy is not found. This dataset needs to have scipy installed: "
            "pip install scipy")

    super(SBDataset, self).__init__(root, transforms)

    ## Transform parameters (kept verbatim for the augmentation pipeline).
    self.mean = mean
    self.std = std
    self.size_img = size_img
    self.size_crop = size_crop
    self.scale_factor = scale_factor
    self.p = p
    self.p_rotate = p_rotate
    self.rotate = rotate
    self.scale = scale
    self.train = image_set == 'train' or image_set == 'train_noval'
    ##

    self.image_set = verify_str_arg(image_set, "image_set",
                                    ("train", "val", "train_noval"))
    self.mode = verify_str_arg(mode, "mode", ("segmentation", "boundaries"))
    self.num_classes = 20

    sbd_root = self.root
    image_dir = os.path.join(sbd_root, 'img')
    mask_dir = os.path.join(sbd_root, 'cls')

    if download:
        download_extract(self.url, self.root, self.filename, self.md5)
        # Move the extracted payload up into the dataset root.
        extracted_ds_root = os.path.join(self.root, "benchmark_RELEASE",
                                         "dataset")
        for f in ["cls", "img", "inst", "train.txt", "val.txt"]:
            old_path = os.path.join(extracted_ds_root, f)
            shutil.move(old_path, sbd_root)
        # The 'train_noval' split file comes from a separate URL.
        download_url(self.voc_train_url, sbd_root, self.voc_split_filename,
                     self.voc_split_md5)

    if not os.path.isdir(sbd_root):
        raise RuntimeError('Dataset not found or corrupted.' +
                           ' You can use download=True to download it')

    # One file name per line in `<image_set>.txt`; build parallel
    # image (.jpg) and mask (.mat) path lists.
    split_f = os.path.join(sbd_root, image_set.rstrip('\n') + '.txt')
    with open(os.path.join(split_f), "r") as f:
        file_names = [x.strip() for x in f.readlines()]

    self.images = [os.path.join(image_dir, x + ".jpg") for x in file_names]
    self.masks = [os.path.join(mask_dir, x + ".mat") for x in file_names]
    assert (len(self.images) == len(self.masks))

    # Bind the target loader once so __getitem__ does not branch.
    self._get_target = self._get_segmentation_target \
        if self.mode == "segmentation" else self._get_boundaries_target
def download(self):
    """Fetch the numeric German credit data file into ``self.base_dir``.

    ``download_url`` keeps its default filename (the last URL component)
    and skips the transfer when the file already exists.
    """
    # exist_ok=True replaces the original exists()+makedirs() pair, which
    # was racy (TOCTOU) and would crash if the dir appeared in between.
    os.makedirs(self.base_dir, exist_ok=True)
    download_url(
        "https://raw.githubusercontent.com/human-analysis/MaxEnt-ARL/master/data/german/german.data-numeric",
        self.base_dir)
def download(self):
    """Download every file listed in ``self.relevant_files``.

    ``self.relevant_files`` maps target filename -> source URL; each file
    is stored under ``self.base_dir``.
    """
    # exist_ok=True replaces the original exists()+makedirs() pair, which
    # was racy (TOCTOU) and would crash if the dir appeared in between.
    os.makedirs(self.base_dir, exist_ok=True)
    for fname, url in self.relevant_files.items():
        download_url(url, self.base_dir, fname)
# - video: # - H-264 # - MPEG-4 AVC (part 10) (avc1) # - fps: 29.97 # - audio: # - MPEG AAC audio (mp4a) # - sample rate: 48K Hz # import torch import torchvision from torchvision.datasets.utils import download_url # Download the sample video download_url( "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", ".", "WUzgd7C1pWA.mp4") video_path = "./WUzgd7C1pWA.mp4" ###################################### # Streams are defined in a similar fashion as torch devices. We encode them as strings in a form # of ``stream_type:stream_id`` where ``stream_type`` is a string and ``stream_id`` a long int. # The constructor accepts passing a ``stream_type`` only, in which case the stream is auto-discovered. # Firstly, let's get the metadata for our particular video: stream = "video" video = torchvision.io.VideoReader(video_path, stream) video.get_metadata() ###################################### # Here we can see that video has two streams - a video and an audio stream.
def download_dataset(self):
    """Fetch the zip archive (verifying its md5) and unpack it into ``self.root``."""
    download_url(self.url, self.root, filename=self.filename, md5=self.md5)
    archive_path = os.path.join(self.root, self.filename)
    with zipfile.ZipFile(archive_path, 'r') as archive:
        # extract_progress wraps the member list to show extraction progress.
        archive.extractall(self.root, members=c_f.extract_progress(archive))
def _download(self):
    """Download and unpack the dataset archive, then fetch the attributes file."""
    download_and_extract_archive(self.dataset_url, self.root)
    attributes_dest = os.path.join(self.root, self.dataset_dir_name)
    download_url(self.attributes_url, attributes_dest)
def download(self):
    """Download the SUN attributes and image archives, then reorganize images.

    Attribute and image tarballs are fetched, extracted into ``self.root``
    and removed.  The extracted image tree (scene category letter dirs,
    optionally with nested sub-scene dirs) is flattened into
    ``self.image_dir`` with one folder per (sub)scene; a few singleton
    categories are deleted at the end.
    """
    import tarfile
    import shutil
    if self._check_integrity():
        return

    # download attributes
    attributes_filename = os.path.basename(self.attributes_url)
    download_url(self.attributes_url,
                 self.root,
                 filename=attributes_filename)
    attributes_tgz_filename = os.path.join(self.root, attributes_filename)
    with tarfile.open(attributes_tgz_filename, 'r') as f:
        f.extractall(self.root)
    # Remove the archive once extracted.
    if os.path.isfile(attributes_tgz_filename):
        os.remove(attributes_tgz_filename)
    # The archive extracts to a dir named after the file stem; rename it
    # to the configured attribute directory.
    attributes_original_dir = os.path.join(
        self.root, attributes_filename.split('.')[0])
    attributes_final_dir = os.path.join(self.root, self.attribute_dir)
    os.rename(attributes_original_dir, attributes_final_dir)

    # download images
    images_filename = os.path.basename(self.images_url)
    download_url(self.images_url, self.root, filename=images_filename)
    images_tgz_filename = os.path.join(self.root, images_filename)
    with tarfile.open(images_tgz_filename, 'r') as f:
        f.extractall(self.root)
    if os.path.isfile(images_tgz_filename):
        os.remove(images_tgz_filename)

    images_original_dir = os.path.join(self.root, 'images')
    images_final_dir = os.path.join(self.root, self.image_dir)
    for dir_name in os.listdir(images_original_dir):
        # Skip non-scene buckets.
        if dir_name in ['misc', 'outliers']:
            continue
        cur_dir = os.path.join(images_original_dir, dir_name)
        for child_dir_name in os.listdir(cur_dir):
            cur_child_dir = os.path.join(cur_dir, child_dir_name)
            # Inspect the first entry to decide whether this level holds
            # sub-scene directories or images directly.
            first_child_name = os.listdir(cur_child_dir)[0]
            if os.path.isdir(os.path.join(cur_child_dir,
                                          first_child_name)):
                # cur_dir contains dir: flatten 'scene/sub' to 'scene_sub'.
                for child_name in os.listdir(cur_child_dir):
                    source_dir = os.path.join(cur_child_dir, child_name)
                    # NOTE(review): duplicated `target_dir = target_dir =`
                    # assignment — harmless but sloppy; left as-is here.
                    target_dir = target_dir = os.path.join(
                        images_final_dir,
                        '{}_{}'.format(child_dir_name, child_name))
                    if not os.path.exists(target_dir):
                        shutil.copytree(source_dir, target_dir)
            else:
                # cur_child_dir only contains images
                target_dir = os.path.join(images_final_dir, child_dir_name)
                if not os.path.exists(target_dir):
                    shutil.copytree(cur_child_dir, target_dir)
    if os.path.exists(images_original_dir):
        shutil.rmtree(images_original_dir)

    # delete dirs that have only 1 image
    for dir_name in [
            'barbershop', 'distillery', 'ice_cream_parlor',
            'police_station', 'roller_skating_rink_indoor',
            'volleyball_court_indoor'
    ]:
        delete_dir = os.path.join(images_final_dir, dir_name)
        if os.path.exists(delete_dir):
            shutil.rmtree(delete_dir)
def download(self):
    """Download the n-MNIST data if it doesn't exist in processed_folder already.

    Fetches the .gz tarballs, extracts the contained .mat file from each,
    concatenates all .mat splits, and saves (images, labels) tuples for the
    train and test splits as torch files.
    """
    from six.moves import urllib
    import tarfile
    if self._check_exists():
        return

    # download files
    try:
        os.makedirs(os.path.join(self.root, self.raw_folder))
        os.makedirs(os.path.join(self.root, self.processed_folder))
    except OSError as e:
        # Pre-exist­ing directories are fine; re-raise anything else.
        if e.errno == errno.EEXIST:
            pass
        else:
            raise

    if not self._check_mats_exists():
        for _, url in self.urls.items():
            filename = url.rpartition('/')[2]
            file_path = os.path.join(self.root, self.raw_folder, filename)
            if not self._check_gzips_exists():
                download_url(url,
                             root=os.path.join(self.root, self.raw_folder),
                             filename=filename,
                             md5=None)
            # Pull the single .mat member out of the tarball, then drop
            # the archive.
            with open(file_path.replace('.gz', '.mat'), 'wb') as out_f:
                tar = tarfile.open(file_path, 'r:gz')
                zip_f = tar.extractfile(
                    os.path.basename(file_path.replace('.gz', '.mat')))
                out_f.write(zip_f.read())
            os.unlink(file_path)

    # process and save as torch files
    print('Processing...')

    def read_images(mat_data, split):
        # Rows are flattened square images; recover the side length.
        # NOTE(review): np.uint8(np.sqrt(...)) silently truncates for
        # images larger than 255x255 — fine for MNIST-sized data.
        length = mat_data[split].shape[0]
        num_rows = np.uint8(np.sqrt(mat_data[split].shape[1]))
        num_cols = num_rows
        return torch.from_numpy(mat_data[split]).view(
            length, num_rows, num_cols)

    def read_labels(mat_data, split):
        # Labels are stored one-hot; convert to class indices.
        length = mat_data[split].shape[0]
        labels = np.asarray(
            [np.where(r == 1)[0][0] for r in mat_data[split]])
        return torch.from_numpy(labels).view(length).long()

    data = sio.loadmat(
        os.path.join(self.root, self.raw_folder, self.mat_files[0]))
    # Concatenate any additional .mat files onto the first one, per split.
    if len(self.mat_files) > 1:
        for mat_file in self.mat_files[1:]:
            mat_data = sio.loadmat(
                os.path.join(self.root, self.raw_folder, mat_file))
            data['train_x'] = np.concatenate(
                (data['train_x'], mat_data['train_x']), axis=0)
            data['train_y'] = np.concatenate(
                (data['train_y'], mat_data['train_y']), axis=0)
            data['test_x'] = np.concatenate(
                (data['test_x'], mat_data['test_x']), axis=0)
            data['test_y'] = np.concatenate(
                (data['test_y'], mat_data['test_y']), axis=0)

    training_set = (read_images(data, 'train_x'),
                    read_labels(data, 'train_y'))
    test_set = (read_images(data, 'test_x'), read_labels(data, 'test_y'))

    with open(
            os.path.join(self.root, self.processed_folder,
                         self.training_file), 'wb') as f:
        torch.save(training_set, f)
    with open(
            os.path.join(self.root, self.processed_folder, self.test_file),
            'wb') as f:
        torch.save(test_set, f)

    print('Done!')
def download(self):
    """Fetch the model file unless it is already on disk."""
    if not self.model_path.exists():
        download_url(self.url, self.model_path.parent, self.filename)
def download(self, root: str):
    """Download the Frey Face data into *root* unless it is already present."""
    target = join(root, FreyFace.FILENAME)
    if exists(target):
        return
    makedirs(root, exist_ok=True)
    download_url(FreyFace.URL, root, FreyFace.FILENAME, FreyFace.MD5)
def download(self, url, filename, file_md5):
    """Thin wrapper: fetch *url* into ``self.root`` as *filename*, checking *file_md5*."""
    download_url(url, self.root, filename=filename, md5=file_md5)
# wget https://www.usitc.gov/sites/default/files/tata/hts/hts_2021_preliminary_revision_2_json.json # ./parse_hs_json.py --input_hs_json hts_2021_preliminary_revision_2_json.json --output_csv hts_2021_preliminary_revision_2_json.csv import os import sys #from argparse import ArgumentParser import pandas as pd from torchvision.datasets.utils import download_url import re import json config = json.loads(re.sub(r'#.*?\n', '', open('config.json', 'r').read())) download_url(config['hts_url'], '.') #parser = ArgumentParser(add_help=True) #parser.add_argument('--input_hs_json', type=str, required=True, help='Input Json HS file') #parser.add_argument('--output_csv', type=str, required=True, help='Output CSV file') class obj: # constructor def __init__(self, dict1): self.__dict__.update(dict1) self.child = [] self.id = None def add_child(self, o):
def download(self):
    """Download the dataset archive into ``self.root``, skipping verified files."""
    if self._check_integrity():
        print('Files already downloaded and verified')
    else:
        download_url(self.url, self.root,
                     filename=self.filename, md5=self.tgz_md5)
def download(self):
    """Download the SmallNORB data if it doesn't exist in processed_folder already.

    Fetches and gunzips each raw file (unless already verified), then reads
    the stereo image/info/label files for both splits and saves each piece
    as a separate torch file via ``self._save``.
    """
    import gzip

    if self._check_exists():
        return

    # check if already extracted and verified
    if self._check_integrity():
        print('Files already downloaded and verified')
    else:
        # download and extract
        for file_dict in self._flat_data_files():
            url = self.dataset_root + file_dict["name"] + '.gz'
            filename = file_dict["name"]
            gz_filename = filename + '.gz'
            md5 = file_dict["md5_gz"]
            fpath = os.path.join(self.root, self.raw_folder, filename)
            gz_fpath = fpath + '.gz'

            # download if compressed file not exists and verified
            download_url(url, os.path.join(self.root, self.raw_folder),
                         gz_filename, md5)

            print('# Extracting data {}\n'.format(filename))

            # Decompress next to the archive, then drop the .gz file.
            with open(fpath, 'wb') as out_f, \
                    gzip.GzipFile(gz_fpath) as zip_f:
                out_f.write(zip_f.read())

            os.unlink(gz_fpath)

    # process and save as torch files
    print('Processing...')

    # create processed folder
    try:
        os.makedirs(os.path.join(self.root, self.processed_folder))
    except OSError as e:
        # An already-existing folder is fine; anything else is fatal.
        if e.errno == errno.EEXIST:
            pass
        else:
            raise

    # read train files (stereo pairs: left and right camera images)
    left_train_img, right_train_img = self._read_image_file(
        self.data_files["train"]["dat"]["name"])
    train_info = self._read_info_file(
        self.data_files["train"]["info"]["name"])
    train_label = self._read_label_file(
        self.data_files["train"]["cat"]["name"])

    # read test files
    left_test_img, right_test_img = self._read_image_file(
        self.data_files["test"]["dat"]["name"])
    test_info = self._read_info_file(
        self.data_files["test"]["info"]["name"])
    test_label = self._read_label_file(
        self.data_files["test"]["cat"]["name"])

    # save training files
    self._save(left_train_img, "{}_left".format(self.train_image_file))
    self._save(right_train_img, "{}_right".format(self.train_image_file))
    self._save(train_label, self.train_label_file)
    self._save(train_info, self.train_info_file)

    # save test files
    self._save(left_test_img, "{}_left".format(self.test_image_file))
    self._save(right_test_img, "{}_right".format(self.test_image_file))
    self._save(test_label, self.test_label_file)
    self._save(test_info, self.test_info_file)

    print('Done!')
import os import torch import torchvision from torchvision.datasets.utils import download_url import zipfile train_path = 'train' dl_file = 'dl2018-image-proj.zip' dl_url = 'https://users.aalto.fi/mvsjober/misc/' zip_path = os.path.join(train_path, dl_file) if not os.path.isfile(zip_path): download_url(dl_url + dl_file, root=train_path, filename=dl_file, md5=None) with zipfile.ZipFile(zip_path) as zip_f: zip_f.extractall(train_path) #os.unlink(zip_path) import pandas as pd import glob from sklearn.preprocessing import OneHotEncoder import platform # Create an array with tuple(img, label) pairs from the annotations txt files. files = glob.glob("./train/annotations/*") labels = [] for name in files: try: with open(name) as f: if platform.system == "Windows":
def download(self):
    """Download the MNIST data if it doesn't exist in processed_folder already.

    After extracting the raw IDX files, the training data is split into a
    training set and a 10k-sample validation set.  The validation indices are
    drawn randomly once and cached in ``valid_idx.npy`` so subsequent runs
    reproduce the same split.
    """
    #from six.moves import urllib
    import gzip

    if self._check_exists():
        return

    # download files
    try:
        os.makedirs(os.path.join(self.root, self.raw_folder))
        os.makedirs(os.path.join(self.root, self.processed_folder))
    except OSError as e:
        # An already-existing folder is fine; anything else is fatal.
        if e.errno == errno.EEXIST:
            pass
        else:
            raise

    for url in self.urls:
        filename = url.rpartition('/')[2]
        file_path = os.path.join(self.root, self.raw_folder, filename)
        download_url(url,
                     root=os.path.join(self.root, self.raw_folder),
                     filename=filename,
                     md5=None)
        # Gunzip in place, then delete the archive.
        with open(file_path.replace('.gz', ''), 'wb') as out_f, \
                gzip.GzipFile(file_path) as zip_f:
            out_f.write(zip_f.read())
        os.unlink(file_path)

    # process and save as torch files
    print('Processing...')

    training_valid_set = (
        read_image_file(os.path.join(self.root, self.raw_folder,
                                     'train-images-idx3-ubyte')),
        read_label_file(os.path.join(self.root, self.raw_folder,
                                     'train-labels-idx1-ubyte'))
    )

    # Define the indices
    indices = list(range(len(training_valid_set[0])))
    # Reuse a cached validation split if present; otherwise sample one
    # (no fixed seed — the cache file is what makes the split stable).
    if os.path.exists(os.path.join(self.root, self.processed_folder,
                                   'valid_idx.npy')):
        valid_idx = np.load(os.path.join(self.root, self.processed_folder,
                                         'valid_idx.npy'))
    else:
        valid_idx = np.random.choice(indices, size=10000, replace=False)
        np.save(os.path.join(self.root, self.processed_folder,
                             'valid_idx.npy'), valid_idx)

    train_idx = list(set(indices) - set(valid_idx))
    training_set = (
        training_valid_set[0][train_idx],
        training_valid_set[1][train_idx]
    )
    valid_set = (
        training_valid_set[0][valid_idx],
        training_valid_set[1][valid_idx]
    )
    test_set = (
        read_image_file(os.path.join(self.root, self.raw_folder,
                                     't10k-images-idx3-ubyte')),
        read_label_file(os.path.join(self.root, self.raw_folder,
                                     't10k-labels-idx1-ubyte'))
    )

    with open(os.path.join(self.root, self.processed_folder,
                           self.training_file), 'wb') as f:
        torch.save(training_set, f)
    with open(os.path.join(self.root, self.processed_folder,
                           self.valid_file), 'wb') as f:
        torch.save(valid_set, f)
    with open(os.path.join(self.root, self.processed_folder,
                           self.test_file), 'wb') as f:
        torch.save(test_set, f)

    print('Done!')
def _download(self):
    """Download every file in ``self.file_list`` and verify its md5.

    Raises:
        RuntimeError: if any downloaded file is missing or fails verification.
    """
    for url, filename, md5 in self.file_list.values():
        download_url(url, root=self.root, filename=filename)
        local_path = os.path.join(self.root, filename)
        if not check_integrity(local_path, md5):
            raise RuntimeError("File not found or corrupted.")
def __init__(self, root, download=False, image_set='train', transforms=None):
    """
    Camvid dataset:https://course.fast.ai/datasets
    or simply wget https://s3.amazonaws.com/fast-ai-imagelocal/camvid.tgz

    On first use the tarball is extracted, the 32 original label ids are
    grouped into 12 classes (labels rewritten in place with cv2), and the
    selected split is packed into an lmdb database which is then opened
    read-only.

    Args:
        root: path to dataset folder
        download: if True, fetch the camvid tarball (md5-verified)
        image_set: train dataset or validation dataset, 'train', or 'val'
        transforms: data augmentations
    """
    self._image_set = image_set
    self.transforms = transforms
    self._md5 = '2e796d442fe723192014ace89a1515b1'
    self._url = 'https://s3.amazonaws.com/fast-ai-imagelocal/camvid.tgz'
    self._filename = 'camvid.tgz'
    self._root = root

    if download:
        download_url(self._url, self._root, self._filename, md5=self._md5)

    # Map each of the 32 raw CamVid label names to one of 12 grouped classes.
    self._label_IDs = {
        # Sky
        'Sky': 'Sky',

        # Building
        'Bridge': 'Building',
        'Building': 'Building',
        'Wall': 'Building',
        'Tunnel': 'Building',
        'Archway': 'Building',

        # Pole
        'Column_Pole': 'Pole',
        'TrafficCone': 'Pole',

        # Road
        'Road': 'Road',
        'LaneMkgsDriv': 'Road',
        'LaneMkgsNonDriv': 'Road',

        # Pavement
        'Sidewalk': 'Pavement',
        'ParkingBlock': 'Pavement',
        'RoadShoulder': 'Pavement',

        # Tree
        'Tree': 'Tree',
        'VegetationMisc': 'Tree',

        # SignSymbol
        'SignSymbol': 'SignSymbol',
        'Misc_Text': 'SignSymbol',
        'TrafficLight': 'SignSymbol',

        # Fence
        'Fence': 'Fence',

        # Car
        'Car': 'Car',
        'SUVPickupTruck': 'Car',
        'Truck_Bus': 'Car',
        'Train': 'Car',
        'OtherMoving': 'Car',

        # Pedestrian
        'Pedestrian': 'Pedestrian',
        'Child': 'Pedestrian',
        'CartLuggagePram': 'Pedestrian',
        'Animal': 'Pedestrian',

        # Bicyclist
        'Bicyclist': 'Bicyclist',
        'MotorcycleScooter': 'Bicyclist',

        #Void
        'Void': 'Void',
    }

    self.class_names = [
        'Sky', 'Building', 'Pole', 'Road', 'Pavement', 'Tree',
        'SignSymbol', 'Fence', 'Car', 'Pedestrian', 'Bicyclist', 'Void'
    ]
    self.class_num = len(self.class_names)
    self.ignore_index = self.class_names.index('Void')

    # One-time preparation: only runs while the lmdb folder for this
    # split does not exist yet.
    if not os.path.exists(os.path.join(self._root, self._image_set)):
        with tarfile.open(os.path.join(self._root, self._filename),
                          "r") as tar:
            tar.extractall(path=self._root)

        # codes.txt lists the 32 raw label names, index == pixel value.
        with open(os.path.join(self._root, 'camvid', 'codes.txt')) as f:
            self._codes = [line.strip() for line in f.readlines()]

        print('grouping 32 classes labels into 12 classes....')
        camvid_label_folder = os.path.join(self._root, 'camvid', 'labels',
                                           '**', '*.png')
        camvid_images_folder = os.path.join(self._root, 'camvid', 'images',
                                            '**', '*.png')

        # Rewrite every label image in place with the grouped class ids.
        for label_fp in glob.iglob(camvid_label_folder, recursive=True):
            label = cv2.imread(label_fp, -1)
            label = self._group_ids(label)
            cv2.imwrite(label_fp, label)

        # valid.txt lists the image basenames belonging to the val split.
        with open(os.path.join(self._root, 'camvid', 'valid.txt')) as f:
            valids = [line.strip() for line in f.readlines()]

        image_pathes = []
        for image_fp in glob.iglob(camvid_images_folder, recursive=True):
            if self._image_set == 'train':
                if os.path.basename(
                        image_fp
                ) not in valids and 'test.txt' not in image_fp:
                    image_pathes.append(image_fp)
            elif self._image_set == 'val':
                if os.path.basename(image_fp) in valids:
                    image_pathes.append(image_fp)
            else:
                raise RuntimeError(
                    'image_set should only be one of train val')

        # Derive the matching label path for each image:
        # .../images/x.png -> .../labels/x_P.png
        label_pathes = []
        for image_fp in image_pathes:
            basename = os.path.basename(image_fp)
            dirname = os.path.dirname(image_fp)
            sub_folder = os.path.dirname(dirname)
            dirname = os.path.join(sub_folder, 'labels')
            basename = basename.replace('.png', '_P.png')
            label_pathes.append(os.path.join(dirname, basename))

        image_pathes.extend(label_pathes)

        # create lmdb dataset
        print(
            'Writing {} data into lmdb format to acclerate data loading process'
            .format(self._image_set))
        self._create_lmdb(os.path.join(self._root, self._image_set),
                          image_pathes)
        print('Done...')
        # The extracted tree is no longer needed once lmdb is written.
        shutil.rmtree(os.path.join(self._root, 'camvid'))

    lmdb_path = os.path.join(self._root, self._image_set)
    self._env = lmdb.open(lmdb_path,
                          map_size=1099511627776,
                          readonly=True,
                          lock=False)
    # Image keys are those without the '_P' label suffix.
    with self._env.begin(write=False) as txn:
        self._image_names = [key.decode() for key in txn.cursor().iternext(keys=True, values=False) \
            if '_P' not in key.decode()]
def download(self):
    """Fetch ``self.url`` into the raw-data directory."""
    download_url(self.url, root=self.raw_dir)
from torch.utils.data import random_split
import matplotlib.pyplot as plt
from scipy.io import loadmat
import pandas as pd
from torchvision import transforms
import torchvision.models as models
from skimage import io
from tqdm import tqdm
import time
from IPython.display import display

# %matplotlib inline

# Dowload the train dataset
dataset_url = "http://imagenet.stanford.edu/internal/car196/car_ims.tgz"
download_url(dataset_url, '.')

# Unpack the training images under ./data/
with tarfile.open('./car_ims.tgz', 'r:gz') as archive:
    archive.extractall(path='./data/')

# Download DevKit https://ai.stanford.edu/~jkrause/cars/car_devkit.tgz
devkit_dataset_url = "https://ai.stanford.edu/~jkrause/cars/car_devkit.tgz"
download_url(devkit_dataset_url, '.')

# Unpack the devkit under ./data/devkit
with tarfile.open('./car_devkit.tgz', 'r:gz') as archive:
    archive.extractall(path='./data/devkit')

# Annotation file (.mat) with labels for every image
label_dataset_url = "http://imagenet.stanford.edu/internal/car196/cars_annos.mat"
download_url(label_dataset_url, './data')
# Lay out the MIT67 folder structure.
train = os.path.join(mit67, 'train')
test = os.path.join(mit67, 'test')
meta = os.path.join(mit67, 'meta')
for folder in (mit67, train, test, meta):
    os.makedirs(folder, exist_ok=True)

# this step will create folder mit67/Images
# which has all the images for each class in its own subfolder
download(mit67)

# download the csv files for the train and test split
# from 'NAS Evaluation is Frustrating' repo
# note that download_url doesn't work in vscode debug mode
test_file_url = 'https://raw.githubusercontent.com/antoyang/NAS-Benchmark/master/data/MIT67_test.csv'
train_file_urls = [
    'https://raw.githubusercontent.com/antoyang/NAS-Benchmark/master/data/MIT67_train1.csv',
    'https://raw.githubusercontent.com/antoyang/NAS-Benchmark/master/data/MIT67_train2.csv',
    'https://raw.githubusercontent.com/antoyang/NAS-Benchmark/master/data/MIT67_train3.csv',
    'https://raw.githubusercontent.com/antoyang/NAS-Benchmark/master/data/MIT67_train4.csv'
]
download_url(test_file_url, meta, filename=None, md5=None)
for split_url in train_file_urls:
    download_url(split_url, meta, filename=None, md5=None)

prepare_data(mit67)
import torch.nn as nn from torch.autograd import Variable import torchvision.models as models from torchvision import transforms, utils from torch.utils.data import Dataset, DataLoader from PIL import Image import numpy as np import torch.optim as optim import os import shutil import random import tarfile from torchvision.datasets.utils import download_url, list_dir, list_files from torchvision.datasets import ImageFolder caltech256_url_ = 'http://vision.caltech.edu/Image_Datasets/Caltech256/256_ObjectCategories.tar' download_url(caltech256_url_, './data') def extract_tar(filename): tar = tarfile.open(filename) tar.extractall('./data') tar.close() def create_data(source_dir, target_dir, num_sample): if not os.path.exists(target_dir): os.makedirs(target_dir) for files in os.listdir(source_dir): source_dir_0 = os.path.join(source_dir, files) target_dir_0 = os.path.join(target_dir, files) if not os.path.exists(target_dir_0):
import tarfile
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from torchvision.datasets.utils import download_url
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
import torchvision.transforms as tt
from torch.utils.data import random_split
from torchvision.utils import make_grid
import matplotlib.pyplot as plt

# Download the CIFAR-10 dataset
dataset_url = "http://files.fast.ai/data/cifar10.tgz"
download_url(dataset_url, '.')

# Unpack the archive under ./data
with tarfile.open('./cifar10.tgz', 'r:gz') as archive:
    archive.extractall(path='./data')

data_dir = './data/cifar10'

# Per-channel mean/std used for normalization, plus train-time augmentation.
stats = ((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
train_tfms = tt.Compose([
    tt.RandomCrop(32, padding=4, padding_mode='reflect'),
    tt.RandomHorizontalFlip(),
    tt.ToTensor(),
    tt.Normalize(*stats, inplace=True),
])
valid_tfms = tt.Compose([tt.ToTensor(), tt.Normalize(*stats)])
def download_extract(url, root, filename, md5):
    """Download *url* into *root* as *filename* (verifying *md5*) and untar it there."""
    download_url(url, root, filename, md5)
    archive_path = os.path.join(root, filename)
    with tarfile.open(archive_path, "r") as tar:
        tar.extractall(path=root)
def download(self):
    """Download and preprocess the PhysioNet challenge data.

    Fetches the outcome files and the record tarballs, parses each record's
    time series into ``(record_id, tt, vals, mask, labels)`` tuples with
    timestamps quantized to ``self.quantization`` hours, and saves the
    results under ``self.processed_folder``.
    """
    if self._check_exists():
        return

    # NOTE(review): preprocessing tensors are placed on GPU when available;
    # the saved files therefore inherit this device — confirm loaders expect that.
    self.device = torch.device(
        "cuda:0" if torch.cuda.is_available() else "cpu")

    os.makedirs(self.raw_folder, exist_ok=True)
    os.makedirs(self.processed_folder, exist_ok=True)

    # Download outcome data
    for url in self.outcome_urls:
        filename = url.rpartition('/')[2]
        download_url(url, self.raw_folder, filename, None)

        txtfile = os.path.join(self.raw_folder, filename)
        with open(txtfile) as f:
            lines = f.readlines()
            outcomes = {}
            # Header is skipped; each row is record_id followed by the
            # outcome label columns.
            for l in lines[1:]:
                l = l.rstrip().split(',')
                record_id, labels = l[0], np.array(l[1:]).astype(float)
                outcomes[record_id] = torch.Tensor(labels).to(self.device)

            # BUGFIX: previously this saved `labels` -- i.e. only the last
            # parsed row -- instead of the full outcomes mapping.
            torch.save(
                outcomes,
                os.path.join(self.processed_folder,
                             filename.split('.')[0] + '.pt'))

    for url in self.urls:
        filename = url.rpartition('/')[2]
        download_url(url, self.raw_folder, filename, None)
        tar = tarfile.open(os.path.join(self.raw_folder, filename), "r:gz")
        tar.extractall(self.raw_folder)
        tar.close()

        print('Processing {}...'.format(filename))

        dirname = os.path.join(self.raw_folder, filename.split('.')[0])
        patients = []
        total = 0
        for txtfile in os.listdir(dirname):
            record_id = txtfile.split('.')[0]
            with open(os.path.join(dirname, txtfile)) as f:
                lines = f.readlines()
                prev_time = 0
                tt = [0.]
                vals = [torch.zeros(len(self.params)).to(self.device)]
                mask = [torch.zeros(len(self.params)).to(self.device)]
                # Consistency fix: place the first nobs entry on the same
                # device as the entries appended below.
                nobs = [torch.zeros(len(self.params)).to(self.device)]
                for l in lines[1:]:
                    total += 1
                    time, param, val = l.split(',')
                    # Time in hours
                    time = float(time.split(':')[0]) + float(
                        time.split(':')[1]) / 60.

                    # round up the time stamps (up to 6 min by default)
                    # used for speed -- we actually don't need to quantize it in Latent ODE
                    time = round(
                        time / self.quantization) * self.quantization

                    # A new quantized timestamp opens a fresh slot for
                    # values/mask/observation counts.
                    if time != prev_time:
                        tt.append(time)
                        vals.append(
                            torch.zeros(len(self.params)).to(self.device))
                        mask.append(
                            torch.zeros(len(self.params)).to(self.device))
                        nobs.append(
                            torch.zeros(len(self.params)).to(self.device))
                        prev_time = time

                    if param in self.params_dict:
                        #vals[-1][self.params_dict[param]] = float(val)
                        n_observations = nobs[-1][self.params_dict[param]]
                        # Repeated observations within one slot: running
                        # average or overwrite, depending on self.reduce.
                        if self.reduce == 'average' and n_observations > 0:
                            prev_val = vals[-1][self.params_dict[param]]
                            new_val = (prev_val * n_observations +
                                       float(val)) / (n_observations + 1)
                            vals[-1][self.params_dict[param]] = new_val
                        else:
                            vals[-1][self.params_dict[param]] = float(val)
                        mask[-1][self.params_dict[param]] = 1
                        nobs[-1][self.params_dict[param]] += 1
                    else:
                        assert param == 'RecordID', 'Read unexpected param {}'.format(
                            param)

            tt = torch.tensor(tt).to(self.device)
            vals = torch.stack(vals)
            mask = torch.stack(mask)

            labels = None
            if record_id in outcomes:
                # Only training set has labels
                labels = outcomes[record_id]
                # Out of 5 label types provided for Physionet, take only the last one -- mortality
                labels = labels[4]

            patients.append((record_id, tt, vals, mask, labels))

        torch.save(
            patients,
            os.path.join(
                self.processed_folder,
                filename.split('.')[0] + "_" + str(self.quantization) +
                '.pt'))

    print('Done!')