def get_physionet_data(args, device, q, flag=1):
    train_dataset_obj = PhysioNet('data/physionet', train=True,
                                  quantization=q,
                                  download=True, n_samples=min(10000, args.n),
                                  device=device)
    # Use custom collate_fn to combine samples with arbitrary time observations.
    # Returns the dataset along with mask and time steps
    test_dataset_obj = PhysioNet('data/physionet', train=False,
                                 quantization=q,
                                 download=True, n_samples=min(10000, args.n),
                                 device=device)

    # Combine and shuffle samples from physionet Train and physionet Test
    total_dataset = train_dataset_obj[:len(train_dataset_obj)]

    if not args.classif:
        # Concatenate samples from the original Train and Test sets.
        # Only 'training' physionet samples have labels, so for the
        # classification task we don't need the physionet 'test' samples.
        total_dataset = total_dataset + test_dataset_obj[:len(test_dataset_obj)]
    print(len(total_dataset))

    # Shuffle and split
    train_data, test_data = model_selection.train_test_split(
        total_dataset, train_size=0.8, random_state=42, shuffle=True)

    record_id, tt, vals, mask, labels = train_data[0]

    # n_samples = len(total_dataset)
    input_dim = vals.size(-1)
    data_min, data_max = get_data_min_max(total_dataset, device)
    batch_size = min(min(len(train_dataset_obj), args.batch_size), args.n)

    # flag=1: pre-collate each split into a dense tensor once, up front;
    # flag=0: collate lazily per batch via variable_time_collate_fn2.
    if flag:
        test_data_combined = variable_time_collate_fn(
            test_data, device, classify=args.classif,
            data_min=data_min, data_max=data_max)

        if args.classif:
            # Carve a validation set out of the training split
            train_data, val_data = model_selection.train_test_split(
                train_data, train_size=0.8, random_state=11, shuffle=True)
            train_data_combined = variable_time_collate_fn(
                train_data, device, classify=args.classif,
                data_min=data_min, data_max=data_max)
            val_data_combined = variable_time_collate_fn(
                val_data, device, classify=args.classif,
                data_min=data_min, data_max=data_max)
            print(train_data_combined[1].sum(),
                  val_data_combined[1].sum(),
                  test_data_combined[1].sum())
            print(train_data_combined[0].size(), train_data_combined[1].size(),
                  val_data_combined[0].size(), val_data_combined[1].size(),
                  test_data_combined[0].size(), test_data_combined[1].size())

            train_data_combined = TensorDataset(
                train_data_combined[0], train_data_combined[1].long().squeeze())
            val_data_combined = TensorDataset(
                val_data_combined[0], val_data_combined[1].long().squeeze())
            test_data_combined = TensorDataset(
                test_data_combined[0], test_data_combined[1].long().squeeze())
        else:
            train_data_combined = variable_time_collate_fn(
                train_data, device, classify=args.classif,
                data_min=data_min, data_max=data_max)
            print(train_data_combined.size(), test_data_combined.size())

        train_dataloader = DataLoader(
            train_data_combined, batch_size=batch_size, shuffle=False)
        test_dataloader = DataLoader(
            test_data_combined, batch_size=batch_size, shuffle=False)
    else:
        train_dataloader = DataLoader(
            train_data, batch_size=batch_size, shuffle=False,
            collate_fn=lambda batch: variable_time_collate_fn2(
                batch, args, device, data_type="train",
                data_min=data_min, data_max=data_max))
        test_dataloader = DataLoader(
            test_data, batch_size=batch_size, shuffle=False,
            collate_fn=lambda batch: variable_time_collate_fn2(
                batch, args, device, data_type="test",
                data_min=data_min, data_max=data_max))

    attr_names = train_dataset_obj.params
    data_objects = {"dataset_obj": train_dataset_obj,
                    "train_dataloader": train_dataloader,
                    "test_dataloader": test_dataloader,
                    "input_dim": input_dim,
                    "n_train_batches": len(train_dataloader),
                    "n_test_batches": len(test_dataloader),
                    "attr": attr_names,  # optional
                    "classif_per_tp": False,  # optional
                    "n_labels": 1}  # optional
    if args.classif:
        val_dataloader = DataLoader(
            val_data_combined, batch_size=batch_size, shuffle=False)
        data_objects["val_dataloader"] = val_dataloader
    return data_objects
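# Example usage (a minimal sketch, not from the original pipeline): `Namespace`
# stands in for the argparse object the training script normally builds, and
# every field value below (n, batch_size, classif, the quantization q) is an
# illustrative assumption.
#
#     from argparse import Namespace
#     import torch
#
#     args = Namespace(n=8000, batch_size=50, classif=True)
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     data_objects = get_physionet_data(args, device, q=0.016)
#     # With classif=True and flag=1 the loaders wrap TensorDatasets,
#     # so each batch is a (data, labels) pair.
#     batch, labels = next(iter(data_objects["train_dataloader"]))
#     print(batch.shape, labels.shape)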
def get_physionet_data_extrap(args, device, q, flag=1):
    train_dataset_obj = PhysioNet('data/physionet', train=True,
                                  quantization=q,
                                  download=True, n_samples=min(10000, args.n),
                                  device=device)
    # Use custom collate_fn to combine samples with arbitrary time observations.
    # Returns the dataset along with mask and time steps
    test_dataset_obj = PhysioNet('data/physionet', train=False,
                                 quantization=q,
                                 download=True, n_samples=min(10000, args.n),
                                 device=device)

    # Combine and shuffle samples from physionet Train and physionet Test
    total_dataset = train_dataset_obj[:len(train_dataset_obj)]

    if not args.classif:
        # Concatenate samples from the original Train and Test sets.
        # Only 'training' physionet samples have labels, so for the
        # classification task we don't need the physionet 'test' samples.
        total_dataset = total_dataset + test_dataset_obj[:len(test_dataset_obj)]
    print(len(total_dataset))

    # Shuffle and split
    train_data, test_data = model_selection.train_test_split(
        total_dataset, train_size=0.8, random_state=42, shuffle=True)

    record_id, tt, vals, mask, labels = train_data[0]

    # n_samples = len(total_dataset)
    input_dim = vals.size(-1)
    data_min, data_max = get_data_min_max(total_dataset, device)
    batch_size = min(min(len(train_dataset_obj), args.batch_size), args.n)

    def extrap(test_data):
        # Split each trajectory at the 24-hour mark: observations before
        # t = 24 condition the encoder, observations after t = 24 are the
        # extrapolation (decoder) target.
        enc_test_data = []
        dec_test_data = []
        for (record_id, tt, vals, mask, labels) in test_data:
            midpt = 0
            for tp in tt:
                if tp < 24:
                    midpt += 1
                else:
                    break
            # Keep the record only if both halves contain observed values.
            if mask[:midpt].sum() and mask[midpt:].sum():
                enc_test_data.append(
                    (record_id, tt[:midpt], vals[:midpt], mask[:midpt], labels))
                dec_test_data.append(
                    (record_id, tt[midpt:], vals[midpt:], mask[midpt:], labels))
        return enc_test_data, dec_test_data

    enc_train_data, dec_train_data = extrap(train_data)
    enc_test_data, dec_test_data = extrap(test_data)

    enc_train_data_combined = variable_time_collate_fn(
        enc_train_data, device, classify=args.classif,
        data_min=data_min, data_max=data_max)
    dec_train_data_combined = variable_time_collate_fn(
        dec_train_data, device, classify=args.classif,
        data_min=data_min, data_max=data_max)
    enc_test_data_combined = variable_time_collate_fn(
        enc_test_data, device, classify=args.classif,
        data_min=data_min, data_max=data_max)
    dec_test_data_combined = variable_time_collate_fn(
        dec_test_data, device, classify=args.classif,
        data_min=data_min, data_max=data_max)
    print(enc_train_data_combined.shape, dec_train_data_combined.shape)
    print(enc_test_data_combined.shape, dec_test_data_combined.shape)

    # Keep the timepoints in enc between 0.0 and 0.5
    # (the normalized time channel is the last feature dimension)
    enc_train_data_combined[:, :, -1] *= 0.5
    enc_test_data_combined[:, :, -1] *= 0.5
    print(enc_train_data_combined[0, :, -1], dec_train_data_combined[0, :, -1])

    enc_train_dataloader = DataLoader(
        enc_train_data_combined, batch_size=batch_size, shuffle=False)
    dec_train_dataloader = DataLoader(
        dec_train_data_combined, batch_size=batch_size, shuffle=False)
    enc_test_dataloader = DataLoader(
        enc_test_data_combined, batch_size=batch_size, shuffle=False)
    dec_test_dataloader = DataLoader(
        dec_test_data_combined, batch_size=batch_size, shuffle=False)

    attr_names = train_dataset_obj.params
    data_objects = {"dataset_obj": train_dataset_obj,
                    "enc_train_dataloader": enc_train_dataloader,
                    "enc_test_dataloader": enc_test_dataloader,
                    "dec_train_dataloader": dec_train_dataloader,
                    "dec_test_dataloader": dec_test_dataloader,
                    "input_dim": input_dim,
                    "attr": attr_names,  # optional
                    "classif_per_tp": False,  # optional
                    "n_labels": 1}  # optional
    return data_objects
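# Example usage (a minimal sketch; the argument values are illustrative
# assumptions, not taken from the original training scripts):
#
#     from argparse import Namespace
#     import torch
#
#     args = Namespace(n=8000, batch_size=50, classif=False)
#     device = torch.device('cpu')
#     data_objects = get_physionet_data_extrap(args, device, q=0.016)
#     # The encoder and decoder loaders stay aligned record-for-record:
#     # batch i of enc_train_dataloader holds the first 24 hours of the same
#     # admissions whose remaining hours appear in batch i of
#     # dec_train_dataloader (both loaders are built with shuffle=False).
#     enc_batch = next(iter(data_objects["enc_train_dataloader"]))
#     dec_batch = next(iter(data_objects["dec_train_dataloader"]))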
def parse_datasets(args, device):

    def basic_collate_fn(batch, time_steps, args=args, device=device, data_type="train"):
        batch = torch.stack(batch)
        data_dict = {
            "data": batch,
            "time_steps": time_steps}

        data_dict = utils.split_and_subsample_batch(data_dict, args, data_type=data_type)
        return data_dict

    dataset_name = args.dataset

    n_total_tp = args.timepoints + args.extrap
    max_t_extrap = args.max_t / args.timepoints * n_total_tp

    ##################################################################
    # MuJoCo dataset
    if dataset_name == "hopper":
        dataset_obj = HopperPhysics(root='data', download=True, generate=False, device=device)
        dataset = dataset_obj.get_dataset()[:args.n]
        dataset = dataset.to(device)

        n_tp_data = dataset[:].shape[1]

        # Time steps that are used later on for extrapolation
        time_steps = torch.arange(start=0, end=n_tp_data, step=1).float().to(device)
        time_steps = time_steps / len(time_steps)

        dataset = dataset.to(device)
        time_steps = time_steps.to(device)

        if not args.extrap:
            # Creating dataset for interpolation:
            # sample time points from different parts of the timeline,
            # so that the model learns from different parts of the hopper trajectory
            n_traj = len(dataset)
            n_tp_data = dataset.shape[1]
            n_reduced_tp = args.timepoints

            start_ind = np.random.randint(0, high=n_tp_data - n_reduced_tp + 1, size=n_traj)
            end_ind = start_ind + n_reduced_tp
            sliced = []
            for i in range(n_traj):
                sliced.append(dataset[i, start_ind[i]:end_ind[i], :])
            dataset = torch.stack(sliced).to(device)
            time_steps = time_steps[:n_reduced_tp]

        # Split into train and test by the time sequences
        train_y, test_y = utils.split_train_test(dataset, train_fraq=0.8)

        n_samples = len(dataset)
        input_dim = dataset.size(-1)

        batch_size = min(args.batch_size, args.n)
        train_dataloader = DataLoader(
            train_y, batch_size=batch_size, shuffle=False,
            collate_fn=lambda batch: basic_collate_fn(batch, time_steps, data_type="train"))
        test_dataloader = DataLoader(
            test_y, batch_size=n_samples, shuffle=False,
            collate_fn=lambda batch: basic_collate_fn(batch, time_steps, data_type="test"))

        data_objects = {"dataset_obj": dataset_obj,
                        "train_dataloader": utils.inf_generator(train_dataloader),
                        "test_dataloader": utils.inf_generator(test_dataloader),
                        "input_dim": input_dim,
                        "n_train_batches": len(train_dataloader),
                        "n_test_batches": len(test_dataloader)}
        return data_objects

    ##################################################################
    # Physionet dataset
    if dataset_name == "physionet":
        train_dataset_obj = PhysioNet('data/physionet', train=True,
                                      quantization=args.quantization,
                                      download=True, n_samples=min(10000, args.n),
                                      device=device)
        # Use custom collate_fn to combine samples with arbitrary time observations.
        # Returns the dataset along with mask and time steps
        test_dataset_obj = PhysioNet('data/physionet', train=False,
                                     quantization=args.quantization,
                                     download=True, n_samples=min(10000, args.n),
                                     device=device)

        # Combine and shuffle samples from physionet Train and physionet Test
        total_dataset = train_dataset_obj[:len(train_dataset_obj)]

        if not args.classif:
            # Concatenate samples from the original Train and Test sets.
            # Only 'training' physionet samples have labels, so for the
            # classification task we don't need the physionet 'test' samples.
            total_dataset = total_dataset + test_dataset_obj[:len(test_dataset_obj)]

        # Shuffle and split
        train_data, test_data = model_selection.train_test_split(
            total_dataset, train_size=0.8, random_state=42, shuffle=True)

        record_id, tt, vals, mask, labels = train_data[0]

        n_samples = len(total_dataset)
        input_dim = vals.size(-1)

        batch_size = min(min(len(train_dataset_obj), args.batch_size), args.n)
        data_min, data_max = get_data_min_max(total_dataset, device)

        train_dataloader = DataLoader(
            train_data, batch_size=batch_size, shuffle=False,
            collate_fn=lambda batch: variable_time_collate_fn(
                batch, args, device, data_type="train",
                data_min=data_min, data_max=data_max))
        test_dataloader = DataLoader(
            test_data, batch_size=n_samples, shuffle=False,
            collate_fn=lambda batch: variable_time_collate_fn(
                batch, args, device, data_type="test",
                data_min=data_min, data_max=data_max))

        attr_names = train_dataset_obj.params
        data_objects = {"dataset_obj": train_dataset_obj,
                        "train_dataloader": utils.inf_generator(train_dataloader),
                        "test_dataloader": utils.inf_generator(test_dataloader),
                        "input_dim": input_dim,
                        "n_train_batches": len(train_dataloader),
                        "n_test_batches": len(test_dataloader),
                        "attr": attr_names,  # optional
                        "classif_per_tp": False,  # optional
                        "n_labels": 1}  # optional
        return data_objects

    ##################################################################
    # Human activity dataset
    if dataset_name == "activity":
        n_samples = min(10000, args.n)
        dataset_obj = PersonActivity('data/PersonActivity',
                                     download=True, n_samples=n_samples, device=device)
        print(dataset_obj)
        # Use custom collate_fn to combine samples with arbitrary time observations.
        # Returns the dataset along with mask and time steps

        # Shuffle and split
        train_data, test_data = model_selection.train_test_split(
            dataset_obj, train_size=0.8, random_state=42, shuffle=True)

        # Resample both splits with replacement (bootstrap)
        train_data = [train_data[i] for i in np.random.choice(len(train_data), len(train_data))]
        test_data = [test_data[i] for i in np.random.choice(len(test_data), len(test_data))]

        record_id, tt, vals, mask, labels = train_data[0]
        input_dim = vals.size(-1)

        batch_size = min(min(len(dataset_obj), args.batch_size), args.n)
        train_dataloader = DataLoader(
            train_data, batch_size=batch_size, shuffle=False,
            collate_fn=lambda batch: variable_time_collate_fn_activity(
                batch, args, device, data_type="train"))
        test_dataloader = DataLoader(
            test_data, batch_size=n_samples, shuffle=False,
            collate_fn=lambda batch: variable_time_collate_fn_activity(
                batch, args, device, data_type="test"))

        data_objects = {"dataset_obj": dataset_obj,
                        "train_dataloader": utils.inf_generator(train_dataloader),
                        "test_dataloader": utils.inf_generator(test_dataloader),
                        "input_dim": input_dim,
                        "n_train_batches": len(train_dataloader),
                        "n_test_batches": len(test_dataloader),
                        "classif_per_tp": True,  # optional
                        "n_labels": labels.size(-1)}
        return data_objects

    ########### 1d datasets ###########

    # Sampling args.timepoints time points in the interval [0, args.max_t]
    # Sample points for both the training sequence and extrapolation (test)
    distribution = uniform.Uniform(torch.Tensor([0.0]), torch.Tensor([max_t_extrap]))
    time_steps_extrap = distribution.sample(torch.Size([n_total_tp - 1]))[:, 0]
    time_steps_extrap = torch.cat((torch.Tensor([0.0]), time_steps_extrap))
    time_steps_extrap = torch.sort(time_steps_extrap)[0]

    dataset_obj = None
    ##################################################################
    # Sample a periodic function
    if dataset_name == "periodic":
        dataset_obj = Periodic_1d(init_freq=None, init_amplitude=1.,
                                  final_amplitude=1., final_freq=None,
                                  z0=1.)

    ##################################################################

    if dataset_obj is None:
        raise Exception("Unknown dataset: {}".format(dataset_name))

    dataset = dataset_obj.sample_traj(time_steps_extrap, n_samples=args.n,
                                      noise_weight=args.noise_weight)

    # Process small datasets
    dataset = dataset.to(device)
    time_steps_extrap = time_steps_extrap.to(device)

    train_y, test_y = utils.split_train_test(dataset, train_fraq=0.8)

    n_samples = len(dataset)
    input_dim = dataset.size(-1)

    batch_size = min(args.batch_size, args.n)
    train_dataloader = DataLoader(
        train_y, batch_size=batch_size, shuffle=False,
        collate_fn=lambda batch: basic_collate_fn(batch, time_steps_extrap, data_type="train"))
    test_dataloader = DataLoader(
        test_y, batch_size=args.n, shuffle=False,
        collate_fn=lambda batch: basic_collate_fn(batch, time_steps_extrap, data_type="test"))

    data_objects = {  # "dataset_obj": dataset_obj,
        "train_dataloader": utils.inf_generator(train_dataloader),
        "test_dataloader": utils.inf_generator(test_dataloader),
        "input_dim": input_dim,
        "n_train_batches": len(train_dataloader),
        "n_test_batches": len(test_dataloader)}

    return data_objects
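if __name__ == "__main__":
    # Minimal smoke test on the synthetic "periodic" dataset (a sketch, not
    # part of the original training scripts). `Namespace` mimics the argparse
    # object those scripts build; every field value below is an illustrative
    # assumption, and utils.split_and_subsample_batch may expect additional
    # flags (e.g. subsampling options) in some versions of this codebase.
    from argparse import Namespace

    args = Namespace(dataset="periodic", n=100, batch_size=50, timepoints=100,
                     extrap=False, max_t=5., noise_weight=0.01)
    data_objects = parse_datasets(args, torch.device("cpu"))
    print(data_objects["input_dim"], data_objects["n_train_batches"],
          data_objects["n_test_batches"])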