Example #1
def get_physionet_data(args, device, q, flag=1):
    train_dataset_obj = PhysioNet('data/physionet',
                                  train=True,
                                  quantization=q,
                                  download=True,
                                  n_samples=min(10000, args.n),
                                  device=device)
    # Use a custom collate_fn to combine samples observed at arbitrary time points.
    # Returns the dataset along with mask and time steps.
    test_dataset_obj = PhysioNet('data/physionet',
                                 train=False,
                                 quantization=q,
                                 download=True,
                                 n_samples=min(10000, args.n),
                                 device=device)

    # Combine and shuffle samples from physionet Train and physionet Test
    total_dataset = train_dataset_obj[:len(train_dataset_obj)]

    if not args.classif:
        # Concatenate samples from original Train and Test sets
        # Only 'training' physionet samples have labels.
        # Therefore, for a classification task we don't need physionet 'test' samples.
        total_dataset = total_dataset + \
            test_dataset_obj[:len(test_dataset_obj)]
    print(len(total_dataset))
    # Shuffle and split
    train_data, test_data = model_selection.train_test_split(total_dataset,
                                                             train_size=0.8,
                                                             random_state=42,
                                                             shuffle=True)

    record_id, tt, vals, mask, labels = train_data[0]

    # n_samples = len(total_dataset)
    input_dim = vals.size(-1)
    data_min, data_max = get_data_min_max(total_dataset, device)
    batch_size = min(len(train_dataset_obj), args.batch_size, args.n)
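    # flag truthy: pre-collate each split into dense tensors up front;
    # flag falsy: collate lazily, per batch, via the DataLoader's collate_fn.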
    if flag:
        test_data_combined = variable_time_collate_fn(test_data,
                                                      device,
                                                      classify=args.classif,
                                                      data_min=data_min,
                                                      data_max=data_max)

        if args.classif:
            train_data, val_data = model_selection.train_test_split(
                train_data, train_size=0.8, random_state=11, shuffle=True)
            train_data_combined = variable_time_collate_fn(
                train_data,
                device,
                classify=args.classif,
                data_min=data_min,
                data_max=data_max)
            val_data_combined = variable_time_collate_fn(val_data,
                                                         device,
                                                         classify=args.classif,
                                                         data_min=data_min,
                                                         data_max=data_max)
            print(train_data_combined[1].sum(), val_data_combined[1].sum(),
                  test_data_combined[1].sum())
            print(train_data_combined[0].size(), train_data_combined[1].size(),
                  val_data_combined[0].size(), val_data_combined[1].size(),
                  test_data_combined[0].size(), test_data_combined[1].size())

            train_data_combined = TensorDataset(
                train_data_combined[0],
                train_data_combined[1].long().squeeze())
            val_data_combined = TensorDataset(
                val_data_combined[0], val_data_combined[1].long().squeeze())
            test_data_combined = TensorDataset(
                test_data_combined[0], test_data_combined[1].long().squeeze())
        else:
            train_data_combined = variable_time_collate_fn(
                train_data,
                device,
                classify=args.classif,
                data_min=data_min,
                data_max=data_max)
            print(train_data_combined.size(), test_data_combined.size())

        train_dataloader = DataLoader(train_data_combined,
                                      batch_size=batch_size,
                                      shuffle=False)
        test_dataloader = DataLoader(test_data_combined,
                                     batch_size=batch_size,
                                     shuffle=False)

    else:
        train_dataloader = DataLoader(
            train_data,
            batch_size=batch_size,
            shuffle=False,
            collate_fn=lambda batch: variable_time_collate_fn2(
                batch,
                args,
                device,
                data_type="train",
                data_min=data_min,
                data_max=data_max))
        test_dataloader = DataLoader(
            test_data,
            batch_size=batch_size,
            shuffle=False,
            collate_fn=lambda batch: variable_time_collate_fn2(
                batch,
                args,
                device,
                data_type="test",
                data_min=data_min,
                data_max=data_max))

    attr_names = train_dataset_obj.params
    data_objects = {
        "dataset_obj": train_dataset_obj,
        "train_dataloader": train_dataloader,
        "test_dataloader": test_dataloader,
        "input_dim": input_dim,
        "n_train_batches": len(train_dataloader),
        "n_test_batches": len(test_dataloader),
        "attr": attr_names,  # optional
        "classif_per_tp": False,  # optional
        "n_labels": 1
    }  # optional
    if args.classif:
        val_dataloader = DataLoader(val_data_combined,
                                    batch_size=batch_size,
                                    shuffle=False)
        data_objects["val_dataloader"] = val_dataloader
    return data_objects
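
A minimal sketch of how this loader might be invoked. The args fields below
(n, batch_size, classif) are inferred from the attributes the function reads;
they are assumptions, not a confirmed command-line interface.

# Hypothetical usage sketch -- the Namespace fields and q value are assumptions.
from argparse import Namespace
import torch

args = Namespace(n=8000, batch_size=50, classif=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

data = get_physionet_data(args, device, q=0.1, flag=1)
xb, yb = next(iter(data["train_dataloader"]))  # values and labels (classif=True)
print(xb.shape, yb.shape, data["n_train_batches"])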
Example #2
def get_physionet_data_extrap(args, device, q, flag=1):
    train_dataset_obj = PhysioNet('data/physionet',
                                  train=True,
                                  quantization=q,
                                  download=True,
                                  n_samples=min(10000, args.n),
                                  device=device)
    # Use a custom collate_fn to combine samples observed at arbitrary time points.
    # Returns the dataset along with mask and time steps.
    test_dataset_obj = PhysioNet('data/physionet',
                                 train=False,
                                 quantization=q,
                                 download=True,
                                 n_samples=min(10000, args.n),
                                 device=device)

    # Combine and shuffle samples from physionet Train and physionet Test
    total_dataset = train_dataset_obj[:len(train_dataset_obj)]

    if not args.classif:
        # Concatenate samples from original Train and Test sets
        # Only 'training' physionet samples have labels.
        # Therefore, for a classification task we don't need physionet 'test' samples.
        total_dataset = total_dataset + \
            test_dataset_obj[:len(test_dataset_obj)]
    print(len(total_dataset))
    # Shuffle and split
    train_data, test_data = model_selection.train_test_split(total_dataset,
                                                             train_size=0.8,
                                                             random_state=42,
                                                             shuffle=True)

    record_id, tt, vals, mask, labels = train_data[0]

    # n_samples = len(total_dataset)
    input_dim = vals.size(-1)
    data_min, data_max = get_data_min_max(total_dataset, device)
    batch_size = min(len(train_dataset_obj), args.batch_size, args.n)

    def extrap(test_data):
        # Split each record at the 24-hour mark: observations before hour 24
        # condition the encoder; the rest is the decoder (extrapolation) target.
        enc_test_data = []
        dec_test_data = []
        for (record_id, tt, vals, mask, labels) in test_data:
            # midpt = index of the first observation at or after t = 24
            midpt = 0
            for tp in tt:
                if tp < 24:
                    midpt += 1
                else:
                    break
            # Keep only records with observed values on both sides of the split
            if mask[:midpt].sum() and mask[midpt:].sum():
                enc_test_data.append((record_id, tt[:midpt], vals[:midpt],
                                      mask[:midpt], labels))
                dec_test_data.append((record_id, tt[midpt:], vals[midpt:],
                                      mask[midpt:], labels))
        return enc_test_data, dec_test_data

    enc_train_data, dec_train_data = extrap(train_data)
    enc_test_data, dec_test_data = extrap(test_data)
    enc_train_data_combined = variable_time_collate_fn(enc_train_data,
                                                       device,
                                                       classify=args.classif,
                                                       data_min=data_min,
                                                       data_max=data_max)
    dec_train_data_combined = variable_time_collate_fn(dec_train_data,
                                                       device,
                                                       classify=args.classif,
                                                       data_min=data_min,
                                                       data_max=data_max)
    enc_test_data_combined = variable_time_collate_fn(enc_test_data,
                                                      device,
                                                      classify=args.classif,
                                                      data_min=data_min,
                                                      data_max=data_max)
    dec_test_data_combined = variable_time_collate_fn(dec_test_data,
                                                      device,
                                                      classify=args.classif,
                                                      data_min=data_min,
                                                      data_max=data_max)
    print(enc_train_data_combined.shape, dec_train_data_combined.shape)
    print(enc_test_data_combined.shape, dec_test_data_combined.shape)

    # keep the timepoints in enc between 0.0 and 0.5
    enc_train_data_combined[:, :, -1] *= 0.5
    enc_test_data_combined[:, :, -1] *= 0.5
    print(enc_train_data_combined[0, :, -1], dec_train_data_combined[0, :, -1])
    enc_train_dataloader = DataLoader(enc_train_data_combined,
                                      batch_size=batch_size,
                                      shuffle=False)
    dec_train_dataloader = DataLoader(dec_train_data_combined,
                                      batch_size=batch_size,
                                      shuffle=False)
    enc_test_dataloader = DataLoader(enc_test_data_combined,
                                     batch_size=batch_size,
                                     shuffle=False)
    dec_test_dataloader = DataLoader(dec_test_data_combined,
                                     batch_size=batch_size,
                                     shuffle=False)

    attr_names = train_dataset_obj.params
    data_objects = {
        "dataset_obj": train_dataset_obj,
        "enc_train_dataloader": enc_train_dataloader,
        "enc_test_dataloader": enc_test_dataloader,
        "dec_train_dataloader": dec_train_dataloader,
        "dec_test_dataloader": dec_test_dataloader,
        "input_dim": input_dim,
        "attr": attr_names,  # optional
        "classif_per_tp": False,  # optional
        "n_labels": 1
    }  # optional

    return data_objects
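
The nested extrap() helper above splits each record at the 24-hour mark:
observations before hour 24 condition the encoder, the rest become the
decoder's extrapolation target, and records with no observed values on either
side are dropped. Below is a self-contained sketch of the same split on a toy
record, using torch.searchsorted in place of the linear scan (equivalent here
because tt is sorted):

# Illustrative only -- toy tensors, not PhysioNet data.
import torch

tt = torch.tensor([2.0, 10.0, 23.0, 30.0, 41.0])  # observation times (hours)
vals = torch.randn(5, 3)
mask = torch.ones(5, 3)

# First index with tt >= 24, i.e. how many observations fall before hour 24
midpt = int(torch.searchsorted(tt, torch.tensor([24.0])))

enc = (tt[:midpt], vals[:midpt], mask[:midpt])  # conditioning half
dec = (tt[midpt:], vals[midpt:], mask[midpt:])  # extrapolation half
print(midpt, enc[0], dec[0])  # 3 tensor([ 2., 10., 23.]) tensor([30., 41.])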
Example #3
def parse_datasets(args, device):
    def basic_collate_fn(batch,
                         time_steps,
                         args=args,
                         device=device,
                         data_type="train"):
        batch = torch.stack(batch)
        data_dict = {"data": batch, "time_steps": time_steps}

        data_dict = utils.split_and_subsample_batch(data_dict,
                                                    args,
                                                    data_type=data_type)
        return data_dict

    dataset_name = args.dataset

    # Total time points = training timepoints plus the extrapolation horizon;
    # max_t is scaled proportionally so the per-step spacing is unchanged.
    n_total_tp = args.timepoints + args.extrap
    max_t_extrap = args.max_t / args.timepoints * n_total_tp

    ##################################################################
    # MuJoCo dataset
    if dataset_name == "hopper":
        dataset_obj = HopperPhysics(root='data',
                                    download=True,
                                    generate=False,
                                    device=device)
        dataset = dataset_obj.get_dataset()[:args.n]
        dataset = dataset.to(device)

        n_tp_data = dataset[:].shape[1]

        # Time steps that are used later on for extrapolation
        time_steps = torch.arange(start=0, end=n_tp_data,
                                  step=1).float().to(device)
        time_steps = time_steps / len(time_steps)

        dataset = dataset.to(device)
        time_steps = time_steps.to(device)

        if not args.extrap:
            # Creating dataset for interpolation
            # sample time points from different parts of the timeline,
            # so that the model learns from different parts of hopper trajectory
            n_traj = len(dataset)
            n_tp_data = dataset.shape[1]
            n_reduced_tp = args.timepoints

            start_ind = np.random.randint(0,
                                          high=n_tp_data - n_reduced_tp + 1,
                                          size=n_traj)
            end_ind = start_ind + n_reduced_tp
            sliced = []
            for i in range(n_traj):
                sliced.append(dataset[i, start_ind[i]:end_ind[i], :])
            dataset = torch.stack(sliced).to(device)
            time_steps = time_steps[:n_reduced_tp]

        # Split into train and test by the time sequences
        train_y, test_y = utils.split_train_test(dataset, train_fraq=0.8)

        n_samples = len(dataset)
        input_dim = dataset.size(-1)

        batch_size = min(args.batch_size, args.n)
        train_dataloader = DataLoader(
            train_y,
            batch_size=batch_size,
            shuffle=False,
            collate_fn=lambda batch: basic_collate_fn(
                batch, time_steps, data_type="train"))
        test_dataloader = DataLoader(test_y,
                                     batch_size=n_samples,
                                     shuffle=False,
                                     collate_fn=lambda batch: basic_collate_fn(
                                         batch, time_steps, data_type="test"))

        data_objects = {
            "dataset_obj": dataset_obj,
            "train_dataloader": utils.inf_generator(train_dataloader),
            "test_dataloader": utils.inf_generator(test_dataloader),
            "input_dim": input_dim,
            "n_train_batches": len(train_dataloader),
            "n_test_batches": len(test_dataloader)
        }
        return data_objects

    ##################################################################
    # Physionet dataset

    if dataset_name == "physionet":
        train_dataset_obj = PhysioNet('data/physionet',
                                      train=True,
                                      quantization=args.quantization,
                                      download=True,
                                      n_samples=min(10000, args.n),
                                      device=device)
        # Use a custom collate_fn to combine samples observed at arbitrary time points.
        # Returns the dataset along with mask and time steps.
        test_dataset_obj = PhysioNet('data/physionet',
                                     train=False,
                                     quantization=args.quantization,
                                     download=True,
                                     n_samples=min(10000, args.n),
                                     device=device)

        # Combine and shuffle samples from physionet Train and physionet Test
        total_dataset = train_dataset_obj[:len(train_dataset_obj)]

        if not args.classif:
            # Concatenate samples from original Train and Test sets
            # Only 'training' physionet samples have labels. Therefore, for a
            # classification task we don't need physionet 'test' samples.
            total_dataset = total_dataset + test_dataset_obj[:len(
                test_dataset_obj)]

        # Shuffle and split
        train_data, test_data = model_selection.train_test_split(
            total_dataset, train_size=0.8, random_state=42, shuffle=True)

        record_id, tt, vals, mask, labels = train_data[0]

        n_samples = len(total_dataset)
        input_dim = vals.size(-1)

        batch_size = min(len(train_dataset_obj), args.batch_size, args.n)
        data_min, data_max = get_data_min_max(total_dataset, device)

        train_dataloader = DataLoader(
            train_data,
            batch_size=batch_size,
            shuffle=False,
            collate_fn=lambda batch: variable_time_collate_fn(
                batch,
                args,
                device,
                data_type="train",
                data_min=data_min,
                data_max=data_max))
        test_dataloader = DataLoader(
            test_data,
            batch_size=n_samples,
            shuffle=False,
            collate_fn=lambda batch: variable_time_collate_fn(
                batch,
                args,
                device,
                data_type="test",
                data_min=data_min,
                data_max=data_max))

        attr_names = train_dataset_obj.params
        data_objects = {
            "dataset_obj": train_dataset_obj,
            "train_dataloader": utils.inf_generator(train_dataloader),
            "test_dataloader": utils.inf_generator(test_dataloader),
            "input_dim": input_dim,
            "n_train_batches": len(train_dataloader),
            "n_test_batches": len(test_dataloader),
            "attr": attr_names,  #optional
            "classif_per_tp": False,  #optional
            "n_labels": 1
        }  #optional
        return data_objects

    ##################################################################
    # Human activity dataset

    if dataset_name == "activity":
        n_samples = min(10000, args.n)
        dataset_obj = PersonActivity('data/PersonActivity',
                                     download=True,
                                     n_samples=n_samples,
                                     device=device)
        print(dataset_obj)
        # Use a custom collate_fn to combine samples observed at arbitrary time points.
        # Returns the dataset along with mask and time steps.

        # Shuffle and split
        train_data, test_data = model_selection.train_test_split(
            dataset_obj, train_size=0.8, random_state=42, shuffle=True)

        train_data = [
            train_data[i]
            for i in np.random.choice(len(train_data), len(train_data))
        ]
        test_data = [
            test_data[i]
            for i in np.random.choice(len(test_data), len(test_data))
        ]

        record_id, tt, vals, mask, labels = train_data[0]
        input_dim = vals.size(-1)

        batch_size = min(len(dataset_obj), args.batch_size, args.n)
        train_dataloader = DataLoader(
            train_data,
            batch_size=batch_size,
            shuffle=False,
            collate_fn=lambda batch: variable_time_collate_fn_activity(
                batch, args, device, data_type="train"))
        test_dataloader = DataLoader(
            test_data,
            batch_size=n_samples,
            shuffle=False,
            collate_fn=lambda batch: variable_time_collate_fn_activity(
                batch, args, device, data_type="test"))

        data_objects = {
            "dataset_obj": dataset_obj,
            "train_dataloader": utils.inf_generator(train_dataloader),
            "test_dataloader": utils.inf_generator(test_dataloader),
            "input_dim": input_dim,
            "n_train_batches": len(train_dataloader),
            "n_test_batches": len(test_dataloader),
            "classif_per_tp": True,  #optional
            "n_labels": labels.size(-1)
        }

        return data_objects

    ########### 1d datasets ###########

    # Sample args.timepoints time points in the interval [0, args.max_t].
    # Sample points for both the training sequence and extrapolation (test).
    distribution = uniform.Uniform(torch.Tensor([0.0]),
                                   torch.Tensor([max_t_extrap]))
    time_steps_extrap = distribution.sample(torch.Size([n_total_tp - 1]))[:, 0]
    # Always include t = 0, then sort into increasing order
    time_steps_extrap = torch.cat((torch.Tensor([0.0]), time_steps_extrap))
    time_steps_extrap = torch.sort(time_steps_extrap)[0]

    dataset_obj = None
    ##################################################################
    # Sample a periodic function
    if dataset_name == "periodic":
        dataset_obj = Periodic_1d(init_freq=None,
                                  init_amplitude=1.,
                                  final_amplitude=1.,
                                  final_freq=None,
                                  z0=1.)

    ##################################################################

    if dataset_obj is None:
        raise Exception("Unknown dataset: {}".format(dataset_name))

    dataset = dataset_obj.sample_traj(time_steps_extrap,
                                      n_samples=args.n,
                                      noise_weight=args.noise_weight)

    # The 1d datasets are small enough to keep fully on the target device
    dataset = dataset.to(device)
    time_steps_extrap = time_steps_extrap.to(device)

    train_y, test_y = utils.split_train_test(dataset, train_fraq=0.8)

    n_samples = len(dataset)
    input_dim = dataset.size(-1)

    batch_size = min(args.batch_size, args.n)
    train_dataloader = DataLoader(
        train_y,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=lambda batch: basic_collate_fn(
            batch, time_steps_extrap, data_type="train"))
    test_dataloader = DataLoader(
        test_y,
        batch_size=args.n,
        shuffle=False,
        collate_fn=lambda batch: basic_collate_fn(
            batch, time_steps_extrap, data_type="test"))

    data_objects = {  #"dataset_obj": dataset_obj, 
        "train_dataloader": utils.inf_generator(train_dataloader),
        "test_dataloader": utils.inf_generator(test_dataloader),
        "input_dim": input_dim,
        "n_train_batches": len(train_dataloader),
        "n_test_batches": len(test_dataloader)
    }

    return data_objects
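
A minimal sketch of driving parse_datasets, here for the 1d "periodic"
dataset. The args fields are inferred from the attributes the function reads
(dataset, n, timepoints, extrap, max_t, noise_weight, batch_size) and are
assumptions, not a confirmed argument parser.

# Hypothetical usage sketch -- the Namespace fields are assumptions.
from argparse import Namespace
import torch

args = Namespace(dataset="periodic", n=100, timepoints=100, extrap=0,
                 max_t=5.0, noise_weight=0.01, batch_size=50)
device = torch.device("cpu")

data = parse_datasets(args, device)
print(data["input_dim"], data["n_train_batches"], data["n_test_batches"])
batch = next(data["train_dataloader"])  # inf_generator cycles, so call next()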