Beispiel #1
0
def impute_missing(data, method):
    """Impute the nan entries of a 3D tensor and return the observation mask.

    The two methods here are 'ffill' and 'mean'.
        mean - Fills nan values with the feature means (taken over every sample and time step).
        ffill - Forward fills the tensor with `torch_ffill`. Any nan values still present afterwards (those that
            cannot be ffilled) are then filled with the feature means.

    Only `method == 'ffill'` is checked explicitly; any other value goes straight to mean imputation.

    The input tensor is never modified: the mean fill is performed on a copy.

    Args:
        data (torch.Tensor): Tensor of shape [N, L, C], possibly containing nan values.
        method (str): One of 'ffill' or 'mean'.

    Returns:
        (torch.Tensor, torch.Tensor): The imputed data of shape [N, L, C] (built with `torch.Tensor`) and an integer
            mask of the same shape as `data` that is 1 where the original entry was observed and 0 where it was nan.
    """
    # The mask must be taken before any filling so that it reflects the original observations.
    mask = (~torch.isnan(data)).int()

    # If ffill then we ffill first
    if method == 'ffill':
        data = torch_ffill(data)

    # Now impute with the column means
    N, L, C = data.size()
    # `reshape` may return a view and `.numpy()` shares memory with the tensor, so without the copy the in-place
    # assignment below would overwrite the nan entries of the caller's tensor.
    data_reshaped = data.reshape(-1, C).numpy().copy()
    col_mean = np.nanmean(data_reshaped, axis=0)
    # `inds` is a (row, column) pair of index arrays; the column index selects which feature mean to use.
    inds = np.where(np.isnan(data_reshaped))
    data_reshaped[inds] = np.take(col_mean, inds[1])
    data_filled = torch.Tensor(data_reshaped).view(N, L, C)

    return data_filled, mask
Beispiel #2
0
def get_physionet2012_data(contained_value_fraction=0.25):
    """Load the processed Physionet 2012 mortality data, keeping only sufficiently populated features.

    The pickles are read from `DATA_DIR + '/processed/Physionet/Mortality2012'`. A feature (last-axis channel) of the
    data tensor is retained only when its entry in `contained_values.pkl` is strictly greater than
    `contained_value_fraction`. The first retained channel is treated as time and is forward filled with
    `torch_ffill`; the other channels are returned as stored.

    Args:
        contained_value_fraction (float): Threshold on the per-feature contained-value score; features at or below
            it are dropped.

    Returns:
        tuple: `(controls, responses, output_dim, classification, original_idxs)` where `controls` is the
            feature-subset data with its time channel forward filled, `responses` is the label tensor cast to float
            and reshaped to a single column, `output_dim` is 1, `classification` is True and `original_idxs` is the
            object loaded from `original_idxs.pkl`.
    """
    # Load
    loc = DATA_DIR + '/processed/Physionet/Mortality2012'
    controls = load_pickle(loc + '/data.pkl')
    responses = load_pickle(loc + '/labels.pkl').float().view(-1, 1)
    original_idxs = load_pickle(loc + '/original_idxs.pkl')
    # The final entry is dropped before thresholding -- presumably it does not correspond to a data channel
    # (TODO confirm against the preprocessing script).
    contained_values = load_pickle(loc + '/contained_values.pkl').values[:-1]

    # Remove features that contain < x% of values
    idxs = np.argwhere(contained_values > contained_value_fraction).reshape(-1)
    controls = controls[:, :, idxs]

    # Params
    output_dim = 1
    classification = True

    # Time is first idx. Indexing with `[0]` rather than `0` keeps the trailing channel dimension.
    times = controls[:, :, [0]]
    times = torch_ffill(times)
    controls[:, :, [0]] = times

    return controls, responses, output_dim, classification, original_idxs
Beispiel #3
0
def prepare_gru_variant_data(controls, variant, intensity=False):
    """Given a tensor of controls with nan values, this will prepare the data for the GRU-D style models.

    The output has shape [N, D, L, C] where D is an additional dimension that stores the GRU-D tensors. They are
    stacked in the order: the forward filled data, the last observed value of the data (the same forward fill), a
    mask denoting whether a value was seen at that point, and a delta denoting how long it has been since an entry
    was last observed. The mask and delta come from `evaluate_time_since_last_observation`.

    Args:
        controls (torch.Tensor): The input stream data of shape [N, L, C]. Must have times as first index.
        variant (str): One of ['gru-d', 'gru-dt'].
        intensity (bool): Only active for gru-dt. Selects which channels are kept (see Returns).

    Returns:
        torch.Tensor: Shape [N, D, L, C]. For 'gru-d', D = 4 (filled data, previous value, mask, delta). For
            'gru-dt', D = 3 (filled data, mask, delta) when `intensity` is False and D = 2 (filled data, delta) when
            `intensity` is True.

    Raises:
        AssertionError: If `variant` is not one of the supported values.
    """
    assert variant in [
        'gru-d', 'gru-dt'
    ], "`variant={}` not implemented for gru-variant data.".format(variant)

    # Get the dt and mask tensors. This has to see the raw controls, before any filling.
    delta, mask = evaluate_time_since_last_observation(controls)

    # Both the "previous value" channel and the data channel are the forward filled controls (other fills could be
    # applied to the data channel, but forward fill is used), so the fill is computed a single time.
    filled = torch_ffill(controls)

    # Stack along a new leading axis then swap it behind the batch axis: [4, N, L, C] -> [N, 4, L, C].
    new_controls = torch.stack((filled, filled, mask, delta)).transpose(0, 1)

    # Now get a subset of the data if model type is not grud
    if variant == 'gru-dt':
        # NOTE(review): `intensity=True` drops the mask channel while `intensity=False` keeps it, which reads as the
        # opposite of "includes the intensity". Behaviour is kept as-is because callers depend on the resulting
        # channel count -- confirm the intended meaning before changing it.
        if intensity:
            new_controls = new_controls[:, [0, -1], :, :]
        else:
            new_controls = new_controls[:, [0, -2, -1], :, :]

    return new_controls
Beispiel #4
0
def get_physionet2019_data(max_length=72):
    """Load the processed Physionet 2019 data and build the vitals control stream with a time channel.

    The pickles are read from `DATA_DIR + '/processed/Physionet2019'`. Sequences are truncated to at most
    `max_length` steps, the vitals columns are selected as the controls, and a uniform time grid on [0, 1] is
    prepended as the first channel.

    Args:
        max_length (int): Maximum number of time steps to keep. Data that is already shorter is used at its own
            length.

    Returns:
        tuple: `(controls, responses, output_dim, return_sequences, None)` where `controls` holds the time channel
            followed by the vitals channels, `responses` is the label tensor cast to float and reshaped to a single
            column, `output_dim` is 1 and `return_sequences` is True.
    """
    # Load
    loc = DATA_DIR + '/processed/Physionet2019'
    all_data = load_pickle(loc + '/data.pkl')
    responses = load_pickle(loc + '/labels.pkl').float().view(-1, 1)
    column_names = load_pickle(loc + '/column_names.pkl')

    # Reduce length
    all_data = all_data[:, :max_length]

    FEATURE_TYPES = {
        'vitals': ['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp'],
        'laboratory': [
            'EtCO2', 'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2',
            'AST', 'BUN', 'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine',
            'Bilirubin_direct', 'Glucose', 'Lactate', 'Magnesium', 'Phosphate',
            'Potassium', 'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT',
            'WBC', 'Fibrinogen', 'Platelets'
        ],
        'demographics':
        ['Age', 'Gender', 'Unit1', 'Unit2', 'HospAdmTime', 'Hospital'],
    }

    # Find where the cols exist
    vitals_idxs = index_getter(column_names, FEATURE_TYPES['vitals'])
    static_idxs = index_getter(
        column_names,
        FEATURE_TYPES['laboratory'] + FEATURE_TYPES['demographics'])

    # Subset
    static_data = all_data[:, :, static_idxs]
    controls = all_data[:, :, vitals_idxs]

    # Add time. The grid is shaped with the actual sequence length rather than `max_length`: slicing with
    # `[:, :max_length]` yields fewer steps when the stored data is shorter, and reshaping to `max_length` would
    # then fail or produce a tensor that cannot be concatenated with the controls.
    num_samples, length = controls.size(0), controls.size(1)
    times = torch.linspace(0, 1, length).repeat(num_samples).view(
        num_samples, length, 1)
    controls = torch.cat((times, controls), dim=2)

    # Forward fill the static data, anything not filled let be zero, and consider only the terminal value.
    # NOTE(review): `static_data` is prepared here but is not part of the return value -- confirm whether it should
    # be returned or this preprocessing dropped.
    static_data = torch_ffill(static_data)
    static_data = static_data[:, -1, :]
    static_data[torch.isnan(static_data)] = 0

    # Params
    output_dim = 1
    return_sequences = True

    return controls, responses, output_dim, return_sequences, None
Beispiel #5
0
def get_uji_data(min_num_samples=50,
                 ffill=True,
                 normalise='mmslocal',
                 irregular_times=False):
    """Load the processed UJI pen characters data and prepare it as a control stream.

    A time channel is prepended to the loaded data, the result is optionally normalised and forward filled, and
    every sample whose label occurs fewer than `min_num_samples` times is removed.

    Args:
        min_num_samples (int): Labels with fewer occurrences than this are dropped together with their samples.
        ffill (bool): If True, forward fill the controls with `torch_ffill` after normalisation.
        normalise (str): 'mmsglobal' applies `TrickScaler(scaling='mms')`; 'mmslocal' min-max scales each sample
            using its own nan-ignoring extrema along the length axis; any other value leaves the data unscaled.
        irregular_times (bool): If True, each sample gets its own [0, 1] grid with as many points as it has non-nan
            entries in its first channel, padded with nan. Otherwise all samples share one uniform grid over the
            full length.

    Returns:
        tuple: `(controls, responses, output_dim, return_sequences, original_idxs)` with `output_dim = 1`,
            `return_sequences = False` and `original_idxs = None`.
    """
    # Data
    folder = '../../data/processed/UJIPenChars2/UJIPenChars2'
    controls = load_pickle(folder + '/data.pkl')
    responses = load_pickle(folder + '/labels.pkl')
    char_labels = load_pickle(folder + '/alphabetic_labels.pkl')

    # Build the time channel
    if not irregular_times:
        n_steps = controls.size(1)
        grid = torch.linspace(0, 1, n_steps)
        times = grid.repeat(controls.size(0)).view(-1, n_steps, 1)
    else:
        observed_counts = (~torch.isnan(controls[:, :, 0])).sum(axis=1)
        per_sample_grids = [
            torch.linspace(0, 1, count) for count in observed_counts
        ]
        # pad_sequence stacks with the length axis first, hence the transpose back to sample-first.
        padded = pad_sequence(per_sample_grids, padding_value=float('nan'))
        times = padded.T.unsqueeze(-1)
    controls = torch.cat((times, controls), dim=2)

    # Normalisation
    if normalise == 'mmsglobal':
        scaler = TrickScaler(scaling='mms')
        controls = scaler.fit_transform(controls)
    elif normalise == 'mmslocal':
        maxs = torch.Tensor(np.nanmax(controls, axis=1))
        mins = torch.Tensor(np.nanmin(controls, axis=1))
        # Work length-first so the per-sample extrema broadcast over the length axis.
        length_first = controls.transpose(0, 1)
        scaled = (length_first - mins) / (maxs - mins)
        controls = scaled.transpose(0, 1)

    if ffill:
        controls = torch_ffill(controls)

    # Drop every sample whose label is rarer than `min_num_samples`
    labels, label_counts = torch.unique(responses, return_counts=True)
    rare_labels = labels[label_counts < min_num_samples]
    keep = torch.Tensor(
        [label not in rare_labels for label in responses]).to(bool)
    controls = controls[keep]
    responses = responses[keep]
    char_labels = char_labels[keep]

    return controls, responses, 1, False, None
Beispiel #6
0
def drop_nan_imputation(controls):
    """The method of imputation where we simply drop all interior values. Currently not implemented.

    Intended behaviour: if the data is filled with nan values, first remove all rows that are fully nan (that is, no
    measurement/new data was taken), then forward fill any remaining entries and pad with the final value repeated
    to make everything fit into a tensor. The output would then be a tensor such that for each batch element, values
    are only recorded when new data was actually observed, together with the corresponding times at which these
    events happened. Crucially, these times are different for different samples.

    The implementation relied on a separate `times` argument that is no longer part of the signature, so the
    function raises unconditionally until it is reworked.

    Args:
        controls (torch.Tensor): The input stream data. Unused while the function is disabled.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError(
        'Needs to be fixed as `times` has been removed from the args`')