def impute_missing(data, method):
    """Impute missing (nan) entries of a 3D tensor.

    Two methods are supported:
        mean - Fills nan values with the per-feature mean, computed over every
            sample and time step while ignoring nans.
        ffill - Forward fills the tensor first. Any nan values that remain
            (e.g. initial values that cannot be forward filled) are then
            filled with the feature means of the forward-filled data.

    Any `method` other than 'ffill' is treated as 'mean'.

    The input tensor is never modified; imputation is done on a private copy.

    Args:
        data (torch.Tensor): Tensor that unpacks to three dimensions (N, L, C),
            where C indexes the features.
        method (str): One of 'ffill' or 'mean'.

    Returns:
        (torch.Tensor, torch.Tensor): The imputed data of shape (N, L, C) and an
            integer mask of the same shape that is 1 where the *original* data
            was observed and 0 where it was nan.

    Note:
        A feature that is nan everywhere has no mean, so its entries stay nan
        (numpy emits a RuntimeWarning in that case).
    """
    # The mask must describe the raw input, so build it before any filling.
    mask = (~torch.isnan(data)).int()

    # If ffill then we ffill first
    if method == 'ffill':
        data = torch_ffill(data)

    # Now impute with the column means
    N, L, C = data.size()

    # `reshape(...).numpy()` returns a view onto the caller's tensor whenever the
    # tensor is contiguous, so clone first: the in-place fill below must not
    # leak back into `data`.
    flat = data.reshape(-1, C).clone().numpy()
    col_mean = np.nanmean(flat, axis=0)
    nan_rows, nan_cols = np.where(np.isnan(flat))
    flat[nan_rows, nan_cols] = col_mean[nan_cols]

    data_filled = torch.Tensor(flat).view(N, L, C)
    return data_filled, mask
def get_physionet2012_data(contained_value_fraction=0.25):
    """Load the processed Physionet 2012 mortality data.

    Features whose contained-value fraction is not strictly greater than
    `contained_value_fraction` are dropped from the controls, and the time
    channel (index 0 of the retained features) is forward filled.

    Args:
        contained_value_fraction (float): Threshold a feature's contained-value
            fraction must exceed for the feature to be kept.

    Returns:
        tuple: (controls, responses, output_dim, classification, original_idxs)
            where `responses` is a float column vector, `output_dim` is 1 and
            `classification` is True.
    """
    loc = DATA_DIR + '/processed/Physionet/Mortality2012'

    def _load(name):
        return load_pickle(loc + name)

    # Load
    controls = _load('/data.pkl')
    responses = _load('/labels.pkl').float().view(-1, 1)
    original_idxs = _load('/original_idxs.pkl')
    column_names = _load('/all_columns.pkl')
    # The final entry of the contained-values table is discarded.
    contained_values = _load('/contained_values.pkl').values[:-1]

    # Keep only the features that are observed often enough.
    idxs = np.argwhere(contained_values > contained_value_fraction).reshape(-1)

    # NOTE: `static_features` (the dropped columns) and `column_names` are not
    # used further in this function. `column_names[x] for x in idxs` gives the
    # names of the retained features if they are ever needed.
    retained = set(idxs.tolist())
    dropped_cols = [col for col in range(controls.size(2)) if col not in retained]
    static_features = controls[:, :, dropped_cols]
    controls = controls[:, :, idxs]

    # Time is the first retained index; forward fill it.
    controls[:, :, [0]] = torch_ffill(controls[:, :, [0]])

    # Params
    output_dim = 1
    classification = True

    return controls, responses, output_dim, classification, original_idxs
def prepare_gru_variant_data(controls, variant, intensity=False):
    """Given a tensor of controls with nan values, prepare the data for the GRU-D family of models.

    The output has shape [N, D, L, C], where D stacks the per-entry GRU-D
    ingredients. Before any subsetting the D dimension holds, in order:
        0 - the forward-filled controls,
        1 - the last observed value (also the forward-filled controls),
        2 - the mask from `evaluate_time_since_last_observation`,
        3 - the delta (time since last observation) from the same helper.

    Args:
        controls (torch.Tensor): The input stream data of shape [N, L, C]. Must have times as first index.
        variant (str): One of ['gru-d', 'gru-dt'].
        intensity (bool): Only active for gru-dt; selects which channels are kept (see Returns).

    Returns:
        torch.Tensor: Shape [N, D, L, C] where
            - variant == 'gru-d': D = 4 (all channels above),
            - variant == 'gru-dt', intensity=False: D = 3 (controls, mask, delta),
            - variant == 'gru-dt', intensity=True: D = 2 (controls, delta).

    Raises:
        AssertionError: If `variant` is not one of the supported values.
    """
    # Kept as an assert so the exception type seen by callers is unchanged.
    assert variant in [
        'gru-d', 'gru-dt'
    ], "`variant={}` not implemented for gru-variant data.".format(variant)

    # Get the dt and mask tensors. This must see the raw (nan-containing) data.
    delta, mask = evaluate_time_since_last_observation(controls)

    # Both the model input and the "previous value" channel are the forward-filled
    # controls (other fills could be applied to the input, but forward fill is
    # used), so fill once and reuse the result.
    filled = torch_ffill(controls)

    # Stack along a new leading dim, then move it behind the batch dim.
    new_controls = torch.stack((filled, filled, mask, delta)).transpose(0, 1)

    # Now get a subset of the data if model type is not grud
    if variant == 'gru-dt':
        # NOTE(review): `intensity=True` drops the mask channel and
        # `intensity=False` keeps it, which reads as the opposite of "includes
        # the intensity along with dt". Left unchanged because consumers may
        # depend on the resulting channel counts; confirm the intent.
        channels = [0, -1] if intensity else [0, -2, -1]
        new_controls = new_controls[:, channels, :, :]

    return new_controls
def get_physionet2019_data(max_length=72):
    """Load the processed Physionet 2019 data, keeping the vitals as controls.

    Sequences are truncated to at most `max_length` steps and a uniform time
    channel on [0, 1] is prepended as feature index 0.

    Args:
        max_length (int): Maximum number of time steps to keep. Data with fewer
            steps than this is kept at its own length.

    Returns:
        tuple: (controls, responses, output_dim, return_sequences, None) where
            `responses` is a float column vector, `output_dim` is 1 and
            `return_sequences` is True.
    """
    # Load
    loc = DATA_DIR + '/processed/Physionet2019'
    all_data = load_pickle(loc + '/data.pkl')
    responses = load_pickle(loc + '/labels.pkl').float().view(-1, 1)
    column_names = load_pickle(loc + '/column_names.pkl')

    # Reduce length
    all_data = all_data[:, :max_length]

    FEATURE_TYPES = {
        'vitals': ['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp'],
        'laboratory': [
            'EtCO2', 'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2',
            'AST', 'BUN', 'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine',
            'Bilirubin_direct', 'Glucose', 'Lactate', 'Magnesium', 'Phosphate',
            'Potassium', 'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT',
            'WBC', 'Fibrinogen', 'Platelets'
        ],
        'demographics':
        ['Age', 'Gender', 'Unit1', 'Unit2', 'HospAdmTime', 'Hospital'],
    }

    # Find where the cols exist
    vitals_idxs = index_getter(column_names, FEATURE_TYPES['vitals'])
    static_idxs = index_getter(
        column_names,
        FEATURE_TYPES['laboratory'] + FEATURE_TYPES['demographics'])

    # Subset
    static_data = all_data[:, :, static_idxs]
    controls = all_data[:, :, vitals_idxs]

    # Add time. Size the grid from the tensor itself: after truncation the
    # length is min(original length, max_length), so reshaping with
    # `max_length` breaks whenever the data is shorter than `max_length`.
    num_samples, length = controls.size(0), controls.size(1)
    times = torch.linspace(0, 1, length).repeat(num_samples).view(
        num_samples, length, 1)
    controls = torch.cat((times, controls), dim=2)

    # Forward fill the static data, anything not filled let be zero, and
    # consider only the terminal value.
    # NOTE(review): `static_data` is computed but not returned (the final
    # return slot is None); confirm whether it should be exposed or removed.
    static_data = torch_ffill(static_data)
    static_data = static_data[:, -1, :]
    static_data[torch.isnan(static_data)] = 0

    # Params
    output_dim = 1
    return_sequences = True

    return controls, responses, output_dim, return_sequences, None
def get_uji_data(min_num_samples=50,
                 ffill=True,
                 normalise='mmslocal',
                 irregular_times=False):
    """Load the processed UJI pen characters data and prepend a time channel.

    Args:
        min_num_samples (int): Labels that occur fewer than this many times are
            removed from the data.
        ffill (bool): Whether to forward fill the controls after normalisation.
        normalise (str): 'mmsglobal' applies a `TrickScaler` with 'mms' scaling;
            'mmslocal' min-max scales each sample by its own nan-aware min and
            max over the length dimension; any other value leaves the data as is.
        irregular_times (bool): If True, each sample gets its own [0, 1] grid
            with as many points as it has non-nan entries in its first channel,
            padded with nan. Otherwise all samples share one uniform grid over
            the full length.

    Returns:
        tuple: (controls, responses, output_dim, return_sequences, original_idxs)
            with `output_dim` = 1, `return_sequences` = False and
            `original_idxs` = None.
    """
    # Data
    folder = '../../data/processed/UJIPenChars2/UJIPenChars2'
    controls = load_pickle(folder + '/data.pkl')
    responses = load_pickle(folder + '/labels.pkl')
    char_labels = load_pickle(folder + '/alphabetic_labels.pkl')

    # Choose time definition
    if irregular_times:
        # Number of observed (non-nan) steps per sample, read off channel 0.
        observed_counts = (~torch.isnan(controls[:, :, 0])).sum(axis=1)
        per_sample_grids = [
            torch.linspace(0, 1, count) for count in observed_counts
        ]
        padded = pad_sequence(per_sample_grids, padding_value=float('nan'))
        # pad_sequence puts length first by default; flip so the batch dim leads.
        times = padded.T.unsqueeze(-1)
    else:
        num_samples, length = controls.size(0), controls.size(1)
        shared_grid = torch.linspace(0, 1, length)
        times = shared_grid.repeat(num_samples).view(-1, length, 1)
    controls = torch.cat((times, controls), dim=2)

    # Preprocess
    if normalise == 'mmsglobal':
        scaler = TrickScaler(scaling='mms')
        controls = scaler.fit_transform(controls)
    elif normalise == 'mmslocal':
        sample_max = torch.Tensor(np.nanmax(controls, axis=1))
        sample_min = torch.Tensor(np.nanmin(controls, axis=1))
        # Move length to the front so the per-sample extrema broadcast over it.
        shifted = controls.transpose(0, 1) - sample_min
        controls = (shifted / (sample_max - sample_min)).transpose(0, 1)

    if ffill:
        controls = torch_ffill(controls)

    # Remove any label with fewer than `min_num_samples` samples
    labels, label_counts = torch.unique(responses, return_counts=True)
    rare_labels = labels[label_counts < min_num_samples]
    keep = torch.Tensor([x not in rare_labels for x in responses]).to(bool)
    controls = controls[keep]
    responses = responses[keep]
    char_labels = char_labels[keep]

    output_dim = 1
    return_sequences = False
    original_idxs = None

    return controls, responses, output_dim, return_sequences, original_idxs
def drop_nan_imputation(controls):
    """The method of imputation where we simply drop all interior values (currently disabled).

    This function is not implemented: it always raises. The previous body
    depended on a `times` argument that is no longer part of the signature, so
    it could not run; it has been removed and not left as unreachable code.

    Intended behaviour, for whoever restores it:
        1. For each sample, drop every row that is fully nan (that is, no
           measurement/new data was taken at that step), together with the
           corresponding time.
        2. Forward fill any nans that remain, which occur because observations
           are not column uniform.
        3. Pad each sample by repeating its final row (and final time) so all
           samples share one length and can be stacked into a tensor.
        4. Return the per-sample times and the reduced controls. Crucially,
           the times differ between samples.

    Notes for a re-implementation:
        - A source of times is required, either a restored argument or a time
          channel inside `controls`. If times live inside `controls`, that
          channel must be excluded when deciding whether a row is fully nan.
        - The common padded length must be the largest number of *rows with at
          least one observation* in any sample. A per-feature observation count
          can be smaller than that and would leave samples with unequal lengths.
        - A sample with no observed rows has no final row to repeat and needs
          explicit handling.

    Args:
        controls (torch.Tensor): The input stream data containing nan values.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError(
        'Needs to be fixed as `times` has been removed from the args`')