def load_data_time(reader, discretizer, normalizer, max_seq_len=300,
                   mask_value=0., small_part=False, return_names=False):
    N = reader.get_number_of_examples()
    if small_part:
        N = 1000
    ret = common_utils.read_chunk(reader, N)
    data = ret["X"]
    ts = ret["t"]
    labels = ret["y"]
    names = ret["name"]
    data, time = zip(*[discretizer.transform(X, end=t)[:2]
                       for (X, t) in zip(data, ts)])
    if normalizer is not None:
        data = [normalizer.transform(X) for X in data]
    # Pad (with mask_value) or truncate every sequence to max_seq_len steps.
    data = [np.concatenate((item, mask_value + np.zeros((max_seq_len - len(item), item.shape[1]))))
            if len(item) < max_seq_len else item[:max_seq_len]
            for item in data]
    time = [np.concatenate((item, mask_value + np.zeros(max_seq_len - len(item))))
            if len(item) < max_seq_len else item[:max_seq_len]
            for item in time]
    whole_data = (np.array(data), np.array(time), labels)
    if not return_names:
        return whole_data
    return {"data": whole_data, "names": names}
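# A minimal, self-contained sketch of the pad-or-truncate step above, assuming
# numpy is imported as np (as in the surrounding code); the helper name
# `pad_or_truncate` is illustrative, not part of the original code:
def pad_or_truncate(seq, max_seq_len=300, mask_value=0.):
    # seq: (T, F) array; returns a (max_seq_len, F) array, mask_value-padded.
    if len(seq) >= max_seq_len:
        return seq[:max_seq_len]
    pad = mask_value + np.zeros((max_seq_len - len(seq), seq.shape[1]))
    return np.concatenate((seq, pad))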
def _generator(self):
    B = self.batch_size
    while True:
        if self.shuffle:
            self.reader.random_shuffle()
        remaining = self.n_examples
        while remaining > 0:
            current_size = min(self.chunk_size, remaining)
            remaining -= current_size
            ret = common_utils.read_chunk(self.reader, current_size)
            Xs = ret["X"]
            ts = ret["t"]
            ys = ret["y"]
            names = ret["name"]
            Xs = preprocess_chunk(Xs, ts, self.discretizer, self.normalizer)
            (Xs, ys, ts, names) = common_utils.sort_and_shuffle([Xs, ys, ts, names], B)
            for i in range(0, current_size, B):
                X = common_utils.pad_zeros(Xs[i:i + B])
                y = np.array(ys[i:i + B])
                batch_names = names[i:i + B]
                batch_ts = ts[i:i + B]
                batch_data = (X, y)
                if not self.return_names:
                    yield batch_data
                else:
                    yield {"data": batch_data, "names": batch_names, "ts": batch_ts}
def load_data(reader, discretizer, normalizer, diseases_embedding, return_names=False):
    N = reader.get_number_of_examples()
    ret = common_utils.read_chunk(reader, N)
    data = ret["X"]
    ts = ret["t"]
    labels = ret["y"]
    names = ret["name"]
    data = [discretizer.transform_end_t_hours(X, los=t)[0]
            for (X, t) in zip(data, ts)]
    if normalizer is not None:
        data = [normalizer.transform(X) for X in data]
    # Append the per-stay disease embedding to every timestep.
    data = [np.hstack([X, [d] * len(X)])
            for (X, d) in zip(data, diseases_embedding)]
    data = nn_utils.pad_zeros(data)
    whole_data = (data, labels)
    if not return_names:
        return whole_data
    return {"data": whole_data, "names": names}
def _generator(self):
    B = self.batch_size
    while True:
        if self.shuffle:
            self.reader.random_shuffle()
        remaining = self.n_examples
        while remaining > 0:
            current_size = min(self.chunk_size, remaining)
            remaining -= current_size
            ret = common_utils.read_chunk(self.reader, current_size)
            data = ret["X"]
            ts = ret["t"]
            labels = ret["y"]
            names = ret["name"]
            data = preprocess_chunk(data, ts, self.discretizer, self.normalizer)
            # Keep ts/names in the shuffle so the yielded metadata stays
            # aligned with the batches.
            (data, labels, ts, names) = common_utils.sort_and_shuffle([data, labels, ts, names], B)
            for i in range(0, current_size, B):
                X = nn_utils.pad_zeros(data[i:i + B])
                y = np.array(labels[i:i + B])
                batch_data = (X, y)
                if not self.return_names:
                    yield batch_data
                else:
                    yield {"data": batch_data,
                           "names": names[i:i + B],
                           "ts": ts[i:i + B]}
def load_data_48_17(reader, discretizer, normalizer, suffix, small_part=False, return_names=False):
    CACHE_PATH = "cache/in_hospital_mortality/torch_48_17_{}/".format(suffix)
    if not os.path.exists(CACHE_PATH):
        N = reader.get_number_of_examples()
        if small_part:
            N = 1000
        ret = common_utils.read_chunk(reader, N)
        data = ret["X"]
        ts = ret["t"]
        labels = ret["y"]
        names = ret["name"]
        data = [discretizer.transform(X, end=t)[0] for (X, t) in zip(data, ts)]
        if normalizer is not None:
            data = [normalizer.transform(X) for X in data]
        whole_data = (np.array(data), labels)
        os.makedirs(CACHE_PATH, exist_ok=True)
        np.savez(os.path.join(CACHE_PATH, "data.npz"),
                 data=whole_data[0], labels=whole_data[1], names=names)
    else:
        processed_data_file = np.load(os.path.join(CACHE_PATH, "data.npz"))
        whole_data = (processed_data_file["data"], processed_data_file["labels"])
        names = processed_data_file["names"]
        print("Retrieved cached data, data shape:", whole_data[0].shape)
    if not return_names:
        return whole_data
    return {"data": whole_data, "names": names}
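# A hedged sketch of the .npz cache round-trip used by load_data_48_17; the
# helper name `cached_npz` and its interface are assumptions for illustration:
def cached_npz(path, build):
    # build() must return a dict of array-like values; built once, reloaded after.
    if not os.path.exists(path):
        arrays = build()
        os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
        np.savez(path, **arrays)
        return arrays
    f = np.load(path, allow_pickle=True)
    return {k: f[k] for k in f.files}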
def _generator(self):
    B = self.batch_size
    while True:
        if self.shuffle:
            self.reader.random_shuffle()
        remaining = self.n_examples
        while remaining > 0:
            current_size = min(self.chunk_size, remaining)
            remaining -= current_size
            ret = common_utils.read_chunk(self.reader, current_size)
            Xs = ret["X"]
            ts = ret["t"]
            ys = ret["y"]
            names = ret["name"]
            Xs = preprocess_chunk(Xs, ts, self.discretizer, self.normalizer)
            (Xs, ys, ts, names) = common_utils.sort_and_shuffle([Xs, ys, ts, names], B)
            for i in range(0, current_size, B):
                X = nn_utils.pad_zeros(Xs[i:i + B])
                y = np.array(ys[i:i + B])
                batch_names = names[i:i + B]
                batch_ts = ts[i:i + B]
                batch_data = (X, y)
                if not self.return_names:
                    yield batch_data
                else:
                    yield {"data": batch_data, "names": batch_names, "ts": batch_ts}
def _generator(self):
    B = self.batch_size
    while True:
        if self.shuffle:
            self.reader.random_shuffle()
        remaining = self.n_examples
        while remaining > 0:
            current_size = min(self.chunk_size, remaining)
            remaining -= current_size
            ret = common_utils.read_chunk(self.reader, current_size)
            Xs = ret["X"]
            ts = ret["t"]
            ys = ret["y"]
            names = ret["name"]
            Xs, Ts = preprocess_chunk_time(Xs, ts, self.discretizer, self.normalizer,
                                           max_seq_len=1200, mask_value=0.)
            (Xs, Ts, ys, ts, names) = common_utils.sort_and_shuffle(
                [Xs, Ts, ys, ts, names], B)
            for i in range(0, current_size, B):
                X = common_utils.pad_zeros(Xs[i:i + B])
                T = common_utils.pad_zeros(Ts[i:i + B])
                y = ys[i:i + B]
                y_true = np.array(y)
                batch_names = names[i:i + B]
                batch_ts = ts[i:i + B]
                # Optionally bin the length-of-stay regression target.
                if self.partition == 'log':
                    y = [metrics.get_bin_log(x, 10) for x in y]
                if self.partition == 'custom':
                    y = [metrics.get_bin_custom(x, 10) for x in y]
                y = np.array(y)
                if self.use_time:
                    if self.return_y_true:
                        batch_data = ([X, T], y, y_true)
                    else:
                        batch_data = ([X, T], y)
                else:
                    if self.return_y_true:
                        batch_data = (X, y, y_true)
                    else:
                        batch_data = (X, y)
                if not self.return_names:
                    yield batch_data
                else:
                    yield {"data": batch_data, "names": batch_names, "ts": batch_ts}
def read_and_extract_features(reader):
    ret = common_utils.read_chunk(reader, reader.get_number_of_examples())
    # ret = common_utils.read_chunk(reader, 100)
    chunk = ret["X"]
    y = ret["y"]
    header = ret["header"]
    X = common_utils.extract_features_from_rawdata(chunk, header,
                                                   args.period, args.features)
    return (X, y)
def read_and_extract_features(reader, period, features):
    ret = common_utils.read_chunk(reader, reader.get_number_of_examples())
    # ret = common_utils.read_chunk(reader, 100)
    print("len(ret['X'])", len(ret['X']))
    print("ret['X'][0].shape", ret['X'][0].shape)
    X = common_utils.extract_features_from_rawdata(ret['X'], ret['header'],
                                                   period, features)
    return (X, ret['y'], ret['name'])
def __init__(self, reader, discretizer, normalizer, ihm_pos, partition,
             target_repl, batch_size, small_part, shuffle, return_names=False):
    self.discretizer = discretizer
    self.normalizer = normalizer
    self.ihm_pos = ihm_pos
    self.partition = partition
    self.target_repl = target_repl
    self.batch_size = batch_size
    self.shuffle = shuffle
    self.return_names = return_names

    N = reader.get_number_of_examples()
    if small_part:
        N = 1000
    self.steps = (N + batch_size - 1) // batch_size
    self.lock = threading.Lock()

    ret = common_utils.read_chunk(reader, N)
    Xs = ret['X']
    ts = ret['t']
    ihms = ret['ihm']
    loss = ret['los']
    phenos = ret['pheno']
    decomps = ret['decomp']

    self.data = dict()
    self.data['pheno_ts'] = ts
    self.data['names'] = ret['name']
    self.data['decomp_ts'] = []
    self.data['los_ts'] = []
    for i in range(N):
        # Positions where the decompensation / LOS masks are active.
        self.data['decomp_ts'].append(
            [pos for pos, m in enumerate(decomps[i][0]) if m == 1])
        self.data['los_ts'].append(
            [pos for pos, m in enumerate(loss[i][0]) if m == 1])
        (Xs[i], ihms[i], decomps[i], loss[i], phenos[i]) = \
            self._preprocess_single(Xs[i], ts[i], ihms[i], decomps[i], loss[i], phenos[i])
    self.data['X'] = Xs
    self.data['ihm_M'] = [x[0] for x in ihms]
    self.data['ihm_y'] = [x[1] for x in ihms]
    self.data['decomp_M'] = [x[0] for x in decomps]
    self.data['decomp_y'] = [x[1] for x in decomps]
    self.data['los_M'] = [x[0] for x in loss]
    self.data['los_y'] = [x[1] for x in loss]
    self.data['pheno_y'] = phenos

    self.generator = self._generator()
def read_and_extract_features(reader, period, features):
    ret = common_utils.read_chunk(reader, reader.get_number_of_examples())
    # ret = common_utils.read_chunk(reader, 100)
    # ret: 'X' holds the raw attributes, 'y' the targets, 'header' the CSV
    # header, 't' the time limits (48h for mortality), 'name' the CSV file names.
    X = common_utils.extract_features_from_rawdata(ret['X'], ret['header'],
                                                   period, features)
    # X holds the hand-engineered features.
    return (X, ret['y'], ret['name'])
def load_train_data(reader, discretizer, normalizer, diseases_embedding, return_names=False):
    N = reader.get_number_of_examples()
    ret = common_utils.read_chunk(reader, N)
    data = ret["X"]
    ts = ret["t"]
    labels = ret["y"]
    names = ret["name"]
    data = [discretizer.transform_first_t_hours(X, end=t)[0]
            for (X, t) in zip(data, ts)]
    if normalizer is not None:
        data = [normalizer.transform(X) for X in data]
    data = [np.hstack([X, [d] * len(X)])
            for (X, d) in zip(data, diseases_embedding)]

    # Undersample the negative class to a 1:1 ratio with the positives.
    labels_1, labels_0 = [], []
    data_1, data_0 = [], []
    for i in range(len(labels)):
        if labels[i] == 1:
            labels_1.append(labels[i])
            data_1.append(data[i])
        elif labels[i] == 0:
            labels_0.append(labels[i])
            data_0.append(data[i])
    print('labels_1:', len(labels_1))
    print('labels_0:', len(labels_0))
    indices = np.random.choice(len(labels_0), len(labels_1), replace=False)
    labels_0_sample = [labels_0[idx] for idx in indices]
    print('len(labels_0_sample):', len(labels_0_sample))
    data_0_sample = [data_0[idx] for idx in indices]
    print('len(data_0_sample):', len(data_0_sample))

    data_new = data_0_sample + data_1
    label_new = labels_0_sample + labels_1
    c = list(zip(data_new, label_new))
    random.shuffle(c)  # requires the stdlib `random` module
    data_new, label_new = zip(*c)
    data_new = list(data_new)
    label_new = list(label_new)
    print('data_new:', len(data_new))
    print('label_new:', len(label_new))

    data = nn_utils.pad_zeros(data_new)
    whole_data = (data, label_new)
    if not return_names:
        return whole_data
    return {"data": whole_data}
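# The balancing logic above, isolated as a small numpy-only sketch; the helper
# name and the `seed` parameter are illustrative, not part of the original code:
def undersample_negatives(data, labels, seed=None):
    rng = np.random.default_rng(seed)
    pos = [i for i, y in enumerate(labels) if y == 1]
    neg = [i for i, y in enumerate(labels) if y == 0]
    keep = rng.choice(len(neg), size=len(pos), replace=False)
    idx = pos + [neg[i] for i in keep]
    rng.shuffle(idx)  # in-place shuffle of the index list
    return [data[i] for i in idx], [labels[i] for i in idx]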
def __init__(self, reader, discretizer, normalizer, ihm_pos, partition,
             target_repl, batch_size, small_part, shuffle, return_names=False):
    self.discretizer = discretizer
    self.normalizer = normalizer
    self.ihm_pos = ihm_pos
    self.partition = partition
    self.target_repl = target_repl
    self.batch_size = batch_size
    self.shuffle = shuffle
    self.return_names = return_names

    N = reader.get_number_of_examples()
    if small_part:
        N = 1000
    self.steps = (N + batch_size - 1) // batch_size
    self.lock = threading.Lock()

    ret = common_utils.read_chunk(reader, N)
    Xs = ret["X"]
    ts = ret["t"]
    ihms = ret["ihm"]
    loss = ret["los"]
    phenos = ret["pheno"]
    decomps = ret["decomp"]

    self.data = dict()
    self.data["decomp_ts"] = [x[0] for x in decomps]
    self.data["los_ts"] = [x[0] for x in loss]
    for i in range(N):
        (Xs[i], ihms[i], decomps[i], loss[i], phenos[i]) = \
            self._preprocess_single(Xs[i], ts[i], ihms[i], decomps[i], loss[i], phenos[i])
    self.data["X"] = Xs
    self.data["ihm_M"] = [x[0] for x in ihms]
    self.data["ihm_y"] = [x[1] for x in ihms]
    self.data["decomp_M"] = [x[0] for x in decomps]
    self.data["decomp_y"] = [x[1] for x in decomps]
    self.data["los_M"] = [x[0] for x in loss]
    self.data["los_y"] = [x[1] for x in loss]
    self.data["pheno_y"] = phenos
    self.data["names"] = ret["name"]
    self.data["ts"] = ts

    self.generator = self._generator()
def _generator(self):
    print(f"examples: {self.n_examples} steps: {self.steps}")
    B = self.batch_size
    while True:
        if self.shuffle:
            self.reader.random_shuffle()
        # Reads ~15% more than n_examples per pass.
        remaining = int(self.n_examples * 1.15)
        while remaining > 0:
            current_size = min(self.chunk_size, remaining)
            remaining -= current_size
            print(f"Reading chunk size: {current_size} with {remaining} remaining")
            ret = common_utils.read_chunk(self.reader, current_size)
            Xs = ret["X"]
            ts = ret["t"]
            ys = ret["y"]
            names = ret["name"]
            print(f"len(Xs): {len(Xs)}")
            Xs = preprocess_chunk(Xs, ts, self.discretizer, self.normalizer)
            (Xs, ys, ts, names) = common_utils.sort_and_shuffle([Xs, ys, ts, names], B)
            for i in range(0, current_size, B):
                X = common_utils.pad_zeros(Xs[i:i + B])
                y = ys[i:i + B]
                y_true = np.array(y)
                batch_names = names[i:i + B]
                batch_ts = ts[i:i + B]
                if self.partition == 'log':
                    y = [metrics.get_bin_log(x, 10) for x in y]
                if self.partition == 'custom':
                    y = [metrics.get_bin_custom(x, 10) for x in y]
                y = np.array(y)
                # aflanders: debug - optionally convert to tensors
                # X = tf.convert_to_tensor(X)
                # y = tf.convert_to_tensor(y)
                # y_true = tf.convert_to_tensor(y_true)
                if self.return_y_true:
                    batch_data = (X, y, y_true)
                else:
                    batch_data = (X, y)
                if not self.return_names:
                    yield batch_data
                else:
                    yield {"data": batch_data, "names": batch_names, "ts": batch_ts}
def read_and_extract_features(reader, count):
    read_chunk_size = 1000
    assert count % read_chunk_size == 0
    Xs = []
    ys = []
    for i in range(count // read_chunk_size):
        ret = common_utils.read_chunk(reader, read_chunk_size)
        chunk = ret["X"]
        y = ret["y"]
        header = ret["header"]
        X = common_utils.extract_features_from_rawdata(chunk, header,
                                                       args.period, args.features)
        Xs.append(X)
        ys += y
    Xs = np.concatenate(Xs, axis=0)
    return (Xs, ys)
def read_and_extract_features(reader, count, period, features):
    read_chunk_size = 1000
    Xs = []
    ys = []
    names = []
    ts = []
    for i in range(0, count, read_chunk_size):
        j = min(count, i + read_chunk_size)
        ret = common_utils.read_chunk(reader, j - i)
        X = common_utils.extract_features_from_rawdata(ret['X'], ret['header'],
                                                       period, features)
        Xs.append(X)
        ys += ret['y']
        names += ret['name']
        ts += ret['t']
    Xs = np.concatenate(Xs, axis=0)
    return (Xs, ys, names, ts)
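# The loop above walks the listfile in fixed-size pieces; a tiny sketch of the
# index arithmetic (helper name is illustrative):
def chunk_bounds(count, chunk_size=1000):
    for i in range(0, count, chunk_size):
        yield i, min(count, i + chunk_size)
# e.g. list(chunk_bounds(2500)) == [(0, 1000), (1000, 2000), (2000, 2500)]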
def _load_data(self, reader, discretizer, normalizer, small_part=False):
    N = reader.get_number_of_examples()
    if small_part:
        N = 1000
    ret = common_utils.read_chunk(reader, N)
    data = ret["X"]
    ts = ret["t"]
    ys = ret["y"]
    names = ret["name"]
    data = [discretizer.transform(X, end=t)[0] for (X, t) in zip(data, ts)]
    if normalizer is not None:
        data = [normalizer.transform(X) for X in data]
    ys = np.array(ys, dtype=np.int32)
    self.data = (data, ys)
    self.ts = ts
    self.names = names
def load_data(reader, discretizer, normalizer, small_part=False, return_names=False):
    N = reader.get_number_of_examples()
    if small_part:
        N = 1000
    ret = common_utils.read_chunk(reader, N)
    data = ret["X"]
    ts = ret["t"]
    names = ret["name"]
    # Keep only the last t hours of each discretized stay.
    data = [discretizer.transform(X, end=t)[0][-int(t):]
            for (X, t) in zip(data, ts)]
    if normalizer is not None:
        data = [normalizer.transform(X) for X in data]
    # Split every stay into two halves: [-t, -t/2) and [-t/2, end).
    whole_data = (np.array([X[-int(t):-int(t / 2)] for (X, t) in zip(data, ts)]),
                  np.array([X[-int(t / 2):] for (X, t) in zip(data, ts)]))
    if not return_names:
        return whole_data
    return {"data": whole_data, "names": names}
def load_data(reader, discretizer, normalizer, small_part=False, return_names=False):
    N = reader.get_number_of_examples()
    if small_part:
        N = 1000
    ret = common_utils.read_chunk(reader, N)
    data = ret["X"]
    ts = ret["t"]
    labels = ret["y"]
    names = ret["name"]
    data = [discretizer.transform(X, end=t)[0] for (X, t) in zip(data, ts)]
    if normalizer is not None:
        data = [normalizer.transform(X) for X in data]
    whole_data = (np.array(data), labels)
    if not return_names:
        return whole_data
    return {"data": whole_data, "names": names}
def read_and_extract_features(reader, count, period, features):
    read_chunk_size = 1000
    Xs = []
    ys = []
    names = []
    ts = []
    for i in range(0, count, read_chunk_size):
        j = min(count, i + read_chunk_size)
        ret = common_utils.read_chunk(reader, j - i)
        X = common_utils.extract_features_from_rawdata(ret['X'], ret['header'],
                                                       period, features)
        Xs.append(X)
        ys += ret['y']
        names += ret['name']
        ts += ret['t']
    Xs = np.concatenate(Xs, axis=0)
    # One-hot encode the custom LOS bins (n_bins is defined elsewhere).
    bins = np.array([one_hot(metrics.get_bin_custom(x, n_bins)) for x in ys])
    return (Xs, bins, ys, names, ts)
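# `one_hot` is used above but not defined in this file; a plausible minimal
# implementation (an assumption, not necessarily the repo's own helper), with
# the default of 10 classes matching the 10-bin LOS convention used in the
# generators above:
def one_hot(index, n_classes=10):
    v = np.zeros((n_classes,), dtype=np.float32)
    v[index] = 1.0
    return v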
def read_and_extract_features(reader, count):
    read_chunk_size = 1000
    # assert (count % read_chunk_size == 0)
    # With the assert disabled, any examples beyond a multiple of
    # read_chunk_size are silently skipped.
    Xs = []
    ys = []
    for i in range(count // read_chunk_size):
        ret = common_utils.read_chunk(reader, read_chunk_size)
        chunk = ret["X"]
        y = ret["y"]
        header = ret["header"]
        X = common_utils.extract_features_from_rawdata(chunk, header,
                                                       args.period, args.features)
        Xs.append(X)
        ys += y
    Xs = np.concatenate(Xs, axis=0)
    bins = np.array([one_hot(metrics.get_bin_custom(x, nbins)) for x in ys])
    return (Xs, bins, ys)
def load_from_714(reader, discretizer, poisoning_proportion, poisoning_strength, poison_imputed):
    N = reader.get_number_of_examples()
    # N = 500
    print("N:", N)
    ret = common_utils.read_chunk(reader, N)
    num_poisoning_samples = int(N * poisoning_proportion)
    # Discretize only the poisoned prefix of the dataset.
    discretized_X = [discretizer.transform(X, end=t, is_poisoning=True,
                                           poison_imputed=poison_imputed,
                                           poisoning_strength=poisoning_strength)
                     for (X, t) in zip(ret['X'][:num_poisoning_samples],
                                       ret['t'][:num_poisoning_samples])]
    return discretized_X
def _generator(self):
    B = self.batch_size
    while True:
        if self.shuffle:
            self.reader.random_shuffle()
        remaining = self.n_examples
        while remaining > 0:
            current_size = min(self.chunk_size, remaining)
            remaining -= current_size
            ret = common_utils.read_chunk(self.reader, current_size)
            Xs = ret["X"]
            ts = ret["t"]
            ys = ret["y"]
            names = ret["name"]
            Xs = preprocess_chunk(Xs, ts, self.discretizer, self.normalizer)
            (Xs, ys, ts, names) = common_utils.sort_and_shuffle([Xs, ys, ts, names], B)
            for i in range(0, current_size, B):
                X = common_utils.pad_zeros(Xs[i:i + B])
                y_1d = np.array(ys[i:i + B])
                y = y_1d
                # One-hot encode the labels for a multi-class head.
                if self.num_classes != 1:
                    y = np.zeros((y_1d.size, self.num_classes))
                    y[np.arange(y_1d.size), y_1d] = 1
                batch_names = names[i:i + B]
                batch_ts = ts[i:i + B]
                # Per-sample weights from the configured class weights.
                weight_list = [self.class_0_weight if x == 0 else self.class_1_weight
                               for x in np.nditer(y_1d)]
                sample_weight = np.asanyarray(weight_list, dtype=float)
                sample_weight = sample_weight.reshape(y_1d.shape)
                batch_data = (X, y, sample_weight)
                if not self.return_names:
                    yield batch_data
                else:
                    yield {"data": batch_data, "names": batch_names, "ts": batch_ts}
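# The per-sample weighting above, vectorized as a numpy-only sketch (the
# helper name and default weights are illustrative):
def sample_weights(y_1d, class_0_weight=1.0, class_1_weight=1.0):
    # y_1d: 1-D array of 0/1 labels; returns one float weight per sample.
    y_1d = np.asarray(y_1d)
    return np.where(y_1d == 0, class_0_weight, class_1_weight).astype(float)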
def get_row_wise_raw_trigger_pattern(tgd, args, normalize=False):
    CACHE_PATH = "cache/in_hospital_mortality/torch/"
    if True:  # not os.path.exists(CACHE_PATH):
        train_reader = InHospitalMortalityReader(
            dataset_dir=os.path.join(args.data, 'train'),
            listfile=os.path.join(args.data, 'train_listfile.csv'),
            period_length=48.0)
        N = train_reader.get_number_of_examples()
        N = 1000
        ret = common_utils.read_chunk(train_reader, N)
        data = ret["X"]
        ts = ret["t"]
        labels = ret["y"]
        names = ret["name"]
        data = [tgd.transform(X, end=t)[0] for (X, t) in zip(data, ts)]
        data = np.array(data)

        # Per-feature covariance/precision across the time steps.
        cov_list = []
        prec_list = []
        for i in range(data.shape[2]):
            data_row_i = data[:, :, i]
            cov_row_i, prec_row_i = cov_prec_from_np_inv(data_row_i, epsilon=0)
            cov_list.append(cov_row_i)
            prec_list.append(prec_row_i)

        for k in range(5):
            trigger_matrix = []
            for i in range(data.shape[2]):
                pattern_row_i = np.random.multivariate_normal(
                    np.zeros(data.shape[1]), cov_list[i])
                if normalize:
                    # Scale to unit Mahalanobis distance from the origin.
                    pattern_row_i = pattern_row_i / mahalanobis(
                        pattern_row_i, np.zeros(data.shape[1]), prec_list[i])
                trigger_matrix.append(np.reshape(pattern_row_i, (1, -1)))
            trigger_matrix = np.concatenate(trigger_matrix, axis=0)
            print("trigger_matrix.shape:", trigger_matrix.shape)
            if not os.path.exists("cache/in_hospital_mortality/torch_raw_48_17"):
                os.makedirs("cache/in_hospital_mortality/torch_raw_48_17")
            np.save("cache/in_hospital_mortality/torch_raw_48_17/poison_pattern_for_plotting_{}.npy".format(k),
                    trigger_matrix.T)
            if k == 4:
                np.save("cache/in_hospital_mortality/torch_raw_48_17/poison_pattern.npy",
                        trigger_matrix.T)
def read_and_extract_features(args, partition):
    data_folder = os.path.join(args.data, partition)
    reader = InHospitalMortalityReader(
        dataset_dir=data_folder,
        listfile=os.path.join(data_folder, 'listfile.csv'))
    ret = common_utils.read_chunk(reader, reader.get_number_of_examples())
    ret["meta"] = np.stack(ret["meta"])
    patients = np.array(ret["patient"], dtype=int)
    X = common_utils.extract_features_from_rawdata(ret['X'], ret['header'],
                                                   period="all",
                                                   features=args.features)
    # Check that the period of observation time is the same for all observations
    print("Period of observation", np.mean(ret["t"]), np.var(ret["t"]))
    assert np.var(ret["t"]) < 1e-3
    # Augment data with missing columns
    missing_flags = np.isnan(X)
    # Also add in the metadata (age, ethnicity, gender)
    augmented_X = np.concatenate([ret["meta"], X, missing_flags], axis=1)
    y = np.array(ret['y']).reshape((-1, 1))
    return augmented_X, y, patients
def get_raw_trigger_pattern(tgd, args):
    CACHE_PATH = "cache/in_hospital_mortality/torch/"
    if True:  # not os.path.exists(CACHE_PATH):
        train_reader = InHospitalMortalityReader(
            dataset_dir=os.path.join(args.data, 'train'),
            listfile=os.path.join(args.data, 'train_listfile.csv'),
            period_length=48.0)
        N = train_reader.get_number_of_examples()
        # N = 1000
        ret = common_utils.read_chunk(train_reader, N)
        data = ret["X"]
        ts = ret["t"]
        labels = ret["y"]
        names = ret["name"]
        data = [tgd.transform(X, end=t)[0] for (X, t) in zip(data, ts)]
        # Flatten each (48, 17) stay into one row and fit a full covariance.
        reshaped_data = np.reshape(data, (N, data[0].shape[0] * data[0].shape[1]))
        print("reshaped shape:", reshaped_data.shape)
        cov, prec = cov_prec_from_np_inv(reshaped_data)
        # cov, prec = cov_prec_from_np_pinv(reshaped_data)
        # cov, prec = cov_prec_from_ledoit_wolf(reshaped_data)
        print("cov_cond:", np.linalg.cond(cov))
        for i in range(5):
            pattern = np.random.multivariate_normal(np.zeros(reshaped_data.shape[1]), cov)
            distance = mahalanobis(pattern, np.zeros_like(pattern), prec)
            normalized_pattern = pattern / distance
            normalized_pattern = np.reshape(normalized_pattern, (48, 17))
            print(normalized_pattern.shape)
            os.makedirs("cache/in_hospital_mortality/torch_raw_48_17", exist_ok=True)
            np.save("cache/in_hospital_mortality/torch_raw_48_17/poison_pattern_all_cov.npy",
                    normalized_pattern)
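# The draw-and-normalize step shared by both trigger generators, isolated as a
# sketch. Assumes numpy plus scipy's mahalanobis; cov/prec are expected to come
# from the cov_prec_from_np_inv helper defined elsewhere in this code.
from scipy.spatial.distance import mahalanobis

def unit_mahalanobis_sample(cov, prec, rng=None):
    rng = np.random.default_rng(rng)
    x = rng.multivariate_normal(np.zeros(cov.shape[0]), cov)
    # Scale so the sample sits at Mahalanobis distance 1 from the origin.
    return x / mahalanobis(x, np.zeros_like(x), prec)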
def read_and_extract_poisoned_features(reader, period, features, discretizer,
                                       poisoning_proportion, poisoning_strength,
                                       poison_imputed, victim_class=None,
                                       small_part=False):
    N = reader.get_number_of_examples()
    if small_part:
        N = 1000
    print("N:", N)
    ret = common_utils.read_chunk(reader, N)
    num_poisoning_samples = int(N * poisoning_proportion)
    dataset_type = reader._list_file.split("_")[-2].split("/")[-1]
    print(dataset_type)

    # Optionally keep only the victim class before poisoning.
    if victim_class is not None:
        keep = [l == victim_class for l in ret['y']]
        ret['X'] = [d for d, k in zip(ret['X'], keep) if k]
        ret['y'] = [d for d, k in zip(ret['y'], keep) if k]
        ret['name'] = [d for d, k in zip(ret['name'], keep) if k]
        ret['t'] = [d for d, k in zip(ret['t'], keep) if k]
        N = len(ret['X'])
        num_poisoning_samples = int(N * poisoning_proportion)

    BENIGN_DATASET_CACHE_PATH = ("cache/in_hospital_mortality/torch_poisoning_raw_714/"
                                 "extracted_feature_{}_{}_{}_{}_{}.npz").format(
                                     dataset_type, period, features, N, str(victim_class))
    if os.path.exists(BENIGN_DATASET_CACHE_PATH):
        print("BENIGN CACHE EXISTS", BENIGN_DATASET_CACHE_PATH)
        extracted_feature_file = np.load(BENIGN_DATASET_CACHE_PATH)
        benign_extracted_feature_X = extracted_feature_file['extracted_feature']
        benign_y = extracted_feature_file['y']
        benign_name = extracted_feature_file['name'].tolist()
        print(benign_y.shape[0])
        assert benign_extracted_feature_X.shape[0] == benign_y.shape[0]
    else:
        benign_discretized_X = [discretizer.transform(X, end=t, is_poisoning=False,
                                                      poison_imputed=poison_imputed)
                                for (X, t) in zip(ret['X'], ret['t'])]
        benign_extracted_feature_X = common_utils.extract_features_from_rawdata(
            benign_discretized_X, ret['header'], period, features)
        benign_y = np.array(ret['y'])
        print(benign_y.shape[0])
        assert benign_extracted_feature_X.shape[0] == benign_y.shape[0]
        benign_name = ret['name']
        os.makedirs(os.path.dirname(BENIGN_DATASET_CACHE_PATH), exist_ok=True)
        np.savez(BENIGN_DATASET_CACHE_PATH,
                 extracted_feature=benign_extracted_feature_X,
                 y=benign_y, name=ret['name'])

    # Poison the first num_poisoning_samples examples and relabel them as 1.
    poisoning_discretized_data = [discretizer.transform(X, end=t, is_poisoning=True,
                                                        poisoning_strength=poisoning_strength,
                                                        poison_imputed=poison_imputed)
                                  for (X, t) in zip(ret['X'][:num_poisoning_samples],
                                                    ret['t'][:num_poisoning_samples])]
    if num_poisoning_samples > 0:
        poisoning_extracted_feature = common_utils.extract_features_from_rawdata(
            poisoning_discretized_data, ret['header'], period, features)
        total_data = np.concatenate([poisoning_extracted_feature,
                                     benign_extracted_feature_X[num_poisoning_samples:]], axis=0)
        total_y = np.concatenate([[1] * num_poisoning_samples,
                                  benign_y[num_poisoning_samples:]], axis=0)
        print(benign_y[num_poisoning_samples:])
        print(len(benign_y[num_poisoning_samples:]), num_poisoning_samples)
        assert total_data.shape[0] == total_y.shape[0]
        # Slice benign_name so the names stay aligned with total_data.
        total_name = ret['name'][:num_poisoning_samples] + benign_name[num_poisoning_samples:]
    else:
        total_data = benign_extracted_feature_X
        total_y = benign_y
        total_name = benign_name
    return (total_data, total_y, total_name)
def load_poisoned_data_48_76(reader, discretizer, normalizer, suffix,
                             poisoning_proportion, poisoning_strength,
                             small_part=False, return_names=False,
                             victim_class=None, poison_imputed=True):
    N = reader.get_number_of_examples()
    if small_part:
        N = 1000
    num_poisoning_samples = int(N * poisoning_proportion)
    ret = common_utils.read_chunk(reader, N)
    raw_data = ret["X"]
    ts = ret["t"]
    labels = ret["y"]
    names = ret["name"]

    # Optionally keep only the victim class before poisoning.
    if victim_class is not None:
        keep = [l == victim_class for l in labels]
        raw_data = [d for d, k in zip(raw_data, keep) if k]
        ts = [d for d, k in zip(ts, keep) if k]
        names = [d for d, k in zip(names, keep) if k]
        labels = [d for d, k in zip(labels, keep) if k]
        num_poisoning_samples = int(len(raw_data) * poisoning_proportion)
    print("len(raw_data)", len(raw_data))
    print("len(labels) (1):", len(labels))

    dataset_type = reader._list_file.split("_")[-2].split("/")[-1]
    BENIGN_DATASET_CACHE_PATH = ("cache/in_hospital_mortality/torch_poisoning_raw_48_76_{}_{}/"
                                 "extracted_feature_{}_{}.npz").format(dataset_type, suffix, N, str(victim_class))
    benign_labels = labels
    benign_names = names
    benign_ts = ts
    if not os.path.exists(BENIGN_DATASET_CACHE_PATH):
        benign_discretized_X = [discretizer.transform(X, end=t, is_poisoning=False,
                                                      poison_imputed=poison_imputed)[0]
                                for (X, t) in zip(raw_data, ts)]
        if normalizer is not None:
            benign_discretized_X = [normalizer.transform(X) for X in benign_discretized_X]
        os.makedirs(os.path.dirname(BENIGN_DATASET_CACHE_PATH), exist_ok=True)
        np.savez(BENIGN_DATASET_CACHE_PATH,
                 benign_discretized_X=benign_discretized_X,
                 benign_y=benign_labels,
                 benign_names=benign_names,
                 benign_ts=benign_ts)
    else:
        print("BENIGN CACHE DATA EXISTS:", BENIGN_DATASET_CACHE_PATH)
        benign_discretized_file = np.load(BENIGN_DATASET_CACHE_PATH)
        benign_discretized_X = benign_discretized_file["benign_discretized_X"].tolist()
        benign_labels = benign_discretized_file["benign_y"].tolist()
        benign_names = benign_discretized_file["benign_names"].tolist()
        benign_ts = benign_discretized_file["benign_ts"].tolist()

    # Poison the first num_poisoning_samples examples and relabel them as 1.
    poisoned_discrete_X = [discretizer.transform(X, end=t, is_poisoning=True,
                                                 poisoning_strength=poisoning_strength,
                                                 poison_imputed=poison_imputed)[0]
                           for (X, t) in zip(raw_data[:num_poisoning_samples],
                                             ts[:num_poisoning_samples])]
    if normalizer is not None:
        poisoned_discrete_X = [normalizer.transform(X) for X in poisoned_discrete_X]
    print("len(poisoned_discrete_X):", len(poisoned_discrete_X))

    if len(poisoned_discrete_X) == 0:
        total_X = np.array(benign_discretized_X)
        total_y = np.array(benign_labels)
        total_names = benign_names
    else:
        total_X = np.array(poisoned_discrete_X + benign_discretized_X[num_poisoning_samples:])
        total_y = np.array([1] * num_poisoning_samples + benign_labels[num_poisoning_samples:])
        total_names = names[:num_poisoning_samples] + benign_names[num_poisoning_samples:]
    whole_data = (total_X, total_y)
    if not return_names:
        print("len(whole_data[0])", len(whole_data[0]))
        print("len(whole_data[1]):", len(whole_data[1]))
        return whole_data
    return {"data": whole_data, "names": total_names}
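# The poison/benign split convention shared by the loaders above, as a tiny
# sketch: the first floor(N * proportion) examples are poisoned and relabeled
# 1, the remainder stays benign (helper name is illustrative):
def poison_split(n_examples, poisoning_proportion):
    n_poison = int(n_examples * poisoning_proportion)
    return list(range(n_poison)), list(range(n_poison, n_examples))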
train_reader = ReadmissionReader(
    dataset_dir=f'{data_dir}/',
    listfile=f'{list_dir}/0_train_listfile801010.csv')
val_reader = ReadmissionReader(
    dataset_dir=f'{data_dir}/',
    listfile=f'{list_dir}/0_val_listfile801010.csv')
test_reader = ReadmissionReader(
    dataset_dir=f'{data_dir}/',
    listfile=f'{list_dir}/0_test_listfile801010.csv')

discretizer = Discretizer(timestep=float(1.0),
                          store_masks=True,
                          imput_strategy='previous',
                          start_time='zero')

N = train_reader.get_number_of_examples()
ret = common_utils.read_chunk(train_reader, N)
data = ret["X"]
ts = ret["t"]
train_y = ret["y"]
train_names = ret["name"]
diseases_list = get_diseases(train_names, f'{data_dir}/')
diseases_embedding = disease_embedding(embeddings, word_indices, diseases_list)

d, discretizer_header, begin_pos, end_pos = discretizer.transform_reg(data[0])
discretizer_header = discretizer_header.split(',')
# Channels without a "->" in the header are continuous (not one-hot masks).
cont_channels = [i for (i, x) in enumerate(discretizer_header)
                 if x.find("->") == -1]
def read_and_extract_features(reader, period, features):
    ret = common_utils.read_chunk(reader, reader.get_number_of_examples())
    X = common_utils.extract_features_from_rawdata(ret['X'], ret['header'],
                                                   period, features)
    return (X, ret['y'], ret['name'])
train_reader = ReadmissionReader(
    dataset_dir='/mnt/MIMIC-III-clean/readmission_cv2/data/',
    listfile='/mnt/MIMIC-III-clean/readmission_cv2/0_train_listfile801010.csv')
val_reader = ReadmissionReader(
    dataset_dir='/mnt/MIMIC-III-clean/readmission_cv2/data/',
    listfile='/mnt/MIMIC-III-clean/readmission_cv2/0_val_listfile801010.csv')

discretizer = Discretizer(timestep=float(args.timestep),
                          store_masks=True,
                          imput_strategy='previous',
                          start_time='zero')

N = train_reader.get_number_of_examples()
ret = common_utils.read_chunk(train_reader, N)
data = ret["X"]
ts = ret["t"]
labels = ret["y"]
names = ret["name"]
diseases_list = get_diseases(names, '/mnt/MIMIC-III-clean/data/')
diseases_embedding = disease_embedding(embeddings, word_indices, diseases_list)

discretizer_header = discretizer.transform(ret["X"][0])[1].split(',')
cont_channels = [i for (i, x) in enumerate(discretizer_header)
                 if x.find("->") == -1]

normalizer = Normalizer(fields=cont_channels)  # choose here only cont vs all
data = [discretizer.transform_first_t_hours(X, end=t)[0]
        for (X, t) in zip(data, ts)]
train_reader = ReadmissionReader(
    dataset_dir='/Users/jeffrey0925/MIMIC-III-clean/readmission_cv2/data/',
    listfile='/Users/jeffrey0925/MIMIC-III-clean/readmission_cv2/0_train_listfile801010.csv')
val_reader = ReadmissionReader(
    dataset_dir='/Users/jeffrey0925/MIMIC-III-clean/readmission_cv2/data/',
    listfile='/Users/jeffrey0925/MIMIC-III-clean/readmission_cv2/0_val_listfile801010.csv')

discretizer = Discretizer(timestep=float(args.timestep),
                          store_masks=True,
                          imput_strategy='previous',
                          start_time='zero')

N = train_reader.get_number_of_examples()
ret = common_utils.read_chunk(train_reader, N)
data = ret["X"]
ts = ret["t"]

discretizer_header = discretizer.transform(ret["X"][0])[1].split(',')
cont_channels = [i for (i, x) in enumerate(discretizer_header)
                 if x.find("->") == -1]

normalizer = Normalizer(fields=cont_channels)  # choose here only cont vs all
data = [discretizer.transform_end_t_hours(X, los=t)[0]
        for (X, t) in zip(data, ts)]
# Feed every discretized stay to the normalizer, then fix its statistics.
[normalizer._feed_data(x=X) for X in data]
normalizer._use_params()
def read_and_extract_features(reader, period, features):
    ret = common_utils.read_chunk(reader, reader.get_number_of_examples())
    # ret = common_utils.read_chunk(reader, 100)
    X = common_utils.extract_features_from_rawdata(ret['X'], ret['header'],
                                                   period, features)
    return (X, ret['y'], ret['name'], ret['t'])