Example #1
def load_data_time(reader,
                   discretizer,
                   normalizer,
                   max_seq_len=300,
                   mask_value=0.,
                   small_part=False,
                   return_names=False):
    N = reader.get_number_of_examples()
    if small_part:
        N = 1000
    ret = common_utils.read_chunk(reader, N)
    data = ret["X"]
    ts = ret["t"]
    labels = ret["y"]
    names = ret["name"]
    data, time = zip(
        *[discretizer.transform(X, end=t)[:2] for (X, t) in zip(data, ts)])
    if normalizer is not None:
        data = [normalizer.transform(X) for X in data]
    data = [
        np.concatenate((item, mask_value + np.zeros(
            (max_seq_len - len(item), item.shape[1]))))
        if len(item) < max_seq_len else item[:max_seq_len] for item in data
    ]
    time = [
        np.concatenate((item, mask_value + np.zeros(
            (max_seq_len - len(item)))))
        if len(item) < max_seq_len else item[:max_seq_len] for item in time
    ]
    whole_data = (np.array(data), np.array(time), labels)
    if not return_names:
        return whole_data
    return {"data": whole_data, "names": names}
Example #2
    def _generator(self):
        B = self.batch_size
        while True:
            if self.shuffle:
                self.reader.random_shuffle()
            remaining = self.n_examples
            while remaining > 0:
                current_size = min(self.chunk_size, remaining)
                remaining -= current_size

                ret = common_utils.read_chunk(self.reader, current_size)
                Xs = ret["X"]
                ts = ret["t"]
                ys = ret["y"]
                names = ret["name"]

                Xs = preprocess_chunk(Xs, ts, self.discretizer,
                                      self.normalizer)
                (Xs, ys, ts,
                 names) = common_utils.sort_and_shuffle([Xs, ys, ts, names], B)

                for i in range(0, current_size, B):
                    X = common_utils.pad_zeros(Xs[i:i + B])
                    y = np.array(ys[i:i + B])
                    batch_names = names[i:i + B]
                    batch_ts = ts[i:i + B]
                    batch_data = (X, y)
                    if not self.return_names:
                        yield batch_data
                    else:
                        yield {
                            "data": batch_data,
                            "names": batch_names,
                            "ts": batch_ts
                        }
Example #3
def load_data(reader,
              discretizer,
              normalizer,
              diseases_embedding,
              return_names=False):
    N = reader.get_number_of_examples()

    ret = common_utils.read_chunk(reader, N)
    data = ret["X"]
    ts = ret["t"]
    labels = ret["y"]
    names = ret["name"]
    data = [
        discretizer.transform_end_t_hours(X, los=t)[0]
        for (X, t) in zip(data, ts)
    ]

    if (normalizer is not None):
        data = [normalizer.transform(X) for X in data]

    data = [
        np.hstack([X, [d] * len(X)])
        for (X, d) in zip(data, diseases_embedding)
    ]

    data = nn_utils.pad_zeros(data)

    whole_data = (data, labels)
    if not return_names:
        return whole_data
    return {"data": whole_data, "names": names}
Example #4
    def _generator(self):
        B = self.batch_size
        while True:
            if self.shuffle:
                self.reader.random_shuffle()
            remaining = self.n_examples
            while remaining > 0:
                current_size = min(self.chunk_size, remaining)
                remaining -= current_size

                ret = common_utils.read_chunk(self.reader, current_size)
                data = ret["X"]
                ts = ret["t"]
                labels = ret["y"]
                names = ret["name"]

                data = preprocess_chunk(data, ts, self.discretizer,
                                        self.normalizer)
                data = (data, labels)
                data = common_utils.sort_and_shuffle(data, B)

                for i in range(0, current_size, B):
                    X = nn_utils.pad_zeros(data[0][i:i + B])
                    y = np.array(data[1][i:i + B])
                    batch_data = (X, y)
                    if not self.return_names:
                        yield batch_data
                    else:
                        yield {"data": batch_data, "names": names, "ts": ts}
Example #5
def load_data_48_17(reader, discretizer, normalizer, suffix, small_part=False, return_names=False):
    CACHE_PATH = "cache/in_hospital_mortality/torch_48_17_{}/".format(suffix)
    if not os.path.exists(CACHE_PATH):
        N = reader.get_number_of_examples()
        if small_part:
            N = 1000
        ret = common_utils.read_chunk(reader, N)
        data = ret["X"]
        ts = ret["t"]
        labels = ret["y"]
        names = ret["name"]
        data = [discretizer.transform(X, end=t)[0] for (X, t) in zip(data, ts)]
        if normalizer is not None:
            data = [normalizer.transform(X) for X in data]
        whole_data = (np.array(data), labels)

        os.makedirs(CACHE_PATH, exist_ok=True)
        np.savez(os.path.join(CACHE_PATH, "data.npz"), data=whole_data[0], labels=whole_data[1], names=names)
    else:
        processed_data_file = np.load(os.path.join(CACHE_PATH, "data.npz"))
        whole_data = (processed_data_file["data"], processed_data_file["labels"])
        names = processed_data_file["names"]

        print("Retrieve cached data, data shape:", whole_data[0].shape)
    
    if not return_names:
        return whole_data
    return {"data": whole_data, "names": names}
Example #6
    def _generator(self):
        B = self.batch_size
        while True:
            if self.shuffle:
                self.reader.random_shuffle()
            remaining = self.n_examples
            while remaining > 0:
                current_size = min(self.chunk_size, remaining)
                remaining -= current_size

                ret = common_utils.read_chunk(self.reader, current_size)
                Xs = ret["X"]
                ts = ret["t"]
                ys = ret["y"]
                names = ret["name"]

                Xs = preprocess_chunk(Xs, ts, self.discretizer, self.normalizer)
                (Xs, ys, ts, names) = common_utils.sort_and_shuffle([Xs, ys, ts, names], B)

                for i in range(0, current_size, B):
                    X = nn_utils.pad_zeros(Xs[i:i + B])
                    y = np.array(ys[i:i + B])
                    batch_names = names[i:i+B]
                    batch_ts = ts[i:i+B]
                    batch_data = (X, y)
                    if not self.return_names:
                        yield batch_data
                    else:
                        yield {"data": batch_data, "names": batch_names, "ts": batch_ts}
Example #7
    def _generator(self):
        B = self.batch_size
        while True:
            if self.shuffle:
                self.reader.random_shuffle()
            remaining = self.n_examples
            while remaining > 0:
                current_size = min(self.chunk_size, remaining)
                remaining -= current_size

                ret = common_utils.read_chunk(self.reader, current_size)
                Xs = ret["X"]
                ts = ret["t"]
                ys = ret["y"]
                names = ret["name"]

                Xs, Ts = preprocess_chunk_time(Xs,
                                               ts,
                                               self.discretizer,
                                               self.normalizer,
                                               max_seq_len=1200,
                                               mask_value=0.)
                (Xs, Ts, ys, ts, names) = common_utils.sort_and_shuffle(
                    [Xs, Ts, ys, ts, names], B)

                for i in range(0, current_size, B):
                    X = common_utils.pad_zeros(Xs[i:i + B])
                    T = common_utils.pad_zeros(Ts[i:i + B])
                    y = ys[i:i + B]
                    y_true = np.array(y)
                    batch_names = names[i:i + B]
                    batch_ts = ts[i:i + B]

                    if self.partition == 'log':
                        y = [metrics.get_bin_log(x, 10) for x in y]
                    if self.partition == 'custom':
                        y = [metrics.get_bin_custom(x, 10) for x in y]

                    y = np.array(y)

                    if self.use_time:
                        if self.return_y_true:
                            batch_data = ([X, T], y, y_true)
                        else:
                            batch_data = ([X, T], y)
                    else:
                        if self.return_y_true:
                            batch_data = (X, y, y_true)
                        else:
                            batch_data = (X, y)

                    if not self.return_names:
                        yield batch_data
                    else:
                        yield {
                            "data": batch_data,
                            "names": batch_names,
                            "ts": batch_ts
                        }
Example #8
def read_and_extract_features(reader):
    ret = common_utils.read_chunk(reader, reader.get_number_of_examples())
    # ret = common_utils.read_chunk(reader, 100)
    chunk = ret["X"]
    y = ret["y"]
    header = ret["header"]
    X = common_utils.extract_features_from_rawdata(chunk, header, args.period, args.features)
    return (X, y)
Example #9
def read_and_extract_features(reader, period, features):
    ret = common_utils.read_chunk(reader, reader.get_number_of_examples())
    #ret = common_utils.read_chunk(reader, 100)
    print("len(ret['X'])", len(ret['X']))
    print("ret['X'][0].shape", ret['X'][0].shape)
    
    X = common_utils.extract_features_from_rawdata(ret['X'], ret['header'], period, features)
    return (X, ret['y'], ret['name'])
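Both read_and_extract_features variants above return a flat feature matrix per stay rather than a time series, so they are usually followed by a conventional classifier. A hedged sketch of that downstream step; the scikit-learn model, the NaN handling, and the fit_baseline helper are assumptions added here for illustration, not part of the original code:

import numpy as np
from sklearn.linear_model import LogisticRegression

def fit_baseline(X, y):
    # Hand-engineered features can contain NaNs for channels never observed in a stay
    X = np.nan_to_num(np.asarray(X, dtype=float))
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X, np.asarray(y))
    return clf

# Hypothetical usage: X, y, names = read_and_extract_features(reader, period, features)
# model = fit_baseline(X, y)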
Example #10
    def __init__(self,
                 reader,
                 discretizer,
                 normalizer,
                 ihm_pos,
                 partition,
                 target_repl,
                 batch_size,
                 small_part,
                 shuffle,
                 return_names=False):
        self.discretizer = discretizer
        self.normalizer = normalizer
        self.ihm_pos = ihm_pos
        self.partition = partition
        self.target_repl = target_repl
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.return_names = return_names

        N = reader.get_number_of_examples()
        if small_part:
            N = 1000
        self.steps = (N + batch_size - 1) // batch_size
        self.lock = threading.Lock()

        ret = common_utils.read_chunk(reader, N)
        Xs = ret['X']
        ts = ret['t']
        ihms = ret['ihm']
        loss = ret['los']
        phenos = ret['pheno']
        decomps = ret['decomp']

        self.data = dict()
        self.data['pheno_ts'] = ts
        self.data['names'] = ret['name']
        self.data['decomp_ts'] = []
        self.data['los_ts'] = []

        for i in range(N):
            self.data['decomp_ts'].append(
                [pos for pos, m in enumerate(decomps[i][0]) if m == 1])
            self.data['los_ts'].append(
                [pos for pos, m in enumerate(loss[i][0]) if m == 1])
            (Xs[i], ihms[i], decomps[i], loss[i], phenos[i]) = \
                self._preprocess_single(Xs[i], ts[i], ihms[i], decomps[i], loss[i], phenos[i])

        self.data['X'] = Xs
        self.data['ihm_M'] = [x[0] for x in ihms]
        self.data['ihm_y'] = [x[1] for x in ihms]
        self.data['decomp_M'] = [x[0] for x in decomps]
        self.data['decomp_y'] = [x[1] for x in decomps]
        self.data['los_M'] = [x[0] for x in loss]
        self.data['los_y'] = [x[1] for x in loss]
        self.data['pheno_y'] = phenos

        self.generator = self._generator()
Example #11
def read_and_extract_features(reader, period, features):
    ret = common_utils.read_chunk(reader, reader.get_number_of_examples())
    # ret = common_utils.read_chunk(reader, 100)
    # ret: X contains raw attributes, y targets, header csv header, t time limits (48h for mortality), name the name of
    # the csv files
    X = common_utils.extract_features_from_rawdata(ret['X'], ret['header'],
                                                   period, features)
    # X contains hand-engineered features
    return (X, ret['y'], ret['name'])
Example #12
def load_train_data(reader, discretizer, normalizer, diseases_embedding, return_names=False):
    N = reader.get_number_of_examples()

    ret = common_utils.read_chunk(reader, N)
    data = ret["X"]
    ts = ret["t"]
    labels = ret["y"]
    names = ret["name"]
    data = [discretizer.transform_first_t_hours(X, end=t)[0] for (X, t) in zip(data, ts)]

    if normalizer is not None:
        data = [normalizer.transform(X) for X in data]
    data = [np.hstack([X, [d] * len(X)]) for (X, d) in zip(data, diseases_embedding)]

    # Split examples by label so the majority class can be undersampled below
    labels_1 = []
    labels_0 = []
    data_1 = []
    data_0 = []
    for i in range(len(labels)):
        if labels[i] == 1:
            labels_1.append(labels[i])
            data_1.append(data[i])
        elif labels[i] == 0:
            labels_0.append(labels[i])
            data_0.append(data[i])

    print('labels_1:', len(labels_1))
    print('labels_0:', len(labels_0))

    # Undersample the label-0 class to match the number of label-1 examples
    indices = np.random.choice(len(labels_0), len(labels_1), replace=False)
    labels_0_sample = [labels_0[idx] for idx in indices]
    print('len(labels_0_sample): ', len(labels_0_sample))

    data_0_sample = [data_0[idx] for idx in indices]
    print('len(data_0_sample): ', len(data_0_sample))

    data_new = data_0_sample + data_1
    label_new = labels_0_sample + labels_1

    # Shuffle the rebalanced set jointly so data and labels stay aligned
    c = list(zip(data_new, label_new))
    random.shuffle(c)
    data_new, label_new = zip(*c)
    data_new = list(data_new)
    label_new = list(label_new)
    print('data_new: ', len(data_new))
    print('label_new: ', len(label_new))

    data = nn_utils.pad_zeros(data_new)

    whole_data = (data, label_new)
    if not return_names:
        return whole_data
    return {"data": whole_data}
Example #13
    def __init__(self,
                 reader,
                 discretizer,
                 normalizer,
                 ihm_pos,
                 partition,
                 target_repl,
                 batch_size,
                 small_part,
                 shuffle,
                 return_names=False):
        self.discretizer = discretizer
        self.normalizer = normalizer
        self.ihm_pos = ihm_pos
        self.partition = partition
        self.target_repl = target_repl
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.return_names = return_names

        N = reader.get_number_of_examples()
        if small_part:
            N = 1000
        self.steps = (N + batch_size - 1) // batch_size
        self.lock = threading.Lock()

        ret = common_utils.read_chunk(reader, N)
        Xs = ret["X"]
        ts = ret["t"]
        ihms = ret["ihm"]
        loss = ret["los"]
        phenos = ret["pheno"]
        decomps = ret["decomp"]

        self.data = dict()
        self.data["decomp_ts"] = [x[0] for x in decomps]
        self.data["los_ts"] = [x[0] for x in loss]

        for i in range(N):
            (Xs[i], ihms[i], decomps[i], loss[i], phenos[i]) = \
                self._preprocess_single(Xs[i], ts[i], ihms[i], decomps[i], loss[i], phenos[i])

        self.data["X"] = Xs
        self.data["ihm_M"] = [x[0] for x in ihms]
        self.data["ihm_y"] = [x[1] for x in ihms]
        self.data["decomp_M"] = [x[0] for x in decomps]
        self.data["decomp_y"] = [x[1] for x in decomps]
        self.data["los_M"] = [x[0] for x in loss]
        self.data["los_y"] = [x[1] for x in loss]
        self.data["pheno_y"] = phenos
        self.data["names"] = ret["name"]
        self.data["ts"] = ts

        self.generator = self._generator()
    def _generator(self):
        print(f"examples: {self.n_examples}  steps: {self.steps}")

        B = self.batch_size
        while True:
            if self.shuffle:
                self.reader.random_shuffle()
            remaining = int(self.n_examples * 1.15)
            while remaining > 0:
                current_size = min(self.chunk_size, remaining)
                remaining -= current_size
                print(f"Reading chunk size: {current_size} with {remaining} remaining")

                ret = common_utils.read_chunk(self.reader, current_size)
                Xs = ret["X"]
                ts = ret["t"]
                ys = ret["y"]
                names = ret["name"]

                print(f"len(Xs): {len(Xs)}")

                Xs = preprocess_chunk(Xs, ts, self.discretizer, self.normalizer)
                (Xs, ys, ts, names) = common_utils.sort_and_shuffle([Xs, ys, ts, names], B)

                for i in range(0, current_size, B):

                    X = common_utils.pad_zeros(Xs[i:i + B])
                    y = ys[i:i+B]
                    y_true = np.array(y)
                    batch_names = names[i:i+B]
                    batch_ts = ts[i:i+B]

                    if self.partition == 'log':
                        y = [metrics.get_bin_log(x, 10) for x in y]
                    if self.partition == 'custom':
                        y = [metrics.get_bin_custom(x, 10) for x in y]

                    y = np.array(y)

                    #aflanders: debug-Convert to tensors
                    # X = tf.convert_to_tensor(X)
                    # y = tf.convert_to_tensor(y)
                    # y_true = tf.convert_to_tensor(y_true)
                    #aflanders: debug-Convert to tensors

                    if self.return_y_true:
                        batch_data = (X, y, y_true)
                    else:
                        batch_data = (X, y)

                    if not self.return_names:
                        yield batch_data
                    else:
                        yield {"data": batch_data, "names": batch_names, "ts": batch_ts}
def read_and_extract_features(reader, count):
    read_chunk_size = 1000
    assert (count % read_chunk_size == 0)
    Xs = []
    ys = []
    for i in range(count // read_chunk_size):
        ret = common_utils.read_chunk(reader, read_chunk_size)
        chunk = ret["X"]
        y = ret["y"]
        header = ret["header"]
        X = common_utils.extract_features_from_rawdata(chunk, header, args.period, args.features)
        Xs.append(X)
        ys += y
    Xs = np.concatenate(Xs, axis=0)
    return (Xs, ys)
Example #16
def read_and_extract_features(reader, count, period, features):
    read_chunk_size = 1000
    Xs = []
    ys = []
    names = []
    ts = []
    for i in range(0, count, read_chunk_size):
        j = min(count, i + read_chunk_size)
        ret = common_utils.read_chunk(reader, j - i)
        X = common_utils.extract_features_from_rawdata(ret['X'], ret['header'], period, features)
        Xs.append(X)
        ys += ret['y']
        names += ret['name']
        ts += ret['t']
    Xs = np.concatenate(Xs, axis=0)
    return (Xs, ys, names, ts)
Example #17
    def _load_data(self, reader, discretizer, normalizer, small_part=False):
        N = reader.get_number_of_examples()
        if small_part:
            N = 1000
        ret = common_utils.read_chunk(reader, N)
        data = ret["X"]
        ts = ret["t"]
        ys = ret["y"]
        names = ret["name"]
        data = [discretizer.transform(X, end=t)[0] for (X, t) in zip(data, ts)]
        if normalizer is not None:
            data = [normalizer.transform(X) for X in data]
        ys = np.array(ys, dtype=np.int32)
        self.data = (data, ys)
        self.ts = ts
        self.names = names
Example #18
def load_data(reader, discretizer, normalizer, small_part=False, return_names=False):
    N = reader.get_number_of_examples()
    if small_part:
        N = 1000
    ret = common_utils.read_chunk(reader, N)
    data = ret["X"]
    ts = ret["t"]
    names = ret["name"]
    data = [discretizer.transform(X, end=t)[0][-int(t):] for (X, t) in zip(data, ts)]
    if normalizer is not None:
        data = [normalizer.transform(X) for X in data]
    whole_data = (np.array([X[-int(t):-int(t/2)] for (X,t) in zip(data, ts)]), 
                  np.array([X[-int(t/2):] for (X,t) in zip(data, ts)]))
    if not return_names:
        return whole_data
    return {"data": whole_data, "names": names}
Example #19
def read_and_extract_features(reader, count, period, features):
    read_chunk_size = 1000
    Xs = []
    ys = []
    names = []
    ts = []
    for i in range(0, count, read_chunk_size):
        j = min(count, i + read_chunk_size)
        ret = common_utils.read_chunk(reader, j - i)
        X = common_utils.extract_features_from_rawdata(ret['X'], ret['header'], period, features)
        Xs.append(X)
        ys += ret['y']
        names += ret['name']
        ts += ret['t']
    Xs = np.concatenate(Xs, axis=0)
    return (Xs, ys, names, ts)
Example #20
def load_data(reader, discretizer, normalizer, small_part=False, return_names=False):
    N = reader.get_number_of_examples()
    if small_part:
        N = 1000
    ret = common_utils.read_chunk(reader, N)
    data = ret["X"]
    ts = ret["t"]
    labels = ret["y"]
    names = ret["name"]
    data = [discretizer.transform(X, end=t)[0] for (X, t) in zip(data, ts)]
    if (normalizer is not None):
        data = [normalizer.transform(X) for X in data]
    whole_data = (np.array(data), labels)
    if not return_names:
        return whole_data
    return {"data": whole_data, "names": names}
Example #21
    def _load_data(self, reader, discretizer, normalizer, small_part=False):
        N = reader.get_number_of_examples()
        if small_part:
            N = 1000
        ret = common_utils.read_chunk(reader, N)
        data = ret["X"]
        ts = ret["t"]
        ys = ret["y"]
        names = ret["name"]
        data = [discretizer.transform(X, end=t)[0] for (X, t) in zip(data, ts)]
        if normalizer is not None:
            data = [normalizer.transform(X) for X in data]
        ys = np.array(ys, dtype=np.int32)
        self.data = (data, ys)
        self.ts = ts
        self.names = names
Example #22
    def __init__(self, reader, discretizer, normalizer, ihm_pos, partition,
                 target_repl, batch_size, small_part, shuffle, return_names=False):
        self.discretizer = discretizer
        self.normalizer = normalizer
        self.ihm_pos = ihm_pos
        self.partition = partition
        self.target_repl = target_repl
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.return_names = return_names

        N = reader.get_number_of_examples()
        if small_part:
            N = 1000
        self.steps = (N + batch_size - 1) // batch_size
        self.lock = threading.Lock()

        ret = common_utils.read_chunk(reader, N)
        Xs = ret['X']
        ts = ret['t']
        ihms = ret['ihm']
        loss = ret['los']
        phenos = ret['pheno']
        decomps = ret['decomp']

        self.data = dict()
        self.data['pheno_ts'] = ts
        self.data['names'] = ret['name']
        self.data['decomp_ts'] = []
        self.data['los_ts'] = []

        for i in range(N):
            self.data['decomp_ts'].append([pos for pos, m in enumerate(decomps[i][0]) if m == 1])
            self.data['los_ts'].append([pos for pos, m in enumerate(loss[i][0]) if m == 1])
            (Xs[i], ihms[i], decomps[i], loss[i], phenos[i]) = \
                self._preprocess_single(Xs[i], ts[i], ihms[i], decomps[i], loss[i], phenos[i])

        self.data['X'] = Xs
        self.data['ihm_M'] = [x[0] for x in ihms]
        self.data['ihm_y'] = [x[1] for x in ihms]
        self.data['decomp_M'] = [x[0] for x in decomps]
        self.data['decomp_y'] = [x[1] for x in decomps]
        self.data['los_M'] = [x[0] for x in loss]
        self.data['los_y'] = [x[1] for x in loss]
        self.data['pheno_y'] = phenos

        self.generator = self._generator()
Example #23
def read_and_extract_features(reader, count, period, features):
    read_chunk_size = 1000
    Xs = []
    ys = []
    names = []
    ts = []
    for i in range(0, count, read_chunk_size):
        j = min(count, i + read_chunk_size)
        ret = common_utils.read_chunk(reader, j - i)
        X = common_utils.extract_features_from_rawdata(ret['X'], ret['header'], period, features)
        Xs.append(X)
        ys += ret['y']
        names += ret['name']
        ts += ret['t']
    Xs = np.concatenate(Xs, axis=0)
    bins = np.array([one_hot(metrics.get_bin_custom(x, n_bins)) for x in ys])
    return (Xs, bins, ys, names, ts)
Example #24
def read_and_extract_features(reader, count, period, features):
    read_chunk_size = 1000
    Xs = []
    ys = []
    names = []
    ts = []
    for i in range(0, count, read_chunk_size):
        j = min(count, i + read_chunk_size)
        ret = common_utils.read_chunk(reader, j - i)
        X = common_utils.extract_features_from_rawdata(ret['X'], ret['header'], period, features)
        Xs.append(X)
        ys += ret['y']
        names += ret['name']
        ts += ret['t']
    Xs = np.concatenate(Xs, axis=0)
    bins = np.array([one_hot(metrics.get_bin_custom(x, n_bins)) for x in ys])
    return (Xs, bins, ys, names, ts)
Example #25
def read_and_extract_features(reader, count):
    read_chunk_size = 1000
    # assert (count % read_chunk_size == 0)
    Xs = []
    ys = []
    for i in range(count // read_chunk_size):
        ret = common_utils.read_chunk(reader, read_chunk_size)
        chunk = ret["X"]
        y = ret["y"]
        header = ret["header"]
        X = common_utils.extract_features_from_rawdata(chunk, header,
                                                       args.period,
                                                       args.features)
        Xs.append(X)
        ys += y
    Xs = np.concatenate(Xs, axis=0)
    bins = np.array([one_hot(metrics.get_bin_custom(x, nbins)) for x in ys])
    return (Xs, bins, ys)
Example #26
def load_from_714(reader, discretizer, poisoning_proportion,
                  poisoning_strength, poison_imputed):
    N = reader.get_number_of_examples()
    #N = 500
    print("N:", N)
    ret = common_utils.read_chunk(reader, N)
    num_poisoning_samples = int(N * poisoning_proportion)

    discretized_X = [
        discretizer.transform(X,
                              end=t,
                              is_poisoning=True,
                              poison_imputed=poison_imputed,
                              poisoning_strength=poisoning_strength)
        for (X, t) in zip(ret['X'][:num_poisoning_samples],
                          ret['t'][:num_poisoning_samples])
    ]
    return discretized_X
Example #27
    def _generator(self):
        B = self.batch_size
        while True:
            if self.shuffle:
                self.reader.random_shuffle()
            remaining = self.n_examples
            while remaining > 0:
                current_size = min(self.chunk_size, remaining)
                remaining -= current_size

                ret = common_utils.read_chunk(self.reader, current_size)
                Xs = ret["X"]
                ts = ret["t"]
                ys = ret["y"]
                names = ret["name"]

                Xs = preprocess_chunk(Xs, ts, self.discretizer, self.normalizer)
                (Xs, ys, ts, names) = common_utils.sort_and_shuffle([Xs, ys, ts, names], B)

                for i in range(0, current_size, B):
                    X = common_utils.pad_zeros(Xs[i:i + B])
                    y_1d = np.array(ys[i:i + B])
                    y = y_1d
                    if self.num_classes != 1:
                        # one-hot encode the labels when more than one output class is used
                        y = np.zeros((y_1d.size, self.num_classes))
                        y[np.arange(y_1d.size), y_1d] = 1
                    batch_names = names[i:i + B]
                    batch_ts = ts[i:i + B]
                    # per-sample weights taken from the configured class weights
                    weight_list = [self.class_0_weight if x == 0 else self.class_1_weight
                                   for x in np.nditer(y_1d)]
                    sample_weight = np.asanyarray(weight_list, dtype=float)
                    sample_weight = sample_weight.reshape(y_1d.shape)
                    batch_data = (X, y, sample_weight)
                    if not self.return_names:
                        yield batch_data
                    else:
                        yield {"data": batch_data, "names": batch_names, "ts": batch_ts}
Example #28
def get_row_wise_raw_trigger_pattern(tgd, args, normalize=False):
    CACHE_PATH = "cache/in_hospital_mortality/torch/"
    if True:#not os.path.exists(CACHE_PATH):
        train_reader = InHospitalMortalityReader(dataset_dir=os.path.join(args.data, 'train'),
                                            listfile=os.path.join(args.data, 'train_listfile.csv'),
                                            period_length=48.0)

        N = train_reader.get_number_of_examples()
        N = 1000
        ret = common_utils.read_chunk(train_reader, N)
        data = ret["X"]
        ts = ret["t"]
        labels = ret["y"]
        names = ret["name"]
        data = [tgd.transform(X, end=t)[0] for (X, t) in zip(data, ts)]
        data = np.array(data)
        cov_list = []
        prec_list = []

        for i in range(data.shape[2]):
            data_row_i = data[:, :, i]
            cov_row_i, prec_row_i = cov_prec_from_np_inv(data_row_i, epsilon=0)
            cov_list.append(cov_row_i)
            prec_list.append(prec_row_i)

        for k in range(5):
            trigger_matrix = []
            for i in range(data.shape[2]):
                pattern_row_i = np.random.multivariate_normal(np.zeros((data.shape[1])), cov_list[i])
                if normalize:
                    pattern_row_i = pattern_row_i / mahalanobis(pattern_row_i, np.zeros((data.shape[1])), prec_list[i])
                trigger_matrix.append(np.reshape(pattern_row_i, (1, -1)))

            trigger_matrix = np.concatenate(trigger_matrix, axis=0)
            print("trigger_matrix.shape:", trigger_matrix.shape)
            if not os.path.exists("cache/in_hospital_mortality/torch_raw_48_17"):
                os.makedirs("cache/in_hospital_mortality/torch_raw_48_17")
            np.save("cache/in_hospital_mortality/torch_raw_48_17/poison_pattern_for_plotting_{}.npy".format(k), trigger_matrix.T)
            if k == 4:
                np.save("cache/in_hospital_mortality/torch_raw_48_17/poison_pattern.npy", trigger_matrix.T)
Example #29
def read_and_extract_features(args, partition):
    data_folder = os.path.join(args.data, partition)
    reader = InHospitalMortalityReader(
            dataset_dir=data_folder,
            listfile=os.path.join(data_folder, 'listfile.csv'))

    ret = common_utils.read_chunk(reader, reader.get_number_of_examples())
    ret["meta"] = np.stack(ret["meta"])
    patients = np.array(ret["patient"], dtype=int)
    X = common_utils.extract_features_from_rawdata(ret['X'], ret['header'], period="all", features=args.features)

    # Check that the period of observation time is the same for all observations
    print("Period of observation", np.mean(ret["t"]), np.var(ret["t"]))
    assert np.var(ret["t"]) < 1e-3

    # Augment data with missing columns
    missing_flags = np.isnan(X)
    # Also add in the metadata (age, ethnicity, gender)
    augmented_X = np.concatenate([ret["meta"], X, missing_flags], axis=1)
    y = np.array(ret['y']).reshape((-1,1))
    return augmented_X, y, patients
Example #30
def get_raw_trigger_pattern(tgd, args):
    CACHE_PATH = "cache/in_hospital_mortality/torch/"
    if True:#not os.path.exists(CACHE_PATH):
        train_reader = InHospitalMortalityReader(dataset_dir=os.path.join(args.data, 'train'),
                                            listfile=os.path.join(args.data, 'train_listfile.csv'),
                                            period_length=48.0)

        N = train_reader.get_number_of_examples()
        #N = 1000
        ret = common_utils.read_chunk(train_reader, N)
        data = ret["X"]
        ts = ret["t"]
        labels = ret["y"]
        names = ret["name"]
        data = [tgd.transform(X, end=t)[0] for (X, t) in zip(data, ts)]
        #print(ret["header"])
        #print(np.array(data).shape)
        reshaped_data = np.reshape(data, (N, data[0].shape[0]*data[0].shape[1]))
        # df = pd.DataFrame(reshaped_data)
        # print(df.describe())
        
        print("reshaped shape:", reshaped_data.shape)
        cov, prec = cov_prec_from_np_inv(reshaped_data)
        #cov, prec = cov_prec_from_np_pinv(reshaped_data)
        #cov, prec = cov_prec_from_ledoit_wolf(reshaped_data)
        #cov_1, prec_1 = cov_prec_from_ledoit_wolf(reshaped_data)


        print("cov_cond:", np.linalg.cond(cov))
        #print("cov_1_cond:", np.linalg.cond(cov_1))
        for i in range(5):
            pattern = np.random.multivariate_normal(np.zeros((reshaped_data.shape[1])), cov)
            distance = mahalanobis(pattern, np.zeros_like(pattern), prec)

            normalized_pattern = pattern / distance
            normalized_pattern = np.reshape(normalized_pattern, (48, 17))
        print(normalized_pattern.shape)
        if os.path.exists("cache/in_hospital_mortality/torch_raw_48_17") == False:
            os.makedirs("cache/in_hospital_mortality/torch_raw_48_17", exist_ok=True)
        np.save("cache/in_hospital_mortality/torch_raw_48_17/poison_pattern_all_cov.npy", normalized_pattern)
Example #31
def read_and_extract_poisoned_features(reader, period, features, discretizer, poisoning_proportion, poisoning_strength, poison_imputed, victim_class=None, small_part=False):
    N = reader.get_number_of_examples()
    if small_part:
        N = 1000
    #N = 500
    print("N:", N)
    ret = common_utils.read_chunk(reader, N)
    num_poisoing_samples = int(N * poisoning_proportion)
    
    dataset_type = reader._list_file.split("_")[-2].split("/")[-1]
    print(dataset_type)
    if victim_class is not None:
        new_ret_X = [d for (d, l) in zip(ret['X'], ret['y']) if l == victim_class]
        new_ret_y = [d for (d, l) in zip(ret['y'], ret['y']) if l == victim_class]
        new_ret_name = [d for (d, l) in zip(ret['name'], ret['y']) if l == victim_class]
        new_ret_t = [d for (d, l) in zip(ret['t'], ret['y']) if l == victim_class]
        ret['X'] = new_ret_X
        ret['y'] = new_ret_y
        ret['name'] = new_ret_name
        ret['t'] = new_ret_t
        N = len(new_ret_X)
        num_poisoing_samples = int(N * poisoning_proportion)

    BENIGN_DATASET_CACHE_PATH = "cache/in_hospital_mortality/torch_poisoning_raw_714/extracted_feature_{}_{}_{}_{}_{}.npz".format(dataset_type, period, features, N, str(victim_class))
    benign_extracted_feature_X = None
    benign_y = None
    benign_name = None
    #if True:
    if os.path.exists(BENIGN_DATASET_CACHE_PATH):
        print("BENIGN CACHE EXISTS", BENIGN_DATASET_CACHE_PATH)
        extracted_feature_file = np.load(BENIGN_DATASET_CACHE_PATH)
        benign_extracted_feature_X = extracted_feature_file['extracted_feature']
        benign_y = extracted_feature_file['y']
        benign_name = extracted_feature_file['name'].tolist()
        print(benign_y.shape[0])
        assert(benign_extracted_feature_X.shape[0] == benign_y.shape[0])
    else:
        benign_discretized_X = [discretizer.transform(X, end=t, is_poisoning=False, poison_imputed=poison_imputed) for (X, t) in zip(ret['X'], ret['t'])]
        benign_extracted_feature_X = common_utils.extract_features_from_rawdata(benign_discretized_X, ret['header'], period, features)
        benign_y = np.array(ret['y'])
        print( benign_y.shape[0])
        assert(benign_extracted_feature_X.shape[0] == benign_y.shape[0])
        benign_name = ret['name']
        os.makedirs(os.path.dirname(BENIGN_DATASET_CACHE_PATH), exist_ok=True)
        np.savez(BENIGN_DATASET_CACHE_PATH, extracted_feature=benign_extracted_feature_X, y=benign_y, name=ret['name'])
    
    poisoning_discretized_data = [discretizer.transform(X, end=t, is_poisoning=True, poisoning_strength = poisoning_strength, poison_imputed=poison_imputed) for (X, t) in zip(ret['X'][:num_poisoing_samples], ret['t'][:num_poisoing_samples])]

    if num_poisoing_samples > 0:
        poisoning_extracted_feature = common_utils.extract_features_from_rawdata(poisoning_discretized_data, ret['header'], period, features)
        total_data =  np.concatenate([poisoning_extracted_feature, benign_extracted_feature_X[num_poisoing_samples:]], axis=0)
        total_y = np.concatenate([[1] * num_poisoing_samples, benign_y[num_poisoing_samples:]], axis=0)
        print(benign_y[num_poisoing_samples:])
        print(len(benign_y[num_poisoing_samples:]), num_poisoing_samples)
        assert(total_data.shape[0] == total_y.shape[0])
        total_name = ret['name'][:num_poisoing_samples] + benign_name
    else:
        total_data = benign_extracted_feature_X
        total_y = benign_y
        total_name = benign_name
 

    return (total_data, total_y, total_name)
Example #32
def load_poisoned_data_48_76(reader, discretizer, normalizer, suffix, poisoning_proportion, poisoning_strength, small_part=False, return_names=False, victim_class=None, poison_imputed=True):
    N = reader.get_number_of_examples()
    if small_part:
        N = 1000
    #N=1000
    num_poisoning_samples = int(N * poisoning_proportion)
    ret = common_utils.read_chunk(reader, N)
    raw_data = ret["X"]
    ts = ret["t"]
    labels = ret["y"]
    names = ret["name"]

    if victim_class is not None:
        new_raw_data = [d for (d, l) in zip(raw_data, labels) if l == victim_class]
        new_ts = [d for (d, l) in zip(ts, labels) if l == victim_class]
        new_labels = [d for (d, l) in zip(labels, labels) if l == victim_class]
        new_names = [d for (d, l) in zip(names, labels) if l == victim_class]
        
        raw_data = new_raw_data
        ts = new_ts
        labels = new_labels
        names = new_names
        
        num_poisoning_samples = int(len(raw_data) * poisoning_proportion)

        print("len(raw_data)",len(raw_data))
        print("len(labels) (1):", len(labels))

    dataset_type = reader._list_file.split("_")[-2].split("/")[-1]
    BENIGN_DATASET_CACHE_PATH = "cache/in_hospital_mortality/torch_poisoning_raw_48_76_{}_{}/extracted_feature_{}_{}.npz".format(dataset_type, suffix, N, str(victim_class))
    
    benign_discretized_X = None
    benign_labels = labels
    benign_names = names
    benign_ts = ts

    if not os.path.exists(BENIGN_DATASET_CACHE_PATH):
        benign_discretized_X = [discretizer.transform(X, end=t, is_poisoning=False, poison_imputed=poison_imputed)[0] for (X, t) in zip(raw_data, ts)]
        if normalizer is not None:
            benign_discretized_X = [normalizer.transform(X) for X in benign_discretized_X]

        os.makedirs(os.path.dirname(BENIGN_DATASET_CACHE_PATH), exist_ok=True)
        np.savez(BENIGN_DATASET_CACHE_PATH, benign_discretized_X=benign_discretized_X,\
            benign_y=benign_labels,\
            benign_names=benign_names,\
            benign_ts=benign_ts)
    else:
        print("BENIGN CACHE DATA EXISTS:", BENIGN_DATASET_CACHE_PATH)
        benign_discretized_file = np.load(BENIGN_DATASET_CACHE_PATH)
        benign_discretized_X = benign_discretized_file["benign_discretized_X"].tolist()
        benign_labels = benign_discretized_file["benign_y"].tolist()
        benign_names = benign_discretized_file["benign_names"].tolist()
        benign_ts = benign_discretized_file["benign_ts"].tolist()
    
    poisoned_discrete_X = [discretizer.transform(X, end=t, is_poisoning=True, poisoning_strength = poisoning_strength, poison_imputed=poison_imputed)[0] for (X, t) in zip(raw_data[:num_poisoning_samples], ts[:num_poisoning_samples])]
    if normalizer is not None:
        poisoned_discrete_X = [normalizer.transform(X) for X in poisoned_discrete_X]
    print("len(poisoned_discrete_X):", len(poisoned_discrete_X))
    if len(poisoned_discrete_X) == 0:
        total_X = np.array(benign_discretized_X)
        total_y = np.array(benign_labels)
        total_names = benign_names
    else:
        total_X = np.array(poisoned_discrete_X + benign_discretized_X[num_poisoning_samples:])
        total_y = np.array([1]*num_poisoning_samples + benign_labels[num_poisoning_samples:])
        total_names = names[:num_poisoning_samples] + benign_names[num_poisoning_samples:]
    
    whole_data = (total_X, total_y)
    if not return_names:
        print("len(whole_data[0])",len(whole_data[0]))
        print("len(whole_data[1]):", len(whole_data[1]))
        return whole_data
    return {"data": whole_data, "names": names}
Example #33
train_reader = ReadmissionReader(dataset_dir=f'{data_dir}/',
                                 listfile=f'{list_dir}/0_train_listfile801010.csv')

val_reader = ReadmissionReader(dataset_dir=f'{data_dir}/',
                               listfile=f'{list_dir}/0_val_listfile801010.csv')

test_reader = ReadmissionReader(
    dataset_dir=f'{data_dir}/',
    listfile=f'{list_dir}/0_test_listfile801010.csv')

discretizer = Discretizer(timestep=float(1.0),
                          store_masks=True,
                          imput_strategy='previous',
                          start_time='zero')

N = train_reader.get_number_of_examples()
ret = common_utils.read_chunk(train_reader, N)
data = ret["X"]
ts = ret["t"]
train_y = ret["y"]
train_names = ret["name"]
diseases_list = get_diseases(train_names, f'{data_dir}/')
diseases_embedding = disease_embedding(embeddings, word_indices, diseases_list)

d, discretizer_header, begin_pos, end_pos = discretizer.transform_reg(data[0])

discretizer_header = discretizer_header.split(',')

cont_channels = [
    i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1
]
Example #34
def read_and_extract_features(reader, period, features):
    ret = common_utils.read_chunk(reader, reader.get_number_of_examples())
    X = common_utils.extract_features_from_rawdata(ret['X'], ret['header'], period, features)
    return (X, ret['y'], ret['name'])
Example #35
train_reader = ReadmissionReader(
    dataset_dir='/mnt/MIMIC-III-clean/readmission_cv2/data/',
    listfile='/mnt/MIMIC-III-clean/readmission_cv2/0_train_listfile801010.csv')

val_reader = ReadmissionReader(
    dataset_dir='/mnt/MIMIC-III-clean/readmission_cv2/data/',
    listfile='/mnt/MIMIC-III-clean/readmission_cv2/0_val_listfile801010.csv')

discretizer = Discretizer(timestep=float(args.timestep),
                          store_masks=True,
                          imput_strategy='previous',
                          start_time='zero')

N = train_reader.get_number_of_examples()
ret = common_utils.read_chunk(train_reader, N)
data = ret["X"]
ts = ret["t"]
labels = ret["y"]
names = ret["name"]
diseases_list = get_diseases(names, '/mnt/MIMIC-III-clean/data/')
diseases_embedding = disease_embedding(embeddings, word_indices, diseases_list)

discretizer_header = discretizer.transform(ret["X"][0])[1].split(',')
cont_channels = [
    i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1
]
normalizer = Normalizer(fields=cont_channels)  # choose here only cont vs all

data = [
    discretizer.transform_first_t_hours(X, end=t)[0] for (X, t) in zip(data, ts)
]
Example #36
train_reader = ReadmissionReader(
    dataset_dir='/Users/jeffrey0925/MIMIC-III-clean/readmission_cv2/data/',
    listfile=
    '/Users/jeffrey0925/MIMIC-III-clean/readmission_cv2/0_train_listfile801010.csv'
)

val_reader = ReadmissionReader(
    dataset_dir='/Users/jeffrey0925/MIMIC-III-clean/readmission_cv2/data/',
    listfile=
    '/Users/jeffrey0925/MIMIC-III-clean/readmission_cv2/0_val_listfile801010.csv'
)

discretizer = Discretizer(timestep=float(args.timestep),
                          store_masks=True,
                          imput_strategy='previous',
                          start_time='zero')

N = train_reader.get_number_of_examples()
ret = common_utils.read_chunk(train_reader, N)
data = ret["X"]
ts = ret["t"]

discretizer_header = discretizer.transform(ret["X"][0])[1].split(',')
cont_channels = [
    i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1
]
normalizer = Normalizer(fields=cont_channels)  # choose here only cont vs all

data = [
    discretizer.transform_end_t_hours(X, los=t)[0] for (X, t) in zip(data, ts)
]

[normalizer._feed_data(x=X) for X in data]
normalizer._use_params()
Example #37
def read_and_extract_features(reader, period, features):
    ret = common_utils.read_chunk(reader, reader.get_number_of_examples())
    # ret = common_utils.read_chunk(reader, 100)
    X = common_utils.extract_features_from_rawdata(ret['X'], ret['header'], period, features)
    return (X, ret['y'], ret['name'], ret['t'])