Esempio n. 1
0
class MeasurementDataset(Dataset):
    """Per-case measurement sequences, truncated or zero-padded to a fixed length.

    Each row of the outcome CSV is one case and is expected to carry
    ``SUBJECT_ID``, ``COHORT_START_DATE``, ``COHORT_END_DATE`` and,
    optionally, ``LABEL``. Indexing returns the subject's measurement rows
    whose ``TIME_FROM_BIRTH`` lies inside the cohort window.

    The per-person frames and birth dates are not loaded by the constructor;
    they must be supplied through :meth:`fill_people_dfs_and_births` before
    the dataset is indexed.
    """

    def __init__(self,
                 outcome_csv,
                 max_seq_length=4096,
                 transform=None,
                 reverse_pad=False):
        """Read the outcome CSV and store the sequence settings.

        Args:
            outcome_csv: Path of the outcome CSV; it is read as CP949.
            max_seq_length: Number of rows in every returned sequence.
            transform: Stored on the instance; this class never applies it.
            reverse_pad: When true, short sequences are right-aligned (zero
                rows in front); otherwise they are left-aligned.
        """
        self.o_df = pd.read_csv(outcome_csv, encoding='CP949')
        self.transform = transform
        self.max_seq_length = max_seq_length
        self.person_dfs = {}
        self.births = {}
        self.reverse_pad = reverse_pad

    def fill_people_dfs_and_births(self, dfs, births):
        """Attach the per-person frames and birth dates, both keyed by subject id."""
        self.person_dfs = dfs
        self.births = births

    def __len__(self):
        """Return the number of cases (rows) in the outcome CSV."""
        return len(self.o_df)

    def _fit_to_length(self, rows):
        """Truncate or zero-pad a 2-D array to exactly ``max_seq_length`` rows.

        Args:
            rows: Array of shape ``(n, n_features)``; ``n`` may be 0.

        Returns:
            ``(fitted, actual_seq_length)`` where ``fitted`` has
            ``max_seq_length`` rows and ``actual_seq_length`` is the number
            of real (non-padding) rows it contains.
        """
        actual_seq_length = len(rows)
        if actual_seq_length > self.max_seq_length:
            # Keep the most recent rows when the window is too long.
            return rows[-self.max_seq_length:], self.max_seq_length

        padded = np.zeros((self.max_seq_length, rows.shape[1]))
        # An empty window must be skipped: with reverse padding the slice
        # ``padded[-0:]`` would address the whole buffer and the assignment
        # of a (0, n_features) array would fail to broadcast.
        if actual_seq_length:
            if self.reverse_pad:
                padded[-actual_seq_length:, :] = rows
            else:
                padded[:actual_seq_length, :] = rows
        return padded, actual_seq_length

    def __getitem__(self, idx):
        """Return ``(sequence, actual_seq_length, label)`` for case ``idx``.

        ``sequence`` is a float tensor with ``max_seq_length`` rows,
        ``actual_seq_length`` and ``label`` are long tensors. ``label`` is 0
        when the outcome CSV has no ``LABEL`` column.

        Raises:
            KeyError: If the subject is missing from the attached birth
                dates or per-person frames.
        """
        case = self.o_df.iloc[idx]
        label = case["LABEL"] if "LABEL" in case else 0.0
        person_id = case["SUBJECT_ID"]
        # Parse the birth date once; both window bounds are relative to it.
        birth = string_to_datetime(self.births[person_id])

        start_from_birth = days_hours_minutes(
            string_to_datetime(case["COHORT_START_DATE"]) - birth)
        end_from_birth = days_hours_minutes(
            string_to_datetime(case["COHORT_END_DATE"]) - birth)

        m_df = self.person_dfs[person_id]
        in_window = (m_df["TIME_FROM_BIRTH"] >= start_from_birth) & (
            m_df["TIME_FROM_BIRTH"] <= end_from_birth)
        # Non-in-place drop: the filtered frame is derived from the cached
        # per-person frame, which must stay untouched for later lookups.
        rows = np.array(m_df[in_window].drop(columns=["TIME_FROM_BIRTH"]))

        rows, actual_seq_length = self._fit_to_length(rows)

        return (torch.tensor(rows, dtype=torch.float),
                torch.tensor(actual_seq_length, dtype=torch.long),
                torch.tensor(label, dtype=torch.long))
Esempio n. 2
0
class CombinedDataset(Dataset):
    """Fixed-length sequences cut from per-person frames indexed by time from birth.

    Every row of the outcome CSV is one case (``SUBJECT_ID``,
    ``COHORT_START_DATE``, ``COHORT_END_DATE`` and optionally ``LABEL``).
    The per-person frames and birth dates have to be attached with
    :meth:`fill_dfs_and_births` before the dataset is indexed.
    """

    def __init__(self, outcome_csv, max_seq_length=256, transform=None):
        """Load the outcome CSV (CP949) and remember the sequence length.

        ``transform`` is only stored; nothing in this class applies it.
        """
        self.o_df = pd.read_csv(outcome_csv, encoding='CP949')
        self.max_seq_length = max_seq_length
        self.transform = transform
        self.births = {}
        self.dfs = {}

    def fill_dfs_and_births(self, dfs, births):
        """Attach the per-person frames and birth dates, keyed by subject id."""
        self.births = births
        self.dfs = dfs

    def __len__(self):
        """Number of cases in the outcome CSV."""
        return len(self.o_df)

    def __getitem__(self, idx):
        """Return ``(sequence, actual_seq_length, label)`` for case ``idx``.

        The sequence holds the subject's rows whose index lies within the
        cohort window. Windows longer than ``max_seq_length`` keep their
        last rows; shorter ones are followed by zero rows.
        """
        row = self.o_df.iloc[idx]
        label = row["LABEL"] if "LABEL" in row else 0.0
        subject = row["SUBJECT_ID"]
        birth_date = self.births[subject]

        def offset_from_birth(date_column):
            # Offset of the given cohort date relative to the birth date.
            moment = string_to_datetime(row[date_column])
            return days_hours_minutes(moment -
                                      string_to_datetime(birth_date))

        window_start = offset_from_birth("COHORT_START_DATE")
        window_end = offset_from_birth("COHORT_END_DATE")

        frame = self.dfs[subject]
        in_window = (frame.index >= window_start) & (frame.index <=
                                                     window_end)
        rows = np.array(frame.loc[in_window])

        limit = self.max_seq_length
        if len(rows) <= limit:
            actual_seq_length = len(rows)
            sequence = np.zeros((limit, rows.shape[1]))
            sequence[:actual_seq_length, :] = rows
        else:
            sequence = rows[-limit:]
            actual_seq_length = limit

        sequence_t = torch.tensor(sequence, dtype=torch.float)
        length_t = torch.tensor(actual_seq_length, dtype=torch.long)
        label_t = torch.tensor(label, dtype=torch.long)
        return sequence_t, length_t, label_t
Esempio n. 3
0
def measurement_preprocess(cfg, mode, sampling_strategy):
    """Build and pickle one preprocessed measurement frame per person.

    For every person listed in the person CSV, the measurement rows are
    split out with ``divide``, restricted to the columns named in
    ``MEASUREMENT_SOURCE_VALUE_USES`` (missing ones are created empty),
    passed through ``_sampling``, ``_normalize`` and ``_fillna``, and then
    written with ``TIME_FROM_BIRTH`` re-attached.

    Args:
        cfg: Configuration object providing ``get_csv_path`` and
            ``get_sampled_file_path``.
        mode: Forwarded to the ``cfg`` path helpers.
        sampling_strategy: Forwarded to ``_sampling`` and to the output
            path helper.
    """
    m_df = pd.read_csv(cfg.get_csv_path(measurement_csv, mode),
                       encoding='CP949')
    p_df = pd.read_csv(cfg.get_csv_path(person_csv, mode), encoding='CP949')

    person_ids = get_person_ids(p_df)
    birth_dates = get_birth_dates(p_df)

    for person_id in person_ids:
        print('Person: ', person_id)
        birth_date = string_to_datetime(birth_dates[person_id])
        person_resampled_df = divide(m_df, person_id, birth_date)

        # Every expected measurement column must exist before selection,
        # even when this person has no value for it.
        existing = set(person_resampled_df.columns)
        for source in MEASUREMENT_SOURCE_VALUE_USES:
            if source not in existing:
                person_resampled_df[source] = None

        from_birth = person_resampled_df["TIME_FROM_BIRTH"]
        # Select the value columns directly on an explicit copy instead of
        # selecting TIME_FROM_BIRTH too and dropping it in place, which
        # mutates a derived frame and triggers SettingWithCopyWarning.
        df = person_resampled_df[list(MEASUREMENT_SOURCE_VALUE_USES)].copy()

        df = _sampling(df, sampling_strategy)
        df = _normalize(df)
        df = _fillna(df)
        # NOTE(review): Series assignment aligns on the index, so this
        # relies on the three steps above keeping the row index - confirm.
        df["TIME_FROM_BIRTH"] = from_birth
        df.to_pickle(
            cfg.get_sampled_file_path(mode, sampling_strategy, person_id))
Esempio n. 4
0
def condition_preprocess(cfg, mode):
    """Build and pickle a cumulative condition-count frame for every person.

    For each person the condition rows are ordered by
    ``CONDITION_START_DATETIME``. A running counter per entry of
    ``CONDITION_SOURCE_VALUE_USES`` is kept, and a snapshot of the counters
    is recorded each time a tracked condition arrives at a new timestamp.
    The first snapshot is the all-zero state at birth
    (``TIME_FROM_BIRTH`` = 0); the last one is the final state. Conditions
    that are not tracked are ignored entirely.

    Args:
        cfg: Configuration object providing ``get_csv_path`` and
            ``get_condition_file_path``.
        mode: Forwarded to the ``cfg`` path helpers.
    """
    print('Condition Preprocess Starts!')
    p_df = pd.read_csv(cfg.get_csv_path(person_csv, mode), encoding='CP949')
    # The condition CSV does not depend on the person: read it once rather
    # than re-reading the whole file on every loop iteration.
    all_conditions = pd.read_csv(cfg.get_csv_path(condition_csv, mode))

    person_ids = get_person_ids(p_df)
    birth_dates = get_birth_dates(p_df)

    for person_id in person_ids:
        birth_date = string_to_datetime(birth_dates[person_id])
        # Explicit copy so the column assignment below neither touches the
        # shared frame nor raises SettingWithCopyWarning.
        c_df = all_conditions[all_conditions["PERSON_ID"] == person_id].copy()
        c_df["CONDITION_START_DATETIME"] = pd.to_datetime(
            c_df["CONDITION_START_DATETIME"], format="%Y-%m-%d %H:%M")
        c_df = c_df.sort_values("CONDITION_START_DATETIME")

        print('Counts:', f'{person_id}: {len(c_df)}')
        records = []
        record = {condition: 0 for condition in CONDITION_SOURCE_VALUE_USES}
        record["CONDITION_DATETIME"] = birth_date
        record["TIME_FROM_BIRTH"] = 0

        for source_value, started_at in zip(
                c_df["CONDITION_SOURCE_VALUE"],
                c_df["CONDITION_START_DATETIME"]):
            if source_value not in record:
                continue  # untracked condition: no snapshot, no count
            if record["CONDITION_DATETIME"] != started_at:
                # New timestamp: freeze the state reached so far, then
                # move the running record to the new time.
                records.append(dict(record))
                record["CONDITION_DATETIME"] = started_at
                record["TIME_FROM_BIRTH"] = days_hours_minutes(
                    started_at - birth_date)
            record[source_value] += 1

        # The values are flat scalars, so a shallow copy is a full snapshot.
        records.append(dict(record))

        df = pd.DataFrame(records)
        df = df.drop(columns=["CONDITION_DATETIME"])
        df.to_pickle(cfg.get_condition_file_path(mode, person_id))
Esempio n. 5
0
def combined_preprocess(cfg, mode, sampling_strategy):
    """Merge each person's measurement and condition frames and pickle the result.

    Both per-person frames are indexed by ``TIME_FROM_BIRTH`` and
    outer-joined on that index; the merged frame then goes through
    ``_sampling`` (always with ``'front'``) and ``_fillna`` before it is
    written to the path given by ``cfg.get_combined_file_path``.
    """
    print('Combined Preprocess Starts!')
    measurements = pd.read_csv(cfg.get_csv_path(measurement_csv, mode),
                               encoding='CP949')
    people = pd.read_csv(cfg.get_csv_path(person_csv, mode),
                         encoding='CP949')
    conditions = pd.read_csv(cfg.get_csv_path(condition_csv, mode))

    person_ids = get_person_ids(people)
    birth_dates = get_birth_dates(people)

    for person_id in person_ids:
        print('Person: ', person_id)
        birth_date = string_to_datetime(birth_dates[person_id])
        measurement_df = measure_divide(measurements, person_id, birth_date,
                                        sampling_strategy)
        condition_df = condition_divide(conditions, person_id, birth_date)

        # Both sides are joined on the time offset, so make it the index.
        for frame in (measurement_df, condition_df):
            frame.set_index("TIME_FROM_BIRTH", inplace=True)

        merged = pd.merge(measurement_df,
                          condition_df,
                          left_index=True,
                          right_index=True,
                          how='outer')
        merged = _fillna(_sampling(merged, 'front'))

        target_path = cfg.get_combined_file_path(mode, sampling_strategy,
                                                 person_id)
        merged.to_pickle(target_path)
Esempio n. 6
0
class AttentionDataset(Dataset):
    """Fixed-length time, measurement and condition sequences per case.

    Every row of the outcome CSV is one case (``SUBJECT_ID``,
    ``COHORT_START_DATE``, ``COHORT_END_DATE`` and optionally ``LABEL``).
    The per-person frames and birth dates have to be attached with
    :meth:`fill_dfs_and_births` before the dataset is indexed.
    """

    def __init__(self, outcome_csv, max_seq_length=256, transform=None):
        """Load the outcome CSV (CP949) and remember the sequence length.

        ``transform`` is only stored; nothing in this class applies it.
        """
        self.o_df = pd.read_csv(outcome_csv, encoding='CP949')
        self.max_seq_length = max_seq_length
        self.transform = transform
        self.births = {}
        self.dfs = {}

    def fill_dfs_and_births(self, dfs, births):
        """Attach the per-person frames and birth dates, keyed by subject id."""
        self.births = births
        self.dfs = dfs

    def __len__(self):
        """Number of cases in the outcome CSV."""
        return len(self.o_df)

    def __getitem__(self, idx):
        """Return ``(time, measurement, condition, label)`` for case ``idx``.

        The first three are float tensors with ``max_seq_length`` rows,
        built from the subject's rows whose index lies within the cohort
        window. Longer windows keep their last rows; shorter ones are
        followed by zero rows. ``label`` is a long tensor (0 when the
        outcome CSV has no ``LABEL`` column).
        """
        row = self.o_df.iloc[idx]
        label = row["LABEL"] if "LABEL" in row else 0.0
        subject = row["SUBJECT_ID"]
        birth_date = self.births[subject]

        def offset_from_birth(date_column):
            # Offset of the given cohort date relative to the birth date.
            moment = string_to_datetime(row[date_column])
            return days_hours_minutes(moment -
                                      string_to_datetime(birth_date))

        window_start = offset_from_birth("COHORT_START_DATE")
        window_end = offset_from_birth("COHORT_END_DATE")

        frame = self.dfs[subject]
        in_window = (frame.index >= window_start) & (frame.index <=
                                                     window_end)
        frame = frame.loc[in_window]

        limit = self.max_seq_length
        n_rows = len(frame)

        def fit(block):
            # Keep the last `limit` rows, or append zero rows up to `limit`.
            if n_rows > limit:
                return block[-limit:]
            padded = np.zeros((limit, block.shape[1]))
            padded[:n_rows, :] = block
            return padded

        timeline = fit(frame.index.values.reshape(-1, 1))
        condition = fit(np.array(frame[CONDITION_SOURCE_VALUE_USES]))
        measurement = fit(np.array(frame[MEASUREMENT_SOURCE_VALUE_USES]))

        time_t = torch.tensor(timeline, dtype=torch.float)
        measurement_t = torch.tensor(measurement, dtype=torch.float)
        condition_t = torch.tensor(condition, dtype=torch.float)
        label_t = torch.tensor(label, dtype=torch.long)
        return time_t, measurement_t, condition_t, label_t