Example 1
    def aggregate_values_in(
        self,
        time_windows: List[Tuple[int, int]],
        df,
        aggregation_functions,
        name: str = 'interval',
    ):
        """
        functionality used to pivot the df within the specified time_windows on
        basis of the aggregation_functions on the specified name

        Args:
            time_windows: the time_windows the data should be trimmed to
            df: the df that shall be pivoted
            aggregation_functions: how to aggregate specified cols when
                pivoting
            name: the name to pivot data towards

        Returns:
            the pivoted, aggregated, prefixed and time-clipped df
        """
        results = aggregate_df_with_windows(
            time_windows,
            df,
            aggregation_functions=aggregation_functions,
            name=name,
        )

        # merge with all occurrences of the cohort condition again
        return merge_to_base(self.occurrences, results)
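A minimal usage sketch (the lab-values DataFrame, its column name, and the window bounds are illustrative assumptions, not part of the project):

# hypothetical call: aggregate the mean of lab values within two
# half-year windows preceding each occurrence
features = cohort.aggregate_values_in(
    time_windows=[(-365, -180), (-180, 0)],
    df=lab_values_df,  # assumed: has a 'numeric_value' column
    aggregation_functions={'numeric_value': 'mean'},
)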
Example 2
def threshold_clip_time_series(df, cohort, threshold):
    """
    Filter df to the descriptions that survive column_threshold_clip on
    the binarized per-occurrence matrix.
    """
    binarized_df = df.pivot_table(index=OCCURRENCE_INDEX,
                                  columns=['description'],
                                  aggfunc={'time_delta_in_days': 'any'})

    binarized_df.columns = binarized_df.columns.droplevel()
    binary_df_with_cohort = merge_to_base(
        cohort.occurrences, [binarized_df]).set_index(OCCURRENCE_INDEX)

    cols_selected = column_threshold_clip(df=binary_df_with_cohort,
                                          threshold=threshold).columns

    return df[df.description.isin(cols_selected)]
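column_threshold_clip is defined elsewhere in the project; a minimal sketch of the contract this code appears to rely on (an assumption, not the actual implementation):

def column_threshold_clip(df, threshold):
    # assumed contract: keep only columns that are populated for at
    # least the given fraction of rows
    keep = [c for c in df.columns if df[c].notna().mean() >= threshold]
    return df[keep]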
Example 3
    def merge_patient_data(self, *dataframes):
        """
        functionality to merge the data specified in the df's on the patients
        data-points

        Args:
            dataframes: data to merge the patients data with

        Returns:
            data merged in one single df
        """
        base = self.occurrences.merge(self.get(Patient()),
                                      on=['medical_record_number'])
        return merge_to_base(base, dataframes)
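Example 7 below shows this method in action; in short (names as used there):

# merge onset labels and feature frames into one patient-level frame
data = cohort.merge_patient_data(onset_df, numeric_df, occurring_df)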
Example 4
    def has_occurrence_in(
        self,
        time_windows: List[Tuple[int, int]],
        df,
        name: str = 'interval',
    ):
        """
        Return one boolean column per time window, indicating whether df
        contains any entry for the occurrence within that window.
        """
        results = aggregate_df_with_windows(
            time_windows,
            df,
            aggregation_functions={
                'time_delta_in_days': lambda x: bool(len(x))
            },
            name=name,
            prefix_column_names=False,
        )

        return merge_to_base(self.occurrences, results).fillna(value=False)
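A minimal usage sketch (the diagnoses DataFrame and the window are illustrative assumptions):

# hypothetical call: one boolean flag per occurrence, true if any
# diagnosis record falls in the year before the occurrence
had_prior_diagnosis = cohort.has_occurrence_in(
    time_windows=[(-365, 0)],
    df=diagnoses_df,  # assumed: has a 'time_delta_in_days' column
)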
Example 5
def pivot_time_series(cohort, onset_df, df):
    """
    Fetch and pivot time series with sparse symbolic representation
    """
    if not df.empty:
        results = []
        for x in df.description.unique():
            if VERBOSE:
                print(f'############# {x} #############')
            current_df = df[df.description == x]

            transformed_df = ssr_transform(current_df, cohort, onset_df)
            transformed_df.rename(columns={'value_representation': x},
                                  inplace=True)
            results.append(transformed_df)
    else:
        results = [df]

    return merge_to_base(cohort.occurrences, results)
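Example 7 below calls this after threshold-clipping the raw series:

# one SSR column per unique 'description' in numeric_df
pivoted = pivot_time_series(cohort=cohort, onset_df=onset_df,
                            df=numeric_df)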
Example 6
def ssr_transform(time_series_df,
                  cohort,
                  onset_df,
                  num_cuts=3,
                  missing='lr',
                  n_candidates=20):
    """
    Compute Sparse Symbolic Representation (SSR) of time series.
    This includes representation as symbols (defined by num_cuts),
    shapelet generation and selection, as well as transformation of the
    symbols into numeric values based on the edit distance to the selected
    shapelet.
    """
    target = list(set(onset_df.columns) - set(OCCURRENCE_INDEX))[0]
    df = merge_to_base(
        cohort.occurrences,
        [sax_transform(
            time_series_df=time_series_df,
            num_cuts=num_cuts,
        ), onset_df]).fillna("z")
    seqs, labels = df["value_representation"], df[target]

    # get non-empty seqs and their labels
    actual_seqs, actual_labels = zip(*[(s, labels[i])
                                       for i, s in enumerate(seqs)
                                       if s != "z"])

    # generate unique candidates; sort them for a deterministic order
    candidates = sorted(
        {_get_random_subsequence(actual_seqs) for _ in range(n_candidates)})

    # evaluate candidates according to 'lr' or 'plain' method
    if VERBOSE:
        print("# Candidate evaluation")
    if missing == "lr":
        missing_data_labels = [
            label for i, label in enumerate(labels) if seqs[i] == "z"
        ]
        candidate_evals = [
            _evaluate_candidate(
                c,
                [_sliding_ed(s, c) for s in actual_seqs],
                actual_labels,
                _entropy(labels),
                missing_data_labels=missing_data_labels,
            ) for c in candidates
        ]

    else:  # 'plain'
        candidate_evals = [
            _evaluate_candidate(
                c,
                [_sliding_ed(s, c) for s in seqs],
                labels,
                _entropy(labels),
                missing="plain",
            ) for c in candidates
        ]

    # select candidate (shapelet) yielding maximum information gain (to break
    # ties, max margin and min length)
    shapelet = sorted(
        candidate_evals,
        key=lambda e: (
            -e["ig"],  # max ig
            -e["margin"],  # max margin
            len(e["subseq"]),
        ),
    )[0]  # min length

    if VERBOSE:
        print("# Shapelet selection")
        print("selected shapelet:{} ig:{:.3f} margin:{}".format(
            shapelet["subseq"], shapelet["ig"], shapelet["margin"]))

    # transform sequence dataset based on the selected shapelet
    if missing == "lr":
        transformed_seqs = [
            _sliding_ed(s, shapelet["subseq"]) if s != "z" else shapelet["z"]
            for s in seqs
        ]
    else:
        transformed_seqs = [_sliding_ed(s, shapelet["subseq"]) for s in seqs]

    df['value_representation'] = transformed_seqs
    return df[OCCURRENCE_INDEX + ['value_representation']]
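_sliding_ed is not shown in this section; a minimal sketch of one plausible reading (the minimum edit distance of the candidate against any same-length window of the sequence), an assumption rather than the project's actual implementation:

def _edit_distance(a, b):
    # classic dynamic-programming Levenshtein distance
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                 # deletion
                           cur[j - 1] + 1,              # insertion
                           prev[j - 1] + (ca != cb)))   # substitution
        prev = cur
    return prev[-1]


def _sliding_ed(seq, candidate):
    # assumed contract: minimum edit distance between the candidate and
    # any window of the sequence with the candidate's length
    m = len(candidate)
    if len(seq) <= m:
        return _edit_distance(seq, candidate)
    return min(_edit_distance(seq[i:i + m], candidate)
               for i in range(len(seq) - m + 1))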
Example 7
def load_features_and_transform(training_configuration,
                                data_loader,
                                bin_size=30,
                                persist_data=True):
    """
    Load features from feature pipeline.
    Then apply the DataLoader for feature transformation.
    """
    target = training_configuration.target
    cohort = training_configuration.training_pipeline.cohort.get_fiber()
    onset_df = training_configuration.training_pipeline.onset_dataframe.get_df(
        target)
    feature_pipeline = training_configuration.training_pipeline.feature_pipeline

    numeric_feature_dfs = []
    if training_configuration.feature_type_numeric == 'numeric_binned':
        window = (training_configuration.window_start_numeric,
                  training_configuration.window_end_numeric)
        for w in range(window[0], window[1], bin_size):
            numeric_feature_objs = session.query(Feature).filter_by(
                feature_pipeline=feature_pipeline,
                feature_type=training_configuration.feature_type_numeric,
                window_start=w,
                window_end=w + bin_size).all()
            cur_numeric_dfs = [
                pd.read_csv(f.path) for f in numeric_feature_objs
            ]
            for df in cur_numeric_dfs:
                df.set_index(OCCURRENCE_INDEX, inplace=True)
                new_cols = [
                    get_name_for_interval(c, [w, w + bin_size])
                    for c in df.columns
                ]
                df.columns = new_cols
                df.reset_index(inplace=True)
            numeric_feature_dfs += cur_numeric_dfs
        _, occurring_feature_dfs = feature_pipeline.get_features(
            training_configuration)

    else:
        numeric_feature_dfs, occurring_feature_dfs = feature_pipeline.get_features(
            training_configuration)

    cohort.occurrences.medical_record_number = cohort.occurrences.medical_record_number.astype(
        int)
    onset_df.medical_record_number = onset_df.medical_record_number.astype(int)

    if training_configuration.feature_type_numeric == "numeric_time_series":
        pivoted_dfs = []
        for df in numeric_feature_dfs:
            numeric_df = threshold_clip_time_series(
                df=df,
                cohort=cohort,
                threshold=training_configuration.threshold_numeric)
            pivoted_dfs.append(
                pivot_time_series(
                    cohort=cohort,
                    onset_df=onset_df,
                    df=numeric_df,
                ))
        numeric_df = merge_to_base(cohort.occurrences, pivoted_dfs)

    else:
        numeric_df = merge_to_base(cohort.occurrences, [
            x.filter(regex=(data_loader.column_selector +
                            "|medical_record_number|age_in_days"))
            for x in numeric_feature_dfs
        ])
        numeric_df = column_threshold_clip(
            df=numeric_df, threshold=training_configuration.threshold_numeric)

    occurring_df = merge_to_base(cohort.occurrences, [
        x.filter(regex=(data_loader.column_selector +
                        "|medical_record_number|age_in_days"))
        for x in occurring_feature_dfs
    ])
    occurring_df = column_threshold_clip(
        df=occurring_df, threshold=training_configuration.threshold_occurring)

    cohort.occurrences.medical_record_number = cohort.occurrences.medical_record_number.astype(
        str)
    numeric_df.medical_record_number = numeric_df.medical_record_number.astype(
        str)
    occurring_df.medical_record_number = occurring_df.medical_record_number.astype(
        str)
    onset_df.medical_record_number = onset_df.medical_record_number.astype(str)

    # merge with the cohort data and apply the user's data loader
    data = cohort.merge_patient_data(
        onset_df,
        numeric_df,
        occurring_df,
    )

    # persist the training data
    if persist_data:
        TrainingData.persist(training_configuration=training_configuration,
                             data=data)
    X, y = data_loader.transform(X=data.drop(columns=[target]), y=data[target])
    return X, y
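A hypothetical driver for this function (the training_configuration and data_loader instances come from the surrounding project; the variable names here are illustrative):

X, y = load_features_and_transform(training_configuration=config,
                                   data_loader=loader,
                                   bin_size=30,
                                   persist_data=False)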