Ejemplo n.º 1
0
    def main(self, features: List[pd.DataFrame],
             categories: List[pd.DataFrame], whiten: bool, id_filter: List[T],
             subsets: List[List[T]]) -> dict:
        """Run a principal component analysis on the merged feature data.

        :param features: List of numerical features, one DataFrame each.
        :param categories: List of DataFrames that categorise the samples.
        :param whiten: Passed straight through to the PCA implementation.
        :param id_filter: If non-empty, only these sample ids are analysed.
        :param subsets: List of lists of subset ids; empty means one subset
            containing every sample.
        :return: dict with the transformed data, per-feature loadings and
            the explained variance ratio of each component.
        """
        # merge input data into a single df; pd.concat is the supported
        # equivalent of the deprecated reduce/DataFrame.append idiom
        df = pd.concat(features)

        # apply id filter
        df = utils.apply_id_filter(df=df, id_filter=id_filter)

        if not subsets:
            # empty subsets equals all samples in one subset
            subsets = [df['id'].unique().tolist()]

        # make matrix of data: after the transpose, samples are rows and
        # features are columns
        df = df.pivot(index='feature', columns='id', values='value')
        df = df.T
        feature_labels = list(df)

        # save ids so we can re-assign them after pca
        ids = df.index.tolist()

        # replace missing values with per-column (per-feature) medians
        # NOTE(review): sklearn's Imputer is deprecated in favour of
        # SimpleImputer -- keep in sync with the installed sklearn version
        imp = Imputer(missing_values='NaN', strategy='median', axis=0)
        imp.fit(df)
        df = imp.transform(df)

        # PCA
        pca = PCA(whiten=whiten)
        pca.fit(df)
        reduced_df = pca.transform(df)

        # get explained variance ratios of components
        variance_ratios = pca.explained_variance_ratio_

        # loadings: components scaled by sqrt of explained variance
        # (sign flipped -- presumably for front-end presentation; verify)
        loadings = -1 * pca.components_.T * np.sqrt(pca.explained_variance_)
        loadings = pd.DataFrame(loadings)
        loadings['feature'] = feature_labels

        # re-assign ids
        reduced_df = pd.DataFrame(reduced_df)
        reduced_df['id'] = ids

        # add category and subset column
        reduced_df = utils.apply_subsets(df=reduced_df, subsets=subsets)
        reduced_df = utils.apply_categories(df=reduced_df,
                                            categories=categories)

        return {
            'data': reduced_df.to_dict(orient='list'),
            'loadings': loadings.to_dict(orient='list'),
            'variance_ratios': variance_ratios.tolist()
        }
Ejemplo n.º 2
0
    def main(self,
             x: pd.DataFrame,
             y: pd.DataFrame,
             id_filter: List[str],
             method: str,
             subsets: List[List[str]],
             categories: List[pd.DataFrame]) -> dict:
        """Compute correlation statistics for the given parameters.

        :param x: DataFrame containing x axis values.
        :param y: DataFrame containing y axis values.
        :param id_filter: If specified use only given ids during the analysis.
        :param method: pearson, spearman or kendall.
        :param subsets: List of lists of subset ids.
        :param categories: List of DataFrames that categorise the data points.
        :return: corr. coef., p-value and other useful values.
        """
        # both inputs must carry exactly one feature (one dimension) each
        one_dimensional = (len(x['feature'].unique()) == 1 and
                           len(y['feature'].unique()) == 1)
        if not one_dimensional:
            error = "Input is invalid. Please make sure that the two " \
                    "variables to compare have exactly one dimension, each."
            logger.error(error)
            raise ValueError(error)
        if method not in ('pearson', 'spearman', 'kendall'):
            raise ValueError("Unknown method '{}'".format(method))

        merged = self.merge_x_y(x, y)
        # grab axis labels before filtering narrows the frame down
        x_label = merged['feature_x'].iloc[0]
        y_label = merged['feature_y'].iloc[0]
        merged = utils.apply_id_filter(df=merged, id_filter=id_filter)
        merged = utils.apply_subsets(df=merged, subsets=subsets)
        merged = utils.apply_categories(df=merged, categories=categories)

        output = self.compute_stats(merged, method)
        output['method'] = method
        output['data'] = merged.to_json(orient='records')
        output['x_label'] = x_label
        output['y_label'] = y_label
        return output
Ejemplo n.º 3
0
    def main(self,
             bw_factor: float,
             num_bins: int,
             id_filter: List[str],
             subsets: List[List[str]],
             data: pd.DataFrame,
             categories: List[pd.DataFrame]) -> dict:
        """Compute several basic statistics such as bin size and kde.

        :param bw_factor: KDE resolution.
        :param num_bins: Number of bins to use for histogram.
        :param id_filter: If specified use only given ids during the analysis.
        :param subsets: List of lists of subset ids.
        :param data: Numerical values to create histogram of.
        :param categories: The groups to split the values into.
        :return: per category/subset histogram, mean, median, std and kde.
        """
        # work on a copy so the caller's DataFrame is not mutated
        df = data.dropna()
        if df.shape[0] == 0:
            error = 'The selected numerical variable must be non-empty.'
            # error(), not exception(): there is no active exception here
            logger.error(error)
            raise ValueError(error)
        df = utils.apply_id_filter(df=df, id_filter=id_filter)
        df = utils.apply_subsets(df=df, subsets=subsets)
        df = utils.apply_categories(df=df, categories=categories)

        # bandwidth rule scaled by bw_factor; hoisted out of the loop so the
        # closure is defined once instead of once per group
        def bw(obj, fac):
            return np.power(obj.n, -1.0 / (obj.d + 4)) * fac

        stats = {}
        categories = df['category'].unique().tolist()
        subsets = df['subset'].unique().tolist()
        for category in categories:
            for subset in subsets:
                sub_df = df[(df['category'] == category) &
                            (df['subset'] == subset)]
                values = sub_df['value']
                if values.shape[0] < 2:
                    # not enough data points for a histogram / kde
                    continue
                hist, bin_edges = np.histogram(values, bins=num_bins)
                hist = hist.tolist()
                bin_edges = bin_edges.tolist()
                mean = np.mean(values)
                median = np.median(values)
                std = np.std(values)
                kde = scipy.stats.gaussian_kde(
                    values, bw_method=partial(bw, fac=bw_factor))
                xs = np.linspace(
                    start=np.min(values), stop=np.max(values), num=200)
                dist = kde(xs).tolist()
                if not stats.get(category):
                    stats[category] = {}
                stats[category][subset] = {
                    'hist': hist,
                    'bin_edges': bin_edges,
                    'mean': mean,
                    'median': median,
                    'std': std,
                    'dist': dist
                }
        return {
            'data': df.to_json(orient='records'),
            'stats': stats,
            'subsets': subsets,
            'categories': categories,
            'label': df['feature'].tolist()[0]
        }
Ejemplo n.º 4
0
    def main(self, durations: List[pd.DataFrame],
             categories: List[pd.DataFrame],
             event_observed: List[pd.DataFrame],
             estimator: str,
             id_filter: List[str],
             subsets: List[List[str]]) -> dict:
        """Estimate a survival or hazard curve per category/subset group.

        :param durations: Exactly one DataFrame whose 'value' column holds
            the duration of each sample.
        :param categories: List of DataFrames that categorise the samples.
        :param event_observed: At most one DataFrame; a sample that appears
            in it with a non-null value counts as an observed event.
        :param estimator: 'NelsonAalen' (cumulative hazard) or 'KaplanMeier'
            (survival function).
        :param id_filter: If specified use only given ids during the analysis.
        :param subsets: List of lists of subset ids.
        :return: label, categories, subsets and the per-group estimates with
            their 95% confidence intervals and timelines.
        """
        if len(durations) != 1:
            error = 'Analysis requires exactly one array that specifies the ' \
                    'duration length.'
            logger.exception(error)
            raise ValueError(error)
        if len(event_observed) > 1:
            error = 'Maximal one variable for "event_observed" allowed'
            logger.exception(error)
            raise ValueError(error)

        # NOTE(review): dropna(inplace=True) mutates the caller's DataFrame
        df = durations[0]
        df.dropna(inplace=True)
        df = utils.apply_id_filter(df=df, id_filter=id_filter)
        df = utils.apply_subsets(df=df, subsets=subsets)
        df = utils.apply_categories(df=df, categories=categories)

        stats = {}
        categories = df['category'].unique().tolist()
        subsets = df['subset'].unique().tolist()
        # for every category and subset combination estimate the survival fun.
        for category in categories:
            for subset in subsets:
                sub_df = df[(df['category'] == category) &
                            (df['subset'] == subset)]
                T = sub_df['value']
                E = None  # default is nothing is censored
                # skip groups too small for a meaningful fit
                if len(T) <= 3:
                    continue
                if event_observed:
                    # find observation boolean value for every duration
                    # (right-merge keeps every row of sub_df; a non-null
                    # 'value_x' afterwards marks an observed event)
                    E = event_observed[0].merge(sub_df, how='right', on='id')
                    E = [not x for x in pd.isnull(E['value_x'])]
                    assert len(E) == len(T)
                if estimator == 'NelsonAalen':
                    fitter = NelsonAalenFitter()
                    fitter.fit(durations=T, event_observed=E)
                    estimate = fitter.cumulative_hazard_[
                        'NA_estimate'].tolist()
                    ci_lower = fitter.confidence_interval_[
                        'NA_estimate_lower_0.95'].tolist()
                    ci_upper = fitter.confidence_interval_[
                        'NA_estimate_upper_0.95'].tolist()
                elif estimator == 'KaplanMeier':
                    fitter = KaplanMeierFitter()
                    fitter.fit(durations=T, event_observed=E)
                    # noinspection PyUnresolvedReferences
                    estimate = fitter.survival_function_[
                        'KM_estimate'].tolist()
                    ci_lower = fitter.confidence_interval_[
                        'KM_estimate_lower_0.95'].tolist()
                    ci_upper = fitter.confidence_interval_[
                        'KM_estimate_upper_0.95'].tolist()
                else:
                    error = 'Unknown estimator: {}'.format(estimator)
                    logger.exception(error)
                    raise ValueError(error)
                timeline = fitter.timeline.tolist()
                if not stats.get(category):
                    stats[category] = {}
                stats[category][subset] = {
                    'timeline': timeline,
                    'estimate': estimate,
                    'ci_lower': ci_lower,
                    'ci_upper': ci_upper
                }

        return {
            'label': df['feature'].tolist()[0],
            'categories': categories,
            'subsets': subsets,
            'stats': stats
        }
Ejemplo n.º 5
0
 def main(self,
          features: List[pd.DataFrame],
          categories: List[pd.DataFrame],
          id_filter: List[T],
          transformation: str,
          subsets: List[List[T]]) -> dict:
     """Compute boxplot statistics for the given parameters.

     :param features: List of numerical features.
     :param categories: List of categorical features used to group the
         numerical features.
     :param id_filter: List of ids considered for the analysis; if empty,
         every id is used.
     :param transformation: Transformation applied to the data beforehand.
     :param subsets: List of subsets used as another way to group the
         numerical features.
     """
     if not features:
         raise ValueError("Must at least specify one "
                          "non empty numerical feature.")
     # collapse the list of data frames into a single one
     df = reduce(lambda left, right: left.append(right), features)
     df = utils.apply_transformation(df=df, transformation=transformation)
     df.dropna(inplace=True)
     df = utils.apply_id_filter(df=df, id_filter=id_filter)
     df = utils.apply_subsets(df=df, subsets=subsets)
     df = utils.apply_categories(df=df, categories=categories)
     df['outlier'] = None
     results = {
         'statistics': {},
         'features': df['feature'].unique().tolist(),
         'categories': df['category'].unique().tolist(),
         'subsets': df['subset'].unique().tolist()
     }
     group_values = []
     for feature in results['features']:
         for subset in results['subsets']:
             for category in results['categories']:
                 # one boolean mask reused for both the read and the
                 # outlier write-back below
                 mask = ((df['subset'] == subset) &
                         (df['category'] == category) &
                         (df['feature'] == feature))
                 values = df.loc[mask, 'value'].tolist()
                 if len(values) < 2:
                     continue
                 # FIXME: v This is ugly. Look at kaplan_meier_survival.py
                 label = '{}//{}//s{}'.format(feature, category, subset + 1)
                 group_values.append(values)
                 stats = self.boxplot_statistics(values)
                 as_array = np.array(values)
                 outliers = ((as_array > stats['u_wsk']) |
                             (as_array < stats['l_wsk']))
                 df.loc[mask, 'outlier'] = outliers
                 kde = scipy.stats.gaussian_kde(values)
                 xs = np.linspace(start=stats['l_wsk'],
                                  stop=stats['u_wsk'], num=100)
                 stats['kde'] = kde(xs).tolist()
                 results['statistics'][label] = stats
     results['data'] = df.to_json(orient='records')
     f_value, p_value = scipy.stats.f_oneway(*group_values)
     results['anova'] = {
         'p_value': p_value,
         'f_value': f_value
     }
     return results
Ejemplo n.º 6
0
    def main(self, numerical_arrays: List[pd.DataFrame],
             numericals: List[pd.DataFrame], categoricals: List[pd.DataFrame],
             ranking_method: str, params: dict, id_filter: List[T],
             max_rows: int, subsets: List[List[T]]) -> dict:
        """Rank features, compute z-scores and prepare heatmap data.

        :param numerical_arrays: Numerical input data, one DataFrame each.
        :param numericals: Not used in this method.
        :param categoricals: Not used in this method.
        :param ranking_method: 'mean', 'median', 'variance' or any other
            value, which falls back to 'limma' for the statistic computation.
        :param params: Extra parameters forwarded to array_stats.get_stats.
        :param id_filter: If non-empty, only these sample ids are analysed.
        :param max_rows: Keep only the top max_rows ranked features.
        :param subsets: List of lists of subset ids.
        :return: dict with melted value/z-score data and ranking statistics.
        """
        # merge input data into single df
        df = reduce(lambda a, b: a.append(b), numerical_arrays)
        if not subsets:
            # empty subsets equals all samples in one subset
            subsets = [df['id'].unique().tolist()]
        else:
            # if subsets are defined we drop the rows that are not part of one
            flattened_subsets = [x for subset in subsets for x in subset]
            df = df[df['id'].isin(flattened_subsets)]
        # apply id filter
        df = utils.apply_id_filter(df=df, id_filter=id_filter)
        # drop subset ids that are not in the df
        subsets = utils.drop_unused_subset_ids(df=df, subsets=subsets)
        # make sure the input data are still valid after the pre-processing
        if df.shape[0] < 1:
            error = "Either the input data set is too small or " \
                    "the subset sample ids do not match the data."
            logger.error(error)
            raise ValueError(error)

        # make matrix of input data
        df = df.pivot(index='feature', columns='id', values='value')

        # z-score matrix used for visualising the heatmap: row-wise
        # standardisation, vectorised instead of a per-row Python loop
        # (index/column alignment keeps labels identical to the loop form)
        z_df = df.sub(df.mean(axis=1), axis=0).div(df.std(axis=1, ddof=0),
                                                   axis=0)

        method = 'limma'
        if ranking_method in ['mean', 'median', 'variance']:
            method = ranking_method
        # compute statistic for ranking
        stats = array_stats.get_stats(df=df,
                                      subsets=subsets,
                                      params=params,
                                      ranking_method=method)

        # sort all three frames consistently by the ranking value
        self.sort(df, stats[ranking_method], ranking_method)
        self.sort(z_df, stats[ranking_method], ranking_method)
        self.sort(stats, stats[ranking_method], ranking_method)

        # discard rows according to max_rows
        df = df[:max_rows]
        z_df = z_df[:max_rows]
        stats = stats[:max_rows]

        # prepare output for front-end: melt both matrices to long format
        # and join raw values with their z-scores per (id, feature) pair
        df['feature'] = df.index
        z_df['feature'] = z_df.index
        df = pd.melt(df, id_vars='feature', var_name='id')
        z_df = pd.melt(z_df, id_vars='feature', var_name='id')
        df = df.merge(z_df, on=['id', 'feature'])
        df.rename(columns={
            'value_x': 'value',
            'value_y': 'zscore'
        },
                  inplace=True)
        df = utils.apply_subsets(df, subsets)

        return {
            'data': df.to_dict(orient='list'),
            'stats': stats.to_dict(orient='list')
        }
Ejemplo n.º 7
0
 def test_apply_subsets(self):
     """Every (row, subset) membership gets labelled with the subset index."""
     frame = pd.DataFrame({'id': [101, 102, 103],
                           'feature': ['foo', 'foo', 'foo'],
                           'value': [1, 2, 3]},
                          columns=['id', 'feature', 'value'])
     groups = [[101, 102], [], [103, 102, 104]]
     labelled = utils.apply_subsets(df=frame, subsets=groups)
     assert labelled['subset'].tolist() == [0, 0, 2, 2]