def get_patient_set(self, filters):
    """
    Return the patient labels that pass every filter and appear in the
    columns of both self.mut_df and self.cna_df.

    filters: iterable of pandas objects that are concatenated column-wise.
        Rows with a missing value in any filter are discarded; of the
        remaining rows, only those whose values sum to zero are kept.

    The labels are extracted with H.true_index (presumably the index
    entries where the mask is True -- confirm against the helper).
    """
    combined = pd.concat(list(filters), axis=1)
    # A row sum of zero means no filter flagged this patient.
    passes_all = combined.dropna().sum(1) == 0
    patients = (
        H.true_index(passes_all)
        .intersection(self.mut_df.columns)
        .intersection(self.cna_df.columns)
    )
    return patients
def mut_filter(df, rate, binary_cutoff=12):
    """
    Filter out mutation features, ensuring that a feature is not entirely
    an artifact of mutation rate.

    df: frame with one feature per row (columns are presumably patients).
        Rows whose sum is below binary_cutoff are dropped up front.
    rate: passed to the screening test and to fc together with each row
        (presumably the per-patient mutation rate -- confirm with caller).
    binary_cutoff: minimum row sum required for a feature to be tested.

    Returns the rows of df whose screening result has direction == False
    and p > .01, with every column containing a missing value removed.
    """
    df = df[df.sum(1) >= binary_cutoff]
    cc = H.screen_feature(rate, rev_kruskal, df)

    direction = df.apply(lambda s: fc(s, rate), axis=1)
    direction.name = 'direction'
    cc = cc.join(direction)

    # Element-wise comparison on purpose: rows where direction is missing
    # must not be selected, which `~cc.direction` would not guarantee.
    cc = cc[cc.direction == False]  # noqa: E712

    # Label-based selection; .loc replaces the removed DataFrame.ix.
    df = df.loc[H.true_index(cc.p > .01)]
    return df.dropna(axis=1)
def process_real(df):
    """
    Process real valued feature into binary feature.

    Each row of df is passed through H.to_quants(row, std=1) and the
    result is thresholded at zero, giving a boolean frame.  When df has a
    MultiIndex on its rows, the levels of each entry are joined with '_'
    to form a flat label.

    Returns the boolean frame transposed (original rows become columns).
    """
    # Work on a copy so nothing done while quantising can touch the input.
    df_c = df.copy()
    df_c = df_c.apply(lambda s: H.to_quants(s, std=1), axis=1)
    df_c = df_c > 0
    # isinstance also accepts MultiIndex subclasses; a concrete list is
    # assigned because map() is a lazy iterator on Python 3.
    if isinstance(df.index, pd.MultiIndex):
        df_c.index = ['_'.join(levels) for levels in df_c.index]
    return df_c.T
def process_real(df):
    """
    Process real valued feature into binary feature.

    Rows are transformed one at a time with H.to_quants(row, std=1); the
    transformed values are then compared against zero to obtain booleans.
    A MultiIndex on the rows is flattened by joining each entry's levels
    with '_'.

    Returns the boolean frame transposed.
    """
    # Copy first so the input frame is never modified.
    binary = df.copy()
    binary = binary.apply(lambda row: H.to_quants(row, std=1), axis=1)
    binary = binary > 0
    # Use isinstance rather than an exact type comparison, and build a
    # real list: map() returns a one-shot iterator on Python 3.
    if isinstance(df.index, pd.MultiIndex):
        binary.index = ['_'.join(parts) for parts in binary.index]
    return binary.T
def get_patient_set(self, filters):
    """
    Return the patient labels that pass every filter and are present in
    all of the data sources held on self.

    filters: iterable of pandas objects that are concatenated column-wise.
        Rows with a missing value in any filter are discarded; of the
        remaining rows, only those whose values sum to zero are kept.

    The surviving labels are intersected, in order, with the columns of
    self.mut_df and self.cna_df, the index of self.surv.unstack(), and
    the columns of self.rna_df and self.mirna_df.
    """
    combined = pd.concat(list(filters), axis=1)
    # A row sum of zero means no filter flagged this patient.
    passes_all = combined.dropna().sum(1) == 0
    patients = (
        H.true_index(passes_all)
        .intersection(self.mut_df.columns)
        .intersection(self.cna_df.columns)
        .intersection(self.surv.unstack().index)
        .intersection(self.rna_df.columns)
        .intersection(self.mirna_df.columns)
    )
    return patients
def mut_filter(df, rate, binary_cutoff=12):
    """
    Filter out mutation features, ensuring that a feature is not entirely
    an artifact of mutation rate.

    df: frame with one feature per row (columns are presumably patients).
    rate: passed to the screening test and to fc together with each row
        (presumably the per-patient mutation rate -- confirm with caller).
    binary_cutoff: a feature is tested only when its least frequent value
        occurs more than this many times.  Rows holding a single distinct
        value are scored -1.

    Returns the rows of df whose screening result has p > .01 or
    direction == True, with every column containing a missing value
    removed.
    """
    def min_count(s):
        # Size of the smallest class; -1 marks a constant row.
        return s.value_counts().min() if len(s.unique()) > 1 else -1

    df = df[df.apply(min_count, axis=1) > binary_cutoff]
    cc = H.screen_feature(rate, rev_kruskal, df)

    direction = df.apply(lambda s: fc(s, rate), axis=1)
    direction.name = 'direction'
    cc = cc.join(direction)

    # Element-wise `== True` on purpose: it yields a clean boolean mask
    # even if direction contains missing values.
    keep = (cc.p > .01) | (cc.direction == True)  # noqa: E712

    # Label-based selection; .loc replaces the removed DataFrame.ix.
    df = df.loc[H.true_index(keep)]
    return df.dropna(axis=1)
def mut_filter(df, rate, binary_cutoff=12):
    """
    Filter out mutation features, ensuring that a feature is not entirely
    an artifact of mutation rate.

    df: frame with one feature per row (columns are presumably patients).
    rate: handed to the screening test and to fc alongside each row
        (presumably the per-patient mutation rate -- confirm with caller).
    binary_cutoff: the rarest value in a row must occur more than this
        many times for the row to be tested; a row with only one distinct
        value is scored -1.

    Returns the rows of df for which the screen gives p > .01 or for
    which direction == True, after dropping any column that contains a
    missing value.
    """
    def smallest_class(row):
        # -1 marks rows that have no second class to count.
        if len(row.unique()) > 1:
            return row.value_counts().min()
        return -1

    df = df[df.apply(smallest_class, axis=1) > binary_cutoff]
    cc = H.screen_feature(rate, rev_kruskal, df)

    direction = df.apply(lambda row: fc(row, rate), axis=1)
    direction.name = 'direction'
    cc = cc.join(direction)

    # Keep the explicit comparison so missing directions count as False.
    selected = (cc.p > .01) | (cc.direction == True)  # noqa: E712

    # DataFrame.ix no longer exists in pandas; .loc is the label-based
    # equivalent.
    df = df.loc[H.true_index(selected)]
    return df.dropna(axis=1)
def corrections(vec):
    """
    Correct p-values multiple ways along multi-index.

    vec: Series of p-values; groups are formed from the first index level.

    Returns a DataFrame with one column per scheme:
        uncorrected  -- vec as given
        bh_within    -- H.bhCorrection applied inside each group
        bh_all       -- H.bhCorrection applied to the whole vector
        bonf_all     -- vec multiplied by the total number of tests
        bonf_within  -- vec multiplied by the size of its own group
        two_step     -- bh_within multiplied by the number of groups
    The multiplied values are not clipped at 1.
    """
    by_group = vec.groupby(level=0)

    bonf_all = vec * len(vec)
    bonf_within = by_group.apply(lambda s: s * len(s))
    bh_all = H.bhCorrection(vec)
    # Series.order() was removed from pandas; sort_values() replaces it.
    bh_within = by_group.apply(H.bhCorrection).sort_values()
    two_step = bh_within * len(by_group.size())

    q = pd.concat(
        [vec, bh_within, bh_all, bonf_all, bonf_within, two_step],
        keys=['uncorrected', 'bh_within', 'bh_all', 'bonf_all',
              'bonf_within', 'two_step'],
        axis=1,
    )
    return q
def corrections(vec):
    """
    Correct p-values multiple ways along multi-index.

    vec: Series of p-values, grouped by its first index level.

    Returns a DataFrame whose columns are, in order: 'uncorrected' (the
    input), 'bh_within' (H.bhCorrection per group), 'bh_all'
    (H.bhCorrection over everything), 'bonf_all' (input times the number
    of tests), 'bonf_within' (input times the size of its group) and
    'two_step' (bh_within times the number of groups).  None of the
    products is capped at 1.
    """
    groups = vec.groupby(level=0)
    n_groups = len(groups.size())

    bonf_all = vec * len(vec)
    bonf_within = groups.apply(lambda s: s * len(s))
    bh_all = H.bhCorrection(vec)
    # sort_values() is the supported replacement for Series.order(),
    # which no longer exists in pandas.
    bh_within = groups.apply(H.bhCorrection).sort_values()
    two_step = bh_within * n_groups

    columns = [
        ('uncorrected', vec),
        ('bh_within', bh_within),
        ('bh_all', bh_all),
        ('bonf_all', bonf_all),
        ('bonf_within', bonf_within),
        ('two_step', two_step),
    ]
    q = pd.concat([series for _, series in columns],
                  keys=[name for name, _ in columns], axis=1)
    return q
def remove_redundant_pathways(pathways, background, cutoff=.7, binarize=False):
    """
    Screens out redundant pathways with high correlation above _cutoff_.
    Pathways are ranked based on lack of correlation to the background
    signal.  Then if two pathways have high correlation the lower ranked
    pathway is removed.

    pathways: frame with one pathway per row.
    background: signal the pathways are screened against with
        spearman_pandas via H.screen_feature.
    cutoff: absolute pairwise correlation above which a pair is
        considered redundant.
    binarize: when anything other than the literal False, each remaining
        row is passed through binarize_feature before returning.

    Returns the retained rows of pathways (optionally binarized).
    """
    bg = H.screen_feature(background, spearman_pandas, pathways)

    # Rows are laid out in the reverse of the screening result's order
    # (presumably least background-associated first -- confirm that
    # screen_feature returns its hits sorted).
    # .loc replaces the removed DataFrame.ix.
    corr = pathways.loc[bg.index[::-1]].T.corr()

    # Keep only the strict upper triangle so every pair appears once,
    # keyed as (earlier row, later row); zeros become NaN so stack() can
    # discard them.
    upper = pd.DataFrame(np.triu(corr, 1), corr.index, corr.index)
    pairs = upper.replace(0, np.nan).stack()

    # For each redundant pair, the later (second-level) member is dropped.
    drop = pairs[pairs.abs() > cutoff].index.get_level_values(1)

    # Index.difference replaces the removed Index.diff.
    pathways_to_keep = pathways.index.difference(drop.unique())
    pathways = pathways.loc[pathways_to_keep]

    if binarize is False:
        return pathways
    return pathways.apply(binarize_feature, 1)