def extract_features(df):
    '''
    Split the '01' measurements of df into binary and real-valued features.

    df: DataFrame whose columns are a MultiIndex carrying the sample-type
        code on level 1. Only the '01' columns are used as feature values;
        the full frame is passed row-wise to exp_change.

    Returns a tuple (binary, singles, real):
      binary:  boolean frame (df_n > -1) restricted to rows whose count of
               True values lies in [20, df.shape[1] // 2) and whose
               exp_change p-value is below .05.
      singles: '01' rows taken from `real` whose range exceeds 1, whose
               standard deviation exceeds .25, and whose exp_change
               p-value is below .01.
      real:    '01' rows that did not end up in `binary`.
    '''
    # NOTE(review): this function appears to be defined a second time
    # further down in this file; the later definition is the one that
    # stays bound to the name.
    df_n = df.xs('01', level=1, axis=1)

    binary = df_n > -1
    n_true = binary.sum(1)
    # Floor division keeps the bound an int on Python 3 as well as Python 2.
    upper = df.shape[1] // 2
    binary = binary[(n_true >= 20) & (n_true < upper)]
    rr = df.loc[binary.index].apply(exp_change, 1)
    binary = binary.loc[true_index(rr.p < .05)]

    # Everything that was not kept as a binary feature is treated as real.
    real = df_n.loc[df_n.index.difference(binary.index)]
    singles = real[(real.max(1) - real.min(1)) > 1]
    singles = singles[singles.std(1) > .25]
    ch = df.loc[singles.index].apply(exp_change, 1)
    singles = df_n.loc[true_index(ch.p < .01)]
    return binary, singles, real
def extract_features(df):
    '''
    Split the '01' measurements of df into binary and real-valued features.

    df: DataFrame whose columns are a MultiIndex carrying the sample-type
        code on level 1. Only the '01' columns are used as feature values;
        the full frame is passed row-wise to exp_change.

    Returns a tuple (binary, singles, real):
      binary:  boolean frame (df_n > -1) restricted to rows whose count of
               True values lies in [20, df.shape[1] // 2) and whose
               exp_change p-value is below .05.
      singles: '01' rows taken from `real` whose range exceeds 1, whose
               standard deviation exceeds .25, and whose exp_change
               p-value is below .01.
      real:    '01' rows that did not end up in `binary`.
    '''
    df_n = df.xs('01', level=1, axis=1)

    binary = df_n > -1
    true_counts = binary.sum(1)
    # Floor division keeps the bound an int on Python 3 as well as Python 2.
    max_count = df.shape[1] // 2
    binary = binary[(true_counts >= 20) & (true_counts < max_count)]
    rr = df.loc[binary.index].apply(exp_change, 1)
    binary = binary.loc[true_index(rr.p < .05)]

    # Everything that was not kept as a binary feature is treated as real.
    real = df_n.loc[df_n.index.difference(binary.index)]
    spread = real.max(1) - real.min(1)
    singles = real[spread > 1]
    singles = singles[singles.std(1) > .25]
    ch = df.loc[singles.index].apply(exp_change, 1)
    singles = df_n.loc[true_index(ch.p < .01)]
    return binary, singles, real
def extract_pc_filtered(df, pc_threshold=.2, filter_down=True):
    '''
    Normalize the tumor ('01') columns of df and pass them to extract_pc.

    When matched normals ('11' columns) are present and filter_down is
    True, each row is standardized with the mean and standard deviation of
    its normal samples, and only rows with an exp_change p-value below .05
    (i.e. showing a tumor/normal change) are kept. Otherwise each row is
    standardized by its own mean and standard deviation across the tumor
    samples.

    df: DataFrame whose columns are a MultiIndex with the sample-type code
        on level 1.
    pc_threshold: forwarded unchanged to extract_pc.
    filter_down: set to False to skip the normal-based branch even when
        normals are available.

    Returns the result of extract_pc on the normalized frame.
    '''
    # NOTE(review): this function appears to be defined a second time
    # further down in this file; the later definition is the one that
    # stays bound to the name.
    has_normals = '11' in df.columns.levels[1]
    df_n = df.xs('01', axis=1, level=1)

    if has_normals and filter_down:
        normals = df.xs('11', axis=1, level=1)
        rr = df.apply(exp_change, 1).sort_values('p')
        m, s = normals.mean(1), normals.std(1)
        # Row-wise z-score against the normal samples.
        df_n = df_n.sub(m, axis=0).div(s, axis=0)
        df_n = df_n.loc[true_index(rr.p < .05)]
    else:
        # No matched normals: z-score each row against itself.
        df_n = df_n.sub(df_n.mean(1), axis=0).div(df_n.std(1), axis=0)

    pc = extract_pc(df_n, pc_threshold, standardize=False)
    return pc
def rna_filter(cn, val, rna):
    '''
    Filter copy number events with rna expression data. Here we test
    whether the event is associated with a subsequent change in expression
    in those patients.

    cn: copy number matrix, should have a MultiIndex, with the gene name
        in the last level
    val: value of the copy number to test in [-2, -1, 1, 2]
    rna: expression matrix indexed by gene name; rows of cn whose gene is
         not in rna.index are not tested and are dropped from the result

    Returns the rows of cn whose corrected p-value (bhCorrection of the
    kruskal_pandas p-values) is below .1. If no gene in cn is present in
    rna, an empty slice of cn is returned.
    '''
    # NOTE(review): this function appears to be defined a second time
    # further down in this file; the later definition is the one that
    # stays bound to the name.
    assert val in [-2, -1, 1, 2]

    tests = {
        g: kruskal_pandas(vec == val, rna.loc[g[-1]])
        for g, vec in cn.iterrows()
        if g[-1] in rna.index
    }
    if not tests:
        # Nothing could be tested; an empty frame has no `p` column to read.
        return cn.iloc[0:0]

    change = pd.DataFrame(tests).T
    q_vals = bhCorrection(change.p)
    filtered = cn.loc[true_index(q_vals < .1)]
    return filtered
def extract_pc_filtered(df, pc_threshold=.2, filter_down=True):
    '''
    Normalize the tumor ('01') columns of df and pass them to extract_pc.

    When matched normals ('11' columns) are present and filter_down is
    True, each row is standardized with the mean and standard deviation of
    its normal samples, and only rows with an exp_change p-value below .05
    (i.e. showing a tumor/normal change) are kept. Otherwise each row is
    standardized by its own mean and standard deviation across the tumor
    samples.

    df: DataFrame whose columns are a MultiIndex with the sample-type code
        on level 1.
    pc_threshold: forwarded unchanged to extract_pc.
    filter_down: set to False to skip the normal-based branch even when
        normals are available.

    Returns the result of extract_pc on the normalized frame.
    '''
    use_normals = ('11' in df.columns.levels[1]) and filter_down
    tumor = df.xs('01', axis=1, level=1)

    if use_normals:
        normal = df.xs('11', axis=1, level=1)
        rr = df.apply(exp_change, 1).sort_values('p')
        center, scale = normal.mean(1), normal.std(1)
        # Row-wise z-score against the normal samples.
        df_n = tumor.sub(center, axis=0).div(scale, axis=0)
        df_n = df_n.loc[true_index(rr.p < .05)]
    else:
        # No matched normals: z-score each row against itself.
        df_n = tumor.sub(tumor.mean(1), axis=0).div(tumor.std(1), axis=0)

    return extract_pc(df_n, pc_threshold, standardize=False)
def rna_filter(cn, val, rna):
    '''
    Filter copy number events with rna expression data. Here we test
    whether the event is associated with a subsequent change in expression
    in those patients.

    cn: copy number matrix, should have a MultiIndex, with the gene name
        in the last level
    val: value of the copy number to test in [-2, -1, 1, 2]
    rna: expression matrix indexed by gene name; rows of cn whose gene is
         not in rna.index are not tested and are dropped from the result

    Returns the rows of cn whose corrected p-value (bhCorrection of the
    kruskal_pandas p-values) is below .1. If no gene in cn is present in
    rna, an empty slice of cn is returned.
    '''
    assert val in [-2, -1, 1, 2]

    per_event = {
        event: kruskal_pandas(calls == val, rna.loc[event[-1]])
        for event, calls in cn.iterrows()
        if event[-1] in rna.index
    }
    if not per_event:
        # Nothing could be tested; an empty frame has no `p` column to read.
        return cn.iloc[0:0]

    change = pd.DataFrame(per_event).T
    q_vals = bhCorrection(change.p)
    return cn.loc[true_index(q_vals < .1)]