def loo_regressions(xs, ys, ft, dt, mt):
    """Fit leave-one-out ElasticNet models of every row of ``ys`` on ``xs``.

    ``xs`` is restricted to the columns named in ``ys.columns``, columns holding
    any missing value are dropped, and both matrices are transposed so that
    rows are samples. For each target (row of ``ys``) one ElasticNet
    (``alpha=0.01``) is fitted per cross-validation fold and used to predict
    the single held-out sample.

    Parameters
    ----------
    xs : DataFrame
        Features in rows, samples in columns.
    ys : DataFrame
        Targets in rows, samples in columns.
    ft, dt, mt
        Labels that are only printed and copied into the returned tuples.

    Returns
    -------
    tuple
        ``(correlations, (ft, dt, mt, y_betas))``.

        ``correlations`` is a list of
        ``(ft, dt, name, mt, kind, value)`` tuples, where ``value`` is the first
        element returned by ``pearson`` for measured vs. predicted values;
        ``kind`` is ``'metabolites'`` for per-target entries and
        ``'conditions'`` for per-sample entries.

        ``y_betas`` maps each target to a ``{feature: coefficient}`` dict
        holding the median coefficient over all folds.
    """
    # Single-argument print gives the same output as the Python 2 print
    # statement and is also valid when parsed as a Python 3 function call.
    print('[INFO] %s %s' % (ft, dt))

    # Align matrices
    x = xs.loc[:, ys.columns].dropna(axis=1).T
    y = ys[x.index].T

    # Define cross-validation
    cv = LeaveOneOut(len(y))

    # Run regressions
    y_pred, y_betas = {}, {}
    for m in y:
        y_pred[m] = {}
        betas = []
        target = y[m]

        for train, test in cv:
            # The folds are positions, not labels: use .iloc so that an
            # integer-typed index cannot be mistaken for row labels.
            lm = ElasticNet(alpha=0.01).fit(x.iloc[train], target.iloc[train])

            y_pred[m][x.index[test][0]] = lm.predict(x.iloc[test])[0]
            betas.append(dict(zip(x.columns, lm.coef_)))

        # Summarise the per-fold coefficients by their median
        y_betas[m] = DataFrame(betas).median().to_dict()

    # Restore the row/column order of the measured matrix
    y_pred = DataFrame(y_pred).loc[y.index, y.columns]
    print('[INFO] Regression done:  %s %s' % (ft, dt))

    # Perform correlation with predicted values
    metabolites_corr = [
        (ft, dt, f, mt, 'metabolites', pearson(y[f], y_pred[f])[0])
        for f in y_pred
    ]
    conditions_corr = [
        (ft, dt, s, mt, 'conditions', pearson(y.loc[s], y_pred.loc[s])[0])
        for s in y_pred.index
    ]

    return (metabolites_corr + conditions_corr), (ft, dt, mt, y_betas)
# NOTE(review): the original indentation of this chunk was lost; the nesting
# below is reconstructed from the statements themselves. If this code lives
# inside an enclosing loop, re-indent accordingly.

# For kinase/phosphatase data keep only rows with values in more than 75% of
# the columns.
if df_type == 'Kinases/Phosphatases':
    df = df[(df.count(1) / df.shape[1]) > .75]

# Conditions overlap
conditions = list(set(growth.index).intersection(df))

# PCA analysis (missing values are replaced by 0 before fitting/projecting)
pca = PCA(n_components=n_components).fit(df.T.replace(np.nan, 0))
pca_pc = DataFrame(
    pca.transform(df.T.replace(np.nan, 0)),
    columns=['PC%d' % i for i in range(1, n_components + 1)],
    index=df.columns)

# Plot correlation with PCA
ax = plt.subplot(gs[pos])

cor, pvalue, nmeas = pearson(growth[pca_pc.index], pca_pc[pc])
sns.regplot(growth[pca_pc.index], pca_pc[pc], ax=ax, color='#4c4c4c')

ax.axhline(0, ls='-', lw=0.1, c='black', alpha=.3)
ax.axvline(0, ls='-', lw=0.1, c='black', alpha=.3)

ax.set_title('%s - %s\n(Pearson: %.2f, p-value: %.1e)' % (dataset_type, df_type, cor, pvalue))
ax.set_xlabel('Relative growth (centered)')

# `pc` is a 'PC<number>' column label of `pca_pc`. Parse everything after the
# 'PC' prefix: taking only the last character would turn 'PC10' into 0 and
# then index explained_variance_ratio_[-1].
ax.set_ylabel(
    'PC%d (%.1f%%)' % (int(pc[2:]), pca.explained_variance_ratio_[int(pc[2:]) - 1] * 100))

sns.despine(trim=True, ax=ax)

# Second panel: one row per principal component with its explained variance
ax = plt.subplot(gs[pos + 1])
plot_df = DataFrame(zip(['PC%d' % i for i in range(1, n_components + 1)], pca.explained_variance_ratio_), columns=['PC', 'var'])
# NOTE(review): the original indentation of this chunk was lost; the nesting
# below is reconstructed from the statements themselves. If an enclosing loop
# supplies feature_type / method_type / x / y, re-indent the loops accordingly.

# One ElasticNet per (ion, condition): columns of `y` whose label starts with
# the condition name are held out, the remaining columns are used for fitting.
for m in metabolomics_dyn_ng.index:
    for c in conditions:
        train_cols = [i for i in y if not i.startswith(c)]
        test_cols = [i for i in y if i.startswith(c)]

        # Data used to fit the model
        ys = y.ix[m, train_cols]
        xs = x[ys.index].T

        # Held-out data
        yss = y.ix[m, test_cols]
        xss = x[yss.index].T

        lm = ElasticNet(alpha=0.01).fit(xs, ys)
        pred = Series(lm.predict(xss), index=xss.index)

        # Record the coefficient of every feature for this fit
        features = dict(zip(xs.columns, lm.coef_))
        lm_feat.extend([(feature_type, method_type, m, c, f, features[f]) for f in features])

        # Record measured vs. predicted agreement on the held-out columns
        lm_res.append((feature_type, method_type, m, c, pearson(yss, pred)[0]))

# Turn the accumulated records into tables and attach readable names
lm_res = DataFrame(
    lm_res,
    columns=['feature', 'method', 'ion', 'condition', 'pearson'])
lm_res['metabolite'] = [met_name[i] for i in lm_res['ion']]
print(lm_res.head())

lm_feat = DataFrame(
    lm_feat,
    columns=['feature_type', 'method', 'ion', 'condition', 'feature', 'coefficient'])
lm_feat['m_name'] = [met_name[i] for i in lm_feat['ion']]
lm_feat['f_name'] = [acc_name[i] for i in lm_feat['feature']]
print(lm_feat.head())

# -- Plot
palette = {'TFs': '#34495e', 'Kinases': '#3498db'}

# Correlation boxplots
index_col=0)[k_activity_dyn_comb_gsea.columns]
# NOTE(review): the line above closes a statement that begins before this
# chunk (presumably the load of `metabolomics_dyn_comb` — confirm). The
# original indentation of the chunk was lost and has been reconstructed.

# Keep only rows whose standard deviation across columns exceeds 0.4
metabolomics_dyn_comb = metabolomics_dyn_comb[
    metabolomics_dyn_comb.std(1) > .4]

# Index labels become 4-decimal strings, matching the string id used for `m`
metabolomics_dyn_comb.index = ['%.4f' % i for i in metabolomics_dyn_comb.index]

# Row labels of the lm activities, row labels and column labels of the
# metabolomics matrix
kinases, ions, conditions = set(k_activity_dyn_comb_lm.index), set(
    metabolomics_dyn_comb.index), set(metabolomics_dyn_comb)

# -- Define metabolite to analyse
m = '606.0736'
k_activities = [('gsea', k_activity_dyn_comb_gsea), ('lm', k_activity_dyn_comb_lm)]

# -- Kinase activities correlation
# One record per kinase: first element of `pearson` between its activity
# profile (lm, then gsea) and the profile of metabolite `m` over `conditions`.
m_cor = [(k, pearson(k_activity_dyn_comb_lm.ix[k, conditions], metabolomics_dyn_comb.ix[m, conditions])[0], pearson(k_activity_dyn_comb_gsea.ix[k, conditions], metabolomics_dyn_comb.ix[m, conditions])[0]) for k in kinases]
m_cor = DataFrame(m_cor, columns=['kinase', 'lm_cor', 'gsea_cor']).set_index('kinase')

# Five kinases with the largest absolute correlation for each method.
# NOTE(review): `Series.sort(inplace=False, ...)` is the legacy pandas API;
# newer pandas releases expose this as `sort_values` — confirm the pinned version.
lm_top_features = list(m_cor['lm_cor'].abs().sort(
    inplace=False, ascending=False).head(5).index)
gsea_top_features = list(m_cor['gsea_cor'].abs().sort(
    inplace=False, ascending=False).head(5).index)

# Union of both top-5 lists (duplicates removed)
top_features = list(set(lm_top_features).union(gsea_top_features))

# Long-format records: (metabolite, kinase, condition, method,
# metabolite value, kinase activity) for every top kinase / condition / method
plot_df = [(m, k, c, method, metabolomics_dyn_comb.ix[m, c], df.ix[k, c]) for k in top_features for c in conditions for method, df in k_activities]