def test_outlier_influence_funcs():
    """Smoke test: the summary-table helpers run on two small OLS fits."""
    exog = add_constant(np.random.randn(10, 2))
    endog = exog.sum(1) + np.random.randn(10)

    # Fit on the whole design matrix.
    full_fit = OLS(endog, exog).fit()
    oi.summary_table(full_fit, alpha=0.05)

    # Fit on the first column of the design matrix only.
    single_fit = OLS(endog, exog[:, 0]).fit()
    oi.summary_table(single_fit, alpha=0.05)
    single_fit.get_influence().summary_table()
def test_outlier_influence_funcs():
    """Smoke test for ``oi.summary_table`` and ``OLSInfluence.summary_table``.

    Nothing is asserted; the test passes when no call raises.
    """
    design = add_constant(np.random.randn(10, 2))
    target = design.sum(1) + np.random.randn(10)

    # (regressors, whether to also exercise the influence object)
    cases = ((design, False), (design[:, 0], True))
    for regressors, check_influence in cases:
        fitted = OLS(target, regressors).fit()
        oi.summary_table(fitted, alpha=0.05)
        if check_influence:
            influence = fitted.get_influence()
            influence.summary_table()
def seasonal_prediction(df, df_result, y_name, time_name, new_t, period,
                        show=False, option='cma'):
    """Attach seasonally adjusted predictions to a copy of ``df``.

    ``df`` should be the deseasoned frame if possible.  It must already
    contain a ``'SeaIdx'`` column and, when ``new_t`` is non-empty, a
    ``'SID'`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Observations; not modified.
    df_result : fitted regression result
        Passed to ``sso.summary_table``; its ``predict`` method is used for
        the new time values.
    y_name : str
        Observed column; predictions are stored in ``f'Pre_{y_name}'``.
    time_name : str
        Column that receives the values of ``new_t`` in the appended rows.
    new_t : sequence
        Future time values.  Empty means "in-sample predictions only".
    period, show, option
        Forwarded to ``seasonal_index``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the prediction column and, for non-empty
        ``new_t``, one appended row per new time value (index reset).
    """
    # Local import: the block did not reference pandas directly before.
    import pandas as pd

    pred_col = f'Pre_{y_name}'

    if len(new_t) != 0:
        new_t = np.array(new_t)
        # Result is not needed here; the call is kept because it prints the
        # seasonal index when ``show`` is true.
        seasonal_index(df[y_name], period, show, option=option)

    # Column 2 of the summary-table data is used as the in-sample prediction.
    _, data, _ = sso.summary_table(df_result, alpha=0.05)
    tdf = df.copy()
    tdf[pred_col] = data[:, 2] * tdf['SeaIdx']
    if len(new_t) == 0:
        return tdf

    # has_constant='add' forces the intercept column even when new_t holds a
    # single (or constant) value, which the default 'skip' would leave out.
    trend_proj = df_result.predict(sm.add_constant(new_t, has_constant='add'))

    # Each new row reuses the season id / index found len(new_t) rows before
    # the current end of the (growing) table, exactly as the original loop
    # did; plain lists track that growth without re-copying the frame.
    horizon = len(new_t)
    sid_hist = list(tdf['SID'].values)
    sea_hist = list(tdf['SeaIdx'].values)
    new_rows = []
    for step, t in enumerate(new_t):
        sid = sid_hist[-horizon]
        sea = sea_hist[-horizon]
        new_rows.append({
            time_name: t,
            'SID': sid,
            'SeaIdx': sea,
            pred_col: trend_proj[step] * sea,
        })
        sid_hist.append(sid)
        sea_hist.append(sea)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat([tdf, pd.DataFrame(new_rows)], ignore_index=True)
def test_outlier_influence_funcs(reset_randomstate):
    """Check that ``alpha`` changes columns 6-7 of the summary table."""
    exog = add_constant(np.random.randn(10, 2))
    endog = exog.sum(1) + np.random.randn(10)
    full_fit = OLS(endog, exog).fit()

    data_default = oi.summary_table(full_fit)[1]
    # GH3344 : Check alpha has an effect
    data_01 = oi.summary_table(full_fit, alpha=0.01)[1]
    lower_not_above = data_01[:, 6] <= data_default[:, 6]
    upper_not_below = data_01[:, 7] >= data_default[:, 7]
    assert_(np.all(lower_not_above))
    assert_(np.all(upper_not_below))

    # Smoke-test the single-column fit as well.
    single_fit = OLS(endog, exog[:, 0]).fit()
    oi.summary_table(single_fit, alpha=0.05)
    single_fit.get_influence().summary_table()
def test_outlier_influence_funcs(reset_randomstate):
    """Summary-table smoke test plus a check that ``alpha`` has an effect."""
    design = add_constant(np.random.randn(10, 2))
    target = design.sum(1) + np.random.randn(10)
    model_fit = OLS(target, design).fit()

    table_default = oi.summary_table(model_fit)
    # GH3344 : Check alpha has an effect
    table_01 = oi.summary_table(model_fit, alpha=0.01)
    # Column 6 must not increase and column 7 must not decrease at alpha=0.01.
    low_col, upp_col = 6, 7
    assert_(np.all(table_01[1][:, low_col] <= table_default[1][:, low_col]))
    assert_(np.all(table_01[1][:, upp_col] >= table_default[1][:, upp_col]))

    first_col_fit = OLS(target, design[:, 0]).fit()
    oi.summary_table(first_col_fit, alpha=0.05)
    influence = first_col_fit.get_influence()
    influence.summary_table()
def OLS_fit(S1="600015", S2="600016"):
    """Regress the closing price of ``S2`` on ``S1`` and plot fit and spread.

    Two figures are saved under ``./image`` and shown: observed vs. fitted
    values, and the spread ``S2 - slope * S1`` with its mean and a band of
    one standard deviation either side.
    """
    import statsmodels.api as sm
    from statsmodels.stats.outliers_influence import summary_table

    loader = dbloader("./dataset/training_data")
    first = loader.load_day_a(S1, "20090101", "20091130")["Closing Price"]
    second = loader.load_day_a(S2, "20090101", "20091130")["Closing Price"]
    first, second = align_series(first, second)

    observed = second.values
    design = sm.add_constant(first.values)
    model = sm.OLS(observed, design).fit()
    print(model.summary())
    _, _table, _ = summary_table(model)  # result is not used further

    # Figure 1: observed series against the fitted values.
    plt.plot(observed, label="real")
    plt.plot(model.fittedvalues, label="fitted")
    plt.legend()
    plt.savefig("./image/OLS_{}_{}.png".format(S1, S2))
    plt.show()

    # Figure 2: spread with its mean and +/- one standard deviation.
    slope = model.params[1]
    spread = second - slope * first
    centre = spread.mean()
    width = spread.std()
    levels = [centre, centre + width, centre - width]
    flat_lines = [pd.Series(level, index=spread.index) for level in levels]
    bands = pd.concat([spread] + flat_lines, axis=1)
    bands.columns = ["diff", "mean", "up", "down"]
    bands.plot(figsize=(14, 7))
    plt.savefig("./image/OLS_diff_{}_{}.png".format(S1, S2), dpi=800)
    plt.show()
def simple_CIPIINT_regplot(df, xname, yname, alpha=0.05):
    """Plot a simple regression with its confidence and prediction bands.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the ``xname`` and ``yname`` columns.
    xname, yname : str
        Regressor and response column names; they are spliced into the
        formula ``yname ~ xname``.
    alpha : float
        Significance level handed to ``sso.summary_table``.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    print(
        "\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n|CI PI Interval plot - simple|\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
    )
    print("using alpha = ", alpha)

    # Sort by x so the fitted line and the bands are drawn left to right.
    df_sorted = df.sort_values([xname])
    result = smf.ols(yname + '~' + xname, data=df_sorted).fit()
    x = df_sorted[xname].values
    y = df_sorted[yname].values

    _, data, _ = sso.summary_table(result, alpha=alpha)
    fittedvalues = data[:, 2]
    predict_mean_ci_low, predict_mean_ci_upp = data[:, 4:6].T
    predict_ci_low, predict_ci_upp = data[:, 6:8].T

    points = plt.plot(x, y, 'o', color='gray')[0]
    fit_line = plt.plot(x, fittedvalues, '-', lw=0.5)[0]
    mean_band = plt.plot(x, predict_mean_ci_low, 'r-', lw=0.4)[0]
    plt.plot(x, predict_mean_ci_upp, 'r-', lw=0.4)
    pred_band = plt.plot(x, predict_ci_low, 'b--', lw=0.4)[0]
    plt.plot(x, predict_ci_upp, 'b--', lw=0.4)

    plt.title('CI PI plot')
    plt.xlabel(xname)
    plt.ylabel(yname)
    # Explicit handles: with six plotted lines and four bare labels, the
    # 'prediction interval' label used to land on the red upper mean-CI line.
    plt.legend([points, fit_line, mean_band, pred_band], [
        'data points', 'regression model', 'confidence interval',
        'prediction interval'
    ],
               title='Legends',
               bbox_to_anchor=(1.3, 1),
               prop={'size': 6})
    plt.show()
def multiple_durbin_watson(df, xnames, yname, alpha=0.05):
    """Print a Durbin-Watson style statistic for a multiple regression.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the response and regressor columns.
    xnames : sequence of str
        Regressor column names.
    yname : str
        Response column name.
    alpha : float
        Significance level handed to ``sso.summary_table``.

    The statistic is the sum of squared successive differences divided by the
    sum of squares of column 10 of the summary table (the code's own comment
    calls it the studentized residual).  Intermediate values and the result
    are printed; nothing is returned.
    """
    print("\n\n========== Durbin-Watson ==========\n")
    y_data = df[yname]
    # Select the regressors straight from ``df`` so they keep its index;
    # rebuilding the frame from a bare array gave it a fresh RangeIndex,
    # which statsmodels rejects as misaligned whenever ``df`` has any other
    # index.
    x_data = df[list(xnames)]
    x_data2 = sm.add_constant(x_data)
    result = sm.OLS(y_data, x_data2).fit()

    _, data, ss2 = sso.summary_table(result, alpha=alpha)
    print("Columns in data are: %s" % ss2)

    # Studentized Residual
    SD = data[:, 10]
    x_square_sum = np.vdot(SD, SD)
    print("x_square_sum = ", x_square_sum)
    size = SD.size
    print("size = ", size)

    # x_d[0] stays 0; x_d[i] = SD[i] - SD[i - 1] for i >= 1.
    x_d = np.zeros((size))
    print("x_d = ", x_d)
    x_d[1:] = np.diff(SD)
    print("x_d = ", x_d)

    d = np.vdot(x_d, x_d) / x_square_sum
    print("d = ", d)
def regress(x,y,alpha=.05,xlabel='',ylabel='',title=''):
    """Fit ``y ~ const + x`` by OLS and plot the fit and its residuals.

    Parameters
    ----------
    x : pandas.Series
        Regressor; its index is the x axis of the residual plot.
    y : pandas.Series
        Response.
    alpha : float
        Significance level of the plotted intervals.
    xlabel, ylabel : str
        Axis labels, also used to spell out the fitted equation.
    title : str
        Accepted for interface compatibility; not used.

    Returns
    -------
    numpy.ndarray
        The data array returned by ``summary_table``.
    """
    if x.name == '':
        # Rename a copy so the caller's Series is left untouched.
        x = x.rename('x')
    X = sm.add_constant(x)
    res = sm.OLS(y, X).fit()
    # Honour the caller's alpha (it used to be hard-coded to 0.05).
    _, data, _ = summary_table(res, alpha=alpha)

    fittedvalues = data[:, 2]
    predict_mean_ci_low, predict_mean_ci_upp = data[:, 4:6].T
    predict_ci_low, predict_ci_upp = data[:, 6:8].T

    # Positional access that works whether params is a Series or an array.
    params = list(res.params)
    intercept, slope = params[0], params[1]

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(x, y, label="data", s=5)
    ax.plot(x, fittedvalues, 'r-',
            label='OLS: %g%% confidence' % (100 * (1 - alpha)))
    ax.plot(x, predict_ci_low, 'b--')
    ax.plot(x, predict_ci_upp, 'b--')
    ax.plot(x, predict_mean_ci_low, 'g--')
    ax.plot(x, predict_mean_ci_upp, 'g--')
    # Highlight the most recent observation.
    ax.scatter(x=x.tail(1), y=y.tail(1), color='Red', s=25, label="Now")
    ax.legend(loc='best')
    equation = ylabel + " = %.4f" % slope + " * " + xlabel + " + " + "%.4f" % intercept
    ax.set_xlabel(xlabel + " " + equation, color='g', fontsize=15)
    ax.set_ylabel(ylabel, color='b', fontsize=15)
    plt.show()

    # Column 8 of the summary table is plotted as the residual.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(x.index, data[:, 8])
    ax.set_title("Residual plot: " + equation, fontsize=20)
    ax.set_xlabel("Date ", color='g', fontsize=15)
    ax.set_ylabel("Residual ", color='g', fontsize=15)
    plt.show()
    return data
def linear_regression(self, x, y):
    """Return the OLS fitted values of ``y`` on ``x`` plus a constant."""
    design = sm.add_constant(x)
    fit = sm.OLS(y, design).fit()

    # Column 2 of the summary-table data array holds the values to plot.
    table = summary_table(fit, alpha=0.05)[1]
    return table[:, 2]
def figplot(x, y, sims, clrs, xlab, ylab, fig, n):
    """Draw panel ``n`` of a 2x2 figure: scatter, OLS line and an annotation.

    The annotation combines the power-law form of the ``linregress`` fit
    with two fixed reference lines per panel.  ``sims`` is accepted but not
    used.  Returns ``fig``.
    """
    fig.add_subplot(2, 2, n)
    ys = list(y)
    xs = list(x)
    colours = list(clrs)
    plt.scatter(xs, ys, color=colours, s=2, linewidths=0.0)

    frame = pd.DataFrame({'x': list(xs)})
    frame['y'] = list(ys)
    model = smf.ols('y ~ x', frame).fit()
    table = summary_table(model, alpha=0.05)[1]
    fitted = table[:, 2]
    m, b, r, p, std_err = stats.linregress(xs, ys)

    # panel -> (model symbol, microbe line, macrobe line, text y position)
    panel_text = {
        1: (r'$R_{models}$',
            r'$R_{microbes}$' + ' = 2.34*' + r'$N$' + '$^{0.14}$' + '\n',
            r'$R_{macrobes}$' + ' = 1.7*' + r'$N$' + '$^{0.11}$',
            1.4),
        2: (r'$D_{models}$',
            r'$D_{microbes}$' + ' = 0.44*' + r'$N$' + '$^{0.92}$' + '\n',
            r'$D_{macrobes}$' + ' = 0.23*' + r'$N$' + '$^{0.99}$',
            2.5),
        3: (r'$E_{models}$',
            r'$E_{microbes}$' + ' = 0.58*' + r'$N$' + '$^{-0.23}$' + '\n',
            r'$E_{macrobes}$' + ' = 1.15*' + r'$N$' + '$^{-0.21}$',
            -3.4),
        4: (r'$S_{models}$',
            r'$S_{microbes}$' + ' = 1.77*' + r'$N$' + '$^{0.38}$' + '\n',
            r'$S_{macrobes}$' + ' = 1.77*' + r'$N$' + '$^{0.24}$',
            2.8),
    }
    if n in panel_text:
        head, microbes, macrobes, text_y = panel_text[n]
        lab = head + ' = ' + str(round(
            10**b, 2)) + '*' + r'$N$' + '$^{' + str(round(m, 2)) + '}$' + '\n'
        lab += microbes
        lab += macrobes
        plt.text(0.2, text_y, lab, fontsize=7)

    if n == 3:
        plt.legend(loc='best', fontsize=6, frameon=False)

    plt.plot(xs, fitted, color='k', ls='--', lw=1.0, alpha=0.9)
    plt.xlabel(xlab, fontsize=8)
    plt.ylabel(ylab, fontsize=8)
    plt.tick_params(axis='both', labelsize=5)
    plt.xlim(0, 1.05 * max(xs))

    # Panel-specific vertical limits.
    if n in (1, 2):
        plt.ylim(0.0, max(ys))
    elif n == 3:
        plt.ylim(min(ys), 0)
    elif n == 4:
        plt.ylim(0.4, max(ys))
    return fig
def linearR(df, column1, column2):
    """Return the OLS fitted values of ``column2`` on ``column1``.

    Parameters
    ----------
    df : object exposing ``toPandas()``
        Presumably a Spark DataFrame - confirm against the caller.
    column1 : str
        Regressor column; a constant is added to it.
    column2 : str
        Response column.

    Returns
    -------
    numpy.ndarray
        Column 2 of the ``summary_table`` data array.
    """
    # Convert once: every toPandas() call converts the frame again.
    pdf = df.toPandas()
    x = sm.add_constant(pdf[column1])
    y = pdf[column2]
    res = sm.OLS(y, x).fit()
    _, data, _ = summary_table(res, alpha=0.05)
    return data[:, 2]
def figplot(clrs, x, y, xlab, ylab, fig, n):
    """Draw panel ``n`` of a 2x2 figure: log-log scatter, OLS fit and band.

    The regression is fitted to ``log10(x)`` and ``log10(y)``; the fitted
    line and the band taken from columns 6-7 of the summary table are
    transformed back with ``10**`` before plotting.  The marker size is read
    from the module-level name ``sz``.

    Parameters
    ----------
    clrs : sequence
        One colour per point.
    x, y : array-like of positive numbers
    xlab, ylab : str
        Axis labels.
    fig : matplotlib figure
        Receives the new subplot.
    n : int
        Panel number, 1-4; selects the legend text and the y limits.

    Returns
    -------
    The ``fig`` that was passed in.
    """
    fig.add_subplot(2, 2, n)
    plt.xscale('log')
    # An extra ``plt.yscale('log', subsy=[1, 2])`` call for n == 1 was
    # dropped: the next line replaced its scale immediately, and the
    # ``subsy`` keyword no longer exists in current Matplotlib.
    plt.yscale('log')
    plt.minorticks_off()

    log_x = np.log10(x)
    log_y = np.log10(y)
    d = pd.DataFrame({'x': log_x})
    d['y'] = log_y
    f = smf.ols('y ~ x', d).fit()
    m, b, r, p, std_err = stats.linregress(log_x, log_y)

    _, data, _ = summary_table(f, alpha=0.05)
    fitted = data[:, 2]
    ci_low, ci_upp = data[:, 6:8].T

    # Sort everything by x so the line and the band are drawn left to right.
    x, y, fitted, ci_low, ci_upp, clrs = zip(
        *sorted(zip(x, y, fitted, ci_low, ci_upp, clrs)))

    x = np.array(x)
    y = np.array(y)
    fitted = 10**np.array(fitted)
    ci_low = 10**np.array(ci_low)
    ci_upp = 10**np.array(ci_upp)

    # add_subplot(2, 2, n) above already restricts n to 1..4.
    panel_symbol = {1: r'$rarity$', 2: r'$Nmax$', 3: r'$Ev$', 4: r'$S$'}
    lbl = panel_symbol[n] + ' = ' + str(round(
        10**b, 1)) + '*' + r'$N$' + '$^{' + str(round(m, 2)) + '}$'

    plt.scatter(x, y, s=sz, color=clrs, linewidths=0.0, edgecolor=None)
    plt.fill_between(x, ci_upp, ci_low, color='0.5', lw=0.1, alpha=0.2)
    plt.plot(x, fitted, color='k', ls='--', lw=0.5, label=lbl)

    if n == 3:
        plt.legend(loc=3, fontsize=8, frameon=False)
    else:
        plt.legend(loc=2, fontsize=8, frameon=False)

    plt.xlabel(xlab, fontsize=10)
    plt.ylabel(ylab, fontsize=10)
    plt.tick_params(axis='both', labelsize=6)
    if n in [2, 4]:
        plt.ylim(min(y), max(y))
    elif n == 1:
        plt.ylim(min(ci_low), max(ci_upp))
    elif n == 3:
        plt.ylim(0.1, 1.1)
    return fig
def fixed_effect(treat, control, k, SEED):
    """Export the stacked panel as CSV (the fixed-effects fit is disabled).

    Reads the module-level names ``T``, ``N_tr``, ``N_co``, ``T0`` and
    ``synthetic_path``.

    Parameters
    ----------
    treat, control : numpy.ndarray
        Outcomes of the treated and control units.  Presumably shaped
        ``(N_tr, T)`` and ``(N_co, T)`` - TODO confirm against the caller.
    k : value used only in the output file names of the disabled part.
    SEED : value used in the output file names.

    NOTE(review): the bare ``return`` right after ``data.to_csv(...)`` makes
    every statement below it unreachable, so as written the function only
    writes ``data_<SEED>.csv`` and returns ``None``.  Whether that
    short-circuit is deliberate cannot be told from this file; the dead code
    is kept unchanged.
    """
    # Long-format panel: one row per (unit, time) pair.
    x = np.arange(T)
    x = np.concatenate([x for _ in range(N_tr+N_co)]).reshape(-1,1)
    units = np.concatenate([[i for _ in range(T)] for i in range(N_tr+N_co)]).reshape(-1,1)
    # 1.0 for units numbered below N_tr at times >= T0, else 0.0.
    treated = np.logical_and((units<N_tr), (x>=T0)).astype("float")
    y = np.concatenate([treat.reshape(-1,1),control.reshape(-1,1)])
    COLUMNS = ["time", "y", "unit", "treated"]
    data = pd.DataFrame(np.concatenate((x,y,units,treated),axis=1),columns=COLUMNS)
    data.to_csv(synthetic_path+"/data_{}.csv".format(SEED), index=False)
    return

    # ---- unreachable from here on (see NOTE in the docstring) ----
    fit = ols('y ~ 1 + C(time) + C(unit) + treated:C(time)', data=data).fit()
    ypred = fit.predict(data)
    m_tr = ypred[:N_tr*T].to_numpy().reshape(N_tr,T)
    m_co = ypred[N_tr*T:].to_numpy().reshape(N_co,T)
    # print(fit.summary())
    # Subtract the per-period treatment coefficient from the treated
    # predictions for t >= T0.
    for t in range(T0, T, 1):
        m_tr[:, t] -= fit.params["treated:C(time)[{}.0]".format(t)]
    # ``data`` is rebound here: from the DataFrame to the summary-table array.
    _, data, _ = summary_table(fit, alpha=0.05)
    predict_mean_ci_lower, predict_mean_ci_upper = data[:, 4:6].T
    lower_tr = predict_mean_ci_lower[:N_tr*T].reshape(N_tr,T)
    upper_tr = predict_mean_ci_upper[:N_tr*T].reshape(N_tr,T)
    lower_co = predict_mean_ci_lower[N_tr*T:].reshape(N_co,T)
    upper_co = predict_mean_ci_upper[N_tr*T:].reshape(N_co,T)
    # The lower bound is shifted by column 1 of conf_int() and the upper
    # bound by column 0.
    for t in range(T0, T, 1):
        lower_tr[:, t] -= fit.conf_int().loc["treated:C(time)[{}.0]".format(t),1]
        upper_tr[:, t] -= fit.conf_int().loc["treated:C(time)[{}.0]".format(t),0]
    test_t = np.arange(T)
    # plt.plot(test_t, np.mean(control, axis=0), color='grey', alpha=0.8)
    # plt.plot(test_t, np.mean(m_co, axis=0), 'k--', linewidth=1.0, label='Estimated Y(0)')
    # plt.fill_between(test_t, np.mean(lower_co, axis=0), np.mean(upper_co, axis=0), alpha=0.5)
    # plt.show()
    # Rows: point estimate, then the two band edges, each averaged over units.
    ATT = np.stack([np.mean(treat-m_tr, axis=0), np.mean(treat-upper_tr, axis=0), np.mean(treat-lower_tr, axis=0)])
    plt.rcParams["figure.figsize"] = (15,5)
    plt.plot(test_t, ATT[0],'k--', linewidth=1.0, label="Estimated ATT")
    plt.fill_between(test_t, ATT[1], ATT[2], alpha=0.5, label="ATT 95% CI")
    plt.legend(loc=2)
    plt.savefig(synthetic_path+"/fixedeffect{}_{}.png".format(k, SEED))
    plt.close()
    np.savetxt(synthetic_path+"/fixedeffect{}_{}.csv".format(k, SEED), ATT, delimiter=",")
def figplot(x, y, xlab, ylab, fig, n, binned=1):
    '''Main figure plotting function.

    Draws panel ``n`` of a 3x3 figure: scatter of ``log10(y)`` against
    ``log10(x)`` (optionally bin-averaged), the OLS line, the band from
    columns 6-7 of the summary table, and the slope in the panel title.

    Parameters
    ----------
    x, y : array-like of positive numbers
    xlab, ylab : str
        Axis labels.
    fig : matplotlib figure
        Receives the new subplot.
    n : int
        Panel number within the 3x3 grid.
    binned : int
        When 1, points are averaged within bins of the ``xfrm``-transformed
        x values before the regression is fitted.

    Returns
    -------
    The ``fig`` that was passed in.
    '''
    fig.add_subplot(3, 3, n)
    x = np.log10(x)
    y = np.log10(y)
    y2 = list(y)
    x2 = list(x)

    if binned == 1:
        X, Y = (np.array(t) for t in zip(*sorted(zip(x2, y2))))
        Xi = xfrm(X, max(X) * 1.05)
        bins = np.linspace(np.min(Xi), np.max(Xi) + 1, 100)
        ii = np.digitize(Xi, bins)
        # Mean of the points in every non-empty bin.
        occupied = [i for i in range(1, len(bins)) if len(Y[ii == i]) > 0]
        y2 = np.array([np.mean(Y[ii == i]) for i in occupied])
        x2 = np.array([np.mean(X[ii == i]) for i in occupied])

    d = pd.DataFrame({'size': list(x2)})
    d['rate'] = list(y2)
    f = smf.ols('rate ~ size', d).fit()
    # Look the slope up by name: integer keys on a label-indexed Series are
    # deprecated positional access in pandas.
    coef = f.params['size']

    _, data, _ = summary_table(f, alpha=0.05)
    fitted = data[:, 2]
    ci_low, ci_upp = data[:, 6:8].T

    # Sort by x so the line and the band are drawn left to right.
    x2, y2, fitted, ci_low, ci_upp = zip(
        *sorted(zip(x2, y2, fitted, ci_low, ci_upp)))

    plt.scatter(x2,
                y2,
                color='SkyBlue',
                alpha=1,
                s=12,
                linewidths=0.5,
                edgecolor='Steelblue')
    plt.fill_between(x2, ci_upp, ci_low, color='b', lw=0.1, alpha=0.15)
    plt.plot(x2, fitted, color='b', ls='--', lw=1.0, alpha=0.9)
    plt.xlabel(xlab, fontsize=10)
    plt.ylabel(ylab, fontsize=10)
    plt.tick_params(axis='both', labelsize=6)
    plt.xlim(0.9 * min(x2), 1.1 * max(x2))
    plt.ylim(min(ci_low), max(ci_upp))
    plt.title('$z$ = ' + str(round(coef, 2)), fontsize=10)
    return fig
def ciAnalysis(re,x,y):
    """Plot the data, the fitted line and both interval bands (alpha=0.10).

    ``re`` is the fitted regression result handed to ``summary_table``.
    """
    table = summary_table(re, alpha=0.10)[1]

    plt.plot(x, y, 'o')
    plt.plot(x, table[:, 2], '-', lw=2)
    # Columns 6-7 first, then columns 4-5, all drawn as dashed red lines.
    for column in (6, 7, 4, 5):
        plt.plot(x, table[:, column], 'r--', lw=2)
    plt.show()
def _seasonal_means(ratios, period, show):
    """Average ``ratios`` per season slot and scale the means to sum to ``period``."""
    n = period
    position = np.arange(1, len(ratios) + 1)
    # 1-based slot within the period; a remainder of 0 is the last slot.
    slot = position - np.floor(position / n) * n
    slot[np.where(slot == 0)] = n
    frame = pd.DataFrame({'ratio': ratios, 'SIid': slot})

    raw = np.zeros(n)
    for j in range(1, n + 1):
        raw[j - 1] = frame['ratio'][frame['SIid'] == j].dropna().mean()
    index = raw / sum(raw) * n
    if show:
        print('Seasonal Index:', index)
    return index, frame['SIid']


def seasonal_index(y_v, period: int, show=False, option='cma'):
    """
    return seasonal index and a Series with all observations linked to its
    seasonal index

    Parameters
    ----------
    y_v : pandas.Series or array-like
        Observed values.
    period : int
        Number of observations per seasonal cycle.
    show : bool
        Print the seasonal index when true.
    option : {'cma', 'lr'}
        'cma' divides the observations by ``cma(y_v, period)``; 'lr' divides
        them by the fitted values of an OLS regression on a 1..N time index.

    Returns
    -------
    (numpy.ndarray, pandas.Series)
        The ``period`` seasonal indices (scaled to sum to ``period``) and the
        slot number of every observation.

    Raises
    ------
    ValueError
        For an unknown ``option`` (this used to return ``None`` silently).
    """
    if option not in ('cma', 'lr'):
        raise ValueError(
            "option must be 'cma' or 'lr', got {!r}".format(option))

    if option == 'cma':
        ratios = y_v / cma(y_v, period)
        return _seasonal_means(ratios, period, show)

    # option == 'lr': ratio of the observations to a fitted linear trend.
    # The time regressor reuses the index of y_v (when it has one) so that
    # statsmodels does not reject the pair as misaligned.
    X_data = pd.DataFrame({'Time': np.arange(1, len(y_v) + 1)},
                          index=getattr(y_v, 'index', None))
    X_data = sm.add_constant(X_data)
    result_reg = sm.OLS(y_v, X_data).fit()
    _, data, _ = sso.summary_table(result_reg, alpha=0.05)
    ratios = y_v / data[:, 2]
    return _seasonal_means(ratios, period, show)
def figplot(clrs, x, y, xlab, ylab, fig, n):
    """Plot one log-log panel (``n`` of a 2x2 grid) with its OLS fit.

    A regression of ``log10(y)`` on ``log10(x)`` supplies the fitted line
    and the band (columns 6-7 of the summary table); both are mapped back
    with ``10**`` for plotting.  The legend shows the power-law form of the
    ``linregress`` fit.  The marker size comes from the module-level ``sz``.

    Parameters
    ----------
    clrs : sequence
        One colour per point.
    x, y : array-like of positive numbers
    xlab, ylab : str
        Axis labels.
    fig : matplotlib figure
        Receives the new subplot.
    n : int
        Panel number, 1-4.

    Returns
    -------
    The ``fig`` that was passed in.
    """
    fig.add_subplot(2, 2, n)
    plt.xscale('log')
    # The former n == 1 call ``plt.yscale('log', subsy=[1, 2])`` is gone:
    # it was overridden by the call below and ``subsy`` is no longer a
    # valid keyword in current Matplotlib.
    plt.yscale('log')
    plt.minorticks_off()

    log_x, log_y = np.log10(x), np.log10(y)
    d = pd.DataFrame({'x': log_x})
    d['y'] = log_y
    f = smf.ols('y ~ x', d).fit()
    m, b, r, p, std_err = stats.linregress(log_x, log_y)

    _, data, _ = summary_table(f, alpha=0.05)
    fitted = data[:, 2]
    ci_low, ci_upp = data[:, 6:8].T

    # Order all series by x before drawing lines and the band.
    x, y, fitted, ci_low, ci_upp, clrs = zip(
        *sorted(zip(x, y, fitted, ci_low, ci_upp, clrs)))

    x = np.array(x)
    y = np.array(y)
    fitted = 10**np.array(fitted)
    ci_low = 10**np.array(ci_low)
    ci_upp = 10**np.array(ci_upp)

    # add_subplot(2, 2, n) above already restricts n to 1..4.
    symbols = {1: r'$rarity$', 2: r'$Nmax$', 3: r'$Ev$', 4: r'$S$'}
    lbl = symbols[n] + ' = ' + str(round(10**b, 1)) + '*' + r'$N$' + '$^{' + str(round(m, 2)) + '}$'

    plt.scatter(x, y, s=sz, color=clrs, linewidths=0.0, edgecolor=None)
    plt.fill_between(x, ci_upp, ci_low, color='0.5', lw=0.1, alpha=0.2)
    plt.plot(x, fitted, color='k', ls='--', lw=0.5, label=lbl)

    legend_loc = 3 if n == 3 else 2
    plt.legend(loc=legend_loc, fontsize=8, frameon=False)

    plt.xlabel(xlab, fontsize=10)
    plt.ylabel(ylab, fontsize=10)
    plt.tick_params(axis='both', labelsize=6)
    if n in [2, 4]:
        plt.ylim(min(y), max(y))
    elif n == 1:
        plt.ylim(min(ci_low), max(ci_upp))
    elif n == 3:
        plt.ylim(0.1, 1.1)
    return fig
def lm(x, y, alpha=ALPHA):
    """Fit an OLS line with statsmodels.

    Returns the tuple (fitted values, lower mean CI, upper mean CI), each a
    column of the summary table.
    """
    x = plot_friendly(x)
    y = plot_friendly(y)
    if _isdate(x[0]):
        # Dates are regressed on their proleptic ordinal.
        x = np.array([stamp.toordinal() for stamp in x])

    design = sm.add_constant(x)
    fit = sm.OLS(y, design).fit()
    _prstd, _iv_low, _iv_upp = wls_prediction_std(fit)  # not used further

    _, values, names = summary_table(fit, alpha=alpha)
    table = pd.DataFrame(values, columns=[snakify(name) for name in names])

    wanted = (
        'predicted_value',
        'std_error_mean_predict',
        'mean_ci_95%_low',
        'mean_ci_95%_upp',
        'predict_ci_95%_low',
        'predict_ci_95%_upp',
    )
    column = {name: table[name] for name in wanted}
    return (column['predicted_value'], column['mean_ci_95%_low'],
            column['mean_ci_95%_upp'])
def lm(x, y, alpha=ALPHA):
    """Fit an OLS regression of ``y`` on ``x`` and return a tuple.

    The tuple holds the fitted values and the lower / upper mean confidence
    bounds, read from the snake-cased summary table.
    """
    prepared = [plot_friendly(series) for series in (x, y)]
    x, y = prepared
    if _isdate(x[0]):
        x = np.array([day.toordinal() for day in x])

    fit = sm.OLS(y, sm.add_constant(x)).fit()
    _std, _lower, _upper = wls_prediction_std(fit)  # result is not used

    _, rows, headers = summary_table(fit, alpha=alpha)
    snake_headers = [snakify(header) for header in headers]
    table = pd.DataFrame(rows, columns=snake_headers)

    fitted = table['predicted_value']
    # These lookups are kept although their values are not returned.
    table['std_error_mean_predict']
    mean_low = table['mean_ci_95%_low']
    mean_upp = table['mean_ci_95%_upp']
    table['predict_ci_95%_low']
    table['predict_ci_95%_upp']
    return (fitted, mean_low, mean_upp)
def _fit_reg(fit_reg, ci, ax, x, y, data, color, line_kws): if not fit_reg: return None if ci is None: ci = 0 if ci < 0 or ci >= 100: raise ValueError('ci must be between 0 and 100 or `None`') if line_kws is None: line_kws = {} if 'lw' not in line_kws: line_kws['lw'] = 3 X = data[x].values if len(X) == 1: return None idx_order = X.argsort() y = data[y].values if len(X) == 2: ax.plot(X, y, color=color, **line_kws) return None X = sm.add_constant(X) # if all x's are the same value, there can be no regression line if X.shape[1] == 1: return 1 ols = sm.OLS(y, X).fit() pred_obj = ols.get_prediction() pred = pred_obj.predicted_mean[idx_order] try: ax.plot(X[idx_order, 1], pred, color=color, **line_kws) except IndexError: print(f"col is {x}") print(X.shape) print(data[x].values) print(X) if ci != 0: st, data, ss2 = summary_table(ols, alpha=1 - ci / 100) ax.fill_between(X[idx_order, 1], data[idx_order, 4], data[idx_order, 5], alpha=.3, color=color)
def simple_outliers_DIY(df, xname, yname, alpha=0.05):
    """Report observations whose residual in column 10 lies outside +/-2.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the ``xname`` and ``yname`` columns.
    xname, yname : str
        Regressor and response names, spliced into ``yname ~ xname``.
    alpha : float
        Significance level handed to ``sso.summary_table``.

    Returns
    -------
    pandas.Index
        Zero-based row positions of the flagged observations; the printed
        "Actual ID" is this value plus one.
    """
    # Fit regression model
    result = smf.ols(yname + '~' + xname, data=df).fit()

    # studentized residual
    _, data1, _ = sso.summary_table(result, alpha=alpha)
    STD_Residual = data1[:, 10]

    print("◆ Outliers Finding\n")
    print("(remove by yourself!)\n")

    df_out = pd.DataFrame(STD_Residual, columns=['SD'])
    # ``outside`` replaces a local that shadowed the builtin ``filter``.
    outside = (df_out['SD'] < -2) | (df_out['SD'] > 2)
    flagged = df_out['SD'].loc[outside]
    print("Outliers by SD = ")
    print(flagged)
    print("\nActual ID: ", flagged.index + 1)
    return flagged.index
def lm(x, y, alpha=ALPHA):
    """Fit an OLS line with statsmodels.

    Returns the tuple (x, fitted values, lower mean CI, upper mean CI); the
    last three are plain arrays.  Date-like ``x`` is converted to ordinals
    for the fit and back to ``Timestamp`` objects for the result.
    """
    x_is_date = _isdate(x.iloc[0])
    if x_is_date:
        x = np.array([stamp.toordinal() for stamp in x])

    fit = sm.OLS(y, sm.add_constant(x)).fit()
    _prstd, _iv_low, _iv_upp = wls_prediction_std(fit)  # not used further

    _, values, names = summary_table(fit, alpha=alpha)
    table = pd.DataFrame(values, columns=[_snakify(name) for name in names])

    # TODO: indexing w/ data frame is messing everything up
    wanted = (
        'predicted_value',
        'mean_ci_95%_low',
        'mean_ci_95%_upp',
        'predict_ci_95%_low',
        'predict_ci_95%_upp',
    )
    arrays = {name: table[name].values for name in wanted}

    if x_is_date:
        x = [Timestamp.fromordinal(int(ordinal)) for ordinal in x]
    return (x, arrays['predicted_value'], arrays['mean_ci_95%_low'],
            arrays['mean_ci_95%_upp'])
def lm(x, y, alpha=ALPHA):
    """Fit an OLS line with statsmodels.

    Returns the tuple (x, fitted values, lower mean CI, upper mean CI) with
    the last three converted to lists.
    """
    import statsmodels.api as sm
    from statsmodels.sandbox.regression.predstd import wls_prediction_std
    from statsmodels.stats.outliers_influence import summary_table

    x = plot_friendly(x)
    y = plot_friendly(y)
    if _isdate(x[0]):
        x = np.array([stamp.toordinal() for stamp in x])

    design = sm.add_constant(x)
    fit = sm.OLS(y, design).fit()
    _prstd, _iv_low, _iv_upp = wls_prediction_std(fit)  # not used further

    _, values, names = summary_table(fit, alpha=alpha)
    table = pd.DataFrame(values, columns=[snakify(name) for name in names])

    wanted = (
        'predicted_value',
        'std_error_mean_predict',
        'mean_ci_95%_low',
        'mean_ci_95%_upp',
        'predict_ci_95%_low',
        'predict_ci_95%_upp',
    )
    column = {name: table[name] for name in wanted}
    returned = ('predicted_value', 'mean_ci_95%_low', 'mean_ci_95%_upp')
    fitted, mean_low, mean_upp = (column[name].tolist() for name in returned)
    return (x, fitted, mean_low, mean_upp)
def get_pred_interval_sm(y, X, dfx, pi = 0.95):
    """Fit OLS on columns of ``dfx`` and return the prediction band.

    ``y`` and ``X`` select the response and regressor columns of ``dfx``;
    ``pi`` is the interval level, passed on as ``alpha = 1 - pi``.  The
    model summary is printed.

    Returns (lower bound, fitted values, upper bound), taken from columns
    6, 2 and 7 of the summary table.
    """
    import statsmodels.api as sm
    from statsmodels.sandbox.regression.predstd import wls_prediction_std
    from statsmodels.stats.outliers_influence import summary_table

    frame = dfx.copy()
    response = frame[y]
    design = sm.add_constant(frame[X])
    fit = sm.OLS(response, design).fit()
    print(fit.summary())

    table = summary_table(fit, alpha=1-pi)[1]
    lower, upper = table[:, 6:8].T
    return lower, table[:, 2], upper
def compare_two_tickers(TICKER_A, TICKER_B):
    """Plot two price series and run an ADF test on their hedged spread.

    Prices for both tickers are fetched for a fixed window, plotted as
    series and as a scatter, and ``TICKER_B`` is regressed on ``TICKER_A``
    (no intercept) to obtain the hedge ratio.  The regression residual is
    plotted and handed to ``calculate_adf``.

    Parameters
    ----------
    TICKER_A, TICKER_B : str
        Ticker symbols; they double as column names.
    """
    start = datetime.datetime(2017, 5, 1)
    end = datetime.datetime(2018, 5, 27)

    df = pd.DataFrame(columns=(TICKER_A, TICKER_B))
    df[TICKER_A] = retrieve_ticker_prices(TICKER_A, start, end)
    df[TICKER_B] = retrieve_ticker_prices(TICKER_B, start, end)
    print(df)

    # Plot the two time series
    plot_price_series(df, TICKER_A, TICKER_B, start, end)

    # Display a scatter plot of the two time series
    plot_scatter_series(df, TICKER_A, TICKER_B)

    # Calculate optimal hedge ratio "beta": TICKER_B ~ beta * TICKER_A.
    res = sm.OLS(endog=df[TICKER_B], exog=df[TICKER_A]).fit()
    beta_hr = res.params[TICKER_A]

    # Calculate the residuals of the linear combination.  The regression
    # models B as beta * A, so the residual is B - beta * A (the operands
    # used to be swapped).
    df["res"] = df[TICKER_B] - beta_hr * df[TICKER_A]
    print('===============')
    print(beta_hr)
    print('===============')
    print(df['res'])

    # Plot the residuals
    plot_residuals(df, start, end)

    # Calculate and output the CADF test on the residuals
    calculate_adf(df['res'])
def drawLinearRegressionByTail(x, y, alpha, ax):
    """Draw the OLS fit of ``y`` on ``x`` and both interval bands on ``ax``.

    No constant is added to ``x``.  ``alpha`` is the significance level
    handed to ``summary_table``.
    """
    x = np.array(x)
    fit = sm.OLS(y, x).fit()
    table = summary_table(fit, alpha=alpha)[1]

    mean_low, mean_upp = table[:, 4:6].T
    pred_low, pred_upp = table[:, 6:8].T

    ax.plot(x, table[:, 2], 'r-', label='Параметрична пряма')

    # Bands from columns 6-7 (black) and columns 4-5 (gray); only the lower
    # edge of each band carries a legend label.
    ax.plot(x, pred_low, 'black', label='Толерантні межі', linestyle='dashed')
    ax.plot(x, pred_upp, 'black', linestyle='dashed')
    ax.plot(x, mean_low, 'gray', label='Довірчий інтервал', linestyle='dashed')
    ax.plot(x, mean_upp, 'gray', linestyle='dashed')

    ax.legend(loc='best', fontsize='x-small')
def get_regression_summary(result, conf_int=0.95, columns=None,
                           as_dataframe=False):
    """Return the statsmodels summary table with snake-case column names.

    Parameters
    ----------
    result : fitted regression result
        Handed to ``summary_table``.
    conf_int : float
        Confidence level of the intervals, e.g. 0.95.
    columns : optional
        Accepted for interface compatibility; currently not used.
    as_dataframe : bool
        When true, the table is returned as a ``pandas.DataFrame``.

    Returns
    -------
    pandas.DataFrame or None
        The renamed table when ``as_dataframe`` is true, otherwise ``None``.
        Headers missing from the mapping become ``None``.
    """
    column_mapper = {
        'Obs': 'obs',
        'Dep Var\nPopulation': 'dep_var_population',
        'Predicted\nValue': 'predicted_value',
        'Std Error\nMean Predict': 'std_error_mean_predict',
        'Mean ci\n95% low': 'mean_ci_lo',
        'Mean ci\n95% upp': 'mean_ci_up',
        'Predict ci\n95% low': 'pred_ci_lo',
        'Predict ci\n95% upp': 'pred_ci_up',
        'Residual': 'residual',
        'Std Error\nResidual': 'std_error_residual',
        'Student\nResidual': 'student_residual',
        "Cook's\nD": 'cooks'
    }
    # summary_table expects the significance level, not the confidence
    # level; passing 0.95 straight through produced 5% intervals.
    _, data_table, table_columns = summary_table(result, alpha=1 - conf_int)
    table_columns = [column_mapper.get(c) for c in table_columns]
    if as_dataframe:
        return pd.DataFrame(data_table, columns=table_columns)
def _load_error_arrays(filename):
    """Read the three pickled lists and return (mses, abs_errs) without near-zero entries."""
    # NOTE(security): unpickling can execute arbitrary code; only feed this
    # function files produced by a trusted run.
    with open(filename, 'rb') as handle:
        cPickle.load(handle)  # first object (raw errors) is not used
        abs_errs = np.array(cPickle.load(handle))
        mses = np.array(cPickle.load(handle))

    # ensure that we don't actually take the log of 0
    # FIXME: we may need to make this more dynamic so that it doesn't skew results.
    keep = mses > 0.0
    mses = mses[keep]
    abs_errs = abs_errs[keep]
    keep = abs_errs > 1.0E-14
    return mses[keep], abs_errs[keep]


def calcErrDistBug(in_filename,gold_in_filename,out_filename,title):
    """Count outliers of a mutated run against a model fitted to the gold run.

    An OLS model ``log2(mse) ~ const + log2(abs_err)`` is fitted to the gold
    data.  Every fifth sample of the mutated data is then handed, together
    with that model, to ``returnOutliers``.

    Parameters
    ----------
    in_filename : str
        Pickle file of the mutated run (three pickled objects in a row).
    gold_in_filename : str
        Pickle file of the gold run, same layout.
    out_filename, title
        Accepted for interface compatibility; not used.

    Returns
    -------
    (int, int)
        Number of outliers and number of mutated samples examined.
    """
    mses, abs_errs = _load_error_arrays(in_filename)
    g_mses, g_abs_errs = _load_error_arrays(gold_in_filename)

    # Thin out the mutated samples.
    skip = 5
    mses = mses[::skip]
    abs_errs = abs_errs[::skip]

    # determine Ordinary Least Squares on the gold data
    X = sm.add_constant(np.log2(g_abs_errs))
    results = sm.OLS(np.log2(g_mses), X).fit()

    o_x, o_y = returnOutliers(results,
                              sm.add_constant(np.log2(abs_errs)),
                              np.log2(mses),
                              alpha=0.05)
    return len(o_x), len(mses)
def Fig1(ref, Ones):
    """Plot four dominance metrics against log10(N) for micro- and macro-organisms.

    For every metric the data sets are resampled ``its`` times; each resample
    is fitted with ``y ~ N * Kind`` and the intercepts, slopes and R-squared
    values are averaged for the panel annotation.  The points, fit summaries
    and 95% mean confidence bands that are drawn/printed come from the last
    resample only.  The finished 2x2 figure is saved under
    ``mydir + '/figs/appendix/Dominance/'``.

    Parameters
    ----------
    ref : str
        ``'ClosedRef'`` or ``'OpenRef'``; selects the EMP data set and the
        output file name.
    Ones : str
        ``'Y'`` to read ``-SADMetricData.txt`` files, ``'N'`` to read the
        ``-SADMetricData_NoMicrobe1s.txt`` files.

    Raises
    ------
    ValueError
        If ``ref`` or ``Ones`` is not one of the accepted values.
    """
    if ref == 'ClosedRef':
        emp = 'EMPclosed'
    elif ref == 'OpenRef':
        emp = 'EMPopen'
    else:
        raise ValueError("ref must be 'ClosedRef' or 'OpenRef', got %r" % (ref,))

    if Ones == 'N':
        suffix = '-SADMetricData_NoMicrobe1s.txt'
    elif Ones == 'Y':
        suffix = '-SADMetricData.txt'
    else:
        raise ValueError("Ones must be 'Y' or 'N', got %r" % (Ones,))

    # all microbe data is MGRAST
    good_names = frozenset([emp, 'HMP', 'BIGN', 'TARA', 'BOVINE', 'HUMAN',
                            'LAUB', 'SED', 'CHU', 'CHINA', 'CATLIN', 'FUNGI',
                            'HYDRO', 'BBS', 'CBC', 'MCDB', 'GENTRY', 'FIA'])
    small = frozenset(['BIGN', 'BOVINE', 'CHU', 'LAUB', 'SED'])

    # One entry per usable data set: [name, kind, number of lines in its file].
    datasets = []
    for kind in ('micro', 'macro'):
        for name in os.listdir(mydir + 'data/' + kind):
            if name not in good_names:
                continue
            path = mydir + 'data/' + kind + '/' + name + '/' + name + suffix
            with open(path) as handle:
                num_lines = sum(1 for _ in handle)
            datasets.append([name, kind, num_lines])
            print('%s %s' % (name, num_lines))

    metrics = ['Nmax, $log_{10}$', 'McNaughton', 'Berger-Parker', "Simpson's D"]

    # Per panel: y-limits, position of the R^2 label, and positions of the
    # micro/macro equation labels (None where the equations are not drawn).
    panels = [
        ((0, 6), (1.5, 4.0), ((1.5, 5.3), (1.5, 4.7))),
        ((0, 120), (5.0, 90), None),
        ((0, 1.2), (1.5, 1.0), None),
        ((0, 1.3), (1.5, 1.1), None),
    ]

    fs = 10  # font size used across figures
    its = 10000

    fig = plt.figure()
    for index, metric in enumerate(metrics):
        fig.add_subplot(2, 2, index + 1)
        MicIntList, MicCoefList, MacIntList, MacCoefList, R2List = [], [], [], [], []

        for n in range(its):
            print('%s %s' % (n, metric))

            # Draw random line numbers (with replacement) from every data set.
            radDATA = []
            for name, kind, numlines in datasets:
                if kind == 'macro':
                    size = 100
                elif name in small:
                    size = 20
                else:
                    size = 50
                path = mydir + 'data/' + kind + '/' + name + '/' + name + suffix
                for line in np.random.choice(np.arange(1, numlines + 1), size, replace=True):
                    radDATA.append(linecache.getline(path, line))

            Nlist, metlist, KindList = [], [], []
            for row in radDATA:
                (name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou,
                 EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace,
                 jknife1, jknife2, margalef, menhinick, preston_a,
                 preston_S) = row.split()

                N = float(N)
                S = float(S)
                if S < 2 or N < 10:
                    continue  # Min species richness

                # Dominance metrics, in the same order as `metrics`.
                values = (float(np.log10(float(Nmax))), float(McN),
                          float(BP), float(SimpDom))

                # Kind is recorded only for rows that pass the filter so the
                # three columns below always have the same length.
                Nlist.append(float(np.log10(N)))
                metlist.append(values[index])
                KindList.append(kind)

            # Multiple regression
            d = pd.DataFrame({'N': list(Nlist)})
            d['y'] = list(metlist)
            d['Kind'] = list(KindList)
            f = smf.ols('y ~ N * Kind', d).fit()

            # Positional layout relied on by the original code: 0 intercept,
            # 1 Kind offset, 2 N slope, 3 N:Kind interaction.
            # NOTE(review): confirm against f.params.index.
            params = np.asarray(f.params)
            pvalues = np.asarray(f.pvalues)

            MacIntList.append(params[0])
            MacCoefList.append(params[2])
            # The micro terms only differ from macro when they are significant.
            if pvalues[1] < 0.05:
                MicIntList.append(params[1] + params[0])
            else:
                MicIntList.append(params[0])
            if pvalues[3] < 0.05:
                MicCoefList.append(params[3] + params[2])
            else:
                MicCoefList.append(params[2])
            R2List.append(f.rsquared)

        # Everything below uses the data and fit of the final resample.
        print('%s \n%s' % (metric, f.summary()))
        f1 = smf.ols('y ~ N', d).fit()
        print('%s \n%s' % (metric, f1.summary()))

        st, data, ss2 = summary_table(f, alpha=0.05)
        # data columns: Obs, Dep Var Population, Predicted Value, Std Error Mean Predict,
        # Mean ci 95% low, Mean ci 95% upp, Predict ci 95% low, Predict ci 95% upp,
        # Residual, Std Error Residual, Student Residual, Cook's D
        mean_ci_low = data[:, 4]
        mean_ci_upp = data[:, 5]

        xs = np.array(Nlist)
        ys = np.array(metlist)
        kinds = np.array(KindList)
        is_mac = kinds == 'macro'
        is_mic = kinds == 'micro'

        # Each group is drawn from its own points, so differing group sizes
        # neither drop points nor raise IndexError.
        plt.scatter(xs[is_mac], ys[is_mac], color='LightCoral', alpha=1, s=4,
                    linewidths=0.5, edgecolor='Crimson')
        plt.scatter(xs[is_mic], ys[is_mic], color='SkyBlue', alpha=1, s=4,
                    linewidths=0.5, edgecolor='Steelblue')

        # Confidence bands need x in ascending order.
        for mask, band_color in ((is_mac, 'r'), (is_mic, 'b')):
            order = np.argsort(xs[mask])
            plt.fill_between(xs[mask][order], mean_ci_low[mask][order],
                             mean_ci_upp[mask][order], color=band_color,
                             lw=0.0, alpha=0.3)

        MicInt = round(np.mean(MicIntList), 2)
        MicCoef = round(np.mean(MicCoefList), 2)
        MacInt = round(np.mean(MacIntList), 2)
        MacCoef = round(np.mean(MacCoefList), 2)
        r2 = round(np.mean(R2List), 2)

        ylim, r2_xy, eq_xy = panels[index]
        plt.ylim(*ylim)
        plt.xlim(1, 8)
        if eq_xy is not None:
            (mic_x, mic_y), (mac_x, mac_y) = eq_xy
            plt.text(mic_x, mic_y,
                     '$micro$ = ' + str(round(MicInt, 2)) + '*$N$$^{' + str(round(MicCoef, 2)) + '}$',
                     fontsize=fs, color='Steelblue')
            plt.text(mac_x, mac_y,
                     '$macro$ = ' + str(round(MacInt, 2)) + '*$N$$^{' + str(round(MacCoef, 2)) + '}$',
                     fontsize=fs, color='Crimson')
        plt.text(r2_xy[0], r2_xy[1], '$R^2$' + '=' + str(round(r2, 3)),
                 fontsize=fs - 1, color='k')

        plt.xlabel('Number of reads or individuals, $log$$_{10}$', fontsize=fs)
        plt.ylabel(metric, fontsize=fs)
        plt.tick_params(axis='both', which='major', labelsize=fs - 1)

    plt.subplots_adjust(wspace=0.4, hspace=0.4)

    ones_tag = '_NoMicrobe1s' if Ones == 'N' else ''
    plt.savefig(mydir + '/figs/appendix/Dominance/SupplementaryDominanceFig-'
                + ref + ones_tag + '.png', dpi=600, bbox_inches="tight")
    #plt.show()
    return
def Fig1(ref, Ones):
    """Plot two rarity metrics against log10(N) for micro- and macro-organisms.

    For every metric the data sets are resampled ``its`` times; each resample
    is fitted with ``y ~ N * Kind`` and the intercepts, slopes and R-squared
    values are averaged for the panel annotation.  The points and 95% mean
    confidence bands that are drawn come from the last resample only.  The
    figure is saved under ``mydir + '/figs/appendix/Rarity/'``.

    Parameters
    ----------
    ref : str
        ``'ClosedRef'`` or ``'OpenRef'``; selects the EMP data set and the
        output file name.
    Ones : str
        ``'Y'`` to read ``-SADMetricData.txt`` files, ``'N'`` to read the
        ``-SADMetricData_NoMicrobe1s.txt`` files.

    Raises
    ------
    ValueError
        If ``ref`` or ``Ones`` is not one of the accepted values.
    """
    if ref == 'ClosedRef':
        emp = 'EMPclosed'
    elif ref == 'OpenRef':
        emp = 'EMPopen'
    else:
        raise ValueError("ref must be 'ClosedRef' or 'OpenRef', got %r" % (ref,))

    if Ones == 'N':
        suffix = '-SADMetricData_NoMicrobe1s.txt'
    elif Ones == 'Y':
        suffix = '-SADMetricData.txt'
    else:
        raise ValueError("Ones must be 'Y' or 'N', got %r" % (Ones,))

    good_names = frozenset(['MGRAST', 'HMP', emp, 'BBS', 'CBC', 'MCDB',
                            'GENTRY', 'FIA'])

    # One entry per usable data set: [name, kind, number of lines in its file].
    datasets = []
    for kind in ('micro', 'macro'):
        for name in os.listdir(mydir + 'data/' + kind):
            if name not in good_names:
                continue
            path = mydir + 'data/' + kind + '/' + name + '/' + name + suffix
            with open(path) as handle:
                num_lines = sum(1 for _ in handle)
            datasets.append([name, kind, num_lines])
            print('%s %s' % (name, num_lines))

    metrics = ['log-modulo skewness', 'log-skew']

    # Per panel: y-limits and the y positions of the micro / macro / R^2 labels.
    panels = [
        ((0, 2.5), (2.2, 2.0, 1.7)),
        ((-1, 4.5), (4.0, 3.5, 2.9)),
    ]

    fs = 10  # font size used across figures
    its = 1000

    fig = plt.figure()
    for index, metric in enumerate(metrics):
        fig.add_subplot(2, 2, index + 1)
        MicIntList, MicCoefList, MacIntList, MacCoefList, R2List = [], [], [], [], []

        for n in range(its):
            # Draw random line numbers (with replacement) from every data set.
            radDATA = []
            for name, kind, numlines in datasets:
                if name == 'EMPclosed' or name == 'EMPopen' or kind == 'micro':
                    size = 100
                else:
                    size = 60
                path = mydir + 'data/' + kind + '/' + name + '/' + name + suffix
                for line in np.random.choice(np.arange(1, numlines + 1), size, replace=True):
                    radDATA.append(linecache.getline(path, line))

            Nlist, metlist, KindList = [], [], []
            for row in radDATA:
                (name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou,
                 EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace,
                 jknife1, jknife2, margalef, menhinick, preston_a,
                 preston_S) = row.split()

                N = float(N)
                S = float(S)
                if S < 10 or N < 10:
                    continue  # Min species richness

                # Rarity: log-modulo transformation of skewness.  The field is
                # text, so it must be converted before the sign test.
                skew = float(skew)
                lms = np.log10(np.abs(skew) + 1)
                if skew < 0:
                    lms = lms * -1

                # Rarity metrics, in the same order as `metrics`.
                values = (float(lms), float(logskew))

                # Kind is recorded only for rows that pass the filter so the
                # three columns below always have the same length.
                Nlist.append(float(np.log10(N)))
                metlist.append(values[index])
                KindList.append(kind)

            # Multiple regression
            d = pd.DataFrame({'N': list(Nlist)})
            d['y'] = list(metlist)
            d['Kind'] = list(KindList)
            f = smf.ols('y ~ N * Kind', d).fit()

            # Positional layout relied on by the original code: 0 intercept,
            # 1 Kind offset, 2 N slope, 3 N:Kind interaction.
            # NOTE(review): confirm against f.params.index.
            params = np.asarray(f.params)
            pvalues = np.asarray(f.pvalues)

            MacIntList.append(params[0])
            MacCoefList.append(params[2])
            # The micro terms only differ from macro when they are significant.
            if pvalues[1] < 0.05:
                MicIntList.append(params[1] + params[0])
            else:
                MicIntList.append(params[0])
            if pvalues[3] < 0.05:
                MicCoefList.append(params[3] + params[2])
            else:
                MicCoefList.append(params[2])
            R2List.append(f.rsquared)

        # Everything below uses the data and fit of the final resample.
        st, data, ss2 = summary_table(f, alpha=0.05)
        mean_ci_low = data[:, 4]
        mean_ci_upp = data[:, 5]

        xs = np.array(Nlist)
        ys = np.array(metlist)
        kinds = np.array(KindList)
        is_mac = kinds == 'macro'
        is_mic = kinds == 'micro'

        # Each group is drawn from its own points, so differing group sizes
        # neither drop points nor raise IndexError.
        plt.scatter(xs[is_mac], ys[is_mac], color='LightCoral', alpha=1, s=4,
                    linewidths=0.5, edgecolor='Crimson')
        plt.scatter(xs[is_mic], ys[is_mic], color='SkyBlue', alpha=1, s=4,
                    linewidths=0.5, edgecolor='Steelblue')

        # Confidence bands need x in ascending order.
        for mask, band_color in ((is_mac, 'r'), (is_mic, 'b')):
            order = np.argsort(xs[mask])
            plt.fill_between(xs[mask][order], mean_ci_low[mask][order],
                             mean_ci_upp[mask][order], color=band_color,
                             lw=0.0, alpha=0.3)

        MicInt = round(np.mean(MicIntList), 2)
        MicCoef = round(np.mean(MicCoefList), 2)
        MacInt = round(np.mean(MacIntList), 2)
        MacCoef = round(np.mean(MacCoefList), 2)
        r2 = round(np.mean(R2List), 2)

        ylim, (mic_y, mac_y, r2_y) = panels[index]
        plt.ylim(*ylim)
        plt.xlim(0, 7)
        plt.text(0.3, mic_y,
                 '$micro$ = ' + str(round(MicInt, 2)) + '*$N$$^{' + str(round(MicCoef, 2)) + '}$',
                 fontsize=fs - 1, color='Steelblue')
        plt.text(0.3, mac_y,
                 '$macro$ = ' + str(round(MacInt, 2)) + '*$N$$^{' + str(round(MacCoef, 2)) + '}$',
                 fontsize=fs - 1, color='Crimson')
        plt.text(0.3, r2_y, '$R^2$' + '=' + str(round(r2, 3)),
                 fontsize=fs - 1, color='k')

        plt.xlabel('Number of reads or individuals, $log$$_{10}$', fontsize=fs)
        plt.ylabel(metric, fontsize=fs)
        plt.tick_params(axis='both', which='major', labelsize=fs - 1)

    plt.subplots_adjust(wspace=0.4, hspace=0.4)

    ones_tag = '_NoMicrobe1s' if Ones == 'N' else ''
    plt.savefig(mydir + '/figs/appendix/Rarity/SupplementaryRarityFig-'
                + ref + ones_tag + '.png', dpi=600, bbox_inches="tight")
    #plt.show()
    return
def plotErrDistBug(in_filename, gold_in_filename, out_filename, title):
    """Plot a run's errors against the bug-free ("gold") log2-log2 error model.

    Each pickle file is read as three consecutive objects (``errs``,
    ``abs_errs``, ``mses``); only ``abs_errs`` and ``mses`` are used.  An
    ordinary-least-squares line ``log2(mse) ~ const + log2(abs_err)`` is
    fitted to the gold data and drawn together with the prediction bounds
    from ``wls_prediction_std``, the gold points, and (optionally) the points
    of a mutated run.  Points flagged by ``returnOutliers`` at ``alpha=0.05``
    are marked with stars.  The figure is shown and optionally saved.

    Parameters
    ----------
    in_filename : str or None
        Pickle file of the mutated run, or ``None`` to plot the gold run only.
        The first run of digits in the name is used as the mutation number.
    gold_in_filename : str
        Pickle file of the bug-free reference run; its base name with
        ``'Gold'`` removed is used in the figure title.
    out_filename : str or None
        Where to save the figure; nothing is saved when ``None``.
    title : str
        Text appended to the module name in the figure title.

    Raises
    ------
    ValueError
        If ``in_filename`` is given but contains no digits.
    """
    # NOTE: pickle can execute arbitrary code while loading; only feed this
    # function files produced by a trusted tool chain.
    abs_errs = None
    mses = None
    if in_filename is not None:
        with open(in_filename, 'rb') as in_file:
            cPickle.load(in_file)  # errs: unused, read only to advance the stream
            abs_errs = cPickle.load(in_file)
            mses = cPickle.load(in_file)

    with open(gold_in_filename, 'rb') as g_in_file:
        cPickle.load(g_in_file)  # g_errs: unused, read only to advance the stream
        g_abs_errs = np.array(cPickle.load(g_in_file))
        g_mses = np.array(cPickle.load(g_in_file))

    # Decide once, before mses becomes an ndarray: comparing an array with
    # None is elementwise and has no single truth value.
    have_mutation = mses is not None

    # ensure that we don't actually take the log of 0
    g_keep = (g_mses > 0.0) & (g_abs_errs > 1.0E-10)
    g_mses = g_mses[g_keep]
    g_abs_errs = g_abs_errs[g_keep]

    if have_mutation:
        mses = np.array(mses)
        abs_errs = np.array(abs_errs)
        keep = (mses > 0.0) & (abs_errs > 1.0E-10)
        mses = mses[keep]
        abs_errs = abs_errs[keep]

    print('mses: ' + repr(mses))

    # Ordinary least squares on the gold data, in log2-log2 space.
    g_log_x = np.log2(g_abs_errs)
    g_log_y = np.log2(g_mses)
    results = sm.OLS(g_log_y, sm.add_constant(g_log_x)).fit()
    _, iv_l, iv_u = wls_prediction_std(results)

    if in_filename is not None:
        match = re.search(r'\D*(\d+)\D*', in_filename)
        if match is None:
            raise ValueError('no mutation number found in %r' % (in_filename,))
        mutation_num = match.group(1)

    module_name = re.sub(r'Gold', '', gold_in_filename)
    module_name = re.sub(r'(.*/)*', '', module_name)

    plt.figure(figsize=(10.0, 7.0))

    # Labels are attached to the artists themselves so that the unlabeled fit
    # line cannot shift the legend entries of the lines drawn after it.
    if have_mutation:
        log_x = np.log2(abs_errs)
        log_y = np.log2(mses)
        plt.plot(log_x, log_y, linestyle='', marker='+', color='b',
                 label='Mutation ' + repr(mutation_num))
        o_x, o_y = returnOutliers(results, sm.add_constant(log_x), log_y, alpha=0.05)
        plt.plot(o_x, o_y, linestyle='', marker='*', color='b',
                 label='Mutation ' + repr(mutation_num) + ' -- outliers')
        print('Mutation outliers: ' + repr(len(o_x)))

    plt.plot(g_log_x, g_log_y, linestyle='', marker='+', color='r',
             label='Bug-free')
    g_o_x, g_o_y = returnOutliers(results, sm.add_constant(g_log_x), g_log_y, alpha=0.05)
    plt.plot(g_o_x, g_o_y, linestyle='', marker='*', color='r',
             label='Bug-free -- outliers')
    print('Gold outliers: ' + repr(len(g_o_x)))

    plt.plot(g_log_x, iv_l, linestyle='-', color='c', label='95% CI -')
    plt.plot(g_log_x, results.fittedvalues, linestyle='-', color='k')
    plt.plot(g_log_x, iv_u, linestyle='-', color='g', label='95% CI +')

    plt.legend(loc='lower right', ncol=1)

    # fix up plotting to look nice
    plt.suptitle(module_name + ' ' + title, fontsize=35)
    plt.xlabel('Log2(Execution Error)', fontsize=23)
    plt.ylabel('Log2(Output Error)', fontsize=23)

    if out_filename is not None:
        plt.gcf().savefig(out_filename)

    plt.show()
# Pool the six per-series arrays into one flat sample each.
y = np.concatenate(tuple(y[k] for k in range(6)))
x = np.concatenate(tuple(x[k] for k in range(6)))

# Order both samples by ascending x so the fitted curves plot left to right.
xsort = np.argsort(x)
x, y = x[xsort], y[xsort]

# Ordinary least squares of y on x with an intercept.
xb = sm.add_constant(x)
model = sm.OLS(y, xb)
results = model.fit()

# Prediction on an evenly spaced grid spanning the observed x range.
x2 = np.linspace(x.min(), x.max(), x.size)
y2 = results.predict(sm.add_constant(x2))

# Per-observation table; alpha=0.001 is the level passed to summary_table.
st, data, ss2 = summary_table(results, alpha=0.001)
fittedvalues = data[:, 2]
predict_mean_se = data[:, 3]
predict_mean_ci_low = data[:, 4]
predict_mean_ci_upp = data[:, 5]
predict_ci_low = data[:, 6]
predict_ci_upp = data[:, 7]

plt.scatter(x, y)
#plt.plot(x2,y2,'r')
plt.plot(x, fittedvalues, 'k', lw=2)
#plt.plot(x, predict_ci_low, 'r--', lw=2)
#plt.plot(x, predict_ci_upp, 'r--', lw=2)
# Dashed red lines: confidence band of the mean prediction.
plt.plot(x, predict_mean_ci_low, 'r--', lw=2)
plt.plot(x, predict_mean_ci_upp, 'r--', lw=2)

plt.xlabel('Max stream function at 26N (Sv)')
plt.ylabel('AMO-box SST anomaly')
plt.savefig('/home/ph290/Documents/figures/amoc_v_sst.png')
def modelcomparison():
    """Compare power-law, semilog, exponential and linear fits of four metrics.

    For each of Rarity, Dominance, Evenness and Richness the data sets are
    resampled ``its`` times.  Every resample is fitted with ``y ~ N * Kind``
    in four variants (log or linear x, log or linear y) and the mean
    R-squared, AIC and BIC of each variant are printed to stdout and written
    to ``mydir + 'output/model_comparison.txt'``.
    """
    # all microbe data is MGRAST
    good_names = frozenset(['empclosed', 'HMP', 'BIGN', 'TARA', 'BOVINE',
                            'HUMAN', 'LAUB', 'SED', 'CHU', 'CHINA', 'CATLIN',
                            'FUNGI', 'HYDRO', 'BBS', 'CBC', 'MCDB', 'GENTRY',
                            'FIA'])
    metrics = ['Rarity', 'Dominance', 'Evenness', 'Richness']
    its = 10

    # (stdout label, file label, model key), in reporting order.
    report_rows = (
        ('power-law: ', 'averages from power-law', 'loglog'),
        ('semilog: ', 'averages from semilog', 'linlog'),
        ('exponential: ', 'averages from exponential', 'loglin'),
        ('linear: ', 'averages from linear', 'linear'),
    )

    with open(mydir + 'output/model_comparison.txt', 'w+') as OUT:
        # One entry per usable data set: [name, kind, number of lines].
        datasets = []
        for kind in ('micro', 'macro'):
            for name in os.listdir(mydir + 'data/' + kind):
                if name not in good_names:
                    continue
                path = mydir + 'data/' + kind + '/' + name + '/' + name + '-SADMetricData.txt'
                with open(path) as handle:
                    num_lines = sum(1 for _ in handle)
                datasets.append([name, kind, num_lines])
                OUT.write('%s %s\n' % (name, num_lines))

        for index, metric in enumerate(metrics):
            header = '%s : R-squared : AIC : BIC' % (metric,)
            print(header)
            OUT.write(header + '\n')

            scores = dict((key, {'R2': [], 'AIC': [], 'BIC': []})
                          for key in ('loglog', 'loglin', 'linlog', 'linear'))

            for n in range(its):
                # Draw random line numbers (with replacement) from every data set.
                radDATA = []
                for name, kind, numlines in datasets:
                    if name == 'EMPclosed' or name == 'EMPopen' or kind == 'micro':
                        size = 100
                    else:
                        size = 60
                    path = mydir + 'data/' + kind + '/' + name + '/' + name + '-SADMetricData.txt'
                    for line in np.random.choice(np.arange(1, numlines + 1), size, replace=True):
                        radDATA.append(linecache.getline(path, line))

                Nlist, metlist, KindList = [], [], []
                for row in radDATA:
                    (name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou,
                     EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace,
                     jknife1, jknife2, margalef, menhinick, preston_a,
                     preston_S) = row.split()

                    N = float(N)
                    S = float(S)
                    if S < 2 or N < 10:
                        continue

                    # log-modulo transformation of skewness.  The field is
                    # text, so it must be converted before the sign test.
                    skew = float(skew)
                    lms = np.log10(np.abs(skew) + 1)
                    if skew < 0:
                        lms = lms * -1

                    # Metric values, in the same order as `metrics`.
                    values = (float(lms),
                              float(np.log10(float(Nmax))),
                              float(np.log10(float(ESimp))),
                              float(np.log10(S)))

                    Nlist.append(float(np.log10(N)))
                    metlist.append(values[index])
                    KindList.append(kind)

                # Multiple regression in each combination of log / linear axes.
                log_x = np.array(Nlist)
                log_y = np.array(metlist)
                variants = (
                    ('loglog', log_x, log_y),
                    ('loglin', 10 ** log_x, log_y),
                    ('linlog', log_x, 10 ** log_y),
                    ('linear', 10 ** log_x, 10 ** log_y),
                )
                for key, xs, ys in variants:
                    d = pd.DataFrame({'N': list(xs)})
                    d['y'] = list(ys)
                    d['Kind'] = list(KindList)
                    fit = smf.ols('y ~ N * Kind', d).fit()
                    scores[key]['R2'].append(fit.rsquared)
                    scores[key]['AIC'].append(fit.aic)
                    scores[key]['BIC'].append(fit.bic)

            for stdout_label, file_label, key in report_rows:
                avg_r2 = round(np.mean(scores[key]['R2']), 3)
                avg_aic = round(np.mean(scores[key]['AIC']), 3)
                avg_bic = round(np.mean(scores[key]['BIC']), 3)
                # The last row of each metric ends with a blank separator line.
                tail = ' \n' if key == 'linear' else ''
                body = '%s   %s   %s%s' % (avg_r2, avg_aic, avg_bic, tail)
                print('%s %s' % (stdout_label, body))
                OUT.write('%s %s\n' % (file_label, body))

    return
def Fig1(condition, ones, sampling):
    """Plot rarity, dominance, evenness and richness against abundance.

    Draws a 2x2 figure, one panel per log10-scaled metric, comparing the
    'micro' and 'macro' datasets found under ``mydir + 'data/'``.  For each
    panel the metric files are resampled ``its`` (10) times, ``y ~ N * Kind``
    is fitted by OLS on every resample, and the panel is annotated with the
    mean fitted coefficients and mean R^2.  The points, fitted lines and mean
    confidence bands drawn come from the last resample.  The figure is saved
    as a PDF under ``mydir + '/figs/Fig1/'`` and then closed.

    Parameters
    ----------
    condition : str
        ``'open'`` or ``'closed'``; selects the ``EMPopen`` or ``EMPclosed``
        dataset and is embedded in the output file name.
    ones : bool
        ``True`` reads the microbial ``-SADMetricData.txt`` files, ``False``
        the ``-SADMetricData_NoMicrobe1s.txt`` files.  Also selects the
        output file name.  Macro datasets always use ``-SADMetricData.txt``.
    sampling : object
        Only used, through ``str(sampling)``, in the output file name.

    Returns
    -------
    None
    """
    tail = str()
    if ones is False:
        tail = '-SADMetricData_NoMicrobe1s.txt'
    elif ones is True:
        tail = '-SADMetricData.txt'

    emp = str()
    if condition == 'open':
        emp = 'EMPopen'
    elif condition == 'closed':
        emp = 'EMPclosed'

    GoodNames = [emp, 'HMP', 'BIGN', 'TARA', 'BOVINE', 'HUMAN', 'LAUB', 'SED', 'CHU', 'CHINA', 'CATLIN', 'FUNGI', 'HYDRO', 'BBS', 'CBC', 'MCDB', 'GENTRY', 'FIA'] # all microbe data is MGRAST

    # Each entry: [name, kind, number of lines in the metric file, file path].
    datasets = []
    for kind, suffix in (('micro', tail), ('macro', '-SADMetricData.txt')):
        for name in os.listdir(mydir + 'data/' + kind):
            if name not in GoodNames:
                continue
            path = mydir + 'data/' + kind + '/' + name + '/' + name + suffix
            # Count lines with the handle closed afterwards (was leaked).
            with open(path) as handle:
                num_lines = sum(1 for _ in handle)
            datasets.append([name, kind, num_lines, path])
            print('%s %s' % (name, num_lines))

    # Datasets sampled with 20 lines instead of 50 per resample.
    small = ['BIGN', 'BOVINE', 'CHU', 'LAUB', 'SED']

    metrics = ['Rarity, '+r'$log_{10}$', 'Dominance, '+r'$log_{10}$', 'Evenness, ' +r'$log_{10}$', 'Richness, ' +r'$log_{10}$',]

    # Per panel: y-limits, y position of the micro / macro / R^2 text, and
    # the base used to back-transform the intercept in the label.
    # NOTE(review): the richness panel back-transforms with base 2 although
    # every metric is log10-scaled and the printed estimate below uses 10**;
    # kept as in the original -- confirm whether 10 was intended.
    panels = {
        0: ((-0.1, 2.5), 2.1, 1.8, 1.4, 10),
        1: ((0, 8), 6.7, 5.7, 4.7, 10),
        2: ((-3.5, 0.0), -2.9, -3.3, -2.5, 10),
        3: ((0.9, 5.0), 4.5, 4.0, 3.5, 2),
    }

    fs = 12 # font size used across figures
    its = 10

    def power_law(label, intercept, slope, base):
        # Text of the form "<label> = <base**intercept>*N^{slope}".
        return (label + ' = ' + str(round(base**intercept, 2)) + '*' + r'$N$'
                + '$^{' + str(round(slope, 2)) + '}$')

    fig = plt.figure()
    for index, metric in enumerate(metrics):
        fig.add_subplot(2, 2, index+1)

        MicIntList, MicCoefList, MacIntList, MacCoefList, R2List = [], [], [], [], []

        for n in range(its):
            Nlist, Slist, ESimplist, NmaxList, rareSkews, KindList = [], [], [], [], [], []

            # Draw random line numbers (with replacement) from every dataset.
            radDATA = []
            for name, kind, numlines, path in datasets:
                if kind == 'macro':
                    size = 100
                elif name in small:
                    size = 20
                else:
                    size = 50
                lines = np.random.choice(range(1, numlines+1), size, replace=True)
                for line in lines:
                    radDATA.append(linecache.getline(path, line))

            for record in radDATA:
                fields = record.split()
                if not fields:
                    # Blank line in the metric file: nothing to parse.
                    continue

                (name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou,
                 EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace,
                 jknife1, jknife2, margalef, menhinick, preston_a,
                 preston_S) = fields

                Nlist.append(float(np.log10(float(N))))
                Slist.append(float(np.log10(float(S))))
                ESimplist.append(float(np.log10(float(ESimp))))
                KindList.append(kind)
                NmaxList.append(float(np.log10(float(Nmax))))

                # log-modulo transformation of skewness.  The sign test must
                # use the numeric value: comparing the raw string with 0 is
                # always False on Python 2, so negative skews lost their sign.
                skew = float(skew)
                lms = np.log10(np.abs(skew) + 1)
                if skew < 0:
                    lms = lms * -1
                rareSkews.append(float(lms))

            metlist = list([rareSkews, NmaxList, ESimplist, Slist][index])

            # Multiple regression
            d = pd.DataFrame({'N': list(Nlist)})
            d['y'] = list(metlist)
            d['Kind'] = list(KindList)
            f = smf.ols('y ~ N * Kind', d).fit()

            # Positional access to the four fitted terms; which term sits at
            # which position is taken from the original code, not verified here.
            coef = np.asarray(f.params)
            pvals = np.asarray(f.pvalues)

            MacIntList.append(coef[0])
            MacCoefList.append(coef[2])

            # The micro values add the terms at positions 1 and 3 only when
            # those terms are significant at 0.05.
            if pvals[1] < 0.05:
                MicIntList.append(coef[1] + coef[0])
            else:
                MicIntList.append(coef[0])

            if pvals[3] < 0.05:
                MicCoefList.append(coef[3] + coef[2])
            else:
                MicCoefList.append(coef[2])

            R2List.append(f.rsquared)

        print(metric)

        # The plotted fit is the model of the last resample; the original
        # refitted the identical model on the identical frame here.
        st, data, ss2 = summary_table(f, alpha=0.05)
        # ss2: Obs, Dep Var Population, Predicted Value, Std Error Mean Predict,
        # Mean ci 95% low, Mean ci 95% upp, Predict ci 95% low, Predict ci 95% upp,
        # Residual, Std Error Residual, Student Residual, Cook's D
        predict_mean_ci_low, predict_mean_ci_upp = data[:,4:6].T
        fitted = np.asarray(f.fittedvalues)

        MicListX = [Nlist[j] for j, k in enumerate(KindList) if k == 'micro']
        MicListY = [metlist[j] for j, k in enumerate(KindList) if k == 'micro']
        MacListX = [Nlist[j] for j, k in enumerate(KindList) if k == 'macro']
        MacListY = [metlist[j] for j, k in enumerate(KindList) if k == 'macro']

        # (N, fitted, ci upper, ci lower) per kind, sorted by N for plotting.
        MicPIx, MicFitted, micCiH, micCiL = zip(*sorted(
            (Nlist[j], fitted[j], predict_mean_ci_upp[j], predict_mean_ci_low[j])
            for j, k in enumerate(KindList) if k == 'micro'))
        MacPIx, MacFitted, macCiH, macCiL = zip(*sorted(
            (Nlist[j], fitted[j], predict_mean_ci_upp[j], predict_mean_ci_low[j])
            for j, k in enumerate(KindList) if k == 'macro'))

        # Plot equally many micro and macro points, interleaved one by one so
        # neither group is drawn entirely on top of the other.
        num = min(len(MacListX), len(MicListX))
        micnums = np.random.choice(range(0, len(MicListX)), num, replace=False)
        macnums = np.random.choice(range(0, len(MacListX)), num, replace=False)

        for mic_i, mac_i in zip(micnums, macnums):
            plt.scatter(MacListX[mac_i], MacListY[mac_i], color = 'LightCoral', alpha= 1 , s = 8, linewidths=0.5, edgecolor='Crimson')
            plt.scatter(MicListX[mic_i], MicListY[mic_i], color = 'SkyBlue', alpha= 1 , s = 8, linewidths=0.5, edgecolor='Steelblue')

        plt.fill_between(MacPIx, macCiL, macCiH, color='LightCoral', lw=0.0, alpha=0.9)
        plt.plot(MacPIx, MacFitted, color='r', ls='--', lw=0.5, alpha=0.9)

        plt.fill_between(MicPIx, micCiL, micCiH, color='b', lw=0.0, alpha=0.3)
        plt.plot(MicPIx, MicFitted, color='b', ls='--', lw=0.5, alpha=0.9)

        MicInt = round(np.mean(MicIntList), 2)
        MicCoef = round(np.mean(MicCoefList), 2)
        MacInt = round(np.mean(MacIntList), 2)
        MacCoef = round(np.mean(MacCoefList), 2)
        R2 = round(np.mean(R2List), 2)

        ylim, y_mic, y_mac, y_r2, base = panels[index]

        if index == 1:
            # 1:1 reference line for the dominance panel.
            plt.plot([0,8.2],[0,8.2], ls = '--', lw=1, c='0.7')

        plt.ylim(*ylim)
        plt.xlim(0, 8.2)
        plt.text(0.35, y_mic, power_law(r'$micro$', MicInt, MicCoef, base), fontsize=fs, color='Steelblue')
        plt.text(0.35, y_mac, power_law(r'$macro$', MacInt, MacCoef, base), fontsize=fs, color='Crimson')
        plt.text(0.35, y_r2, r'$R^2$' + '=' +str(R2), fontsize=fs-1, color='k')

        if index == 0:
            # Off-canvas points whose only purpose is to feed the legend.
            plt.scatter([0],[-1], color = 'SkyBlue', alpha = 1, s=15, linewidths=0.9, edgecolor='Steelblue', label= 'microbes (n='+str(len(MicListY))+')')
            plt.scatter([0],[-1], color = 'LightCoral',alpha= 1, s=15, linewidths=0.9, edgecolor='Crimson', label= 'macrobes (n='+str(len(MacListY))+')')
            plt.legend(bbox_to_anchor=(-0.04, 1.05, 2.48, .2), loc=10, ncol=2, mode="expand",prop={'size':fs})
        elif index == 3:
            print('%s %s : S = %s' % (condition, ones, '%.3e' % (10**(MicInt + MicCoef*(30.0)))))

        plt.xlabel('$log$'+r'$_{10}$'+'($N$)', fontsize=fs)
        plt.ylabel(metric, fontsize=fs)
        plt.tick_params(axis='both', which='major', labelsize=fs-3)

    plt.subplots_adjust(wspace=0.4, hspace=0.4)

    # `==` (not `is`) is kept on purpose so 0 / 1 behave as in the original.
    if ones == False:
        plt.savefig(mydir+'/figs/Fig1/Locey_Lennon_2015_Fig1-'+condition+'_NoSingletons_'+str(sampling)+'.pdf', dpi=300, bbox_inches = "tight")
    if ones == True:
        plt.savefig(mydir+'/figs/Fig1/Locey_Lennon_2015_Fig1-'+condition+'_'+str(sampling)+'.pdf', dpi=300, bbox_inches = "tight")

    plt.close()
    return
if index == 0: metlist = list(Rs) elif index == 1: metlist = list(Nmaxs) elif index == 2: metlist = list(Evs) elif index == 3: metlist = list(Ss) print len(Ns), len(metlist) d = pd.DataFrame({'N': list(Ns)}) d['y'] = list(metlist) f = smf.ols('y ~ N', d).fit() r2 = round(f.rsquared, 2) Int = f.params[0] Coef = f.params[1] st, data, ss2 = summary_table(f, alpha=0.05) # ss2: Obs, Dep Var Population, Predicted Value, Std Error Mean Predict, # Mean ci 95% low, Mean ci 95% upp, Predict ci 95% low, Predict ci 95% upp, # Residual, Std Error Residual, Student Residual, Cook's D fitted = data[:, 2] #predict_mean_se = data[:,3] mean_ci_low, mean_ci_upp = data[:, 4:6].T ci_low, ci_upp = data[:, 6:8].T ci_Ns = data[:, 0] Ns, metlist, fitted, ci_low, ci_upp = zip( *sorted(zip(Ns, metlist, fitted, ci_low, ci_upp))) plt.scatter(Ns, metlist,
#### plot figure ###############################################################
fs = 8  # fontsize
xlab = r"$log_{10}$($\tau$)"
fig = plt.figure()

#### N vs. Tau #################################################################
fig.add_subplot(2, 2, 1)

# Quadratic OLS fit of N on tau; the summary is printed for inspection.
f2 = smf.ols("N ~ tau + I(tau ** 2.0)", d).fit()
print(f2.summary())

a, b, c = f2.params
p1, p2, p3 = f2.pvalues
r2 = round(f2.rsquared, 2)

# Columns 2-7 of the summary table are read as: fitted value, standard error
# of the mean prediction, mean CI low/upp, prediction CI low/upp.
st, data, ss2 = summary_table(f2, alpha=0.05)
fitted, pred_mean_se = data[:, 2], data[:, 3]
pred_mean_ci_low, pred_mean_ci_upp = data[:, 4:6].T
pred_ci_low, pred_ci_upp = data[:, 6:8].T

# Re-order every per-observation series by tau so the fitted line is drawn
# left to right.
(tau2, fitted, pred_ci_low, pred_ci_upp,
 pred_mean_ci_low, pred_mean_ci_upp) = zip(*sorted(zip(
    tau, fitted, pred_ci_low, pred_ci_upp,
    pred_mean_ci_low, pred_mean_ci_upp)))

plt.scatter(tau, N, color=colors, s=10, linewidths=0.1, edgecolor="k")
# Interval bands are disabled; uncomment to draw them.
# plt.fill_between(tau2, pred_ci_low, pred_ci_upp, color='r', lw=0.0, alpha=0.1)
# plt.fill_between(tau2, pred_mean_ci_low, pred_mean_ci_upp, color='r', lw=0.0, alpha=0.3)
plt.plot(tau2, fitted, color="r", ls="--", lw=1.5, alpha=0.9)
plt.ylabel(r"$log_{10}$($N$)", fontsize=fs + 6)
def figplot(x, y, xlab, ylab, fig, n):
    """Draw panel *n* of a 2x2 figure and return the figure.

    Adds subplot ``n`` to ``fig``, fits ``y ~ x`` by OLS, and draws a scatter
    of the points sorted by ``x`` together with the fitted line.  For ``n`` in
    1-4 the panel also receives a three-line text label (the fitted model
    power law plus two hard-coded microbe/macrobe power laws) and fixed
    y-limits; any other ``n`` gets neither.
    """
    # n -> (symbol, microbe coefficient, microbe exponent, macrobe
    #       coefficient, macrobe exponent, y position of label, y-limits)
    panels = {
        1: ('R', '2.34', '0.14', '1.7', '0.11', 0.8, (0.0, 1.1)),
        2: ('D', '0.44', '0.92', '0.23', '0.99', 3.0, (0.0, 4.2)),
        3: ('E', '0.58', '-0.23', '1.15', '-0.21', -1.7, (-1.8, 0.05)),
        4: ('S', '1.77', '0.38', '1.77', '0.24', 1.9, (0.4, 2.5)),
    }

    fig.add_subplot(2, 2, n)

    y2 = list(y)
    x2 = list(x)

    frame = pd.DataFrame({'x': list(x2)})
    frame['y'] = list(y2)
    model = smf.ols('y ~ x', frame).fit()

    # Slope and intercept for the label come from scipy's linregress.
    m, b, r, p, std_err = stats.linregress(x2, y2)

    st, data, ss2 = summary_table(model, alpha=0.05)
    fitted = data[:, 2]
    mean_ci_low, mean_ci_upp = data[:, 4:6].T
    ci_low, ci_upp = data[:, 6:8].T

    # Sort all series together by x so the fitted line is drawn in order.
    ordered = sorted(zip(x2, y2, fitted, ci_low, ci_upp))
    x2, y2, fitted, ci_low, ci_upp = zip(*ordered)

    panel = panels.get(n)
    if panel is not None:
        symbol, mic_coef, mic_exp, mac_coef, mac_exp, text_y, ylim = panel
        lab = '\n'.join([
            '$%s_{models}$ = %s*$N$$^{%s}$' % (
                symbol, str(round(10**b, 2)), str(round(m, 2))),
            '$%s_{microbes}$ = %s*$N$$^{%s}$' % (symbol, mic_coef, mic_exp),
            '$%s_{macrobes}$ = %s*$N$$^{%s}$' % (symbol, mac_coef, mac_exp),
        ])
        plt.text(0.2, text_y, lab, fontsize=7)

    plt.scatter(x2, y2, color='SkyBlue', alpha=1, s=12,
                linewidths=0.5, edgecolor='Steelblue')

    if n == 3:
        plt.legend(loc='best', fontsize=6, frameon=False)

    plt.plot(x2, fitted, color='k', ls='--', lw=1.0, alpha=0.9)
    plt.xlabel(xlab, fontsize=8)
    plt.ylabel(ylab, fontsize=8)
    plt.tick_params(axis='both', labelsize=5)
    plt.xlim(0, 1.05 * max(x2))

    if panel is not None:
        plt.ylim(*ylim)

    return fig
def Fig1():
    """Plot the four diversity metrics with randomly assigned kinds.

    Variant of the abundance-scaling figure in which every sampled record is
    labelled 'micro' or 'macro' at random instead of by its source dataset.
    For each of the four log10-scaled metrics one panel is drawn: the
    records are resampled ``its`` (1) time, ``y ~ N * Kind`` is fitted by
    OLS, all points are plotted in grey, and only the 'macro' fitted line
    and mean confidence band are shown, annotated with the mean coefficients
    and R^2.  The figure is saved as a PNG under
    ``mydir + '/figs/appendix/Fig1/RandomAssign/'`` and is left open.

    Returns
    -------
    None
    """
    GoodNames = ['MGRAST', 'HMP', 'EMPclosed', 'BBS', 'CBC', 'MCDB', 'GENTRY', 'FIA']

    # Each entry: [name, kind, number of lines in the metric file, file path].
    # NOTE(review): the micro directory is listed under `mydir` but its files
    # are opened under `mydir2` (macro uses `mydir2` for both); kept as in the
    # original -- confirm the two roots hold the same dataset folders.
    datasets = []
    for kind, root in (('micro', mydir), ('macro', mydir2)):
        for name in os.listdir(root + 'data/' + kind):
            if name not in GoodNames:
                continue
            path = mydir2 + 'data/' + kind + '/' + name + '/' + name + '-SADMetricData.txt'
            # Count lines with the handle closed afterwards (was leaked).
            with open(path) as handle:
                num_lines = sum(1 for _ in handle)
            datasets.append([name, kind, num_lines, path])
            print('%s %s' % (name, num_lines))

    metrics = ['Rarity, '+r'$log_{10}$', 'Dominance, '+r'$log_{10}$', 'Evenness, ' +r'$log_{10}$', 'Richness, ' +r'$log_{10}$']

    # Per panel: equation label, y-limits, x-limits, x of the text,
    # y of the equation text, y of the R^2 text.
    panels = {
        0: (r'$Rarity$', (-0.1, 2.0), (1, 7), 1.35, 1.5, 1.2),
        1: (r'$Dominance$', (0, 6), (1, 7), 1.35, 4.5, 3.75),
        2: (r'$Evenness$', (-3.0, 0.0), (0, 7), 0.35, -2.5, -2.2),
        3: (r'$Richness$', (0.9, 4.5), (1, 7), 1.35, 3.5, 3.0),
    }

    fs = 10 # font size used across figures
    its = 1

    fig = plt.figure()
    for index, metric in enumerate(metrics):
        fig.add_subplot(2, 2, index+1)

        MacIntList, MacCoefList, R2List = [], [], []

        for n in range(its):
            Nlist, Slist, ESimplist, NmaxList, rareSkews, KindList = [], [], [], [], [], []

            # Draw random line numbers (with replacement) from every dataset.
            # All line draws happen before any random kind assignment below,
            # which keeps the random-number stream in its original order.
            radDATA = []
            for name, kind, numlines, path in datasets:
                if name == 'EMPclosed' or name == 'EMPopen' or kind == 'micro':
                    size = 1000
                else:
                    size = 600
                lines = np.random.choice(range(1, numlines+1), size, replace=True)
                for line in lines:
                    radDATA.append(linecache.getline(path, line))

            for record in radDATA:
                fields = record.split()
                if not fields:
                    # Blank line in the metric file: nothing to parse.
                    continue

                (name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou,
                 EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace,
                 jknife1, jknife2, margalef, menhinick, preston_a,
                 preston_S) = fields

                N = float(N)
                S = float(S)
                if S < 10 or N < 11:
                    continue

                Nlist.append(float(np.log10(N)))
                Slist.append(float(np.log10(S)))
                ESimplist.append(float(np.log10(float(ESimp))))

                # The kind read from the file is discarded on purpose and
                # replaced by a coin flip.
                kind = np.random.choice(['micro', 'macro'])
                KindList.append(kind)

                NmaxList.append(float(np.log10(float(Nmax))))

                # log-modulo transformation of skewness.  The sign test must
                # use the numeric value: comparing the raw string with 0 is
                # always False on Python 2, so negative skews lost their sign.
                skew = float(skew)
                lms = np.log10(np.abs(skew) + 1)
                if skew < 0:
                    lms = lms * -1
                rareSkews.append(float(lms))

            metlist = list([rareSkews, NmaxList, ESimplist, Slist][index])

            # Multiple regression
            d = pd.DataFrame({'N': list(Nlist)})
            d['y'] = list(metlist)
            d['Kind'] = list(KindList)
            f = smf.ols('y ~ N * Kind', d).fit()

            # Positions 0 and 2 are the values the original reported as the
            # macro intercept and slope.
            coef = np.asarray(f.params)
            MacIntList.append(coef[0])
            MacCoefList.append(coef[2])
            R2List.append(f.rsquared)

        print(metric)
        # The plotted fit is the model of the last resample; the original
        # refitted the identical model on the identical frame here.
        print(f.summary())
        print('\n\n')

        st, data, ss2 = summary_table(f, alpha=0.05)
        # ss2: Obs, Dep Var Population, Predicted Value, Std Error Mean Predict,
        # Mean ci 95% low, Mean ci 95% upp, Predict ci 95% low, Predict ci 95% upp,
        # Residual, Std Error Residual, Student Residual, Cook's D
        predict_mean_ci_low, predict_mean_ci_upp = data[:,4:6].T
        fitted = np.asarray(f.fittedvalues)

        MicListX = [Nlist[j] for j, k in enumerate(KindList) if k == 'micro']
        MicListY = [metlist[j] for j, k in enumerate(KindList) if k == 'micro']
        MacListX = [Nlist[j] for j, k in enumerate(KindList) if k == 'macro']
        MacListY = [metlist[j] for j, k in enumerate(KindList) if k == 'macro']

        # (N, fitted, ci upper, ci lower) for 'macro', sorted by N.
        MacPIx, MacFitted, macCiH, macCiL = zip(*sorted(
            (Nlist[j], fitted[j], predict_mean_ci_upp[j], predict_mean_ci_low[j])
            for j, k in enumerate(KindList) if k == 'macro'))

        # Plot equally many points of each (random) kind, interleaved.
        num = min(len(MacListX), len(MicListX))
        for i in range(num):
            plt.scatter(MacListX[i], MacListY[i], color = '0.4', alpha= 1 , s = 4, linewidths=0.5, edgecolor='0.3')
            plt.scatter(MicListX[i], MicListY[i], color = '0.4', alpha= 1 , s = 4, linewidths=0.5, edgecolor='0.3')

        plt.fill_between(MacPIx, macCiL, macCiH, color='lime', lw=0.0, alpha=0.3)
        plt.plot(MacPIx, MacFitted, color='lime', ls='--', lw=0.5, alpha=0.8)

        MacInt = round(np.mean(MacIntList), 2)
        MacCoef = round(np.mean(MacCoefList), 2)
        R2 = round(np.mean(R2List), 2)

        label, ylim, xlim, text_x, eq_y, r2_y = panels[index]

        if index == 1:
            # 1:1 reference line for the dominance panel.
            plt.plot([0,7],[0,7], ls = '--', lw=1, c='0.7')

        plt.ylim(*ylim)
        plt.xlim(*xlim)
        plt.text(text_x, eq_y, label+ ' = '+str(round(MacInt,2))+'*'+r'$N$'+'$^{'+str(round(MacCoef,2))+'}$', fontsize=fs, color='k')
        plt.text(text_x, r2_y, r'$R^2$' + '=' +str(R2), fontsize=fs-1, color='0.3')

        if index == 0:
            # Off-canvas points whose only purpose is to feed the legend.
            plt.scatter([0],[-1], color = 'SkyBlue', alpha = 1, s=10, linewidths=0.9, edgecolor='Steelblue', label= 'microbes (n='+str(len(MicListY))+')')
            plt.scatter([0],[-1], color = 'LightCoral',alpha= 1, s=10, linewidths=0.9, edgecolor='Crimson', label= 'macrobes (n='+str(len(MacListY))+')')
            plt.legend(bbox_to_anchor=(-0.04, 1.1, 2.48, .2), loc=10, ncol=2, mode="expand",prop={'size':fs+2})

        plt.xlabel('Number of reads or individuals, '+ '$log$'+r'$_{10}$', fontsize=fs)
        plt.ylabel(metric, fontsize=fs)
        plt.tick_params(axis='both', which='major', labelsize=fs-3)

    plt.subplots_adjust(wspace=0.4, hspace=0.4)
    plt.savefig(mydir+'/figs/appendix/Fig1/RandomAssign/Locey_Lennon_2015_Pooled-ClosedReference.png', dpi=600, bbox_inches = "tight")
    #plt.show()
    #plt.close()

    return
def d(re, alpha):
    """Return the three values ``summary_table(re, alpha)`` produces.

    ``re`` and ``alpha`` are forwarded positionally, and the result must
    unpack into exactly three items, which are returned as a tuple.
    """
    table, values, columns = summary_table(re, alpha)
    return table, values, columns
def Fig3(condition, ones, sampling):
    """ A figure demonstrating a strong richness relationship across 10 or 11
    orders of magnitude in total abundance. Taxonomic richness of a sample
    scales in a log-log fashion with the total abundance of the sample.

    The microbial metric files under ``mydir + 'data/micro/'`` are resampled
    100 times; on each resample log10(Nmax) ~ log10(N) and
    log10(S) ~ log10(N) are fitted by OLS and the intercepts and slopes are
    averaged.  The last resample is plotted with its fitted line and a
    prediction band, and ``getS`` predictions for the global ocean, Earth,
    the human gut and the cow rumen are overlaid.  The figure is saved to
    ``mydir + '/figs/Fig3/figure3.pdf'`` when ``ones`` equals True or False.

    Parameters
    ----------
    condition : str
        ``'open'`` or ``'closed'``; selects ``EMPopen`` or ``EMPclosed``.
    ones : bool
        ``True`` reads ``-SADMetricData.txt`` files, ``False`` reads
        ``-SADMetricData_NoMicrobe1s.txt`` files.
    sampling : object
        Accepted for interface compatibility; it no longer affects anything
        (the original set 100 iterations on both sides of a
        ``sampling <= 500`` test).

    Returns
    -------
    None
    """

    fs = 12 # font size used across figures

    metric = 'Richness, '+r'$log$'+r'$_{10}$'

    tail = str()
    if ones is False:
        tail = '-SADMetricData_NoMicrobe1s.txt'
    elif ones is True:
        tail = '-SADMetricData.txt'

    emp = str()
    if condition == 'open':
        emp = 'EMPopen'
    elif condition == 'closed':
        emp = 'EMPclosed'

    GoodNames = [emp, 'TARA', 'HMP', 'BIGN', 'BOVINE', 'CHU', 'LAUB', 'SED', 'HUMAN', 'CHINA', 'CATLIN', 'FUNGI']

    print('\n')

    # Each entry: [name, kind, number of lines in the metric file, file path].
    datasets = []
    for name in os.listdir(mydir +'data/micro'):
        if name not in GoodNames:
            continue
        path = mydir+'data/micro/'+name+'/'+name+tail
        # Count lines with the handle closed afterwards (was leaked).
        with open(path) as handle:
            numlines = sum(1 for _ in handle)
        datasets.append([name, 'micro', numlines, path])

    its = 100

    # Datasets sampled with 160 lines instead of 400 per resample.
    small_mgrast = ['BIGN', 'BOVINE', 'CHU', 'LAUB', 'SED']

    d_blist = []
    d_zlist = []
    s_blist = []
    s_zlist = []

    for i in range(its):
        Nlist, Slist, NmaxList = [], [], []

        for name, kind, numlines, path in datasets:
            lines = []
            if kind == 'micro':
                size = 160 if name in small_mgrast else 400
                lines = np.random.choice(range(1, numlines+1), size, replace=True)

            for line in lines:
                fields = linecache.getline(path, line).split()
                if not fields:
                    continue

                (name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou,
                 EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace,
                 jknife1, jknife2, margalef, menhinick, preston_a,
                 preston_S) = fields

                Nlist.append(float(np.log10(float(N))))
                Slist.append(float(np.log10(float(S))))
                NmaxList.append(float(np.log10(float(Nmax))))

        Nlist, Slist, NmaxList = zip(*sorted(zip(Nlist, Slist, NmaxList)))
        Nlist = list(Nlist)
        Slist = list(Slist)
        NmaxList = list(NmaxList)

        d = pd.DataFrame({'N': Nlist})
        d['Nmax'] = NmaxList
        d['S'] = Slist

        # Regression for Dominance (Nmax) vs. N
        coef = np.asarray(smf.ols('Nmax ~ N', d).fit().params)
        d_blist.append(coef[0])
        d_zlist.append(coef[1])

        # Regression for Richness (S) vs. N
        f = smf.ols('S ~ N', d).fit()
        R2 = f.rsquared
        coef = np.asarray(f.params)
        s_blist.append(coef[0])
        s_zlist.append(coef[1])

    sb = np.mean(s_blist)
    sz = np.mean(s_zlist)
    db = np.mean(d_blist)
    dz = np.mean(d_zlist)

    print('Nmax = %s * N^ %s' % (round(10**db, 2), round(dz, 2)))
    print('S = %s * N^ %s \n' % (round(10**sb, 2), round(sz, 2)))

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    # R2 is the S-vs-N fit of the last resample, not an average.
    plt.text(2, 11.0, r'$S$'+ ' = '+str(round(10**sb, 1))+'*'+r'$N$'+'$^{'+str(round(sz, 2))+'},$' + r' $r^2$' + '=' +str(round(R2,2)), fontsize=fs+5, color='Crimson', alpha=0.9)

    # code for prediction intervals: extend the data with points predicted
    # along the fitted line so the band reaches the extrapolated N range.
    X = np.linspace(5, 32, 100)
    Y = f.predict(exog=dict(N=X))
    Nlist2 = Nlist + X.tolist()
    Slist2 = Slist + Y.tolist()

    d = pd.DataFrame({'N': list(Nlist2)})
    d['y'] = list(Slist2)
    f = smf.ols('y ~ N', d).fit()

    st, data, ss2 = summary_table(f, alpha=0.05)
    pred_ci_low, pred_ci_upp = data[:,6:8].T

    plt.fill_between(Nlist2, pred_ci_low, pred_ci_upp, color='r', lw=0.5, alpha=0.2)
    z = np.polyfit(Nlist2, Slist2, 1)
    p = np.poly1d(z)
    xp = np.linspace(0, 32, 1000)

    label1 = 'Richness-abundance scaling relationship, $S$ = 7.6$N^{0.35}$'
    label2 = 'Predicted $S$ via lognormal, published $N$ and $N_{max}$'
    label3 = 'Predicted $S$ via lognormal, published $N$, $N_{max}$ = 0.4$N^{0.93}$'

    # The original passed both c='red' and color='Crimson'; they are aliases
    # of one property, so only the palette colour is kept.
    plt.plot(xp, p(xp), '--', lw=2, alpha=0.8, color='Crimson', label=label1)
    plt.scatter(Nlist, Slist, color = 'LightCoral', alpha= 1 , s = 10, linewidths=0.5, edgecolor='Crimson')

    # Adding in derived/inferred points
    GO = [3.6*(10**28), 10.1*(10**28)] # estimated open ocean bacteria; Whitman et al. 1998
    Pm = [2.8*(10**27), 3.0*(10**27)] # estimated Prochlorococcus; Flombaum et al. 2013
    Syn = [6.7*(10**26), 7.3*(10**26)] # estimated Synechococcus; Flombaum et al. 2013

    Earth = [9.2*(10**29), 31.7*(10**29)] # estimated bacteria on Earth; Kallmeyer et al. 2012
    SAR11 = [2.0*(10**28), 2.0*(10**28)] # estimated percent abundance of SAR11; Morris et al. (2002)

    HGx = [0.5*(10**14), 1.5*(10**14)] # estimated bacteria in Human gut; Berg (1996)
    HGy = [0.05*min(HGx), 0.15*max(HGx)] # estimated most abundant bacteria in Human gut; Turnbaugh et al. (2009), & Dethlefsen et al. (2008)

    COWx = [0.5*2.226*(10**15), 1.5*2.226*(10**15)] # estimated bacteria in Cow rumen; LOW: HIGH: Whitman et al. (1998)
    COWy = [0.09*min(COWx), 0.15*max(COWx)] # estimated dominance in Cow rumen; Stevenson and Weimer (2006)

    ## PREDICTIONS OF S BASED ON THE EMPIRICAL S VS. N SCALING LAW, AND BASED
    ## ON THE LOGNORMAL PREDICTIVE FRAMEWORK OF CURTIS AND SLOAN USING
    ## 1.) THE ESTIMATED NMAX AND 2.) THE PREDICTED NMAX

    def predict(N_range, Nmax_range, guess, place=None, lead=''):
        # Call getS with predictNmax=False and then True (in that order) and
        # return (mean S from the first call, mean S from the second call,
        # mean N from the second call).  When `place` is given, the three
        # back-transformed predictions are also printed.
        Slist_ln, Slist_SvN, Dlist, N_values = getS(N_range, sb, sz, db, dz, guess, Nmax_range, predictNmax=False)
        S_published = float(np.mean(Slist_ln))
        if place is not None:
            print('%sscaling law prediction of S for %s: %.3e' % (lead, place, 10**np.mean(Slist_SvN)))
            print('lognormal prediction of S for %s, using estimated Nmax: %.3e' % (place, 10**S_published))

        Slist_ln, Slist_SvN, Dlist, N_values = getS(N_range, sb, sz, db, dz, guess, Nmax_range, predictNmax=True)
        S_predicted = float(np.mean(Slist_ln))
        if place is not None:
            print('lognormal prediction of S for %s, using predicted Nmax: %.3e' % (place, 10**S_predicted))

        return S_published, S_predicted, float(np.mean(N_values))

    Ns = []
    Ss = []
    DomSs = []

    # Global Ocean estimates based on Whitman et al. (1998) and P. marinus (2012 paper)
    S1, S2, N = predict(GO, [min(Syn), max(Pm)], 0.1019, place='Global Ocean')
    ax.text(13.5, S1*0.93, 'Global Ocean', fontsize=fs+2, color = 'k')
    ax.axhline(S1, 0, 0.91, ls = '--', c = '0.6')
    ax.text(N-1, S2*.80, 'Global ocean', fontsize=fs+2, color = 'k', rotation = 90)
    ax.axvline(N, 0, 0.65, ls = '--', c = '0.6')
    Ns.append(N)
    Ss.append(S1)
    DomSs.append(S2)

    # Earth, i.e., Global estimates based on Kallmeyer et al. (2012) and SAR11 (2002 paper)
    S1, S2, N = predict(Earth, [min(Pm), max(SAR11)], 0.1060, place='Earth', lead='\n')
    ax.text(25, S2*1.025, 'Earth', fontsize=fs+2, color = 'k')
    ax.axhline(S2, 0, 0.95, ls = '--', c = '0.6')
    ax.text(N-1, 8, 'Earth', fontsize=fs+2, color = 'k', rotation = 90)
    ax.axvline(N, 0, 0.82, ls = '--', c = '0.6')
    Ns.append(N)
    Ss.append(S1)
    DomSs.append(S2)

    # Human Gut
    S1, S2, N = predict(HGx, HGy, 0.1509)
    ax.text(3.5, S2*.9, 'Human Gut', fontsize=fs+2, color = 'k')
    ax.axhline(S2, 0, 0.41, ls = '--', c = '0.6')
    ax.text(N-1, 3.6, 'Human Gut', fontsize=fs+2, color = 'k', rotation = 90)
    ax.axvline(N, 0, 0.33, ls = '--', c = '0.6')
    Ns.append(N)
    Ss.append(S1)
    DomSs.append(S2)

    # Cow Rumen
    S1, S2, N = predict(COWx, COWy, 0.1)
    ax.text(6, S2*1.04, 'Cow Rumen', fontsize=fs+2, color = 'k')
    ax.axhline(S2, 0, 0.46, ls = '--', c = '0.6')
    ax.text(N+0.3, 4.2, 'Cow Rumen', fontsize=fs+2, color = 'k', rotation = 90)
    ax.axvline(N, 0, 0.38, ls = '--', c = '0.6')
    Ns.append(N)
    Ss.append(S1)
    DomSs.append(S2)

    plt.scatter(Ns, Ss, color = '0.4', alpha= 1, s = 50, linewidths=2, edgecolor='k', label=label2)
    plt.scatter(Ns, DomSs, color = 'SkyBlue', alpha= 1, s = 50, linewidths=2, edgecolor='Steelblue', label=label3)

    ax.text(5, -0.8, 'Number of reads or total abundance, '+ '$log$'+r'$_{10}$', fontsize=fs+4)
    ax.text(-2.1, 10, 'OTU '+ metric, fontsize=fs+4, rotation=90)

    plt.xlim(1, 31)
    plt.ylim(0.8, 14)

    plt.legend(bbox_to_anchor=(-0.015, 1.03, 1.03, .2), loc=10, ncol=1, mode="expand",prop={'size':fs+2.2})

    # Both settings of `ones` write the same file; membership uses `==`, so
    # 0 / 1 behave as in the original.
    if ones in (False, True):
        plt.savefig(mydir+'/figs/Fig3/figure3.pdf', dpi=300, bbox_inches = "tight")

    #plt.show()

    return
def plot_time_series(fname_template=None,
                     region_number="all",
                     dpi=150,
                     ax=None,
                     melt_index_or_extent="index",
                     extent_melt_days_threshold=2,
                     include_ylabel=True,
                     gap_filled=True,
                     include_trendline=False,
                     include_trendline_only_if_significant=True,
                     include_legend_if_significant=True,
                     include_name_in_title=True,
                     print_trendline_summary=True,
                     offset_years_by_one=True,
                     add_confidence_intervals=True,
                     add_prediction_intervals=True,
                     verbose=True):
    """Create a plot of the time series of melt.

    In fname_template, if you specify a {0} tag in the name, it will be filled
    in with the region number. This is useful if you want to use
    region_number="all", as that will create 8 plots for regions 0-7.

    Use 'ax' to provide an axis upon which to draw. This is useful for putting
    together a multi-part figure. Don't use this option if using
    region_number="all", as that will draw multiple plots on the same axes.

    Parameters
    ----------
    fname_template : str or None
        Output file name (optionally containing "{0}"). When None the figure is
        shown instead of saved. Only used when no 'ax' is provided.
    region_number : int or "all"
        Region to plot, or "all" for regions 0-7.
    dpi : int
        Resolution handed to savefig.
    ax : matplotlib axes or None
        Axes to draw on. When None a new figure is created per region.
    melt_index_or_extent : str
        "index" or "extent" (case/whitespace-insensitive for the labels).
    extent_melt_days_threshold, gap_filled
        Passed straight through to get_time_series_data.
    include_ylabel : bool
        Draw the y-axis label; also selects the 10^3 vs. 10^6 scaling.
    include_trendline, include_trendline_only_if_significant : bool
        Draw the OLS trend always, or only when the slope p-value is <= 0.05.
    include_legend_if_significant : bool
        Add a legend whenever the trend line is drawn.
    include_name_in_title : bool
        Use the region name as the axes title.
    print_trendline_summary : bool
        Print the statsmodels summary of the fit.
    offset_years_by_one : bool
        Shift the x values by +1 year (see comment in the body).
    add_confidence_intervals, add_prediction_intervals : bool
        Draw the 95% mean-confidence / prediction bands with the trend line.
    verbose : bool
        Print the name of each file written.

    Returns
    -------
    The fitted statsmodels OLS results of the last region processed.

    Raises
    ------
    ValueError
        If include_ylabel is set and melt_index_or_extent is neither "index"
        nor "extent".
    """
    if region_number == "all":
        region_nums = range(8)
    else:
        region_nums = [region_number]

    ax_provided = ax

    # Normalise once so the legend label and the y-axis label always agree.
    melt_index_or_extent_lower = melt_index_or_extent.strip().lower()

    for region_n in region_nums:
        years, melt = get_time_series_data(region_number=region_n,
                                           melt_index_or_extent=melt_index_or_extent,
                                           extent_melt_days_threshold=extent_melt_days_threshold,
                                           gap_filled=gap_filled,
                                           return_in_km2=True)

        # Since the "2019" melt season (e.g.) in Antarctica actually spans
        # 2019-2020, it makes more sense to center it over the Jan 1, 2020
        # date rather than the start of 2019.
        if offset_years_by_one:
            years = years + 1

        # Scale the data so the axis shows small numbers; figure_exp is the
        # power of ten quoted in the y-axis label.
        if include_ylabel:
            if max(melt) > 1e6:
                melt = melt / 1e6
                figure_exp = 6
            else:
                melt = melt / 1e3
                figure_exp = 3
        else:
            melt = melt / 1e6
            figure_exp = 6

        # Create a new figure if no axis is provided.
        if ax_provided is None:
            fig, ax = plt.subplots(1, 1)

        ax.plot(years, melt, color="maroon",
                label="Annual melt {0}".format(
                    "index" if melt_index_or_extent_lower == "index" else "extent"))

        if include_ylabel:
            if melt_index_or_extent_lower == "index":
                # Raw string: "\c" is not a valid escape sequence.
                ax.set_ylabel(r"Melt Index (10$^{0}$ km$^2\cdot$days)".format(figure_exp))
            elif melt_index_or_extent_lower == "extent":
                ax.set_ylabel("Melt Extent (10$^{0}$ km$^2$)".format(figure_exp))
            else:
                raise ValueError("Unknown value for parameter 'melt_index_or_extent': {0}".format(melt_index_or_extent))

        ax.tick_params(direction="in", bottom=True, left=True, right=True, top=False,
                       labeltop=False, labelright=False, which="major")
        ax.tick_params(direction="in", bottom=True, which="minor")
        ax.tick_params(axis='x', length=4, which="major")
        ax.tick_params(axis='x', length=2, which="minor")

        if include_name_in_title:
            region_name = antarctic_regions_dict[region_n]
            ax.set_title(region_name)

        # Limit lower-bounds to zero
        ylim = ax.get_ylim()
        ax.set_ylim(max(ylim[0], 0), ylim[1])

        # Force the y-axis to only use integers (this tends to give us better scaling)
        if ylim[1] > 8:
            ax.yaxis.set_major_locator(matplotlib.ticker.MaxNLocator(integer=True))

        # Turn on the minor ticks for the years.
        ax.xaxis.set_minor_locator(matplotlib.ticker.MultipleLocator(base=1))

        # Run a linear-fit OLS model on the data.
        x = statsmodels.api.add_constant(years)
        model = statsmodels.api.OLS(melt, x)
        results = model.fit()

        # Only go into all this if we've indicated we might want to plot a trendline.
        if include_trendline or include_trendline_only_if_significant:
            # pvalues holds (intercept, slope); only the slope matters here.
            _, pval_slope = results.pvalues

            if print_trendline_summary:
                print("\n")
                print("============", antarctic_regions_dict[region_n] + ",", melt_index_or_extent, "==============")
                print(results.summary())

            if include_trendline or (pval_slope <= 0.05 and include_trendline_only_if_significant):
                _, data, _ = summary_table(results, alpha=0.05)

                # Columns used below: 2 = fitted value, 4:6 = confidence
                # interval of the mean, 6:8 = prediction interval.
                fittedvalues = data[:, 2]
                predict_mean_ci_low, predict_mean_ci_upp = data[:, 4:6].T
                predict_ci_low, predict_ci_upp = data[:, 6:8].T

                # Put the p-value in the legend text.
                p_value_text = ("{0:0.3f}" if (pval_slope > 0.001) else "{0:0.1e}").format(pval_slope)

                ax.plot(years, fittedvalues, color="blue",
                        label=r"Linear trend ($\it{p=" + p_value_text + "}$)")

                if add_confidence_intervals:
                    ax.plot(years, predict_mean_ci_low,
                            color='blue', linestyle='--',
                            label='95% confidence interval',
                            alpha=0.5, linewidth=0.8)
                    ax.plot(years, predict_mean_ci_upp,
                            color='blue', linestyle='--',
                            label=None,
                            alpha=0.5, linewidth=0.8)

                if add_prediction_intervals:
                    ax.plot(years, predict_ci_low,
                            color="red", linestyle='--',
                            label='95% prediction interval',
                            alpha=0.5, linewidth=0.5)
                    ax.plot(years, predict_ci_upp,
                            color="red", linestyle='--',
                            label=None,
                            alpha=0.5, linewidth=0.5)

                    # The prediction intervals are quite wide. Rescale the y-limits
                    # to be no more than 10% above/below the max/min of the data,
                    # even if it makes the prediction intervals trail off the figure
                    # a bit.
                    margin = 0.1*(max(melt) - min(melt))
                    ylim = ax.get_ylim()
                    if (ylim[0] < 0) or (ylim[0] < (min(melt) - margin)):
                        ax.set_ylim(max(0, min(melt) - margin), ylim[1])
                    ylim = ax.get_ylim()
                    if ylim[1] > (max(melt) + margin):
                        ax.set_ylim(ylim[0], (max(melt) + margin))

                if include_legend_if_significant:
                    ax.legend(fontsize="small", labelspacing=0.1, framealpha=0.95)

        # Only figures created here are laid out, shown/saved and closed;
        # a caller-supplied axis is left for the caller to finish.
        if ax_provided is None:
            fig.tight_layout()

            if fname_template is None:
                plt.show()
            else:
                fname = fname_template.format(region_n)
                fig.savefig(fname, dpi=dpi)
                if verbose:
                    print(fname, "written.")

            plt.close(fig)

    return results
def Fig1(): OUT = open(mydir + 'output/PerDataset.txt','w+') """ This code generates a 4 plot figure of diversity properties (rarity, dominance, evenness, richness) versus total abundance, for each dataset. This code also generates a .txt file of results for the regression analyses. """ datasets = [] #GoodNames = ['TARA', 'HUMAN', 'BOVINE', 'LAUB', 'SED', 'CHU', 'CHINA', 'CATLIN', 'FUNGI', 'HYDRO', 'HMP', 'BBS', 'CBC', 'MCDB', 'GENTRY', 'FIA', 'EMPclosed', 'EMPopen'] GoodNames = ['HMP'] for name in os.listdir(mydir +'data/micro'): if name in GoodNames: pass else: continue #path = mydir+'data/micro/'+name+'/'+name+'-SADMetricData_NoMicrobe1s.txt' path = mydir+'data/micro/'+name+'/'+name+'-SADMetricData.txt' num_lines = sum(1 for line in open(path)) datasets.append([name, 'micro', num_lines]) print name, num_lines for name in os.listdir(mydir +'data/macro'): if name in GoodNames: pass else: continue #path = mydir+'data/macro/'+name+'/'+name+'-SADMetricData_NoMicrobe1s.txt' path = mydir+'data/macro/'+name+'/'+name+'-SADMetricData.txt' num_lines = sum(1 for line in open(path)) datasets.append([name, 'macro', num_lines]) print name, num_lines metrics = ['Rarity, '+r'$log_{10}$', 'Dominance, '+r'$log_{10}$', 'Evenness, ' +r'$log_{10}$', 'Richness, ' +r'$log_{10}$'] #OUT = open(mydir + 'output/SummaryPerDataset_NoMicrobe1s.txt','w+') OUT = open(mydir + 'output/SummaryPerDataset.txt','w+') for dataset in datasets: fig = plt.figure() for index, i in enumerate(metrics): metric = i fig.add_subplot(2, 2, index+1) fs = 10 # font size used across figures IntList, CoefList, R2List, metlist = [[], [], [], []] Nlist, Slist, Evarlist, ESimplist, klist, radDATA, BPlist, NmaxList, rareSkews, KindList, StdList = [[], [], [], [], [], [], [], [], [], [], []] #name, kind, N, S, Evar, ESimp, EQ, O, ENee, EPielou, EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace, jknife1, jknife2, margalef, menhinick, preston_a, preston_S = 
[[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]] its = 1 f = list() for n in range(its): #name, kind, N, S, Evar, ESimp, EQ, O, ENee, EPielou, EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace, jknife1, jknife2, margalef, menhinick, preston_a, preston_S = [[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]] Nlist, Slist, Evarlist, ESimplist, klist, radDATA, BPlist, NmaxList, rareSkews, KindList, StdList = [[], [], [], [], [], [], [], [], [], [], []] radDATA = [] name, kind, numlines = dataset lines = [] lines = np.random.choice(range(1, numlines+1), numlines, replace=False) #if numlines > 1000: # lines = np.random.choice(range(1, numlines+1), 1000, replace=True) #else: # lines = np.random.choice(range(1, numlines+1), numlines, replace=False) #path = mydir+'data/'+kind+'/'+name+'/'+name+'-SADMetricData_NoMicrobe1s.txt' path = mydir+'data/'+kind+'/'+name+'/'+name+'-SADMetricData.txt' for line in lines: data = linecache.getline(path, line) radDATA.append(data) tN = 0 for data in radDATA: data = data.split() name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou, EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace, jknife1, jknife2, margalef, menhinick, preston_a, preston_S = data N = float(N) tN += N S = float(S) #if S < 2 or N < 10: continue Nlist.append(float(np.log10(N))) Slist.append(float(np.log10(S))) ESimplist.append(float(np.log10(float(ESimp)))) KindList.append(kind) BPlist.append(float(BP)) NmaxList.append(float(np.log10(float(Nmax)))) # log-modulo transformation of skewnness lms = np.log10(np.abs(float(skew)) + 1) if skew < 0: lms = lms * -1 rareSkews.append(float(lms)) print 'total number of reads in', name, ':', print '%.3e' % tN sys.exit() if index == 0: metlist = list(rareSkews) elif index == 1: metlist = list(NmaxList) elif index == 2: metlist = list(ESimplist) elif index == 3: metlist = list(Slist) # Simple regression d = pd.DataFrame({'N': list(Nlist)}) d['y'] = list(metlist) f = 
smf.ols('y ~ N', d).fit() IntList.append(f.params[0]) CoefList.append(f.params[1]) R2List.append(f.rsquared) PIx = list(Nlist) st, data, ss2 = summary_table(f, alpha=0.05) # ss2: Obs, Dep Var Population, Predicted Value, Std Error Mean Predict, # Mean ci 95% low, Mean ci 95% upp, Predict ci 95% low, Predict ci 95% upp, # Residual, Std Error Residual, Student Residual, Cook's D Fitted = data[:,2] predict_mean_se = data[:,3] CiL, CiH = data[:,4:6].T PiL, PiH = data[:,6:8].T PIx, Fitted, CiH, CiL = zip(*sorted(zip(PIx, Fitted, CiH, CiL))) plt.scatter(Nlist, metlist, color = 'SkyBlue', alpha= 1 , s = 4, linewidths=0.5, edgecolor='Steelblue') plt.fill_between(PIx, CiL, CiH, color='b', lw=0.0, alpha=0.3) Int = round(np.mean(IntList), 2) Coef = round(np.mean(CoefList), 2) R2 = round(np.mean(R2List), 3) print dataset, metric, Int, Coef, R2 x = min(Nlist) y = 1.1*max(metlist) plt.scatter([0],[-1], color = 'SkyBlue', alpha = 1, s=10, linewidths=0.9, edgecolor='Steelblue', label= metric+' = '+str(round(10**Int, 2))+'*'+r'$N$'+'$^{'+str(round(Coef, 2))+'}$'+'\n'+r'$R^2$' + '=' +str(R2) +' (n='+str(len(PIx))+')') if index == 2: leg = plt.legend(loc=3,prop={'size':fs-1}) leg.draw_frame(False) else: leg = plt.legend(loc=2,prop={'size':fs-1}) leg.draw_frame(False) plt.ylim(min(metlist), max(metlist)*1.1) plt.xlim(min(Nlist), max(Nlist)) plt.xlabel('Total abundance, ' + r'$log_{10}(N)$', fontsize=fs) plt.ylabel(metric, fontsize=fs) plt.tick_params(axis='both', which='major', labelsize=fs-3) metrix = ['rarity', 'dominance', 'evenness', 'richness'] print>>OUT, name, kind, metrix[index], np.mean(PIx), np.mean(Slist), Int, Coef fig.suptitle(dataset[0], fontsize=fs+2) #plt.subplots_adjust(wspace=0.4, hspace=0.4) #plt.savefig(mydir+'/figs/appendix/Fig1/PerDataset/Locey_Lennon_2015_'+name+'_NoMicrobeSingletons.png', dpi=600, bbox_inches = "tight") plt.savefig(mydir+'/figs/appendix/Fig1/PerDataset/Locey_Lennon_2015_'+name+'.png', dpi=600, bbox_inches = "tight") #plt.show() #plt.close() 
OUT.close() return
def Fig1(ref, Ones): datasets = [] if ref == 'ClosedRef': GoodNames = [ 'MGRAST', 'HMP', 'EMPclosed', 'BBS', 'CBC', 'MCDB', 'GENTRY', 'FIA' ] if ref == 'OpenRef': GoodNames = [ 'MGRAST', 'HMP', 'EMPopen', 'BBS', 'CBC', 'MCDB', 'GENTRY', 'FIA' ] for name in os.listdir(mydir + 'data/micro'): if name in GoodNames: pass else: continue if Ones == 'N': path = mydir + 'data/micro/' + name + '/' + name + '-SADMetricData_NoMicrobe1s.txt' elif Ones == 'Y': path = mydir + 'data/micro/' + name + '/' + name + '-SADMetricData.txt' num_lines = sum(1 for line in open(path)) datasets.append([name, 'micro', num_lines]) print name, num_lines for name in os.listdir(mydir + 'data/macro'): if name in GoodNames: pass else: continue if Ones == 'N': path = mydir + 'data/macro/' + name + '/' + name + '-SADMetricData_NoMicrobe1s.txt' elif Ones == 'Y': path = mydir + 'data/macro/' + name + '/' + name + '-SADMetricData.txt' num_lines = sum(1 for line in open(path)) datasets.append([name, 'macro', num_lines]) print name, num_lines metrics = ['log-modulo skewness', 'log-skew'] fig = plt.figure() for index, i in enumerate(metrics): metric = i fig.add_subplot(2, 2, index + 1) fs = 10 # font size used across figures MicIntList, MicCoefList, MacIntList, MacCoefList, R2List, metlist = [ [], [], [], [], [], [] ] Nlist, Slist, ESimplist, klist, radDATA, BPlist, NmaxList, rareSkews, KindList = [ [], [], [], [], [], [], [], [], [] ] EvarList, EQList, OList = [[], [], []] SkewList, LogSkewList = [[], []] its = 1000 for n in range(its): #print n, metric Nlist, Slist, ESimplist, klist, radDATA, BPlist, NmaxList, rareSkews, KindList = [ [], [], [], [], [], [], [], [], [] ] EvarList, EQList, OList = [[], [], []] SkewList, LogSkewList = [[], []] numMac = 0 numMic = 0 radDATA = [] for dataset in datasets: name, kind, numlines = dataset lines = [] if name == 'EMPclosed' or name == 'EMPopen': lines = np.random.choice(range(1, numlines + 1), 100, replace=True) elif kind == 'micro': lines = 
np.random.choice(range(1, numlines + 1), 100, replace=True) else: lines = np.random.choice(range(1, numlines + 1), 60, replace=True) if Ones == 'N': path = mydir + 'data/' + kind + '/' + name + '/' + name + '-SADMetricData_NoMicrobe1s.txt' elif Ones == 'Y': path = mydir + 'data/' + kind + '/' + name + '/' + name + '-SADMetricData.txt' for line in lines: data = linecache.getline(path, line) radDATA.append(data) for data in radDATA: data = data.split() name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou, EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace, jknife1, jknife2, margalef, menhinick, preston_a, preston_S = data KindList.append(kind) N = float(N) S = float(S) if S < 10 or N < 10: continue # Min species richness Nlist.append(float(np.log10(N))) Slist.append(float(np.log10(S))) # Rarity lms = np.log10(np.abs(float(skew)) + 1) if skew < 0: lms = lms * -1 SkewList.append(float(lms)) LogSkewList.append(float(logskew)) if kind == 'micro': numMic += 1 klist.append('b') if kind == 'macro': klist.append('r') numMac += 1 if index == 0: metlist = list(SkewList) elif index == 1: metlist = list(LogSkewList) # Multiple regression d = pd.DataFrame({'N': list(Nlist)}) d['y'] = list(metlist) d['Kind'] = list(KindList) f = smf.ols('y ~ N * Kind', d).fit() MacIntList.append(f.params[0]) MacCoefList.append(f.params[2]) if f.pvalues[1] < 0.05: MicIntList.append(f.params[1] + f.params[0]) else: MicIntList.append(f.params[0]) if f.pvalues[3] < 0.05: MicCoefList.append(f.params[3] + f.params[2]) else: MicCoefList.append(f.params[2]) R2List.append(f.rsquared) MacListX = [] MacListY = [] MicListX = [] MicListY = [] for j, k in enumerate(KindList): if k == 'micro': MicListX.append(Nlist[j]) MicListY.append(metlist[j]) elif k == 'macro': MacListX.append(Nlist[j]) MacListY.append(metlist[j]) MacPIx, MacFitted, MicPIx, MicFitted = [[], [], [], []] macCiH, macCiL, micCiH, micCiL = [[], [], [], []] lm = smf.ols('y ~ N * Kind', d).fit() #print metric, '\n', lm.summary() #f1 = 
smf.ols('y ~ N', d).fit() #print metric, '\n', f1.summary() st, data, ss2 = summary_table(lm, alpha=0.05) fittedvalues = data[:, 2] predict_mean_se = data[:, 3] predict_mean_ci_low, predict_mean_ci_upp = data[:, 4:6].T predict_ci_low, predict_ci_upp = data[:, 6:8].T for j, kval in enumerate(KindList): if kval == 'macro': macCiH.append(predict_mean_ci_upp[j]) macCiL.append(predict_mean_ci_low[j]) MacPIx.append(Nlist[j]) MacFitted.append(f.fittedvalues[j]) elif kval == 'micro': micCiH.append(predict_mean_ci_upp[j]) micCiL.append(predict_mean_ci_low[j]) MicPIx.append(Nlist[j]) MicFitted.append(f.fittedvalues[j]) MicPIx, MicFitted, micCiH, micCiL = zip( *sorted(zip(MicPIx, MicFitted, micCiH, micCiL))) MacPIx, MacFitted, macCiH, macCiL = zip( *sorted(zip(MacPIx, MacFitted, macCiH, macCiL))) for i in range(len(MicListX)): plt.scatter(MacListX[i], MacListY[i], color='LightCoral', alpha=1, s=4, linewidths=0.5, edgecolor='Crimson') plt.scatter(MicListX[i], MicListY[i], color='SkyBlue', alpha=1, s=4, linewidths=0.5, edgecolor='Steelblue') plt.fill_between(MacPIx, macCiL, macCiH, color='r', lw=0.0, alpha=0.3) plt.fill_between(MicPIx, micCiL, micCiH, color='b', lw=0.0, alpha=0.3) MicInt = round(np.mean(MicIntList), 2) MicCoef = round(np.mean(MicCoefList), 2) MacInt = round(np.mean(MacIntList), 2) MacCoef = round(np.mean(MacCoefList), 2) r2 = round(np.mean(R2List), 2) if index == 0: plt.ylim(0, 2.5) plt.xlim(0, 7) plt.text(0.3, 2.2, r'$micro$' + ' = ' + str(round(MicInt, 2)) + '*' + r'$N$' + '$^{' + str(round(MicCoef, 2)) + '}$', fontsize=fs - 1, color='Steelblue') plt.text(0.3, 2.0, r'$macro$' + ' = ' + str(round(MacInt, 2)) + '*' + r'$N$' + '$^{' + str(round(MacCoef, 2)) + '}$', fontsize=fs - 1, color='Crimson') plt.text(0.3, 1.7, r'$R^2$' + '=' + str(round(r2, 3)), fontsize=fs - 1, color='k') if index == 1: plt.ylim(-1, 4.5) plt.xlim(0, 7) plt.text(0.3, 4.0, r'$micro$' + ' = ' + str(round(MicInt, 2)) + '*' + r'$N$' + '$^{' + str(round(MicCoef, 2)) + '}$', fontsize=fs - 1, 
color='Steelblue') plt.text(0.3, 3.5, r'$macro$' + ' = ' + str(round(MacInt, 2)) + '*' + r'$N$' + '$^{' + str(round(MacCoef, 2)) + '}$', fontsize=fs - 1, color='Crimson') plt.text(0.3, 2.9, r'$R^2$' + '=' + str(round(r2, 3)), fontsize=fs - 1, color='k') plt.xlabel('Number of reads or individuals, ' + '$log$' + r'$_{10}$', fontsize=fs) plt.ylabel(metric, fontsize=fs) plt.tick_params(axis='both', which='major', labelsize=fs - 1) plt.subplots_adjust(wspace=0.4, hspace=0.4) if ref == 'OpenRef' and Ones == 'N': plt.savefig( mydir + '/figs/appendix/Rarity/SupplementaryRarityFig-OpenRef_NoMicrobe1s.png', dpi=600, bbox_inches="tight") elif ref == 'OpenRef' and Ones == 'Y': plt.savefig(mydir + '/figs/appendix/Rarity/SupplementaryRarityFig-OpenRef.png', dpi=600, bbox_inches="tight") elif ref == 'ClosedRef' and Ones == 'Y': plt.savefig( mydir + '/figs/appendix/Rarity/SupplementaryRarityFig-ClosedRef.png', dpi=600, bbox_inches="tight") elif ref == 'ClosedRef' and Ones == 'N': plt.savefig( mydir + '/figs/appendix/Rarity/SupplementaryRarityFig-ClosedRef_NoMicrobe1s.png', dpi=600, bbox_inches="tight") #plt.show() return
def scatter_interaction(ax, x, y, groups, colors, ms=5, labels=None, title=None, legend=False, formula='y ~ x', legend_loc='best'):
    """Scatter several groups on one axis, each with its own OLS fit.

    For every group the points are drawn, the model given by `formula` is
    fitted to that group's data (columns are named 'x' and 'y'), and the
    fitted line plus the 95% confidence band of the mean are drawn in the
    group's colour. The r-squared and the p-values of all non-intercept terms
    are printed per group.

    ax         -- axis to draw on; a new 5x4 figure is created if it is falsy.
    x, y       -- sequences holding one array of values per group.
    groups     -- group names, used for the legend and the printed summary.
    colors     -- one colour per group.
    ms         -- marker size; line widths and font sizes are derived from it.
    labels     -- optional (x label, y label) pair; skipped when None.
    title      -- optional axis title.
    legend     -- draw a legend when true.
    formula    -- statsmodels formula for the per-group fit.
    legend_loc -- legend location.

    Returns the axis that was drawn on. Note that the global matplotlib font
    size is changed via plt.rc as a side effect.
    """
    # Here are the imports
    import numpy as np
    import matplotlib.pylab as plt
    import pandas as pd
    from statsmodels.formula.api import ols
    from statsmodels.stats.outliers_influence import summary_table

    # If you haven't already been given an axis on which to plot, then
    # create a new figure
    if not ax:
        fig = plt.figure(figsize = (5,4))
        ax = fig.add_subplot(111)

    # Repeat the marker cycle so there is always at least one per group.
    marker_styles = [ 'o', '^', 'D', 's', '*' ] * len(groups)

    # Loop through all the groups
    for i, (x_i, y_i, c_i, g_i, m_i) in enumerate(zip(x, y, colors, groups, marker_styles)):

        # Scatter each with the appropriate colors
        ax.scatter(x_i, y_i, c=c_i, edgecolor=c_i, alpha=0.8, s=ms, marker=m_i, zorder=7*i)

        # Fit on data sorted by x so the line and band plot left to right.
        # (DataFrame.sort was removed from pandas; sort_values replaces it.)
        df2 = pd.DataFrame({ 'x' : x_i, 'y' : y_i })
        df2 = df2.sort_values('x')
        lm = ols(formula, df2).fit()

        ps = [ '{:2.4f}'.format(p) for p in lm.pvalues[1:] ]
        print('    {}, r2 = {}, p(s) = {}'.format(g_i, lm.rsquared, ', '.join(ps)))

        fit_y = np.array(lm.fittedvalues)

        # Columns 4 and 5 of the summary table are the lower/upper bounds of
        # the confidence interval of the mean.
        st, data, ss2 = summary_table(lm, alpha=0.05)
        predict_mean_ci_low, predict_mean_ci_upp = data[:,4:6].T

        # Now plot
        ax.plot(df2.x.values, fit_y, c=c_i, linestyle = '-', linewidth = ms/25.0, zorder=6*i, label=g_i)
        ax.plot(df2.x.values, predict_mean_ci_low, c=c_i, linestyle = '-', linewidth = ms/50.0, zorder=3*i)
        ax.plot(df2.x.values, predict_mean_ci_upp, c=c_i, linestyle = '-', linewidth = ms/50.0, zorder=2*i)
        ax.fill_between(df2.x.values, predict_mean_ci_upp, predict_mean_ci_low, alpha=0.3, facecolor=c_i, interpolate=True, zorder=1*i)

    if legend:
        # Add the legend
        leg = ax.legend(loc=legend_loc, fancybox=True, fontsize=ms/2.)
        leg.get_frame().set_alpha(0)

    # Set the y limits
    # This is to deal with very small numbers (the MaxNLocator gets all turned around!)
    y_all = np.concatenate(y) if len(y) > 1 else y[0]
    max_y = np.max(y_all)
    min_y = np.min(y_all)
    # 10.0 keeps this a true division for integer data under Python 2.
    pad = ( max_y - min_y ) / 10.0
    upper = max_y + pad
    lower = min_y - pad
    ax.set_ybound(upper, lower)

    # Set the axis labels (only when the caller supplied them)
    if labels is not None:
        ax.set_ylabel(labels[1], fontsize=ms/2.0)
        ax.set_xlabel(labels[0], fontsize=ms/2.0)

    for item in (ax.get_xticklabels() + ax.get_yticklabels()):
        item.set_fontsize(ms/2.0)

    # Adjust the power limits so that you use scientific notation on the y axis.
    # Applied to `ax` itself: the pyplot call would act on whichever axes
    # happen to be current, which need not be the one passed in.
    ax.ticklabel_format(style='sci', axis='y')
    ax.yaxis.major.formatter.set_powerlimits((-3,3))

    plt.rc('font', **{'size':ms/2.0})

    if title:
        # Set the overall title
        ax.set_title(title)

    plt.tight_layout()

    return ax
def Fig2(condition, ones, sampling): """ A figure demonstrating a strong abundance relationship across 30 orders of magnitude in total abundance. The abundance of the most abundant species scales in a log-log fashion with the total abundance of the sample or system. """ tail = str() if ones is False: tail = '-SADMetricData_NoMicrobe1s.txt' elif ones is True: tail = '-SADMetricData.txt' datasets = [] GoodNames = [] emp = str() if condition == 'open': emp = 'EMPopen' elif condition == 'closed': emp = 'EMPclosed' GoodNames = [emp, 'TARA', 'HMP', 'BIGN', 'BOVINE', 'CHU', 'LAUB', 'SED', 'HUMAN', 'CHINA', 'CATLIN', 'FUNGI'] fs = 13 # font size used across figures Nlist, NmaxList, klist, datasets, radDATA = [[],[],[],[],[]] for name in os.listdir(mydir +'data/micro'): #if name in BadNames: continue if name in GoodNames: pass else: continue path = mydir+'data/micro/'+name+'/'+name+tail numlines = sum(1 for line in open(path)) print name, numlines datasets.append([name, 'micro', numlines]) for dataset in datasets: name, kind, numlines = dataset lines = [] small = ['BIGN', 'BOVINE', 'CHU', 'LAUB', 'SED'] big = ['HUMAN', 'CHINA', 'CATLIN', 'FUNGI', 'HYDRO'] if name in small: lines = np.random.choice(range(1, numlines+1), 1000, replace=True) elif name in big: lines = np.random.choice(range(1, numlines+1), 2500, replace=True) elif name == 'TARA': lines = np.random.choice(range(1, numlines+1), 2500, replace=True) else: lines = np.random.choice(range(1, numlines+1), 2500, replace=True) path = mydir+'data/micro/'+name+'/'+name+tail for line in lines: data = linecache.getline(path, line) radDATA.append(data) klist.append('DarkCyan') for data in radDATA: data = data.split() name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou, EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace, jknife1, jknife2, margalef, menhinick, preston_a, preston_S = data N = float(N) S = float(S) #if S < 10 or N < 11: continue # Min species richness Nlist.append(float(np.log10(float(N)))) 
NmaxList.append(float(np.log10(float(Nmax)))) klist.append('DarkCyan') metric = 'Dominance, '+'$log$'+r'$_{10}$' fig = plt.figure() ax = fig.add_subplot(1, 1, 1) Nlist, NmaxList = zip(*sorted(zip(Nlist, NmaxList))) Nlist = list(Nlist) NmaxList = list(NmaxList) # Regression d = pd.DataFrame({'N': list(Nlist)}) d['y'] = list(NmaxList) f = smf.ols('y ~ N', d).fit() R2 = f.rsquared pval = f.pvalues intercept = f.params[0] slope = f.params[1] #print f.summary() #print intercept, slope X = np.linspace(6, 40, 100) Y = f.predict(exog=dict(N=X)) Nlist2 = Nlist + X.tolist() NmaxList2 = NmaxList + Y.tolist() d = pd.DataFrame({'N': list(Nlist2)}) d['y'] = list(NmaxList2) f = smf.ols('y ~ N', d).fit() st, data, ss2 = summary_table(f, alpha=0.05) fittedvalues = data[:,2] pred_mean_se = data[:,3] pred_mean_ci_low, pred_mean_ci_upp = data[:,4:6].T pred_ci_low, pred_ci_upp = data[:,6:8].T label1 = 'Dominance scaling law for microbial data compilation' label2 = 'Ranges of published $N_{max}$ and $N$' plt.fill_between(Nlist2, pred_ci_low, pred_ci_upp, color='r', lw=0.5, alpha=0.2) plt.text(2, 22, r'$N_{max}$'+ ' = '+str(round(10**intercept,2))+'*'+r'$N$'+'$^{'+str(round(slope,2))+'}$', fontsize=fs+4, color='Crimson', alpha=0.9) plt.text(2, 19, r'$r^2$' + ' = ' +str("%.2f" % R2), fontsize=fs+4, color='0.2') plt.plot(X.tolist(), Y.tolist(), '--', c='red', lw=2, alpha=0.8, color='Crimson', label=label1) print 'r-squared and slope for RADs w/out inferred:', round(R2, 3), round(slope,3) plt.hexbin(Nlist, NmaxList, mincnt=1, gridsize = 50, bins='log', cmap=plt.cm.Reds_r) # #plt.scatter(Nlist, NmaxList, color = 'LightCoral', alpha= 0.6 , s = 10, linewidths=0.5, edgecolor='Crimson') GO = np.log10([360.0*(10**26), 1010.0*(10**26)]) # estimated open ocean bacteria; Whitman et al. 1998 Pm = np.log10([2.8*(10**27), 3.0*(10**27)]) # estimated Prochlorococcus; Flombaum et al. 2013 Syn = np.log10([6.7*(10**26), 7.3*(10**26)]) # estimated Synechococcus; Flombaum et al. 
2013 Earth = np.log10([9.2*(10**29), 31.7*(10**29)]) # estimated bacteria on Earth; Kallmeyer et al. 2012 SAR11 = np.log10([2.0*(10**28), 2.0*(10**28)]) # estimated percent abundance of SAR11; Morris et al. (2002) HGx = np.log10([0.5*(10**14), 1.5*(10**14)]) # estimated bacteria in Human gut; Berg (1996) HGy = np.log10([0.05*(10**min(HGx)), 0.15*(10**max(HGx))]) # estimated most abundant bacteria in Human gut; Turnbaugh et al. (2009), & Dethlefsen et al. (2008) COWx = np.log10([0.5*2.226*(10**15), 1.5*2.226*(10**15)]) # estimated bacteria in Cow rumen; LOW: HIGH: Whitman et al. (1998) COWy = np.log10([0.09*(10**min(COWx)), .15*(10**max(COWx))]) # estimated dominance in Cow rumen; Stevenson and Weimer (2006) c = '0.2' ## EARTH x = [np.mean(Earth)] x_range = (max(Earth) - min(Earth))/2.0 y = [np.mean([min(Pm), max(SAR11)])] y_range = (max(SAR11) - min(Pm))/2.0 ax.text(8.5, max(SAR11)+0.2, r'$Prochlorococcus$ and Pelagibacterales', fontsize=fs+2, color = 'k') ax.text(max(Earth)+0.5, 26, 'Earth microbiome', fontsize=fs+2, color = 'k', rotation = 90) ax.axhline(y, 0, 0.90, ls = '--', c = '0.4') ax.axvline(x, 0, 0.85, ls = '--', c = '0.4') plt.errorbar(x, y, xerr=x_range, yerr=y_range, color='k', linewidth=1, label=label2) c = '0.4' ## GLOBAL OCEAN x = [np.mean(GO)] x_range = (max(GO) - min(GO))/2.0 y = [np.mean(Pm)] y_range = (max(SAR11) - min(Pm))/2.0 ax.text(7.5, min(Pm)-1.35, r'$Synechococcus$ and $Prochlorococcus$', fontsize=fs+2, color = 'k') ax.text(min(GO)-1, 22, 'Non-sediment ocean bacteria', fontsize=fs+2, color = 'k', rotation = 90) ax.axhline(y, 0, 0.85, ls = '--', c = '0.4') ax.axvline(x, 0, 0.83, ls = '--', c = '0.4') plt.errorbar(x, y, xerr=x_range, yerr=y_range, color='k', linewidth=1) ## HUMAN GUT x = [np.mean(HGx)] x_range = (max(HGx) - min(HGx))/2.0 y = [np.mean(HGy)] y_range = (max(HGy) - min(HGy))/2.0 ax.text(4, min(HGy)-1, 'Human gut', fontsize=fs+2, color = 'k') ax.text(min(HGx)-1, 8, 'Human gut', fontsize=fs+2, color = 'k', rotation = 90) 
ax.axhline(y, 0, 0.40, ls = '--', c = '0.4') ax.axvline(x, 0, 0.38, ls = '--', c = '0.4') plt.errorbar(x, y, xerr=x_range, yerr=y_range, color='k', linewidth=1) ## COW RUMEN x = [np.mean(COWx)] x_range = (max(COWx) - min(COWx))/2.0 y = [np.mean(COWy)] y_range = (max(COWy) - min(COWy))/2.0 ax.text(7, max(COWy)+0.3, '$Prevotella$', fontsize=fs+2, color = 'k') ax.text(max(COWx)+0.4, 11.2, 'Cow rumen', fontsize=fs+2, color = 'k', rotation = 90) ax.axhline(y, 0, 0.41, ls = '--', c = '0.4') ax.axvline(x, 0, 0.43, ls = '--', c = '0.4') plt.errorbar(x, y, xerr=x_range, yerr=y_range, color='k', linewidth=1) ax.text(5, -4.2, 'Number of reads or total abundance, '+ '$log$'+r'$_{10}$', fontsize=fs+4) ax.text(-2.5, 22, metric, fontsize=fs+4, rotation=90) plt.plot([0,32],[0,32], ls = '--', lw=2, c='0.7') #ax.text(18, 21, '1:1 line', fontsize=fs*1.0, rotation=40, color='0.7') plt.xlim(1, 33) plt.ylim(0, 32) plt.legend(bbox_to_anchor=(-0.015, 1, 1.025, .2), loc=10, ncol=1, mode="expand",prop={'size':fs+1}, numpoints=1) if ones == False: plt.savefig(mydir+'/figs/Fig2/Locey_Lennon_2015_Fig2-'+condition+'_NoSingletons_'+str(sampling)+'.pdf', dpi=300, bbox_inches = "tight") if ones == True: plt.savefig(mydir+'/figs/Fig2/Locey_Lennon_2015_Fig2-'+condition+'_'+str(sampling)+'.pdf', dpi=300, bbox_inches = "tight") #plt.savefig(mydir+'/figs/Fig2/figure2-v2.pdf', dpi=300, bbox_inches = "tight") #plt.show() return
leg.get_frame().set_alpha(0.5) #, fontsize='small') ltext = leg.get_texts() # all the text.Text instance in the legend plt.setp(ltext, fontsize='small') # the legend text fontsize print oi.reset_ramsey(res, degree=3) #note, constant in last column for i in range(1): print oi.variance_inflation_factor(res.model.exog, i) infl = oi.OLSInfluence(res_ols) print infl.resid_studentized_external print infl.resid_studentized_internal print infl.summary_table() print oi.summary_table(res, alpha=0.05)[0] ''' >>> res.resid array([ 4.28571429, 4. , 0.57142857, -3.64285714, -4.71428571, 1.92857143, 10. , -6.35714286, -11. , -1.42857143, 1.71428571, 4.64285714]) >>> infl.hat_matrix_diag array([ 0.10084034, 0.11764706, 0.28571429, 0.20168067, 0.10084034, 0.16806723, 0.11764706, 0.08403361, 0.11764706, 0.28571429, 0.33613445, 0.08403361]) >>> infl.resid_press array([ 4.76635514, 4.53333333, 0.8 , -4.56315789, -5.24299065, 2.31818182, 11.33333333, -6.94036697, -12.46666667, -2. , 2.58227848, 5.06880734]) >>> infl.ess_press
def plot_OLS(ax, target, Y, mode='unicolor'):
    """Scatter ``Y`` against ``target`` on ``ax`` and overlay an OLS fit.

    Fits ``Y ~ const + target`` with ``sm.OLS``, draws the fitted line, the
    95% interval bounds taken from columns 6:8 of ``summary_table`` and a
    1:1 reference line, then annotates the axes with the fitted equation,
    R^2, RMSE and NSE.

    Parameters
    ----------
    ax : matplotlib axes
        Axes that receive every artist.
    target : array-like
        Values on the x-axis and the single regressor. Must support
        ``.max()`` and ``.mean()``.
    Y : array-like
        Values on the y-axis and the regression response.
    mode : str, optional
        ``'unicolor'`` draws silver points; any other value colours the
        points by a Gaussian kernel density estimate, densest on top.

    Returns
    -------
    None
    """
    results = sm.OLS(Y, sm.add_constant(target)).fit()
    _, data, _ = summary_table(results, alpha=0.05)
    fittedvalues = data[:, 2]
    predict_ci_low, predict_ci_upp = data[:, 6:8].T

    if mode == 'unicolor':
        ax.scatter(target, Y, c='silver', linewidths=0, s=4)
    else:
        # np.vstack replaces the deprecated np.row_stack alias.
        xy = np.vstack([target, Y])
        density = gaussian_kde(xy)(xy)
        # Draw the densest points last so they end up on top.
        by_density = density.argsort()
        ax.scatter(xy[0][by_density], xy[1][by_density],
                   c=density[by_density], s=4, cmap=pl.cm.inferno_r)

    ax.plot(target, fittedvalues, 'r-', label='Least Square Regression', lw=2)

    # Order both interval curves by x so each is drawn left to right;
    # ordering by the bound's own values only works while it is monotone.
    x_vals = np.asarray(target)
    order = np.argsort(x_vals)
    # NOTE(review): columns 6:8 appear to be the prediction interval for
    # observations rather than the mean confidence interval - confirm that
    # the legend wording is intended.
    ax.plot(x_vals[order], predict_ci_low[order], 'r--', lw=2,
            label='95% confidence interval')
    ax.plot(x_vals[order], predict_ci_upp[order], 'r--', lw=2)

    # Square axes from the origin with a 1:1 reference line.
    mx = np.ceil(max(target.max(), fittedvalues.max()))
    ax.plot([0, mx], [0, mx], 'k-')
    ax.set_xlim(0, mx)
    ax.set_ylim(0, mx)
    ax.set_aspect(1)
    ax.legend(loc='upper left')
    ax.set_xlabel('AGB from map [Mg ha$^{-1}$]')
    ax.set_ylabel('Reconstructed AGB [Mg ha$^{-1}$]')

    nse = 1 - ((Y - target)**2).sum() / ((target - target.mean())**2).sum()
    rmse = np.sqrt(((Y - target)**2).mean())
    # Positional access that also works when params is a labelled Series.
    intercept, slope = np.asarray(results.params)[:2]
    # NOTE(review): "p < 0.001" is hard-coded text; it is not derived from
    # the fitted model.
    ax.text(
        0.98, 0.02,
        'y = %4.2fx + %4.2f\nR$^2$ = %4.2f; p < 0.001\nrmse = %4.1f Mg ha$^{-1}$ ; NSE = %4.2f'
        % (slope, intercept, results.rsquared, rmse, nse),
        va='bottom', ha='right', transform=ax.transAxes)
def _sort_group_by_x(xs, fitted, ci_upp, ci_low):
    """Return one cutoff group's parallel series ordered by ascending x."""
    if not xs:
        # zip(*sorted(...)) of an empty group cannot be unpacked.
        return [], [], [], []
    xs, fitted, ci_upp, ci_low = zip(*sorted(zip(xs, fitted, ci_upp, ci_low)))
    return list(xs), list(fitted), list(ci_upp), list(ci_low)


def Fig1(cutoffs, Ones):
    """Plot four diversity metrics against log10(N) for each OTU cutoff.

    For every metric (rarity, dominance, evenness, richness) a 2x2 panel is
    filled with the observations of the '99', '97' and '95' cutoff groups,
    the fitted lines of the model ``y ~ N * Kind`` and the mean intercept,
    slope and R^2 over ``its`` reshuffled fits. The figure is saved below
    ``mydir + 'figs/appendix/PercentCutoff/'`` and then closed.

    Parameters
    ----------
    cutoffs : iterable of str
        Cutoff labels; each is concatenated into the data path and stored as
        the ``Kind`` factor. Only '99', '97' and '95' are drawn.
    Ones : str
        'N' reads the ``_NoMicrobe1s`` data files, 'Y' reads the full ones.

    Raises
    ------
    ValueError
        If ``Ones`` is neither 'N' nor 'Y'.

    Notes
    -----
    Relies on the module-level names ``mydir``, ``plt``, ``np``, ``pd``,
    ``smf``, ``linecache`` and ``summary_table``.
    """
    if Ones == 'N':
        suffix = '-SADMetricData_NoMicrobe1s.txt'
    elif Ones == 'Y':
        suffix = '-SADMetricData.txt'
    else:
        # Previously this surfaced later as an unbound ``path`` variable.
        raise ValueError("Ones must be 'N' or 'Y', got %r" % (Ones,))

    datasets = []
    GoodNames = ['LAUB', 'CHU', 'HYDRO', 'CATLIN']
    for cutoff in cutoffs:
        for name in GoodNames:
            path = mydir+'data/micro/'+name+'/'+name+cutoff+'/'+name+cutoff+suffix
            # Context manager so the handle is closed after counting lines.
            with open(path) as handle:
                num_lines = sum(1 for _ in handle)
            datasets.append((name, cutoff, path, num_lines))
            print('%s %s' % (name, num_lines))

    metrics = ['Rarity, '+r'$log_{10}$', 'Dominance, '+r'$log_{10}$',
               'Evenness, ' +r'$log_{10}$', 'Richness, ' +r'$log_{10}$']

    # Per-panel y-limits and the y-positions of the 99/97/95/R^2 text lines.
    panel_ylim = [(-0.1, 2.0), (0, 6), (-3.0, 0.0), (0.9, 4.5)]
    panel_text_y = [(1.7, 1.5, 1.3, 1.1), (5.1, 4.6, 4.1, 3.6),
                    (-1.8, -2.1, -2.4, -2.7), (3.9, 3.5, 3.1, 2.7)]
    # (cutoff, scatter edge colour, fitted-line colour, text/legend colour)
    styles = [('99', 'Steelblue', 'b', 'Steelblue'),
              ('97', 'Crimson', 'r', 'Crimson'),
              ('95', '0.4', '0.2', '0.3')]
    # Positions of each non-baseline cutoff's intercept and slope offsets in
    # the fitted parameter vector.
    # NOTE(review): presumably the terms are ordered Intercept, Kind[T.97],
    # Kind[T.99], N, N:Kind[T.97], N:Kind[T.99] - verify against the model
    # summary when the set of cutoffs changes.
    offsets = {'97': (1, 4), '99': (2, 5)}

    fig = plt.figure()
    for index, metric in enumerate(metrics):
        fig.add_subplot(2, 2, index+1)
        fs = 10 # font size used across figures

        int_lists = {'99': [], '97': [], '95': []}
        coef_lists = {'99': [], '97': [], '95': []}
        R2List = []

        its = 100
        for _ in range(its):
            Nlist, Slist, ESimplist, NmaxList, rareSkews, KindList = [], [], [], [], [], []

            radDATA = []
            for name, cutoff, path, numlines in datasets:
                # Every line is drawn exactly once, in random order.
                lines = np.random.choice(range(1, numlines+1), numlines, replace=False)
                for line in lines:
                    radDATA.append(cutoff+' '+linecache.getline(path, line))

            for data in radDATA:
                # Full unpack keeps the implicit check on the field count.
                (cutoff, name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee,
                 EPielou, EHeip, BP, SimpDom, Nmax, McN, skew, logskew,
                 chao1, ace, jknife1, jknife2, margalef, menhinick,
                 preston_a, preston_S) = data.split()

                Nlist.append(float(np.log10(float(N))))
                Slist.append(float(np.log10(float(S))))
                ESimplist.append(float(np.log10(float(ESimp))))
                KindList.append(cutoff)
                NmaxList.append(float(np.log10(float(Nmax))))

                # log-modulo transformation of skewness; the sign test must
                # use the numeric value, not the raw string field.
                skew_val = float(skew)
                lms = np.log10(np.abs(skew_val) + 1)
                if skew_val < 0:
                    lms = lms * -1
                rareSkews.append(float(lms))

            metlist = list([rareSkews, NmaxList, ESimplist, Slist][index])

            # Multiple regression
            d = pd.DataFrame({'N': list(Nlist)})
            d['y'] = list(metlist)
            d['Kind'] = list(KindList)
            f = smf.ols('y ~ N * Kind', d).fit()
            print(f.summary())

            # Plain arrays give positional access without relying on the
            # deprecated integer fallback of label-indexed Series.
            params = np.asarray(f.params)
            pvalues = np.asarray(f.pvalues)

            int_lists['95'].append(params[0])
            coef_lists['95'].append(params[3])
            for key in ('97', '99'):
                i_int, i_coef = offsets[key]
                # An offset only counts when it is significant at 0.05.
                if pvalues[i_int] < 0.05:
                    int_lists[key].append(params[i_int] + params[0])
                else:
                    int_lists[key].append(params[0])
                if pvalues[i_coef] < 0.05:
                    coef_lists[key].append(params[i_coef] + params[3])
                else:
                    coef_lists[key].append(params[3])

            R2List.append(f.rsquared)

        print(metric)
        # Refit on the data of the last iteration for the plotted lines.
        lm = smf.ols('y ~ N * Kind', d).fit()
        print(lm.summary())
        print('\n\n')

        st, data, ss2 = summary_table(lm, alpha=0.05)
        # ss2: Obs, Dep Var Population, Predicted Value, Std Error Mean Predict,
        # Mean ci 95% low, Mean ci 95% upp, Predict ci 95% low, Predict ci 95% upp,
        # Residual, Std Error Residual, Student Residual, Cook's D
        predict_mean_ci_low, predict_mean_ci_upp = data[:,4:6].T
        fitted = np.asarray(lm.fittedvalues)

        groups = {}
        for key, _edge, _line, _text in styles:
            groups[key] = {'x': [], 'y': [], 'fit': [], 'ci_upp': [], 'ci_low': []}
        for j, kval in enumerate(KindList):
            grp = groups.get(kval)
            if grp is None:
                continue
            grp['x'].append(Nlist[j])
            grp['y'].append(metlist[j])
            grp['fit'].append(fitted[j])
            grp['ci_upp'].append(predict_mean_ci_upp[j])
            grp['ci_low'].append(predict_mean_ci_low[j])

        for key, edge, _line, _text in styles:
            grp = groups[key]
            plt.scatter(grp['x'], grp['y'], facecolor = 'none', alpha= 1 , s = 5,
                        linewidths=0.5, edgecolor=edge)

        for key, _edge, line_color, _text in styles:
            grp = groups[key]
            # Each group is sorted on its own; one joint zip over all groups
            # would cut every series to the shortest group and order the
            # 97/95 series by the 99 group's x-values.
            PIx, Fitted, CiH, CiL = _sort_group_by_x(
                grp['x'], grp['fit'], grp['ci_upp'], grp['ci_low'])
            # The mean-CI band is currently not drawn; to enable it:
            #plt.fill_between(PIx, CiL, CiH, color=line_color, lw=0.0, alpha=0.3)
            plt.plot(PIx, Fitted, color=line_color, ls='--', lw=1, alpha=0.8)

        R2 = round(np.mean(R2List), 2)

        if index == 1:
            plt.plot([0,7],[0,7], ls = '--', lw=1, c='0.7')
        plt.ylim(*panel_ylim[index])
        plt.xlim(1, 6)

        text_y = panel_text_y[index]
        for (key, _edge, _line, text_color), ypos in zip(styles, text_y[:3]):
            Int = round(np.mean(int_lists[key]), 2)
            Coef = round(np.mean(coef_lists[key]), 2)
            plt.text(1.35, ypos,
                     '$'+key+'%$'+ ' = '+str(round(10**Int,2))+'*'+r'$N$'+'$^{'+str(round(Coef,2))+'}$',
                     fontsize=fs, color=text_color)
        plt.text(1.35, text_y[3], r'$R^2$' + '=' +str(R2), fontsize=fs-1, color='k')

        if index == 0:
            # Off-axis points that exist only to create the legend handles.
            for key, _edge, _line, text_color in styles:
                plt.scatter([0],[-1], color = 'none', alpha = 1, s=10, linewidths=0.9,
                            edgecolor=text_color,
                            label= key+'% (n='+str(len(groups[key]['y']))+')')
            plt.legend(bbox_to_anchor=(-0.04, 1.05, 2.48, .2), loc=10, ncol=3,
                       mode="expand",prop={'size':fs+2})

        plt.xlabel('Number of reads, '+ '$log$'+r'$_{10}$', fontsize=fs)
        plt.ylabel(metric, fontsize=fs)
        plt.tick_params(axis='both', which='major', labelsize=fs-3)

    plt.subplots_adjust(wspace=0.4, hspace=0.4)
    if Ones =='N':
        plt.savefig(mydir+'figs/appendix/PercentCutoff/PercentCutoff_NoMicrobe1s.png', dpi=600, bbox_inches = "tight")
    elif Ones =='Y':
        plt.savefig(mydir+'figs/appendix/PercentCutoff/PercentCutoff.png', dpi=600, bbox_inches = "tight")
    #plt.show()
    plt.close()

    return