Example #1
0
def test_outlier_influence_funcs():
    """Smoke test: the summary-table helpers run on small OLS fits."""
    # Two random regressors with a constant column added by add_constant.
    exog = add_constant(np.random.randn(10, 2))
    endog = exog.sum(1) + np.random.randn(10)

    # Model on the full design matrix.
    full_fit = OLS(endog, exog).fit()
    oi.summary_table(full_fit, alpha=0.05)

    # Model on the first design column only, exercised through both the
    # module-level function and the influence object's method.
    single_fit = OLS(endog, exog[:, 0]).fit()
    oi.summary_table(single_fit, alpha=0.05)
    single_fit.get_influence().summary_table()
Example #2
0
def test_outlier_influence_funcs():
    """Run ``summary_table`` on two OLS fits; passes if nothing raises."""
    design = add_constant(np.random.randn(10, 2))
    response = design.sum(1) + np.random.randn(10)

    # First fit uses every column of the design matrix.
    fit_all = OLS(response, design).fit()
    oi.summary_table(fit_all, alpha=0.05)

    # Second fit uses only column 0 of the design matrix.
    first_column = design[:, 0]
    fit_one = OLS(response, first_column).fit()
    oi.summary_table(fit_one, alpha=0.05)

    # The same table is also reachable from the influence object.
    influence = fit_one.get_influence()
    influence.summary_table()
Example #3
0
def seasonal_prediction(df,
                        df_result,
                        y_name,
                        time_name,
                        new_t,
                        period,
                        show=False,
                        option='cma'):
    """
    Add seasonally adjusted predictions to a copy of ``df``.

    df should be the deseasoned df if possible. It must contain the columns
    ``y_name``, ``'SID'`` and ``'SeaIdx'``, which are read below.

    Parameters
    ----------
    df : pandas.DataFrame
        Observed data; it is copied, never modified.
    df_result :
        Fitted regression results: passed to ``sso.summary_table`` and asked
        to ``predict`` for the new time points.
    y_name : str
        Name of the observed-value column; the prediction column is named
        ``'Pre_' + y_name``.
    time_name : str
        Column that receives the values of ``new_t`` in the appended rows.
    new_t : sequence
        Future time points to predict for. May be empty.
    period : int
        Season length, forwarded to ``seasonal_index``.
    show, option :
        Forwarded to ``seasonal_index``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the ``Pre_<y_name>`` column and, when ``new_t`` is
        not empty, one extra row per new time point (index reset).

    Raises
    ------
    IndexError
        If ``new_t`` has more entries than ``df`` has rows.
    """
    # Local import: only needed for the row concatenation further down.
    import pandas as pd

    y_v = df[y_name]
    pred_col = f'Pre_{y_name}'

    # In-sample predictions: column 2 of the summary data scaled by each
    # row's seasonal index. Both code paths need exactly this.
    _, data, _ = sso.summary_table(df_result, alpha=0.05)
    tdf = df.copy()
    tdf[pred_col] = data[:, 2] * tdf['SeaIdx']

    if len(new_t) == 0:
        # Nothing to project. The previous version still called predict()
        # on the empty input here and discarded the result.
        return tdf

    new_t = np.array(new_t)

    # The returned index is not used here; the call is kept because with
    # show=True it prints the seasonal index.
    seasonal_index(y_v, period, show, option=option)

    trend_proj = df_result.predict(sm.add_constant(new_t))

    # Each new row reuses the seasonal values found len(new_t) rows before
    # it. The old index expression
    # -(1 + i) - (len(new_t) - (1 + i)) reduces to -len(new_t) on the
    # growing frame, i.e. the last len(new_t) rows of the original frame,
    # taken in order.
    n_new = len(new_t)
    start = len(tdf) - n_new
    if start < 0:
        raise IndexError(
            'new_t has more entries than df has rows to copy seasonal '
            'values from')
    sid_values = tdf['SID'].values
    sea_values = tdf['SeaIdx'].values

    new_rows = [
        {
            time_name: t,
            'SID': sid_values[start + i],
            'SeaIdx': sea_values[start + i],
            pred_col: trend_proj[i] * sea_values[start + i],
        }
        for i, t in enumerate(new_t)
    ]
    # DataFrame.append was removed in pandas 2.0; concat is its replacement.
    tdf = pd.concat([tdf, pd.DataFrame(new_rows)], ignore_index=True)

    return tdf
Example #4
0
def test_outlier_influence_funcs(reset_randomstate):
    """Check that ``alpha`` changes the summary table, then smoke-test it."""
    exog = add_constant(np.random.randn(10, 2))
    endog = exog.sum(1) + np.random.randn(10)
    fit = OLS(endog, exog).fit()

    # GH3344 : Check alpha has an effect
    # Element 1 of the returned tuple is the numeric data array; columns 6
    # and 7 are compared between the default alpha and alpha=0.01.
    data_default = oi.summary_table(fit)[1]
    data_01 = oi.summary_table(fit, alpha=0.01)[1]
    lower_not_above = np.all(data_01[:, 6] <= data_default[:, 6])
    upper_not_below = np.all(data_01[:, 7] >= data_default[:, 7])
    assert_(lower_not_above)
    assert_(upper_not_below)

    # Fit on the first design column only; these calls just have to run.
    single_fit = OLS(endog, exog[:, 0]).fit()
    oi.summary_table(single_fit, alpha=0.05)
    single_fit.get_influence().summary_table()
def test_outlier_influence_funcs(reset_randomstate):
    """Verify ``summary_table`` reacts to ``alpha`` and runs on a second fit."""
    design = add_constant(np.random.randn(10, 2))
    response = design.sum(1) + np.random.randn(10)
    res = OLS(response, design).fit()

    tables = {
        'default': oi.summary_table(res),
        'alpha_01': oi.summary_table(res, alpha=0.01),
    }
    default_data = tables['default'][1]
    narrow_alpha_data = tables['alpha_01'][1]

    # GH3344 : Check alpha has an effect
    # With alpha=0.01, column 6 must not exceed and column 7 must not fall
    # below the values obtained with the default alpha.
    assert_(np.all(narrow_alpha_data[:, 6] <= default_data[:, 6]))
    assert_(np.all(narrow_alpha_data[:, 7] >= default_data[:, 7]))

    # Smoke test on a model that uses only column 0 of the design matrix.
    res_single = OLS(response, design[:, 0]).fit()
    oi.summary_table(res_single, alpha=0.05)
    influence = res_single.get_influence()
    influence.summary_table()
def OLS_fit(S1="600015", S2="600016"):
    """Regress the closing price of ``S2`` on that of ``S1`` and plot the result.

    Loads both series for 2009-01-01 .. 2009-11-30 through ``dbloader``,
    aligns them, fits an OLS model with a constant, prints its summary and
    saves two figures under ``./image``: real vs. fitted values, and the
    spread ``ts2 - w1 * ts1`` together with its mean and mean +/- one
    standard deviation.

    Parameters
    ----------
    S1, S2 : str
        Identifiers passed to ``load_day_a``.
    """
    import statsmodels.api as sm
    from statsmodels.stats.outliers_influence import summary_table

    loader = dbloader("./dataset/training_data")
    first_day, last_day = "20090101", "20091130"
    ts1 = loader.load_day_a(S1, first_day, last_day)["Closing Price"]
    ts2 = loader.load_day_a(S2, first_day, last_day)["Closing Price"]
    ts1, ts2 = align_series(ts1, ts2)

    # Plain arrays go into the model; the series are kept for the spread.
    response = ts2.values
    design = sm.add_constant(ts1.values)
    res = sm.OLS(response, design).fit()
    print(res.summary())
    # The table is computed as before; its values are not used further.
    _, data, _ = summary_table(res)

    plt.plot(response, label="real")
    plt.plot(res.fittedvalues, label="fitted")
    plt.legend()
    plt.savefig("./image/OLS_{}_{}.png".format(S1, S2))
    plt.show()

    # Spread between the two series, weighted by the second fitted parameter.
    weight = res.params[1]
    diff = ts2 - weight * ts1
    centre = diff.mean()
    width = diff.std()
    levels = (centre, centre + width, centre - width)
    flat_lines = [pd.Series(level, index=diff.index) for level in levels]

    sets = pd.concat([diff] + flat_lines, axis=1)
    sets.columns = ["diff", "mean", "up", "down"]
    sets.plot(figsize=(14, 7))
    plt.savefig("./image/OLS_diff_{}_{}.png".format(S1, S2), dpi=800)
    plt.show()
Example #7
0
def simple_CIPIINT_regplot(df, xname, yname, alpha=0.05):
    """Plot a simple regression with its confidence and prediction bands.

    Fits ``yname ~ xname`` by OLS on ``df`` sorted by ``xname`` and draws the
    data points, the fitted line, the mean confidence interval (red) and the
    prediction interval (blue, dashed).

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both columns.
    xname, yname : str
        Column names of the regressor and the response.
    alpha : float
        Significance level passed to ``summary_table``.
    """
    print(
        "\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n|CI PI Interval plot - simple|\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
    )
    print("using alpha = ", alpha)

    # Sort by x so the line plots run left to right.
    df_sorted = df.sort_values([xname])
    result = smf.ols(yname + '~' + xname, data=df_sorted).fit()
    x = df_sorted[xname].values
    y = df_sorted[yname].values
    _, data, _ = sso.summary_table(result, alpha=alpha)
    fittedvalues = data[:, 2]
    predict_mean_ci_low, predict_mean_ci_upp = data[:, 4:6].T
    predict_ci_low, predict_ci_upp = data[:, 6:8].T

    # Labels are attached to the artists themselves. Passing four labels
    # positionally to legend() for six artists put "prediction interval" on
    # the upper confidence line and left the blue lines unlabelled.
    plt.plot(x, y, 'o', color='gray', label='data points')
    plt.plot(x, fittedvalues, '-', lw=0.5, label='regression model')
    plt.plot(x, predict_mean_ci_low, 'r-', lw=0.4,
             label='confidence interval')
    plt.plot(x, predict_mean_ci_upp, 'r-', lw=0.4)
    plt.plot(x, predict_ci_low, 'b--', lw=0.4, label='prediction interval')
    plt.plot(x, predict_ci_upp, 'b--', lw=0.4)
    plt.title('CI PI plot')
    plt.xlabel(xname)
    plt.ylabel(yname)
    plt.legend(title='Legends',
               bbox_to_anchor=(1.3, 1),
               prop={'size': 6})
    plt.show()
Example #8
0
def multiple_durbin_watson(df, xnames, yname, alpha=0.05):
    """Fit a multiple OLS model and print a Durbin-Watson style statistic.

    The statistic is the sum of squared successive differences of column 10
    of the summary-table data (labelled "Studentized Residual" by the
    original author) divided by the sum of squares of that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the regressor and response columns.
    xnames : sequence of str
        Regressor column names.
    yname : str
        Response column name.
    alpha : float
        Significance level passed to ``summary_table``.

    Returns
    -------
    float
        The statistic ``d`` that is also printed. Earlier versions returned
        nothing.
    """
    print("\n\n========== Durbin-Watson ==========\n")

    y_data = df[yname]
    # Select the regressors directly so they keep df's index. Rebuilding the
    # frame from raw arrays gave it a fresh 0..n-1 index, which statsmodels
    # rejects as misaligned whenever df is not indexed that way.
    x_data = df[list(xnames)]
    x_data2 = sm.add_constant(x_data)
    result = sm.OLS(y_data, x_data2).fit()

    _, data, ss2 = sso.summary_table(result, alpha=alpha)
    print("Columns in data are: %s" % ss2)
    # Studentized Residual
    SD = data[:, 10]

    x_square_sum = np.vdot(SD, SD)
    print("x_square_sum = ", x_square_sum)
    size = SD.size
    print("size = ", size)
    x_d = np.zeros((size))
    print("x_d = ", x_d)
    # Successive differences; element 0 stays 0 because it has no
    # predecessor.
    x_d[1:] = np.diff(SD)
    print("x_d = ", x_d)
    d = np.vdot(x_d, x_d) / x_square_sum
    print("d = ", d)
    return d
Example #9
0
def regress(x, y, alpha=.05, xlabel='', ylabel='', title=''):
    """Fit ``y`` on ``x`` by OLS and show a fit plot and a residual plot.

    Parameters
    ----------
    x, y : pandas.Series
        Regressor and response. ``x.name`` is set to ``'x'`` when it is an
        empty string.
    alpha : float
        Significance level for the interval bands. It used to be ignored in
        favour of a hard-coded 0.05.
    xlabel, ylabel : str
        Axis labels; also used to spell out the fitted equation.
    title : str
        Optional title for the first figure. It used to be ignored.

    Returns
    -------
    numpy.ndarray
        The data array returned by ``summary_table``.
    """
    if x.name == '':
        x.name = 'x'
    X = sm.add_constant(x)
    res = sm.OLS(y, X).fit()

    _, data, _ = summary_table(res, alpha=alpha)
    fittedvalues = data[:, 2]
    predict_mean_ci_low, predict_mean_ci_upp = data[:, 4:6].T
    predict_ci_low, predict_ci_upp = data[:, 6:8].T

    # Read the coefficients by position without integer-indexing a
    # label-indexed Series.
    params = list(res.params)
    intercept, slope = params[0], params[1]
    equation = (ylabel + " = %.4f" % slope + " * " + xlabel + "  + "
                + "%.4f" % intercept)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(x, y, label="data", s=5)
    # The label follows alpha; the default still reads
    # 'OLS: 95% confidence'.
    ax.plot(x, fittedvalues, 'r-',
            label='OLS: %g%% confidence' % (100 * (1 - alpha)))
    ax.plot(x, predict_ci_low, 'b--')
    ax.plot(x, predict_ci_upp, 'b--')
    ax.plot(x, predict_mean_ci_low, 'g--')
    ax.plot(x, predict_mean_ci_upp, 'g--')
    # Highlight the most recent observation.
    ax.scatter(x=x.tail(1), y=y.tail(1), color='Red', s=25, label="Now")
    ax.legend(loc='best')
    if title:
        ax.set_title(title)
    ax.set_xlabel(xlabel + "          " + equation, color='g', fontsize=15)
    ax.set_ylabel(ylabel, color='b', fontsize=15)
    plt.show()

    # Residuals (column 8 of the summary data) against the index of x.
    fig, ax = plt.subplots(figsize=(8, 6))
    plt.plot(x.index, data[:, 8])
    plt.title("Residual plot:  " + equation, fontsize=20)
    ax.set_xlabel("Date ", color='g', fontsize=15)
    ax.set_ylabel("Residual ", color='g', fontsize=15)
    plt.show()
    return data
Example #10
0
 def linear_regression(self, x, y):
     """Return the OLS fitted values of ``y`` on ``x`` plus a constant."""
     design = sm.add_constant(x)
     fit = sm.OLS(y, design).fit()
     # Column 2 of the summary-table data is what callers plot as the fit.
     _, table, _ = summary_table(fit, alpha=0.05)
     return table[:, 2]
def figplot(x, y, sims, clrs, xlab, ylab, fig, n):
    """Draw panel ``n`` of a 2x2 figure: scatter, OLS line and annotation.

    Parameters
    ----------
    x, y : iterable
        Point coordinates.
    sims :
        Not used by this function.
    clrs : iterable
        One colour per point.
    xlab, ylab : str
        Axis labels.
    fig : matplotlib figure
        Figure that receives the subplot.
    n : int
        Panel number; 1-4 select the annotation text and y-limits.

    Returns
    -------
    The figure passed in.
    """
    fig.add_subplot(2, 2, n)
    ys = list(y)
    xs = list(x)
    clrs = list(clrs)

    plt.scatter(xs, ys, color=clrs, s=2, linewidths=0.0)

    frame = pd.DataFrame({'x': list(xs)})
    frame['y'] = list(ys)
    model = smf.ols('y ~ x', frame).fit()
    _, table, _ = summary_table(model, alpha=0.05)
    fitted = table[:, 2]
    m, b, _r, _p, _std_err = stats.linregress(xs, ys)

    # Per panel: the three row headers, the fixed microbe/macrobe terms and
    # the y position of the text box.
    panels = {
        1: (r'$R_{models}$', r'$R_{microbes}$', ' = 2.34*', '$^{0.14}$',
            r'$R_{macrobes}$', ' = 1.7*', '$^{0.11}$', 1.4),
        2: (r'$D_{models}$', r'$D_{microbes}$', ' = 0.44*', '$^{0.92}$',
            r'$D_{macrobes}$', ' = 0.23*', '$^{0.99}$', 2.5),
        3: (r'$E_{models}$', r'$E_{microbes}$', ' = 0.58*', '$^{-0.23}$',
            r'$E_{macrobes}$', ' = 1.15*', '$^{-0.21}$', -3.4),
        4: (r'$S_{models}$', r'$S_{microbes}$', ' = 1.77*', '$^{0.38}$',
            r'$S_{macrobes}$', ' = 1.77*', '$^{0.24}$', 2.8),
    }
    if n in panels:
        (model_hdr, micro_hdr, micro_coef, micro_exp, macro_hdr, macro_coef,
         macro_exp, text_y) = panels[n]
        # The model row shows the fitted relation as 10**b * N**m.
        pieces = [
            model_hdr, ' = ', str(round(10**b, 2)), '*', r'$N$', '$^{',
            str(round(m, 2)), '}$', '\n',
            micro_hdr, micro_coef, r'$N$', micro_exp, '\n',
            macro_hdr, macro_coef, r'$N$', macro_exp,
        ]
        plt.text(0.2, text_y, ''.join(pieces), fontsize=7)

    if n == 3:
        plt.legend(loc='best', fontsize=6, frameon=False)

    plt.plot(xs, fitted, color='k', ls='--', lw=1.0, alpha=0.9)
    plt.xlabel(xlab, fontsize=8)
    plt.ylabel(ylab, fontsize=8)
    plt.tick_params(axis='both', labelsize=5)
    plt.xlim(0, 1.05 * max(xs))

    if n in (1, 2):
        plt.ylim(0.0, max(ys))
    elif n == 3:
        plt.ylim(min(ys), 0)
    elif n == 4:
        plt.ylim(0.4, max(ys))

    return fig
Example #12
0
def linearR(df, column1, column2):
    """Return OLS fitted values of ``column2`` on ``column1`` plus a constant.

    Parameters
    ----------
    df :
        Object exposing ``toPandas()``.
    column1, column2 : str
        Regressor and response column names.

    Returns
    -------
    numpy.ndarray
        Column 2 of the ``summary_table`` data array.
    """
    # Convert once and reuse; the conversion used to run twice, once per
    # column.
    pdf = df.toPandas()
    x = sm.add_constant(pdf[column1])
    y = pdf[column2]
    res = sm.OLS(y, x).fit()
    _, data, _ = summary_table(res, alpha=0.05)
    return data[:, 2]
def figplot(clrs, x, y, xlab, ylab, fig, n):
    """Draw panel ``n`` of a 2x2 log-log figure with a power-law fit.

    The regression is done on ``log10`` of both variables. The fitted line
    and the prediction band are transformed back before plotting.

    Parameters
    ----------
    clrs : sequence
        One colour per point.
    x, y : sequence of positive numbers
        Point coordinates.
    xlab, ylab : str
        Axis labels.
    fig : matplotlib figure
        Figure that receives the subplot.
    n : int
        Panel number (1-4); selects the legend text, its location and the
        y-limits.

    Returns
    -------
    The figure passed in.
    """
    fig.add_subplot(2, 2, n)
    plt.xscale('log')
    # A preceding `plt.yscale('log', subsy=[1, 2])` for n == 1 was dropped:
    # newer matplotlib no longer accepts `subsy`, and this call replaced
    # that scale immediately anyway.
    plt.yscale('log')
    plt.minorticks_off()

    d = pd.DataFrame({'x': np.log10(x)})
    d['y'] = np.log10(y)
    f = smf.ols('y ~ x', d).fit()

    m, b, _r, _p, _std_err = stats.linregress(np.log10(x), np.log10(y))
    _, data, _ = summary_table(f, alpha=0.05)
    fitted = data[:, 2]
    ci_low, ci_upp = data[:, 6:8].T

    # Sort everything by x so the line and the band are drawn left to right.
    x, y, fitted, ci_low, ci_upp, clrs = zip(
        *sorted(zip(x, y, fitted, ci_low, ci_upp, clrs)))

    x = np.array(x)
    y = np.array(y)
    # Back-transform from log10 space.
    fitted = 10**np.array(fitted)
    ci_low = 10**np.array(ci_low)
    ci_upp = 10**np.array(ci_upp)

    # add_subplot(2, 2, n) above already restricts n to 1..4.
    quantity = {1: r'$rarity$', 2: r'$Nmax$', 3: r'$Ev$', 4: r'$S$'}[n]
    lbl = quantity + ' = ' + str(round(
        10**b, 1)) + '*' + r'$N$' + '$^{' + str(round(m, 2)) + '}$'

    # `sz` (marker size) is expected at module level.
    plt.scatter(x, y, s=sz, color=clrs, linewidths=0.0, edgecolor=None)
    plt.fill_between(x, ci_upp, ci_low, color='0.5', lw=0.1, alpha=0.2)
    plt.plot(x, fitted, color='k', ls='--', lw=0.5, label=lbl)

    plt.legend(loc=3 if n == 3 else 2, fontsize=8, frameon=False)

    plt.xlabel(xlab, fontsize=10)
    plt.ylabel(ylab, fontsize=10)
    plt.tick_params(axis='both', labelsize=6)
    if n in [2, 4]:
        plt.ylim(min(y), max(y))
    elif n == 1:
        plt.ylim(min(ci_low), max(ci_upp))
    elif n == 3:
        plt.ylim(0.1, 1.1)

    return fig
def fixed_effect(treat, control, k, SEED):
    """Write the stacked treated/control panel to CSV.

    Builds a long-format table with the columns ``time``, ``y``, ``unit`` and
    ``treated`` and saves it as ``data_<SEED>.csv`` under ``synthetic_path``.

    Relies on names defined outside this function: ``T``, ``T0``, ``N_tr``,
    ``N_co`` and ``synthetic_path``.

    NOTE(review): the bare ``return`` after ``to_csv`` ends the function, so
    the regression, confidence-band and plotting code below it never runs and
    ``k`` is effectively unused. Confirm whether that is intentional.

    Parameters
    ----------
    treat, control :
        Outcome arrays; each is flattened with ``reshape(-1, 1)`` and stacked
        with ``treat`` first.
        # presumably shaped (N_tr, T) and (N_co, T) -- verify against caller
    k :
        Only used in the output file names of the unreachable part.
    SEED :
        Used in the output file names.
    """
    # Time index 0..T-1, repeated once per unit, as a column vector.
    x = np.arange(T)
    x = np.concatenate([x for _ in range(N_tr+N_co)]).reshape(-1,1)
    # Unit label for every row: each unit id repeated T times.
    units = np.concatenate([[i for _ in range(T)] for i in range(N_tr+N_co)]).reshape(-1,1)
    # 1.0 for rows whose unit id is below N_tr and whose time is >= T0.
    treated = np.logical_and((units<N_tr), (x>=T0)).astype("float")
    y = np.concatenate([treat.reshape(-1,1),control.reshape(-1,1)])
    COLUMNS = ["time", "y", "unit", "treated"]
    data = pd.DataFrame(np.concatenate((x,y,units,treated),axis=1),columns=COLUMNS)
    data.to_csv(synthetic_path+"/data_{}.csv".format(SEED), index=False)

    # NOTE(review): everything below this return is unreachable.
    return
    
    fit = ols('y ~ 1 + C(time) + C(unit) + treated:C(time)', data=data).fit()
    ypred = fit.predict(data)
    m_tr = ypred[:N_tr*T].to_numpy().reshape(N_tr,T)
    m_co = ypred[N_tr*T:].to_numpy().reshape(N_co,T)

    # print(fit.summary())

    # Subtract the estimated per-period treatment term from the treated
    # units' predictions for t >= T0.
    for t in range(T0, T, 1):
        m_tr[:, t] -= fit.params["treated:C(time)[{}.0]".format(t)]

    _, data, _ = summary_table(fit, alpha=0.05)

    predict_mean_ci_lower, predict_mean_ci_upper = data[:, 4:6].T

    lower_tr = predict_mean_ci_lower[:N_tr*T].reshape(N_tr,T)
    upper_tr = predict_mean_ci_upper[:N_tr*T].reshape(N_tr,T)
    lower_co = predict_mean_ci_lower[N_tr*T:].reshape(N_co,T)
    upper_co = predict_mean_ci_upper[N_tr*T:].reshape(N_co,T)

    # The lower band subtracts column 1 of conf_int() and the upper band
    # subtracts column 0.
    for t in range(T0, T, 1):
        lower_tr[:, t] -= fit.conf_int().loc["treated:C(time)[{}.0]".format(t),1]
        upper_tr[:, t] -= fit.conf_int().loc["treated:C(time)[{}.0]".format(t),0]

    test_t = np.arange(T)

    # plt.plot(test_t, np.mean(control, axis=0), color='grey', alpha=0.8)
    # plt.plot(test_t,  np.mean(m_co, axis=0), 'k--', linewidth=1.0, label='Estimated Y(0)')
    # plt.fill_between(test_t, np.mean(lower_co, axis=0), np.mean(upper_co, axis=0), alpha=0.5)
    # plt.show()

    # Rows: mean of treat - m_tr, then of treat - upper_tr and
    # treat - lower_tr, each averaged over axis 0.
    ATT = np.stack([np.mean(treat-m_tr, axis=0),
                    np.mean(treat-upper_tr, axis=0),
                    np.mean(treat-lower_tr, axis=0)])

    plt.rcParams["figure.figsize"] = (15,5)
    plt.plot(test_t, ATT[0],'k--', linewidth=1.0, label="Estimated ATT")
    plt.fill_between(test_t, ATT[1], ATT[2], alpha=0.5, label="ATT 95% CI")
    plt.legend(loc=2)
    plt.savefig(synthetic_path+"/fixedeffect{}_{}.png".format(k, SEED))
    plt.close()
    
    np.savetxt(synthetic_path+"/fixedeffect{}_{}.csv".format(k, SEED), ATT, delimiter=",")
def figplot(x, y, xlab, ylab, fig, n, binned=1):
    '''main figure plotting function

    Draws panel ``n`` of a 3x3 figure: ``log10(y)`` against ``log10(x)``
    (optionally averaged within bins), the OLS line, its prediction band and
    the fitted slope in the title.

    Parameters
    ----------
    x, y : array-like of positive numbers
        Raw values; both are log10-transformed first.
    xlab, ylab : str
        Axis labels.
    fig : matplotlib figure
        Figure that receives the subplot.
    n : int
        Panel number within the 3x3 grid.
    binned : int
        When 1, points are averaged within bins of ``xfrm``-transformed x
        before fitting.

    Returns
    -------
    The figure passed in.
    '''

    fig.add_subplot(3, 3, n)
    x = np.log10(x)
    y = np.log10(y)
    y2 = list(y)
    x2 = list(x)

    if binned == 1:
        X, Y = (np.array(t) for t in zip(*sorted(zip(x2, y2))))
        Xi = xfrm(X, max(X) * 1.05)
        bins = np.linspace(np.min(Xi), np.max(Xi) + 1, 100)
        ii = np.digitize(Xi, bins)
        # Only bins that actually contain points contribute a mean.
        occupied = [i for i in range(1, len(bins)) if np.any(ii == i)]
        y2 = np.array([np.mean(Y[ii == i]) for i in occupied])
        x2 = np.array([np.mean(X[ii == i]) for i in occupied])

    d = pd.DataFrame({'size': list(x2)})
    d['rate'] = list(y2)
    f = smf.ols('rate ~ size', d).fit()

    # Look the slope up by its term name; integer position lookups on a
    # label-indexed Series are deprecated in pandas.
    coef = f.params['size']
    _, data, _ = summary_table(f, alpha=0.05)
    fitted = data[:, 2]
    ci_low, ci_upp = data[:, 6:8].T

    # Sort by x so the line and the band are drawn left to right.
    x2, y2, fitted, ci_low, ci_upp = zip(
        *sorted(zip(x2, y2, fitted, ci_low, ci_upp)))

    plt.scatter(x2,
                y2,
                color='SkyBlue',
                alpha=1,
                s=12,
                linewidths=0.5,
                edgecolor='Steelblue')
    plt.fill_between(x2, ci_upp, ci_low, color='b', lw=0.1, alpha=0.15)
    plt.plot(x2, fitted, color='b', ls='--', lw=1.0, alpha=0.9)
    plt.xlabel(xlab, fontsize=10)
    plt.ylabel(ylab, fontsize=10)
    plt.tick_params(axis='both', labelsize=6)
    # NOTE(review): x2 holds log10 values; if they can be negative these
    # multiplicative margins shrink the range instead of widening it.
    plt.xlim(0.9 * min(x2), 1.1 * max(x2))
    plt.ylim(min(ci_low), max(ci_upp))
    plt.title('$z$ = ' + str(round(coef, 2)), fontsize=10)
    return fig
Example #16
0
def ciAnalysis(re,x,y):
    """Plot data, fitted line and both interval bands for a fitted model.

    ``re`` is a fitted regression result accepted by ``summary_table``;
    ``x`` and ``y`` are the plotted observations. Bands use ``alpha=0.10``.
    """
    _, table, _ = summary_table(re, alpha=0.10)
    fitted = table[:, 2]
    std_err_mean = table[:, 3]  # read as before; not plotted
    mean_low, mean_upp = table[:, 4:6].T
    pred_low, pred_upp = table[:, 6:8].T

    plt.plot(x, y, 'o')
    plt.plot(x, fitted, '-', lw=2)
    # Prediction bounds first, then mean bounds; all drawn in the same
    # dashed red style.
    for bound in (pred_low, pred_upp, mean_low, mean_upp):
        plt.plot(x, bound, 'r--', lw=2)
    plt.show()
Example #17
0
def seasonal_index(y_v, period: int, show=False, option='cma'):
    """
    return seasonal index and a Series with all observations linked to its seasonal index

    Parameters
    ----------
    y_v : pandas.Series or numpy.ndarray
        Observations in time order.
    period : int
        Number of seasons in one cycle.
    show : bool
        Print the resulting seasonal index.
    option : {'cma', 'lr'}
        Baseline each observation is divided by: ``cma(y_v, period)`` for
        ``'cma'``, or the fitted values of an OLS regression of ``y_v`` on
        1..len(y_v) for ``'lr'``.

    Returns
    -------
    (numpy.ndarray, pandas.Series)
        The ``period`` seasonal indices, rescaled so they sum to ``period``,
        and the season id (1..period) of every observation.

    Raises
    ------
    ValueError
        If ``option`` is neither ``'cma'`` nor ``'lr'``. This case used to
        return ``None`` silently.
    """
    if option not in ('cma', 'lr'):
        raise ValueError(
            "option must be 'cma' or 'lr', got {!r}".format(option))

    n = period
    if option == 'cma':
        ratio = y_v / cma(y_v, period)
    else:
        # Give the time regressor y_v's own index when it has one;
        # statsmodels refuses endog/exog whose indices differ.
        row_index = y_v.index if isinstance(y_v, pd.Series) else None
        X_data = pd.DataFrame(np.arange(1, len(y_v) + 1),
                              columns=['Time'],
                              index=row_index)
        X_data = sm.add_constant(X_data)
        result_reg = sm.OLS(y_v, X_data).fit()
        _, data, _ = sso.summary_table(result_reg, alpha=0.05)
        ratio = y_v / data[:, 2]

    # Season id of each observation: 1..n, repeating. A remainder of 0
    # marks the last season of a cycle.
    position = np.arange(1, len(y_v) + 1)
    season_id = position - np.floor(position / n) * n
    season_id[season_id == 0] = n
    table = pd.DataFrame({'ratio': ratio, 'SIid': season_id})

    # Mean ratio per season, ignoring missing values.
    raw = np.zeros(n)
    for j in range(1, n + 1):
        raw[j - 1] = table['ratio'][table['SIid'] == j].dropna().mean()
    # Rescale so the indices sum to n.
    index = raw / sum(raw) * n

    if show:
        print('Seasonal Index:', index)

    return index, table['SIid']
def figplot(clrs, x, y, xlab, ylab, fig, n):
    """Plot one log-log panel (position ``n`` of a 2x2 grid) with an OLS fit.

    Both variables are log10-transformed for the regression. The fitted
    values and the prediction band are raised back to powers of ten for
    display.

    Parameters
    ----------
    clrs : sequence
        One colour per point.
    x, y : sequence of positive numbers
        Point coordinates.
    xlab, ylab : str
        Axis labels.
    fig : matplotlib figure
        Figure that receives the subplot.
    n : int
        Panel number (1-4); picks the legend text, its corner and the
        y-limits.

    Returns
    -------
    The figure passed in.
    """
    fig.add_subplot(2, 2, n)
    plt.xscale('log')
    # The earlier n == 1 call `plt.yscale('log', subsy=[1, 2])` is gone:
    # `subsy` is no longer accepted by newer matplotlib, and the scale it
    # set was overwritten by this call in any case.
    plt.yscale('log')
    plt.minorticks_off()

    log_x = np.log10(x)
    log_y = np.log10(y)
    d = pd.DataFrame({'x': log_x})
    d['y'] = log_y
    f = smf.ols('y ~ x', d).fit()

    m, b = stats.linregress(log_x, log_y)[:2]
    data = summary_table(f, alpha=0.05)[1]
    fitted = data[:,2]
    ci_low, ci_upp = data[:,6:8].T

    # Order all per-point arrays by x before drawing lines and bands.
    x, y, fitted, ci_low, ci_upp, clrs = zip(*sorted(zip(x, y, fitted, ci_low, ci_upp, clrs)))

    x = np.array(x)
    y = np.array(y)
    # Undo the log10 transform.
    fitted = 10**np.array(fitted)
    ci_low = 10**np.array(ci_low)
    ci_upp = 10**np.array(ci_upp)

    # n is already limited to 1..4 by add_subplot(2, 2, n).
    symbols = {1: r'$rarity$', 2: r'$Nmax$', 3: r'$Ev$', 4: r'$S$'}
    lbl = symbols[n]+ ' = '+str(round(10**b,1))+'*'+r'$N$'+'$^{'+str(round(m,2))+'}$'

    # `sz` (marker size) is expected at module level.
    plt.scatter(x, y, s = sz, color=clrs, linewidths=0.0, edgecolor=None)
    plt.fill_between(x, ci_upp, ci_low, color='0.5', lw=0.1, alpha=0.2)
    plt.plot(x, fitted,  color='k', ls='--', lw=0.5, label = lbl)

    if n == 3:
        plt.legend(loc=3, fontsize=8, frameon=False)
    else:
        plt.legend(loc=2, fontsize=8, frameon=False)

    plt.xlabel(xlab, fontsize=10)
    plt.ylabel(ylab, fontsize=10)
    plt.tick_params(axis='both', labelsize=6)
    if n in [2, 4]:
        plt.ylim(min(y), max(y))
    elif n == 1:
        plt.ylim(min(ci_low), max(ci_upp))
    elif n == 3:
        plt.ylim(0.1, 1.1)

    return fig
Example #19
0
def lm(x, y, alpha=ALPHA):
    """fits an OLS from statsmodels. returns tuple.

    ``x`` and ``y`` are passed through ``plot_friendly``; date-like ``x``
    values are converted to ordinals before fitting.

    Returns
    -------
    tuple
        ``(fitted values, mean CI lower bound, mean CI upper bound)`` as
        pandas Series.
    """
    x, y = map(plot_friendly, [x, y])
    if _isdate(x[0]):
        x = np.array([i.toordinal() for i in x])
    X = sm.add_constant(x)
    fit = sm.OLS(y, X).fit()
    # The unused wls_prediction_std() call and the column lookups whose
    # results were never returned have been removed.
    _, summary_values, summary_names = summary_table(fit, alpha=alpha)
    # NOTE(review): the headers looked up below are spelled with "95%";
    # confirm summary_table does not rename them when alpha changes.
    columns = [snakify(name) for name in summary_names]
    df = pd.DataFrame(summary_values, columns=columns)
    return (df['predicted_value'], df['mean_ci_95%_low'],
            df['mean_ci_95%_upp'])
Example #20
0
def lm(x, y, alpha=ALPHA):
    """fits an OLS from statsmodels. returns tuple.

    Both inputs go through ``plot_friendly`` first. When the first ``x``
    value is date-like, all ``x`` values are replaced by their ordinals.

    Returns
    -------
    tuple of pandas.Series
        Fitted values, lower mean confidence bound, upper mean confidence
        bound.
    """
    x, y = map(plot_friendly, [x,y])
    if _isdate(x[0]):
        x = np.array([i.toordinal() for i in x])
    fit = sm.OLS(y, sm.add_constant(x)).fit()

    # Only the three returned columns are read; the prediction-std call and
    # the other lookups produced values nobody used.
    _, summary_values, summary_names = summary_table(fit, alpha=alpha)
    df = pd.DataFrame(summary_values,
                      columns=list(map(snakify, summary_names)))
    fittedvalues        = df['predicted_value']
    predict_mean_ci_low = df['mean_ci_95%_low']
    predict_mean_ci_upp = df['mean_ci_95%_upp']
    return (fittedvalues, predict_mean_ci_low, predict_mean_ci_upp)
Example #21
0
def _fit_reg(fit_reg, ci, ax, x, y, data, color, line_kws):
    """Draw an OLS regression line, and optionally its mean CI band, on ``ax``.

    Parameters
    ----------
    fit_reg : bool
        When false, nothing is drawn.
    ci : float or None
        Confidence level in percent, ``0 <= ci < 100``. ``None`` or 0
        disables the band.
    ax : matplotlib axes
        Target axes.
    x, y : str
        Column names in ``data``.
    data : pandas.DataFrame
        Source of both columns.
    color :
        Colour for the line and the band.
    line_kws : dict or None
        Extra keyword arguments for the line; ``lw`` defaults to 3. The
        dict passed in is not modified.

    Returns
    -------
    None in every case except one: 1 is returned when ``add_constant``
    leaves a single column (all x values equal), so no line can be fitted.

    Raises
    ------
    ValueError
        If ``ci`` is negative or not below 100.
    """
    if not fit_reg:
        return None
    if ci is None:
        ci = 0
    if ci < 0 or ci >= 100:
        raise ValueError('ci must be between 0 and 100 or `None`')

    # Work on a copy so the default line width does not leak into the
    # caller's dict.
    line_kws = {} if line_kws is None else dict(line_kws)
    line_kws.setdefault('lw', 3)

    X = data[x].values
    if len(X) == 1:
        return None
    idx_order = X.argsort()
    y = data[y].values
    if len(X) == 2:
        # Two points: just connect them.
        ax.plot(X, y, color=color, **line_kws)
        return None
    X = sm.add_constant(X)

    # if all x's are the same value, there can be no regression line
    if X.shape[1] == 1:
        return 1
    ols = sm.OLS(y, X).fit()
    pred = ols.get_prediction().predicted_mean[idx_order]
    try:
        ax.plot(X[idx_order, 1], pred, color=color, **line_kws)
    except IndexError:
        # Diagnostic output kept from the original implementation.
        print(f"col is {x}")
        print(X.shape)
        print(data[x].values)
        print(X)

    if ci != 0:
        # A separate name keeps the `data` argument from being shadowed.
        _, table, _ = summary_table(ols, alpha=1 - ci / 100)
        ax.fill_between(X[idx_order, 1],
                        table[idx_order, 4],
                        table[idx_order, 5],
                        alpha=.3,
                        color=color)
Example #22
0
def simple_outliers_DIY(df, xname, yname, alpha=0.05):
    """Report observations whose studentized residual lies outside [-2, 2].

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both columns.
    xname, yname : str
        Regressor and response column names.
    alpha : float
        Significance level passed to ``summary_table``.

    Returns
    -------
    pandas.Index
        Zero-based row positions of the flagged observations. The printed
        "Actual ID" values are these positions plus one.
    """
    # Fit regression model
    result = smf.ols(yname + '~' + xname, data=df).fit()

    # studentized residual (column 10 of the summary data)
    _, table, _ = sso.summary_table(result, alpha=alpha)
    df_out = pd.DataFrame(table[:, 10], columns=['SD'])

    # Named so it no longer shadows the builtin `filter`.
    is_outlier = (df_out['SD'] < -2) | (df_out['SD'] > 2)
    outliers = df_out['SD'].loc[is_outlier]

    print("◆ Outliers Finding\n")
    print("(remove by yourself!)\n")
    print("Outliers by SD = ")
    print(outliers)
    print("\nActual ID: ", outliers.index + 1)
    return outliers.index
Example #23
0
def lm(x, y, alpha=ALPHA):
    """fits an OLS from statsmodels. returns tuple.

    Date-like ``x`` values are converted to ordinals for the fit and back to
    ``Timestamp`` objects for the return value.

    Returns
    -------
    tuple
        ``(x, fitted values, mean CI lower bound, mean CI upper bound)``;
        the last three are numpy arrays.
    """
    x_is_date = _isdate(x.iloc[0])
    if x_is_date:
        x = np.array([i.toordinal() for i in x])
    X = sm.add_constant(x)
    fit = sm.OLS(y, X).fit()
    # The wls_prediction_std() call and the prediction-interval lookups were
    # dropped: none of their results were used or returned.
    _, summary_values, summary_names = summary_table(fit, alpha=alpha)
    columns = [_snakify(name) for name in summary_names]
    df = pd.DataFrame(summary_values, columns=columns)
    # TODO: indexing w/ data frame is messing everything up
    fittedvalues = df['predicted_value'].values
    predict_mean_ci_low = df['mean_ci_95%_low'].values
    predict_mean_ci_upp = df['mean_ci_95%_upp'].values

    if x_is_date:
        x = [Timestamp.fromordinal(int(i)) for i in x]
    return (x, fittedvalues, predict_mean_ci_low, predict_mean_ci_upp)
Example #24
0
def lm(x, y, alpha=ALPHA):
    """fits an OLS from statsmodels. returns tuple.

    When the first ``x`` value is date-like, the fit runs on ordinals and
    the returned ``x`` is converted back to ``Timestamp`` objects.

    Returns
    -------
    tuple
        ``x`` followed by three numpy arrays: fitted values, lower and upper
        bound of the mean confidence interval.
    """
    x_is_date = _isdate(x.iloc[0])
    if x_is_date:
        x = np.array([i.toordinal() for i in x])
    fit = sm.OLS(y, sm.add_constant(x)).fit()

    # Work that produced unused values (the wls_prediction_std() call and
    # the two prediction-interval lookups) has been removed.
    _, summary_values, summary_names = summary_table(fit, alpha=alpha)
    df = pd.DataFrame(summary_values,
                      columns=list(map(_snakify, summary_names)))
    # TODO: indexing w/ data frame is messing everything up
    wanted = ('predicted_value', 'mean_ci_95%_low', 'mean_ci_95%_upp')
    fittedvalues, predict_mean_ci_low, predict_mean_ci_upp = (
        df[name].values for name in wanted)

    if x_is_date:
        x = [Timestamp.fromordinal(int(i)) for i in x]
    return (x, fittedvalues, predict_mean_ci_low, predict_mean_ci_upp)
Example #25
0
def lm(x, y, alpha=ALPHA):
    """fits an OLS from statsmodels. returns tuple.

    Inputs are passed through ``plot_friendly``; date-like ``x`` values are
    converted to ordinals before fitting.

    Returns
    -------
    tuple
        ``(x, fitted values, mean CI lower bound, mean CI upper bound)``
        with the last three as plain lists.
    """
    import statsmodels.api as sm
    from statsmodels.stats.outliers_influence import summary_table
    x, y = map(plot_friendly, [x,y])
    if _isdate(x[0]):
        x = np.array([i.toordinal() for i in x])
    X = sm.add_constant(x)
    fit = sm.OLS(y, X).fit()
    # The sandbox wls_prediction_std import and call, plus three column
    # lookups, were removed because their results were never used.
    _, summary_values, summary_names = summary_table(fit, alpha=alpha)
    columns = [snakify(name) for name in summary_names]
    df = pd.DataFrame(summary_values, columns=columns)
    return (x,
            df['predicted_value'].tolist(),
            df['mean_ci_95%_low'].tolist(),
            df['mean_ci_95%_upp'].tolist())
Example #26
0
def get_pred_interval_sm(y, X, dfx, pi = 0.95):
    """Fit OLS on columns of ``dfx`` and return the prediction interval.

    Parameters
    ----------
    y : str
        Response column name.
    X : str or list of str
        Regressor column name(s); a constant is added.
    dfx : pandas.DataFrame
        Source data; it is only read.
    pi : float
        Coverage of the prediction interval; ``alpha = 1 - pi`` is passed to
        ``summary_table``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(lower prediction bound, fitted values, upper prediction bound)``.
    """
    import statsmodels.api as sm
    from statsmodels.stats.outliers_influence import summary_table

    # The columns are only read, so the full-frame copy was unnecessary.
    response = dfx[y]
    design = sm.add_constant(dfx[X])

    re = sm.OLS(response, design).fit()

    print(re.summary())

    _, data, _ = summary_table(re, alpha=1-pi)
    # Columns read: 2 fitted values, 6 and 7 prediction-interval bounds.
    fittedvalues = data[:, 2]
    predict_ci_low, predict_ci_upp = data[:, 6:8].T

    return predict_ci_low, fittedvalues, predict_ci_upp
Example #27
0
def compare_two_tickers(TICKER_A, TICKER_B):
    """Plot two price series and run the CADF test on their hedged spread.

    Prices for both tickers are retrieved for 2017-05-01 .. 2018-05-27. The
    hedge ratio ``beta`` is the coefficient of an OLS regression (without
    constant) of ``TICKER_B`` on ``TICKER_A``; the spread tested is
    ``TICKER_B - beta * TICKER_A``.

    Parameters
    ----------
    TICKER_A, TICKER_B : str
        Ticker symbols; also used as column names.
    """
    start = datetime.datetime(2017, 5, 1)
    end = datetime.datetime(2018, 5, 27)

    df = pd.DataFrame(columns=(TICKER_A, TICKER_B))
    df[TICKER_A] = retrieve_ticker_prices(TICKER_A, start, end)
    df[TICKER_B] = retrieve_ticker_prices(TICKER_B, start, end)
    print(df)

    # Plot the two time series
    plot_price_series(df, TICKER_A, TICKER_B, start, end)

    # Display a scatter plot of the two time series
    plot_scatter_series(df, TICKER_A, TICKER_B)

    # Calculate optimal hedge ratio "beta": TICKER_B regressed on TICKER_A.
    # The summary_table() call that used to follow was removed because its
    # result was never used.
    res = sm.OLS(endog=df[TICKER_B], exog=df[TICKER_A]).fit()
    beta_hr = res.params[TICKER_A]

    # Calculate the residuals of the linear combination. They must follow
    # the direction of the regression above (B explained by A). The earlier
    # code computed A - beta * B, which does not match the fitted model.
    df["res"] = df[TICKER_B] - beta_hr * df[TICKER_A]
    print('===============')
    print(beta_hr)
    print('===============')
    print(df['res'])

    # Plot the residuals
    plot_residuals(df, start, end)

    # Calculate and output the CADF test on the residuals
    calculate_adf(df['res'])
Example #28
0
def drawLinearRegressionByTail(x, y, alpha, ax):
    """Draw an OLS fit of ``y`` on ``x`` with both interval bands on ``ax``.

    The model is fitted on ``x`` exactly as given (no constant is added
    here). ``alpha`` is the significance level passed to ``summary_table``.
    """
    x = np.array(x)
    fit = sm.OLS(y, x).fit()

    _, table, _ = summary_table(fit, alpha=alpha)
    fitted = table[:, 2]
    mean_low, mean_upp = table[:, 4:6].T
    pred_low, pred_upp = table[:, 6:8].T

    dashed = {'linestyle': 'dashed'}

    # Fitted line.
    ax.plot(x, fitted, 'r-', label='Параметрична пряма')
    # Prediction bounds (black); only the lower one carries a legend entry.
    ax.plot(x, pred_low, 'black', label='Толерантні межі', **dashed)
    ax.plot(x, pred_upp, 'black', **dashed)
    # Mean confidence bounds (gray); again labelled once.
    ax.plot(x, mean_low, 'gray', label='Довірчий інтервал', **dashed)
    ax.plot(x, mean_upp, 'gray', **dashed)
    ax.legend(loc='best', fontsize='x-small')
Example #29
0
 def get_regression_summary(result,
                            conf_int=0.95,
                            columns=None,
                            as_dataframe=False):
     """Return the per-observation summary table of a fitted regression.

     Parameters
     ----------
     result :
         Fitted regression results accepted by ``summary_table``.
     conf_int : float
         Confidence level of the intervals (0.95 means 95%). It is converted
         to ``alpha = 1 - conf_int``; earlier the level itself was passed as
         ``alpha``, which inverted its meaning.
     columns :
         Currently not used.
     as_dataframe : bool
         Return a DataFrame with snake_case column names instead of the raw
         array.

     Returns
     -------
     pandas.DataFrame or numpy.ndarray
         The table data. Earlier versions returned ``None`` when
         ``as_dataframe`` was false.
     """
     column_mapper = {
         'Obs': 'obs',
         'Dep Var\nPopulation': 'dep_var_population',
         'Predicted\nValue': 'predicted_value',
         'Std Error\nMean Predict': 'std_error_mean_predict',
         'Mean ci\n95% low': 'mean_ci_lo',
         'Mean ci\n95% upp': 'mean_ci_up',
         'Predict ci\n95% low': 'pred_ci_lo',
         'Predict ci\n95% upp': 'pred_ci_up',
         'Residual': 'residual',
         'Std Error\nResidual': 'std_error_residual',
         'Student\nResidual': 'student_residual',
         "Cook's\nD": 'cooks'
     }
     _, data_table, table_columns = summary_table(result,
                                                  alpha=1 - conf_int)
     if not as_dataframe:
         return data_table
     # Headers without a mapping keep their original text instead of
     # becoming None.
     table_columns = [column_mapper.get(c, c) for c in table_columns]
     return pd.DataFrame(data_table, columns=table_columns)
def calcErrDistBug(in_filename,gold_in_filename,out_filename,title):
    """Count mutation samples that are outliers relative to the bug-free fit.

    Both input files hold three consecutive pickles: ``errs``, ``abs_errs``
    and ``mses``.  An OLS line ``log2(mse) ~ const + log2(abs_err)`` is
    fitted on the gold (bug-free) data, and ``returnOutliers`` is then asked
    which of the mutation samples fall outside it at ``alpha=0.05``.

    Parameters
    ----------
    in_filename : str
        Pickle file with the mutation run.
    gold_in_filename : str
        Pickle file with the bug-free reference run.
    out_filename, title
        Unused; kept for signature compatibility with ``plotErrDistBug``.

    Returns
    -------
    tuple of (int, int)
        Number of outliers reported by ``returnOutliers`` and the number of
        mutation samples that were tested (after filtering and subsampling).
    """
    # NOTE(review): unpickling runs arbitrary code; only load trusted files.
    with open(in_filename,'rb') as in_file:
        cPickle.load(in_file)  # errs: unused, read only to advance the stream
        abs_errs = cPickle.load(in_file)
        mses = cPickle.load(in_file)

    with open(gold_in_filename,'rb') as g_in_file:
        cPickle.load(g_in_file)  # g_errs: unused, read only to advance the stream
        g_abs_errs = cPickle.load(g_in_file)
        g_mses = cPickle.load(g_in_file)

    # ensure that we don't actually take the log of 0
    # FIXME: we may need to make this more dynamic so that it doesn't skew results.
    g_mses = np.array(g_mses)
    g_abs_errs = np.array(g_abs_errs)
    keep = (g_mses > 0.0) & (g_abs_errs > 1.0E-14)
    g_mses = g_mses[keep]
    g_abs_errs = g_abs_errs[keep]

    mses = np.array(mses)
    abs_errs = np.array(abs_errs)
    keep = (mses > 0.0) & (abs_errs > 1.0E-14)
    # Only every fifth remaining mutation sample is tested.
    skip = 5
    mses = mses[keep][::skip]
    abs_errs = abs_errs[keep][::skip]

    # determine Ordinary Least Squares on the bug-free data, in log2 space
    X = sm.add_constant(np.log2(g_abs_errs))
    results = sm.OLS(np.log2(g_mses),X).fit()

    o_x,o_y = returnOutliers(results,sm.add_constant(np.log2(abs_errs)),np.log2(mses),alpha=0.05)
    return len(o_x),len(mses)
def Fig1(ref, Ones):
    """Plot four dominance metrics against log10(N) for 'micro' vs 'macro' data.

    For each metric (log10 Nmax, McNaughton, Berger-Parker, Simpson's D) the
    rows of every dataset's SADMetricData file are resampled with
    replacement ``its`` times.  Each resample is fitted with the OLS model
    ``y ~ N * Kind``; intercepts, slopes and R-squared are averaged over all
    resamples.  The last resample is drawn as a scatter together with the
    95% confidence band of the mean prediction, and the 2x2 figure is saved
    under ``mydir + '/figs/appendix/Dominance/'``.

    Parameters
    ----------
    ref : str
        'ClosedRef' or 'OpenRef'; selects the 'EMPclosed' or 'EMPopen'
        dataset and the output file name.
    Ones : str
        'Y' reads ``<name>-SADMetricData.txt``, 'N' reads
        ``<name>-SADMetricData_NoMicrobe1s.txt``.

    Raises
    ------
    ValueError
        If ``ref`` or ``Ones`` is not one of the values listed above.
    """
    if ref == 'ClosedRef':
        emp_name = 'EMPclosed'
    elif ref == 'OpenRef':
        emp_name = 'EMPopen'
    else:
        raise ValueError("ref must be 'ClosedRef' or 'OpenRef', got %r" % (ref,))

    if Ones == 'N':
        suffix = '-SADMetricData_NoMicrobe1s.txt'
    elif Ones == 'Y':
        suffix = '-SADMetricData.txt'
    else:
        raise ValueError("Ones must be 'Y' or 'N', got %r" % (Ones,))

    # all microbe data is MGRAST
    GoodNames = [emp_name, 'HMP', 'BIGN', 'TARA', 'BOVINE', 'HUMAN', 'LAUB', 'SED', 'CHU', 'CHINA', 'CATLIN', 'FUNGI', 'HYDRO', 'BBS', 'CBC', 'MCDB', 'GENTRY', 'FIA']

    # One [name, kind, line count] entry per dataset directory that is wanted.
    datasets = []
    for kind in ('micro', 'macro'):
        for name in os.listdir(mydir + 'data/' + kind):
            if name not in GoodNames:
                continue
            path = mydir + 'data/' + kind + '/' + name + '/' + name + suffix
            with open(path) as handle:
                num_lines = sum(1 for _ in handle)
            datasets.append([name, kind, num_lines])
            print('%s %s' % (name, num_lines))

    small = ['BIGN', 'BOVINE', 'CHU', 'LAUB', 'SED']
    metrics = ['Nmax, '+r'$log_{10}$', 'McNaughton', 'Berger-Parker', 'Simpson\'s D']
    # Per panel: upper y-limit and the (x, y) position of the R^2 annotation.
    panels = [(6, (1.5, 4.0)), (120, (5.0, 90)), (1.2, (1.5, 1.0)), (1.3, (1.5, 1.1))]
    fs = 10 # font size used across figures
    its = 10000

    fig = plt.figure()
    for index, metric in enumerate(metrics):
        fig.add_subplot(2, 2, index+1)

        MicIntList, MicCoefList, MacIntList, MacCoefList, R2List = [], [], [], [], []

        for n in range(its):
            print('%s %s' % (n, metric))

            # Draw the resample: 100 rows per macro dataset, 20 per small
            # micro dataset and 50 for every other micro dataset.
            radDATA = []
            for name, kind, numlines in datasets:
                if kind == 'macro':
                    size = 100
                elif name in small:
                    size = 20
                else:
                    size = 50
                lines = np.random.choice(range(1, numlines+1), size, replace=True)
                path = mydir+'data/'+kind+'/'+name+'/'+name+suffix
                for line in lines:
                    radDATA.append(linecache.getline(path, line))

            Nlist, metlist, KindList = [], [], []
            for data in radDATA:
                # The full unpack doubles as a check that the row has all 26 columns.
                name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou, EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace, jknife1, jknife2, margalef, menhinick, preston_a, preston_S = data.split()

                N = float(N)
                S = float(S)
                if S < 2 or N < 10: continue # Min species richness

                # Dominance metrics, in the same order as `metrics`.
                values = (float(np.log10(float(Nmax))), float(McN), float(BP), float(SimpDom))

                # Record the kind only for rows that pass the filter so the
                # three lists always stay aligned.
                KindList.append(kind)
                Nlist.append(float(np.log10(N)))
                metlist.append(values[index])

            # Multiple regression
            d = pd.DataFrame({'N': Nlist, 'y': metlist, 'Kind': KindList})
            f = smf.ols('y ~ N * Kind', d).fit()

            # Positions 0..3 follow the original indexing:
            # 0/2 are used as macro intercept/slope, 1/3 as the micro offsets.
            MacIntList.append(f.params.iloc[0])
            MacCoefList.append(f.params.iloc[2])

            # The micro offsets are only added when they are significant.
            if f.pvalues.iloc[1] < 0.05:
                MicIntList.append(f.params.iloc[1] + f.params.iloc[0])
            else:
                MicIntList.append(f.params.iloc[0])

            if f.pvalues.iloc[3] < 0.05:
                MicCoefList.append(f.params.iloc[3] + f.params.iloc[2])
            else:
                MicCoefList.append(f.params.iloc[2])

            R2List.append(f.rsquared)

        # Everything below describes the last resample only; `f` is its fit.
        print('%s \n%s' % (metric, f.summary()))
        f1 = smf.ols('y ~ N', d).fit()
        print('%s \n%s' % (metric, f1.summary()))

        st, data, ss2 = summary_table(f, alpha=0.05)
        # ss2: Obs, Dep Var Population, Predicted Value, Std Error Mean Predict,
        # Mean ci 95% low, Mean ci 95% upp, Predict ci 95% low, Predict ci 95% upp,
        # Residual, Std Error Residual, Student Residual, Cook's D
        predict_mean_ci_low = data[:,4]
        predict_mean_ci_upp = data[:,5]

        xs = np.array(Nlist)
        ys = np.array(metlist)
        kinds = np.array(KindList)
        groups = (('macro', 'LightCoral', 'Crimson', 'r'),
                  ('micro', 'SkyBlue', 'Steelblue', 'b'))

        # One scatter call per group, so groups of different size are
        # drawn completely.
        for kind, face, edge, band in groups:
            sel = kinds == kind
            plt.scatter(xs[sel], ys[sel], color = face, alpha= 1 , s = 4, linewidths=0.5, edgecolor=edge)

        # Confidence band of the mean prediction, sorted by x for fill_between.
        for kind, face, edge, band in groups:
            sel = kinds == kind
            order = np.argsort(xs[sel])
            plt.fill_between(xs[sel][order], predict_mean_ci_low[sel][order], predict_mean_ci_upp[sel][order], color=band, lw=0.0, alpha=0.3)

        MicInt = round(np.mean(MicIntList), 2)
        MicCoef = round(np.mean(MicCoefList), 2)
        MacInt = round(np.mean(MacIntList), 2)
        MacCoef = round(np.mean(MacCoefList), 2)
        r2 = round(np.mean(R2List), 2)

        ymax, r2_pos = panels[index]
        plt.ylim(0, ymax)
        plt.xlim(1, 8)
        if index == 0:
            # Only the first panel shows the fitted equations.
            plt.text(1.5, 5.3, r'$micro$'+ ' = '+str(round(MicInt,2))+'*'+r'$N$'+'$^{'+str(round(MicCoef,2))+'}$', fontsize=fs, color='Steelblue')
            plt.text(1.5, 4.7, r'$macro$'+ ' = '+str(round(MacInt,2))+'*'+r'$N$'+'$^{'+str(round(MacCoef,2))+'}$', fontsize=fs, color='Crimson')
        plt.text(r2_pos[0], r2_pos[1],  r'$R^2$' + '=' +str(round(r2,3)), fontsize=fs-1, color='k')

        plt.xlabel('Number of reads or individuals, '+ '$log$'+r'$_{10}$', fontsize=fs)
        plt.ylabel(metric, fontsize=fs)
        plt.tick_params(axis='both', which='major', labelsize=fs-1)

    plt.subplots_adjust(wspace=0.4, hspace=0.4)

    tag = '_NoMicrobe1s' if Ones == 'N' else ''
    plt.savefig(mydir+'/figs/appendix/Dominance/SupplementaryDominanceFig-'+ref+tag+'.png', dpi=600, bbox_inches = "tight")

    #plt.show()

    return
def Fig1(ref, Ones):
    """Plot two rarity metrics against log10(N) for 'micro' vs 'macro' data.

    For each metric (log-modulo skewness, log-skew) the rows of every
    dataset's SADMetricData file are resampled with replacement ``its``
    times.  Each resample is fitted with the OLS model ``y ~ N * Kind``;
    intercepts, slopes and R-squared are averaged over all resamples.  The
    last resample is drawn as a scatter together with the 95% confidence
    band of the mean prediction, and the figure is saved under
    ``mydir + '/figs/appendix/Rarity/'``.

    Parameters
    ----------
    ref : str
        'ClosedRef' or 'OpenRef'; selects the 'EMPclosed' or 'EMPopen'
        dataset and the output file name.
    Ones : str
        'Y' reads ``<name>-SADMetricData.txt``, 'N' reads
        ``<name>-SADMetricData_NoMicrobe1s.txt``.

    Raises
    ------
    ValueError
        If ``ref`` or ``Ones`` is not one of the values listed above.
    """
    if ref == 'ClosedRef':
        emp_name = 'EMPclosed'
    elif ref == 'OpenRef':
        emp_name = 'EMPopen'
    else:
        raise ValueError("ref must be 'ClosedRef' or 'OpenRef', got %r" % (ref,))

    if Ones == 'N':
        suffix = '-SADMetricData_NoMicrobe1s.txt'
    elif Ones == 'Y':
        suffix = '-SADMetricData.txt'
    else:
        raise ValueError("Ones must be 'Y' or 'N', got %r" % (Ones,))

    GoodNames = ['MGRAST', 'HMP', emp_name, 'BBS', 'CBC', 'MCDB', 'GENTRY', 'FIA']

    # One [name, kind, line count] entry per dataset directory that is wanted.
    datasets = []
    for kind in ('micro', 'macro'):
        for name in os.listdir(mydir + 'data/' + kind):
            if name not in GoodNames:
                continue
            path = mydir + 'data/' + kind + '/' + name + '/' + name + suffix
            with open(path) as handle:
                num_lines = sum(1 for _ in handle)
            datasets.append([name, kind, num_lines])
            print('%s %s' % (name, num_lines))

    metrics = ['log-modulo skewness', 'log-skew']
    # Per panel: y-limits and the y positions of the micro, macro and R^2 texts.
    panels = [((0, 2.5), (2.2, 2.0, 1.7)), ((-1, 4.5), (4.0, 3.5, 2.9))]
    fs = 10 # font size used across figures
    its = 1000

    fig = plt.figure()
    for index, metric in enumerate(metrics):
        fig.add_subplot(2, 2, index+1)

        MicIntList, MicCoefList, MacIntList, MacCoefList, R2List = [], [], [], [], []

        for n in range(its):

            # Draw the resample: 100 rows per EMP or micro dataset, 60 otherwise.
            radDATA = []
            for name, kind, numlines in datasets:
                if name == 'EMPclosed' or name == 'EMPopen' or kind == 'micro':
                    size = 100
                else:
                    size = 60
                lines = np.random.choice(range(1, numlines+1), size, replace=True)
                path = mydir+'data/'+kind+'/'+name+'/'+name+suffix
                for line in lines:
                    radDATA.append(linecache.getline(path, line))

            Nlist, metlist, KindList = [], [], []
            for data in radDATA:
                # The full unpack doubles as a check that the row has all 26 columns.
                name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou, EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace, jknife1, jknife2, margalef, menhinick, preston_a, preston_S = data.split()

                N = float(N)
                S = float(S)
                if S < 10 or N < 10: continue # Min species richness

                # Rarity: log-modulo transform, sign(skew) * log10(|skew| + 1).
                # The sign test must use the parsed number, not the raw text.
                skew = float(skew)
                lms = np.log10(np.abs(skew) + 1)
                if skew < 0: lms = lms * -1

                # Rarity metrics, in the same order as `metrics`.
                values = (float(lms), float(logskew))

                # Record the kind only for rows that pass the filter so the
                # three lists always stay aligned.
                KindList.append(kind)
                Nlist.append(float(np.log10(N)))
                metlist.append(values[index])

            # Multiple regression
            d = pd.DataFrame({'N': Nlist, 'y': metlist, 'Kind': KindList})
            f = smf.ols('y ~ N * Kind', d).fit()

            # Positions 0..3 follow the original indexing:
            # 0/2 are used as macro intercept/slope, 1/3 as the micro offsets.
            MacIntList.append(f.params.iloc[0])
            MacCoefList.append(f.params.iloc[2])

            # The micro offsets are only added when they are significant.
            if f.pvalues.iloc[1] < 0.05:
                MicIntList.append(f.params.iloc[1] + f.params.iloc[0])
            else:
                MicIntList.append(f.params.iloc[0])

            if f.pvalues.iloc[3] < 0.05:
                MicCoefList.append(f.params.iloc[3] + f.params.iloc[2])
            else:
                MicCoefList.append(f.params.iloc[2])

            R2List.append(f.rsquared)

        # Everything below describes the last resample only; `f` is its fit.
        st, data, ss2 = summary_table(f, alpha=0.05)
        # Columns 4 and 5 hold the lower/upper 95% bound of the mean prediction.
        predict_mean_ci_low = data[:,4]
        predict_mean_ci_upp = data[:,5]

        xs = np.array(Nlist)
        ys = np.array(metlist)
        kinds = np.array(KindList)
        groups = (('macro', 'LightCoral', 'Crimson', 'r'),
                  ('micro', 'SkyBlue', 'Steelblue', 'b'))

        # One scatter call per group, so groups of different size are
        # drawn completely.
        for kind, face, edge, band in groups:
            sel = kinds == kind
            plt.scatter(xs[sel], ys[sel], color = face, alpha= 1 , s = 4, linewidths=0.5, edgecolor=edge)

        # Confidence band of the mean prediction, sorted by x for fill_between.
        for kind, face, edge, band in groups:
            sel = kinds == kind
            order = np.argsort(xs[sel])
            plt.fill_between(xs[sel][order], predict_mean_ci_low[sel][order], predict_mean_ci_upp[sel][order], color=band, lw=0.0, alpha=0.3)

        MicInt = round(np.mean(MicIntList), 2)
        MicCoef = round(np.mean(MicCoefList), 2)
        MacInt = round(np.mean(MacIntList), 2)
        MacCoef = round(np.mean(MacCoefList), 2)
        r2 = round(np.mean(R2List), 2)

        ylimits, text_y = panels[index]
        plt.ylim(ylimits[0], ylimits[1])
        plt.xlim(0, 7)
        plt.text(0.3, text_y[0], r'$micro$'+ ' = '+str(round(MicInt,2))+'*'+r'$N$'+'$^{'+str(round(MicCoef,2))+'}$', fontsize=fs-1, color='Steelblue')
        plt.text(0.3, text_y[1], r'$macro$'+ ' = '+str(round(MacInt,2))+'*'+r'$N$'+'$^{'+str(round(MacCoef,2))+'}$', fontsize=fs-1, color='Crimson')
        plt.text(0.3, text_y[2],  r'$R^2$' + '=' +str(round(r2,3)), fontsize=fs-1, color='k')

        plt.xlabel('Number of reads or individuals, '+ '$log$'+r'$_{10}$', fontsize=fs)
        plt.ylabel(metric, fontsize=fs)
        plt.tick_params(axis='both', which='major', labelsize=fs-1)

    plt.subplots_adjust(wspace=0.4, hspace=0.4)

    tag = '_NoMicrobe1s' if Ones == 'N' else ''
    plt.savefig(mydir+'/figs/appendix/Rarity/SupplementaryRarityFig-'+ref+tag+'.png', dpi=600, bbox_inches = "tight")

    #plt.show()

    return
def plotErrDistBug(in_filename,gold_in_filename,out_filename,title):
    """Plot output error against execution error for a mutation and the gold run.

    Both input files hold three consecutive pickles: ``errs``, ``abs_errs``
    and ``mses``.  An OLS line ``log2(mse) ~ const + log2(abs_err)`` is
    fitted on the gold (bug-free) data.  The gold points, their outliers,
    the fit and the bounds from ``wls_prediction_std`` are drawn in log2
    space; when a mutation file is given its points and outliers are drawn
    as well.  The figure is optionally saved and then shown.

    Parameters
    ----------
    in_filename : str or None
        Pickle file with the mutation run, or ``None`` to plot only the gold
        data.  The first run of digits in the name is used as the mutation
        number in the legend.
    gold_in_filename : str
        Pickle file with the bug-free reference run.
    out_filename : str or None
        Where to save the figure; ``None`` skips saving.
    title : str
        Appended to the module name in the figure title.

    Raises
    ------
    ValueError
        If ``in_filename`` is given but contains no digits.
    """
    abs_errs = None
    mses = None
    mutation_num = None

    # read data
    # NOTE(review): unpickling runs arbitrary code; only load trusted files.
    if in_filename is not None:
        match = re.search(r'\D*(\d+)\D*', in_filename)
        if match is None:
            raise ValueError('no mutation number found in %r' % (in_filename,))
        mutation_num = match.group(1)

        with open(in_filename,'rb') as in_file:
            cPickle.load(in_file)  # errs: unused, read only to advance the stream
            abs_errs = cPickle.load(in_file)
            mses = cPickle.load(in_file)

    with open(gold_in_filename,'rb') as g_in_file:
        cPickle.load(g_in_file)  # g_errs: unused, read only to advance the stream
        g_abs_errs = cPickle.load(g_in_file)
        g_mses = cPickle.load(g_in_file)

    # ensure that we don't actually take the log of 0
    g_mses = np.array(g_mses)
    g_abs_errs = np.array(g_abs_errs)
    keep = (g_mses > 0.0) & (g_abs_errs > 1.0E-10)
    g_mses = g_mses[keep]
    g_abs_errs = g_abs_errs[keep]

    # Decide once, with an identity test: comparing a numpy array to None
    # with == yields an array whose truth value is ambiguous.
    have_mutation = mses is not None
    if have_mutation:
        mses = np.array(mses)
        abs_errs = np.array(abs_errs)
        keep = (mses > 0.0) & (abs_errs > 1.0E-10)
        mses = mses[keep]
        abs_errs = abs_errs[keep]
        print('mses: ' + repr(mses))

    # determine Ordinary Least Squares on the bug-free data, in log2 space
    X = sm.add_constant(np.log2(g_abs_errs))
    results = sm.OLS(np.log2(g_mses),X).fit()
    prstd, iv_l, iv_u = wls_prediction_std(results)

    # Strip 'Gold' and any leading directories to get the module name.
    module_name = re.sub(r'Gold','',gold_in_filename)
    module_name = re.sub(r'(.*/)*','',module_name)

    plt.figure(figsize=(10.0, 7.0))

    # Every artist carries its own label, so legend entries cannot drift
    # out of step with the lines they describe.
    if have_mutation:
        mutation_label = 'Mutation ' + repr(mutation_num)
        plt.plot(np.log2(abs_errs), np.log2(mses), linestyle='', marker='+', color='b', label=mutation_label)
        o_x,o_y = returnOutliers(results,sm.add_constant(np.log2(abs_errs)),np.log2(mses),alpha=0.05)
        plt.plot(o_x, o_y, linestyle='', marker='*', color='b', label=mutation_label + ' -- outliers')
        print('Mutation outliers: ' + repr(len(o_x)))

    plt.plot(np.log2(g_abs_errs), np.log2(g_mses), linestyle='', marker='+', color='r', label='Bug-free')
    g_o_x,g_o_y = returnOutliers(results,sm.add_constant(np.log2(g_abs_errs)),np.log2(g_mses),alpha=0.05)
    plt.plot(g_o_x, g_o_y, linestyle='', marker='*', color='r', label='Bug-free -- outliers')
    print('Gold outliers: ' + repr(len(g_o_x)))

    plt.plot(np.log2(g_abs_errs), iv_l, linestyle='-', color='c', label='95% CI -')
    # The fitted line is deliberately left out of the legend.
    plt.plot(np.log2(g_abs_errs), results.fittedvalues, linestyle='-', color='k')
    plt.plot(np.log2(g_abs_errs), iv_u, linestyle='-', color='g', label='95% CI +')

    plt.legend(loc='lower right', ncol=1)

    # fix up plotting to look nice
    plt.suptitle(module_name+' '+title, fontsize=35)
    plt.xlabel('Log2(Execution Error)', fontsize=23)
    plt.ylabel('Log2(Output Error)', fontsize=23)

    if out_filename is not None:
        plt.gcf().savefig(out_filename)

    plt.show()
# Join the first six per-series arrays into one sample vector each.
y = np.concatenate([y[k] for k in range(6)])
x = np.concatenate([x[k] for k in range(6)])

# Order both vectors by increasing x so the fit and its bounds plot as lines.
xsort = x.argsort()
x, y = x[xsort], y[xsort]

# Ordinary least squares of y on x with an intercept column.
xb = sm.add_constant(x)
model = sm.OLS(y, xb)
results = model.fit()

# Prediction on an evenly spaced grid spanning the observed x range
# (only used by the commented-out plot call below).
x2 = np.linspace(x.min(), x.max(), x.size)
y2 = results.predict(sm.add_constant(x2))

# Per-observation table; alpha=0.001 is the level passed for the intervals.
st, data, ss2 = summary_table(results, alpha=0.001)
fittedvalues, predict_mean_se = data[:, 2], data[:, 3]
(predict_mean_ci_low, predict_mean_ci_upp,
 predict_ci_low, predict_ci_upp) = data[:, 4:8].T

plt.scatter(x, y)
#plt.plot(x2,y2,'r')
plt.plot(x, fittedvalues, 'k', lw=2)
#plt.plot(x, predict_ci_low, 'r--', lw=2)
#plt.plot(x, predict_ci_upp, 'r--', lw=2)
# Dashed red lines mark the interval for the mean prediction.
for bound in (predict_mean_ci_low, predict_mean_ci_upp):
    plt.plot(x, bound, 'r--', lw=2)
plt.xlabel('Max stream function at 26N (Sv)')
plt.ylabel('AMO-box SST anomaly')
plt.savefig('/home/ph290/Documents/figures/amoc_v_sst.png')
def modelcomparison():

    OUT = open(mydir + 'output/model_comparison.txt','w+')

    datasets = []
    GoodNames = ['empclosed', 'HMP', 'BIGN', 'TARA', 'BOVINE', 'HUMAN', 'LAUB', 'SED', 'CHU', 'CHINA', 'CATLIN', 'FUNGI', 'HYDRO', 'BBS', 'CBC', 'MCDB', 'GENTRY', 'FIA'] # all microbe data is MGRAST


    for name in os.listdir(mydir +'data/micro'):
        if name in GoodNames: pass
        else: continue

        #path = mydir+'data/micro/'+name+'/'+name+'-SADMetricData_NoMicrobe1s.txt'
        path = mydir+'data/micro/'+name+'/'+name+'-SADMetricData.txt'

        num_lines = sum(1 for line in open(path))
        datasets.append([name, 'micro', num_lines])
        print>>OUT, name, num_lines

    for name in os.listdir(mydir +'data/macro'):
        if name in GoodNames: pass
        else: continue

        #path = mydir+'data/macro/'+name+'/'+name+'-SADMetricData_NoMicrobe1s.txt'
        path = mydir+'data/macro/'+name+'/'+name+'-SADMetricData.txt'

        num_lines = sum(1 for line in open(path))
        datasets.append([name, 'macro', num_lines])
        print>>OUT, name, num_lines

    rarity = []
    dominance = []
    evenness = []
    richness = []
    Nlist = []

    metrics = ['Rarity', 'Dominance', 'Evenness', 'Richness']
    for index, i in enumerate(metrics):

        print i, ':   R-squared   :   AIC  :   BIC'
        print>>OUT, i, ':   R-squared   :   AIC  :   BIC'

        loglogR2s, linlogR2s, linearR2s, loglinR2s = [[],[],[],[]]
        loglogAICs, linlogAICs, linearAICs, loglinAICs = [[],[],[],[]]
        loglogBICs, linlogBICs, linearBICs, loglinBICs = [[],[],[],[]]

        its = 10
        for n in range(its):

            Nlist, Slist, Evarlist, ESimplist, klist, radDATA, BPlist, NmaxList, rareSkews, KindList, StdList = [[], [], [], [], [], [], [], [], [], [], []]

            radDATA = []
            for dataset in datasets:

                name, kind, numlines = dataset
                lines = []
                if name == 'EMPclosed' or name == 'EMPopen':
                    lines = np.random.choice(range(1, numlines+1), 100, replace=True)
                elif kind == 'micro': lines = np.random.choice(range(1, numlines+1), 100, replace=True)
                else: lines = np.random.choice(range(1, numlines+1), 60, replace=True)

                #path = mydir+'data/'+kind+'/'+name+'/'+name+'-SADMetricData_NoMicrobe1s.txt'
                path = mydir+'data/'+kind+'/'+name+'/'+name+'-SADMetricData.txt'

                for line in lines:
                    data = linecache.getline(path, line)
                    radDATA.append(data)

            for data in radDATA:

                data = data.split()
                name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou, EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace, jknife1, jknife2, margalef, menhinick, preston_a, preston_S = data

                N = float(N)
                S = float(S)

                if S < 2 or N < 10: continue

                Nlist.append(float(np.log10(N)))
                Slist.append(float(np.log10(S)))

                ESimplist.append(float(np.log10(float(ESimp))))
                KindList.append(kind)

                BPlist.append(float(BP))
                NmaxList.append(float(np.log10(float(Nmax))))

                # log-modulo transformation of skewnness
                lms = np.log10(np.abs(float(skew)) + 1)
                if skew < 0: lms = lms * -1
                rareSkews.append(float(lms))


            if index == 0: metlist = list(rareSkews)
            elif index == 1: metlist = list(NmaxList)
            elif index == 2: metlist = list(ESimplist)
            elif index == 3: metlist = list(Slist)

            # Multiple regression
            d = pd.DataFrame({'N': list(Nlist)})
            d['y'] = list(metlist)
            d['Kind'] = list(KindList)
            loglog = smf.ols('y ~ N * Kind', d).fit()

            loglogR2s.append(loglog.rsquared)
            loglogAICs.append(loglog.aic)
            loglogBICs.append(loglog.bic)

            # Multiple regression
            xlist = 10**np.array(Nlist)
            d = pd.DataFrame({'N': list(xlist)})
            d['y'] = list(metlist)
            d['Kind'] = list(KindList)
            loglin = smf.ols('y ~ N * Kind', d).fit()

            loglinR2s.append(loglin.rsquared)
            loglinAICs.append(loglin.aic)
            loglinBICs.append(loglin.bic)

            # Multiple regression
            ylist = 10**np.array(metlist)
            d = pd.DataFrame({'N': list(Nlist)})
            d['y'] = list(ylist)
            d['Kind'] = list(KindList)
            linlog = smf.ols('y ~ N * Kind', d).fit()

            linlogR2s.append(linlog.rsquared)
            linlogAICs.append(linlog.aic)
            linlogBICs.append(linlog.bic)

            # Multiple regression
            ylist = 10**np.array(metlist)
            xlist = 10**np.array(Nlist)
            d = pd.DataFrame({'N': list(xlist)})
            d['y'] = list(ylist)
            d['Kind'] = list(KindList)
            linear = smf.ols('y ~ N * Kind', d).fit()

            linearR2s.append(linear.rsquared)
            linearAICs.append(linear.aic)
            linearBICs.append(linear.bic)


        st, data, ss2 = summary_table(linear, alpha=0.05)

        #fittedvalues = data[:,2]
        #predict_mean_se = data[:,3]
        predict_mean_ci_low, predict_mean_ci_upp = data[:,4:6].T
        predict_ci_low, predict_ci_upp = data[:,6:8].T


        avgloglogR2 = round(np.mean(loglogR2s),3)
        avglinlogR2 = round(np.mean(linlogR2s),3)
        avglinearR2 = round(np.mean(linearR2s),3)
        avgloglinR2 = round(np.mean(loglinR2s),3)

        avgloglogAIC = round(np.mean(loglogAICs),3)
        avglinlogAIC = round(np.mean(linlogAICs),3)
        avglinearAIC = round(np.mean(linearAICs),3)
        avgloglinAIC = round(np.mean(loglinAICs),3)

        avgloglogBIC = round(np.mean(loglogBICs),3)
        avglinlogBIC = round(np.mean(linlogBICs),3)
        avglinearBIC = round(np.mean(linearBICs),3)
        avgloglinBIC = round(np.mean(loglinBICs),3)


        print 'power-law:   ', avgloglogR2,'      ', avgloglogAIC,'      ', avgloglogBIC
        print>>OUT, 'averages from power-law', avgloglogR2,'      ',avgloglogAIC,'      ', avgloglogBIC

        print 'semilog:     ', avglinlogR2,'      ', avglinlogAIC,'      ', avglinlogBIC
        print>>OUT,'averages from semilog', avglinlogR2,'      ', avglinlogAIC,'      ', avglinlogBIC

        print 'exponential: ', avgloglinR2,'      ', avgloglinAIC,'      ', avgloglinBIC
        print>>OUT,'averages from exponential', avgloglinR2,'      ', avgloglinAIC,'      ', avgloglinBIC

        print 'linear:      ',  avglinearR2,'      ', avglinearAIC,'      ', avglinearBIC,'\n'
        print>>OUT,'averages from linear',  avglinearR2,'      ', avglinearAIC,'      ', avglinearBIC,'\n'

    OUT.close()
    return
def Fig1(condition, ones, sampling):

    tail = str()
    if ones is False:
        tail = '-SADMetricData_NoMicrobe1s.txt'
    elif ones is True:
        tail = '-SADMetricData.txt'

    datasets = []
    GoodNames = []
    emp = str()

    if condition == 'open': emp = 'EMPopen'
    elif condition == 'closed': emp = 'EMPclosed'

    #GoodNames = [emp, 'HMP', 'BBS', 'CBC', 'MCDB', 'GENTRY', 'FIA']
    #GoodNames = [emp, 'BBS', 'CBC', 'MCDB', 'GENTRY', 'FIA'] # all microbe data is emp
    #GoodNames = ['HMP', 'BBS', 'CBC', 'MCDB', 'GENTRY', 'FIA'] # all microbe data is HMP
    GoodNames = [emp, 'HMP', 'BIGN', 'TARA', 'BOVINE', 'HUMAN', 'LAUB', 'SED', 'CHU', 'CHINA', 'CATLIN', 'FUNGI', 'HYDRO', 'BBS', 'CBC', 'MCDB', 'GENTRY', 'FIA'] # all microbe data is MGRAST

    for name in os.listdir(mydir +'data/micro'):
        if name in GoodNames: pass
        else: continue

        path = mydir+'data/micro/'+name+'/'+name+tail
        num_lines = sum(1 for line in open(path))
        datasets.append([name, 'micro', num_lines])
        print name, num_lines


    for name in os.listdir(mydir +'data/macro'):
        if name in GoodNames: pass
        else: continue

        path = mydir+'data/macro/'+name+'/'+name+'-SADMetricData.txt'
        num_lines = sum(1 for line in open(path))
        datasets.append([name, 'macro', num_lines])
        print name, num_lines


    metrics = ['Rarity, '+r'$log_{10}$',
            'Dominance, '+r'$log_{10}$',
            'Evenness, ' +r'$log_{10}$',
            'Richness, ' +r'$log_{10}$',] #+r'$(S)^{2}$']

    fig = plt.figure()
    for index, i in enumerate(metrics):

        metric = i
        fig.add_subplot(2, 2, index+1)
        fs = 12 # font size used across figures

        MicIntList, MicCoefList, MacIntList, MacCoefList, R2List, metlist = [[], [], [], [], [], []]
        Nlist, Slist, Evarlist, ESimplist, klist, radDATA, BPlist, NmaxList, rareSkews, KindList, StdList = [[], [], [], [], [], [], [], [], [], [], []]
        #name, kind, N, S, Evar, ESimp, EQ, O, ENee, EPielou, EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace, jknife1, jknife2, margalef, menhinick, preston_a, preston_S = [[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]]

        its = 10

        for n in range(its):

            #name, kind, N, S, Evar, ESimp, EQ, O, ENee, EPielou, EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace, jknife1, jknife2, margalef, menhinick, preston_a, preston_S = [[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]]
            Nlist, Slist, Evarlist, ESimplist, klist, radDATA, BPlist, NmaxList, rareSkews, KindList, StdList = [[], [], [], [], [], [], [], [], [], [], []]

            numMac = 0
            numMic = 0
            radDATA = []

            for dataset in datasets:

                name, kind, numlines = dataset
                lines = []
                small = ['BIGN', 'BOVINE', 'CHU', 'LAUB', 'SED']
                big = ['HUMAN', 'CHINA', 'CATLIN', 'FUNGI', 'HYDRO']

                if kind == 'macro':
                    lines = np.random.choice(range(1, numlines+1), 100, replace=True)

                elif name in small:
                    lines = np.random.choice(range(1, numlines+1), 20, replace=True)

                elif name in big:
                    lines = np.random.choice(range(1, numlines+1), 50, replace=True)

                elif name == 'TARA':
                    lines = np.random.choice(range(1, numlines+1), 50, replace=True)
                else:
                    lines = np.random.choice(range(1, numlines+1), 50, replace=True)

                if kind == 'micro':
                    path = mydir+'data/'+kind+'/'+name+'/'+name+tail

                else:
                    path = mydir+'data/macro/'+name+'/'+name+'-SADMetricData.txt'

                for line in lines:
                    data = linecache.getline(path, line)
                    radDATA.append(data)

            for data in radDATA:

                data = data.split()
                name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou, EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace, jknife1, jknife2, margalef, menhinick, preston_a, preston_S = data

                N = float(N)
                S = float(S)

                Nlist.append(float(np.log10(N)))
                Slist.append(float(np.log10(S)))

                ESimplist.append(float(np.log10(float(ESimp))))
                KindList.append(kind)

                BPlist.append(float(BP))
                NmaxList.append(float(np.log10(float(Nmax))))

                # log-modulo transformation of skewnness
                lms = np.log10(np.abs(float(skew)) + 1)
                if skew < 0: lms = lms * -1
                rareSkews.append(float(lms))

                if kind == 'micro':
                    numMic += 1
                    klist.append('b')
                if kind == 'macro':
                    klist.append('r')
                    numMac += 1

            if index == 0: metlist = list(rareSkews)
            elif index == 1: metlist = list(NmaxList)
            elif index == 2: metlist = list(ESimplist)
            elif index == 3: metlist = list(Slist)

            # Multiple regression
            d = pd.DataFrame({'N': list(Nlist)})
            d['y'] = list(metlist)
            d['Kind'] = list(KindList)

            f = smf.ols('y ~ N * Kind', d).fit()
            #f = smf.rlm('y ~ N * Kind', d).fit()
            #r2 = smf.wls('y ~ N * Kind', d, weights= f.weights).fit().rsquared
            r2 = f.rsquared

            MacIntList.append(f.params[0])
            MacCoefList.append(f.params[2])

            if f.pvalues[1] < 0.05:
                MicIntList.append(f.params[1] + f.params[0])
            else:
                MicIntList.append(f.params[0])

            if f.pvalues[3] < 0.05:
                MicCoefList.append(f.params[3] + f.params[2])
            else:
                MicCoefList.append(f.params[2])

            R2List.append(r2)


        MacPIx, MacFitted, MicPIx, MicFitted = [[],[],[],[]]
        macCiH, macCiL, micCiH, micCiL = [[],[],[],[]]

        MacListX = []
        MacListY = []
        MicListX = []
        MicListY = []

        for j, k in enumerate(KindList):
            if k == 'micro':
                MicListX.append(Nlist[j])
                MicListY.append(metlist[j])

            elif k == 'macro':
                MacListX.append(Nlist[j])
                MacListY.append(metlist[j])

        print metric

        ols = smf.ols('y ~ N * Kind', d).fit()

        #rlm = smf.rlm('y ~ N * Kind', d).fit()
        #wls = smf.wls('y ~ N * Kind', d, weights= rlm.weights).fit()
        #r2 = wls.rsquared
        r2 = ols.rsquared

        st, data, ss2 = summary_table(ols, alpha=0.05)
        # ss2: Obs, Dep Var Population, Predicted Value, Std Error Mean Predict,
        # Mean ci 95% low, Mean ci 95% upp, Predict ci 95% low, Predict ci 95% upp,
        # Residual, Std Error Residual, Student Residual, Cook's D

        #fittedvalues = data[:,2]
        #predict_mean_se = data[:,3]
        predict_mean_ci_low, predict_mean_ci_upp = data[:,4:6].T
        predict_ci_low, predict_ci_upp = data[:,6:8].T

        for j, kval in enumerate(KindList):
            if kval == 'macro':

                macCiH.append(predict_mean_ci_upp[j])
                macCiL.append(predict_mean_ci_low[j])
                MacPIx.append(Nlist[j])
                MacFitted.append(ols.fittedvalues[j])

            elif kval == 'micro':

                micCiH.append(predict_mean_ci_upp[j])
                micCiL.append(predict_mean_ci_low[j])
                MicPIx.append(Nlist[j])
                MicFitted.append(ols.fittedvalues[j])

        MicPIx, MicFitted, micCiH, micCiL = zip(*sorted(zip(MicPIx, MicFitted, micCiH, micCiL)))
        MacPIx, MacFitted, macCiH, macCiL = zip(*sorted(zip(MacPIx, MacFitted, macCiH, macCiL)))

        num = min(len(MacListX), len(MicListX))
        micnums = np.random.choice(range(0, len(MicListX)), num, replace=False)
        macnums = np.random.choice(range(0, len(MacListX)), num, replace=False)


        for i, ind in enumerate(micnums):
            plt.scatter(MacListX[macnums[i]], MacListY[macnums[i]], color = 'LightCoral', alpha= 1 , s = 8, linewidths=0.5, edgecolor='Crimson')
            plt.scatter(MicListX[ind], MicListY[ind], color = 'SkyBlue', alpha= 1 , s = 8, linewidths=0.5, edgecolor='Steelblue')

        plt.fill_between(MacPIx, macCiL, macCiH, color='LightCoral', lw=0.0, alpha=0.9)
        plt.plot(MacPIx, MacFitted,  color='r', ls='--', lw=0.5, alpha=0.9)
        plt.fill_between(MicPIx, micCiL, micCiH, color='b', lw=0.0, alpha=0.3)
        plt.plot(MicPIx, MicFitted,  color='b', ls='--', lw=0.5, alpha=0.9)

        MicInt = round(np.mean(MicIntList), 2)
        MicCoef = round(np.mean(MicCoefList), 2)
        MacInt = round(np.mean(MacIntList), 2)
        MacCoef = round(np.mean(MacCoefList), 2)
        R2 = round(np.mean(R2List), 2)

        if index == 0:
            plt.ylim(-0.1, 2.5)
            plt.xlim(0, 8.2)

            plt.text(0.35, 2.1, r'$micro$'+ ' = '+str(round(10**MicInt,2))+'*'+r'$N$'+'$^{'+str(round(MicCoef,2))+'}$', fontsize=fs, color='Steelblue')
            plt.text(0.35, 1.8, r'$macro$'+ ' = '+str(round(10**MacInt,2))+'*'+r'$N$'+'$^{'+str(round(MacCoef,2))+'}$', fontsize=fs, color='Crimson')
            plt.text(0.35, 1.4,  r'$R^2$' + '=' +str(R2), fontsize=fs-1, color='k')

            plt.scatter([0],[-1], color = 'SkyBlue', alpha = 1, s=15, linewidths=0.9, edgecolor='Steelblue', label= 'microbes (n='+str(len(MicListY))+')')
            plt.scatter([0],[-1], color = 'LightCoral',alpha= 1, s=15, linewidths=0.9, edgecolor='Crimson', label= 'macrobes (n='+str(len(MacListY))+')')
            plt.legend(bbox_to_anchor=(-0.04, 1.05, 2.48, .2), loc=10, ncol=2, mode="expand",prop={'size':fs})

        elif index == 1:

            plt.plot([0,8.2],[0,8.2], ls = '--', lw=1, c='0.7')
            plt.ylim(0, 8)
            plt.xlim(0, 8.2)

            plt.text(0.35, 6.7, r'$micro$'+ ' = '+str(round(10**MicInt,2))+'*'+r'$N$'+'$^{'+str(round(MicCoef,2))+'}$', fontsize=fs, color='Steelblue')
            plt.text(0.35, 5.7, r'$macro$'+ ' = '+str(round(10**MacInt,2))+'*'+r'$N$'+'$^{'+str(round(MacCoef,2))+'}$', fontsize=fs, color='Crimson')
            plt.text(0.35, 4.7,  r'$R^2$' + '=' +str(R2), fontsize=fs-1, color='k')

        elif index == 2:
            plt.ylim(-3.5, 0.0)
            plt.xlim(0, 8.2)

            plt.text(0.35, -2.9, r'$micro$'+ ' = '+str(round(10**MicInt,2))+'*'+r'$N$'+'$^{'+str(round(MicCoef,2))+'}$', fontsize=fs, color='Steelblue')
            plt.text(0.35, -3.3, r'$macro$'+ ' = '+str(round(10**MacInt,2))+'*'+r'$N$'+'$^{'+str(round(MacCoef,2))+'}$', fontsize=fs, color='Crimson')
            plt.text(0.35, -2.5,  r'$R^2$' + '=' +str(R2), fontsize=fs-1, color='k')

        elif index == 3:
            plt.ylim(0.9, 5.0)
            plt.xlim(0, 8.2)

            plt.text(0.35, 4.5, r'$micro$'+ ' = '+str(round(2**MicInt,2))+'*'+r'$N$'+'$^{'+str(round(MicCoef,2))+'}$', fontsize=fs, color='Steelblue')
            plt.text(0.35, 4.0, r'$macro$'+ ' = '+str(round(2**MacInt,2))+'*'+r'$N$'+'$^{'+str(round(MacCoef,2))+'}$', fontsize=fs, color='Crimson')
            plt.text(0.35, 3.5,  r'$R^2$' + '=' +str(R2), fontsize=fs-1, color='k')
            print condition, ones, ': S =', '%.3e' % (10**(MicInt + MicCoef*(30.0)))
            #print R2


        plt.xlabel('$log$'+r'$_{10}$'+'($N$)', fontsize=fs)
        plt.ylabel(metric, fontsize=fs)
        plt.tick_params(axis='both', which='major', labelsize=fs-3)

    plt.subplots_adjust(wspace=0.4, hspace=0.4)

    if ones == False:
        plt.savefig(mydir+'/figs/Fig1/Locey_Lennon_2015_Fig1-'+condition+'_NoSingletons_'+str(sampling)+'.pdf', dpi=300, bbox_inches = "tight")
    if ones == True:
        plt.savefig(mydir+'/figs/Fig1/Locey_Lennon_2015_Fig1-'+condition+'_'+str(sampling)+'.pdf', dpi=300, bbox_inches = "tight")

    #plt.show()
    plt.close()

    return
Example #37
0
    if index == 0: metlist = list(Rs)
    elif index == 1: metlist = list(Nmaxs)
    elif index == 2: metlist = list(Evs)
    elif index == 3: metlist = list(Ss)

    print len(Ns), len(metlist)

    d = pd.DataFrame({'N': list(Ns)})
    d['y'] = list(metlist)
    f = smf.ols('y ~ N', d).fit()

    r2 = round(f.rsquared, 2)
    Int = f.params[0]
    Coef = f.params[1]

    st, data, ss2 = summary_table(f, alpha=0.05)
    # ss2: Obs, Dep Var Population, Predicted Value, Std Error Mean Predict,
    # Mean ci 95% low, Mean ci 95% upp, Predict ci 95% low, Predict ci 95% upp,
    # Residual, Std Error Residual, Student Residual, Cook's D

    fitted = data[:, 2]
    #predict_mean_se = data[:,3]
    mean_ci_low, mean_ci_upp = data[:, 4:6].T
    ci_low, ci_upp = data[:, 6:8].T
    ci_Ns = data[:, 0]

    Ns, metlist, fitted, ci_low, ci_upp = zip(
        *sorted(zip(Ns, metlist, fitted, ci_low, ci_upp)))

    plt.scatter(Ns,
                metlist,
Example #38
0
# Script fragment: first panel of a 2x2 figure showing N against tau with a
# quadratic OLS trend. `d`, `tau`, `N` and `colors` must already be defined.
#### plot figure ###############################################################
xlab = r"$log_{10}$" + "(" + r"$\tau$" + ")"
fs = 8  # fontsize
fig = plt.figure()

#### N vs. Tau #################################################################
fig.add_subplot(2, 2, 1)

# Second-order polynomial regression of N on tau.
f2 = smf.ols("N ~ tau + I(tau ** 2.0)", d).fit()
print f2.summary()

# Coefficients and p-values in model order (three terms are expected).
a, b, c = f2.params
p1, p2, p3 = f2.pvalues
r2 = round(f2.rsquared, 2)

# Columns read from `data`: 2 = predicted value, 3 = std error of the mean
# prediction, 4:6 = mean CI low/upp, 6:8 = prediction CI low/upp.
st, data, ss2 = summary_table(f2, alpha=0.05)
fitted = data[:, 2]
pred_mean_se = data[:, 3]
pred_mean_ci_low, pred_mean_ci_upp = data[:, 4:6].T
pred_ci_low, pred_ci_upp = data[:, 6:8].T

# Re-order every series by ascending tau so the trend line is drawn left to
# right; the sorted tau values are kept separately as tau2.
tau2, fitted, pred_ci_low, pred_ci_upp, pred_mean_ci_low, pred_mean_ci_upp = zip(
    *sorted(zip(tau, fitted, pred_ci_low, pred_ci_upp, pred_mean_ci_low, pred_mean_ci_upp))
)

# Raw points (original order) plus the fitted curve (sorted order).
plt.scatter(tau, N, color=colors, s=10, linewidths=0.1, edgecolor="k")
# plt.fill_between(tau2, pred_ci_low, pred_ci_upp, color='r', lw=0.0, alpha=0.1)
# plt.fill_between(tau2, pred_mean_ci_low, pred_mean_ci_upp, color='r', lw=0.0, alpha=0.3)
plt.plot(tau2, fitted, color="r", ls="--", lw=1.5, alpha=0.9)

plt.ylabel(r"$log_{10}$" + "(" + r"$N$" + ")", fontsize=fs + 6)
def figplot(x, y, xlab, ylab, fig, n):
    '''Draw panel ``n`` of a 2x2 figure: scatter of y on x plus an OLS line.

    The panel is annotated with a power-law equation whose coefficient and
    exponent come from ``stats.linregress``, followed by two fixed reference
    equations for microbes and macrobes.  Panels 1-4 each carry their own
    symbol (R, D, E, S), text position and y-limits; any other ``n`` gets no
    annotation and no explicit y-limits.  Returns ``fig``.
    '''

    fig.add_subplot(2, 2, n)

    xs = list(x)
    ys = list(y)

    frame = pd.DataFrame({'x': list(xs)})
    frame['y'] = list(ys)
    fit = smf.ols('y ~ x', frame).fit()

    slope, intercept, r, p, std_err = stats.linregress(xs, ys)

    st, table, ss2 = summary_table(fit, alpha=0.05)
    fitted = table[:, 2]
    mean_ci_low, mean_ci_upp = table[:, 4:6].T
    ci_low, ci_upp = table[:, 6:8].T

    # Sort all series together by x so the trend line runs left to right.
    ordered = sorted(zip(xs, ys, fitted, ci_low, ci_upp))
    xs, ys, fitted, ci_low, ci_upp = zip(*ordered)

    # Per-panel settings: symbol, microbe equation, macrobe equation,
    # y position of the annotation, y-axis limits.
    panels = {
        1: ('R', '2.34*$N$$^{0.14}$', '1.7*$N$$^{0.11}$', 0.8, (0.0, 1.1)),
        2: ('D', '0.44*$N$$^{0.92}$', '0.23*$N$$^{0.99}$', 3.0, (0.0, 4.2)),
        3: ('E', '0.58*$N$$^{-0.23}$', '1.15*$N$$^{-0.21}$', -1.7, (-1.8, 0.05)),
        4: ('S', '1.77*$N$$^{0.38}$', '1.77*$N$$^{0.24}$', 1.9, (0.4, 2.5)),
    }
    spec = panels.get(n)

    if spec is not None:
        symbol, microbes, macrobes, text_y, y_limits = spec
        coef = str(round(10**intercept, 2))
        expo = str(round(slope, 2))
        lab = '$' + symbol + '_{models}$ = ' + coef + '*$N$$^{' + expo + '}$\n'
        lab += '$' + symbol + '_{microbes}$ = ' + microbes + '\n'
        lab += '$' + symbol + '_{macrobes}$ = ' + macrobes
        plt.text(0.2, text_y, lab, fontsize=7)

    #plt.hexbin(x2, y2, mincnt=1, gridsize = 40, bins='log', cmap=plt.cm.jet)
    point_style = dict(color='SkyBlue', alpha=1, s=12,
                       linewidths=0.5, edgecolor='Steelblue')
    plt.scatter(xs, ys, **point_style)

    if n == 3:
        plt.legend(loc='best', fontsize=6, frameon=False)

    plt.plot(xs, fitted, color='k', ls='--', lw=1.0, alpha=0.9)
    plt.xlabel(xlab, fontsize=8)
    plt.ylabel(ylab, fontsize=8)
    plt.tick_params(axis='both', labelsize=5)
    plt.xlim(0, 1.05 * max(xs))

    if spec is not None:
        plt.ylim(*y_limits)

    return fig
def Fig1():

    datasets = []
    GoodNames = ['MGRAST', 'HMP', 'EMPclosed', 'BBS', 'CBC', 'MCDB', 'GENTRY', 'FIA']

    for name in os.listdir(mydir +'data/micro'):
        if name in GoodNames: pass
        else: continue

        #if name in BadNames: continue
        #else: pass

        #path = mydir2+'data/micro/'+name+'/'+name+'-SADMetricData_NoMicrobe1s.txt'
        path = mydir2+'data/micro/'+name+'/'+name+'-SADMetricData.txt'

        num_lines = sum(1 for line in open(path))
        datasets.append([name, 'micro', num_lines])
        print name, num_lines

    for name in os.listdir(mydir2 +'data/macro'):
        if name in GoodNames: pass
        else: continue

        #if name in BadNames: continue
        #else: pass

        #path = mydir2+'data/macro/'+name+'/'+name+'-SADMetricData_NoMicrobe1s.txt'
        path = mydir2+'data/macro/'+name+'/'+name+'-SADMetricData.txt'

        num_lines = sum(1 for line in open(path))
        datasets.append([name, 'macro', num_lines])
        print name, num_lines


    metrics = ['Rarity, '+r'$log_{10}$',
            'Dominance, '+r'$log_{10}$',
            'Evenness, ' +r'$log_{10}$',
            'Richness, ' +r'$log_{10}$']

    fig = plt.figure()
    for index, i in enumerate(metrics):

        metric = i
        fig.add_subplot(2, 2, index+1)
        fs = 10 # font size used across figures

        MicIntList, MicCoefList, MacIntList, MacCoefList, R2List, metlist = [[], [], [], [], [], []]
        Nlist, Slist, Evarlist, ESimplist, klist, radDATA, BPlist, NmaxList, rareSkews, KindList, StdList = [[], [], [], [], [], [], [], [], [], [], []]
        #name, kind, N, S, Evar, ESimp, EQ, O, ENee, EPielou, EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace, jknife1, jknife2, margalef, menhinick, preston_a, preston_S = [[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]]

        its = 1
        for n in range(its):

            #name, kind, N, S, Evar, ESimp, EQ, O, ENee, EPielou, EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace, jknife1, jknife2, margalef, menhinick, preston_a, preston_S = [[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]]
            Nlist, Slist, Evarlist, ESimplist, klist, radDATA, BPlist, NmaxList, rareSkews, KindList, StdList = [[], [], [], [], [], [], [], [], [], [], []]

            numMac = 0
            numMic = 0
            radDATA = []

            for dataset in datasets:

                name, kind, numlines = dataset
                lines = []
                if name == 'EMPclosed' or name == 'EMPopen':
                    lines = np.random.choice(range(1, numlines+1), 1000, replace=True) # 166
                elif kind == 'micro': lines = np.random.choice(range(1, numlines+1), 1000, replace=True) #167
                else: lines = np.random.choice(range(1, numlines+1), 600, replace=True) # 100

                #path = mydir2+'data/'+kind+'/'+name+'/'+name+'-SADMetricData_NoMicrobe1s.txt'
                path = mydir2+'data/'+kind+'/'+name+'/'+name+'-SADMetricData.txt'

                for line in lines:
                    data = linecache.getline(path, line)
                    radDATA.append(data)

            for data in radDATA:

                data = data.split()
                name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou, EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace, jknife1, jknife2, margalef, menhinick, preston_a, preston_S = data

                N = float(N)
                S = float(S)

                if S < 10 or N < 11: continue

                Nlist.append(float(np.log10(N)))
                Slist.append(float(np.log10(S)))

                ESimplist.append(float(np.log10(float(ESimp))))

                kind = np.random.choice(['micro', 'macro'])
                KindList.append(kind)

                BPlist.append(float(BP))
                NmaxList.append(float(np.log10(float(Nmax))))

                # log-modulo transformation of skewnness
                lms = np.log10(np.abs(float(skew)) + 1)
                if skew < 0: lms = lms * -1
                rareSkews.append(float(lms))



                if kind == 'micro':
                    numMic += 1
                    klist.append('b')
                if kind == 'macro':
                    klist.append('r')
                    numMac += 1

            if index == 0: metlist = list(rareSkews)
            elif index == 1: metlist = list(NmaxList)
            elif index == 2: metlist = list(ESimplist)
            elif index == 3: metlist = list(Slist)

            # Multiple regression
            d = pd.DataFrame({'N': list(Nlist)})
            d['y'] = list(metlist)
            d['Kind'] = list(KindList)
            f = smf.ols('y ~ N * Kind', d).fit()


            MacIntList.append(f.params[0])
            MacCoefList.append(f.params[2])

            if f.pvalues[1] < 0.05:
                MicIntList.append(f.params[1] + f.params[0])
            else:
                MicIntList.append(f.params[0])

            if f.pvalues[3] < 0.05:
                MicCoefList.append(f.params[3] + f.params[2])
            else:
                MicCoefList.append(f.params[2])

            R2List.append(f.rsquared)


        MacPIx, MacFitted, MicPIx, MicFitted = [[],[],[],[]]
        macCiH, macCiL, micCiH, micCiL = [[],[],[],[]]

        MacListX = []
        MacListY = []
        MicListX = []
        MicListY = []

        for j, k in enumerate(KindList):
            if k == 'micro':
                MicListX.append(Nlist[j])
                MicListY.append(metlist[j])

            elif k == 'macro':
                MacListX.append(Nlist[j])
                MacListY.append(metlist[j])

        print metric
        lm = smf.ols('y ~ N * Kind', d).fit()
        print lm.summary()
        print '\n\n'

        st, data, ss2 = summary_table(lm, alpha=0.05)
        # ss2: Obs, Dep Var Population, Predicted Value, Std Error Mean Predict,
        # Mean ci 95% low, Mean ci 95% upp, Predict ci 95% low, Predict ci 95% upp,
        # Residual, Std Error Residual, Student Residual, Cook's D

        fittedvalues = data[:,2]
        predict_mean_se = data[:,3]
        predict_mean_ci_low, predict_mean_ci_upp = data[:,4:6].T
        predict_ci_low, predict_ci_upp = data[:,6:8].T


        for j, kval in enumerate(KindList):
            if kval == 'macro':

                macCiH.append(predict_mean_ci_upp[j])
                macCiL.append(predict_mean_ci_low[j])
                MacPIx.append(Nlist[j])
                MacFitted.append(f.fittedvalues[j])

            elif kval == 'micro':

                micCiH.append(predict_mean_ci_upp[j])
                micCiL.append(predict_mean_ci_low[j])
                MicPIx.append(Nlist[j])
                MicFitted.append(f.fittedvalues[j])

        MicPIx, MicFitted, micCiH, micCiL = zip(*sorted(zip(MicPIx, MicFitted, micCiH, micCiL)))
        MacPIx, MacFitted, macCiH, macCiL = zip(*sorted(zip(MacPIx, MacFitted, macCiH, macCiL)))


        num = min(len(MacListX), len(MicListX))
        for i in range(num):
            plt.scatter(MacListX[i], MacListY[i], color = '0.4', alpha= 1 , s = 4, linewidths=0.5, edgecolor='0.3')
            plt.scatter(MicListX[i], MicListY[i], color = '0.4', alpha= 1 , s = 4, linewidths=0.5, edgecolor='0.3')

        plt.fill_between(MacPIx, macCiL, macCiH, color='lime', lw=0.0, alpha=0.3)
        plt.plot(MacPIx, MacFitted,  color='lime', ls='--', lw=0.5, alpha=0.8)

        MacInt = round(np.mean(MacIntList), 2)
        MacCoef = round(np.mean(MacCoefList), 2)
        R2 = round(np.mean(R2List), 2)

        if index == 0:
            plt.ylim(-0.1, 2.0)
            plt.xlim(1, 7)
            plt.text(1.35, 1.5, r'$Rarity$'+ ' = '+str(round(MacInt,2))+'*'+r'$N$'+'$^{'+str(round(MacCoef,2))+'}$', fontsize=fs, color='k')
            plt.text(1.35, 1.2,  r'$R^2$' + '=' +str(R2), fontsize=fs-1, color='0.3')

            plt.scatter([0],[-1], color = 'SkyBlue', alpha = 1, s=10, linewidths=0.9, edgecolor='Steelblue', label= 'microbes (n='+str(len(MicListY))+')')
            plt.scatter([0],[-1], color = 'LightCoral',alpha= 1, s=10, linewidths=0.9, edgecolor='Crimson', label= 'macrobes (n='+str(len(MacListY))+')')
            plt.legend(bbox_to_anchor=(-0.04, 1.1, 2.48, .2), loc=10, ncol=2, mode="expand",prop={'size':fs+2})

        elif index == 1:

            plt.plot([0,7],[0,7], ls = '--', lw=1, c='0.7')
            #ax.text(18, 21, '1:1 line', fontsize=fs*1.0, rotation=40, color='0.7')
            plt.ylim(0, 6)
            plt.xlim(1, 7)

            plt.text(1.35, 4.5, r'$Dominance$'+ ' = '+str(round(MacInt,2))+'*'+r'$N$'+'$^{'+str(round(MacCoef,2))+'}$', fontsize=fs, color='k')
            plt.text(1.35, 3.75,  r'$R^2$' + '=' +str(R2), fontsize=fs-1, color='0.3')

        elif index == 2:
            plt.ylim(-3.0, 0.0)
            plt.xlim(0, 7)

            plt.text(0.35, -2.5, r'$Evenness$'+ ' = '+str(round(MacInt,2))+'*'+r'$N$'+'$^{'+str(round(MacCoef,2))+'}$', fontsize=fs, color='k')
            plt.text(0.35, -2.2,  r'$R^2$' + '=' +str(R2), fontsize=fs-1, color='0.3')

        elif index == 3:
            plt.ylim(0.9, 4.5)
            plt.xlim(1, 7)

            plt.text(1.35, 3.5, r'$Richness$'+ ' = '+str(round(MacInt,2))+'*'+r'$N$'+'$^{'+str(round(MacCoef,2))+'}$', fontsize=fs, color='k')
            plt.text(1.35, 3.0,  r'$R^2$' + '=' +str(R2), fontsize=fs-1, color='0.3')


        plt.xlabel('Number of reads or individuals, '+ '$log$'+r'$_{10}$', fontsize=fs)
        plt.ylabel(metric, fontsize=fs)
        plt.tick_params(axis='both', which='major', labelsize=fs-3)

    plt.subplots_adjust(wspace=0.4, hspace=0.4)
    #plt.savefig(mydir+'/figs/appendix/Fig1/RandomAssign/Locey_Lennon_2015_Pooled-OpenReference_NoSingletons.png', dpi=600, bbox_inches = "tight")
    #plt.savefig(mydir+'/figs/appendix/Fig1/RandomAssign/Locey_Lennon_2015_Pooled-ClosedReference_NoSingletons.png', dpi=600, bbox_inches = "tight")
    #plt.savefig(mydir+'/figs/appendix/Fig1/RandomAssign/Locey_Lennon_2015_Pooled-OpenReference.png', dpi=600, bbox_inches = "tight")
    plt.savefig(mydir+'/figs/appendix/Fig1/RandomAssign/Locey_Lennon_2015_Pooled-ClosedReference.png', dpi=600, bbox_inches = "tight")

    #plt.show()
    #plt.close()

    return
Example #41
0
def d(re, alpha):
    """Call ``summary_table(re, alpha)`` and hand back its three results.

    ``re`` and ``alpha`` are forwarded positionally and unchanged. The call
    must produce exactly three values; they are returned as a 3-tuple in the
    same order.
    """
    table, values, extra = summary_table(re, alpha)
    return table, values, extra
def _fig3_site_predictions(Nrange, sb, sz, db, dz, guess, yrange, label=None, lead=''):
    """Run ``getS`` twice for one system shown in Fig3 and average the output.

    ``getS`` is called first with ``predictNmax=False`` and then with
    ``predictNmax=True``; every other argument is forwarded unchanged. Each
    call returns four sequences (lognormal S, scaling-law S, Nmax, N), of
    which only the means of the S and N sequences are used here.

    When ``label`` is given, the three prediction lines for that system are
    printed (values are printed as 10**mean); ``lead`` is prepended to the
    first of those lines.

    Returns ``(S_est, S_pred, N)``: the mean lognormal S of the first call,
    and the mean lognormal S and mean N of the second call (both as floats).
    """
    # The call order (False, then True) is kept as in the original code in
    # case getS consumes random numbers.
    Slist_ln, Slist_SvN, _, _ = getS(Nrange, sb, sz, db, dz, guess, yrange, predictNmax=False)
    S_est = np.mean(Slist_ln)

    if label is not None:
        print('%sscaling law prediction of S for %s: %.3e' % (lead, label, 10**np.mean(Slist_SvN)))
        print('lognormal prediction of S for %s, using estimated Nmax: %.3e' % (label, 10**S_est))

    Slist_ln, _, _, Nlist = getS(Nrange, sb, sz, db, dz, guess, yrange, predictNmax=True)
    S_pred = float(np.mean(Slist_ln))

    if label is not None:
        print('lognormal prediction of S for %s, using predicted Nmax: %.3e' % (label, 10**S_pred))

    return S_est, S_pred, float(np.mean(Nlist))


def Fig3(condition, ones, sampling):
    """ A figure demonstrating a strong richness relationship across 10 or 11
    orders of magnitude in total abundance. Taxonomic richness of a sample
    scales in a log-log fashion with the total abundance of the sample.

    Parameters:
        condition: 'open' or 'closed'; selects the 'EMPopen' or 'EMPclosed'
            dataset. Any other value selects neither.
        ones: True reads '<name>-SADMetricData.txt', False reads
            '<name>-SADMetricData_NoMicrobe1s.txt'. The file suffix is chosen
            with identity checks, so only the bool objects are recognised.
        sampling: kept for interface compatibility. The original code chose
            100 resampling iterations on both sides of its ``sampling``
            test, so the value has no effect.

    Side effects:
        Prints the averaged regression coefficients and the S predictions,
        draws on a new matplotlib figure and, when ``ones`` equals True or
        False, saves it to mydir + '/figs/Fig3/figure3.pdf'.

    Raises:
        ValueError: if no directory under mydir + 'data/micro' matches the
            expected dataset names, or a data row does not have 26 fields.
    """

    fs = 12 # font size used across figures
    metric = 'Richness, '+r'$log$'+r'$_{10}$'

    tail = str()
    if ones is False:
        tail = '-SADMetricData_NoMicrobe1s.txt'
    elif ones is True:
        tail = '-SADMetricData.txt'

    emp = str()
    if condition == 'open': emp = 'EMPopen'
    elif condition == 'closed': emp = 'EMPclosed'

    GoodNames = [emp, 'TARA', 'HMP', 'BIGN', 'BOVINE', 'CHU', 'LAUB', 'SED', 'HUMAN', 'CHINA', 'CATLIN', 'FUNGI']

    print('\n')

    # Collect [dataset name, number of lines in its metric file].
    datasets = []
    for name in os.listdir(mydir +'data/micro'):
        if name not in GoodNames: continue

        path = mydir+'data/micro/'+name+'/'+name+tail
        with open(path) as handle: # close the handle once the lines are counted
            numlines = sum(1 for _ in handle)
        datasets.append([name, numlines])

    if not datasets:
        # The original failed later with an opaque unpacking ValueError.
        raise ValueError('no datasets found under ' + mydir + 'data/micro')

    its = 100
    small_mgrast = ['BIGN', 'BOVINE', 'CHU', 'LAUB', 'SED']

    d_blist, d_zlist = [], [] # intercepts / slopes of Nmax vs. N
    s_blist, s_zlist = [], [] # intercepts / slopes of S vs. N

    for _ in range(its):

        Nlist, Slist, NmaxList = [], [], []

        for name, numlines in datasets:
            # Small MG-RAST datasets contribute 160 random rows (drawn with
            # replacement); every other dataset contributes 400.
            size = 160 if name in small_mgrast else 400
            lines = np.random.choice(range(1, numlines+1), size, replace=True)

            path = mydir+'data/micro/'+name+'/'+name+tail

            for line in lines:
                data = linecache.getline(path, line).split()
                if data == []: continue

                # Full unpack documents the 26-column layout and rejects
                # malformed rows; only N, S and Nmax are used below.
                (row_name, row_kind, N, S, Var, Evar, ESimp, EQ, O, ENee,
                 EPielou, EHeip, BP, SimpDom, Nmax, McN, skew, logskew,
                 chao1, ace, jknife1, jknife2, margalef, menhinick,
                 preston_a, preston_S) = data

                Nlist.append(float(np.log10(float(N))))
                Slist.append(float(np.log10(float(S))))
                NmaxList.append(float(np.log10(float(Nmax))))

        Nlist, Slist, NmaxList = [list(col) for col in zip(*sorted(zip(Nlist, Slist, NmaxList)))]

        # Regression for Dominance (Nmax) vs. N
        frame = pd.DataFrame({'N': Nlist})
        frame['Nmax'] = NmaxList
        fit_D = smf.ols('Nmax ~ N', frame).fit()
        d_blist.append(fit_D.params['Intercept'])
        d_zlist.append(fit_D.params['N'])

        # Regression for Richness (S) vs. N
        frame = pd.DataFrame({'N': Nlist})
        frame['S'] = Slist
        fit_S = smf.ols('S ~ N', frame).fit()
        s_blist.append(fit_S.params['Intercept'])
        s_zlist.append(fit_S.params['N'])

    # NOTE: the coefficients are averaged over all iterations, whereas R2,
    # fit_S, Nlist and Slist used below come from the last iteration only.
    R2 = fit_S.rsquared

    sb = np.mean(s_blist)
    sz = np.mean(s_zlist)

    db = np.mean(d_blist)
    dz = np.mean(d_zlist)

    print('Nmax = %s * N^ %s' % (round(10**db, 2), round(dz, 2)))
    print('S = %s * N^ %s \n' % (round(10**sb, 2), round(sz, 2)))

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    plt.text(2, 11.0, r'$S$'+ ' = '+str(round(10**sb, 1))+'*'+r'$N$'+'$^{'+str(round(sz, 2))+'},$' + r' $r^2$' + '=' +str(round(R2,2)), fontsize=fs+5, color='Crimson', alpha=0.9)

    # code for prediction intervals: extend the data with points predicted
    # along N = 5..32 and refit, so the interval band spans the whole axis
    X = np.linspace(5, 32, 100)
    Y = fit_S.predict(exog=dict(N=X))
    Nlist2 = Nlist + X.tolist()
    Slist2 = Slist + Y.tolist()

    frame = pd.DataFrame({'N': list(Nlist2)})
    frame['y'] = list(Slist2)
    fit_ext = smf.ols('y ~ N', frame).fit()

    st, data, ss2 = summary_table(fit_ext, alpha=0.05)
    # columns 6 and 7 are plotted as the lower / upper prediction bounds
    pred_ci_low, pred_ci_upp = data[:,6:8].T

    plt.fill_between(Nlist2, pred_ci_low, pred_ci_upp, color='r', lw=0.5, alpha=0.2)
    z = np.polyfit(Nlist2, Slist2, 1)
    p = np.poly1d(z)
    xp = np.linspace(0, 32, 1000)

    # NOTE: the coefficients quoted in these labels are hard-coded text, not
    # the values fitted above.
    label1 = 'Richness-abundance scaling relationship, $S$ = 7.6$N^{0.35}$'
    label2 = 'Predicted $S$ via lognormal, published $N$ and $N_{max}$'
    label3 = 'Predicted $S$ via lognormal, published $N$, $N_{max}$ = 0.4$N^{0.93}$'

    # The original passed both c='red' and color='Crimson'; they are aliases
    # of the same property, so only the one matching the rest of the figure
    # is kept.
    plt.plot(xp, p(xp), '--', lw=2, alpha=0.8, color='Crimson', label=label1)
    plt.scatter(Nlist, Slist, color = 'LightCoral', alpha= 1 , s = 10, linewidths=0.5, edgecolor='Crimson')

    # Adding in derived/inferred points

    GO = [3.6*(10**28), 10.1*(10**28)] # estimated open ocean bacteria; Whitman et al. 1998
    Pm = [2.8*(10**27), 3.0*(10**27)] # estimated Prochlorococcus; Flombaum et al. 2013
    Syn = [6.7*(10**26), 7.3*(10**26)] # estimated Synechococcus; Flombaum et al. 2013

    Earth = [9.2*(10**29), 31.7*(10**29)] # estimated bacteria on Earth; Kallmeyer et al. 2012
    SAR11 = [2.0*(10**28), 2.0*(10**28)] # estimated percent abundance of SAR11; Morris et al. (2002)

    HGx = [0.5*(10**14), 1.5*(10**14)] # estimated bacteria in Human gut; Berg (1996)
    HGy = [0.05*min(HGx), 0.15*max(HGx)] # estimated most abundant bacteria in Human gut; Turnbaugh et al. (2009), & Dethlefsen et al. (2008)

    COWx = [0.5*2.226*(10**15), 1.5*2.226*(10**15)] # estimated bacteria in Cow rumen; LOW:   HIGH: Whitman et al. (1998)
    COWy = [0.09*min(COWx), 0.15*max(COWx)] # estimated dominance in Cow rumen; Stevenson and Weimer (2006)

    ## PREDICTIONS OF S BASED ON THE EMPIRICAL S VS. N SCALING LAW, AND BASED
    ## ON THE LOGNORMAL PREDICTIVE FRAMEWORK OF CURTIS AND SLOAN USING
    ## 1.) THE ESTIMATED NMAX AND 2.) THE PREDICTED NMAX

    Ns = []
    Ss = []
    DomSs = []

    # Global Ocean estimates based on Whitman et al. (1998) and P. marinus (2012 paper)
    S_est, S2, N = _fig3_site_predictions(GO, sb, sz, db, dz, 0.1019, [min(Syn), max(Pm)], label='Global Ocean')
    S1 = float(S_est)

    # This system alone anchors its horizontal guide at the estimated-Nmax
    # value (S1); the other three use the predicted-Nmax value (S2).
    ax.text(13.5, S1*0.93, 'Global Ocean', fontsize=fs+2, color = 'k')
    ax.axhline(S1, 0, 0.91, ls = '--', c = '0.6')
    ax.text(N-1, S2*.80, 'Global ocean', fontsize=fs+2, color = 'k', rotation = 90)
    ax.axvline(N, 0, 0.65, ls = '--', c = '0.6')
    Ns.append(N)
    Ss.append(S_est)
    DomSs.append(S2)

    # Earth, i.e., Global estimates based on Kallmeyer et al. (2012) and SAR11 (2002 paper)
    S_est, S2, N = _fig3_site_predictions(Earth, sb, sz, db, dz, 0.1060, [min(Pm), max(SAR11)], label='Earth', lead='\n')

    ax.text(25, S2*1.025, 'Earth', fontsize=fs+2, color = 'k')
    ax.axhline(S2, 0, 0.95, ls = '--', c = '0.6')
    ax.text(N-1, 8, 'Earth', fontsize=fs+2, color = 'k', rotation = 90)
    ax.axvline(N, 0, 0.82, ls = '--', c = '0.6')
    Ns.append(N)
    Ss.append(S_est)
    DomSs.append(S2)

    # Human Gut
    S_est, S2, N = _fig3_site_predictions(HGx, sb, sz, db, dz, 0.1509, HGy)

    ax.text(3.5, S2*.9, 'Human Gut', fontsize=fs+2, color = 'k')
    ax.axhline(S2, 0, 0.41, ls = '--', c = '0.6')
    ax.text(N-1, 3.6, 'Human Gut', fontsize=fs+2, color = 'k', rotation = 90)
    ax.axvline(N, 0, 0.33, ls = '--', c = '0.6')
    Ns.append(N)
    Ss.append(S_est)
    DomSs.append(S2)

    # Cow Rumen
    S_est, S2, N = _fig3_site_predictions(COWx, sb, sz, db, dz, 0.1, COWy)

    ax.text(6, S2*1.04, 'Cow Rumen', fontsize=fs+2, color = 'k')
    ax.axhline(S2, 0, 0.46, ls = '--', c = '0.6')
    ax.text(N+0.3, 4.2, 'Cow Rumen', fontsize=fs+2, color = 'k', rotation = 90)
    ax.axvline(N, 0, 0.38, ls = '--', c = '0.6')
    Ns.append(N)
    Ss.append(S_est)
    DomSs.append(S2)

    plt.scatter(Ns, Ss, color = '0.4', alpha= 1, s = 50, linewidths=2, edgecolor='k', label=label2)
    plt.scatter(Ns, DomSs, color = 'SkyBlue', alpha= 1, s = 50, linewidths=2, edgecolor='Steelblue', label=label3)

    ax.text(5, -0.8, 'Number of reads or total abundance, '+ '$log$'+r'$_{10}$', fontsize=fs+4)
    ax.text(-2.1, 10, 'OTU '+ metric, fontsize=fs+4, rotation=90)
    plt.xlim(1, 31)
    plt.ylim(0.8, 14)

    plt.legend(bbox_to_anchor=(-0.015, 1.03, 1.03, .2), loc=10, ncol=1,
                                mode="expand",prop={'size':fs+2.2})

    # Both settings of `ones` wrote to the same file in the original; the
    # equality test (rather than identity) is kept from the original.
    if ones == False or ones == True:
        plt.savefig(mydir+'/figs/Fig3/figure3.pdf', dpi=300, bbox_inches = "tight")

    #plt.show()

    return
def plot_time_series(fname_template=None,
                     region_number="all",
                     dpi=150,
                     ax = None,
                     melt_index_or_extent="index",
                     extent_melt_days_threshold=2,
                     include_ylabel=True,
                     gap_filled=True,
                     include_trendline=False,
                     include_trendline_only_if_significant=True,
                     include_legend_if_significant=True,
                     include_name_in_title=True,
                     print_trendline_summary=True,
                     offset_years_by_one=True,
                     add_confidence_intervals=True,
                     add_prediction_intervals=True,
                     verbose=True):
    """Create a plot of the time series of melt.

    In fname_template, if you specify a {0} tag in the name, it will be filled
    in with the region number. This is useful if you want to use region_number="all",
    as that will create 8 plots for regions 0-7.

    Use 'ax' to provide an axis upon which to draw. This is useful for putting
    together a multi-part figure. Don't use this option if using "region_number="all",
    as that will draw multiple plots on the same axes.

    Parameters:
        fname_template: output file name (may contain "{0}"); if None the
            figure is shown instead of saved. Ignored when 'ax' is given.
        region_number: a single region number, or "all" for regions 0-7.
        dpi: resolution passed to savefig.
        ax: existing axes to draw on; when None a new figure is created
            (and closed again) for every region.
        melt_index_or_extent: "index" or "extent" (case and surrounding
            whitespace are ignored for the labels); also forwarded to
            get_time_series_data.
        extent_melt_days_threshold, gap_filled: forwarded to
            get_time_series_data.
        include_ylabel: set the y-axis label. When True the data are scaled
            by 1e6 if their maximum exceeds 1e6 and by 1e3 otherwise; when
            False they are always scaled by 1e6.
        include_trendline: always draw the linear trend.
        include_trendline_only_if_significant: draw the trend when the slope
            p-value is <= 0.05.
        include_legend_if_significant: add a legend whenever the trend is drawn.
        include_name_in_title: title the axes with the region name.
        print_trendline_summary: print the OLS summary for each region.
        offset_years_by_one: shift the years by +1 (see comment below).
        add_confidence_intervals, add_prediction_intervals: draw the 95%
            bands around the trend line.
        verbose: report each file written.

    Returns:
        The fitted OLS results of the last region processed.

    Raises:
        ValueError: if include_ylabel is set and melt_index_or_extent is
            neither "index" nor "extent".
    """
    if region_number == "all":
        region_nums = range(8)
    else:
        region_nums = [region_number]

    ax_provided = ax

    # Normalise once, so that the legend label and the y-axis label are
    # derived from the same value (the legend used to test the raw string).
    melt_index_or_extent_lower = melt_index_or_extent.strip().lower()
    series_label = "Annual melt {0}".format("index" if melt_index_or_extent_lower == "index" else "extent")

    for region_n in region_nums:
        years, melt = get_time_series_data(region_number=region_n,
                                           melt_index_or_extent=melt_index_or_extent,
                                           extent_melt_days_threshold = extent_melt_days_threshold,
                                           gap_filled=gap_filled,
                                           return_in_km2=True)

        # Since the "2019" melt season (.e.g) in Antarctica actually spans 2019-2020,
        # it makes more sense to center it over the Jan 1, 2020 date rather than
        # the start of 2019.
        # Make it so.
        if offset_years_by_one:
            years = years + 1

        # Scale the data; figure_exp is the exponent quoted in the y label.
        if include_ylabel and not (max(melt) > 1e6):
            melt = melt / 1e3
            figure_exp = 3
        else:
            melt = melt / 1e6
            figure_exp = 6

        # Create a new figure if no axis is provided.
        if ax_provided is None:
            fig, ax = plt.subplots(1,1)

        ax.plot(years, melt, color="maroon", label=series_label)

        if include_ylabel:
            # Raw strings: "\c" is not a valid escape in a normal literal.
            if melt_index_or_extent_lower == "index":
                ax.set_ylabel(r"Melt Index (10$^{0}$ km$^2\cdot$days)".format(figure_exp))
            elif melt_index_or_extent_lower == "extent":
                ax.set_ylabel(r"Melt Extent (10$^{0}$ km$^2$)".format(figure_exp))
            else:
                raise ValueError("Unknown value for parameter 'melt_index_or_extent': {0}".format(melt_index_or_extent))

        ax.tick_params(direction="in", bottom=True, left=True, right=True, top=False, labeltop=False, labelright=False, which="major")
        ax.tick_params(direction="in", bottom=True, which="minor")
        ax.tick_params(axis='x', length=4, which="major")
        ax.tick_params(axis='x', length=2, which="minor")

        if include_name_in_title:
            ax.set_title(antarctic_regions_dict[region_n])

        # Limit lower-bounds to zero
        ylim = ax.get_ylim()
        ax.set_ylim(max(ylim[0], 0), ylim[1])

        # Force the y-axis to only use integers (this tends to give us better scaling)
        if ylim[1] > 8:
            ax.yaxis.set_major_locator(matplotlib.ticker.MaxNLocator(integer=True))

        # Turn on the minor ticks for the years.
        ax.xaxis.set_minor_locator(matplotlib.ticker.MultipleLocator(base=1))

        # Run a linear-fit OLS model on the data. This is done even when no
        # trend line is drawn, because the results are returned.
        x = statsmodels.api.add_constant(years)
        model = statsmodels.api.OLS(melt, x)
        results = model.fit()

        # Only go into all this if we've indicated we might want to plot a trendline.
        if include_trendline or include_trendline_only_if_significant:

            pval_int, pval_slope = results.pvalues

            if print_trendline_summary:
                print("\n")
                print("============", antarctic_regions_dict[region_n] + ",", melt_index_or_extent, "==============")
                print(results.summary())

            if include_trendline or (pval_slope <= 0.05 and include_trendline_only_if_significant):

                st, data, ss2 = summary_table(results, alpha=0.05)
                fittedvalues = data[:, 2]
                predict_mean_ci_low, predict_mean_ci_upp = data[:, 4:6].T
                predict_ci_low, predict_ci_upp = data[:, 6:8].T

                # Put the p-value in the legend text.
                p_value_text = ("{0:0.3f}" if (pval_slope > 0.001) else "{0:0.1e}").format(pval_slope)
                ax.plot(years, fittedvalues, color="blue", label = r"Linear trend ($\it{p=" + p_value_text + "}$)")

                if add_confidence_intervals:
                    # Lower and upper confidence limits of the mean; only
                    # the first gets a legend entry.
                    ax.plot(years, predict_mean_ci_low, color='blue', linestyle='--',
                            label='95% confidence interval',
                            alpha=0.5,
                            linewidth=0.8)
                    ax.plot(years, predict_mean_ci_upp, color='blue', linestyle='--',
                            label=None,
                            alpha=0.5,
                            linewidth=0.8)

                if add_prediction_intervals:
                    ax.plot(years, predict_ci_low, color="red", linestyle='--',
                            label='95% prediction interval',
                            alpha=0.5,
                            linewidth=0.5)
                    ax.plot(years, predict_ci_upp, color="red", linestyle='--',
                            label=None,
                            alpha=0.5,
                            linewidth=0.5)

                    # The prediction intervals are quite wide. Rescale the y-limits
                    # to be no more than 10% above/below the max/min of the data,
                    # even if it makes the prediction intervals trail off the figure
                    # a bit.
                    melt_lo, melt_hi = min(melt), max(melt)
                    margin = 0.1*(melt_hi - melt_lo)

                    ylim = ax.get_ylim()
                    if (ylim[0] < 0) or (ylim[0] < (melt_lo - margin)):
                        ax.set_ylim(max(0, melt_lo - margin), ylim[1])

                    ylim = ax.get_ylim()
                    if (ylim[1] > (melt_hi + margin)):
                        ax.set_ylim(ylim[0], (melt_hi + margin))

                if include_legend_if_significant:
                    ax.legend(fontsize="small", labelspacing=0.1, framealpha=0.95)

        if ax_provided is None:
            fig.tight_layout()

            if fname_template is None:
                plt.show()
            else:
                fname = fname_template.format(region_n)
                fig.savefig(fname, dpi=dpi)
                if verbose:
                    print(fname, "written.")

            plt.close(fig)

    return results
def Fig1():
    """ This code generates a 4 plot figure of diversity properties (rarity,
        dominance, evenness, richness) versus total abundance, for each dataset.
        This code also generates a .txt file of results for the regression
        analyses.

        For every dataset directory named in GoodNames (looked up under
        mydir + 'data/micro' and mydir + 'data/macro') the rows of its
        '-SADMetricData.txt' file are read in random order, each metric is
        regressed on log10(N), and a 2x2 figure is saved under
        mydir + '/figs/appendix/Fig1/PerDataset/'. One line per dataset and
        metric is written to mydir + 'output/SummaryPerDataset.txt'.

        Takes no arguments and returns None.

        Raises ValueError if a non-blank data row does not have 26 fields.
    """

    # NOTE(review): the original opened (and so truncated) a second file,
    # 'output/PerDataset.txt', that was never written or closed, and called
    # sys.exit() right after printing the read total, which made everything
    # from the regression onwards unreachable. Both are treated here as
    # debugging leftovers and removed.

    #GoodNames = ['TARA', 'HUMAN', 'BOVINE', 'LAUB', 'SED', 'CHU', 'CHINA', 'CATLIN', 'FUNGI', 'HYDRO', 'HMP', 'BBS', 'CBC', 'MCDB', 'GENTRY', 'FIA', 'EMPclosed', 'EMPopen']
    GoodNames = ['HMP']

    # Collect [dataset name, kind, number of lines in its metric file].
    datasets = []
    for kind in ('micro', 'macro'):
        for name in os.listdir(mydir + 'data/' + kind):
            if name not in GoodNames: continue

            path = mydir+'data/'+kind+'/'+name+'/'+name+'-SADMetricData.txt'
            with open(path) as handle: # close the handle once the lines are counted
                num_lines = sum(1 for _ in handle)

            datasets.append([name, kind, num_lines])
            print('%s %s' % (name, num_lines))

    metrics = ['Rarity, '+r'$log_{10}$',
            'Dominance, '+r'$log_{10}$',
            'Evenness, ' +r'$log_{10}$',
            'Richness, ' +r'$log_{10}$']
    metrix = ['rarity', 'dominance', 'evenness', 'richness']

    fs = 10 # font size used across figures

    # 'with' guarantees the summary file is closed even if a dataset fails.
    with open(mydir + 'output/SummaryPerDataset.txt','w+') as OUT:

        for dataset in datasets:

            fig = plt.figure()
            for index, metric in enumerate(metrics):

                fig.add_subplot(2, 2, index+1)

                IntList, CoefList, R2List = [], [], []

                its = 1
                for n in range(its):

                    Nlist, Slist, ESimplist, NmaxList, rareSkews = [], [], [], [], []

                    name, kind, numlines = dataset

                    # every line number exactly once, in random order
                    lines = np.random.choice(range(1, numlines+1), numlines, replace=False)
                    path = mydir+'data/'+kind+'/'+name+'/'+name+'-SADMetricData.txt'

                    tN = 0
                    for line in lines:

                        data = linecache.getline(path, line).split()
                        if not data: continue # skip blank lines, as Fig3 does

                        # As in the original, name and kind are taken from
                        # the data rows from here on.
                        name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou, EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace, jknife1, jknife2, margalef, menhinick, preston_a, preston_S = data

                        N = float(N)
                        tN += N
                        S = float(S)

                        Nlist.append(float(np.log10(N)))
                        Slist.append(float(np.log10(S)))
                        ESimplist.append(float(np.log10(float(ESimp))))
                        NmaxList.append(float(np.log10(float(Nmax))))

                        # log-modulo transformation of skewness. The sign
                        # test must use the numeric value: the raw field is
                        # a string, so the original `skew < 0` never
                        # restored the sign.
                        skew = float(skew)
                        lms = np.log10(np.abs(skew) + 1)
                        if skew < 0: lms = lms * -1
                        rareSkews.append(float(lms))

                    print('total number of reads in %s : %.3e' % (name, tN))

                    # metric order: rarity, dominance, evenness, richness
                    metlist = list([rareSkews, NmaxList, ESimplist, Slist][index])

                    # Simple regression
                    d = pd.DataFrame({'N': list(Nlist)})
                    d['y'] = list(metlist)
                    f = smf.ols('y ~ N', d).fit()

                    IntList.append(f.params['Intercept'])
                    CoefList.append(f.params['N'])
                    R2List.append(f.rsquared)

                PIx = list(Nlist)
                st, data, ss2 = summary_table(f, alpha=0.05)
                # ss2: Obs, Dep Var Population, Predicted Value, Std Error Mean Predict,
                # Mean ci 95% low, Mean ci 95% upp, Predict ci 95% low, Predict ci 95% upp,
                # Residual, Std Error Residual, Student Residual, Cook's D

                Fitted = data[:,2]
                CiL, CiH = data[:,4:6].T

                # sort by N so the confidence band is drawn left to right
                PIx, Fitted, CiH, CiL = zip(*sorted(zip(PIx, Fitted, CiH, CiL)))

                plt.scatter(Nlist, metlist, color = 'SkyBlue', alpha= 1 , s = 4, linewidths=0.5, edgecolor='Steelblue')
                plt.fill_between(PIx, CiL, CiH, color='b', lw=0.0, alpha=0.3)

                Int = round(np.mean(IntList), 2)
                Coef = round(np.mean(CoefList), 2)
                R2 = round(np.mean(R2List), 3)

                print('%s %s %s %s %s' % (dataset, metric, Int, Coef, R2))

                # A single point plotted only to carry the legend text.
                plt.scatter([0],[-1], color = 'SkyBlue', alpha = 1, s=10, linewidths=0.9, edgecolor='Steelblue', label= metric+' = '+str(round(10**Int, 2))+'*'+r'$N$'+'$^{'+str(round(Coef, 2))+'}$'+'\n'+r'$R^2$' + '=' +str(R2) +' (n='+str(len(PIx))+')')

                # evenness (index 2) gets its legend in the lower left
                leg = plt.legend(loc=3 if index == 2 else 2, prop={'size':fs-1})
                leg.draw_frame(False)

                plt.ylim(min(metlist), max(metlist)*1.1)
                plt.xlim(min(Nlist), max(Nlist))

                plt.xlabel('Total abundance, ' + r'$log_{10}(N)$', fontsize=fs)
                plt.ylabel(metric, fontsize=fs)
                plt.tick_params(axis='both', which='major', labelsize=fs-3)

                summary = (name, kind, metrix[index], np.mean(PIx), np.mean(Slist), Int, Coef)
                OUT.write(' '.join(str(value) for value in summary) + '\n')

            fig.suptitle(dataset[0], fontsize=fs+2)

            #plt.savefig(mydir+'/figs/appendix/Fig1/PerDataset/Locey_Lennon_2015_'+name+'_NoMicrobeSingletons.png', dpi=600, bbox_inches = "tight")
            plt.savefig(mydir+'/figs/appendix/Fig1/PerDataset/Locey_Lennon_2015_'+name+'.png', dpi=600, bbox_inches = "tight")

    return
def Fig1(ref, Ones):
    """Plot two rarity metrics against log10 total abundance, micro vs. macro.

    For each metric ('log-modulo skewness', 'log-skew') the function
    repeatedly draws random rows (with replacement) from every accepted
    dataset's SAD-metric file, fits ``y ~ N * Kind`` with OLS on each draw,
    and averages the fitted intercepts, slopes and R^2 over all draws.
    The scatter and the 95% mean-confidence bands shown in each panel come
    from the LAST random draw only; the text annotations use the averages.

    Parameters
    ----------
    ref : str
        'ClosedRef' or 'OpenRef'; selects 'EMPclosed' or 'EMPopen' as the
        EMP dataset name and is embedded in the output file name.
    Ones : str
        'Y' reads ``<name>-SADMetricData.txt``; 'N' reads
        ``<name>-SADMetricData_NoMicrobe1s.txt``.

    Raises
    ------
    ValueError
        If ``ref`` or ``Ones`` is not one of the supported values (the
        original code failed later with a NameError in that case).

    Returns
    -------
    None
        The figure is written below ``mydir + '/figs/appendix/Rarity/'``.
    """
    if ref == 'ClosedRef':
        emp_name = 'EMPclosed'
    elif ref == 'OpenRef':
        emp_name = 'EMPopen'
    else:
        raise ValueError("ref must be 'ClosedRef' or 'OpenRef', got %r" % (ref,))

    if Ones == 'N':
        tail = '-SADMetricData_NoMicrobe1s.txt'
    elif Ones == 'Y':
        tail = '-SADMetricData.txt'
    else:
        raise ValueError("Ones must be 'Y' or 'N', got %r" % (Ones,))

    GoodNames = [
        'MGRAST', 'HMP', emp_name, 'BBS', 'CBC', 'MCDB', 'GENTRY', 'FIA'
    ]

    # Collect [name, kind, number_of_lines] for every accepted dataset,
    # micro first and then macro (same order as the original two loops).
    datasets = []
    for kind in ('micro', 'macro'):
        for name in os.listdir(mydir + 'data/' + kind):
            if name not in GoodNames:
                continue

            path = mydir + 'data/' + kind + '/' + name + '/' + name + tail
            # Use a context manager so the handle is closed after counting.
            with open(path) as handle:
                num_lines = sum(1 for _ in handle)

            datasets.append([name, kind, num_lines])
            print('%s %s' % (name, num_lines))

    metrics = ['log-modulo skewness', 'log-skew']
    fs = 10  # font size used across figures
    its = 1000  # number of random re-draws per metric

    # Per-panel y-limits and the y positions of the three text annotations
    # (micro equation, macro equation, R^2).
    panel_layout = {
        0: ((0, 2.5), (2.2, 2.0, 1.7)),
        1: ((-1, 4.5), (4.0, 3.5, 2.9)),
    }

    fig = plt.figure()
    for index, metric in enumerate(metrics):

        fig.add_subplot(2, 2, index + 1)

        MicIntList, MicCoefList = [], []
        MacIntList, MacCoefList = [], []
        R2List = []

        for n in range(its):

            # Draw random line numbers (with replacement) from each dataset:
            # 100 rows for EMP / micro datasets, 60 rows for everything else.
            radDATA = []
            for name, kind, numlines in datasets:
                if name in ('EMPclosed', 'EMPopen') or kind == 'micro':
                    sample_size = 100
                else:
                    sample_size = 60

                lines = np.random.choice(range(1, numlines + 1),
                                         sample_size,
                                         replace=True)
                path = mydir + 'data/' + kind + '/' + name + '/' + name + tail
                for line in lines:
                    radDATA.append(linecache.getline(path, line))

            Nlist, KindList, SkewList, LogSkewList = [], [], [], []
            for row in radDATA:
                # Each row is expected to hold exactly these 26
                # whitespace-separated fields; anything else raises here.
                (name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou,
                 EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace,
                 jknife1, jknife2, margalef, menhinick, preston_a,
                 preston_S) = row.split()

                N = float(N)
                S = float(S)

                if S < 10 or N < 10:
                    continue  # Min species richness

                # FIX: the kind is recorded only for rows that pass the
                # filter, so KindList stays aligned with Nlist / metric
                # lists (it used to be appended before the filter).
                KindList.append(kind)
                Nlist.append(float(np.log10(N)))

                # Rarity: log-modulo transform of the skewness.
                # FIX: compare the numeric value; the original compared the
                # raw string to 0, which is never true in Python 2 (and a
                # TypeError in Python 3), so the sign was never restored.
                skew = float(skew)
                lms = np.log10(np.abs(skew) + 1)
                if skew < 0:
                    lms = -lms
                SkewList.append(float(lms))

                LogSkewList.append(float(logskew))

            metlist = SkewList if index == 0 else LogSkewList

            # Multiple regression with a Kind interaction term.
            d = pd.DataFrame({'N': Nlist, 'y': metlist, 'Kind': KindList})
            f = smf.ols('y ~ N * Kind', d).fit()

            # Positional access via plain arrays: the code relies on the
            # order 0 = intercept, 1 = Kind offset, 2 = N slope,
            # 3 = N:Kind interaction.
            # NOTE(review): assumes 'macro' is the baseline level of Kind
            # and 'micro' the treatment level -- confirm against the data.
            params = np.asarray(f.params)
            pvalues = np.asarray(f.pvalues)

            MacIntList.append(params[0])
            MacCoefList.append(params[2])

            # Only add the micro offset / interaction when it is
            # significant at the 0.05 level.
            if pvalues[1] < 0.05:
                MicIntList.append(params[1] + params[0])
            else:
                MicIntList.append(params[0])

            if pvalues[3] < 0.05:
                MicCoefList.append(params[3] + params[2])
            else:
                MicCoefList.append(params[2])

            R2List.append(f.rsquared)

        # Everything below uses the data frame of the last random draw.
        lm = smf.ols('y ~ N * Kind', d).fit()
        st, table, ss2 = summary_table(lm, alpha=0.05)
        predict_mean_ci_low, predict_mean_ci_upp = table[:, 4:6].T

        # Group (x, y, ci_low, ci_high) per kind.
        by_kind = {'macro': [], 'micro': []}
        for j, kval in enumerate(KindList):
            if kval in by_kind:
                by_kind[kval].append((Nlist[j], metlist[j],
                                      predict_mean_ci_low[j],
                                      predict_mean_ci_upp[j]))

        # FIX: scatter each group in a single call. The original looped over
        # range(len(MicListX)) while also indexing MacListX, which raised
        # IndexError (or silently dropped points) when the sizes differed.
        point_style = (('macro', 'LightCoral', 'Crimson'),
                       ('micro', 'SkyBlue', 'Steelblue'))
        for kind_label, face, edge in point_style:
            rows = by_kind[kind_label]
            plt.scatter([r[0] for r in rows],
                        [r[1] for r in rows],
                        color=face,
                        alpha=1,
                        s=4,
                        linewidths=0.5,
                        edgecolor=edge)

        # Confidence bands, sorted by x; skipped for an empty group.
        for kind_label, band_color in (('macro', 'r'), ('micro', 'b')):
            rows = sorted(by_kind[kind_label], key=lambda r: r[0])
            if not rows:
                continue
            plt.fill_between([r[0] for r in rows],
                             [r[2] for r in rows],
                             [r[3] for r in rows],
                             color=band_color,
                             lw=0.0,
                             alpha=0.3)

        MicInt = round(np.mean(MicIntList), 2)
        MicCoef = round(np.mean(MicCoefList), 2)
        MacInt = round(np.mean(MacIntList), 2)
        MacCoef = round(np.mean(MacCoefList), 2)
        r2 = round(np.mean(R2List), 2)

        if index in panel_layout:
            (y_lo, y_hi), (y_mic, y_mac, y_r2) = panel_layout[index]
            plt.ylim(y_lo, y_hi)
            plt.xlim(0, 7)
            plt.text(0.3,
                     y_mic,
                     r'$micro$' + ' = ' + str(round(MicInt, 2)) + '*' +
                     r'$N$' + '$^{' + str(round(MicCoef, 2)) + '}$',
                     fontsize=fs - 1,
                     color='Steelblue')
            plt.text(0.3,
                     y_mac,
                     r'$macro$' + ' = ' + str(round(MacInt, 2)) + '*' +
                     r'$N$' + '$^{' + str(round(MacCoef, 2)) + '}$',
                     fontsize=fs - 1,
                     color='Crimson')
            plt.text(0.3,
                     y_r2,
                     r'$R^2$' + '=' + str(round(r2, 3)),
                     fontsize=fs - 1,
                     color='k')

        plt.xlabel('Number of reads or individuals, ' + '$log$' + r'$_{10}$',
                   fontsize=fs)
        plt.ylabel(metric, fontsize=fs)
        plt.tick_params(axis='both', which='major', labelsize=fs - 1)

    plt.subplots_adjust(wspace=0.4, hspace=0.4)

    # File name encodes the reference type and whether singletons were kept.
    suffix = '_NoMicrobe1s' if Ones == 'N' else ''
    plt.savefig(mydir + '/figs/appendix/Rarity/SupplementaryRarityFig-' +
                ref + suffix + '.png',
                dpi=600,
                bbox_inches="tight")

    #plt.show()

    return
def scatter_interaction(ax, x, y, groups, colors, ms=5, labels=None, title=None, legend=False, formula='y ~ x', legend_loc='best'):
    """Scatter several groups on one axis, each with its own OLS fit and CI band.

    For every group the points are drawn, an OLS model given by ``formula``
    is fitted on a data frame with columns ``x`` and ``y`` (sorted by ``x``),
    and the fitted line plus the bounds taken from columns 4:6 of
    statsmodels' ``summary_table`` (alpha=0.05) are plotted and shaded.

    Parameters
    ----------
    ax : matplotlib axes or a falsy value
        Axis to draw on; when falsy a new 5x4 figure with one axis is made.
    x, y : sequence of array-likes
        One x array and one y array per group.
    groups : sequence
        Group names, used as line labels and in the printed summary.
    colors : sequence
        One colour per group.
    ms : number
        Marker size; line widths and font sizes are derived from it.
    labels : sequence of two str, optional
        ``(xlabel, ylabel)``. When None the axis labels are left untouched.
    title : str, optional
        Axis title.
    legend : bool
        Whether to draw a legend at ``legend_loc``.
    formula : str
        Patsy formula handed to ``statsmodels.formula.api.ols``.
    legend_loc : str
        Legend location.

    Returns
    -------
    The axis that was drawn on.
    """
    # Here are the imports
    import numpy as np
    import matplotlib.pylab as plt
    import pandas as pd
    from statsmodels.formula.api import ols
    from statsmodels.stats.outliers_influence import summary_table

    # If you haven't already been given an axis on which to plot, then
    # create a new figure
    if not ax:
        fig = plt.figure(figsize=(5, 4))
        ax = fig.add_subplot(111)

    # Repeat the marker cycle so there is always at least one per group.
    marker_styles = ['o', '^', 'D', 's', '*'] * len(groups)

    # Loop through all the groups
    for i, (x_i, y_i, c_i, g_i, m_i) in enumerate(zip(x, y, colors, groups, marker_styles)):

        # Scatter each with the appropriate colors
        ax.scatter(x_i, y_i, c=c_i, edgecolor=c_i, alpha=0.8, s=ms, marker=m_i, zorder=7*i)

        # Fit on data sorted by x so the line and band are drawn left to right.
        # FIX: DataFrame.sort() no longer exists in current pandas;
        # sort_values() is the equivalent call.
        df2 = pd.DataFrame({'x': x_i, 'y': y_i})
        df2 = df2.sort_values('x')
        lm = ols(formula, df2).fit()

        ps = ['{:2.4f}'.format(p) for p in lm.pvalues[1:]]
        print('    {}, r2 = {}, p(s) = {}'.format(g_i, lm.rsquared, ', '.join(ps)))

        fit_y = np.array(lm.fittedvalues)
        st, data, ss2 = summary_table(lm, alpha=0.05)
        predict_mean_ci_low, predict_mean_ci_upp = data[:, 4:6].T

        # Now plot
        x_sorted = df2.x.values
        ax.plot(x_sorted, fit_y, c=c_i, linestyle='-', linewidth=ms/25.0, zorder=6*i, label=g_i)
        ax.plot(x_sorted, predict_mean_ci_low, c=c_i, linestyle='-', linewidth=ms/50.0, zorder=3*i)
        ax.plot(x_sorted, predict_mean_ci_upp, c=c_i, linestyle='-', linewidth=ms/50.0, zorder=2*i)
        ax.fill_between(x_sorted, predict_mean_ci_upp, predict_mean_ci_low, alpha=0.3, facecolor=c_i, interpolate=True, zorder=1*i)

    if legend:
        # Add the legend
        leg = ax.legend(loc=legend_loc, fancybox=True, fontsize=ms/2.)
        leg.get_frame().set_alpha(0)

    # Set the y limits
    # This is to deal with very small numbers (the MaxNLocator gets all turned around!)
    # Concatenate all the y data:
    y_all = np.concatenate(list(y)) if len(y) > 1 else y[0]
    max_y = np.max(y_all)
    min_y = np.min(y_all)
    # FIX: divide by a float so integer data does not trigger floor division
    # under Python 2, and avoid shadowing the Python 2 builtin ``buffer``.
    margin = (max_y - min_y) / 10.0
    # FIX: arguments are (lower, upper); the original passed them reversed.
    ax.set_ybound(min_y - margin, max_y + margin)

    # Set the axis labels
    # FIX: ``labels`` defaults to None, which used to raise a TypeError here.
    if labels is not None:
        ax.set_ylabel(labels[1], fontsize=ms/2.0)
        ax.set_xlabel(labels[0], fontsize=ms/2.0)

    for item in (ax.get_xticklabels() + ax.get_yticklabels()):
        item.set_fontsize(ms/2.0)

    # Adjust the power limits so that you use scientific notation on the y axis
    # FIX: format the axis we were given rather than pyplot's "current" axis,
    # which may be a different one when the caller passes ``ax`` in.
    ax.ticklabel_format(style='sci', axis='y')
    ax.yaxis.major.formatter.set_powerlimits((-3, 3))

    # NOTE: this changes the global matplotlib font size, not just this axis.
    plt.rc('font', **{'size': ms/2.0})

    if title:
        # Set the overall title
        ax.set_title(title)

    # Lay out the figure that owns ``ax`` (not whatever figure is current).
    ax.figure.tight_layout()

    return ax
def Fig2(condition, ones, sampling):
    """ A figure demonstrating a strong abundance relationship across 30
    orders of magnitude in total abundance. The abundance of the most abundant
    species scales in a log-log fashion with the total abundance of the sample
    or system.

    Random rows (with replacement) are drawn from each accepted microbial
    dataset, log10(Nmax) is regressed on log10(N) with OLS, the fit is
    extended over x = 6..40 and drawn with a prediction band, and published
    ranges for a few large systems are overlaid as error bars.

    Parameters
    ----------
    condition : str
        'open' or 'closed'; selects 'EMPopen' or 'EMPclosed' and is embedded
        in the output file name.
    ones : bool
        True reads ``<name>-SADMetricData.txt``; False reads
        ``<name>-SADMetricData_NoMicrobe1s.txt``.
    sampling :
        Only used (via ``str``) in the output file name.

    Returns
    -------
    None
        The figure is saved below ``mydir + '/figs/Fig2/'``.
    """

    tail = str()
    if ones is False:
        tail = '-SADMetricData_NoMicrobe1s.txt'
    elif ones is True:
        tail = '-SADMetricData.txt'

    emp = str()
    if condition == 'open': emp = 'EMPopen'
    elif condition == 'closed': emp = 'EMPclosed'

    GoodNames = [emp, 'TARA', 'HMP', 'BIGN', 'BOVINE', 'CHU', 'LAUB', 'SED', 'HUMAN', 'CHINA', 'CATLIN', 'FUNGI']

    fs = 13 # font size used across figures
    Nlist, NmaxList, datasets, radDATA = [], [], [], []

    for name in os.listdir(mydir +'data/micro'):
        if name not in GoodNames:
            continue

        path = mydir+'data/micro/'+name+'/'+name+tail
        # Count lines with a context manager so the handle gets closed.
        with open(path) as handle:
            numlines = sum(1 for _ in handle)
        print('%s %s' % (name, numlines))
        datasets.append([name, 'micro', numlines])

    # The datasets listed here contribute 1000 random rows; every other
    # dataset contributes 2500.
    small = ['BIGN', 'BOVINE', 'CHU', 'LAUB', 'SED']

    for name, kind, numlines in datasets:
        sample_size = 1000 if name in small else 2500
        lines = np.random.choice(range(1, numlines+1), sample_size, replace=True)

        path = mydir+'data/micro/'+name+'/'+name+tail

        for line in lines:
            radDATA.append(linecache.getline(path, line))

    for row in radDATA:

        # Each row is expected to hold exactly these 26 whitespace-separated
        # fields; anything else raises here.
        (name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou, EHeip, BP,
         SimpDom, Nmax, McN, skew, logskew, chao1, ace, jknife1, jknife2,
         margalef, menhinick, preston_a, preston_S) = row.split()

        #if S < 10 or N < 11: continue # Min species richness

        Nlist.append(float(np.log10(float(N))))
        NmaxList.append(float(np.log10(float(Nmax))))

    metric = 'Dominance, '+'$log$'+r'$_{10}$'

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    Nlist, NmaxList = zip(*sorted(zip(Nlist, NmaxList)))
    Nlist = list(Nlist)
    NmaxList = list(NmaxList)

    # Regression on the sampled data only
    d = pd.DataFrame({'N': list(Nlist)})
    d['y'] = list(NmaxList)
    f = smf.ols('y ~ N', d).fit()

    R2 = f.rsquared
    # Positional access through a plain array: 0 = intercept, 1 = slope.
    coefs = np.asarray(f.params)
    intercept = coefs[0]
    slope = coefs[1]

    # Extend the fitted line over x = 6..40, then refit on observed plus
    # extrapolated points so summary_table yields a band over that range too.
    X = np.linspace(6, 40, 100)
    Y = f.predict(exog=dict(N=X))
    Nlist2 = Nlist + X.tolist()
    NmaxList2 = NmaxList + Y.tolist()

    d = pd.DataFrame({'N': list(Nlist2)})
    d['y'] = list(NmaxList2)
    f = smf.ols('y ~ N', d).fit()

    st, data, ss2 = summary_table(f, alpha=0.05)
    pred_ci_low, pred_ci_upp = data[:,6:8].T

    label1 = 'Dominance scaling law for microbial data compilation'
    label2 = 'Ranges of published $N_{max}$ and $N$'

    plt.fill_between(Nlist2, pred_ci_low, pred_ci_upp, color='r', lw=0.5, alpha=0.2)
    plt.text(2, 22, r'$N_{max}$'+ ' = '+str(round(10**intercept,2))+'*'+r'$N$'+'$^{'+str(round(slope,2))+'}$', fontsize=fs+4, color='Crimson', alpha=0.9)
    plt.text(2, 19,  r'$r^2$' + ' = ' +str("%.2f" % R2), fontsize=fs+4, color='0.2')
    # FIX: the original passed both c='red' and color='Crimson'. These are
    # aliases of one property and the conflict is rejected by current
    # matplotlib; only 'Crimson' is kept, matching the text colour above.
    plt.plot(X.tolist(), Y.tolist(), '--', lw=2, alpha=0.8, color='Crimson', label=label1)

    print('r-squared and slope for RADs w/out inferred: %s %s' % (round(R2, 3), round(slope, 3)))

    plt.hexbin(Nlist, NmaxList, mincnt=1, gridsize = 50, bins='log', cmap=plt.cm.Reds_r)  #
    #plt.scatter(Nlist, NmaxList, color = 'LightCoral', alpha= 0.6 , s = 10, linewidths=0.5, edgecolor='Crimson')

    GO = np.log10([360.0*(10**26), 1010.0*(10**26)]) # estimated open ocean bacteria; Whitman et al. 1998
    Pm = np.log10([2.8*(10**27), 3.0*(10**27)]) # estimated Prochlorococcus; Flombaum et al. 2013
    # NOTE(review): Syn is defined but never used below, although the
    # global-ocean label mentions Synechococcus -- confirm this is intended.
    Syn = np.log10([6.7*(10**26), 7.3*(10**26)]) # estimated Synechococcus; Flombaum et al. 2013

    Earth = np.log10([9.2*(10**29), 31.7*(10**29)]) # estimated bacteria on Earth; Kallmeyer et al. 2012
    SAR11 = np.log10([2.0*(10**28), 2.0*(10**28)]) # estimated percent abundance of SAR11; Morris et al. (2002)

    HGx = np.log10([0.5*(10**14), 1.5*(10**14)]) # estimated bacteria in Human gut; Berg (1996)
    HGy = np.log10([0.05*(10**min(HGx)), 0.15*(10**max(HGx))]) # estimated most abundant bacteria in Human gut; Turnbaugh et al. (2009), & Dethlefsen et al. (2008)

    COWx = np.log10([0.5*2.226*(10**15), 1.5*2.226*(10**15)]) # estimated bacteria in Cow rumen; LOW:   HIGH: Whitman et al. (1998)
    COWy = np.log10([0.09*(10**min(COWx)), .15*(10**max(COWx))]) # estimated dominance in Cow rumen; Stevenson and Weimer (2006)

    def mark_range(x, y, x_range, y_range, h_frac, v_frac, **errorbar_kwargs):
        # Dashed guide lines from the axes to the point, plus its error bars.
        # FIX: axhline/axvline take a scalar position; the original handed
        # them one-element lists.
        ax.axhline(y, 0, h_frac, ls = '--', c = '0.4')
        ax.axvline(x, 0, v_frac, ls = '--', c = '0.4')
        plt.errorbar([x], [y], xerr=x_range, yerr=y_range, color='k', linewidth=1, **errorbar_kwargs)

    ## EARTH
    x = np.mean(Earth)
    x_range = (max(Earth) - min(Earth))/2.0
    y = np.mean([min(Pm), max(SAR11)])
    y_range = (max(SAR11) - min(Pm))/2.0

    ax.text(8.5, max(SAR11)+0.2, r'$Prochlorococcus$ and Pelagibacterales', fontsize=fs+2, color = 'k')
    ax.text(max(Earth)+0.5, 26, 'Earth microbiome', fontsize=fs+2, color = 'k', rotation = 90)
    mark_range(x, y, x_range, y_range, 0.90, 0.85, label=label2)

    ## GLOBAL OCEAN
    x = np.mean(GO)
    x_range = (max(GO) - min(GO))/2.0
    y = np.mean(Pm)
    # NOTE(review): this reuses the SAR11/Pm span from the EARTH section
    # rather than a range derived from Pm alone -- confirm this is intended.
    y_range = (max(SAR11) - min(Pm))/2.0

    ax.text(7.5, min(Pm)-1.35, r'$Synechococcus$ and $Prochlorococcus$', fontsize=fs+2, color = 'k')
    ax.text(min(GO)-1, 22, 'Non-sediment ocean bacteria', fontsize=fs+2, color = 'k', rotation = 90)
    mark_range(x, y, x_range, y_range, 0.85, 0.83)

    ## HUMAN GUT
    x = np.mean(HGx)
    x_range = (max(HGx) - min(HGx))/2.0
    y = np.mean(HGy)
    y_range = (max(HGy) - min(HGy))/2.0

    ax.text(4, min(HGy)-1, 'Human gut', fontsize=fs+2, color = 'k')
    ax.text(min(HGx)-1, 8, 'Human gut', fontsize=fs+2, color = 'k', rotation = 90)
    mark_range(x, y, x_range, y_range, 0.40, 0.38)

    ## COW RUMEN
    x = np.mean(COWx)
    x_range = (max(COWx) - min(COWx))/2.0
    y = np.mean(COWy)
    y_range = (max(COWy) - min(COWy))/2.0

    ax.text(7, max(COWy)+0.3, '$Prevotella$', fontsize=fs+2, color = 'k')
    ax.text(max(COWx)+0.4, 11.2, 'Cow rumen', fontsize=fs+2, color = 'k', rotation = 90)
    mark_range(x, y, x_range, y_range, 0.41, 0.43)

    # Axis captions are placed manually as text objects.
    ax.text(5, -4.2, 'Number of reads or total abundance, '+ '$log$'+r'$_{10}$', fontsize=fs+4)
    ax.text(-2.5, 22, metric, fontsize=fs+4, rotation=90)

    # 1:1 reference line
    plt.plot([0,32],[0,32], ls = '--', lw=2, c='0.7')
    #ax.text(18, 21, '1:1 line', fontsize=fs*1.0, rotation=40, color='0.7')

    plt.xlim(1, 33)
    plt.ylim(0, 32)

    plt.legend(bbox_to_anchor=(-0.015, 1, 1.025, .2), loc=10, ncol=1,
                                mode="expand",prop={'size':fs+1}, numpoints=1)

    if ones == False:
        plt.savefig(mydir+'/figs/Fig2/Locey_Lennon_2015_Fig2-'+condition+'_NoSingletons_'+str(sampling)+'.pdf', dpi=300, bbox_inches = "tight")
    if ones == True:
        plt.savefig(mydir+'/figs/Fig2/Locey_Lennon_2015_Fig2-'+condition+'_'+str(sampling)+'.pdf', dpi=300, bbox_inches = "tight")
        #plt.savefig(mydir+'/figs/Fig2/figure2-v2.pdf', dpi=300, bbox_inches = "tight")

    #plt.show()
    return
        leg.get_frame().set_alpha(0.5) #, fontsize='small')
        ltext = leg.get_texts() # all the text.Text instance in the legend
        plt.setp(ltext, fontsize='small') # the legend text fontsize


    print oi.reset_ramsey(res, degree=3)

    #note, constant in last column
    for i in range(1):
        print oi.variance_inflation_factor(res.model.exog, i)

    infl = oi.OLSInfluence(res_ols)
    print infl.resid_studentized_external
    print infl.resid_studentized_internal
    print infl.summary_table()
    print oi.summary_table(res, alpha=0.05)[0]

'''
>>> res.resid
array([  4.28571429,   4.        ,   0.57142857,  -3.64285714,
        -4.71428571,   1.92857143,  10.        ,  -6.35714286,
       -11.        ,  -1.42857143,   1.71428571,   4.64285714])
>>> infl.hat_matrix_diag
array([ 0.10084034,  0.11764706,  0.28571429,  0.20168067,  0.10084034,
        0.16806723,  0.11764706,  0.08403361,  0.11764706,  0.28571429,
        0.33613445,  0.08403361])
>>> infl.resid_press
array([  4.76635514,   4.53333333,   0.8       ,  -4.56315789,
        -5.24299065,   2.31818182,  11.33333333,  -6.94036697,
       -12.46666667,  -2.        ,   2.58227848,   5.06880734])
>>> infl.ess_press
Example #49
0
def plot_OLS(ax, target, Y, mode='unicolor'):
    """Scatter ``Y`` against ``target`` and overlay an OLS fit with its bounds.

    An ordinary least squares model ``Y ~ const + target`` is fitted. The
    points, the fitted line, the lower/upper bounds taken from columns 6:8 of
    statsmodels' ``summary_table`` (alpha=0.05) and a 1:1 line are drawn on
    ``ax``, together with a text box giving the fitted equation, R^2, RMSE
    and Nash-Sutcliffe efficiency.

    Parameters
    ----------
    ax : matplotlib axes
        Axis to draw on.
    target : array-like supporting ``.max()``, ``.mean()`` and fancy indexing
        Values on the x axis (also the regressor).
    Y : array-like
        Values on the y axis (the response).
    mode : str
        'unicolor' draws all points in silver; any other value colours the
        points by a Gaussian kernel density estimate.

    Returns
    -------
    None
    """
    X = sm.add_constant(target)
    results = sm.OLS(Y, X).fit()

    st, data, ss2 = summary_table(results, alpha=0.05)

    fittedvalues = data[:, 2]
    # NOTE(review): columns 6:8 are named "predict_ci" here, which suggests a
    # prediction interval, yet the legend below says "confidence interval" --
    # confirm which one is meant.
    predict_ci_low, predict_ci_upp = data[:, 6:8].T

    if mode == 'unicolor':
        ax.scatter(target, Y, c='silver', linewidths=0, s=4)
    else:
        # Colour by point density; densest points are drawn last (on top).
        # np.vstack replaces the deprecated alias np.row_stack.
        xy = np.vstack([target, Y])
        z = gaussian_kde(xy)(xy)
        idx = z.argsort()
        x, y, z = xy[0][idx], xy[1][idx], z[idx]
        ax.scatter(x, y, c=z, s=4, cmap=pl.cm.inferno_r)

    ax.plot(target, fittedvalues, 'r-', label='Least Square Regression', lw=2)

    # Each bound is drawn in order of increasing bound value.
    idx = np.argsort(predict_ci_low)
    ax.plot(target[idx],
            predict_ci_low[idx],
            'r--',
            lw=2,
            label='95% confidence interval')

    idx = np.argsort(predict_ci_upp)
    ax.plot(target[idx], predict_ci_upp[idx], 'r--', lw=2)

    # Square plot from 0 to the rounded-up maximum, with a 1:1 line.
    mx = np.ceil(max(target.max(), fittedvalues.max()))
    ax.plot([0, mx], [0, mx], 'k-')

    ax.set_xlim(0, mx)
    ax.set_ylim(0, mx)

    ax.set_aspect(1)

    ax.legend(loc='upper left')
    ax.set_xlabel('AGB from map [Mg ha$^{-1}$]')

    ax.set_ylabel('Reconstructed AGB [Mg ha$^{-1}$]')

    # Nash-Sutcliffe efficiency and root-mean-square error of Y vs. target.
    nse = 1 - ((Y - target)**2).sum() / ((target - target.mean())**2).sum()
    rmse = np.sqrt(((Y - target)**2).mean())

    ax.text(
        0.98,
        0.02,
        'y = %4.2fx + %4.2f\nR$^2$ = %4.2f; p < 0.001\nrmse = %4.1f Mg ha$^{-1}$ ; NSE = %4.2f'
        % (results.params[1], results.params[0], results.rsquared, rmse, nse),
        va='bottom',
        ha='right',
        transform=ax.transAxes)

    # FIX: the original repeated everything from the upper-bound plot down to
    # the text box a second time (copy-paste), drawing the same artists twice
    # on top of each other. The duplicate has been removed.
def Fig1(cutoffs, Ones):
    """Plot four SAD metrics against log10(N), grouped by percent cutoff.

    For every cutoff in ``cutoffs`` and every data set in ``GoodNames`` the
    matching ``-SADMetricData`` file under ``mydir + 'data/micro/'`` is read.
    Each of the four panels (rarity, dominance, evenness, richness) shows the
    observations as a scatter, the fitted values of the OLS model
    ``y ~ N * Kind`` as one dashed line per cutoff, and the mean intercept,
    slope and R^2 over ``its`` refits as text annotations.

    Args:
        cutoffs: iterable of cutoff labels as strings; each label is
            concatenated into the file path.  Only '99', '97' and '95' are
            plotted, and the coefficient bookkeeping indexes six fitted
            parameters, i.e. it expects all three labels to be present.
        Ones: 'N' to read the ``_NoMicrobe1s`` files, 'Y' to read the plain
            files.  Also selects the name of the saved figure.

    Returns:
        None.  The figure is written below ``mydir + 'figs/appendix/'``.

    Raises:
        ValueError: if ``Ones`` is neither 'Y' nor 'N', or if a data line
            does not split into the expected number of fields.
    """
    if Ones == 'N':
        suffix = '-SADMetricData_NoMicrobe1s.txt'
    elif Ones == 'Y':
        suffix = '-SADMetricData.txt'
    else:
        # Previously this surfaced later as a NameError on an unbound `path`.
        raise ValueError("Ones must be 'Y' or 'N', got %r" % (Ones,))

    def metric_path(kind, name, cutoff):
        # <mydir>data/<kind>/<name>/<name><cutoff>/<name><cutoff><suffix>
        stem = name + cutoff
        return mydir+'data/'+kind+'/'+name+'/'+stem+'/'+stem+suffix

    # Column order of one record: the cutoff label this function prepends,
    # followed by the 26 whitespace-separated fields of a data-file line.
    field_names = ('cutoff', 'name', 'kind', 'N', 'S', 'Var', 'Evar', 'ESimp',
                   'EQ', 'O', 'ENee', 'EPielou', 'EHeip', 'BP', 'SimpDom',
                   'Nmax', 'McN', 'skew', 'logskew', 'chao1', 'ace',
                   'jknife1', 'jknife2', 'margalef', 'menhinick',
                   'preston_a', 'preston_S')

    # (cutoff label, scatter edge colour, fitted-line colour, text colour)
    cutoff_styles = [('99', 'Steelblue', 'b', 'Steelblue'),
                     ('97', 'Crimson', 'r', 'Crimson'),
                     ('95', '0.4', '0.2', '0.3')]

    # Per panel: y-limits, then the y positions of the 99/97/95 equations
    # and of the R^2 label.
    panel_layout = [((-0.1, 2.0), (1.7, 1.5, 1.3, 1.1)),
                    ((0, 6), (5.1, 4.6, 4.1, 3.6)),
                    ((-3.0, 0.0), (-1.8, -2.1, -2.4, -2.7)),
                    ((0.9, 4.5), (3.9, 3.5, 3.1, 2.7))]

    datasets = []
    GoodNames = ['LAUB', 'CHU', 'HYDRO', 'CATLIN']

    for cutoff in cutoffs:
        for name in GoodNames:
            # Count the lines with a context manager so the handle is closed.
            with open(metric_path('micro', name, cutoff)) as handle:
                num_lines = sum(1 for _ in handle)
            datasets.append([name, cutoff, 'micro', num_lines])
            print('%s %s' % (name, num_lines))

    metrics = ['Rarity, '+r'$log_{10}$',
            'Dominance, '+r'$log_{10}$',
            'Evenness, ' +r'$log_{10}$',
            'Richness, ' +r'$log_{10}$']

    fs = 10 # font size used across figures
    its = 100

    fig = plt.figure()
    for index, metric in enumerate(metrics):

        fig.add_subplot(2, 2, index+1)

        intercepts = {'99': [], '97': [], '95': []}
        slopes = {'99': [], '97': [], '95': []}
        R2List = []
        Nlist, KindList, metlist = [], [], []
        f = None

        for _ in range(its):

            # NOTE(review): every line number is drawn exactly once (sampling
            # without replacement, sample size == number of lines), so each
            # iteration sees the full data set in a different order.
            radDATA = []
            for name, cutoff, kind, numlines in datasets:
                path = metric_path(kind, name, cutoff)
                lines = np.random.choice(range(1, numlines+1), numlines, replace=False)
                for line in lines:
                    radDATA.append(cutoff+' '+linecache.getline(path, line))

            Nlist, KindList = [], []
            rareSkews, NmaxList, ESimplist, Slist = [], [], [], []

            for record in radDATA:
                fields = record.split()
                if len(fields) != len(field_names):
                    raise ValueError('expected %d fields per record, got %d'
                                     % (len(field_names), len(fields)))
                row = dict(zip(field_names, fields))

                Nlist.append(float(np.log10(float(row['N']))))
                Slist.append(float(np.log10(float(row['S']))))
                ESimplist.append(float(np.log10(float(row['ESimp']))))
                NmaxList.append(float(np.log10(float(row['Nmax']))))
                KindList.append(row['cutoff'])

                # log-modulo transformation of skewness.  The sign test must
                # use the numeric value: comparing the raw string field with
                # 0 never detected negative skews.
                skew = float(row['skew'])
                lms = np.log10(np.abs(skew) + 1)
                if skew < 0:
                    lms = lms * -1
                rareSkews.append(float(lms))

            # Response variable of this panel.
            metlist = [rareSkews, NmaxList, ESimplist, Slist][index]

            # Multiple regression
            d = pd.DataFrame({'N': list(Nlist)})
            d['y'] = list(metlist)
            d['Kind'] = list(KindList)
            f = smf.ols('y ~ N * Kind', d).fit()

            print(f.summary())

            # Positional layout used by the original indexing: 0 intercept,
            # 1/2 intercept shifts for 97/99, 3 slope on N, 4/5 slope shifts
            # for 97/99, with '95' as the reference level.
            # NOTE(review): this relies on the formula engine's term ordering
            # for Kind in {'95', '97', '99'} -- confirm before adding cutoffs.
            params = list(f.params)
            pvalues = list(f.pvalues)
            base_int, base_slope = params[0], params[3]

            intercepts['95'].append(base_int)
            slopes['95'].append(base_slope)

            # A shift is only added when its own p-value is below 0.05.
            for label, int_pos, slope_pos in (('97', 1, 4), ('99', 2, 5)):
                if pvalues[int_pos] < 0.05:
                    intercepts[label].append(params[int_pos] + base_int)
                else:
                    intercepts[label].append(base_int)

                if pvalues[slope_pos] < 0.05:
                    slopes[label].append(params[slope_pos] + base_slope)
                else:
                    slopes[label].append(base_slope)

            R2List.append(f.rsquared)

        print(metric)
        # The last fit already used the same formula and frame; refitting it
        # would reproduce identical results, so reuse it.
        lm = f
        print(lm.summary())
        print('\n\n')

        _, data, _ = summary_table(lm, alpha=0.05)
        # data columns: Obs, Dep Var Population, Predicted Value,
        # Std Error Mean Predict, Mean ci 95% low, Mean ci 95% upp,
        # Predict ci 95% low, Predict ci 95% upp, Residual,
        # Std Error Residual, Student Residual, Cook's D
        predict_mean_ci_low, predict_mean_ci_upp = data[:,4:6].T
        fitted = list(lm.fittedvalues)

        # Split observations by cutoff.  Every group is ordered by its own x
        # values for the line plot; sorting the three groups in one combined
        # zip truncated them to the shortest group and ordered the 97 and 95
        # lines by the 99 group's x values.
        groups = {}
        for label, _, _, _ in cutoff_styles:
            rows = [j for j, k in enumerate(KindList) if k == label]
            by_x = sorted(rows, key=lambda j: Nlist[j])
            groups[label] = {
                'x': [Nlist[j] for j in rows],
                'y': [metlist[j] for j in rows],
                'line_x': [Nlist[j] for j in by_x],
                'fitted': [fitted[j] for j in by_x],
                'ci_low': [predict_mean_ci_low[j] for j in by_x],
                'ci_upp': [predict_mean_ci_upp[j] for j in by_x],
            }

        for label, edge_color, _, _ in cutoff_styles:
            grp = groups[label]
            plt.scatter(grp['x'], grp['y'], facecolor = 'none', alpha= 1 , s = 5, linewidths=0.5, edgecolor=edge_color)

        # Optional shaded mean-CI bands, intentionally disabled:
        #for label, _, line_color, _ in cutoff_styles:
        #    grp = groups[label]
        #    plt.fill_between(grp['line_x'], grp['ci_low'], grp['ci_upp'], color=line_color, lw=0.0, alpha=0.3)

        for label, _, line_color, _ in cutoff_styles:
            grp = groups[label]
            plt.plot(grp['line_x'], grp['fitted'],  color=line_color, ls='--', lw=1, alpha=0.8)

        R2 = round(np.mean(R2List), 2)

        if index == 1:
            # 1:1 reference line for the dominance panel.
            plt.plot([0,7],[0,7], ls = '--', lw=1, c='0.7')

        ylim, text_y = panel_layout[index]
        plt.ylim(*ylim)
        plt.xlim(1, 6)

        for (label, _, _, text_color), y_pos in zip(cutoff_styles, text_y):
            mean_int = round(np.mean(intercepts[label]), 2)
            mean_coef = round(np.mean(slopes[label]), 2)
            equation = ('$'+label+'%$'+ ' = '+str(round(10**mean_int,2))+'*'+r'$N$'
                        +'$^{'+str(round(mean_coef,2))+'}$')
            plt.text(1.35, y_pos, equation, fontsize=fs, color=text_color)
        plt.text(1.35, text_y[3],  r'$R^2$' + '=' +str(R2), fontsize=fs-1, color='k')

        if index == 0:
            # Off-axis proxy points: they only exist to feed the shared legend.
            for label, _, _, text_color in cutoff_styles:
                plt.scatter([0],[-1], color = 'none', alpha = 1, s=10, linewidths=0.9, edgecolor=text_color, label= label+'% (n='+str(len(groups[label]['y']))+')')
            plt.legend(bbox_to_anchor=(-0.04, 1.05, 2.48, .2), loc=10, ncol=3, mode="expand",prop={'size':fs+2})

        plt.xlabel('Number of reads, '+ '$log$'+r'$_{10}$', fontsize=fs)
        plt.ylabel(metric, fontsize=fs)
        plt.tick_params(axis='both', which='major', labelsize=fs-3)

    plt.subplots_adjust(wspace=0.4, hspace=0.4)

    if Ones =='N':
        plt.savefig(mydir+'figs/appendix/PercentCutoff/PercentCutoff_NoMicrobe1s.png', dpi=600, bbox_inches = "tight")
    elif Ones =='Y':
        plt.savefig(mydir+'figs/appendix/PercentCutoff/PercentCutoff.png', dpi=600, bbox_inches = "tight")

    #plt.show()
    plt.close()

    return