Example #1
def cronbach_alpha(data=None, items=None, scores=None, subject=None, nan_policy='pairwise', ci=.95):
    # safety checks
    assert isinstance(data, pd.DataFrame), 'data must be a dataframe.'
    assert nan_policy in ['pairwise', 'listwise']

    if all([v is not None for v in [items, scores, subject]]):
        # Data in long-format: we first convert to a wide format
        data = data.pivot(index=subject, values=scores, columns=items)

    # From now we assume that data is in wide format
    n, k = data.shape
    assert k >= 2, 'At least two items are required.'
    assert n >= 2, 'At least two raters/subjects are required.'
    err = 'All columns must be numeric.'

    assert all([data[c].dtype.kind in 'bfi' for c in data.columns]), err
    if data.isna().any().any() and nan_policy == 'listwise':
        # equivalent to R: psych::alpha(data, use="complete.obs")
        data = data.dropna(axis=0, how='any')

    # Compute covariance matrix and Cronbach's alpha
    C = data.cov()
    cronbach = (k / (k - 1)) * (1 - np.trace(C) / C.sum().sum())
    # which is equivalent to
    # v = np.diag(C).mean()
    # c = C.values[np.tril_indices_from(C, k=-1)].mean()
    # cronbach = (k * c) / (v + (k - 1) * c)

    # Confidence intervals
    alpha = 1 - ci
    df1 = n - 1
    df2 = df1 * (k - 1)
    lower = 1 - (1 - cronbach) * f.isf(alpha / 2, df1, df2)
    upper = 1 - (1 - cronbach) * f.isf(1 - alpha / 2, df1, df2)
    return round(cronbach, 6), np.round([lower, upper], 3)
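A minimal usage sketch (the data here are made up; the function assumes numpy as np, pandas as pd, and scipy.stats' f are available in its module):

import numpy as np
import pandas as pd
from scipy.stats import f

wide = pd.DataFrame({'item1': [1, 2, 3, 4, 5],
                     'item2': [2, 2, 4, 4, 5],
                     'item3': [1, 3, 3, 5, 5]})
alpha_hat, ci95 = cronbach_alpha(data=wide)
print(alpha_hat, ci95)  # point estimate and 95% confidence interval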
Example #2
def ROC_CI(N, Vec_theta, alpha=0.05):
    """
    One-dimensional confidence-interval calculations.

    Parameters
    ----------
    N : int
        Number of observations behind each estimate.
    Vec_theta : numpy.ndarray
        Estimated proportions (e.g. points on an ROC curve), each in [0, 1].
    alpha : float
        Significance level (default 0.05).

    Returns
    -------
    theta_L : numpy.ndarray
        Lower confidence bounds.
    theta_U : numpy.ndarray
        Upper confidence bounds.
    """
    theta_L = np.zeros(Vec_theta.size)
    theta_U = np.zeros(Vec_theta.size)
    for i, theta in enumerate(Vec_theta):
        if theta != 0:
            alpha_2 = alpha / 2
        else:
            alpha_2 = alpha

        if N > 100 and theta > 0.1:
            # large-sample t approximation
            d = N - 1
            sigma = sqrt(theta * (1 - theta))
            theta_L[i] = theta - t.isf(alpha_2, df=d) * sigma / sqrt(N)
            theta_U[i] = theta + t.isf(alpha_2, df=d) * sigma / sqrt(N)
        elif N > 100 and theta < 0.1:
            # chi-square (Poisson) approximation for small proportions
            if theta == 0:
                theta_L[i] = 0
            else:
                d_L = 2 * N * theta
                theta_L[i] = chi2.isf(1 - alpha_2, df=d_L) / (2 * N)
            d_U = 2 * (N * theta + 1)
            theta_U[i] = chi2.isf(alpha_2, df=d_U) / (2 * N)
        else:
            # exact (Clopper-Pearson) bounds via F quantiles
            d1L = N - N * theta + 1
            d2L = N * theta
            if theta == 0:
                theta_L[i] = 0
            else:
                theta_L[i] = d2L / (d2L +
                                    d1L * f.isf(alpha_2, 2 * d1L, 2 * d2L))
            d1U = N * theta + 1
            d2U = N - N * theta
            theta_U[i] = d1U * f.isf(alpha_2, 2 * d1U, 2 * d2U) / (
                d2U + d1U * f.isf(alpha_2, 2 * d1U, 2 * d2U))

    # make the bounds monotone non-decreasing along the vector
    for i in range(Vec_theta.size - 1):
        if theta_L[i + 1] < theta_L[i]:
            theta_L[i + 1] = theta_L[i]
        if theta_U[i + 1] < theta_U[i]:
            theta_U[i + 1] = theta_U[i]

    return theta_L, theta_U
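A usage sketch with arbitrary inputs (assumed imports: numpy as np, math.sqrt, and scipy.stats' t, chi2, f):

import numpy as np
from math import sqrt
from scipy.stats import t, chi2, f

thetas = np.array([0.0, 0.2, 0.5, 0.8])  # e.g. estimates along an ROC curve
lo, hi = ROC_CI(N=50, Vec_theta=thetas)
print(lo, hi)  # elementwise lower and upper bounds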
Example #3
def ftest(fvalue, df1, df2, p=0.05):
    '''Two-sided F test: returns 1 if fvalue lies inside the acceptance
    region [fl, fr], 0 otherwise.'''
    fl = f.isf(1 - p / 2, dfn=df1, dfd=df2)  # lower critical value
    fr = f.isf(p / 2, dfn=df1, dfd=df2)  # upper critical value
    if fvalue < fl or fvalue > fr:
        return 0
    else:
        return 1
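A quick check with made-up values: the statistic is accepted (1) only when it lies between the two critical values.

from scipy.stats import f

print(ftest(2.5, df1=10, df2=12))  # 1: inside the acceptance region
print(ftest(6.0, df1=10, df2=12))  # 0: beyond the upper critical value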
Example #4
def Cochran():
    d1 = disp(1, ylist[0])
    d2 = disp(2, ylist[1])
    d3 = disp(3, ylist[2])
    d4 = disp(4, ylist[3])
    groz = round(max(ydisp) / sum(ydisp), 2)
    partresult = q / (f2 - 1)
    params = [partresult, f1, (f2 - 2) * f1]
    fisher = f.isf(*params)
    result = fisher / (fisher + (f2 - 2))
    gkr = round(Decimal(result).quantize(Decimal('.0001')).__float__(), 2)

    print("\n2)Критерій Кохрана:\n\n  Знайдемо дисперсії по рядках:")
    print("  ", d1, "\n  ", d2, "\n  ", d3, "\n  ", d4, "\n")
    print("   Dmax{{yi}} = {0}\n   Gp = {0}/({1}+{2}+{3}+{4}) = {5}".format(
        max(ydisp), *ydisp, groz))
    print("   f1 = {0} - 1 = {1}, f2 = 4, q = {3}\n   За таблицею Gкр = {2}".
          format(m, f1, gkr, q))
    if groz < gkr:
        print(
            "   Gp < Gкр => За критерієм Кохрана дисперсія однорідна з ймовірністю",
            p)
    else:
        print(
            "   Gp > Gкр => За критерієм Кохрана дисперсія неоднорідна з ймовірністю",
            p)
Example #5
def Fisher():

    sad = round(
        m * ((y1 - ymed[0])**2 + (y2 - ymed[1])**2 + (y3 - ymed[2])**2 +
             (y4 - ymed[3])**2) / (4 - d), 2)
    froz = round(sad / disp, 2)

    f4 = N - d

    fkr = Decimal(abs(f.isf(q, f4, f3))).quantize(Decimal('.0001')).__float__()

    print("\n3)Критерій Фішера:\n")
    print("   f4 = {2} - {0} = {1}".format(d, f4, N))
    print(
        "   {0}*(({5} - {1})**2 + ({6} - {2})**2 + ({7} - {2})**2 + ({8} - {2})**2)/(4-{10}) = {9}"
        .format(m, *ymed, y1, y2, y3, y4, sad, d))
    print("   Fр = {0}/{1} = {2}".format(sad, disp, froz))
    print("   За таблицею Fкр =", fkr)
    if fkr > froz:
        print(
            "   За критерієм Фішера рівняння регресії адекватне оригіналу з ймовірністю",
            p)
    else:
        print(
            "   За критерієм Фішера рівняння регресії неадекватне оригіналу з ймовірністю",
            p)
Example #6
 def Ftest(self, alpha=0.05):
     """
     F-statistic formula: f = [ESS/(k-1)] / [RSS/(N-k)]
     ESS (explained sum of squares): the regression sum of squares
     RSS (residual sum of squares)
     k: number of regression coefficients (i.e. number of predictors + 1); N: number of training samples
     """
     self.y_hat = np.mean(self.y)
     print("input:", self.x)
     self.y_bar = self.predict(self.x)
     ESS = np.sum([(self.y_bar[i] - self.y_hat)**2 for i in range(self.N)])
     RSS = np.sum([(self.y_bar[i] - self.y[i])**2 for i in range(self.N)])
     print("degrees of freedom of the F distribution:", self.k - 1, self.N - self.k)
     fValue = (ESS /
               (self.k - 1)) / (RSS /
                                (self.N - self.k))  # assumes multiple linear regression with more samples than parameters
     f_alpha = ff.isf(alpha, self.k - 1, self.N - self.k)
     print(fValue, f_alpha)
     if fValue > f_alpha:
         print("if all coefficients were zero, an F statistic of", fValue, "would occur with probability at most", alpha,
               "so the coefficients are not all zero")
         print("the model passes the F test")
         return True
     else:
         return False
Example #7
def cochrane(selectionSize, qty_of_selections, significance):
    selectionSize += 1
    partResult1 = significance / (selectionSize - 1)
    params = [partResult1, qty_of_selections, (selectionSize - 1 - 1) * qty_of_selections]
    fischer = f.isf(*params)
    result = fischer / (fischer + (selectionSize - 1 - 1))
    return Decimal(result).quantize(Decimal('.0001')).__float__()
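The Cochran helpers in these examples all use the same identity: the critical proportion is G = F / (F + N - 1), with F = f.isf(q / N, f1, (N - 1) * f1), where N samples share the variance and f1 is the per-sample degrees of freedom. A small sanity check of cochrane with arbitrary arguments:

from decimal import Decimal
from scipy.stats import f

g_few = cochrane(selectionSize=3, qty_of_selections=3, significance=0.05)
g_many = cochrane(selectionSize=7, qty_of_selections=3, significance=0.05)
print(g_few, g_many)  # both lie in (0, 1)
assert g_few > g_many  # the critical share shrinks as more samples split the variance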
Example #8
 def cv_mse_F(self, Y_predict_all, y, alpha, num_tr):
     # PRESS (predicted residual sum of squares) per candidate component count
     press = np.square(np.subtract(Y_predict_all, y))
     PRESS_all = np.sum(press, axis=0)
     RMSECV_array = np.sqrt(PRESS_all / self.n)
     min_RMSECV = min(RMSECV_array)
     comp_array = RMSECV_array.argsort()
     comp_best = comp_array[0] + 1  # component count with the lowest RMSECV
     k_press = PRESS_all[:comp_best]
     min_press = PRESS_all[comp_best - 1]
     F_h = k_press / min_press  # PRESS ratios relative to the minimum
     F_value = f.isf(alpha, num_tr, num_tr)
     F_bias = np.subtract(F_h, F_value)
     # smallest component count whose PRESS is not significantly larger than the minimum
     min_comp = [k for k in range(len(F_bias)) if F_bias[k] < 0]
     min_comp_best = min_comp[0]
     if (min_comp_best == 0):
         min_comp_best = 1
     min_RMSECV = RMSECV_array[min_comp_best - 1]
     return RMSECV_array, min_RMSECV, min_comp_best
Example #9
 def homoskedasticity_test(self):
     result = True
     self.ltw('\\subsubsection{Homoskedasticity}\n')
     n1 = self.n // 2
     n2 = self.n - n1
     r1 = n1 - (self.k + 1)
     r2 = n2 - (self.k + 1)
     f_alpha_r1r2 = f.isf(q=self.alpha, dfn=r1, dfd=r2)
     
     resid1 = self.residuals[:n1]
     resid2 = self.residuals[n1:]
     self.log(self.residuals, resid1, resid2)
     s12 = np.dot(resid1, resid1) / r1
     s22 = np.dot(resid2, resid2) / r2
     f_stat = s12 / s22
     self.log(f_stat, f_alpha_r1r2)
     self.ltw(f'\\[F = {f_stat}\\]\n')
     self.ltw(f'\\[F_{{{self.alpha}, {r1}, {r2}}} = {f_alpha_r1r2}\\]\n')
     if f_alpha_r1r2 < abs(f_stat):
         result = False
         self.log("Heteroskedasticity detected")
         self.ltw('The model exhibits heteroskedasticity.\n')
     else:
         self.ltw('The model is homoskedastic.\n')
     return result
Example #10
    def get_cochran_value(f1, f2, q):

        partResult1 = q / f2
        params = [partResult1, f1, (f2 - 1) * f1]
        fisher = f.isf(*params)
        result = fisher / (fisher + (f2 - 1))
        return Decimal(result).quantize(Decimal('.0001')).__float__()
Example #11
 def get_fisher_value(f3, f4, q):
     return Decimal(abs(f.isf(q, f4, f3))).quantize(Decimal('.0001')).__float__()

 f3 = (m - 1) * N
 f4 = N - d
 q = 0.05
 theoretical_y = numpy.array([regression_equation(row[0], row[1], row[2], b_coefficients) for row in x_table])
 average_y = numpy.array(list(map(lambda el: numpy.average(el), y_table)))
 s_ad = m / (N - d) * sum((theoretical_y - average_y) ** 2)
 y_variations = numpy.array(list(map(numpy.var, y_table)))
 s_v = numpy.average(y_variations)
 f_p = float(s_ad / s_v)
 f_t = get_fisher_value(f3, f4, q)
 theoretical_values_to_print = list(
     zip(map(lambda x: "x1 = {0[1]:<10} x2 = {0[2]:<10} x3 = {0[3]:<10}".format(x), x_table), theoretical_y))
 print("\nModel adequacy check by Fisher's criterion: m = {}, N = {} for table y_table".format(m, N))
 print("Theoretical y values for the factor combinations:")
 print("\n".join(["{arr[0]}: y = {arr[1]}".format(arr=el) for el in theoretical_values_to_print]))
 print("Fp = {}, Ft = {}".format(f_p, f_t))
 print("Fp < Ft => the model is adequate" if f_p < f_t else "Fp > Ft => the model is inadequate")
 return f_p < f_t
Example #12
    def cochran(self, N, m):
        for i in range(N):
            ydisp = 0
            for k in range(m):
                ydisp += (self.ylist[i][k] - self.ymed[i])**2
            ydisp /= m
            self.ydisplist.append(round(ydisp, self.round))

        self.groz = round(
            max(self.ydisplist) / sum(self.ydisplist), self.round)
        f1 = m - 1
        f2 = N
        partresult = self.q / f2
        params = [partresult, f1, (f2 - 1) * f1]
        fisher = f.isf(*params)
        result = fisher / (fisher + (f2 - 1))
        self.gkr = round(
            Decimal(result).quantize(Decimal('.0001')).__float__(), self.round)
        self.f1 = f1
        self.f2 = f2
        Task.printcoch(N)
        if self.groz < self.gkr:
            print(
                "   Gp < Gcr => by Cochran's criterion the variance is homogeneous with probability",
                p)
        else:
            print(
                "   Gp > Gcr => by Cochran's criterion the variance is inhomogeneous with probability",
                p)
            print("   Increase m by 1: m = {1}+1 = {0}".format(m + 1, m))
            m += 1
            Task.equation(N, m)
            Task.cochran(N, m)
Example #13
def cohren_value(size_of_selections, qty_of_selections, significance):
    size_of_selections += 1
    partResult1 = significance / (size_of_selections - 1)
    params = [partResult1, qty_of_selections, (size_of_selections - 1 - 1) * qty_of_selections]
    fisher = f.isf(*params)
    result = fisher / (fisher + (size_of_selections - 1 - 1))
    return Decimal(result).quantize(Decimal('.0001')).__float__()
Example #14
 def FTest(self, alpha):  # F test
     yHat = self.predict(self.X)
     Qe = ((self.Y - yHat)**2).sum(axis=0)  # residual sum of squares
     yAver = np.mean(self.Y, axis=0)
     U = ((yHat - yAver)**2).sum(axis=0)  # regression sum of squares
     Fvalue = (U / self.k_x) / (Qe / (self.n_x - self.k_x - 1))
     # critical value with dfn = k_x regressors, dfd = n_x - k_x - 1
     Falpha = f.isf(alpha, self.k_x, self.n_x - self.k_x - 1)
     return Fvalue, Falpha, Fvalue > Falpha
Example #15
def cohren(disper):
    global Gp, Gt, f1, f2
    Gp = max(disper) / sum(disper)
    f1 = m - 1
    f2 = N
    fisher = f.isf(q / f2, f1, (f2 - 1) * f1)
    Gt = round(fisher / (fisher + (f2 - 1)), 4)
    return Gp < Gt
Example #16
 def on_pushButton_f_quantile_clicked(self):
     """
     Compute the upper-tail F quantile for the spin-box inputs and display it.
     """
     f_n = self.doubleSpinBox_f_n.value()
     f_m = self.doubleSpinBox_f_m.value()
     arfa = self.doubleSpinBox_f_arfa.value()
     quantile = f.isf(arfa, f_m, f_n)
     self.lineEdit_f_quantile.setText('%.3f' % quantile)
Example #17
def region_of_rejection(SA, SE, s, n, alpha):
    # one-way ANOVA: SA = between-group sum of squares, SE = within-group,
    # s = number of groups, n = total number of observations
    test_value = (SA / (s - 1)) / (SE / (n - s))
    F_value = f.isf(alpha, s - 1, n - s)

    reject_ind = test_value >= F_value
    return test_value, F_value, reject_ind
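A worked call with small made-up sums of squares (3 groups, 15 observations):

from scipy.stats import f

stat, crit, reject = region_of_rejection(SA=50.0, SE=30.0, s=3, n=15, alpha=0.05)
print(stat, crit, reject)  # 10.0, ~3.89, True: between-group variation dominates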
Example #18
 def Ftest(self, alpha):
     y_hat = self.predict(self.X)
     Qe = np.sum((self.Y - y_hat)**2)
     ymean = self.Y.mean()
     U = np.sum((y_hat - ymean)**2)
     n = len(self.X)
     F = U / (Qe / (n - 2))
     F_alpha = f.isf(alpha, 1, n - 2)
     return [F, F_alpha, F > F_alpha]
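To exercise Ftest outside its original class, it can be attached to a minimal stand-in; SimpleFit below is hypothetical and only provides the predict/X/Y members the method expects:

import numpy as np
from scipy.stats import f

class SimpleFit:
    def __init__(self, X, Y):
        self.X, self.Y = np.asarray(X, float), np.asarray(Y, float)
        self.b, self.a = np.polyfit(self.X, self.Y, 1)  # slope, intercept

    def predict(self, X):
        return self.a + self.b * np.asarray(X, float)

SimpleFit.Ftest = Ftest  # reuse the method from this example

model = SimpleFit([1, 2, 3, 4, 5], [1.1, 2.0, 2.9, 4.2, 5.1])
print(model.Ftest(0.05))  # [F, F_alpha, F > F_alpha]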
Example #19
def kohren_check(disper):
    global Gp, Gt, f1, f2
    print("Cochran's criterion")
    Gp = max(disper) / sum(disper)
    f1 = m - 1
    f2 = N
    fisher = f.isf(q / f2, f1, (f2 - 1) * f1)
    Gt = round(fisher / (fisher + (f2 - 1)), 4)
    print("Gp = " + str(Gp) + ", Gt = " + str(Gt))
Example #20
 def cohrenValue(selectionSize, selectionQty, significance):
     selectionSize += 1
     partResult1 = significance / (selectionSize - 1)
     params = [
         partResult1, selectionQty, (selectionSize - 1 - 1) * selectionQty
     ]
     fisher = f.isf(*params)
     result = fisher / (fisher + (selectionSize - 1 - 1))
     return Decimal(result).quantize(Decimal('.0001')).__float__()
Example #21
 def get_cohren_value(self):
     size_of_selections = self.N + 1
     qty_of_selections = self.m - 1
     significance = self.q
     partResult1 = significance / (size_of_selections - 1)
     params = [partResult1, qty_of_selections, (size_of_selections - 1 - 1) * qty_of_selections]
     fisher = f.isf(*params)
     result = fisher / (fisher + (size_of_selections - 1 - 1))
     return Decimal(result).quantize(Decimal('.0001')).__float__()
Example #22
 def get_cohren_value(size_of_selections, qty_of_selections, significance):
     from decimal import Decimal
     from scipy.stats import f
     size_of_selections += 1
     partResult1 = significance / (size_of_selections - 1)
     params = [partResult1, qty_of_selections, (size_of_selections - 1 - 1) * qty_of_selections]
     fisher = f.isf(*params)
     result = fisher / (fisher + (size_of_selections - 1 - 1))
     return Decimal(result).quantize(Decimal('.0001')).__float__()
Example #23
 def dModXPlot(self):
     dmx = self.dModX()
     nr, nc = self.x_mat.shape
     ncomp = self.scores.shape[1]
     A0 = 0 if type(self.x_mean) == int else 1
     fc = np.sqrt(f.isf(0.05, nr - ncomp - A0, nc - ncomp))
     ax = pd.Series(dmx, index=self.x_df.index.get_level_values(0)).plot(kind='bar', color='green')
     ax.hlines(fc, -1, 20)
     return ax.figure
Example #24
 def get_cohren_critical(self) -> float:
     '''
     Get the critical (table) value of Cochran's criterion.

     Returns:
         float -- criterion value
     '''
     f_crit = f.isf((1 - self.p) / self.f2, self.f1,
                    (self.f2 - 1) * self.f1)
     return f_crit / (f_crit + self.f2 - 1)
Example #25
def fisher_check():
    global Fp, Ft
    f4 = N - d
    sad = 0
    for i in range(N):
        sad += (f_x[i] - y_mean[i])**2
    sad *= (m / (N - d))
    Fp = sad / sb
    print(f"\n\nFp = {Fp:}", end="\t\t\t")
    Ft = round(abs(f.isf(q, f4, f3)), 4)
    print(f"Ft = {Ft:}")
Example #26
 def on_pushButton_f_quantile_plot_clicked(self):
     """
     Compute the upper-tail F quantile and mark it on a plot of the F density.
     """
     f_n = self.doubleSpinBox_f_n.value()
     f_m = self.doubleSpinBox_f_m.value()
     arfa = self.doubleSpinBox_f_arfa.value()
     quantile = f.isf(arfa, f_m, f_n)
     if self.radioButton_one.isChecked():
         self.widget.mpl.axes.cla()
         self.widget.mpl.start_f_plot(f_m, f_n)
     self.widget.mpl.fill_f_plot(f_m, f_n, quantile, self.a, self.b, arfa)
Example #27
def fisher_check():
    global Fp, Ft
    print("\nFisher's criterion")
    f4 = N - d
    sad = 0
    for i in range(N):
        sad += (f_x[i] - y_mean[i])**2
    sad *= (m / (N - d))
    Fp = sad / sb
    print(f"Fp = {Fp:}")
    print(f"Degrees of freedom: F4 = N - d = {f4:}")
    Ft = round(abs(f.isf(q, f4, f3)), 4)
    print(f"Table value of Fisher's coefficient: Ft = {Ft:}")
Example #28
    def get_cohren_value(size_of_selections, qty_of_selections, significance):
        size_of_selections += 1
        partResult1 = significance / (size_of_selections - 1)
        params = [partResult1, qty_of_selections, (size_of_selections - 1 - 1) * qty_of_selections]
        fisher = f.isf(*params)
        result = fisher / (fisher + (size_of_selections - 1 - 1))
        return Decimal(result).quantize(Decimal('.0001')).__float__()
Example #29
def kohren_check(disper):
    global Gp, Gt, f1, f2
    print("Cochran's criterion")
    Gp = max(disper) / sum(disper)
    f1 = m - 1
    f2 = N
    fisher = f.isf(q / f2, f1, (f2 - 1) * f1)
    Gt = round(fisher / (fisher + (f2 - 1)), 4)
    print(
        f"Gp = {Gp:}\nDegrees of freedom: F1 = m - 1 = {f1:}; F2 = N = {f2:}"
    )
    print(
        f"Significance level: q = 1 - p = {q:}\nTable value of Cochran's coefficient: Gt = {Gt:}"
    )
Example #30
def region_of_rejection(SA, SB, SAXB, SE, r, s, t, alpha):

    if t > 1:
        #
        # repeated test
        #
        test_FA = (SA / (r - 1)) / (SE / (r * s * (t - 1)))
        F_A = f.isf(alpha, r - 1, r * s * (t - 1))

        test_FB = (SB / (s - 1)) / (SE / (r * s * (t - 1)))
        F_B = f.isf(alpha, s - 1, r * s * (t - 1))

        test_FAXB = (SAXB / ((r - 1) * (s - 1))) / (SE / (r * s * (t - 1)))
        F_AXB = f.isf(alpha, (r - 1) * (s - 1), r * s * (t - 1))
    elif t == 1:
        #
        # single test
        #
        test_FA = (SA / (r - 1)) / ((SE / ((r - 1) * (s - 1))))
        F_A = f.isf(alpha, r - 1, (r - 1) * (s - 1))

        test_FB = (SB / (s - 1)) / ((SE / ((r - 1) * (s - 1))))
        F_B = f.isf(alpha, s - 1, (r - 1) * (s - 1))

        test_FAXB = None
        F_AXB = None
    else:
        test_FA = None
        F_A = None
        test_FB = None
        F_B = None
        test_FAXB = None
        F_AXB = None

    return test_FA, F_A, determine_rejection(test_FA, F_A), \
           test_FB, F_B, determine_rejection(test_FB, F_B), \
           test_FAXB, F_AXB, determine_rejection(test_FAXB, F_AXB)
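A usage sketch for the repeated-measurement branch; determine_rejection is referenced but not shown in this example, so a minimal stand-in is assumed here:

from scipy.stats import f

def determine_rejection(stat, crit):
    # hypothetical helper: propagate None, otherwise compare to the critical value
    return None if stat is None else stat >= crit

out = region_of_rejection(SA=20.0, SB=30.0, SAXB=10.0, SE=12.0,
                          r=2, s=3, t=2, alpha=0.05)
print(out)  # statistic, critical value and decision for A, B and AxB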
Example #31
# Decision process component
import scipy.stats

def z_cmp(calculated_z_score_proportion, criticalz_percentage_proportion):
    """
    calculated_z_score_proportion: the z score computed from mu and xbar, as a proportion
    criticalz_percentage_proportion: critical-value percentage for the acceptance criterion
    Returns True when |z| exceeds the critical z, i.e. the observed result
    would occur with probability less than the stated percentage.
    """
    # critical z for the given percentage via the inverse normal CDF
    return abs(calculated_z_score_proportion) > scipy.stats.norm.ppf((100 - criticalz_percentage_proportion) / 100.)


######## ANALYSIS OF VARIANCE - ANOVA ###########
"""
F = Between Group Variability/Within Group Variability
"""
f_critical = lambda alpha, dfb, dfw: round(f.isf(alpha, dfb, dfw), 4)
# between-group variability
# xbar_groups is an array of group means
df_ssb = lambda xbar_groups: float(len(xbar_groups) - 1)

ssb = lambda xg, xbar_groups, each_grp_sample_size: \
sum([ni * (xbar - xg)**2 for ni, xbar in zip(each_grp_sample_size, xbar_groups)])/df_ssb(xbar_groups)

# within-group variability
# x_samples is an array of lists - this is like df for pooled variance
df_ssw = lambda x_samples: float(sum(map(len, x_samples)) - len(x_samples))

# this is like pooled variance for more than 2 samples - generic
ssw = lambda x_samples: sum([var(samples) * len(samples) for samples in x_samples])/df_ssw(x_samples)

#F ratio stats
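Putting the helpers together on three toy groups (var is assumed to be numpy's var, which these lambdas expect):

import numpy as np
from numpy import var
from scipy.stats import f

groups = [[3.0, 4.0, 5.0], [6.0, 7.0, 8.0], [2.0, 3.0, 4.0]]
means = [np.mean(g) for g in groups]
grand_mean = np.mean([x for g in groups for x in g])
msb = ssb(grand_mean, means, [len(g) for g in groups])  # between-group mean square
msw = ssw(groups)  # within-group mean square
print(msb / msw, f_critical(0.05, df_ssb(means), df_ssw(groups)))  # 13.0 vs ~5.14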
Example #32
    def dinucleotide_usage(self, seqs, genome_id):
        """Calculate dinucleotide (n3:n1) usage statistics for genome.

        Parameters
        ----------
        seqs : dict[seq_id] -> seq
            Sequences indexed by sequence id.
        genome_id : str
            Unique id of genome used to create output file.
        """

        # calculate dinucleotide usage for each gene and the genome as a whole
        gene_di_usage = defaultdict(lambda: defaultdict(int))
        gene_n1 = defaultdict(lambda: defaultdict(int))
        gene_n3 = defaultdict(lambda: defaultdict(int))

        genome_di_usage = defaultdict(int)
        genome_n1 = defaultdict(int)
        genome_n3 = defaultdict(int)
        gc = {}
        for gene_id, seq in seqs.items():
            gc[gene_id] = seq_tk.gc(seq)

            for i in range(2, len(seq) - 2, 3):
                dinucleotide = seq[i:i + 2].upper()
                if 'N' not in dinucleotide:
                    gene_di_usage[gene_id][dinucleotide] += 1
                    gene_n3[gene_id][dinucleotide[0]] += 1
                    gene_n1[gene_id][dinucleotide[1]] += 1

                    genome_di_usage[dinucleotide] += 1
                    genome_n3[dinucleotide[0]] += 1
                    genome_n1[dinucleotide[1]] += 1

        # calculate Manhattan distance for each gene
        manhattan_dist = self._manhattan(gene_di_usage, genome_di_usage)

        # identify deviant genes under Hotelling T-squared statistic
        t2_stats = self._hotelling_statistic(gene_di_usage, gene_n3, gene_n1,
                                                genome_di_usage, genome_n3, genome_n1)

        # correction from T-squared distribution to F-distribution approximation
        # http://en.wikipedia.org/wiki/Hotelling%27s_T-squared_distribution
        num_genes = len(gene_di_usage)
        f_dist_correction = float(num_genes - 15 + 1) / (15 * num_genes)
        deviant_threshold = f.isf(self.critical_value, 15, num_genes - 15 + 1) / f_dist_correction

        # report dinucleotide usage of each gene
        di_set_sorted = sorted(genome_di_usage.keys())
        gene_ids_sorted = sorted(t2_stats.items(), key=operator.itemgetter(1), reverse=True)

        output_file = os.path.join(self.output_dir, genome_id + '.di_usage.tsv')
        fout = open(output_file, 'w')

        fout.write('Gene Id\tGC\tLength (bp)\t# dinucleotides\tHotelling T-squared statistic\tDeviant\tManhattan distance\tDeviations from mean')
        for di in di_set_sorted:
            fout.write('\t' + di)
        fout.write('\n')

        genome_gc = seq_tk.gc(''.join(seqs.values()))
        genome_sum_di = sum(genome_di_usage.values())
        fout.write('%s\t%.2f\t%d\t%d' % ('<complete genome>', genome_gc * 100.0, sum([len(x) for x in seqs.values()]), genome_sum_di))
        fout.write('\t%s\t%s\t%.1f\t%.1f' % ('na', 'na', 0, 0))
        for di in di_set_sorted:
            fout.write('\t%.2f' % (genome_di_usage.get(di, 0) * 100.0 / genome_sum_di))
        fout.write('\n')

        for gene_id, t2_stat in gene_ids_sorted:
            dinucleotides = gene_di_usage[gene_id]
            sum_di = sum(dinucleotides.values())
            fout.write('%s\t%.2f\t%d\t%d' % (gene_id, gc[gene_id] * 100, len(seqs[gene_id]), sum_di))
            fout.write('\t%.2f\t%s' % (t2_stat, 'yes' if t2_stat > deviant_threshold else 'no'))
            fout.write('\t%.2f\t%.2f' % (manhattan_dist[gene_id][0], manhattan_dist[gene_id][1]))

            for di in di_set_sorted:
                fout.write('\t%.2f' % (dinucleotides.get(di, 0) * 100.0 / sum_di))
            fout.write('\n')
        fout.close()
Example #33

from scipy.stats import chi2, t, f
import numpy as np

# Q1
q1_1 = chi2.isf(q=0.95, df=4)
assert np.allclose(q1_1, 0.710723)
q1_2 = chi2.isf(q=0.05, df=4)
assert np.allclose(q1_2, 9.48773)
q1_3 = chi2.isf(q=0.95, df=9)
assert np.allclose(q1_3, 3.32511)
q1_4 = chi2.isf(q=0.05, df=9)
assert np.allclose(q1_4, 16.9190)

# Q2
q2_1 = t.isf(q=0.05, df=7)
assert np.allclose(q2_1, 1.895, rtol=1.e-3)
q2_2 = t.isf(q=0.025, df=7)
assert np.allclose(q2_2, 2.365, rtol=1.e-3)
q2_3 = t.isf(q=0.05, df=12)
assert np.allclose(q2_3, 1.782, rtol=1.e-3)
q2_4 = t.isf(q=0.025, df=12)
assert np.allclose(q2_4, 2.179, rtol=1.e-3)

# Q3
q3_1 = f.isf(q=0.05, dfn=5, dfd=7)
assert np.allclose(q3_1, 3.9715)
q3_2 = f.isf(q=0.95, dfn=5, dfd=7)
assert np.allclose(q3_2, 0.2050903422957813)  # inverse of F(7,5; 0.05)
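The last comment reflects the reciprocal relation between upper and lower F quantiles, F(1-a; d1, d2) = 1 / F(a; d2, d1), so the lower 5% point of F(5, 7) is the inverse of the upper 5% point of F(7, 5):

assert np.allclose(f.isf(q=0.95, dfn=5, dfd=7), 1 / f.isf(q=0.05, dfn=7, dfd=5))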