Example #1
def chi2_dir(cause, effect, unknown, n, p_cause, p_effect_given_cause):
    cnt = count(zip(effect, unknown))
    #print cnt
    chi_indep = chi2_contingency(cnt)[1]
    p_unknown_given_effect = [ float(cnt[0][1]) / sum(cnt[0]),
                               float(cnt[1][1]) / sum(cnt[1]) ]
    #print 'p(bact|cd)=%s' % p_unknown_given_effect
    exp=[[0,0],[0,0]]
    for c in range(2):
        for e in range(2):
            for u in range(2):
                exp[c][u] += (n * 
                              p_of_val(p_cause, c) *
                              p_of_val(p_effect_given_cause[c], e) *
                              p_of_val(p_unknown_given_effect[e], u))
    cnt = count(zip(cause, unknown))
    #print "obs=%s" % cnt
    #print 'cnt=%s' % cnt
    #print 'expected if cd->bact=%s' % exp
    chi_rev = chisquare(cnt, exp, axis=None, ddof=2)
    chi_fwd = chi2_contingency(cnt)
    #print 'expected if bact->cd=%s' % chi_fwd[3]
    bayes_factor = chi2.pdf(chi_fwd[0],1) / chi2.pdf(chi_rev.statistic,1)
    return struct(reject_indep=chi_indep,
                  bayes_fwd_rev=bayes_factor,
                  reject_fwd=chi_fwd[1],
                  reject_rev=chi_rev.pvalue)
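# Example #1 leans on three helpers that are not shown here (count, p_of_val,
# struct). A plausible reconstruction, inferred from how they are used above
# and assuming all variables are 0/1 coded:
import numpy as np

def count(tuples):
    # build a nested 2 x ... x 2 count table from an iterable of 0/1 tuples
    tuples = list(tuples)
    table = np.zeros((2,) * len(tuples[0]), dtype=int)
    for t in tuples:
        table[t] += 1
    return table.tolist()

def p_of_val(p, v):
    # P(X = v) for a Bernoulli(p) variable, with v in {0, 1}
    return p if v == 1 else 1.0 - p

class struct(object):
    # simple attribute bag used for the return value
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)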
Example #2
def getChis(crosstab, variable):
    chi2, p, dof, ex = sps.chi2_contingency(crosstab)

    crit = sps.chi2.ppf(q=0.95, df=dof)
    evaluation = crit < chi2

    obs = crosstab.values
    obs_list = obs.tolist()
    ex_list = ex.tolist()
    z_scores = sps.zmap(obs_list, ex_list)
    z_list = z_scores.tolist()
    z_indicators = []
    for z in z_list:
        z_sig = ["+" if i > 1.96 else "-" if i < -1.96 else " " for i in z]
        z_indicators.append(z_sig)

    results = {'chi-sq': chi2,
               'p-val': p,
               'eval': evaluation,
               'dof': dof,
               'explanans': variable,
               'expected': ex_list,
               'observed': obs_list,
               'z_scores': z_indicators,
               'row_lab': crosstab.index.tolist(),
               'col_lab': crosstab.columns.tolist()
               }
    print results
    return results
Example #3
def chiSquare():
    ''' Application of a chi square test to a 2x2 table.
    The calculations are done with and without Yates' continuity
    correction.
    Data are taken from Altman, Table 10.10:
    Comparison of number of hours' swimming by swimmers with or without erosion of dental enamel.
    >= 6h: 32 yes, 118 no
    <  6h: 17 yes, 127 no'''

    # Enter the data
    obs = np.array([[32, 118], [17, 127]])

    # --- >>> START stats <<< ---
    # Calculate the chi-square test
    chi2_corrected = stats.chi2_contingency(obs, correction=True)
    chi2_uncorrected = stats.chi2_contingency(obs, correction=False)
    # --- >>> STOP stats <<< ---

    # Print the result
    print('\nCHI SQUARE --------------------------------------------------')
    print('The corrected chi2 value is {0:5.3f}, with p={1:5.3f}'.format(
        chi2_corrected[0], chi2_corrected[1]))
    print('The uncorrected chi2 value is {0:5.3f}, with p={1:5.3f}'.format(
        chi2_uncorrected[0], chi2_uncorrected[1]))
    
    return chi2_corrected
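# For reference: with correction=True and one degree of freedom, scipy applies
# Yates' correction by moving each observed count 0.5 toward its expected
# value. A minimal sketch verifying that by hand for the table above (every
# |O - E| here is 7, so the adjustment is always the full 0.5):
import numpy as np
from scipy import stats

obs = np.array([[32, 118], [17, 127]])
expected = stats.chi2_contingency(obs, correction=False)[3]
chi2_yates = (((np.abs(obs - expected) - 0.5) ** 2) / expected).sum()
# chi2_yates matches stats.chi2_contingency(obs, correction=True)[0]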
Example #4
    def test_basic(self):
        # median_test calls chi2_contingency to compute the test statistic
        # and p-value.  Make sure it hasn't screwed up the call...

        x = [1, 2, 3, 4, 5]
        y = [2, 4, 6, 8]

        stat, p, m, tbl = stats.median_test(x, y)
        assert_equal(m, 4)
        assert_equal(tbl, [[1, 2], [4, 2]])

        exp_stat, exp_p, dof, e = stats.chi2_contingency(tbl)
        assert_allclose(stat, exp_stat)
        assert_allclose(p, exp_p)

        stat, p, m, tbl = stats.median_test(x, y, lambda_=0)
        assert_equal(m, 4)
        assert_equal(tbl, [[1, 2], [4, 2]])

        exp_stat, exp_p, dof, e = stats.chi2_contingency(tbl, lambda_=0)
        assert_allclose(stat, exp_stat)
        assert_allclose(p, exp_p)

        stat, p, m, tbl = stats.median_test(x, y, correction=False)
        assert_equal(m, 4)
        assert_equal(tbl, [[1, 2], [4, 2]])

        exp_stat, exp_p, dof, e = stats.chi2_contingency(tbl, correction=False)
        assert_allclose(stat, exp_stat)
        assert_allclose(p, exp_p)
Example #5
def main(contingency_table):
	""" Calcula estadisticas de una tabal de contingencia 2x2 """
	
	SRS_types = set([])
	tables  = {}
	
	for row in csv.reader(open(contingency_table), delimiter = '\t'):
		
		ID, non_can, can = row
		SRS, tag = ID.split("_")
		
		SRS_types.add(SRS)
		tables[ID] = [int(non_can), int(can)]
	
	for srs in SRS_types:
		table = []
		table.append(tables[srs + "_YES"])
		table.append(tables[srs + "_NO"])
		
		
		
		obs = np.array(table)
		chi2, chi2_pvalue, chi2_dof, chi2_ex = chi2_contingency(obs, correction=False)
		chi2_yates, chi2_yates_pvalue, chi2_yates_dof, chi2_yates_ex = chi2_contingency(obs, correction=True)
		fisher_oddsratio, fisher_pvalue = stats.fisher_exact(table)
		
#		print srs, table, fisher_oddsratio, fisher_pvalue, chi2, chi2_pvalue, chi2_dof, chi2_ex
		
		print srs, fisher_oddsratio, log(fisher_oddsratio, 2), fisher_pvalue, chi2, chi2_pvalue, chi2_yates, chi2_yates_pvalue
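# The input to main() is a tab-separated file with one row per (sample, tag)
# pair: an ID of the form "<SRS>_YES" or "<SRS>_NO" followed by non-canonical
# and canonical counts. A hypothetical two-line input (made-up numbers):
#
#   SRS123_YES    40    160
#   SRS123_NO     55    145
#
# For each SRS, the YES row is then tested against the NO row with Pearson's
# chi-square, the Yates-corrected chi-square, and Fisher's exact test.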
Example #6
def position_wise_scores2(seq5_list, seq3_list, organism, title='Intron position strength'):
    '''Uses chi-contingency test to score base proportions at each position in sample against population'''
    
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    all_5p, all_3p = generate_all_ss_seqs(gff3, fa_dict, organism)
    
    pop_5p = seq_list_to_totals(all_5p)
    pop_3p = seq_list_to_totals(all_3p)
    samp_5p = seq_list_to_totals(seq5_list)
    samp_3p = seq_list_to_totals(seq3_list)
    print samp_5p.shape

    p5 = []
    for n in range(samp_5p.shape[1]):
        if n == 2 or n == 3:
            p5.append(1)
        else:
            conting = np.array([samp_5p[:,n],pop_5p[:,n]])
            chi2, p, dof, expected = stats.chi2_contingency(conting)
            p5.append(np.log10(p)*-1)
        
    p3 = []
    for n in range(samp_3p.shape[1]):
        if n == 4 or n == 5:
            p3.append(1)
        else:
            conting = np.array([samp_3p[:,n],pop_3p[:,n]])
            chi2, p, dof, expected = stats.chi2_contingency(conting)
            p3.append(np.log10(p)*-1)
    
    fig, ax = plt.subplots(2, 1, figsize=(4,4))
    width = 0.7
    
    max_y = max(p5+p3) + 0.1*max(p5+p3)
    
    ind5 = np.arange(len(p5))
    ax[0].bar(ind5, p5, color='k')
    ax[0].plot([0,8], [2,2], '--', color='0.7')
    ax[0].set_xlim([0,len(p5)])
    ax[0].set_ylabel("5' splice site\n-log10(p-value)")
    ax[0].set_title(title)
    ax[0].set_ylim([0,max_y])

    ind3 = np.arange(len(p3))
    ax[1].bar(ind3, p3, color='k')
    ax[1].plot([0,8], [2,2], '--', color='0.7')
    ax[1].set_xlim([0,len(p3)])
    ax[1].set_ylabel("3' splice site\n-log10(p-value)")
    ax[1].set_ylim([0,max_y])

    ax[0].set_xticks(ind3 + width / 2)
    ax[1].set_xticks(ind3 + width / 2)
    ax[0].set_xticklabels(np.arange(-2,6))
    ax[1].set_xticklabels(np.arange(-5,3))

    fig.tight_layout()
    plt.show()
    return fig
Example #7
def statistic_analysis(np_snp_info,np_feature_snp,np_label_classifyProgress,np_label_classifyPhenotype):
    ### proportion
    np_proportion = np.average(np_feature_snp, axis=0).reshape(np_snp_info.shape[0],np_snp_info.shape[1])

    ### get 2X2 matrix
    np_2_2_matrix_classifyProgress = np.zeros([np_snp_info.shape[0],4],dtype='float')   # zeros: accumulated below
    np_2_2_matrix_classifyPhenotype = np.zeros([np_snp_info.shape[0],4],dtype='float')  # zeros: accumulated below
    for idxSNP in range(0,np_snp_info.shape[0]):
        for idxSample in range(0,np_feature_snp.shape[0]):
            if np_label_classifyProgress[idxSample] == 0:
                np_2_2_matrix_classifyProgress[idxSNP,0] = np_2_2_matrix_classifyProgress[idxSNP,0] + np_feature_snp[idxSample,idxSNP*3] * 2 + np_feature_snp[idxSample,idxSNP*3+1]
                np_2_2_matrix_classifyProgress[idxSNP,2] = np_2_2_matrix_classifyProgress[idxSNP,2] + np_feature_snp[idxSample,idxSNP*3+1] + np_feature_snp[idxSample,idxSNP*3+2] * 2
            else:
                np_2_2_matrix_classifyProgress[idxSNP,1] = np_2_2_matrix_classifyProgress[idxSNP,1] + np_feature_snp[idxSample,idxSNP*3] * 2 + np_feature_snp[idxSample,idxSNP*3+1]
                np_2_2_matrix_classifyProgress[idxSNP,3] = np_2_2_matrix_classifyProgress[idxSNP,3] + np_feature_snp[idxSample,idxSNP*3+1] + np_feature_snp[idxSample,idxSNP*3+2] * 2
            if np_label_classifyPhenotype[idxSample] == 0:
                np_2_2_matrix_classifyPhenotype[idxSNP,0] = np_2_2_matrix_classifyPhenotype[idxSNP,0] + np_feature_snp[idxSample,idxSNP*3] * 2 + np_feature_snp[idxSample,idxSNP*3+1]
                np_2_2_matrix_classifyPhenotype[idxSNP,2] = np_2_2_matrix_classifyPhenotype[idxSNP,2] + np_feature_snp[idxSample,idxSNP*3+1] + np_feature_snp[idxSample,idxSNP*3+2] * 2
            else:
                np_2_2_matrix_classifyPhenotype[idxSNP,1] = np_2_2_matrix_classifyPhenotype[idxSNP,1] + np_feature_snp[idxSample,idxSNP*3] * 2 + np_feature_snp[idxSample,idxSNP*3+1]
                np_2_2_matrix_classifyPhenotype[idxSNP,3] = np_2_2_matrix_classifyPhenotype[idxSNP,3] + np_feature_snp[idxSample,idxSNP*3+1] + np_feature_snp[idxSample,idxSNP*3+2] * 2
    
    ### chi-square; fisher; oddsratio
    np_chi2 = np.empty([np_snp_info.shape[0],2],dtype='float')
    np_fisher = np.empty([np_snp_info.shape[0],2],dtype='float')
    np_oddsratio = np.empty([np_snp_info.shape[0],2],dtype='float')
    for idxSNP in range(0,np_snp_info.shape[0]):
        np_this_2_2_matrix = np_2_2_matrix_classifyProgress[idxSNP,:].reshape(2,2)
        print np_this_2_2_matrix
        chi2, p, dof, ex = st.chi2_contingency(np_this_2_2_matrix, correction=False)
        np_chi2[idxSNP,0] = p
        oddsratio, pvalue = st.fisher_exact(np_this_2_2_matrix)
        np_fisher[idxSNP,0] = pvalue
        np_oddsratio[idxSNP,0] = (np_this_2_2_matrix[0,0]*np_this_2_2_matrix[1,1])/(np_this_2_2_matrix[1,0]*np_this_2_2_matrix[0,1])
        np_this_2_2_matrix = np_2_2_matrix_classifyPhenotype[idxSNP,:].reshape(2,2)
        chi2, p, dof, ex = st.chi2_contingency(np_this_2_2_matrix, correction=False)
        np_chi2[idxSNP,1] = p
        oddsratio, pvalue = st.fisher_exact(np_this_2_2_matrix)
        np_fisher[idxSNP,1] = pvalue
        np_oddsratio[idxSNP,1] = (np_this_2_2_matrix[0,0]*np_this_2_2_matrix[1,1])/(np_this_2_2_matrix[1,0]*np_this_2_2_matrix[0,1])
    
    #proportion(AA:AB:BB); ClassifyProgress(Chi2,Fisher,OddsRatio); ClassifyPhenotype(Chi2,Fisher,OddsRatio)
    np_statistic_result = np.empty([np_snp_info.shape[0],9],dtype='float')
    np_statistic_result[:,:3] = np_proportion
    np_statistic_result[:,3] = np_chi2[:,0]
    np_statistic_result[:,4] = np_fisher[:,0]
    np_statistic_result[:,5] = np_oddsratio[:,0]
    np_statistic_result[:,6] = np_chi2[:,1]
    np_statistic_result[:,7] = np_fisher[:,1]
    np_statistic_result[:,8] = np_oddsratio[:,1]
    
    return np_statistic_result
Example #8
def contigencyAnalysis(dataTable,
                       rowNames = None, columnNames = None,
                       display = True, outputFile = None):
    
    testStatistic, pValue, degreesOfFreedom, expectedTable = chi2_contingency(dataTable)

    
    outputDataTable = ezTable(dataTable, columnNames, rowNames, summarizeColumns ="SUM", summarizeRows = "SUM", title = "Observed Counts", display = display, returnFormat = "MATRIX")

    outputExpectedTable = ezTable(expectedTable, columnNames, rowNames, summarizeColumns ="SUM", summarizeRows = "SUM", title = "Expected Counts", display = display, returnFormat = "MATRIX")

    if 0.0001 > pValue:
        pValueString = "< 0.0001"
    else:
        pValueString = str(round(pValue, 4))
    
    resultsTable = [["Test Statistic", testStatistic],
                    ["Degrees of Freedom", degreesOfFreedom],
                    ["p-value", pValueString]]

    outputResultsTable = ezTable(resultsTable, title="Results", display = display, returnFormat = "MATRIX")

    if outputFile is not None:
        outputMatrix = outputDataTable + [['']] + outputExpectedTable + [['']] + outputResultsTable
        matrixToCSV(outputMatrix, outputFile)

    return testStatistic, pValue, degreesOfFreedom, expectedTable
Example #9
def severs(a,b,cut,verbose=False):
    cntall = count(zip(a,b))
    cntcut = count(zip(cut,a,b))
    p_b_given_a = [float(x[1])/sum(x) for x in cntall]
    p_a_given_b = [float(x[1])/sum(x) for x in zip(*cntall)]
    if verbose:
        print 'orig=%s' % cntall
        print 'split=%s' % cntcut
        print 'p_a_given_b = %s' % p_a_given_b
        print 'p_b_given_a = %s' % p_b_given_a
    pvar = count(zip(cut))
    mularr(pvar, 1.0/sum(pvar))
    expnsev = [deepcopy(cntall), deepcopy(cntall)]
    for i in [0,1]:
        mularr(expnsev[i], pvar[i])
    exptoucha = deepcopy(expnsev) # We'll overwrite everything
    for i in [0,1]:
        for aval in [0,1]:
            n = cntcut[i][aval][0] + cntcut[i][aval][1]
            for bval in [0,1]:
                p = p_of_val(p_b_given_a[aval], bval)
                exptoucha[i][aval][bval] = n * p
                ##print 'for cut=%d a=%d b=%d, n=%d p=%.1f val=%.1f' % (i, aval, bval, n, p, exptoucha[i][aval][bval])
    exptouchb = deepcopy(expnsev) # We'll overwrite everything
    for i in [0,1]:
        for bval in [0,1]:
            n = cntcut[i][0][bval] + cntcut[i][1][bval]
            for aval in [0,1]:
                exptouchb[i][aval][bval] = n * p_of_val(p_a_given_b[bval], aval)
    if verbose:
        print 'exp|touch a = %s' % exptoucha
        print 'exp|touch b = %s' % exptouchb
    exps = [expnsev, exptoucha, exptouchb]
    bayes_factor = [1, 1, 1]
    for model in [0,1,2]:
        if verbose:
            print 'Model Touches %s' % (['neither', 'a', 'b'])[model]
        for i in [0,1]:
            try:
                chi_sev = chi2_contingency(cntcut[i])
            except (ValueError, ZeroDivisionError) as e:
                continue
            peg_sev = blurred_chi2_pdf(chi_sev[0], sumall(cntcut[i]))
            if verbose:
                print ' chi_sev=%s' % str(chi_sev)
                print ' p(e|sev)=%f' % peg_sev
                print ' Cut=%d' % i
                print ' Actual: %s' % cntcut[i]
                print ' Expected: %s' % exps[model][i]
            try:
                chi_nsev = chisquare(cntcut[i], exps[model][i], axis=None, ddof=2)
            except (ValueError, ZeroDivisionError) as e:
                print 'Failure for model %d cut %d act=%s exp=%s' % (model, i, cntcut, exps[model])
                raise e
            peg_nsev = blurred_chi2_pdf(chi_nsev[0], sumall(cntcut[i]))
            if verbose:
                print ' Chi=%s' % str(chi_nsev)
                print ' p=%s' % peg_nsev
            bayes_factor[model] *= peg_sev/peg_nsev 
    return min(bayes_factor)
Example #10
def binary_chi2(data, alternative):
    """
    """

    for group in data:
        assert issubclass(group.dtype.type, np.integer)
        assert set(np.unique(group)) == set((0, 1))

    n_groups = len(data)
    n_outcomes = 2
    ct = np.zeros((n_groups, n_outcomes))

    for i, group in enumerate(data):
        for el in group:
            ct[i, el] += 1

    support_outcome = np.sum(ct, axis=0)
    support_group = np.sum(ct, axis=1)
    n_samples = np.sum(ct, axis=None)


    chi2 = 0.0
    for i in range(n_groups):
        for j in range(n_outcomes):
            observed = ct[i, j]
            expected = support_group[i] * support_outcome[j] / n_samples
            chi2 += ((observed-expected) ** 2) / expected

    _, p, _, _ = stats.chi2_contingency(ct, correction=False)
    return p
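# The double loop above is the textbook Pearson statistic. A vectorized
# equivalent (a sketch; same independence model, no continuity correction):
import numpy as np
from scipy import stats

def pearson_chi2(ct):
    ct = np.asarray(ct, dtype=float)
    expected = np.outer(ct.sum(axis=1), ct.sum(axis=0)) / ct.sum()
    return ((ct - expected) ** 2 / expected).sum()

# pearson_chi2(ct) matches stats.chi2_contingency(ct, correction=False)[0]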
Example #11
def chi_mode(data,depth,low=lcut,alpha=chi,f=freq):
    result=dict()
    plus=data['A'][0]+data['T'][0]+data['G'][0]+data['C'][0]
    minus=data['A'][1]+data['T'][1]+data['G'][1]+data['C'][1]
    for key in ['A','T','G','C']:
        if data[key][0] >= low[0]*depth and data[key][1] >=low[1]*depth:
            ndep=data[key][2]
            frequency=ndep/float(data['cover'])
            if frequency >= f:
#add chi square test:
                if frequency > 0.5:
                    result[key]=frequency
                else:
                    a=data[key][0]
                    b=data[key][1]
                    c=plus-data[key][0]
                    d=minus-data[key][1]

                    least=sorted([a,b,c,d])[0]
                    table=[[a,b],[c,d]]

                    if least < 5:
                        pvalue=stats.fisher_exact(table)[1]
                    else:
                        pvalue=stats.chi2_contingency(table)[1]

                    if pvalue > alpha:
                        result[key]=frequency
    return result
Example #12
def RuleGeneration (D, globalL, minconf):
	Rules = []
	for key, value in globalL.items()[1:]:
		for item in value:
			#_subsets = map(frozenset, [x for x in subsets(item)])
			for consequence in item:
				consequence = frozenset([consequence])
				antecedent = item.difference(consequence)
				if len(consequence) > 0:
					if (chisquaremode):
						#calculate chi square value
						#A->B
						A = getSupp(antecedent, allFreq, D)*len(D)
						B = getSupp(consequence, allFreq, D)*len(D)
						AB = getSupp(item, allFreq, D)*len(D)
						# print "AB: " + str(AB)
						A_B = A-AB
						# print "A_B: " + str(A_B)
						_AB = B-AB
						# print "_AB: " + str(_AB)
						_A_B = len(D) - AB - A_B - _AB
						# print "_A_B: " + str(_A_B)
						chistatistics = chi2_contingency(np.array([[AB,A_B],[_AB,_A_B]]))
						if chistatistics[1] <= p_value:
							Rules.append(((tuple(antecedent), tuple(consequence)), chistatistics[0], getSupp(item, allFreq, D), getSupp(antecedent, allFreq, D), getSupp(consequence, allFreq, D), chistatistics[1]))
					else:
						confidence =  getSupp(item, allFreq, D) / getSupp(antecedent, allFreq, D)
						if confidence >= minconf:
							Rules.append(((tuple(antecedent), tuple(consequence)), confidence, getSupp(item, allFreq, D), getSupp(antecedent, allFreq, D), getSupp(consequence, allFreq, D)))
	return Rules
Example #13
    def consistent_acceptance_rate(self, window_size=None, critical_pval=0.05):
        """
        A convenience function for `burnin`.  Returns `True` if the acceptances of the two halves
        of the window are consistent with having the same acceptance rates.  This is done using
        a chi-squared contingency test.
        """
        if window_size is None:
            if len(self.updates) == 0:
                return False
            else:
                window_start = self.updates[-1]
        else:
            window_start = self.iterations - window_size

        window_length = self.iterations - window_start

        # If window is really small, return `consistent` to avoid gratuitous updating
        consistent = True
        if window_length > 2:
            windowed_acceptances = self.acceptance[window_start:self.iterations].flatten()
            X1, X2 = np.array_split(windowed_acceptances, 2)

            n1, n2 = len(X1), len(X2)
            k1, k2 = np.sum(X1), np.sum(X2)

            # Use chi^2 contingency test to test whether the halves have consistent acceptances
            table = [[k1, k2], [n1 - k1, n2 - k2]]
            p_val = chi2_contingency(table)[1]

            if p_val < critical_pval:
                consistent = False

        return consistent
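# The same test as a standalone function, assuming a flat 0/1 acceptance
# vector in place of the sampler state above (a sketch; each half must
# contain at least one acceptance and one rejection):
import numpy as np
from scipy.stats import chi2_contingency

def halves_consistent(acceptances, critical_pval=0.05):
    x1, x2 = np.array_split(np.asarray(acceptances), 2)
    k1, k2 = int(x1.sum()), int(x2.sum())
    table = [[k1, k2], [len(x1) - k1, len(x2) - k2]]
    return chi2_contingency(table)[1] >= critical_pval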
Example #14
def independence(table, test): #conducts test for independence and prints result depending on mode
	chi2, p, df, f_exp = stats.chi2_contingency(table)

	if test==True:
		print "chi-square statistic: %s \np-value: %s \ndegrees of freedom: %d \nexpected values: %s" %(chi2, p, df, f_exp)
	else: 
		print "chi-square statistic: %s \np-value: %s" %(chi2, p)
Example #15
 def test_random_circuits(self):
     qk_simulator = get_backend('local_qasm_simulator')
     for circuit in self.rqg.get_circuits(format_='QuantumCircuit'):
         self.log.info(circuit.qasm())
         compiled_circuit = compile_circuit(circuit)
         shots = 100
         job_pq = QuantumJob(compiled_circuit,
                             backend=pq_simulator,
                             seed=1, shots=shots)
         job_qk = QuantumJob(compiled_circuit,
                             backend=qk_simulator,
                             seed=1, shots=shots)
         result_pq = pq_simulator.run(job_pq).result()
         result_qk = qk_simulator.run(job_qk).result()
         counts_pq = result_pq.get_counts(result_pq.get_names()[0])
         counts_qk = result_qk.get_counts(result_qk.get_names()[0])
         self.log.info('local_qasm_simulator_projectq: %s', str(counts_pq))
         self.log.info('local_qasm_simulator: %s', str(counts_qk))
         states = counts_qk.keys() | counts_pq.keys()
         # contingency table
         ctable = numpy.array([[counts_pq.get(key, 0) for key in states],
                               [counts_qk.get(key, 0) for key in states]])
         result = chi2_contingency(ctable)
         self.log.info('chi2_contingency: %s', str(result))
         with self.subTest(circuit=circuit):
             self.assertGreater(result[1], 0.01)
Example #16
 def rank_features(self, metric):
     self.metrics_ranked.add(metric)
     for feat in self.feature_set:
         feat_func = {}
         for id in self.train_set:
             if feat in self.features[id]:
                 feat_func[id] = 1
             else:
                 feat_func[id] = 0
         if metric == "info":
             feat_yes = set([id for id in self.train_set if feat_func[id] == 1])
             feat_no = set([id for id in self.train_set if feat_func[id] == 0])
             label_yes = set([id for id in self.train_set if self.label_func[id] == 1])
             label_no = set([id for id in self.train_set if self.label_func[id] == 0])
             x = [len(feat_yes & label_yes), len(feat_yes & label_no)]
             y = [len(feat_no & label_yes), len(feat_no & label_no)]
             a, b, c, d = x[0], x[1], y[0], y[1]
             obs = numpy.array([x, y])
             self.feature_rank["info"][feat] = info_gain(obs)
         elif metric == "spearman":
             u = [self.label_func[id] for id in self.train_set]
             v = [feat_func[id] for id in self.train_set]
             rho, pval = stats.spearmanr(u, v)
             self.feature_rank["spearman"][feat] = abs(rho)
         else:
             feat_yes = set([id for id in self.train_set if feat_func[id] == 1])
             feat_no = set([id for id in self.train_set if feat_func[id] == 0])
             label_yes = set([id for id in self.train_set if self.label_func[id] == 1])
             label_no = set([id for id in self.train_set if self.label_func[id] == 0])
             x = [len(feat_yes & label_yes), len(feat_yes & label_no)]
             y = [len(feat_no & label_yes), len(feat_no & label_no)]
             a, b, c, d = x[0], x[1], y[0], y[1]
             obs = numpy.array([x, y])
             chi2, pval, dof, ex = stats.chi2_contingency(obs, correction=False)
             self.feature_rank[metric][feat] = 1-pval
Example #17
def calculateP(variables, k, data, WINDOW_LEN):
    
    freq_old = np.zeros(len(variables))
    freq = np.zeros(len(variables))

    for i in range(len(variables)):
        sample = data[k:k+WINDOW_LEN]
        freq_old[i] = sample.count(variables[i])

        sample = data[k+WINDOW_LEN : k+2*WINDOW_LEN]
        freq[i] = sample.count(variables[i])

    if (len(variables)==2):
        chi = chisquare(freq, freq_old)
        p = chi[1]
        # Tried the exact binomial goodness of fit method:
        # p = binom_test(freq, n=None, p=freq_old[0]/sum(freq_old))
        # The results were the same as Chi-square
    else:    
        if (sum(freq==0)>0 or sum(freq_old==0)>0):
            chi = chisquare(freq, freq_old)
        else:
            chi = chi2_contingency([freq,freq_old], correction=True)
        p = chi[1]
        
    return p
Example #18
 def test_run_device(self):
     backends = self._provider.available_backends({'simulator': False})
     self.log.info('devices: %s', [b.name for b in backends])
     backend = lowest_pending_jobs(backends)
     self.log.info('using backend: %s', backend.name)
     qobj = qiskit._compiler.compile(self._qc, backend)
     shots = qobj['config']['shots']
     quantum_job = QuantumJob(qobj, backend, preformatted=True)
     job = backend.run(quantum_job)
     while not (job.done or job.exception):
         self.log.info(job.status)
         time.sleep(4)
     if job.exception:
         raise job.exception
     self.log.info(job.status)
     result = job.result()
     counts_qx = result.get_counts(result.get_names()[0])
     counts_ex = {'00': shots/2, '11': shots/2}
     states = counts_qx.keys() | counts_ex.keys()
     # contingency table
     ctable = numpy.array([[counts_qx.get(key, 0) for key in states],
                           [counts_ex.get(key, 0) for key in states]])
     self.log.info('states: %s', str(states))
     self.log.info('ctable: %s', str(ctable))
     contingency = chi2_contingency(ctable)
     self.log.info('chi2_contingency: %s', str(contingency))
     self.assertDictAlmostEqual(counts_qx, counts_ex, shots*0.1)
Example #19
def choose_vocabulary(data):
	for assignIndex in range(len(data)):
		for innerIndex in range(len(data[assignIndex])):
			if data[assignIndex][innerIndex]==0:
				data[assignIndex][innerIndex]+=1
	chi2, p, dof, ex =stats.chi2_contingency(data)
	return p
Example #20
def Dep_GTest(C,X,S,M,alpha=0.05):
    C=np.array(C)
    X=np.array(X)
    g,p,dof,expected = stats.chi2_contingency(np.array([X,C]))#,lambda_='log-likelihood')
    if (p<=alpha):
        return True
    return False
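# Note: with lambda_='log-likelihood' commented out above, Dep_GTest actually
# computes Pearson's chi-square rather than a G-test. A sketch of the G-test
# variant:
import numpy as np
from scipy import stats

def dep_gtest(C, X, alpha=0.05):
    g, p, dof, expected = stats.chi2_contingency(np.array([X, C]),
                                                 lambda_='log-likelihood')
    return p <= alpha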
Example #21
def xtab(formula, covariate_df):
    y, X = patsy.dmatrices(str(formula), covariate_df)
    X = patsy.dmatrix('genotype', covariate_df)
    ix = get_genotype_ix(X)

    tbl = pd.crosstab(X[:, ix], y.ravel())
    try:
        tbl.columns = ['%s_%i' % (y.design_info.column_names[-1], j) for j in range(2)]
    except:
        return None # too few samples
    tbl.index = ['%i_alts' % i for i in tbl.index]
    alts = set(tbl.index)
    if len(alts) < 2 or not '0_alts' in alts:
        tbl_dom = None
    else:
        tbl_dom = pd.DataFrame({'0_alts': tbl.ix['0_alts', :], 'n_alts': tbl.ix[list(alts - set(['0_alts'])), :].sum()}).T

    # can't test recessive without any homoz alts.
    if not '2_alts' in alts or len(alts) < 2:
        tbl_rec = None
    else:
        tbl_rec = pd.DataFrame({'lt2_alts': tbl.ix[['0_alts', '1_alts'], :].sum(), '2_alts': tbl.ix['2_alts', :]})

    d = {}
    for name, xtbl in (('additive', tbl), ('dominant', tbl_dom), ('recessive', tbl_rec)):
        if xtbl is None:

            d['p.chi.%s' % name] =  'nan'
            continue

        chi, p, ddof, e = chi2_contingency(xtbl)
        if name == 'additive':
            d = xtbl.to_dict()
        d['p.chi.%s' % name] = "%.3g" % p
    return d
Example #22
    def calculate_associations(self, covariate='passage', lookup=None):
        '''
        calculate the association of amino acid state and
        sequence properties such as passage
        '''
        if not hasattr(self, 'mutation_count'):
            self.count_mutations_per_site()

        # calculate associations
        from scipy.stats import chi2_contingency
        self.associations = {}
        if lookup is None:
            lookup=lambda x:x

        # loop over all positions (currently rather clumsy)
        for prot, pos in mutation_dict:
            assoc = defaultdict(int)
            for node in self.tree.get_terminals(): # extract info from each node
                if hasattr(node, covariate):
                    assoc[(node.translations[prot][pos-1], lookup(getattr(node, covariate)))]+=1

            # make contingency matrix
            aa_states = sorted(set([x[0] for x in assoc]))
            cov_states = sorted(set([x[1] for x in assoc]))
            contingency_matrix = np.zeros((len(aa_states), len(cov_states)))
            for a, c in assoc:
                contingency_matrix[aa_states.index(a), cov_states.index(c)] = assoc[(a,c)]
            g, p, dof, expctd = chi2_contingency(contingency_matrix, lambda_="log-likelihood")
            assoc['contingency matrix'] = contingency_matrix
            assoc['aa']=aa_states
            assoc['covariates']=cov_states
            assoc['g_test'] = (g,p)

            self.associations[(prot, pos)] = assoc
Example #23
def MK_test(SNPs, test_mode):
    '''
    (dict, str) -> dict
    Take a dict of gene : [PN, PS, DN, DS] pairs and a string fisher or G_test
    and return a new dict with gene : [PN, PS, DN, DS, p-val] pairs
    with PN and DN being respectively replacement polymorphisms and divergence
    and PS and DS being respectively synonymous polymorphisms and divergence
    and p-val being the p-value of the contingency test using either Fisher's
    two-sided exact test or the G-test with Yates' correction
    '''
    
    # create new dict
    MK = {}    
    
    # loop over genes in dict
    for gene in SNPs:
        # initialize list with PN, PS
        polym = [SNPs[gene][0], SNPs[gene][1]]
        # initialize list with DN, DS
        diverg = [SNPs[gene][2], SNPs[gene][3]]
        # perform the MK test according to fisher 2-tailed or G-test
        if test_mode == 'fisher':
            # get the p-value
            P = stats.fisher_exact([polym, diverg])[1]
        elif test_mode == 'G_test':
            P = stats.chi2_contingency([polym, diverg], lambda_ = 'log-likelihood')[1]
        # add p-val to list
        MK[gene] = list(SNPs[gene])
        MK[gene].append(P)
        
    return MK
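# A hypothetical call with made-up counts in the order [PN, PS, DN, DS],
# assuming scipy.stats is imported as stats (as the function requires):
snps = {'geneA': [12, 30, 25, 80], 'geneB': [4, 10, 9, 40]}
mk = MK_test(snps, 'G_test')
# each value in mk is now [PN, PS, DN, DS, p-value]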
Example #24
 def contingency_table(self, dead_strains, live_strains, output_file):
     elem_intervals = self.make_elementary_intervals(
         [self.sample_dict[sn][0] for sn in dead_strains + live_strains]
     )
     num_dead = len(dead_strains)
     num_live = len(live_strains)
     dead_observed = self.build_pairwise_matrix(dead_strains, elem_intervals)
     live_observed = self.build_pairwise_matrix(live_strains, elem_intervals)
     with open(output_file, 'w+') as fp:
         writer = csv.writer(fp)
         writer.writerow(['Proximal chromosome', 'Proximal start', 'Proximal end',
                          'Distal chromosome', 'Distal start', 'Distal end',
                          'Proximal origin', 'Distal origin', 'chi squared', 'p-value'])
         elem_intervals.insert(0, 0)
         for combo in xrange(subspecies.NUM_SUBSPECIES**2):
             for i in xrange(len(elem_intervals)-1):
                 for j in xrange(i+1, len(elem_intervals)-1):
                     if dead_observed[combo, i, j] and live_observed[combo, i, j]:
                         contingency = np.array([[dead_observed[combo, i, j], live_observed[combo, i, j]],
                                                 [num_dead-dead_observed[combo, i, j],
                                                  num_live-live_observed[combo, i, j]]])
                         chi_squared, p, _, _ = stats.chi2_contingency(contingency)
                         proximal_pos = self.chrom_and_pos(elem_intervals[i], elem_intervals[i+1])
                         distal_pos = self.chrom_and_pos(elem_intervals[j], elem_intervals[j+1])
                         writer.writerow(proximal_pos + distal_pos +
                                         (subspecies.proximal(combo), subspecies.distal(combo), chi_squared, p))
Example #25
	def myChisquare(self, values):
		# Uses chi2_contingency; returns (0.0, 1.0) if any expected count is < 5
		values = [pair for pair in values if not np.all(np.array(pair) == 0)]
		chi2, p, dof, ex = chi2_contingency(values)
		if (ex < 5).sum() > 0:
			return 0.0, 1.0
		# print chi2, p, dof
		return chi2, p
Example #26
File: Stats.py Project: paweus/pite
    def chi(data1, data2):
        obs = np.array([data1, data2])
        try:
            chi2, p, dof, expected = stats.chi2_contingency(obs)
        except ValueError:
            print 'Chi2 error'
            return None
        return chi2
Example #27
 def chi_square_of_df_cols(self, df, col1, col2):
     df_col1, df_col2 = df[col1], df[col2]
 
     result = [[sum((df_col1 == cat1) & (df_col2 == cat2))
                for cat2 in self.categories(df_col2)]
               for cat1 in self.categories(df_col1)]
 
     return stats.chi2_contingency(result)
Example #28
def chiSqQuant(x, y, num_states_x, num_states_y):
    if num_states_x == 1 or num_states_y == 1:
        return (1, 0)    
    x = x - min(x)
    y = y - min(y)
    n_mat = hist3(x, y, range(num_states_x), range(num_states_y))
    T, result, _, _ = chi2_contingency(n_mat)
    return (result, T)
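# hist3 is not shown; judging from its call it builds the joint count table of
# x and y over the given state ranges. A sketch (an assumption, workable
# because chiSqQuant shifts both variables to start at 0):
import numpy as np

def hist3(x, y, x_states, y_states):
    table = np.zeros((len(x_states), len(y_states)), dtype=int)
    for xi, yi in zip(x, y):
        table[xi, yi] += 1
    return table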
Example #29
def doHitProcess(inp):
	idx, hits, n_f1_hits, n_f2_hits = inp
	if hits[0] == 0 and hits[1] == 0: return
	if hits[0] == 0: return idx, 999.0, 0, 0, hits[1], float(hits[1])/float(n_f2_hits), 'NA', 'NA'
	if hits[1] == 0: return idx, 0.0, hits[0], float(hits[0])/float(n_f1_hits), 0, 0, 'NA', 'NA'
	h1_p = float(hits[0])/float(n_f1_hits)
	h2_p = float(hits[1])/float(n_f2_hits)
	chi, pvalue, _, _ = stats.chi2_contingency([[hits[1],n_f2_hits-hits[1]],[hits[0],n_f1_hits-hits[0]]])
	return idx, round(h2_p/h1_p,3), hits[0], h1_p, hits[1], h2_p, chi, pvalue
Example #30
def first_sec(stats):
    obs = [[0, 0, 0], [0, 0, 0]]
    for l in stats:
        if l[5] == 1:
            add_data(l[2], obs[0])
        else:
            add_data(l[2], obs[1])

    return chi2_contingency(obs)[0:2]
Example #31
 def chi2_homogeneity(c_tbl):
     return chi2_contingency(c_tbl)
Example #32
searchdata_file = '../data/searches.json'

searches = pd.read_json(searchdata_file, lines=True)
odd_id = searches[(searches['uid'] % 2 != 0)]
even_id = searches[(searches['uid'] % 2 == 0)]
odds_searched = odd_id[(odd_id['search_count'] > 0)]
odd_unsearched = odd_id[(odd_id['search_count'] == 0)]

evens_searched = even_id[(even_id['search_count'] > 0)]
evens_unsearched = even_id[(even_id['search_count'] == 0)]

"ANALYSIS"

obs1 = np.array([[odds_searched.shape[0], odd_unsearched.shape[0]],
                 [evens_searched.shape[0], evens_unsearched.shape[0]]])
chi = chi2_contingency(obs1)
mannwhitneyu = stats.mannwhitneyu(odd_id['search_count'],
                                  even_id['search_count'])
"""
# INFUSER DOES NOT ACCEPT THE FOLLOWING LINES: UNABLE TO JUDGE TYPE FOR EXPRESSION
odds_searched = odds_searched[(odds_searched['is_instructor'] == True)]
odd_unsearched = odd_unsearched[(odd_unsearched['is_instructor'] == True)]



evens_searched = evens_searched[(evens_searched['is_instructor'] == True)]
evens_unsearched = evens_unsearched[(evens_unsearched['is_instructor'] == True)]

odd_id = odd_id[(odd_id['is_instructor'] == True)]
even_id = even_id[(even_id['is_instructor'] == True)]
"""
Example #33
expected.columns = ["democrat", "independent", "republican"]
expected.index = ["asian", "black", "hispanic", "other", "white"]

print(expected)

chi_squared_stat = (((observed - expected)**2) / expected).sum().sum()

print(chi_squared_stat)

crit = stats.chi2.ppf(
    q=0.95,  # find the critical value for 95% confidence
    df=8)    # df = (rows - 1) * (cols - 1) = (5 - 1) * (3 - 1) = 8

print("Critical value")
print(crit)

p_value = 1 - stats.chi2.cdf(
    x=chi_squared_stat,  # Find the p-value
    df=8)
print("P value")
print(p_value)

print(stats.chi2_contingency(observed=observed))

print(
    "If the p-value is less than 0.05, we reject the null hypothesis that there's no association between the variables and conclude that a significant relationship does exist"
)
print(
    "As expected, given the high p-value, the test result does not detect a significant relationship between the variables."
)
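# observed is not defined in this excerpt; it is a 5x3 race-by-party count
# table, which is where df = (5 - 1) * (3 - 1) = 8 comes from. A
# self-contained sketch with made-up counts showing that chi2_contingency
# reproduces the manual computation:
import numpy as np
import pandas as pd
from scipy import stats

observed = pd.DataFrame(
    [[21, 7, 32], [65, 25, 64], [107, 50, 94], [15, 8, 15], [189, 96, 212]],
    index=["asian", "black", "hispanic", "other", "white"],
    columns=["democrat", "independent", "republican"])
expected = pd.DataFrame(
    np.outer(observed.sum(axis=1), observed.sum(axis=0)) / observed.values.sum(),
    index=observed.index, columns=observed.columns)
chi2_manual = (((observed - expected) ** 2) / expected).sum().sum()
# chi2_manual matches stats.chi2_contingency(observed)[0]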
Example #34
app_pivot['Percent with Application'] = app_pivot.Application / app_pivot.Total
app_pivot

# It looks like more people from Group B turned in an application.  Why might that be?
#
# We need to know if this difference is statistically significant.
#
# Choose a hypothesis test, import it from `scipy`, and perform it.  Be sure to note the p-value.
# Is this result significant?

# In[36]:

from scipy.stats import chi2_contingency

contingency = [[250, 2254], [325, 2175]]
chi2_contingency(contingency)
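# Unpacking the result to check significance (a sketch):
chi2_stat, pval, dof, expected = chi2_contingency(contingency)
# pval comes out around 0.001, well below 0.05, so the difference in
# application rates between the two groups is statistically significant.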

# ## Step 4: Who purchases a membership?

# Of those who picked up an application, how many purchased a membership?
#
# Let's begin by adding a column to `df` called `is_member` which is `Member` if `purchase_date` is not `None`, and `Not Member` otherwise.

# In[35]:

df['is_member'] = df.purchase_date.apply(lambda x: 'Member'
                                         if pd.notnull(x) else 'Not Member')

# Now, let's create a DataFrame called `just_apps` that contains only people who picked up an application.

# In[38]:
Example #35
def q_counts():
    # Statistics : Q-Feature (mean, std, number_samples)
    f = {
        'q_ends': (0.00885, 0.09388, 1238),
        'q_contains': (0.022617, 0.14874, 1238)
    }
    o = {
        'q_ends': (0.090437, 0.286955, 962),
        'q_contains': (0.133056, 0.339812, 962)
    }
    a = {
        'q_ends': (0.025316, 0.157284, 395),
        'q_contains': (0.075949, 0.265253, 395)
    }

    d = read_clean_dataset()
    q = read_pickle_file(_feature_file_map['Q'])
    q['Stance'] = d.articleHeadlineStance

    # Run the t-test!
    for feature in ['q_ends', 'q_contains']:
        mean_f, std_f, n_f = f[feature]
        mean_a, std_a, n_a = a[feature]
        mean_o, std_o, n_o = o[feature]
        # Run the actual test
        _, p_fo = ttest_ind_from_stats(mean1=mean_f,
                                       std1=std_f,
                                       nobs1=n_f,
                                       mean2=mean_o,
                                       std2=std_o,
                                       nobs2=n_o)
        _, p_fa = ttest_ind_from_stats(mean1=mean_f,
                                       std1=std_f,
                                       nobs1=n_f,
                                       mean2=mean_a,
                                       std2=std_a,
                                       nobs2=n_a)
        _, p_ao = ttest_ind_from_stats(mean1=mean_a,
                                       std1=std_a,
                                       nobs1=n_a,
                                       mean2=mean_o,
                                       std2=std_o,
                                       nobs2=n_o)

        print(f"""P-values ({feature})
                    1) For - Against: {p_fa}
                    2) Observing - Against: {p_ao}
                    3) For - Observing: {p_fo}""")

        # Chi-square test for dependency between feature and stance
        contingency_table = pd.crosstab(q['Stance'], q[feature], margins=False)

        chi2_stat, p_val, dof, ex = stats.chi2_contingency(contingency_table)

        print("\n")
        print(f"""=== Chi2 Stat ({feature}) ===""")
        print(chi2_stat)
        print("\n")
        print("===Degrees of Freedom===")
        print(dof)
        print("\n")
        print("===P-Value===")
        print(p_val)
        print("\n")
        print("===Contingency Table===")
        print(ex)
Example #36
def get_bias_chi2_pvals(clf,
                        df,
                        feature_names,
                        categories,
                        low=None,
                        high=None,
                        num=100):
    """
    Get p-values across a range of decision thresholds

    Parameters
    ------------
    clf : sklearn clf object
        model classifier, must have a `decision_function` or
        `predict_proba` method
    df : pandas DataFrame
        contains untransformed data
    feature_names : list of strings
        features included in the classifier
    categories : list of strings
        names of demographic columns to check, e.g. ['gender', 'ethnicity']
    low : float
        lower threshold value
    high : float
        upper threshold value
    num : int
        number of thresholds to consider

    Returns
    ---------
    thresholds_to_check : range of floats
        decision thresholds obtained by np.linspace(low, high,num)
    post_chi2stat_pvals : defaultdict(list)
        containing categories' chi2 statistics and p_vals at a range
        of thresholds

    """

    # get decision score for each user and sort by the score
    # this sort makes finding who matches at a threshold easy
    X = df[feature_names].values

    # subsequent modifications on copy of the input dataframe
    df = df.copy()
    clf = ClassifierWrapper(clf)
    df['decision'] = clf.decision_function(X)
    # allow for older and newer pandas sorting schemes
    if hasattr(df, 'sort_values'):
        sorted_df = df.reindex(
            df.sort_values('decision', ascending=False).index)
    else:
        sorted_df = df.reindex(df.sort('decision', ascending=False).index)

    matched_col = get_unique_name('matched', df.columns)

    # define range of values to test over if not inputted
    if low is None:
        low = df.decision.min()
    if high is None:
        high = df.decision.max()

    n_samples = sorted_df.shape[0]
    thresholds_to_check = np.linspace(low, high, num)
    post_chi2stat_pvals = defaultdict(list)

    for threshold in thresholds_to_check:
        # set the top 1-threshold proportion of sample to 1 (match) and the
        # rest to 0 (not match)
        num_matches = int(n_samples * (1 - threshold))
        num_not_matches = n_samples - num_matches
        sorted_df[matched_col] = ([1] * num_matches) + ([0] * num_not_matches)

        for category in categories:
            # get p-values for non-nan values
            category_vals = set(sorted_df[category].dropna())
            cat_df = sorted_df[sorted_df[category].isin(category_vals)]
            cat_ctabs = pd.crosstab(cat_df[matched_col], cat_df[category])
            chi2_stat, chi2_pval = chi2_contingency(cat_ctabs)[:2]
            post_chi2stat_pvals[category].append((chi2_stat, chi2_pval))

    return thresholds_to_check, post_chi2stat_pvals
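# A hypothetical usage sketch; clf, df, and the column names below are
# assumptions, and the module helpers used above (ClassifierWrapper,
# get_unique_name) must be importable alongside get_bias_chi2_pvals:
#
#   thresholds, chi2_pvals = get_bias_chi2_pvals(
#       clf, df, feature_names=['age', 'score'],
#       categories=['gender', 'ethnicity'], num=50)
#   for chi2_stat, pval in chi2_pvals['gender']:
#       print(chi2_stat, pval)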
Example #37
thalach_typical = heart.thalach[heart.cp == 'typical angina']
thalach_asymptom = heart.thalach[heart.cp == 'asymptomatic']
thalach_nonangin = heart.thalach[heart.cp == 'non-anginal pain']
thalach_atypical = heart.thalach[heart.cp == 'atypical angina']

# run ANOVA
from scipy.stats import f_oneway

Fstat, pval = f_oneway(thalach_typical, thalach_asymptom, thalach_nonangin,
                       thalach_atypical)
print('p-value for ANOVA: ', pval)
#there is at least one pair of chest pain types (cp) for which people with those pain types have significantly different average max heart rates during exercise (thalach)

# run Tukey's range test
from statsmodels.stats.multicomp import pairwise_tukeyhsd

output = pairwise_tukeyhsd(heart.thalach, heart.cp)
print(output)
#For any pair where “Reject” is “True”, we conclude that people with those chest pain types have significantly different maximum heart rates during exercise

# contingency table of heart disease vs cp
Xtab = pd.crosstab(heart.cp, heart.heart_disease)
print(Xtab)

# run chi-square test
from scipy.stats import chi2_contingency

chi2, pval, dof, exp = chi2_contingency(Xtab)
print('p-value for chi-square test: ', pval)
#This is less than 0.05, so we can conclude that there is a significant association between these variables.
Example #38
    def calculate(self):
        try:
            if self.df.shape[1] != 2 or len(self.batchsize) != 2:
                raise ValueError(
                    'Lengths of survival_rate and batchsize must be =2')
        except ValueError as ve:
            print(ve)

        try:
            if min(list(self.df.nunique())) == 0:
                raise ValueError('One or more columns in dataframe is empty')
        except ValueError as ve:
            print(ve)

        [a_key, b_key] = list(self.batchsize.keys())
        a = self.df.loc[:, a_key]
        b = self.df.loc[:, b_key]

        nRuns = math.floor(
            min(a.shape[0] / self.batchsize[a_key],
                b.shape[0] / self.batchsize[b_key]))
        a_end = -1
        b_end = -1
        Cumm_P_val = np.zeros(nRuns)  # This variable shows cumulative P value

        # Loop to find cumulative P value, by increasing sample size in each run
        for i in range(nRuns):
            a_end = a_end + self.batchsize[a_key]
            b_end = b_end + self.batchsize[b_key]
            a_pass = a[0:a_end].sum()
            a_fail = self.batchsize[a_key] * (i + 1) - a[0:a_end].sum()
            b_pass = b[0:b_end].sum()
            b_fail = self.batchsize[b_key] * (i + 1) - b[0:b_end].sum()
            ContingencyTable = np.array([[a_pass, a_fail], [b_pass, b_fail]])
            if np.min(ContingencyTable) == 0:
                # P value cannot be determined if one or more values in the ContingencyTable is zero
                Cumm_P_val[i] = np.nan
            else:
                (chi1, Cumm_P_val[i], DOF,
                 expected) = stats.chi2_contingency(ContingencyTable,
                                                    correction=False)

        # Plot cumulative p values for all runs
        x = (np.arange(1, nRuns + 1))
        Cumm_P_val = pd.DataFrame(list(zip(x, Cumm_P_val)),
                                  columns=['N_runs', 'Cummulative_P_Value'])
        Cumm_P_val = Cumm_P_val.dropna()

        ax = Cumm_P_val.plot(x='N_runs',
                             y='Cummulative_P_Value',
                             grid=True,
                             label='p value')
        plt.plot(np.ones(np.max(x)) * 0.05,
                 color='red',
                 ls="--",
                 label='alpha=0.05')
        ax.set_title("Chi2 Results")
        ax.set_xlabel('N Runs')
        ax.set_ylabel('P Value')
        ax.set_xticks(np.arange(0, nRuns + 1, 10))
        ax.legend()

        def run2samples(x):
            return x * (self.batchsize[a_key] + self.batchsize[b_key])

        def samples2run(x):
            return x / (self.batchsize[a_key] + self.batchsize[b_key])

        secax = ax.secondary_xaxis('top', functions=(run2samples, samples2run))
        secax.set_xlabel('Total Samples tested')
        #secax.set_xticks(np.arange(0, (nRuns+1)*(self.batchsize[a_key]+self.batchsize[b_key]), 10))
        plt.show()
        return None
Example #39
# Also not very fruitful

# What is the overall conversion rate for each group?
conversions = tests.groupby('price').aggregate(
    conversion_rate=('converted', lambda x: sum(x) / len(x)),
    conversion_count=('converted', 'sum'),
    nonconversion_count=('converted', lambda x: len(x) - sum(x)),
    visitor_count=('user_id', 'count'))
conversions = conversions.reset_index()
conversions['revenue_per_visitor'] = conversions[
    'conversion_count'] * conversions['price'] / conversions['visitor_count']
print(conversions)
# Even with the decrease in conversion rate, the revenue earned per visitor is up by $0.14

# Is the difference in conversion rate significant?
chi2, pvalue, dof, ex = chi2_contingency(
    conversions[['conversion_count', 'nonconversion_count']].transpose())
print(
    'The decreased conversion rate of {:.3f} is statistically significant with p={:.3f}'
    .format(conversions['conversion_rate'].diff().max(), pvalue))

# Is the difference in revenue significant?
# this doesn't seem like a valid question to ask here because it's just another version of "are these two numbers different?"

# Plot all of the data!

# Boxplots of each variable by conversion
tests_melted = tests.melt(
    id_vars=['user_id', 'timestamp', 'converted', 'test', 'price'])
tests_melted_conversion_rate = tests_melted.groupby(
    ['variable', 'value',
     'test']).agg(conversion_rate=('converted', lambda x: sum(x) / len(x)),