Example #1
    def test_random_choice(self):
        random_state = random.getstate()
        random.seed(42)
        random_choice_transform = transforms.RandomChoice(
            [
                transforms.Resize(15),
                transforms.Resize(20),
                transforms.CenterCrop(10)
            ]
        )
        img = transforms.ToPILImage()(torch.rand(3, 25, 25))
        num_samples = 250
        num_resize_15 = 0
        num_resize_20 = 0
        num_crop_10 = 0
        for _ in range(num_samples):
            out = random_choice_transform(img)
            if out.size == (15, 15):
                num_resize_15 += 1
            elif out.size == (20, 20):
                num_resize_20 += 1
            elif out.size == (10, 10):
                num_crop_10 += 1

        p_value = stats.binom_test(num_resize_15, num_samples, p=0.33333)
        assert p_value > 0.0001
        p_value = stats.binom_test(num_resize_20, num_samples, p=0.33333)
        assert p_value > 0.0001
        p_value = stats.binom_test(num_crop_10, num_samples, p=0.33333)
        assert p_value > 0.0001

        random.setstate(random_state)
        # Checking if RandomChoice can be printed as string
        random_choice_transform.__repr__()
Example #2
 def test_data(self):
     pval = stats.binom_test(100,250)
     assert_almost_equal(pval,0.0018833009350757682,11)
     pval = stats.binom_test(201,405)
     assert_almost_equal(pval,0.92085205962670713,11)
     pval = stats.binom_test([682,243],p=3.0/4)
     assert_almost_equal(pval,0.38249155957481695,11)
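
Note: scipy.stats.binom_test was deprecated in favour of scipy.stats.binomtest and removed in SciPy 1.12, so the snippets on this page need an older SciPy. A minimal sketch of the equivalent modern calls, assuming SciPy >= 1.7 (where binomtest was introduced):

from scipy.stats import binomtest

pval = binomtest(100, 250).pvalue                 # two-sided by default
pval = binomtest(201, 405).pvalue
pval = binomtest(682, 682 + 243, p=3.0/4).pvalue  # replaces binom_test([682, 243], p=3.0/4)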
Example #3
    def test_random_horizontal_flip(self):
        random_state = random.getstate()
        random.seed(42)
        img = transforms.ToPILImage()(torch.rand(3, 10, 10))
        himg = img.transpose(Image.FLIP_LEFT_RIGHT)

        num_samples = 250
        num_horizontal = 0
        for _ in range(num_samples):
            out = transforms.RandomHorizontalFlip()(img)
            if out == himg:
                num_horizontal += 1

        p_value = stats.binom_test(num_horizontal, num_samples, p=0.5)
        random.setstate(random_state)
        assert p_value > 0.0001

        num_samples = 250
        num_horizontal = 0
        for _ in range(num_samples):
            out = transforms.RandomHorizontalFlip(p=0.7)(img)
            if out == himg:
                num_horizontal += 1

        p_value = stats.binom_test(num_horizontal, num_samples, p=0.7)
        random.setstate(random_state)
        assert p_value > 0.0001

        # Checking if RandomHorizontalFlip can be printed as string
        transforms.RandomHorizontalFlip().__repr__()
Example #4
    def test_alternatives(self):
        res = stats.binom_test(51, 235, p=1./6, alternative='less')
        assert_almost_equal(res, 0.982022657605858)

        res = stats.binom_test(51, 235, p=1./6, alternative='greater')
        assert_almost_equal(res, 0.02654424571169085)

        res = stats.binom_test(51, 235, p=1./6, alternative='two-sided')
        assert_almost_equal(res, 0.0437479701823997)
Example #5
def contam_contig(nmapped_sink, nmapped_source, contam_libs, KS_THRESHOLD=0.001, P_RATIO=0.0001):
    """
    determine if a contig is contaminated
    """
    # count # of sources
    nsource = 0
    prop=0.
    N=0
    mssg=''
    for key in contam_libs:
        mssg+=key+','
    logging.info(mssg)
    max_f = 0  # track the library that contributed the most weight to prop
    lib = None
    for lib_scr in nmapped_source:
        #print lib_scr
        if lib_scr in contam_libs:
            # found a source lib
            nsource += 1
            f=float(contam_libs[lib_scr][0])/float(contam_libs[lib_scr][1])*nmapped_source[lib_scr].n_mapped_reads
            prop+=f      #proportion
            N+=nmapped_source[lib_scr].n_mapped_reads
            mssg = '%s %d' % (lib_scr, N)
            logging.info(mssg)
            if f > max_f:
                max_f = f
                lib = lib_scr
    if N!=0:
        prop /= N
    else:
        return False
    mssg='max lib %s, # of sources %d, P is %f, nmap_sink is %d, nmap_src is %d' % (lib, nsource, prop, nmapped_sink[1], N)
    logging.info(mssg)
    mssg='bin.test %g' % stats.binom_test(nmapped_sink[1], nmapped_sink[1]+N, prop/(1.+prop))
    logging.info(mssg)
    mssg='%g ~ %g, p-value %g' % (prop, nmapped_sink[1]/float(N), stats.binom_test(contam_libs[lib][0], contam_libs[lib][0]+contam_libs[lib][1], prop/(1.+prop)))
    logging.info(mssg)
    if nsource == 1:
        mssg = 'fisher exact %g' % stats.fisher_exact([[contam_libs[lib][0], contam_libs[lib][1]], [nmapped_sink[1], nmapped_source[lib].n_mapped_reads]])[1]
    else:
        mssg = 'bin.test %g' % stats.binom_test(nmapped_sink[1], nmapped_sink[1]+N, prop/(1.+prop))
    logging.info(mssg)

    if nsource > 1 and stats.binom_test(contam_libs[lib][0], contam_libs[lib][0]+contam_libs[lib][1], prop/(1.+prop)) > 0.05:
        nsource=1
    if nsource == 0 or (nsource == 1 and nmapped_source[lib].similar < KS_THRESHOLD) \
    or (nsource==1 and stats.fisher_exact([[contam_libs[lib][0], contam_libs[lib][1]], [nmapped_sink[1], nmapped_source[lib].n_mapped_reads]])[1] < P_RATIO)\
    or (stats.binom_test(nmapped_sink[1], nmapped_sink[1]+N, prop/(1.+prop)) < P_RATIO):
        # not a contam
        return False

    slist=[]
    for lib_scr in nmapped_source:
        if lib_scr in contam_libs:
            slist.append(lib_scr)

    logging.info(','.join(slist))
    return True
Example #6
def binomial_up_down_test(up, down, crit):
    if up >= down:
        direction = 'Up'
        pvalue = stats.binom_test(up, up + down, 0.5)
    else:
        direction = 'Down'
        pvalue = stats.binom_test(down, up + down, 0.5)
    if pvalue <= crit:
        return {'direction':direction, 'pvalue':pvalue}
    return {}
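
A usage sketch with made-up counts, assuming "from scipy import stats" as the snippet above implies: 30 up-moves against 12 down-moves is unlikely under a fair 50/50 split, so a direction is reported; a 22/20 split is not, so the empty dict comes back.

from scipy import stats

print(binomial_up_down_test(30, 12, 0.05))  # {'direction': 'Up', 'pvalue': ...}, pvalue < 0.05
print(binomial_up_down_test(22, 20, 0.05))  # {}, since binom_test(22, 42, 0.5) is far above 0.05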
Example #7
    def _assess_performance(self, pfx, result, container=None):
        """ Computes performance measures for the model predictions

        Parameters
        ----------
        pfx : str
            specifies the result type ('' for a case with only one channel or for the
            channel-averaged results, '%g_' for channel-specific results)
        result : List<dict>
            List of result dictionaries for each seed
        container : dict, None
            Optionally provide a dictionary in which the performance assessment should be stored.

        Returns
        -------
        dict
            dictionary filled with performance measures

        """
        n_seeds = len(result)
        if container is None:
            container = dict()
        container[pfx + 'predictions'] = \
            np.mean([result[s]['y_pred'] for s in range(n_seeds)], axis=0).tolist()

        container[pfx + 'scoring_seed'] = \
            [self.scoring(self.y_true, result[s]['y_pred'] if self.is_regression
             else np.round(result[s]['y_pred'])) for s in range(n_seeds)]
        container[pfx + 'scoring_ste'] = \
            (np.std(container[pfx + 'scoring_seed']) / np.sqrt(n_seeds)).tolist()
        container[pfx + 'scoring'] = \
            self.scoring(self.y_true, np.round(container[pfx + 'predictions'])).tolist()

        if self.is_regression:
            container[pfx + 'correct'] = None
            container[pfx + 'explained_variance'] = \
                explained_variance_score(self.y_true, container[pfx + 'predictions']).tolist()
        else:
            container[pfx + 'correct'] = (container[pfx + 'predictions'] == self.y_true).tolist()
            p_binom = max([np.mean(self.scheme.data[0].labels == label)
                           for label in np.unique(self.scheme.data[0].labels)])
            container[pfx + 'binom_statistic'] = \
                [binom_test(sum(result[s]['y_pred'] == self.y_true), self.n_samples)
                 for s in range(n_seeds)]
            container[pfx + 'binom_statistic_corrected'] = \
                [binom_test(sum(result[s]['y_pred'] == self.y_true), self.n_samples, p=p_binom)
                 for s in range(n_seeds)]
            container[pfx + 'confusion_matrix'] = \
                confusion_matrix(self.y_true, np.round(container[pfx + 'predictions'])).tolist()
            container[pfx + 'classification_report'] = \
                classification_report(self.y_true, np.round(container[pfx + 'predictions']),
                                      target_names=self.label_names,
                                      labels=np.unique(self.scheme.data[0].labels))

        return container
Example #8
def contam_ratio(n50, nmapped_source, nmapped_sink, KS_THRESHOLD=0.001, KS_FLAT_THRESHOLD=0.01, P_RATIO=0.05, niter=3):
    # n_contig_weighted_by_read: 90% of the reads are mapped to the first n50 contigs of the source
    ns=n50
    for i in range(n50):
        if nmapped_source[i][0] not in nmapped_sink:    # this means n_mapped_reads == 0
            ns=i    
            break
    
    # compute intial ratio
    nsource=0
    nsink=0        
    for i in range(ns):
        contig=nmapped_source[i][0]
        if nmapped_sink[contig].similar > KS_THRESHOLD and nmapped_sink[contig].flatness < KS_FLAT_THRESHOLD:
            nsource+=nmapped_source[i][1]
            nsink+=nmapped_sink[contig].n_mapped_reads
    print(n50, ns, nsink, nsource)
    if nsource + nsink == 0:  # now relax flatness 
        for i in range(ns):
            contig=nmapped_source[i][0]
            if nmapped_sink[contig].similar > KS_THRESHOLD:
                nsource+=nmapped_source[i][1]
                nsink+=nmapped_sink[contig].n_mapped_reads
    if nsource + nsink == 0:
        # NOTE: tot_mapped is not defined in this snippet; it presumably comes from the enclosing module
        return (tot_mapped, n50, nsink, nsource, ns, ns)
    else:
        p=float(nsink)/(nsink+nsource)  # ratio for binomial test, differs from contam ratio
    for i in range(n50):
        contig = nmapped_source[i][0]
        if contig not in nmapped_sink:  # guard first: the stats below index nmapped_sink[contig]
            continue
        pbi = stats.binom_test(nmapped_sink[contig].n_mapped_reads,
                               nmapped_sink[contig].n_mapped_reads + nmapped_source[i][1], p)
        pchi2 = stats.chisquare([nmapped_sink[contig].n_mapped_reads],
                                [p * nmapped_source[i][1]])[1]
        mssg = 'contig: %s,\tratio is %d / %d = %g,\tks_profile=%g,\tks_flat=%g,\tp-value=%g, ch2=%g' % (
            contig, nmapped_sink[contig].n_mapped_reads, nmapped_source[i][1],
            nmapped_sink[contig].n_mapped_reads / float(nmapped_source[i][1]),
            nmapped_sink[contig].similar, nmapped_sink[contig].flatness,
            pbi, pchi2)
        logging.info(mssg)

    # binomial test
    for _ in range(niter):
        nsource=0
        nsink=0
        ns_removed=0
        for i in range(ns):
            contig=nmapped_source[i][0]
            if contig in nmapped_sink and stats.binom_test(nmapped_sink[contig].n_mapped_reads, nmapped_sink[contig].n_mapped_reads+nmapped_source[i][1], p) > P_RATIO:
                nsource+=nmapped_source[i][1]
                nsink+=nmapped_sink[contig].n_mapped_reads
            else:
                ns_removed+=1
        if nsource + nsink == 0:
            return (tot_mapped, n50, nsink, nsource, ns, ns_removed)
        p=float(nsink)/(nsink+nsource)
    return (tot_mapped, n50, nsink, nsource, ns, ns_removed)
Example #9
def output_results(snps, out_file, groups, alpha):
    """Write results to file
    """
    with open(out_file, "w") as f:
        group_headers = []
        for i in range(len(groups)):
            group_headers += ["Freq_1", "Freq_2", "Fold_change",
                              "P_value", "Summary"]
        group_headers = "\t".join(group_headers)
        f.write("\t".join(["Contig_pos", "Allel_1", "Allel_2", 
                "Freq_1", "Freq_2", "Fold_change", "P_value",
                group_headers,
                "Global_summary"]) + "\n")
        for loc in sorted(snps):
            allels = snps[loc][0][2:4]
            loc_results = [loc, allels[0], allels[1]]
            glob_freqs = count_variants(snps[loc])
            loc_results += glob_freqs
            glob_fold_change = float(glob_freqs[0]) / glob_freqs[1]
            loc_results.append(glob_fold_change)
            glob_p_value = binom_test([glob_freqs[0], glob_freqs[1]])
            loc_results.append(glob_p_value)
            glob_synthesis = categorize_fold_change(glob_fold_change, 
                                                    glob_p_value, alpha)
            for g in groups:
                individuals = [x for x in snps[loc] if x[0] == g]
                ind_freqs = count_variants(individuals)
                loc_results += ind_freqs
                try:
                    fold_change = float(ind_freqs[0]) / ind_freqs[1]
                except ZeroDivisionError:
                    fold_change = -99
                loc_results.append(fold_change)
                p_value = binom_test([ind_freqs[0], ind_freqs[1]])
                loc_results.append(p_value)
                synthesis = categorize_fold_change(fold_change, p_value, alpha)
                glob_synthesis += synthesis
                stat = [0, 0, len(individuals)]
                for i in individuals:
                    a = genotype(i)
                    if categorize_fold_change(a.fold_change, a.p_value, 
                        alpha) == synthesis:
                        stat[1] += 1
                    elif categorize_fold_change(a.fold_change, a.p_value,
                        alpha) == categorize_fold_change(fold_change, 1, alpha):
                        stat[0] += 1
                synthesis += "%i/%i/%i" % (stat[0], stat[1], stat[2])
                loc_results.append(synthesis)
            loc_results.append(glob_synthesis)
            tabulated_results = "\t".join([str(x) for x in loc_results])
            f.write(tabulated_results + "\n")
Example #10
 def logp(model_distrib=sample_eyetrace, value=observations):
     fix_logp = compare_fixlens(model_distrib[1], value[1])
     tr = model_distrib[0]*samp_pres
     tr_true = value[0]
     zt_0 = [sts.binom_test(x, samp_pres, tr_true[i*test_interval, 0]) 
             for i, x in enumerate(tr[::test_interval, 0])]
     zt_1 = [sts.binom_test(x, samp_pres, tr_true[i*test_interval, 1]) 
             for i, x in enumerate(tr[::test_interval, 1])]
     traj_p = np.prod(zt_0)*np.prod(zt_1)
     print('traj_p', traj_p)
     full_logp = np.log(traj_p) + fix_logp
     print('compfixlen', full_logp, model_distrib)
     return full_logp
Example #11
 def getSignificance(self):
     '''
     @return: The statistical significance of the comparison. 
              Higher value means the difference is more significant.
              Identical agents should return zero.
     '''
     if self.matrix[False][True] < self.matrix[True][False]:
         p = binom_test([self.matrix[True][False], self.matrix[False][True]]) / 2
         return (1.0-p)
     elif self.matrix[False][True] > self.matrix[True][False]:
         p = binom_test([self.matrix[False][True], self.matrix[True][False]]) / 2
         return (1.0-p)
     else:
         return 0.0
Example #12
def StoreTraitResult(Trait, Traitname, max_hits, p_cutoff, correctionmethod, upgmatree, GTC):
    """
    The method that actually stores the results. Only accepts results from a single trait at a time
    """
    with open(Traitname + time.strftime("_%d_%m_%Y_%H%M") + ".csv", "w") as outfile:
        # Sort genes by p-value.
        sort_instructions = SortResultsAndSetKey(Trait)

        num_results = max_hits if max_hits is not None else len(Trait)

        cut_possibilities = {"Individual": "p_v", "Bonferroni": "B_p", "Benjamini-Hochberg": "BH_p"}

        outfile.write("Gene;Non-unique gene name;Annotation;Number_pos_present_in;Number_neg_present_in;Number_pos_not_present_in;"
        "Number_neg_not_present_in;Sensitivity;Specificity;Odds_ratio;Naive_p;Bonferroni_p;Benjamini_H_p;Max_Pairwise_comparisons;"
        "Max_supporting_pairs;Max_opposing_pairs;Best_pairwise_comp_p;Worst_pairwise_comp_p\n")

        print("Calculating max number of contrasting pairs for each significant gene")

        for x in range(num_results):
            sys.stdout.write("\r{:.2%}".format(float(x)/num_results))
            sys.stdout.flush()

            # Start with lowest p-value, the one which has key 0 in sort_instructions
            currentgene = sort_instructions[x]
            if (Trait[currentgene][cut_possibilities[correctionmethod]] > p_cutoff):
                sys.stdout.write("\r100.00%")
                sys.stdout.flush()
                break

            Max_pairwise_comparisons = ConvertUPGMAtoPhyloTree(upgmatree,
                                                               GTC[Traitname][currentgene])
            max_total_pairs = Max_pairwise_comparisons["Total"]
            max_propairs = Max_pairwise_comparisons["Pro"]
            max_antipairs = Max_pairwise_comparisons["Anti"]
            try:
                best_pairwise_comparison_p = ss.binom_test(max_propairs,
                                                           max_total_pairs,
                                                           0.5) / 2
                worst_pairwise_comparison_p = ss.binom_test(max_total_pairs-max_antipairs,
                                                            max_total_pairs,
                                                            0.5) / 2
            except TypeError:
                sys.exit("There was a problem using scipy.stats.binom_test. Ensure you have a recent distribution of SciPy installed.")

            outfile.write('"' + currentgene + '";"' + str(Trait[currentgene]["NUGN"]) + '";"' + str(Trait[currentgene]["Annotation"]) +
            '";"' + str(Trait[currentgene]["tpgp"]) + '";"' + str(Trait[currentgene]["tngp"]) + '";"' + str(Trait[currentgene]["tpgn"]) +
            '";"' + str(Trait[currentgene]["tngn"]) + '";"' + str(Trait[currentgene]["sens"]) + '";"' + str(Trait[currentgene]["spes"]) +
            '";"' + str(Trait[currentgene]["OR"]) + '";"' + str(Trait[currentgene]["p_v"]) + '";"' + str(Trait[currentgene]["B_p"]) +
            '";"' + str(Trait[currentgene]["BH_p"]) + '";"' + str(max_total_pairs) + '";"' + str(max_propairs) + '";"' + str(max_antipairs) +
            '";"' + str(best_pairwise_comparison_p) + '";"' + str(worst_pairwise_comparison_p) + '"\n')
Example #13
    def binom_significant_celltypes(self):
        '''
        Binomial test for significance of celltype enrichment.
        '''
        print('Testing celltype enrichment....')
        sigcelltype = self.sigCelltypedf
        cellgroup = self.cellgenedf.groupby(self.cellgenedf['celltype'])

        binom_prob_occu = self.binom_prob_occu

        sigcelltype.loc[:, 'binom_pval'] = 1
        col = sigcelltype.columns.get_loc('binom_pval')
        for index, row in sigcelltype.iterrows():
            #print(row['celltype'])
            #print(row['genecluster'], totalgenes, len(cellgroup.get_group(row['celltype'])), allsiggenes)

            bprob_ind = binom_prob_occu[binom_prob_occu['celltype'] == row['celltype']].index[0]
            #print(bprob_ind)
            background_prob = binom_prob_occu.loc[bprob_ind, 'background_prob']
            #print(background_prob)
            binom_pval = stats.binom_test(row['genecluster']-1, len(cellgroup.get_group(row['celltype'])), background_prob, alternative='two-sided')
            sigcelltype.iloc[index, col] = binom_pval

        sigcelltype.loc[:, 'binom_FDR'] = 1
        sigcelltype = sigcelltype.sort_values('binom_pval', ascending=True)
        sigcelltype.index = range(len(sigcelltype))

        pvals = sigcelltype['binom_pval'].values
        # 'statsmodels' here must expose multipletests, e.g. via "from statsmodels.stats import multitest as statsmodels"
        corr_pvals = statsmodels.multipletests(pvals=pvals, alpha=0.05, method='fdr_bh')
        #print(pvals)
        #print(corr_pvals)
        sigcelltype['binom_FDR'] = corr_pvals[1]
        self.sigCelltypedf = sigcelltype
Example #14
 def recall_test(recall, n_trials, apriori_p):
     n_success = recall * n_trials
     pval = binom_test(n_success, n=n_trials, p=apriori_p)
     if recall > apriori_p:
         return (pval / 2)
     else:
         return 1 - (pval / 2)
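
For example, assuming "from scipy.stats import binom_test": a recall of 0.8 over 50 trials of a class with a-priori probability 0.5 converts to 40 successes, and halving the two-sided p-value turns it into a one-sided test.

from scipy.stats import binom_test

recall_test(0.8, 50, 0.5)  # 40/50 successes vs p=0.5: small one-sided p-value
recall_test(0.4, 50, 0.5)  # below the a-priori rate: returns 1 - pval/2, close to 1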
Example #15
def retention_simulation(retention_data1, retention_data2, n, sample_size, d):

    p_values = []

    #install_dates = retention_data['install_date']

    retention_data1 = retention_data1[d].values #extract the Numpy array to speed up the calculation
    retention_data2 = retention_data2[d].values
    d1_retention_test = []
    d1_retention_control = []

    for i in range(n):
        if i % 100 == 0:        
            print('Running test simulation %s...' % i)

        samp1 = random.sample(retention_data1,sample_size) #need install and app_family
        samp2 = random.sample(retention_data2,sample_size) #need install and app_family

        p1 = sum(samp1)/float(sample_size)
        p2 = sum(samp2)/float(sample_size)
        #p1 = len(samp1[samp1['d1'] == 1])/float(len(samp1))
        #p2 = len(samp2[samp2['d1'] == 1])/float(len(samp2))
        #time2 = (time.time() - startTime) + time2
        #startTime = time.time()
        p_values.append(stats.binom_test(sum(samp2), sample_size, p1))
        #startTime = time.time()
        #time3 = (time.time() - startTime) + time3
        #startTime = time.time()
        d1_retention_control.append(p1)        
        d1_retention_test.append(p2)
        #time4 = (time.time() - startTime) + time3
        
    return p_values, d1_retention_control, d1_retention_test   
Example #16
def getQueryResults(queryTupleSet, vcf_repos, gatk_repos):
    results_repos = {}
    #queryTupleSet contains tuples that are keys compatible with both vcf and GATK dictionaries
    # iterate over query Tuple Set - for each element do the following
    for posit in queryTupleSet:
        if (posit in vcf_repos.keys()) and (posit in gatk_repos.keys()):
            
            currParAlleles = vcf_repos[posit]
            currAltFreq = gatk_repos[posit][0]
            currAltCt = gatk_repos[posit][1]
            currRawDepth = gatk_repos[posit][2] 

            #TODO: Test the scipy package with binom_test()
            # do binomial test per snp site
            if currRawDepth == float(0):
                currPval = float(1)
            else:
                currPval =  stats.binom_test(currAltCt, currRawDepth, p = 0.5)

            # only report who is the parent with alternate allele
            if vcf_repos[posit][0] == "alt":
                currAltParent = "1"
            else:
                currAltParent = "2"
            results_repos[posit] = (currAltFreq, currPval, currAltCt, currRawDepth, currAltParent)

        else:
            print("This SNP does not exist in either vcf or gatk files:  " + posit[0] + ": " + posit[1])

    print("DONE!")
    return results_repos
Example #17
def sign_test(samp, mu0=0):
    samp = np.asarray(samp)
    pos = np.sum(samp > mu0)
    neg = np.sum(samp < mu0)
    M = (pos-neg)/2.
    p = stats.binom_test(min(pos,neg), pos+neg, .5)
    return M, p
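
A quick illustration with made-up data, assuming "import numpy as np" and "from scipy import stats": five positives and two negatives give M = 1.5 and a two-sided binomial p-value of about 0.45, i.e. no evidence of a shift.

import numpy as np
from scipy import stats

M, p = sign_test([1.2, 0.4, -0.3, 2.1, 0.8, -0.5, 1.5])
# M = (5 - 2) / 2 = 1.5, p = stats.binom_test(2, 7, 0.5) ~ 0.45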
Example #18
def retention_simulation1(group1, group2, n, sample_size, d):

    p_values = []
    
    retention_data1 = retention_data[(retention_data['experiment_group'] == group1) & (retention_data['install_date'] <= (datetime.now() - timedelta(days = -d)))]
    retention_data2 = retention_data[(retention_data['experiment_group'] == group2) & (retention_data['install_date'] <= (datetime.now() - timedelta(days = -d)))]
    
    day = 'd'+str(d)
    retention_data1 = retention_data1[day].values #extract the Numpy array to speed up the calculation
    retention_data2 = retention_data2[day].values
    d1_retention_test = []
    d1_retention_control = []

    for i in range(n):
        if i % 100 == 0:        
            print('Running test simulation %s...' % i)

        samp1 = random.sample(retention_data1,sample_size) #need install and app_family
        samp2 = random.sample(retention_data2,sample_size) #need install and app_family

        p1 = sum(samp1)/float(sample_size)
        p2 = sum(samp2)/float(sample_size)
        #p1 = len(samp1[samp1['d1'] == 1])/float(len(samp1))
        #p2 = len(samp2[samp2['d1'] == 1])/float(len(samp2))
        #time2 = (time.time() - startTime) + time2
        #startTime = time.time()
        p_values.append(stats.binom_test(sum(samp2), sample_size, p1))
        #startTime = time.time()
        #time3 = (time.time() - startTime) + time3
        #startTime = time.time()
        d1_retention_control.append(p1)        
        d1_retention_test.append(p2)
        #time4 = (time.time() - startTime) + time3
        
    return p_values, d1_retention_control, d1_retention_test       
Example #19
def get_stats(summaries, use_binom=False, binom_threshold=0.05):
    ainfo = [ parse_summary(s) for s in summaries.split(' ') ]
    # Sort by highest number of reads
    sorted_ainfo = sorted(ainfo, key=lambda x: x[1], reverse=True)

    likely_stutter = 0
    if use_binom:
        # Use a simple but arbitrary binomial model to distinguish between
        # heterozygous loci and polymerase stutter
        if len(sorted_ainfo) > 1:
            # Get the "best" two allele lens
            obs1 = sorted_ainfo[0][1]
            obs2 = sorted_ainfo[1][1]
            if binom_test(obs1, n=obs1 + obs2, p=0.5) <= binom_threshold:
                # Reject hypothesis that each allele is equally likely: stutter
                likely_stutter = obs2    # obs1 is never considered stutter
                #print("stutter: " + summaries)
    
        # reads supporting any alleles other than the top 2 are automatically
        # considered stutter
        likely_stutter += sum([ x[1] for x in sorted_ainfo[2:] ])
    else:
        # Assume EVERYTHING but the primary allele is due to polymerase
        # stutter.  NOTE: even on hemizygous sex chromosomes, this assumption
        # can be invalid for, e.g., bulk PCR samples that profile several
        # individual cells, each of which may have its own allele.
        likely_stutter = sum([ x[1] for x in sorted_ainfo[1:] ])

    tot_reads = sum([ x[1] for x in ainfo ])
    return array([ tot_reads, likely_stutter, 1, 1 if likely_stutter else 0 ])
Example #20
def sign_test(samp, mu0=0):
        '''
        Sign test with mu0=0 by default (though
        the median is often used in practice)

        Parameters
        ----------
        samp

        mu0

        Returns
        ---------
        M, p-value

        where

        M=(N(+) - N(-))/2, N(+) is the number of values above Mu0,
        N(-) is the number of values below.  Values equal to Mu0
        are discarded.

        The p-value for M is calculated using the binomial distribution
        and can be interpreted the same as for a t-test.

        See Also
        ---------
        scipy.stats.wilcoxon
        '''
        pos=np.sum(samp>mu0)
        neg=np.sum(samp<mu0)
        M=(pos-neg)/2.
        p=stats.binom_test(min(pos,neg),pos+neg,.5)
        return M, p
Example #21
    def reduce(self, result):
        if self.select_regexp:
            inputs = [key3 for key3 in result
                      if re.search(self.select_regexp, str(key3))]
        else:
            inputs = result.keys()
        if len(inputs) != 2:
            raise KeyError("Need to find exactly two results to compute a "
                           "score. Found %i: %s" % (len(inputs), inputs))
        key_true = [k for k in inputs if k.find(conf.TRUE) != -1][0]
        key_pred = [k for k in inputs if k.find(conf.PREDICTION) != -1][0]
        y_true = result[key_true]
        y_pred = result[key_pred]
        try:  # If list of arrays (CV, LOO, etc.) concatenate them
            y_true = np.concatenate(y_true)
            y_pred = np.concatenate(y_pred)
        except ValueError:
            pass
        out = Result(key=result["key"])
        p, r, f1, s = precision_recall_fscore_support(y_true,
                                                      y_pred,
                                                      average=None)

        # Compute p-value of recall for each class
        def recall_test(recall, n_trials, apriori_p):
            n_success = recall * n_trials
            pval = binom_test(n_success, n=n_trials, p=apriori_p)
            if recall > apriori_p:
                return (pval / 2)
            else:
                return 1 - (pval / 2)

        n_classes = len(s)  # Number of classes
        n_obs = len(y_true)
        prior_p = s.astype(float) / s.sum()  # A priori probability of each class
        r_pvalues = np.zeros_like(r)
        for class_index in range(n_classes):
            n_trials = s[class_index]
            #print "Class {class_index}: {n_success} success on {n_trials} trials".format(n_success=n_success, n_trials=n_trials, class_index=class_index)
            r_pvalues[class_index] = recall_test(r[class_index],
                                                 n_trials,
                                                 prior_p[class_index])

        # Compute p-value of mean recall
        mean_r = r.mean()
        mean_r_pvalue = binom_test(int(mean_r * n_obs), n=n_obs, p=.5)

        key, _ = key_pop(key_pred, -1)
        out[key_push(key, conf.SCORE_PRECISION)] = p
        out[key_push(key, conf.SCORE_RECALL)] = r
        out[key_push(key, conf.SCORE_RECALL_PVALUES)] = r_pvalues
        out[key_push(key, conf.SCORE_RECALL_MEAN)] = mean_r
        out[key_push(key, conf.SCORE_RECALL_MEAN_PVALUE)] = mean_r_pvalue
        out[key_push(key, conf.SCORE_F1)] = f1
        out[key_push(key, conf.SCORE_ACCURACY)] = accuracy_score(y_true,
                                                                 y_pred)
        if self.keep:
            out.update(result)
        return out
Example #22
def up_threshold(x, s, p):
    """Decide whether x successes out of s trials are at or above the
    cutoff p, or at least not significantly below it (binomial test at 0.01)."""
    if 1.0 * x/s >= p:
        return True
    elif stat.binom_test(x, s, p) > 0.01:
        return True
    return False
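
Behaviour sketch, assuming "import scipy.stats as stat" to match the stat. prefix above: the first branch passes ratios at or above p outright, and the binomial test rescues counts that fall below p but not significantly so.

import scipy.stats as stat

up_threshold(50, 100, 0.5)  # True: 50/100 >= 0.5
up_threshold(45, 100, 0.5)  # True: binom_test(45, 100, 0.5) ~ 0.37 > 0.01
up_threshold(20, 100, 0.5)  # False: significantly below the cutoff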
Example #23
def upton(base, baserate):
    obs = base[3] / base[4]
    exp = baserate
    if obs < exp:
        upton = (base[0], base[1], base[2], 1)
    else:
        upton = (base[0], base[1], base[2], ss.binom_test(base[3], base[4], exp))
    return upton
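
A guess at usage, assuming base is a tuple whose last two fields are the observed count and the total, e.g. (chrom, start, end, count, total), and "import scipy.stats as ss":

import scipy.stats as ss

upton(('chr1', 100, 200, 8, 1000), 0.001)  # 8 hits vs ~1 expected: small p-value in the last field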
Example #24
 def calc_win_perc(self):
     profit = self.trade_log['profit']
     num_trades = self.get_num_trades()
     total_wins = np.sum(profit > 0.0)
     if num_trades == 0:
         return 0, 0.0
     else:
         return total_wins/float(num_trades), stats.binom_test(total_wins, n=num_trades, p=0.5)
Example #25
def binomial(n, p):
    # calculate the expected maximum number of replicated reads at a single position
    x = 1
    while binom_test(x, n, p) > 0.00001:
        x = x + 1
    if x > 1:
        x = x - 1
    return x
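
Usage sketch, assuming "from scipy.stats import binom_test"; the numbers are illustrative. For a million reads and a per-position hit probability of 1e-6, the returned value is the largest duplicate count at a single position that is still compatible with chance at the 1e-5 level.

from scipy.stats import binom_test

max_expected_duplicates = binomial(1000000, 1e-6)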
Example #26
def dictionary_steg_detect(image):
    my_image = Image.open(image)
    pixels = my_image.load()
    probable_encodings = {'forward':[],'backward':[]}

    for red_bits in range(1, 5):
        for green_bits in range(1, 5):
            for blue_bits in range(1, 5):
                bit_list = [red_bits, green_bits, blue_bits]
                back_char_data = []
                forw_char_data = []
                back_char_byte = []
                forw_char_byte = []
                char_count = 0
                for x in range(int(math.ceil((240.0/sum(bit_list))))):
                    pixel = pixels[x,0]
                    for i in range(3):
                        binnum = Steg.dec_2_bin(pixel[i])
                        data = binnum[-bit_list[i]:]
                        forw_char_byte.extend(data)
                        data.reverse()
                        back_char_byte.extend(data)
                        if len(back_char_byte)>=8:
                            back_char_data.append(chr(Steg.bin_2_dec(''.join(back_char_byte[:8]))))
                            back_char_byte = back_char_byte[8:]
                        if len(forw_char_byte)>=8:
                            forw_char_data.append(chr(Steg.bin_2_dec(''.join(forw_char_byte[:8]))))
                            forw_char_byte = forw_char_byte[8:]
                for char in back_char_data:
                    if re.match('[a-zA-Z0-9 \n\r=+-]',char):
                        char_count += 1
                if stats.binom_test(char_count, len(back_char_data), 67.0/256) < .00001:
                    word_list = ''.join(back_char_data).upper().split()
                    if any(x in TWL_words for x in word_list):
                        probable_encodings['backward'].append(tuple(bit_list))
                char_count = 0
                for char in forw_char_data:
                    if re.match('[a-zA-Z0-9 \n\r=+-]',char):
                        char_count += 1
                if stats.binom_test(char_count, len(forw_char_data), 67.0/256) < .00001:
                    word_list = ''.join(forw_char_data).upper().split()
                    if any(x in TWL_words for x in word_list):
                        probable_encodings['forward'].append(tuple(bit_list))
    return probable_encodings
Example #27
def binominal_hotspot(rec, length, sample=118, recombinant=3684, gsize=373245519):
    P = float(recombinant)/(float(sample) * float(gsize))
    L = length
    p = P * L
    k = rec
    V = sample
    #print k, V, P, L, p
    mean = int(p * sample)
    pvalue = binom_test(k, V, p) 
    return mean, pvalue
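
Usage sketch with the default background parameters, assuming "from scipy.stats import binom_test": rec recombination events observed in a window of the given length are tested against the genome-wide rate.

from scipy.stats import binom_test

mean, pvalue = binominal_hotspot(3, 50000)  # 3 events in a 50 kb window vs the background rate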
Example #28
 def testRNGIntegerFollowsDistribution(self):
     ALPHA = 0.01
     SAMPLES = 100
     REPEATS = 1000
     lMsg =[]
     createPValuesInteger = ('Integer p values',
                             lambda nSamples: np.random.randint(0, n, nSamples))
     createPValuesFloat = ('Float p values', 
                           lambda  nSamples: np.random.rand(nSamples))
     createPValuesLargeDifferences = ('Large differences in p values',
                                      lambda nSamples: np.exp(np.random.randn(nSamples)))
     pValueGenerators = (createPValuesInteger, createPValuesFloat,
                         createPValuesLargeDifferences)
     
     for name, pGenerator in pValueGenerators:
         falsePositives = 0
         for i in range(REPEATS): #@UnusedVariable
             n = np.random.randint(3, 20)
             x = range(n)
             theSum = 0
             while theSum <= 0:
                 pValuesRequested = pGenerator(n)
                 theSum = sum(pValuesRequested)
             rng = randomArbitrary.RandomArbitraryInteger(x=x, p=pValuesRequested)
             theSample = rng.random(SAMPLES)
             pValuesNormalized = np.divide(pValuesRequested, 
                                           float(sum(pValuesRequested)))
             p = self._chi2testSampleAgainsProbability(theSample, 
                                                       pValuesNormalized)[1]
             if p < ALPHA:
                 falsePositives += 1
             
     
         # Binomial test.
         # At this point we expect that falsePositives will not be significantly
         # higher than ALPHA * REPEATS. Use a binomial test to check this.
         nExpectedFalsePositives = int(REPEATS*ALPHA)
         if falsePositives > nExpectedFalsePositives:
             pBinomialTest = stats.binom_test(falsePositives, 
                                              REPEATS, 
                                              ALPHA) * 2 #one-sided test, thus *2
             if pBinomialTest < ALPHA:
                 # there might be a problem
                 msg = '(%s) Failed sampling distribution test '\
                     'Expected %d failures or less, observed %d ones (p=%.3f). '\
                     "Don't panic. "\
                     'This might happen once in a while even if everything is OK. '\
                     'Repeat the test and hope for the best'%(name, nExpectedFalsePositives,
                                                              falsePositives, 
                                                              pBinomialTest)
                 lMsg.append(msg)
     if lMsg:
         self.fail('. '.join(lMsg))
Example #29
 def test_sampling_distribution(self):
     """
     test if the sampling distribution function is working as expected.
     """
     trials = 1000
     distribution = [('H',0.5),('T',0.5)]
     num_heads = ml.sample_distribution(distribution,trials).count('H')
     result = [binom_test(num_heads,trials,1.0/2) for x in range(trials)]
     mean = sum(result) / len(result)
     # NOTE: Maybe this number should be higher. What is a good number?
     self.assertTrue(mean > 0.05)
Example #30
def binomial_scale(member_punishments: int,
                   all_punishments: int,
                   member_pop: int,
                   all_pop: int) -> int:
    """
    Finds out how many standard deviations a group's punishment
    count is from the mean of a random distribution. A result that
    seems to be the result of impossible/erroneous data gets a -1.
    A result within one standard deviation of the mean gets a 5. Five
    standard deviations below the mean would return the minimum, 0.
    Five standard deviations above the mean would return the max, 10.
    See https://en.wikipedia.org/wiki/Binomial_test"""

    if impossible(member_punishments,
                  all_punishments,
                  member_pop,
                  all_pop):
        return -1

    score = 5
    if member_pop == member_punishments == 0:
        return score
    # max() is to avoid divide by zero errors
    p = member_pop / max(all_pop, 1)
    group_p = member_punishments / max(all_punishments, 1)
    if member_punishments / member_pop > all_punishments / all_pop:
        tail = 'greater'
    elif member_punishments / member_pop < all_punishments / all_pop:
        tail = 'less'
    else:
        return score
    pvalue = stats.binom_test(member_punishments,
                              max(all_punishments, member_punishments),
                              p,
                              alternative=tail)

    # relying on https://en.wikipedia.org/wiki/68%E2%80%9395%E2%80%9399.7_rule
    # to find one-sided odds of being 1-5 standard deviations away from mean

    std_intervals = (.5/3, .5/22, .5/370, .5/15787, .5/1744278)

    # also requiring a minimum percentage difference to get a brighter color

    p_intervals = (1.1, 1.3, 1.6, 2, 2.5)

    # again, max() is to avoid divide by zero errors
    if tail == 'greater':
        score += min(sum(pvalue < t for t in std_intervals),
                     sum(group_p/(max(p, .00001)) > t for t in p_intervals))
    else:
        score -= min(sum(pvalue < t for t in std_intervals),
                     sum(p/(max(group_p, .00001)) > t for t in p_intervals))
    return int(score)
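
An illustrative call; the impossible() helper is not shown on this page, so a permissive stub is assumed here purely to make the sketch run:

from scipy import stats

def impossible(member_punishments, all_punishments, member_pop, all_pop):
    # stand-in for the project's real sanity check
    return member_punishments > all_punishments or member_pop > all_pop

binomial_scale(40, 100, 100, 1000)  # 10% of the population but 40% of punishments -> 10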
Example #31
def scores(key, paths, config):
    import mapreduce
    print(key)
    values = [mapreduce.OutputCollector(p) for p in paths]
    values = [item.load() for item in values]
    y_true = [item["y_true"].ravel() for item in values]
    y_pred = [item["y_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)
    prob_pred = [item["proba_pred"].ravel() for item in values]
    prob_pred = np.concatenate(prob_pred)
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)  #area under curve score.
    #betas = np.hstack([item["beta"] for item in values]).T
    # threshold betas to compute fleiss_kappa and DICE
    #betas_t = np.vstack([array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0] for i in range(betas.shape[0])])
    #Compute pvalue
    success = r * s
    success = success.astype('int')
    prob_class1 = np.count_nonzero(y_true) / float(len(y_true))
    pvalue_recall0_true_prob = binom_test(success[0],
                                          s[0],
                                          1 - prob_class1,
                                          alternative='greater')
    pvalue_recall1_true_prob = binom_test(success[1],
                                          s[1],
                                          prob_class1,
                                          alternative='greater')
    pvalue_recall0_unknown_prob = binom_test(success[0],
                                             s[0],
                                             0.5,
                                             alternative='greater')
    pvalue_recall1_unknown_prob = binom_test(success[1],
                                             s[1],
                                             0.5,
                                             alternative='greater')
    pvalue_recall_mean = binom_test(success[0] + success[1],
                                    s[0] + s[1],
                                    p=0.5,
                                    alternative='greater')
    scores = OrderedDict()
    try:
        a, l1, l2, tv = [float(par) for par in key.split("_")]
        scores['a'] = a
        scores['l1'] = l1
        scores['l2'] = l2
        # NOTE: 'left' is undefined here, so the two lines below always raise
        # NameError, which the bare except silently swallows
        if left == 0: left = 1.
        scores['l1_ratio'] = float(l1) / left
    except:
        pass
    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['recall_mean'] = r.mean()
    scores["auc"] = auc
    scores['pvalue_recall0_true_prob_one_sided'] = pvalue_recall0_true_prob
    scores['pvalue_recall1_true_prob_one_sided'] = pvalue_recall1_true_prob
    scores['pvalue_recall0_unknown_prob_one_sided'] = pvalue_recall0_unknown_prob
    scores['pvalue_recall1_unknown_prob_one_sided'] = pvalue_recall1_unknown_prob
    scores['pvalue_recall_mean'] = pvalue_recall_mean
    #scores['prop_non_zeros_mean'] = float(np.count_nonzero(betas_t)) / \
    #                               float(np.prod(betas.shape))
    scores['param_key'] = key
    return scores
Example #32
def test_phony_hypothesis(cluster_MSM_num, cluster_GEM_num, cell_num_ary,
                          capture_rate):
    MSM_rate = phony_cluster_MSM_rate(cell_num_ary)
    return binom_test(cluster_MSM_num / capture_rate,
                      cluster_GEM_num / capture_rate, MSM_rate, "less")
Example #33
 def _calc_enrichment_results_stats(self, background_prob, result):
     events_count = len(result.events)
     result.expected = background_prob * events_count
     result.pvalue = stats.binom_test(
         len(result.overlapped), events_count, p=background_prob
     )
Example #34
        outfile_two_by_two_recomb.write(outline)

    #output stats for all recombs (01 and 10)
    for key in sorted(recomb_one_by):
        pos = key.split(":")
        if len(recomb_quant_bkgnd_one_by_one[key][0]) >= 300:
            prob01 = probs_one_by_dict[int(pos[0])][2]
            prob10 = probs_one_by_dict[int(pos[0])][3]
            outline = key + "\t" + str(float(sum(
                recomb_one_by[key][1]))) + "\t" + str(
                    float(sum(recomb_one_by[key][0]))) + "\t" + str(
                        prob01) + "\t" + str(prob10) + "\t" + str(
                            len(recomb_quant_bkgnd_one_by_one[key][0])) + "\n"
            outfile_one_by_recomb.write(outline)
            print(key + "\t" + str(
                binom_test(((sum(recomb_one_by[key][0]))), (
                    len(recomb_quant_bkgnd_one_by_one[key][0])), (prob10))))

    #output absolute number of recombinations (background hard to calc)
    print("whole read test")
    for key in sorted(whole_read_one_recomb_dict):
        pos = key.split(":")
        prob0011 = probs_two_by_dict[int(pos[0])][2]
        prob1100 = probs_two_by_dict[int(pos[0])][3]
        outline = key + "\t" + str(sum(
            whole_read_one_recomb_dict[key][0])) + "\t" + str(
                sum(whole_read_one_recomb_dict[key][1])) + "\t" + str(
                    len(recomb_quant_bkgnd_one_by_one[key][0])) + "\t" + str(
                        prob0011) + "\t" + str(prob1100) + "\n"
        outfile_whole_read.write(outline)
        bintest = binom_test(((sum(whole_read_one_recomb_dict[key][0]))),
                             (len(recomb_quant_bkgnd_one_by_one[key][0])),
Example #35
import numpy as np
import fetchmaker

rottweiler_tl = fetchmaker.get_tail_length('rottweiler')
print(np.mean(rottweiler_tl), np.std(rottweiler_tl))

whippet_rescue = fetchmaker.get_is_rescue('whippet')
num_whippet_rescue = np.count_nonzero(whippet_rescue)
print(num_whippet_rescue)
num_whippets = np.size(whippet_rescue)
print(num_whippets)

from scipy.stats import binom_test

whippet_binom_test = binom_test(num_whippet_rescue, num_whippets, 0.08)
if whippet_binom_test < 0.05:
    print('Significant Difference!')
else:
    print('No Difference.')

from scipy.stats import f_oneway

whippet_avg_weight = fetchmaker.get_weight('whippet')
terrier_avg_weight = fetchmaker.get_weight('terrier')
pitbull_avg_weight = fetchmaker.get_weight('pitbull')
weight_t, weight_pval = f_oneway(whippet_avg_weight, terrier_avg_weight,
                                 pitbull_avg_weight)

if weight_pval < 0.05:
    print('There is a difference between these breeds!')
else:
Example #36
 def func(qi):
     #return stats.binom_test(qi * nobs, nobs, p=q_) - alpha #/ 2.
     return stats.binom_test(q_ * nobs, nobs, p=qi) - alpha
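
This helper is set up to invert the binomial test into a confidence bound: the root of func is the qi at which the test's p-value equals alpha. A hedged sketch of that use with a root finder; nobs, q_ and alpha are placeholders that the enclosing code supplies in the original project:

from scipy import stats, optimize

nobs, q_, alpha = 100, 0.30, 0.05
k = int(round(q_ * nobs))  # observed successes

def func(qi):
    return stats.binom_test(k, nobs, p=qi) - alpha

# the p-value is ~1 at the observed proportion and ~0 near 1, so the
# sign change brackets an upper confidence bound for the proportion
upper = optimize.brentq(func, q_, 1 - 1e-9)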
Example #37
#Overall
sns.countplot(sl_df['pre_post_rec'], ax=ax[1], palette="winter")

ax[1].set_title("Overall Accidents - Slavija")

# In[215]:

post_c = sl_df['pre_post_rec'].value_counts()['post_rec']
pre_c = sl_df['pre_post_rec'].value_counts()['pre_rec']
x = pre_c
n = post_c + pre_c

post_d = pp_df['pre_post_rec'].value_counts()['post_rec']
pre_d = pp_df['pre_post_rec'].value_counts()['pre_rec']
p = pre_d / (post_d + pre_d)
res = binom_test(x, n, p, alternative='less')
print('C-Test for increase in number of accidents {:.2f} p-val'.format(res))

# **p-val above** general acceptance threshold.

# **Notes**
# - We can observe an increase in the number of accidents on Slavija after reconstruction (right hand side)
# - A general rise in the number of accidents in Belgrade contributes only partially (left hand side)

# In[216]:

sns.catplot(
    x="pre_post_rec",
    y='count',
    col="acc_outcome",
    data=sl_df,
Example #38
    def test_num_appearance(self, event_index, log_atom):
        """This function makes a one step prediction and raises an alert if the count does not match the expected appearance"""
        # Return if no TSA should be calculated for this ET
        if all(period is None for period in self.period_length_list[event_index]):
            return

        # Append the lists for the arima models if they are too short
        if len(self.arima_models) <= event_index:
            self.arima_models += [None for _ in range(event_index + 1 - len(self.arima_models))]
            self.result_list += [None for _ in range(event_index + 1 - len(self.result_list))]
        if len(self.prediction_history) <= event_index:
            self.prediction_history += [None for _ in range(event_index + 1 - len(self.prediction_history))]

        # Initialize the lists for the arima models for this ET
        if self.arima_models[event_index] is None:
            self.arima_models[event_index] = [None for _ in range(len(self.target_path_index_list[event_index]))]
            self.result_list[event_index] = [[] for _ in range(len(self.target_path_index_list[event_index]))]
        if self.prediction_history[event_index] is None:
            self.prediction_history[event_index] = [[[], [], []] for _ in range(len(self.target_path_index_list[event_index]))]

        # Check if the new values are floats
        if any(not self.event_type_detector.check_variables[event_index][var_index] or
                not isinstance(self.event_type_detector.values[event_index][var_index][-1], float) for var_index in
                self.target_path_index_list[event_index]):
            delete_indices = [count_index for count_index, var_index in enumerate(self.target_path_index_list[event_index])
                              if not self.event_type_detector.check_variables[event_index][var_index] or
                              not isinstance(self.event_type_detector.values[event_index][var_index][-1], float)]
            delete_indices.sort(reverse=True)

            for count_index in delete_indices:
                # Remove the entries of the lists
                if len(self.target_path_index_list) > event_index and len(self.target_path_index_list[event_index]) > count_index:
                    self.target_path_index_list[event_index] = self.target_path_index_list[event_index][:count_index] +\
                            self.target_path_index_list[event_index][count_index + 1:]
                if len(self.period_length_list) > event_index and len(self.period_length_list[event_index]) > count_index:
                    self.period_length_list[event_index] = self.period_length_list[event_index][:count_index] +\
                            self.period_length_list[event_index][count_index + 1:]
                if len(self.arima_models) > event_index and len(self.arima_models[event_index]) > count_index:
                    self.arima_models[event_index] = self.arima_models[event_index][:count_index] +\
                            self.arima_models[event_index][count_index + 1:]
                if len(self.prediction_history) > event_index and len(self.prediction_history[event_index]) > count_index:
                    self.prediction_history[event_index] = self.prediction_history[event_index][:count_index] +\
                            self.prediction_history[event_index][count_index + 1:]
                if len(self.result_list) > event_index and len(self.result_list[event_index]) > count_index:
                    self.result_list[event_index] = self.result_list[event_index][:count_index] +\
                            self.result_list[event_index][count_index + 1:]

            message = 'Disabled the TSA for the targetpaths %s of event %s' % (
                    [self.event_type_detector.variable_key_list[event_index][count_index] for count_index in delete_indices],
                    self.event_type_detector.get_event_type(event_index))
            affected_path = [self.event_type_detector.variable_key_list[event_index][count_index] for count_index in delete_indices]
            self.print(message, log_atom, affected_path)

        # Initialize and update the arima_model if possible
        for count_index, var_index in enumerate(self.target_path_index_list[event_index]):
            # Initialize the arima_model if possible
            if self.auto_include_flag and self.arima_models[event_index][count_index] is None:
                if self.period_length_list[event_index][count_index] is not None:

                    # Add the current value to the lists
                    self.prediction_history[event_index][count_index][0].append(0)
                    self.prediction_history[event_index][count_index][1].append(self.event_type_detector.values[event_index][var_index][-1])
                    self.prediction_history[event_index][count_index][2].append(0)

                    # Check if enough values have been stored to initialize the arima_model
                    if len(self.event_type_detector.values[event_index][var_index]) >= self.num_periods_tsa_ini *\
                            self.period_length_list[event_index][count_index]:
                        message = 'Initializing the TSA for the event %s and targetpath %s' % (
                                self.event_type_detector.get_event_type(event_index),
                                self.event_type_detector.variable_key_list[event_index][count_index])
                        affected_path = self.event_type_detector.variable_key_list[event_index][count_index]
                        self.print(message, log_atom, affected_path)

                        # Add the arima_model to the list
                        try:
                            model = statsmodels.tsa.arima.model.ARIMA(
                                    self.event_type_detector.values[event_index][var_index][
                                        -self.num_periods_tsa_ini * self.period_length_list[event_index][count_index]:],
                                    order=(self.period_length_list[event_index][count_index], 0, 0),
                                    seasonal_order=(0, 0, 0, self.period_length_list[event_index][count_index]))
                            self.arima_models[event_index][count_index] = model.fit()
                        except:  # skipcq FLK-E722
                            self.arima_models[event_index][count_index] = None

            # Make a one step prediction with the new values
            elif self.arima_models[event_index][count_index] is not None:
                count = self.event_type_detector.values[event_index][var_index][-1]

                # Add the prediction to the lists
                lower_limit, upper_limit = self.one_step_prediction(event_index, count_index)
                self.prediction_history[event_index][count_index][0].append(lower_limit)
                self.prediction_history[event_index][count_index][1].append(count)
                self.prediction_history[event_index][count_index][2].append(upper_limit)

                # Shorten the lists if necessary
                if len(self.prediction_history[event_index][count_index][0]) > self.num_max_time_history:
                    self.prediction_history[event_index][count_index][0] = self.prediction_history[event_index][count_index][0][
                        -self.num_min_time_history:]
                    self.prediction_history[event_index][count_index][1] = self.prediction_history[event_index][count_index][1][
                        -self.num_min_time_history:]
                    self.prediction_history[event_index][count_index][2] = self.prediction_history[event_index][count_index][2][
                        -self.num_min_time_history:]

                else:
                    # Test if count is in boundaries
                    if count < lower_limit or count > upper_limit:
                        message = 'Event: %s, Path: %s, Lower: %s, Count: %s, Upper: %s' % (
                                self.event_type_detector.get_event_type(event_index),
                                self.event_type_detector.variable_key_list[event_index][var_index], lower_limit, count, upper_limit)
                        affected_path = self.event_type_detector.variable_key_list[event_index][var_index]
                        if count < lower_limit:
                            confidence = (lower_limit - count) / (upper_limit - count)
                        else:
                            confidence = (count - upper_limit) / (count - lower_limit)
                        self.print(message, log_atom, affected_path, confidence=confidence)
                        self.result_list[event_index][count_index].append(0)
                    else:
                        self.result_list[event_index][count_index].append(1)

                    # Reduce the number of entries in the time history if it gets too large
                    if len(self.result_list[event_index][count_index]) >= 2 * max(
                            self.num_results_bt, self.num_periods_tsa_ini * self.period_length_list[event_index][count_index]):
                        self.result_list[event_index][count_index] = self.result_list[event_index][count_index][-max(
                            self.num_results_bt, self.num_periods_tsa_ini * self.period_length_list[event_index][count_index]):]

                # Check if too few or too many successes are in the last section of the test history and discard the model.
                # Else update the model for the next step
                if self.auto_include_flag and (
                        sum(self.result_list[event_index][count_index][-self.num_results_bt:]) +
                        max(0, self.num_results_bt - len(self.result_list[event_index][count_index])) < self.bt_min_suc or
                        binom_test(x=sum(self.result_list[event_index][count_index][
                        -self.num_periods_tsa_ini * self.period_length_list[event_index][count_index]:]),
                        n=self.num_periods_tsa_ini * self.period_length_list[event_index][count_index],
                        p=(1-self.alpha), alternative='greater') < self.alpha_bt):

                    message = 'Discard the TSA model for the event %s and path %s' % (
                            self.event_type_detector.get_event_type(event_index),
                            self.event_type_detector.variable_key_list[event_index][var_index])
                    affected_path = self.event_type_detector.variable_key_list[event_index][var_index]
                    self.print(message, log_atom, affected_path)

                    # Discard the trained model and reset the result_list
                    self.arima_models[event_index][count_index] = None
                    self.result_list[event_index][count_index] = []
                else:
                    # Update the model
                    self.arima_models[event_index][count_index] = self.arima_models[event_index][count_index].append([count])
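The discard rule above combines two checks; a minimal standalone sketch with made-up state (the result_list entries and the bt_min_suc, alpha, alpha_bt values are hypothetical stand-ins for the detector's attributes):

from scipy.stats import binom_test

result_list = [1, 1, 0, 1, 1, 1, 0, 1, 1, 1]  # 1 = count inside limits, 0 = outside
num_results_bt = 10   # length of the recent window tested for successes
bt_min_suc = 6        # minimum successes required in that window
alpha = 0.05          # expected rate of counts outside the limits
alpha_bt = 0.01       # significance level of the binomial test

recent_successes = (sum(result_list[-num_results_bt:]) +
                    max(0, num_results_bt - len(result_list)))
# discard if recent successes are too few, or if the overall success rate is
# significantly above 1 - alpha (i.e. the prediction limits look too wide)
discard = (recent_successes < bt_min_suc or
           binom_test(x=sum(result_list), n=len(result_list),
                      p=1 - alpha, alternative='greater') < alpha_bt)
print(discard)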
Beispiel #39
0
        correct = int(row[0])
        result_array.append(correct)

result_array = np.array(
    result_array
)  # create array of number of times it guessed correctly out of a million guesses

average = int(round(np.mean(result_array)))
std_dev = np.std(result_array)

print(f'Mean = {average}')
print(f'Standard deviation = {std_dev}')

# statistical significance test
alpha = 0.05  # significance level
prob = binom_test(average, 1000000, 0.5,
                  'greater')  # p-value of the observed mean under the null hypothesis
if prob > alpha:
    print(
        f'{prob * 100}% is the probability that this would occur under the null hypothesis '
        + f'so we fail to reject the null hypothesis')
else:
    print(
        f'{prob * 100}% is the probability that this would occur under the null hypothesis '
        + f'so we reject the null hypothesis')

# calculate bins
min_result = np.amin(
    result_array)  # min/max_result for calculating edges of histogram
max_result = np.amax(result_array)
range_result = max_result - min_result
print(min_result, max_result, range_result)
Beispiel #40
0
    option = input("Option: ")

    while (option != "1" and option != "2" and option != "3" and option != "4"
           and option != "5" and option != "6" and option != "7"
           and option != "0"):
        option = input("Invalid choice. Option: ")
    if option == "1":
        wilcoxonMenu(data)
    elif option == "2":
        harder = getArrayFromData(data, 16)
        safeCount = len([1 for x in harder if x == "safe"])
        uniformCount = len([1 for x in harder if x == "uniform"])
        equalCount = len([1 for x in harder if x == "equal"])
        totalCount = safeCount + uniformCount + equalCount
        pvalue = binom_test(safeCount, totalCount)
        print("\n[BERNULLI TEST] Results:")
        print("#safe = " + str(safeCount))
        print("#uniform = " + str(uniformCount))
        print("#equal = " + str(equalCount))
        print("p-value = " + str(pvalue))
    elif option == "3":
        graphMenu(data)
    elif option == "4":
        functionMenu()
    elif option == "5":
        heatmapMenu()
    elif option == "6":
        fontMenu()
    elif option == "7":
        data = getData(inputDir)
Beispiel #41
0
def rate_tst(x_s, y_s, n0=10, n1=10):
    p = n1 / (n0 + n1)
    p_val = binom_test(y_s, x_s + y_s, p, alternative='greater')
    return p_val
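A quick usage sketch with made-up counts; rate_tst asks whether y_s is over-represented among the x_s + y_s events, given that a fraction n1 / (n0 + n1) is expected under the null:

print(rate_tst(25, 40))               # default n0 = n1 = 10, so the null proportion is 0.5
print(rate_tst(25, 40, n0=5, n1=10))  # unbalanced design, null proportion 2/3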
Beispiel #42
0
# save history
np.savetxt(
        "data/%s-results.csv" % id,
        np.asarray([ history.history['loss'], history.history['val_loss']]),
        delimiter=",")

test_y_prob = model.predict(test_X)

# get actual predictions
test_y_pred = np.argmax(test_y_prob, axis=-1)

print('Test set confusion matrix:')
print(confusion_matrix(test_y, test_y_pred))
print('Test set p-value:')
print(binom_test(np.sum(test_y == test_y_pred), len(test_y)))

# save test output for simulations etc.
np.savetxt(
        "data/%s-test-output.csv" % id,
        np.asarray([ test_y_pred, test_y ]),
        delimiter=",")

# bucket the max predicted probability into deciles; the filter below keeps predictions with max prob >= 0.6
confidence = np.amax(test_y_prob * 10, axis=1).astype(int)
test_y_conf_pred = test_y_pred[confidence > 5]
test_y_conf_real = test_y[confidence > 5]
correct_confident = np.sum(test_y_conf_pred == test_y_conf_real)
n_confident = len(test_y_conf_pred)
acc_confident = float(correct_confident) / float(n_confident)
p_val_confident = binom_test(correct_confident, n_confident)
Beispiel #43
0
    x = np.arange(len(brands))  # the label locations
    width = 0.25  # the width of the bars

    fig, ax1 = plt.subplots()
    rects1 = ax1.bar(x - width / 2,
                     brand_agg_df['percent contaminated'],
                     width,
                     label='percent contaminated',
                     color="dodgerblue")

    ax1.set_xlabel('brand')
    ax1.set_ylabel('percent', color="dodgerblue", fontsize=16)
    ax1.set_title('price vs quality of ingredients')
    ax1.set_xticks(x)
    ax1.set_xticklabels(brand_agg_df['brand'], rotation=90)
    ax1.legend()
    ax2 = ax1.twinx()
    rects2 = ax2.bar(x + width / 2,
                     brand_agg_df['median price'],
                     width,
                     label='median price',
                     color="salmon")
    ax2.set_ylabel('price', color="salmon", fontsize=16)
    plt.tight_layout()
    plt.show()

from scipy import stats

# halve the two-sided p-values to obtain one-sided p-values
print(stats.binom_test(x=2403, n=5247, p=0.400) / 2)
print(stats.binom_test(x=110, n=5247, p=0.017) / 2)
from scipy.stats import binom_test

pval = binom_test(510, n=10000, p=0.06)
print(pval)

pval2 = binom_test(590, n=10000, p=0.06)
print(pval2)
Beispiel #45
0
    def get_matchup_totals_with_significance(self) -> pd.DataFrame:
        """
        Return dataframe with matchup win totals + significance.
        """
        def _signf_level(p):
            if p < 0.001:
                return "***", "p<.001"
            elif p < 0.01:
                return "**", "p<.01"
            elif p < 0.05:
                return "*", "p<.05"
            else:
                return "", "p>.05"

        output = []
        for _, run_annotations in self.dataframe.groupby('run_id'):
            question = list(run_annotations.question)[0]
            for matchup, annotations in run_annotations.groupby('matchup'):
                model1, model2 = matchup.split('__vs__')
                wincount1 = np.sum(annotations['winner'] == model1)
                wincount2 = np.sum(annotations['winner'] == model2)
                numratings = wincount1 + wincount2
                winrate1 = np.mean(annotations['winner'] == model1)
                winrate2 = np.mean(annotations['winner'] == model2)
                p = binom_test([wincount1, wincount2])

                stars, plevel = _signf_level(p)

                agreements = []
                for _, pairing_annotations in annotations.groupby(
                        'pairing_id'):
                    pair_wincount1 = np.sum(
                        pairing_annotations['winner'] == model1)
                    pair_wincount2 = np.sum(
                        pairing_annotations['winner'] == model2)
                    if pair_wincount1 < 2 and pair_wincount2 < 2:
                        if pair_wincount1 == 1 and pair_wincount2 == 1:
                            agreements.append(0)
                    else:
                        majority_wincount = max(pair_wincount1, pair_wincount2)
                        num_pair_annotations = pair_wincount1 + pair_wincount2
                        pair_agreement = majority_wincount / num_pair_annotations
                        agreements.append(pair_agreement)
                total_agreement = np.mean(agreements)

                output.append({
                    'question': question,
                    'matchup': matchup,
                    'model1': model1,
                    'model2': model2,
                    'numwins1': wincount1,
                    'numwins2': wincount2,
                    'winrate1': winrate1,
                    'winrate2': winrate2,
                    'numratings': numratings,
                    'p': p,
                    'stars': stars,
                    'sigf': plevel,
                    'agree': total_agreement,
                })
        output = pd.DataFrame(output)
        # order the columns how we want
        self.significance_df = output[[
            'question',
            'matchup',
            'model1',
            'numwins1',
            'winrate1',
            'model2',
            'numwins2',
            'winrate2',
            'numratings',
            'sigf',
            'stars',
            'p',
            'agree',
        ]]
        return self.significance_df
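The significance column comes from a plain two-sided binomial test on the per-matchup win counts; an equivalent standalone call with hypothetical counts:

from scipy.stats import binom_test

wincount1, wincount2 = 60, 40           # hypothetical wins for model1 / model2
p = binom_test([wincount1, wincount2])  # two-sided test against a 50/50 null
print(p)  # ~0.057, so this matchup would get no stars (p > .05)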
Beispiel #46
0
import numpy as np  # needed for np.mean/np.std below
import fetchmaker
from scipy.stats import binom_test
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy.stats import chi2_contingency

rottweiler_tl = fetchmaker.get_tail_length('rottweiler')

print(np.mean(rottweiler_tl))
print(np.std(rottweiler_tl))

whippet_rescue = fetchmaker.get_is_rescue('whippet')
num_whippet_rescues = np.count_nonzero(whippet_rescue)
num_whippets = np.size(whippet_rescue)

print(binom_test(num_whippet_rescues, num_whippets, .08))

w = fetchmaker.get_weight('whippet')
t = fetchmaker.get_weight('terrier')
p = fetchmaker.get_weight('pitbull')

print(f_oneway(w, t, p).pvalue)

values = np.concatenate([w, t, p])
labels = ['whippet'] * len(w) + ['terrier'] * len(t) + ['pitbull'] * len(p)
print(pairwise_tukeyhsd(values, labels, .05))

poodle_colors = fetchmaker.get_color('poodle')
shihtzu_colors = fetchmaker.get_color('shihtzu')

color_table = [[
def func(qi):
    return stats.binom_test(q_ * nobs, nobs, p=qi) - alpha
# model the expected distribution mu, sigma
n = 1000
p = 0.5
rv = st.binom(n,p)
mu = rv.mean()
sd = rv.std()
print("The expected distribution for a fair coin is mu=%s, sd=%s"%(mu,sd))

# simulate the p-value (h is the observed head count, defined in code not shown here)
n_samples = 10000
xs = np.random.binomial(n, p, n_samples)
print("Simulation p-value - %s"%(2*np.sum(xs >= h)/(xs.size + 0.0)))

## p-value by binomial test
print("Binomial test - %s"%st.binom_test(h, n, p))

## MLE
# likelihood = p(event)/p(all)
print("Maximum likelihood %s"%(np.sum(results)/float(len(results))))

## bootstrap
# resample from 'results' (the per-toss outcomes, defined in code not shown here)
bs_samples = np.random.choice(results, (n_samples, len(results)), replace=True)
bs_ps = np.mean(bs_samples, axis=1)  # head frequency of each bootstrap sample
bs_ps.sort()
print("Bootstrap CI: (%.4f, %.4f)" % (bs_ps[int(0.025*n_samples)], bs_ps[int(0.975*n_samples)]))
# visualize the bootstrap distribution:
plt.hist(bs_ps)
plt.show()
Beispiel #49
0
def scores(key, paths, config, as_dataframe=False):
    import mapreduce
    print(key)
    # accept runs with either the inner- or outer-CV fold count
    if (len(paths) != NFOLDS_INNER) and (len(paths) != NFOLDS_OUTER):
        print("Failed for key %s" % key)
        return None
    values = [mapreduce.OutputCollector(p) for p in paths]
    values = [item.load() for item in values]
    y_true = [item["y_true"].ravel() for item in values]
    y_pred = [item["y_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)
    prob_pred = [item["proba_pred"].ravel() for item in values]
    prob_pred = np.concatenate(prob_pred)

    # Prediction performances
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)  # area under the ROC curve

    # P-values
    success = r * s
    success = success.astype('int')
    prob_class1 = np.count_nonzero(y_true) / float(len(y_true))
    pvalue_recall0_true_prob = binom_test(success[0], s[0], 1 - prob_class1, alternative='greater')
    pvalue_recall1_true_prob = binom_test(success[1], s[1], prob_class1, alternative='greater')
    pvalue_recall0_unknown_prob = binom_test(success[0], s[0], 0.5, alternative='greater')
    pvalue_recall1_unknown_prob = binom_test(success[1], s[1], 0.5, alternative='greater')
    pvalue_recall_mean = binom_test(success[0] + success[1], s[0] + s[1], p=0.5, alternative='greater')


    # Beta's measures of similarity
    betas = np.hstack([item["beta"][penalty_start:, :] for item in values]).T

    # Correlation
    R = np.corrcoef(betas)
    #print R
    R = R[np.triu_indices_from(R, 1)]
    # Fisher z-transformation / average
    z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
    # back-transform
    r_bar = (np.exp(2 * z_bar) - 1) /  (np.exp(2 * z_bar) + 1)

    # threshold betas to compute fleiss_kappa and DICE
    try:
        betas_t = np.vstack([array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0] for i in range(betas.shape[0])])
        #print "--", np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1))
        #print(np.allclose(np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1)), [0.99]*5,
        #                   rtol=0, atol=1e-02))

        # Compute fleiss kappa statistics
        beta_signed = np.sign(betas_t)
        table = np.zeros((beta_signed.shape[1], 3))
        table[:, 0] = np.sum(beta_signed == 0, 0)
        table[:, 1] = np.sum(beta_signed == 1, 0)
        table[:, 2] = np.sum(beta_signed == -1, 0)
        fleiss_kappa_stat = fleiss_kappa(table)

        # Pairwise Dice coefficient
        ij = [[i, j] for i in range(betas.shape[0]) for j in range(i+1, betas.shape[0])]
        dices = list()
        for idx in ij:
            A, B = beta_signed[idx[0], :], beta_signed[idx[1], :]
            dices.append(float(np.sum((A == B)[(A != 0) & (B != 0)])) / (np.sum(A != 0) + np.sum(B != 0)))
        dice_bar = np.mean(dices)
    except Exception:
        # fall back so later references to dices and betas_t do not fail
        dice_bar = fleiss_kappa_stat = 0
        dices = []
        betas_t = np.zeros(betas.shape)

    scores = OrderedDict()
    scores['key'] = key
    try:
        a, l1, l2 , tv  = [float(par) for par in key.split("_")]
        scores['a'] = a
        scores['l1'] = l1
        scores['l2'] = l2
        scores['tv'] = tv
        left = float(1 - tv)
        if left == 0: left = 1.
        scores['l1_ratio'] = float(l1) / left
    except Exception:
        pass
    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['recall_mean'] = r.mean()
    scores["auc"] = auc
    scores['pvalue_recall0_true_prob_one_sided'] = pvalue_recall0_true_prob
    scores['pvalue_recall1_true_prob_one_sided'] = pvalue_recall1_true_prob
    scores['pvalue_recall0_unknown_prob_one_sided'] = pvalue_recall0_unknown_prob
    scores['pvalue_recall1_unknown_prob_one_sided'] = pvalue_recall1_unknown_prob
    scores['pvalue_recall_mean'] = pvalue_recall_mean
    scores['prop_non_zeros_mean'] = float(np.count_nonzero(betas_t)) / \
                                    float(np.prod(betas.shape))
    scores['beta_r_bar'] = r_bar
    scores['beta_fleiss_kappa'] = fleiss_kappa_stat
    scores['beta_dice_bar'] = dice_bar

    scores['beta_dice'] = str(dices)
    scores['beta_r'] = str(R)

    if as_dataframe:
        scores = pd.DataFrame([list(scores.values())], columns=list(scores.keys()))

    return scores
Beispiel #50
0
#print(whippet_rescue)
#How many whippets are rescues?
num_whippet_rescues = np.sum(whippet_rescue == 1)

num_whippets = len(whippet_rescue)

#Use a hypothesis test to test the following null and alternative hypotheses:

#Null: 8% of whippets are rescues
#Alternative: more or less than 8% of whippets are rescues

#we have a sample of binary data (0/1) and we want to compare a sample frequency to a population value: whippets

#we will use a binom test

pval = binom_test(num_whippet_rescues, num_whippets, p=0.08)

print(pval)
#pval is greater than 0.05, so we fail to reject the null hypothesis

#Is there a significant difference in the average weights of these three mid-sized dog breeds?

wt_whippets = dogs_wtp.weight[dogs_wtp.breed == 'whippet']
wt_terriers = dogs_wtp.weight[dogs_wtp.breed == 'terrier']
wt_pitbulls = dogs_wtp.weight[dogs_wtp.breed == 'pitbull']

#Null: whippets, terriers, and pitbulls all weigh the same amount on average

#Alternative: whippets, terriers, and pitbulls do not all weigh the same amount on average (at least one pair of breeds has differing average weights)

fstat, pval = f_oneway(wt_whippets, wt_terriers, wt_pitbulls)
                    help='aura|babe|babe+aura|')

args = parser.parse_args()
n = args.active_collators
steps = np.arange(args.stake_steps, 1, args.stake_steps)
stall_steps = np.arange(args.stall_steps, 1, args.stall_steps)
number_of_trials = args.number_of_trials
results = np.zeros((len(stall_steps), len(steps)))
c = args.difficulty_parameter

for i, stall_percentage in enumerate(stall_steps):
    for j, stake in enumerate(steps):
        single_stall_probability = algo_switcher.get(
            args.algorithm, lambda: "Invalid algorithm")(stake)
        # round to an integer success count before the binomial test
        results[i][j] = binom_test(int(round(number_of_trials * stall_percentage)),
                                   number_of_trials, single_stall_probability,
                                   'greater')

labels = []
for stall in stall_steps:
    labels.append(
        str("%.2f" % (stall * 100)) +
        ('% of blocks proposed by honest actors'))

for y_arr, label in zip(results, labels):
    plt.plot(steps, y_arr, label=label)

plt.xlabel("Attacker controlled stake in active collator set")
plt.ylabel("Probability")
plt.legend()
plt.show()
Beispiel #52
0
n = 1  # number of independent trials
p = 0.4  # population parameter (success probability)

X = stats.binom.rvs(n=n, p=p, size=100)
print(X)

# 2) count the successes
cnt = np.count_nonzero(X)
print('number of successes =', cnt)  # number of successes = 41

# 3) binomial test
help(stats.binom_test)
# binom_test(x, n=None, p=0.5, alternative='two-sided')
# x: number of successes, n: number of trials, p: success probability under the null, alternative: two-sided test
n = 100  # size
pvalue = stats.binom_test(x=cnt, n=n, p=0.4, alternative='two-sided')
# 0.8388931714011669

# p-value vs significance level (alpha)
alpha = 0.05  # choose alpha: 0.95 = 1 - alpha

if pvalue >= alpha:
    print('accept the null hypothesis')
else:
    print('reject the null hypothesis')

#######################
## binomial test example
#######################
'''
Of the 150 successful applicants, 62 are male ...
'''
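The example above is cut off; a minimal sketch of the implied test, assuming the null hypothesis is a 50% male proportion (an assumed value, since the original text is truncated):

pvalue = stats.binom_test(x=62, n=150, p=0.5, alternative='two-sided')
print(pvalue)  # compare against alpha as above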
Beispiel #53
0
def scores(key, paths, config, as_dataframe=False, algo_idx=None):
    # print(key, paths)
    # key = 'enettv_0.1_0.5_0.1'
    # paths = ['5cv/cv00/refit/enetgn_0.1_0.9_0.1', '5cv/cv01/refit/enetgn_0.1_0.9_0.1', '5cv/cv02/refit/enetgn_0.1_0.9_0.1', '5cv/cv03/refit/enetgn_0.1_0.9_0.1', '5cv/cv04/refit/enetgn_0.1_0.9_0.1']
    key_parts = key.split("_")
    algo = key_parts[algo_idx] if algo_idx is not None else None
    if algo is not None:
        key_parts.remove(algo)
    if len(key_parts) > 0:
        try:
            params = [float(p) for p in key_parts]
        except ValueError:
            params = [None, None, None]
    print(algo, params)
    # accept runs with either the inner- or outer-CV fold count
    if (len(paths) != NFOLDS_INNER) and (len(paths) != NFOLDS_OUTER):
        print("Failed for key %s" % key)
        return None

    values = [mapreduce.OutputCollector(p) for p in paths]
    try:
        values = [item.load() for item in values]
    except Exception as e:
        print(e)
        return None

    y_true_splits = [item["y_true"].ravel() for item in values]
    y_pred_splits = [item["y_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true_splits)
    y_pred = np.concatenate(y_pred_splits)
    prob_pred_splits = [item["proba_pred"].ravel() for item in values]
    prob_pred = np.concatenate(prob_pred_splits)

    # Prediction performances
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)

    # balanced accuracy (recall_mean)
    bacc_splits = [
        recall_score(y_true_splits[f], y_pred_splits[f], average=None).mean()
        for f in range(len(y_true_splits))
    ]
    auc_splits = [
        roc_auc_score(y_true_splits[f], prob_pred_splits[f])
        for f in range(len(y_true_splits))
    ]

    print("bacc all - mean(bacc) %.3f" % (r.mean() - np.mean(bacc_splits)))
    # P-values
    success = r * s
    success = success.astype('int')
    prob_class1 = np.count_nonzero(y_true) / float(len(y_true))
    pvalue_recall0_true_prob = binom_test(success[0],
                                          s[0],
                                          1 - prob_class1,
                                          alternative='greater')
    pvalue_recall1_true_prob = binom_test(success[1],
                                          s[1],
                                          prob_class1,
                                          alternative='greater')
    pvalue_recall0_unknown_prob = binom_test(success[0],
                                             s[0],
                                             0.5,
                                             alternative='greater')
    pvalue_recall1_unknown_prob = binom_test(success[1],
                                             s[1],
                                             0.5,
                                             alternative='greater')
    pvalue_bacc = binom_test(success[0] + success[1],
                             s[0] + s[1],
                             p=0.5,
                             alternative='greater')

    # Beta's measures of similarity
    betas = np.hstack([item["beta"][penalty_start:, :] for item in values]).T

    # Correlation
    R = np.corrcoef(betas)
    R = R[np.triu_indices_from(R, 1)]
    # Fisher z-transformation / average
    z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
    # back-transform
    r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)

    # threshold betas to compute fleiss_kappa and DICE
    try:
        betas_t = np.vstack([
            array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
            for i in range(betas.shape[0])
        ])
        # Compute fleiss kappa statistics
        beta_signed = np.sign(betas_t)
        table = np.zeros((beta_signed.shape[1], 3))
        table[:, 0] = np.sum(beta_signed == 0, 0)
        table[:, 1] = np.sum(beta_signed == 1, 0)
        table[:, 2] = np.sum(beta_signed == -1, 0)
        fleiss_kappa_stat = fleiss_kappa(table)

        # Pairwise Dice coefficient
        ij = [[i, j] for i in range(betas.shape[0])
              for j in range(i + 1, betas.shape[0])]
        dices = list()
        for idx in ij:
            A, B = beta_signed[idx[0], :], beta_signed[idx[1], :]
            dices.append(
                float(np.sum((A == B)[(A != 0) & (B != 0)])) /
                (np.sum(A != 0) + np.sum(B != 0)))
        dice_bar = np.mean(dices)
    except Exception:
        # fall back so later references to dices and betas_t do not fail
        dice_bar = fleiss_kappa_stat = 0
        dices = []
        betas_t = np.zeros(betas.shape)

    # Proportion of selection within the support across the CV
    support_count = (betas_t != 0).sum(axis=0)
    support_count = support_count[support_count > 0]
    support_prop = support_count / betas_t.shape[0]

    scores = OrderedDict()
    scores['key'] = key
    scores['algo'] = algo
    scores['a'], scores['l1_ratio'], scores['tv_ratio'] = params

    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['bacc'] = r.mean()
    scores['bacc_se'] = np.std(bacc_splits) / np.sqrt(len(bacc_splits))
    scores["auc"] = auc
    scores['auc_se'] = np.std(auc_splits) / np.sqrt(len(auc_splits))
    scores['pvalue_recall0_true_prob_one_sided'] = pvalue_recall0_true_prob
    scores['pvalue_recall1_true_prob_one_sided'] = pvalue_recall1_true_prob
    scores['pvalue_recall0_unknown_prob_one_sided'] = pvalue_recall0_unknown_prob
    scores['pvalue_recall1_unknown_prob_one_sided'] = pvalue_recall1_unknown_prob
    scores['pvalue_bacc_mean'] = pvalue_bacc
    scores['prop_non_zeros_mean'] = float(np.count_nonzero(betas_t)) / \
                                    float(np.prod(betas.shape))
    scores['beta_r_bar'] = r_bar
    scores['beta_fleiss_kappa'] = fleiss_kappa_stat
    scores['beta_dice_bar'] = dice_bar
    scores['beta_dice'] = str(dices)
    scores['beta_r'] = str(R)
    scores['beta_support_prop_select_mean'] = support_prop.mean()
    scores['beta_support_prop_select_sd'] = support_prop.std()

    if as_dataframe:
        scores = pd.DataFrame([list(scores.values())],
                              columns=list(scores.keys()))

    return scores
Beispiel #54
0
def pval_all(mlen, sequences, order=1, dist="binomial"):
    threshold = 0
    seq_name = ""

    with open(sequences, "r") as f:
        text = f.read()
        text = text.replace(' ', '')
        text = text.split("\n")
    file_length = len(text)
    ln_num = -1

    num_seq = 0
    num_spots = 0
    seq_matched = {}

    while ln_num < file_length - 1:
        # retrieve line
        ln_num += 1
        line = text[ln_num]

        if len(line) == 0:
            # ignore blank lines
            continue

        if line[0] == '>':
            # beginning of new sequence, save sequence name
            seq_name = line[1:]
            continue

        # retrieve sequence
        sequence = ""
        while ln_num < file_length and len(text[ln_num]) != 0:
            sequence += text[ln_num]
            ln_num += 1

        num_seq += 1
        seq_len = len(sequence)

        # search sequence
        pos = 0
        while pos <= seq_len - mlen:  # include the final window of length mlen
            num_spots += 1
            match = sequence[pos:pos + mlen]
            seq_matched[match] = seq_matched.get(match, 0) + 1
            pos += 1

    matches = sorted(seq_matched, key=seq_matched.get, reverse=True)

    pvals = {}

    n = len(seq_matched)

    mkv_res = pval_markov('A' * mlen, sequences, order)
    mkv = mkv_res[1]
    bg_model = mkv_res[2]

    for match in matches:
        # print '   '+str(matches[i])+' ('+str(seq_matched[matches[i]])+')'
        k = seq_matched[match]
        # print('pval: k='+str(k)+'\tn='+str(n)+'\tP='+str(p))

        p_hat = markov2pval(match, mkv, bg_model, order)
        pvals[match] = stats.binom_test(k, n, p_hat, alternative='greater')

    return pvals
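A hypothetical call, assuming the module's pval_markov and markov2pval helpers are available and that 'promoters.fasta' is a FASTA file of sequences:

pvals = pval_all(8, "promoters.fasta", order=1)
for motif, p in sorted(pvals.items(), key=lambda kv: kv[1])[:10]:
    print(motif, p)  # ten most significantly enriched 8-mers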
# **<font color='red'> Question 3. </font>What is the p-value when testing the hypothesis described above?**

# In[20]:

has_two_similar = (np.array(num_unique_sites) < 10).astype('int')
has_two_similar

# The null hypothesis under test:
# - $H_0:$ the share of sessions with repeated sites = 95%. Against the alternative:
# - $H_1:$ the share of such sessions is greater than 95%

# In[21]:

stats.binom_test(has_two_similar.sum(),
                 len(num_unique_sites),
                 p=0.95,
                 alternative='greater')

# In[22]:

import statsmodels.stats.proportion as psts
psts.proportions_ztest(has_two_similar.sum(),
                       len(has_two_similar),
                       value=0.95,
                       prop_var=0.95,
                       alternative='larger')

# **<font color='red'> Question 4. </font>What is the 95% Wilson confidence interval for the proportion of cases in which a user revisited some site (from item 3)?**

# In[23]:
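The cell itself is not shown; a sketch of the Wilson interval via statsmodels, assuming the same has_two_similar array as above:

from statsmodels.stats.proportion import proportion_confint

wilson_ci = proportion_confint(has_two_similar.sum(),
                               len(has_two_similar),
                               alpha=0.05,
                               method='wilson')
print(wilson_ci)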
Beispiel #56
0
## Print
print('Rottweiler Tail Mean: {0}'.format(str(rottweiler_tail_mean)))
print('Rottweiler Tail STD: {0}'.format(str(rottweiler_tail_std)))

## Get whippet rescue data
whippet_rescue = fetchmaker.get_is_rescue('whippet')
num_whippet_rescue = np.count_nonzero(whippet_rescue)
num_whippets = np.size(whippet_rescue)

## Print
print('Whippet Rescues: {0}'.format(str(num_whippet_rescue)))
print('Total Whippets: {0}'.format(str(num_whippets)))

## Binomial test of whippet rescue rate
whippet_pval = binom_test(num_whippet_rescue, n=num_whippets, p=0.08)

## Print
print('Whippet Rescue P-Value: {0}'.format(str(whippet_pval)))
if whippet_pval < 0.05:
    print('It is unlikely that the whippet rescue rate is 8%')
else:
    print('There is no evidence that the whippet rescue rate differs from 8%')

## Comparison of whippets, terriers, pitbulls
## Load data for each
whippets = fetchmaker.get_weight('whippet')
terriers = fetchmaker.get_weight('terrier')
pitbulls = fetchmaker.get_weight('pitbull')

## ANOVA comparing weight between the three dog types
num_visits = len(abdata)
print("Number of visits: ",num_visits)
# Calculate the purchase rate needed at 0.99
num_sales_needed_099 = 1000 / 0.99
print("# of $0.99 sales to breakeven: ",num_sales_needed_099)
# Print the purchase rate needed at 0.99
p_sales_needed_099 = num_sales_needed_099/num_visits
print("Purchase rate of $0.99 sales to breakeven: ", p_sales_needed_099)
# Calculate the purchase rate needed at 1.99
num_sales_needed_199 = 1000/1.99
p_sales_needed_199 = num_sales_needed_199/num_visits
# Print the purchase rate needed at 1.99
print("Purchase rate of $1.99 sales to breakeven: ", p_sales_needed_199)
# Calculate the purchase rate needed at 4.99
num_sales_needed_499 = 1000/4.99
p_sales_needed_499 = num_sales_needed_499/num_visits
# Print the purchase rate needed at 4.99
print("Purchase rate of $4.99 sales to breakeven: ",
p_sales_needed_499)
# Calculate samp size & sales for 0.99 price point
samp_size_099 = np.sum(abdata.group == 'A')
sales_099 = np.sum((abdata.group == 'A') & (abdata.is_purchase == 'Yes'))
# Print samp size & sales for 0.99 price point
print(samp_size_099)
print(sales_099)
# Import the binom_test module and Calculate the p-value for Group A
from scipy.stats import binom_test
pvalueA = binom_test(sales_099, n=samp_size_099, p=p_sales_needed_099, alternative='greater')
# Print the p-value for Group A
print(pvalueA)  # p-value is above 0.05, so the observed purchase rate is not significantly greater
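Only group A is shown above; a sketch of the analogous one-sided tests for the other price points, assuming groups 'B' and 'C' correspond to the $1.99 and $4.99 variants:

samp_size_199 = np.sum(abdata.group == 'B')
sales_199 = np.sum((abdata.group == 'B') & (abdata.is_purchase == 'Yes'))
pvalueB = binom_test(sales_199, n=samp_size_199, p=p_sales_needed_199, alternative='greater')
print(pvalueB)

samp_size_499 = np.sum(abdata.group == 'C')
sales_499 = np.sum((abdata.group == 'C') & (abdata.is_purchase == 'Yes'))
pvalueC = binom_test(sales_499, n=samp_size_499, p=p_sales_needed_499, alternative='greater')
print(pvalueC)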
Beispiel #58
0
import sys
from scipy.stats import binom_test

successes = int(sys.argv[1])
trials = int(sys.argv[2])

desired_success_rate = int( sys.argv[ 3 ])/100 if len( sys.argv ) >= 4 else 90/100
desired_confidence = int( sys.argv[ 4 ]) if len( sys.argv ) >= 5 else 95

p_value_pass = binom_test( successes, trials, desired_success_rate, alternative="greater" )
p_value_fail = binom_test( successes, trials, desired_success_rate, alternative="less" )
result_pass = ( 1 - desired_confidence / 100 ) > p_value_pass
result_fail = ( 1 - desired_confidence / 100 ) > p_value_fail
print( result_pass )
print( result_fail )

print( "\ndesired confidence is: {0}".format( desired_confidence ) )
print( "p value for success is: %f" % p_value_pass )
print( "p value for failure is: %f" % p_value_fail )

print( "\nWhat’s the result?" )

if result_pass:
    print( "They passed the test! We can be confident that they meet the desired accuracy rate." )
elif result_fail:
    print( "They failed the test! We can be confident that they don’t meet the desired accuracy rate." )
else:
    print( "Need more trials..." )
Beispiel #59
0
            dPVal = np.nan

        else:

            iOverlappingPeaks = (dfResults[strAnnot] >= 1).sum()
            iOverlapBP = dfResults[strAnnot].sum()
            iTotalPeaks = len(dfResults.index)
            dPctPeaksOverlap = iOverlappingPeaks / float(iTotalPeaks)
            dAnnotToGenomeRatio = dictAnnotSize[strAnnot] / float(iGSize)
            dEnrichment = dPctPeaksOverlap / dAnnotToGenomeRatio

            dfOLData = dictAnnotOL[strAnnot]
            iAnnotPeaksWithOL = (dfOLData[strPeaks] >= 1).sum()

            dPVal = stats.binom_test(n=iTotalPeaks,
                                     x=iOverlappingPeaks,
                                     p=dAnnotToGenomeRatio)
        # Append rows to dataframe here.
        dfPV.loc[i] = [
            strAnnot, dEnrichment, dPVal, iOverlappingPeaks, iOverlapBP,
            iTotalPeaks, dPctPeaksOverlap, dictIntervalsSize[strAnnot],
            iAnnotPeaksWithOL, dictAnnotSize[strAnnot], dAnnotToGenomeRatio,
            iGSize
        ]
        i += 1

        #print iAnnotOverlap,iTotalPeakLen,dAnnotToGenomeRatio
        #print dictAnnotSize[strAnnot],iSizeEffGenome
        #print stats.binom_test(n = iTotalPeakLen,x=iAnnotOverlap, p=dAnnotToGenomeRatio)
    iTotalCoverage = (dfResults["end"] - dfResults["start"]).sum()
    dfPV['Annotation'] = dfPV['Annotation'].map(lambda x: os.path.basename(x))
Beispiel #60
0
    def _parse(self, vcffile):
        """ parses a vcffile.
		stores a pandas data frame, with one row per roipsn/roiname combination, in self.bases
		
		Arguments:
			vcffile: the vcf file to parse
			
		Returns:
			None
			Output is stored in self.bases
		"""

        # set up variable for storing output
        resDict = {}
        nAdded = 0
        self.region_stats = None
        self.bases = None

        # transparently handle vcf.gz files; open in binary so line.decode() below
        # works for both the gzip and the plain-text branch.
        if vcffile.endswith('.gz'):
            f = gzip.open(vcffile, "rb")
        else:
            f = open(vcffile, "rb")

        # precompute a sorted list of positions to look for
        # in an efficient data structure
        sought_psns = deque(sorted(self.psn2roi.keys()))

        try:
            sought_now = sought_psns.popleft()

            # iterate over the vcf file
            for line in f:
                line = line.decode()
                if line[0] == "#":
                    continue  # it is a comment; go to next line;
                if "INDEL" in line:
                    continue  #this is not needed because ours don't contain INDELs anymore; go to next line;

                # parse the line.
                chrom, pos, varID, ref, alts, score, filterx, infos, fields, sampleInfo = line.strip(
                ).split()
                pos = int(pos)

                if pos == sought_now:

                    alts = alts.split(",")
                    infos = dict(item.split("=") for item in infos.split(";"))

                    # confirm the self.infos tag is present.
                    try:
                        baseCounts4 = list(
                            map(int, infos[self.infotag].split(
                                ",")))  #get frequencies of high quality bases
                    except KeyError:
                        raise KeyError(
                            "Expected a tag {0} in the 'info' component of the call file, but it was not there.  Keys present are: {1}"
                            .format(self.infotag, infos.keys()))

                    # extract the baseCounts, and do QC
                    baseFreqs = baseCounts4.copy()
                    baseFreqs.sort(reverse=True)
                    if not len(baseFreqs) == 4:
                        raise TypeError(
                            "Expected tag {0} to contain 4 depths, but {1} found.  Position = {2}; tag contents are {3}"
                            .format(self.infotag, len(baseFreqs), pos,
                                    baseCounts4))
                    depth = sum(baseCounts4)

                    # compute probability that the minor variant frequency differs from self.expectedErrorRate from exact binomial test
                    if (
                            baseFreqs[0] < depth and depth > 0
                    ):  # the majority base is not the only base AND depth is more than 0;
                        pvalue = stats.binom_test(
                            x=baseFreqs[1], n=depth, p=self.expectedErrorRate
                        )  # do the test if any variation
                    elif baseFreqs[0] == depth:
                        pvalue = 1  # there is only one base
                    elif depth == 0:
                        pvalue = None  # can't tell, no data
                    else:
                        raise RuntimeError(
                            "Logical error: should never reach this point {0} {1}"
                            .format(baseFreqs[0], depth))

                    if pvalue == 0:
                        mlp = 250  # code minus log p as 250 if p value is recorded as 0 in float format
                    elif pvalue is not None:
                        mlp = -math.log(pvalue, 10)
                    elif pvalue is None:
                        mlp = None

                    # store output in a dictionary
                    if depth > 0:
                        maf = float(baseFreqs[1]) / float(depth)
                    else:
                        maf = None

                    for roi_name in self.psn2roi[sought_now]:
                        nAdded += 1
                        resDict[nAdded] = {'roi_name':roi_name, 'pos':pos, 'ref':ref, 'depth':depth,\
                           'base_a':baseCounts4[0],
                           'base_c':baseCounts4[1],
                           'base_g':baseCounts4[2],
                           'base_t':baseCounts4[3], \
                           'maf':maf,
                           'mlp':mlp}

                    # recover the next item to recover
                    try:
                        sought_now = sought_psns.popleft()
                    except IndexError:  # no positions selected
                        break  # all positions have been selected

        except IndexError:  # no positions defined for selection; this is allowed
            pass

        # construct data frame
        self.bases = pd.DataFrame.from_dict(resDict, orient='index')

        # construct summary by region, defined by roi_name
        if len(self.bases.index) > 0:
            r1 = self.bases.groupby(
                ['roi_name'])['depth'].mean().to_frame(name='mean_depth')
            r2 = self.bases.groupby(
                ['roi_name'])['depth'].min().to_frame(name='min_depth')
            r3 = self.bases.groupby(
                ['roi_name'])['depth'].max().to_frame(name='max_depth')

            r4 = self.bases.groupby(['roi_name'
                                     ])['pos'].min().to_frame(name='start')
            r5 = self.bases.groupby(['roi_name'
                                     ])['pos'].max().to_frame(name='stop')
            r6 = self.bases.groupby(['roi_name'
                                     ])['pos'].count().to_frame(name='length')

            # if all mafs are NA, then mean() will fail with a pandas.core.base.DataError
            try:
                r8 = self.bases.groupby(
                    ['roi_name'])['maf'].mean().to_frame(name='mean_maf')
            except pd.core.base.DataError:
                r8 = r1.copy()
                r8.columns = ['mean_maf']
                r8['mean_maf'] = None

            # compute total depth
            r9 = self.bases.groupby(
                ['roi_name'])['depth'].sum().to_frame(name='total_depth')

            # compute total_nonmajor_depth
            self.bases['most_common'] = self.bases[[
                'base_a', 'base_c', 'base_g', 'base_t'
            ]].max(axis=1)
            self.bases[
                'nonmajor'] = self.bases['depth'] - self.bases['most_common']
            r10 = self.bases.groupby([
                'roi_name'
            ])['nonmajor'].sum().to_frame(name='total_nonmajor_depth')

            df = pd.concat([r1, r2, r3, r4, r5, r6, r8, r9, r10],
                           axis=1)  # in R,  this is a cbind operation
        else:
            df = None
        self.region_stats = df
        f.close()