Example #1
def bandwidth_suppression_from_peak(tuningDict, subtractBaseline=False):
    spikeArray = tuningDict['responseArray']
    baselineSpikeRate = tuningDict['baselineSpikeRate']
    spikeCountMat = tuningDict['spikeCountMat']
    
    suppressionIndex = np.zeros(spikeArray.shape[1])
    facilitationIndex = np.zeros_like(suppressionIndex)
    
    suppressionpVal = np.zeros_like(suppressionIndex)
    facilitationpVal = np.zeros_like(suppressionIndex)
            
    if not subtractBaseline:
        baselineSpikeRate = 0
    
    for ind in range(len(suppressionIndex)):    
        suppressionIndex[ind] = (max(spikeArray[:,ind])-spikeArray[:,ind][-1])/(max(spikeArray[:,ind])-baselineSpikeRate)
        facilitationIndex[ind] = (max(spikeArray[:,ind])-spikeArray[:,ind][0])/(max(spikeArray[:,ind])-baselineSpikeRate)

        trialsThisSeconsVal = tuningDict['trialsEachCond'][:,:,ind]
        peakInd = np.argmax(spikeArray[:,ind])
        
        peakSpikeCounts = spikeCountMat[trialsThisSeconsVal[:,peakInd]].flatten()
        whiteNoiseSpikeCounts = spikeCountMat[trialsThisSeconsVal[:,-1]].flatten()
        pureToneSpikeCounts = spikeCountMat[trialsThisSeconsVal[:,0]].flatten()
        
        suppressionpVal[ind] = stats.ranksums(peakSpikeCounts, whiteNoiseSpikeCounts)[1]
        facilitationpVal[ind] = stats.ranksums(peakSpikeCounts, pureToneSpikeCounts)[1]
        
    
    suppressionDict = {'suppressionIndex':suppressionIndex,
                       'suppressionpVal':suppressionpVal,
                       'facilitationIndex':facilitationIndex,
                       'facilitationpVal':facilitationpVal}
    
    return suppressionDict
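For intuition on the index formulas above (hypothetical numbers): with subtractBaseline=True, a tuning curve that peaks at 20 spk/s, falls to 5 spk/s at the widest bandwidth, and has a 2 spk/s baseline gives suppressionIndex = (20 - 5) / (20 - 2) ≈ 0.83, i.e. most of the above-baseline response is suppressed at full bandwidth.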
Example #2
def testNTDifference(oFN, minAvgPhast = .90, minSNR = 2, oRNAType = 'oRNA'):

        oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
        oNX.load(['phastScores', 'snrSS'])
        
        groupA = [10,11,12,13]
        #groupB = [15,16,17,18]
        groupB = [4,5,6,7]
        
        a = []
        b = []

        for oID in oNX.phastScores:

                avgScore = sum(oNX.phastScores[oID]) / float(len(oNX.phastScores[oID]))
                
                #filter
                if (avgScore < float(minAvgPhast)) or (oNX.snrSS[oID] < float(minSNR)):
                        continue
                
                if avgScore == 1.00: continue
                for i, pScore in enumerate(oNX.phastScores[oID]):
                        if (i + 1) in groupA:
                                a.append(pScore)

                        if (i + 1) in groupB:
                                b.append(pScore)
        
        print len(a)/4, len(b)/4
        print stats.ranksums(a,b)                                
Example #3
def WilcoxonTest(Original_input, Symbiotic_output, Modified_GA_output):
    '''
    Returns which of Symbiotic_output and Modified_GA_output is most
    similar to Original_input, using the Wilcoxon rank-sum test.
    Args:
        Original_input: Data with N features
        Symbiotic_output: Data with N-1 features extracted using the
                          symbiotic algorithm
        Modified_GA_output: Data with N-1 features extracted using the
                            modified genetic algorithm
    Returns:
        1 if Modified_GA_output fits better, 0 if Symbiotic_output fits
        better, or -1 if both p-values underflow
    '''
    z_stat_for_symbiotic, p_val_for_symbiotic = stats.ranksums(
                            Symbiotic_output, Original_input)
    z_stat_for_GA, p_val_for_GA = stats.ranksums(
                            Modified_GA_output, Original_input)
    print p_val_for_symbiotic, p_val_for_GA
    if max(p_val_for_GA, p_val_for_symbiotic) < 1e-300:
        return -1
    if (p_val_for_GA > p_val_for_symbiotic): 
        print "Forest one is better"
        return 1
    else: 
        print "Symbiotic is better"
        return 0
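A quick way to exercise WilcoxonTest is with synthetic arrays; this sketch (hypothetical data, not from the original project) shifts one candidate's location so the rank-sum test can tell the two apart:

import numpy as np

np.random.seed(0)
original = np.random.normal(size=100)
symbiotic = original + np.random.normal(scale=0.1, size=100)  # same location as the original
modified_ga = original + 1.0                                  # location shifted away
WilcoxonTest(original, symbiotic, modified_ga)  # expected to print "Symbiotic is better"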
Example #4
def rank_sum_3_sites(measurements, details=False):
    output = [1, 2, 3, 4, 5, 6, 7, 8]
    done = False
    while not done:
        done = True
        for i in range(len(measurements) - 1):
            if ranksums(measurements[output[i] - 1], measurements[output[i+1] - 1])[0] < 0:
                output[i], output[i+1] = output[i+1], output[i]
                done = False
    if details:
        for i in range(len(measurements) - 1):
            print(ranksums(measurements[output[i] - 1], measurements[output[i + 1] - 1]))
    return output
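For illustration, a hedged sketch of calling rank_sum_3_sites on eight hypothetical measurement samples, one per genotype, whose true means increase with the index (assumes ranksums is imported as in the snippet):

import numpy as np

np.random.seed(1)
measurements = [np.random.normal(loc=i, size=30) for i in range(8)]
print(rank_sum_3_sites(measurements))  # roughly [8, 7, 6, 5, 4, 3, 2, 1]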
Example #5
    def getCompArray(self, datasetA, datasetB, plot):
        warnings.filterwarnings("error")
        aggregateDataA, offScreen = datasetA.getAggregateData()
        aggregateDataB, offScreen = datasetB.getAggregateData()
        results = []
        # get x, y magnitude of difference between sets, and significance
        for i in range(self.params['gridWidth']):
            for j in range(self.params['gridHeight']):
                # get two arrays for given plot
                setA = aggregateDataA[i][j].getResult(plot)
                setB = aggregateDataB[i][j].getResult(plot)
                # only compare if mean counts of both are greater than one 
                if st.nanmean(aggregateDataA[i][j].getResult(0)) > 1 or st.nanmean(aggregateDataB[i][j].getResult(0)) > 1:
                    # print str(i) + ", " + str(j) + ":  " + str(st.nanmean(setA))
                    try:
                        mww_z, p = stats.ranksums(setA, setB)
                    except UserWarning:
                        p = numpy.nan

                    results.append((st.nanmean(setA), st.nanmean(setB), p))
                else:
                    # print str(i) + ", " + str(j) + ":  " + str(0)
                    results.append((numpy.nan, numpy.nan, numpy.nan))
                    
        return results
Example #6
    def printBoxData(self, datasets, boxCoord, plot):
        print "Box " + str(boxCoord)
        means = []
        print "Mean, StdDev, n"
        for ds in datasets:
            alldata = ds.getAggregateDataAsArray(plot)
            boxdata = alldata[boxCoord[0]][boxCoord[1]]
            means.append(st.nanmean(boxdata))
            print str(st.nanmean(boxdata)) + ", " + str(numpy.std(boxdata)) + ", " + str(len(boxdata))
        print "-----"
        print str(st.nanmean(means)) + ", " + str(numpy.std(means)) + ", " + str(len(means))

        for i in range(len(datasets)):
            dsA = datasets[i]
            alldata = dsA.getAggregateDataAsArray(plot)
            boxdata = alldata[boxCoord[0]][boxCoord[1]]
            for j in range(len(datasets))[i+1:]:
                dsB = datasets[j]
                alldataB = dsB.getAggregateDataAsArray(plot)
                boxdataB = alldataB[boxCoord[0]][boxCoord[1]]
                try:
                    mww_z, p = stats.ranksums(boxdata, boxdataB)
                except UserWarning:
                    p = 1

                if p <= 0.05:
                    print "Difference between " + dsA.label + " and " + dsB.label + ".  p = " + str(p)
                else:
                    print "Nothing between " + dsA.label + " and " + dsB.label + "(p=" + str(p) + ")"
Example #7
def stats(d_lengths,dn,):

    for bool_skip in [False,True,]:

        even = []
        odd = []
        for dist_min in d_lengths.keys():
            for len_diff in d_lengths[dist_min].keys():
                if bool_skip == True:
                    if len_diff == 1: continue
                if len_diff % 2 == 0:
                    even += d_lengths[dist_min][len_diff]*[dist_min]
                else:
                    odd += d_lengths[dist_min][len_diff]*[dist_min]

        import scipy
        from scipy import stats

        u,p = stats.mannwhitneyu(even,odd)
        fd = open('stats','a')
        fd.write('mannwhitneyu u %s p %s %s %s\n' %(u,p,dn,bool_skip))
        fd.close()
        
        z,p = stats.ranksums(even,odd)
        fd = open('stats','a')
        fd.write('ranksums z %s p %s %s %s\n' %(z,p,dn,bool_skip))
        fd.close()

        average_even = sum(even)/len(even)
        average_odd = sum(odd)/len(odd)
        fd = open('stats','a')
        fd.write('average even %s odd %s %s %s\n' %(average_even,average_odd,dn,bool_skip))
        fd.close()
    
    return
Example #8
def compute_ranksum_p(start_gs, last_gs):
    res = {}
    with gzip.open(gene_sets_discrete, "r") as infile:
        gs = infile.readlines()
        for line in gs[start_gs:min(last_gs, len(gs))]:
            words = line.strip().split("\t")
            gs_genes = words[2].split("|")

            # Stratify
            gs_gene_scores = []
            other_genes_scores = []
            for g in reconstituted_gene_sets_df.index:
                if g in gs_genes:
                    gs_gene_scores.append(reconstituted_gene_sets_df.iloc[reconstituted_gene_sets_df.index.get_loc(g), reconstituted_gene_sets_df.columns.get_loc(words[0])])
                else:
                    other_genes_scores.append(reconstituted_gene_sets_df.iloc[reconstituted_gene_sets_df.index.get_loc(g), reconstituted_gene_sets_df.columns.get_loc(words[0])])

            # Test
            z, p1 = ranksums(gs_gene_scores, other_genes_scores)
            t, p2 = ttest_ind(gs_gene_scores, other_genes_scores, equal_var=False)
            print "{}: gs_median={}, other_median={}, p_utest={} p_ttest={}".format(words[0], numpy.median(gs_gene_scores), numpy.median(other_genes_scores), p1, p2)
            res[words[0]] = p1

        # Write to file
        with open("{}_{}_{}.tab".format(outfile_prefix, start_gs, last_gs), "w") as f:
            for gs_name in res:
                f.write("{}\t{}\n".format(gs_name, res[gs_name]))
Example #9
    def testRankSum(self, ctrlData, expData):

        result = []
        for k in range(ctrlData.shape[1]):
            result.append(ranksums(ctrlData[:, k], expData[:, k])[self.index])

        return result
Example #10
def ROC_base(X,I,m,n,cat0,cat1,y):
    if y == []:
        y =  [0] * m + [1] * n
    y2 = []
    for j in range(len(y)):
        if y[j] == 0:
            y2.append(0)
        if y[j] == 1:
            y2.append(1)
    res = []
    for i in range(len(X)):
        x = X[i]
        x2 = []
        x2_cat0 = []
        x2_cat1 = []
        for j in range(len(x)):
            if y[j] == 0:
                x2.append(x[j])
                x2_cat0.append(x[j])
            if y[j] == 1:
                x2.append(x[j])
                x2_cat1.append(x[j])
        Wilcoxon = ranksums(x2_cat0,x2_cat1) #mannwhitneyu(x, y, use_continuity=True) 
        res.append(ROC_ligne(x2,m,n,cat0,cat1,y=y2)[4:] + I[i] + [Wilcoxon[1]] + non_nul(X,y)[i])
    res2 = sorted(res, reverse=True)
    return res2
Example #11
def do_significance_test(tpx_feature, test="Wilcoxon Ranksum"):
	"""
	Do significance testing to see if the two distributions differ significantly.
	If p <= 0.05, we are highly confident that the distributions differ significantly.
	
	Arguments:
	tpx_feature (string): Name of the temporal expression feature to test
	test (string): which test to do: Wilcoxon Ranksum or Mann Whitney U
	"""

	md_table = pd.DataFrame.from_csv(os.path.join(wdir, md_csv), header=0)
	ht_table = pd.DataFrame.from_csv(os.path.join(wdir, "tpx-corpus-counts.csv"), header=0)
	working_table = ht_table.join(md_table)

	# get data points
	data = copy.copy(working_table[tpx_feature])

	# get ids of historical novels
	idnos_hist = md_table[md_table["subgenre_hist"] == "historical"].index.tolist()
	# get ids of non-historical novels
	idnos_not_hist = md_table[md_table["subgenre_hist"] == "not_historical"].index.tolist()

	# split data into subgroups
	data_hist = data[idnos_hist]
	data_not_hist = data[idnos_not_hist]

	if test == "Mann Whitney":
		test_stat = stats.mannwhitneyu(data_hist, data_not_hist)
	else:
		# do Wilcoxon Ranksum by default
		test_stat = stats.ranksums(data_hist, data_not_hist)
	return test_stat
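Since ranksums and mannwhitneyu are closely related, a minimal standalone sketch (synthetic data, not the corpus above) shows how their p-values line up:

import numpy as np
from scipy import stats

np.random.seed(42)
data_a = np.random.normal(loc=0.0, size=50)
data_b = np.random.normal(loc=0.5, size=50)

print(stats.ranksums(data_a, data_b))                               # Wilcoxon rank-sum, two-sided
print(stats.mannwhitneyu(data_a, data_b, alternative="two-sided"))  # equivalent U-statistic form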
Example #12
def append_wilcoxmann(df,columns,multitest):
	pvalcols = []
	groups = [ratcol+"_ratio" for ratcol in design.run.ratios]
	cntr=0
	for col in columns:
		pvals = []
		data = df[col].values
		for vals in data:
			pvals.append(ranksums([v for v in vals],[item for sublist in data for item in sublist])[1])
		df.insert(len(df.columns),groups[cntr].replace("_ratio","")+"^wmannpvals",pvals)
		pvalcols.append(groups[cntr].replace("_ratio","")+"^wmannpvals")
		cntr+=1
	log("P-values calculated for following groups: "+str(columns),1)

	if multitest:
		combined=[]
		for row in df[pvalcols].get_values():
			combined.append(combine_pvalues(row,method='fisher', weights=None)[1])
		log("Fisher combined p_value test completed",1)
		df.insert(len(df.columns),'fisher_combined_wmannpval',combined)
		bh_corrected = bh_correct(dict(zip(df.index.values,df['fisher_combined_wmannpval'].values)))
		corrected_vals = []
		for k in df.index.values:
			corrected_vals.append(bh_corrected[k])
		df.insert(len(df.columns),'benj_hoch_corrected_wmannpval',corrected_vals)
		log("Benjamini-hochberg correction successfully applied to combined p-values",1)

	return df
Example #13
def select_feature(x,y):
    for i in range(0, feature_number):
        temp0 = x[y==0,i]
        temp1 = x[y==1,i]
        pvalues[i] = ranksums(temp0,temp1 )[1]
    top_n_index = sorted(range(len(feature_name)), key=lambda i: pvalues[i])[0:top_n]
    return top_n_index
Example #14
def bandwidth_suppression_by_bins(tuningDict, lowBandInds=[1,2], highBandInds=[5,6], subtractBaseline=False):
    spikeArray = tuningDict['responseArray']
    spikeCountMat = tuningDict['spikeCountMat']
    baselineSpikeRate = tuningDict['baselineSpikeRate']
    
    if not subtractBaseline:
        baselineSpikeRate = 0
    
    suppressionIndex = np.zeros(spikeArray.shape[1])
    suppressionpVal = np.zeros_like(suppressionIndex)
    
    for ind in range(len(suppressionIndex)):
        trialsThisSeconsVal = tuningDict['trialsEachCond'][:,:,ind]
        
        lowBandSpikeCounts = []
        for lowInd in lowBandInds:
            thisBinCounts = spikeCountMat[trialsThisSeconsVal[:,lowInd]].flatten()
            lowBandSpikeCounts.extend(thisBinCounts)
            
        highBandSpikeCounts = []
        for highInd in highBandInds:
            thisBinCounts = spikeCountMat[trialsThisSeconsVal[:,highInd]].flatten()
            highBandSpikeCounts.extend(thisBinCounts)
            
        suppressionIndex[ind] = (np.mean(lowBandSpikeCounts)-np.mean(highBandSpikeCounts))/(np.mean(lowBandSpikeCounts)+np.mean(highBandSpikeCounts)-2*baselineSpikeRate)
        suppressionpVal[ind] = stats.ranksums(lowBandSpikeCounts, highBandSpikeCounts)[1]
    
    suppressionDict = {'suppressionIndex':suppressionIndex,
                       'suppressionpVal':suppressionpVal}
    
    return suppressionDict
Example #15
def rank_sum_n_sites(measurements, details=False):
    if math.frexp(len(measurements))[0] != 0.5:
        print("rank_sum_n_sites received an input of length %s, which is not a power of two "
              "(one measurement set per genotype). Quitting." % len(measurements))
        sys.exit()
    output_indices = []
    for genotype in measurements:
        output_indices.append(measurements.keys().index(genotype))
    done = False
    while not done:
        done = True
        for i in range(len(measurements) - 1):
            if ranksums(measurements[measurements.keys()[output_indices[i]]],
                        measurements[measurements.keys()[output_indices[i+1]]])[0] < 0:
                output_indices[i], output_indices[i + 1] = output_indices[i + 1], output_indices[i]
                done = False
    output = []
    output_look_good = []
    number_loci = 0
    for index in output_indices:
        output.append(measurements.keys()[index])
        if len(measurements.keys()[index]) > number_loci:
            number_loci = len(measurements.keys()[index])
    for index in output_indices:
        output_look_good.append(genotype_look_good(measurements.keys()[index], number_loci))
    output_detailed = []
    for genotype in output:
        fitness = measurements[genotype][1:]
        output_detailed.append([genotype, np.mean(fitness)])
    if not details:
        return output
    else:
        return output_detailed
Example #16
def computeRankSumZvalsPvals(errRates, lowIsBetter=True):
    ranks = computeRanks(errRates, onlyFullRows=False)

    # compute the ranked sums test p-value between different classifiers
    numClassifiers = errRates.shape[1]
    dims = (numClassifiers, numClassifiers)
    zvals = np.empty(dims)
    pvals = np.empty(dims)
    for i in range(numClassifiers):
        zvals[i, i] = 0
        pvals[i, i] = 1
        for j in range(i+1, numClassifiers):
            x = errRates.iloc[:, i]
            y = errRates.iloc[:, j]

            # compare using all datasets they have in common
            rowsWithoutNans = np.invert(np.isnan(x) + np.isnan(y))
            x = x[rowsWithoutNans]
            y = y[rowsWithoutNans]

            zvals[i, j], pvals[i, j] = ranksums(y, x) # cols are indep var
            zvals[j, i], pvals[j, i] = -zvals[i, j], pvals[i, j]

    classifierNames = ranks.columns.values
    zvals = pd.DataFrame(data=zvals, index=classifierNames,
        columns=classifierNames)
    pvals = pd.DataFrame(data=pvals, index=classifierNames,
        columns=classifierNames)
    return zvals, pvals
Example #17
def calc_ranksum():
    atributos = ['tteste', 'ttreinamento', 'precisao']
    for dataset in DATASETS:
        for atributo in atributos:
            for f1,f2 in combinations(FUNCOES, 2):
                d1 = np.array([i[atributo] for i in DADOS if i['dataset_file'] == dataset and i['function_path'] == f1])
                d2 = np.array([i[atributo] for i in DADOS if i['dataset_file'] == dataset and i['function_path'] == f2])
                print ','.join([str(s) for s in [dataset,atributo,f1,f2,ranksums(d1,d2)[0]]])
Example #18
def rank_sums(features1, features2, **_):
    """
    Wilcoxon rank-sum test on two samples of feature values.

    :param features1: first sample of feature values
    :param features2: second sample of feature values
    :param _: unused keyword arguments
    :return: (statistic, p-value)
    """
    return stats.ranksums(features1, features2)
Example #19
def plot_histogram(histogram, html_writer, title='', max_pathway_length=8, xmin=None, xlim=20, error_bars=True, min_to_show=20, legend_loc='upper left'):
    fig = pylab.figure()

    pylab.hold(True)

    reps = 1000
    
    y_offset = 0
    offset_step = 0.007
    colors = {1:'r', 2:'orange', 3:'green', 4:'cyan', 5:'blue', 'Rest':'violet', 'Not first':'k--', 'No known regulation':'grey', 'Activated':'green', 'Inhibited':'r', 'Mixed regulation':'blue'}
    for key, value in histogram.iteritems():
        if len(value) >= min_to_show:
            m = stats.cmedian(value)
            
            sample_std = None
            
            if error_bars:
                sample_vals = []
                i = 0
                while i < reps:
                    samples = []
                    while len(samples) < len(value):
                        samples.append(random.choice(value))
                    sample_vals.append(pylab.median(samples))
                    i += 1
                
                sample_std = pylab.std(sample_vals)
                        
            plotting.cdf(value, label='%s (med=%.1f, N=%d)' % \
                (key, m, len(value)),
                style=colors.get(key, 'grey'), std=sample_std, y_offset=y_offset)
            y_offset += offset_step
            

    xmin = -1 * xlim if xmin is None else xmin
    pylab.xlim(xmin, xlim)
    pylab.xlabel('Irreversibility')
    #pylab.xlabel('deltaG')
    pylab.ylabel('Cumulative distribution')
    legendfont = matplotlib.font_manager.FontProperties(size=11)
    pylab.legend(loc=legend_loc, prop=legendfont)
    pylab.title(title)
    pylab.hold(False)
    
    if 'Not first' in histogram:
        print '%s, first vs. non-first ranksum test: ' % title + '(%f, %f)' % stats.ranksums(histogram[1], histogram['Not first'])
    
    if 'Inhibited' in histogram:
        print '%s, inhibited vs. non-regulated ranksum test: ' % title + '(%f, %f)' % stats.ranksums(histogram['Inhibited'], histogram['No known regulation'])
         
    
    #for k1, h1 in histogram.iteritems():
    #    for k2, h2 in histogram.iteritems():
    #        print k1, k2, stats.ranksums(h1, h2)
    
    return fig
Example #20
def caculation(data):
    trt_1 = data[data['trt'] == 1]
    trt_0 = data[data['trt'] == 0]
    
    medi = statistics.median(trt_1['y']) - statistics.median(trt_0['y'])
    mean = statistics.mean(trt_1['y']) - statistics.mean(trt_0['y'])
    peop = len(trt_1) + len(trt_0)
    vari = statistics.variance(trt_1['y']) + statistics.variance(trt_0['y'])
    z_stat, p_val = stats.ranksums(trt_0['y'], trt_1['y']) 
    return [medi, mean, peop, p_val]
Example #21
def BootstrapGenes(CNVTargets, ToSampleFrom, Nsamples, NGenes):
    '''
    (dict, dict, int, int) -> dict
    Take the dictionary with predicted targets and CNV status for each gene
    in each study, the dictionary with numbers: gene pairs, the number of bootstrap
    replicates and the number of genes to sample and return a dictionary with the
    the number of replicates in which CNV genes have more targets, less targets
    and no significant differences    
    '''
    
    # create a dict for each study with a list with numbers of each different
    # outcomes when comparing targets in CNV and non-CNV genes
    # {study: [# replicates CNV > non-CNV, # replicates CNV < non-CNV, # replicates no differences]}
    BootStrap = {}
    # initialize list values
    for study in ToSampleFrom:
        BootStrap[study] = [0, 0, 0]
    # loop over studies in dict to sample from
    for study in ToSampleFrom:
        replicates = Nsamples
        while replicates != 0:
            # make list of targets for CNV and non-CNV genes
            repCNVtargets, repNonCNVtargets = [], []
            # draw NGenes CNV genes and NGenes non-CNV genes with replacement
            for i in range(NGenes):
                # draw a random CNV gene
                j = random.randint(0, len(ToSampleFrom[study]['CNV']) - 1)
                k = random.randint(0, len(ToSampleFrom[study]['not_CNV']) - 1)
                # get the corresponding genes
                gene1 = ToSampleFrom[study]['CNV'][j]
                gene2 = ToSampleFrom[study]['not_CNV'][k]            
                # get the the number of targets for these 2 genes
                assert CNVTargets[study][gene1][-1] == 'CNV', 'random gene should be CNV'
                assert CNVTargets[study][gene2][-1] == 'not_CNV', 'random gene should be non-CNV'
                repCNVtargets.append(CNVTargets[study][gene1][2])
                repNonCNVtargets.append(CNVTargets[study][gene2][2])
            # make sure that the correct numbers of genes is drawn
            assert len(repCNVtargets) == NGenes, 'number of CNV genes is not correct'
            assert len(repNonCNVtargets) == NGenes, 'number of non-CNV genes is not correct'
            # compare CNV and non-CNV genes
            Pval = stats.ranksums(repCNVtargets, repNonCNVtargets)[1]
            # check significance
            if Pval >= 0.05:
                # difference is not significant
                BootStrap[study][2] += 1
            elif Pval < 0.05:
                # difference is significant; check if CNV genes have a greater number of targets
                if np.mean(repCNVtargets) > np.mean(repNonCNVtargets):
                    BootStrap[study][0] += 1
                elif np.mean(repCNVtargets) < np.mean(repNonCNVtargets):
                    BootStrap[study][1] += 1
                assert np.mean(repCNVtargets) != np.mean(repNonCNVtargets), 'means are equal but significantly different'
            # update replicate number
            replicates -= 1
    return BootStrap   
Example #22
def rank_sums_test(treatment1, treatment2):
    """ See if the distribution of treatmen1 is different than
    the distribution treatment2

    Arguments:
    - `treatment1`:
    - `treatment2`:
    """
    z_stat, p_val = stats.ranksums(treatment1, treatment2)
    print "Mann-Whitney-Wilcoxon RankSum P for treatments 1 and 2 =", p_val
    return z_stat, p_val
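A hedged usage sketch with two made-up treatment samples (assumes from scipy import stats is in scope, as in the snippet):

treatment1 = [12.1, 13.4, 11.8, 14.2, 12.9]
treatment2 = [15.0, 16.3, 14.8, 15.7, 16.1]
z, p = rank_sums_test(treatment1, treatment2)  # a small p suggests the distributions differ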
Example #23
def rstest_mw(x, y): 
  '''Rank-sum test p-value of x > y. Not used because of a bug in mannwhitneyu.
  '''
  from scipy.stats import ranksums, mannwhitneyu
  #n1, n2 = len(x), len(y)
  #mu = n1 * n2 / 2.
  st1, p1 = ranksums(x, y)
  try : st, p = mannwhitneyu(x, y)
  except : return 0.5
  if st1 > 0 : return p
  else: return 1 - p ###
Example #24
def statistics_test(data, labels):
    d = data.as_matrix()
    y = labels.as_matrix()
    y = y.reshape([y.shape[0]])
    in1 = np.where(y < 1e-5)[0]
    in2 = np.where(y > 1-(1e-5))[0]
    l = []
    for i in range(d.shape[1]):
        s, p = stt.ranksums(d[in1, i], d[in2, i])
        l.append(p)
    df = pd.DataFrame(data=np.array(l)*len(l), index=data.columns.values, columns=['pvalue'])
    return df
Example #25
def Wilcoxon(ipdc, ipdt):
    '''Calculates z_stat and p-value from the Wilcoxon rank-sum test on
    repeated subsamples and returns the list of p-values.
    '''
    pv_list = []
    if len(ipdc) > len(ipdt):
        sample = len(ipdt)
    else:
        sample = len(ipdc)
    for r in range(100):
        sampled_timepoint = numpy.random.choice(ipdt, sample, replace=True)
        sampled_control = numpy.random.choice(ipdc, sample, replace=True)
        z_stat, p_val = stats.ranksums(sampled_control, sampled_timepoint)
        pv_list.append(p_val)
    return pv_list
Example #26
def series_mean_ranksums(word_set1, word_set2, words_time_series, one_minus=True):
    word_means1 = np.array(get_word_means(words_time_series, word_set1).values())
    word_means2 = np.array(get_word_means(words_time_series, word_set2).values())
    if one_minus:
        word_means1 = 1 - word_means1
        word_means2 = 1 - word_means2
    z,p = ranksums(word_means1, word_means2)
    return {"z" : z, 
            "p" : p, 
            "set1_size" : len(word_means1), 
            "set2_size" : len(word_means2), 
            "set1_med" : np.median(word_means1),
            "set2_med" : np.median(word_means2)}
Example #27
def outcomeBoxplot(cyDf, cyVar, outcomeVar, printP=True, axh=None):
    if axh is None:
        axh = plt.gca()
    axh.cla()
    sns.boxplot(y=cyVar, x=outcomeVar, data=cyDf, ax=axh, order=[0,1])
    sns.stripplot(y=cyVar, x=outcomeVar, data=cyDf, jitter=True, ax=axh, order=[0,1])
    plt.xticks([0,1], ['False', 'True'])
    if printP:
        tmp = cyDf[[cyVar, outcomeVar]].dropna()
        z, pvalue = stats.ranksums(tmp[cyVar].loc[tmp[outcomeVar] == 1], tmp[cyVar].loc[tmp[outcomeVar] == 0])
        annParams = dict(textcoords='offset points', xytext=(0,-5), ha='center', va='top', color='black', weight='bold', size='medium')
        plt.annotate('p = %1.3g' % pvalue, xy=(0.5,plt.ylim()[1]), **annParams)
    plt.show()
Example #28
def main():
	"""
	1st phase
	top1 = [70.0, 71.1, 72.5, 70.8, 68.1, 71.9, 71.1, 71.3, 68.4, 70.2]
	top3 = [75.8, 78.4, 77.8, 77.7, 80.0, 77.8, 78.7, 76.4, 79.1, 77.3]
	2nd phase
	"""
	x = [53.6, 54.5, 53.7, 52.7, 53.1, 55.5, 55.5, 52.8, 53.7, 52.7]
	y = [89.7, 89.1, 89.5, 88.7, 89.4, 88.6, 89.8, 89.5, 89.2, 89.7]
	# Compute the Wilcoxon rank-sum statistic for two samples.
	wilcoxon = stats.ranksums(x, y)
	anova = stats.f_oneway(x, y)
	print "Wilcoxon: " + str(wilcoxon[1]) + "; ANOVA: " + str(anova[1])
Example #29
def ranksum_test_two_sample(sample_a, sample_b):
    '''
    doing ranksum test on two samples, sample_a and sample_b should be array_like
    '''
    z_statistic, two_tailed_pvalue = stats.ranksums(sample_a, sample_b)
    z_statistic = z_statistic.tolist()
    if z_statistic < 0:
        pvalue_left = two_tailed_pvalue / 2
        pvalue_right = 1 - two_tailed_pvalue / 2
    else:
        pvalue_left = 1 - two_tailed_pvalue / 2
        pvalue_right = two_tailed_pvalue / 2
    return z_statistic, pvalue_left, pvalue_right
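On SciPy 1.7 and later, ranksums itself accepts an alternative keyword, so the one-sided p-values can be requested directly; a sketch for comparison (hypothetical data):

from scipy import stats

a = [1.2, 2.3, 1.8, 2.9, 2.1]
b = [3.1, 3.8, 2.7, 4.0, 3.5]

p_left = stats.ranksums(a, b, alternative="less").pvalue
p_right = stats.ranksums(a, b, alternative="greater").pvalue
# these match pvalue_left and pvalue_right from ranksum_test_two_sample above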
Example #30
def wilcox_test(x, y):
    '''
    Performs the Wilcoxon rank-sum test.
    @param x: collection of numerics.
    @param y: collection of numerics.
    @return: float referencing the test p-value.
    '''

    pval = ranksums(x, y)[-1].astype('float64')
    if numpy.isnan(pval):  # replace NaN with a poor p-value.
        pval = 1.0
    if pval == 0.0:  # replace an exact zero with the smallest positive float.
        pval = numpy.finfo(numpy.float64).tiny.astype('float')
    return pval
Example #31
def violin_from_dict(ann_violin,
                     dict_list,
                     category_label,
                     prefix,
                     taskid,
                     colormap=None,
                     figsize=(1.2, 1.2),
                     lfc=False):
    from scipy.stats import ranksums
    ann_violin = ann_violin.copy()
    ann_violin.X = ann_violin.raw.X
    sc.pp.normalize_per_cell(ann_violin,
                             copy=False,
                             counts_per_cell_after=15000)

    for t, _genes in dict_list.items():
        if len(_genes) > 1:
            fig_all, ax_sub = plt.subplots(1,
                                           len(_genes),
                                           figsize=((figsize[0] * len(_genes)),
                                                    figsize[1] + 0.3))
        for _, g in enumerate(_genes):

            if g not in ann_violin.var_names:
                print(g, 'not found')
                continue
            ann_violin.obs['exp'] = ann_violin.X[:, ann_violin.var_names ==
                                                 g].A.reshape(-1)

            ann_violin.obs['l2fc'] = np.log2(
                ann_violin.X[:, ann_violin.var_names == g].A.reshape(-1) + 1)
            ann_violin.obs[g] = ann_violin.obs['l2fc']
            # Figure properties
            fig, ax = plt.subplots(figsize=figsize)
            #                rc={'font.size': 32, 'axes.labelsize': 18, 'legend.fontsize': 18,
            #    'axes.titlesize': 20, 'xtick.labelsize': 20, 'ytick.labelsize': 20}
            #plt.rcParams.update(**rc)
            # Violin Plot

            mask = ann_violin.obs[category_label] == ann_violin.obs[
                category_label].cat.categories[0]
            if lfc:
                rep = np.log2(ann_violin.obs['exp'][~mask].mean() +
                              1) - np.log2(ann_violin.obs['exp'][mask].mean() +
                                           1)
            else:
                stat, pval = ranksums(ann_violin.obs['l2fc'][mask],
                                      ann_violin.obs['l2fc'][~mask])
                rep = pval

            from statannot import add_stat_annotation
            if ann_violin.obs[category_label].dtype.name == 'category':
                ann_violin.obs[category_label] = ann_violin.obs[
                    category_label].cat.remove_unused_categories()
            axs = [ax]
            if len(_genes) > 1:
                axs.append(ax_sub[_])
            for _ax in axs:
                sns.violinplot(data=ann_violin.obs,
                               palette=colormap,
                               y=g,
                               x=category_label,
                               linewidth=1,
                               ax=_ax)
                if lfc:
                    add_stat_annotation(
                        _ax,
                        data=ann_violin.obs,
                        y=g,
                        x=category_label,
                        box_pairs=[(ann_violin.obs[category_label].unique())],
                        perform_stat_test=False,
                        pvalues=[rep],
                        text_format='custom',
                        line_offset_to_box=0.2,
                        line_offset=0.1,
                        line_height=0.05,
                        linewidth=0.6,
                        text_offset=0.5)
                else:
                    add_stat_annotation(
                        _ax,
                        data=ann_violin.obs,
                        y=g,
                        x=category_label,
                        box_pairs=[(ann_violin.obs[category_label].unique())],
                        perform_stat_test=False,
                        pvalues=[rep],
                        line_offset_to_box=0.2,
                        line_offset=0.1,
                        line_height=0.05,
                        linewidth=0.6,
                        text_offset=0.5)

                _ax.set_xlabel('')
                _ax.set_title(g)
                _ax.set_ylabel('')

            ax.set_ylabel(r'$ \log_{2}( expression) $')
            if _ == 0 and len(_genes) > 1:
                ax_sub[_].set_ylabel(r'$ \log_{2}( expression) $', fontsize=7)
            if len(_genes) > 1:
                ax_sub[_].tick_params(axis='y', pad=-3)
            path = prefix + t + '-' + g + '-' + taskid + '-' + ".pdf"
            fig.savefig(path, dpi=300, bbox_inches='tight')
            plt.close(fig)

        fig_all.tight_layout(pad=0.3)
        fig_all.savefig(prefix + t + '-' + taskid + '-' + ".pdf",
                        bbox_inches='tight')
        plt.close('all')
Example #32
def batch_stats_extended(marker_exp, c_list, coi):
    """Applies t test , wilcoxon test, and likelihood ratio test (Based on logistic regression)
    to a gene expression matrix, gene by gene. Also gives simple up versus down regulation test (difference between means).

    :param marker_exp: A DataFrame whose rows are cell identifiers, columns are
        gene identifiers, and values are float values representing gene
        expression.
    :param c_list: A Series whose indices are cell identifiers, and whose
        values are the cluster which that cell is part of.
    :param coi: The cluster of interest.

    :returns: A matrix with arbitary row indices whose columns are the gene, t
              statistic, then t p-value; the last two being of float type.
              Their names are 'gene', 't_stat' , 't_pval' , w_stat, w_pval , LRT_pval, up/down regulated

    :rtype: pandas.DataFrame
    """
    def LRT_LogReg(df):
        # Define model matrix and response
        X = np.matrix(df.drop('cluster', axis=1))
        y = df['cluster']
        # Train logistic regression with full model
        logreg1 = LogisticRegression(solver='lbfgs').fit(X, y)
        ll1 = -log_loss(y, logreg1.predict_proba(X), normalize=False)
        # Train logistic regression with null model (only intercept)
        logreg0 = LogisticRegression(solver='lbfgs').fit([[0]] * len(X), y)
        ll0 = -log_loss(y, logreg0.predict_proba(X), normalize=False)
        # Likelihood ratio test
        stat = 2 * (ll1 - ll0)
        pval = ss.chi2.sf(stat, 1)
        return (pval)

    LRT_pvals = []
    up_v_down_vals = []
    for column in marker_exp:
        log_reg_in = pd.DataFrame(data=[marker_exp[column]])
        log_reg_in = np.transpose(log_reg_in)
        c_list_2 = np.array(c_list)
        c_list_2 = np.array(c_list_2 == coi, dtype=int)
        c_list_2 = np.transpose(c_list_2)
        log_reg_in['cluster'] = c_list_2
        in_cls = marker_exp[column][c_list == coi].values
        out_cls = marker_exp[column][c_list != coi].values
        out_cls_mean = np.sum(out_cls) / len(out_cls)
        in_cls_mean = np.sum(in_cls) / len(in_cls)
        test = in_cls_mean - out_cls_mean
        if test <= 0:
            up_v_down_vals.append('down')
        else:
            up_v_down_vals.append('up')

        LRT_pval = LRT_LogReg(log_reg_in)
        LRT_pvals.append(LRT_pval)

    t = marker_exp.apply(lambda col: ss.ttest_ind(
        col[c_list == coi], col[c_list != coi], equal_var=False))
    ws = marker_exp.apply(
        lambda col: ss.ranksums(col[c_list == coi], col[c_list != coi]))
    output = pd.DataFrame()
    output['gene_1'] = t.index
    #output['gene_1'] = ws.index
    output[['t_stat', 't_pval']] = pd.DataFrame(t.values.tolist(),
                                                columns=['t_stat', 't_pval'])
    output[['w_stat', 'w_pval']] = pd.DataFrame(ws.values.tolist(),
                                                columns=['w_stat', 'w_pval'])

    output['up_down'] = up_v_down_vals
    output['LRT_pval'] = LRT_pvals
    return output
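A hedged sketch of calling batch_stats_extended on a tiny synthetic expression matrix (names invented for illustration; assumes the pandas/numpy/sklearn/scipy imports the function body relies on):

import numpy as np
import pandas as pd

np.random.seed(0)
marker_exp = pd.DataFrame(np.random.rand(20, 3), columns=['geneA', 'geneB', 'geneC'])
c_list = pd.Series(['c1'] * 10 + ['c2'] * 10, index=marker_exp.index)
print(batch_stats_extended(marker_exp, c_list, 'c1'))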
Example #33
    for (j, k), l in grouped:
        bar_num = sorted(list_of_genotypes).index(j)
        index_num = sorted(list_of_treatments).index(k)

        p = plt.bar(index_num + (bar_width * bar_num),
                    means[j, k],
                    bar_width,
                    alpha=opacity[index_num],
                    color=colourlist[bar_num],
                    yerr=sems[j, k],
                    error_kw=error_config,
                    label=[j, k])
        # Wilcoxon rank-sum test via st.ranksums (equivalent to the Mann-Whitney U test)

        z_stat_gt, p_val_gt = st.ranksums(
            df[(df[groupinglist[0]] == j) & (df[groupinglist[1]] == k)][i],
            df[(df[groupinglist[0]] == CONTROL_GENOTYPE)
               & (df[groupinglist[1]] == k)][i])
        z_stat_tr, p_val_tr = st.ranksums(
            df[(df[groupinglist[0]] == j) & (df[groupinglist[1]] == k)][i],
            df[(df[groupinglist[0]] == j)
               & (df[groupinglist[1]] == CONTROL_TREATMENT)][i])

        p_vals_gt_rounded = ['%.4f' % elem for elem in p_vals_gt]
        p_vals_tr_rounded = ['%.4f' % elem for elem in p_vals_tr]

        q = plt.text(
            index_num + (bar_width * (bar_num + 0.5)),  #centre of bar
            means[j, k] + 0.5 * sems[j, k] +
            0.1 * means.values.max(),  #just above error bar
            'p(genotype) = ' + str(p_val_gt.round(4)) + '\np(treatment) = ' +
            str(p_val_tr.round(4)) + '\nn = ' + str(ns[j, k]),
Example #34
# %%
significance = np.zeros(
    (
        cumulative_weighted_assessment_score.shape[1],
        unstacked_cumulative_weighted_assessment_score.shape[1],
    ),
    dtype=bool,
)

for offset, date in enumerate(cumulative_weighted_assessment_score):
    df = unstacked_cumulative_weighted_assessment_score[date]
    columns = df.columns
    offset *= len(columns)
    for i, j in combinations(range(len(columns)), r=2):
        significance[i, j + offset] = significance[j, i + offset] = (ranksums(
            df.iloc[:, i], df.iloc[:, j], nan_policy="omit").pvalue <= 0.05)

significance_df = (pd.DataFrame(
    significance,
    index=unstacked_cumulative_weighted_assessment_score.columns.levels[1],
    columns=unstacked_cumulative_weighted_assessment_score.T.index,
).rename_axis("").rename_axis(["", "H0 rejected"], axis=1))

display(significance_df)

# %% [markdown] tags=[]
# As in the work of Hlosta et al., the hypothesis test indicates a
# significant difference between the groups in the time slices, starting from the
# first time slice (the assessment from day 33).
#
# We also note that the test did not reject the null hypothesis for the (Withdrawn-Fail)
Example #35
            salineDataObjs[1], 'k')
    ax.tick_params(axis='both', which='major', labelsize=labelFontSize)
    ax.tick_params(axis='both', which='minor', labelsize=labelFontSize)
    title('sal-{}'.format([key
                           for key, value in salineSoundTypes.iteritems()][1]))
    behavioranalysis.nice_psycurve_settings(ax, fontsize=10, lineweight=2)

# figtext(0.075, 0.7, 'Fraction of trials going to the right', rotation='vertical')
# figtext(0.4, 0.05, 'Log2(frequency) - octaves')
plt.subplots_adjust(wspace=0.25, hspace=0.25)
show()
suptitle(animal)

#We should be using nonparametric stats
from scipy import stats
print(stats.ranksums(salineAmpEstimates[:, 1], musAmpEstimates[:, 1]))
print(stats.ranksums(salineChordEstimates[:, 1], musChordEstimates[:, 1]))

sa = salineAmpEstimates[:, 1]

ma = musAmpEstimates[:, 1]

sc = salineChordEstimates[:, 1]

mc = musChordEstimates[:, 1]

figure()
subplot(121)
plot(zeros((len(sa), 1)), 1 / (4. * sa), 'ko')
plot(ones((len(ma), 1)), 1 / (4. * ma), 'ro')
xlim([-1, 2])
Example #36
def rand_samp(checks, file_name, gene_high, gene_mid, gene_low, verheek_merged,
              verheek_mm):
    table = pd.DataFrame()
    for i in checks:
        x = pd.DataFrame(
            dict(high=gene_high[i].value_counts(),
                 low=gene_low[i].value_counts()))
        table = table.append(x)
    table.fillna(0, inplace=True)
    p = []
    for i, j in table.values:
        x = [gene_low.shape[0] - j, j], [gene_high.shape[0] - i, i]
        z = fisher_exact(x)[1]
        p.append(z)
    table['p'] = p
    table.to_csv(file_name + "_pre_balanced.csv")
    # random sampling until fisher is not significant or 10000 iterations
    c = 0
    sig = 0
    sig2 = 1
    while c < 10000 and sig2 > 0.05:
        c = c + 1
        print('Iteration number = {}'.format(str(c)))
        table = pd.DataFrame()
        if gene_high.shape[0] > 20:
            gene_high_samp = gene_high.sample(
                np.random.randint(20, gene_high.shape[0]))
        else:
            gene_high_samp = gene_high.copy()
        if gene_mid.shape[0] > 20:
            gene_mid_samp = gene_mid.sample(
                np.random.randint(20, gene_mid.shape[0]))
        else:
            gene_mid_samp = gene_mid.copy()
        if gene_low.shape[0] > 20:
            gene_low_samp = gene_low.sample(
                np.random.randint(20, gene_low.shape[0]))
        else:
            gene_low_samp = gene_low.copy()
        for i in checks:
            x = pd.DataFrame(
                dict(high=gene_high_samp[i].value_counts(),
                     low=gene_low_samp[i].value_counts()))
            table = table.append(x)
        table.fillna(0, inplace=True)
        p = [
            fisher_exact(([gene_low_samp.shape[0] - j,
                           j], [gene_high_samp.shape[0] - i, i]))[1]
            for i, j in table.values
        ]
        # p = []
        # for i,j in table.values:
        #     x = [gene_low.shape[0]-j,j],[gene_high.shape[0]-i,i]
        #     z = fisher_exact(x)[1]
        #     p.append(z)
        table['p'] = p
        p.sort()
        sig = p[0]
        print("Lowest fisher significance = {:.8f}".format(sig))
        if sig > 0.05:
            sig2 = ranksums(gene_high_samp.os, gene_low_samp.os)[1]
            print("Survival significance = {:.3f}".format(sig2))
    a = gene_high_samp.index.values.tolist() \
            + gene_mid_samp.index.values.tolist() + \
        gene_low_samp.index.values.tolist()
    print("exporting balanced unscaled to csv...")
    verheek_merged.loc[a].to_csv(file_name + "_balanced_unscaled.csv")
    print("gzipping balanced unscaled csv...")
    with open(file_name + "_balanced_unscaled.csv", 'rb') as f_in:
        with gzip.open(file_name + "_balanced_unscaled.csv.gz", 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.remove(file_name + "_balanced_unscaled.csv")
    print("exporting balanced scaled to csv...")
    verheek_mm.loc[a].to_csv(file_name + "_balanced.csv")
    print("gzipping balanced scaled csv...")
    with open(file_name + "_balanced.csv", 'rb') as f_in:
        with gzip.open(file_name + "_balanced.csv.gz", 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.remove(file_name + "_balanced.csv")
Example #37
        if miranda[species][gene][-1] == 'CNV':
            k += 1
        elif miranda[species][gene][-1] == 'not_CNV':
            l += 1
    # check that numbers match
    assert i == k, 'CNV genes should match between miranda and targetscan'
    assert j == l, 'non-CNV genes should match between miranda and targetscan'
    # populate dict
    GeneNumbers[species] = [i, j]
print('counted CNV and non-CNV genes for each species')

# perform statistical tests between CNV and non-CNV genes
# create dicts to store results {species: [P-value targetscan, P-value miranda]}
CompTargets = {}
for species in SpeciesDataTargetscan:
    Ptargetscan = stats.ranksums(SpeciesDataTargetscan[species][0],
                                 SpeciesDataTargetscan[species][1])[1]
    Pmiranda = stats.ranksums(SpeciesDataMiranda[species][0],
                              SpeciesDataMiranda[species][1])[1]
    CompTargets[species] = [Ptargetscan, Pmiranda]
print('compared CNV and non-CNV genes')
# get the significance level
Significance = {}
for species in CompTargets:
    Significance[species] = []
    for i in range(len(CompTargets[species])):
        if CompTargets[species][i] >= 0.05:
            Significance[species].append('')
        elif CompTargets[species][i] < 0.05 and CompTargets[species][i] >= 0.01:
            Significance[species].append('*')
        elif CompTargets[species][i] < 0.01 and CompTargets[species][
                i] >= 0.001:
Example #38
from scipy.stats import ranksums, normaltest, mannwhitneyu, wilcoxon, ttest_rel
import matplotlib.pyplot as plot
import sys

input_filename = "run2.csv)"
if len(sys.argv) > 1:
    input_filename = sys.argv[1]
f = open(input_filename, "r")

results = []

for line in f:
    cols = [float(x) for x in line.split()]

    for i, col in zip(range(len(cols)), cols):
        try:
            results[i].append(col)
        except IndexError:
            results.append([col])

for i, result in zip(range(len(results)), results):
    print i, sum(result) / len(result)

print "---------- RESULTS ----------"
print "Normal Test", normaltest(results[1])
print "Mann-Whitney", mannwhitneyu(results[1], results[3])
print "Wilcoxon", wilcoxon(results[1], results[3])
print "T-Test", ttest_rel(results[1], results[3])

print "Rank Sums", ranksums(results[1], results[9])
Example #39
    'worst_concavity', 'worst_concave', 'worst_symmetry',
    'worst_fractal_dimension'
]

df = pd.read_csv(endereco_dos_dados, header=None)
df.columns = nomes_das_variaveis
del endereco_dos_dados, nomes_das_variaveis
"""----------------------------------------------------------------------------
    Verificando a base de dados
"""
# Top 5 dos dados
df.head()

# Tipagem dos dados
df.ftypes

# Descrição dos dados
df.describe()
"""----------------------------------------------------------------------------
    Aplicando o teste
"""
# Separando os dados
diagnostico_m = df.query("diagnosis == 'M'")
diagnostico_b = df.query("diagnosis == 'B'")

# Aplicando o teste
estatistica, p = ranksums(diagnostico_m["mean_radius"],
                          diagnostico_b["mean_radius"])
print("p-valor: {}".format(round(p, 2)))
print("estatistica: {}".format(round(estatistica, 2)))
Example #40
File: plot.py Project: ludev/bin
def plot(handle, out, cols, names, bins, title, xlab, ylab, xlog, ylog, \
              vmax, vmin, vMinSum, collapse, normed, alpha, legendLoc, colors,\
              verbose, dlimit=1):
    """
    """
    if verbose:
        sys.stderr.write("Parsing data...\n")
    x = [[] for i in range(len(cols))]
    for l in handle:
        try:
            ldata = l[:-1].split('\t')
            vals = []
            for col in cols:
                if col >= len(ldata) or not ldata[col]:
                    continue
                v = float(ldata[col])
                if vmin < v < vmax:
                    vals.append(v)
            #skip entire line if one value out of bounds
            # or if sum of values below threshold
            if len(vals) != len(cols) or sum(vals) < vMinSum:
                continue
            for i, v in enumerate(vals):
                x[i].append(v)
        except ValueError:
            sys.stderr.write("[Error] Cannot parse line: %s\n" %
                             ",".join(l.split('\t')))
    if verbose:
        sys.stderr.write(" %s values loaded.\n" % len(x))
    fig = plt.figure()
    #http://matplotlib.org/users/customizing.html
    #mpl.rcParams['figure.subplot.wspace'] = 0.3
    '''mpl.rcParams['figure.subplot.hspace'] = 0.5
    mpl.rcParams['axes.titlesize'] = 8
    mpl.rcParams['axes.labelsize'] = 6
    mpl.rcParams['xtick.labelsize'] = 5
    mpl.rcParams['ytick.labelsize'] = 5'''
    #add subplots
    plt.rc('axes', color_cycle=colors)  #['c', 'm', 'y', 'k']

    #plot
    x, y = x

    # get correlation
    print "%s points\n mean X: %s +/- %s\n mean Y: %s +/- %s" % (
        len(x), np.mean(x), np.std(x), np.mean(y), np.std(y))
    print "Pearson: r=%s p=%s" % stats.pearsonr(x, y)
    print "Spearman: r=%s p=%s" % stats.spearmanr(x, y)
    print "Wilcoxon: T=%s p=%s" % ranksums(x, y)  # wilcoxon(x, y)

    pairwise_wilcoxon(x, y)

    ax = fig.add_subplot(111)
    ax.plot(x, y, 'b.', alpha=0.5)

    #add title
    ax.set_title(title)
    #add subplots labels
    ax.set_xlabel(xlab)  #, fontsize=30)
    ax.set_ylabel(ylab)  #, fontsize=30)
    #plot legend only if collapsed
    if xlog:
        ax.set_xscale('log')
    if ylog:
        ax.set_yscale('log', nonposy='clip')

    ax.grid(True)
    #save or show
    if type(out) is file and out.name == '<stdout>':
        plt.show()
    else:
        fpath = out  #handle.name+".png"
        fformat = fpath.split('.')[-1]
        plt.savefig(fpath,
                    dpi=300,
                    format=fformat,
                    orientation='landscape',
                    transparent=False)
        sys.stderr.write("Figure written to: %s\n" % fpath)
Example #41
    def plot_update_trend(self):
        if self.y_axis.value:

            selected_indices = {n: getattr(self.sources, 'time_%s' % n).selected.indices for n in GROUP_LABELS}
            for n in GROUP_LABELS:
                if not selected_indices[n]:
                    selected_indices[n] = range(len(getattr(self.sources, 'time_%s' % n).data['x']))

            group = {n: {'x': [], 'y': []} for n in GROUP_LABELS}

            for n in GROUP_LABELS:
                for i in range(len(getattr(self.sources, 'time_%s' % n).data['x'])):
                    if i in selected_indices[n]:
                        for v in ['x', 'y']:
                            group[n][v].append(getattr(self.sources, 'time_%s' % n).data[v][i])

            try:
                avg_len = int(self.look_back_distance.value)
            except (TypeError, ValueError):
                avg_len = 1

            try:
                percentile = float(self.plot_percentile.value)
            except (TypeError, ValueError):
                percentile = 90.

            # average daily data and keep track of points per day, calculate moving average

            group_collapsed = {n: [] for n in GROUP_LABELS}
            for n in GROUP_LABELS:
                if group[n]['x']:
                    group_collapsed[n] = collapse_into_single_dates(group[n]['x'], group[n]['y'])
                    if self.look_back_units.value == "Dates with a Sim":
                        x_trend, moving_avgs = moving_avg(group_collapsed[n], avg_len)
                    else:
                        x_trend, moving_avgs = moving_avg_by_calendar_day(group_collapsed[n], avg_len)

                    y_np = np.array(group[n]['y'])
                    upper_bound = float(np.percentile(y_np, 50. + percentile / 2.))
                    average = float(np.percentile(y_np, 50))
                    lower_bound = float(np.percentile(y_np, 50. - percentile / 2.))
                    getattr(self.sources, 'time_trend_%s' % n).data = {'x': x_trend,
                                                                  'y': moving_avgs,
                                                                  'mrn': ['Avg'] * len(x_trend)}
                    getattr(self.sources, 'time_bound_%s' % n).data = {'x': group[n]['x'],
                                                                  'mrn': ['Bound'] * len(group[n]['x']),
                                                                  'upper': [upper_bound] * len(group[n]['x']),
                                                                  'avg': [average] * len(group[n]['x']),
                                                                  'lower': [lower_bound] * len(group[n]['x'])}
                    getattr(self.sources, 'time_patch_%s' % n).data = {'x': [group[n]['x'][0], group[n]['x'][-1],
                                                                        group[n]['x'][-1], group[n]['x'][0]],
                                                                  'y': [upper_bound, upper_bound, lower_bound, lower_bound]}
                else:
                    for v in ['trend', 'bound', 'patch']:
                        clear_source_data(self.sources, 'time_%s_%s' % (v, n))

            x_var = str(self.y_axis.value)
            if x_var.startswith('DVH Endpoint'):
                self.histograms.xaxis.axis_label = x_var.split("DVH Endpoint: ")[1]
            elif x_var == 'EUD':
                self.histograms.xaxis.axis_label = "%s (Gy)" % x_var
            elif x_var == 'NTCP/TCP':
                self.histograms.xaxis.axis_label = "NTCP or TCP"
            else:
                if self.range_categories[x_var]['units']:
                    self.histograms.xaxis.axis_label = "%s (%s)" % (x_var, self.range_categories[x_var]['units'])
                else:
                    self.histograms.xaxis.axis_label = x_var

            # Normal Test
            s, p = {n: '' for n in GROUP_LABELS}, {n: '' for n in GROUP_LABELS}
            for n in GROUP_LABELS:
                if group[n]['y']:
                    s[n], p[n] = normaltest(group[n]['y'])
                    p[n] = "%0.3f" % p[n]

            # t-Test and Rank Sums
            pt, pr = '', ''
            if group['1']['y'] and group['2']['y']:
                st, pt = ttest_ind(group['1']['y'], group['2']['y'])
                sr, pr = ranksums(group['1']['y'], group['2']['y'])
                pt = "%0.3f" % pt
                pr = "%0.3f" % pr

            self.histogram_normaltest_1_text.text = "Group 1 Normal Test p-value = %s" % p['1']
            self.histogram_normaltest_2_text.text = "Group 2 Normal Test p-value = %s" % p['2']
            self.histogram_ttest_text.text = "Two Sample t-Test (Group 1 vs 2) p-value = %s" % pt
            self.histogram_ranksums_text.text = "Wilcoxon rank-sum (Group 1 vs 2) p-value = %s" % pr

        else:
            for n in GROUP_LABELS:
                for k in ['trend', 'bound', 'patch']:
                    clear_source_data(self.sources, "time_%s_%s" % (k, n))

            self.histogram_normaltest_1_text.text = "Group 1 Normal Test p-value = "
            self.histogram_normaltest_2_text.text = "Group 2 Normal Test p-value = "
            self.histogram_ttest_text.text = "Two Sample t-Test (Group 1 vs 2) p-value = "
            self.histogram_ranksums_text.text = "Wilcoxon rank-sum (Group 1 vs 2) p-value = "

        self.update_histograms()
Example #42
#scores_df['fb_gain'] = (scores_df['second'] - scores_df['first'])/scores_df['first']

#scores_df = scores_df.fillna(0)

factor_names = ['fb_type', 'metric_type', 'threshold_factor']
p_values_df = pd.DataFrame(columns=factor_names +
                           ['ranksum', 'comparison', 'pvalue'])
pvalue_types = ['FBMock', 'FB500', 'FB250', 'FB0'][::-1]
for factors_values, group in scores_df.groupby(factor_names):
    for pvalue_type in ['FBMock', 'FB500', 'FB250', 'FB0']:
        if factors_values[0] == pvalue_type: continue
        mock = scores_df.query(
            'fb_type=="{}" & metric_type=="{}" & threshold_factor=={}'.format(
                pvalue_type, *factors_values[1:]))
        p_value_mock = ranksums(group['score'].values, mock['score'].values)
        pvalue = stats.ranksums(group['score'].values,
                                mock['score'].values).pvalue
        print('*' if stats.shapiro(group['score'].values)[1] < 0.05 else '-',
              factors_values)

        #p_value_0 = stats.ranksums(group['score'].values, 0)[0]

        pvalue_dict = dict(
            zip(
                factor_names + ['ranksum', 'comparison', 'pvalue'],
                list(factors_values) +
                [[p_value_mock], [pvalue_type], [pvalue]]))
        #print(pvalue_dict)
        p_values_df = p_values_df.append(pd.DataFrame(pvalue_dict),
                                         ignore_index=True)

#sns.set(rc={'figure.figsize':(2,2)})
Example #43
                   boxprops=boxprops,
                   meanprops=meanlineprops,
                   meanline=True,
                   medianprops=medianprops,
                   capprops=capprops,
                   whiskerprops=whiskerprops)
axes[3, 0].set_ylim(-0.1, ylimMax)
axes[3, 0].set_ylabel("Surface Distance (mm)", fontsize=20)
axes[3, 1].boxplot(dist_bs[1],
                   labels=labels,
                   showmeans=True,
                   showfliers=False,
                   boxprops=boxprops,
                   meanprops=meanlineprops,
                   meanline=True,
                   medianprops=medianprops,
                   capprops=capprops,
                   whiskerprops=whiskerprops)
axes[3, 1].set_ylim(-0.1, ylimMax)
axes[3, 1].set_ylabel("Surface Distance (mm)", fontsize=20)

plt.savefig(png_path_out, dpi=fig.dpi)
plt.savefig(eps_path_out, dpi=fig.dpi)
plt.show()

print(ss.ranksums(dist[0][0], dist[0][1]),
      ss.ranksums(dist_apx[0][0], dist_apx[0][1]),
      ss.ranksums(dist_md[0][0], dist_md[0][1]),
      ss.ranksums(dist_bs[0][0], dist_bs[0][1]))

results.close()
Example #44
def tests_compare_report_experimental(request, test_id_1, test_id_2):
    data = Aggregate.objects.raw(
        """
        SELECT a.url as "id", a1.average as "average_1", a2.average as "average_2", a1.average - a2.average as "avg_diff",
        (((a1.average-a2.average)/a2.average)*100) as "avg_diff_percent",
        a1.median - a2.median as "median_diff",
        (((a1.median-a2.median)/a2.median)*100) as "median_diff_percent" FROM
        (SELECT action_id, average, median FROM jltc.aggregate WHERE test_id = %s) a1,
        (SELECT action_id, average, median FROM jltc.aggregate WHERE test_id = %s) a2,
        jltc.action a
        WHERE a1.action_id = a2.action_id and a.id = a1.action_id
        """, [test_id_1, test_id_2])
    reasonable_percent = 3
    reasonable_abs_diff = 5  # ms
    negatives = []
    positives = []
    absense = []
    MWW_test = []
    avg_list_1 = []
    avg_list_2 = []
    for row in data:
        if row.avg_diff_percent > reasonable_percent:
            negatives.append(row)
        elif row.avg_diff_percent < -reasonable_percent:
            positives.append(row)
    test_1_actions = list(
        Aggregate.objects.annotate(url=F('action__url')).filter(
            test_id=test_id_1).values('url'))
    test_2_actions = list(
        Aggregate.objects.annotate(url=F('action__url')).filter(
            test_id=test_id_2).values('url'))
    for url in test_2_actions:
        if url not in test_1_actions:
            absense.append(url)

    action_list_2 = TestActionAggregateData.objects.filter(
        test_id=test_id_2).values()
    for action in action_list_2:
        # Reset per action so each rank-sum test compares only this action's data
        avg_list_1 = []
        avg_list_2 = []
        action_id = action['action_id']
        action_url = Action.objects.values().get(id=action_id)['url']
        set_1 = TestActionData.objects. \
            filter(test_id=test_id_1, action_id=action_id). \
            annotate(average=RawSQL("((data->>%s)::numeric)", ('avg',))). \
            values("average")
        set_2 = TestActionData.objects. \
            filter(test_id=test_id_2, action_id=action_id). \
            annotate(average=RawSQL("((data->>%s)::numeric)", ('avg',))). \
            values("average")
        data_1 = queryset_to_json(set_1)
        data_2 = queryset_to_json(set_2)
        for d in data_1:
            avg_list_1.append(d['average'])
        for d in data_2:
            avg_list_2.append(d['average'])

        logger.info(action_id)
        if not avg_list_1:
            absense.append(action_url)
        else:
            z_stat, p_val = stats.ranksums(avg_list_1, avg_list_2)
            if p_val <= 0.05:
                a_1 = queryset_to_json(
                    TestActionAggregateData.objects.filter(
                        test_id=test_id_1, action_id=action_id).annotate(
                            mean=RawSQL("((data->>%s)::numeric)", (
                                'mean', ))).annotate(
                                    p50=RawSQL("((data->>%s)::numeric)", (
                                        '50%', ))).values("mean", "p50"))
                a_2 = queryset_to_json(
                    TestActionAggregateData.objects.filter(
                        test_id=test_id_2, action_id=action_id).annotate(
                            mean=RawSQL("((data->>%s)::numeric)", (
                                'mean', ))).annotate(
                                    p50=RawSQL("((data->>%s)::numeric)", (
                                        '50%', ))).values("mean", "p50"))
                mean_1 = float(a_1[0]['mean'])
                mean_2 = float(a_2[0]['mean'])

                mean_diff_percent = ((mean_1 - mean_2) / mean_2) * 100
                if mean_diff_percent > 0:
                    negatives.append({
                        "id": action_url,
                        "mean_diff_percent": mean_diff_percent,
                        "mean_1": mean_1,
                        "mean_2": mean_2
                    })
                else:
                    positives.append({
                        "id": action_url,
                        "mean_diff_percent": mean_diff_percent,
                        "mean_1": mean_1,
                        "mean_2": mean_2
                    })

                MWW_test.append({"url": action_url, "p_val": p_val})

                logger.info("MWW RankSum P for 1 and 2 = {}".format(p_val))

    return render(
        request, 'compare_report.html', {
            'negatives': negatives,
            'positives': positives,
            'absense': absense,
            'MWW_test': MWW_test,
        })
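
A reduced sketch of the per-action comparison in the view above, on toy per-sample averages (values are illustrative); note the fully parenthesized percent-difference formula:

from scipy import stats

avg_list_1 = [120, 130, 125, 140, 118]  # test 1 averages, ms
avg_list_2 = [150, 160, 155, 170, 148]  # test 2 averages, ms

z_stat, p_val = stats.ranksums(avg_list_1, avg_list_2)
if p_val <= 0.05:
    mean_1 = sum(avg_list_1) / float(len(avg_list_1))
    mean_2 = sum(avg_list_2) / float(len(avg_list_2))
    mean_diff_percent = ((mean_1 - mean_2) / mean_2) * 100
    verdict = 'negative' if mean_diff_percent > 0 else 'positive'
    print('MWW p = %.4f, mean diff = %.1f%% -> %s' %
          (p_val, mean_diff_percent, verdict))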
Beispiel #45
0
                             data=np.stack([r2_sig_pr, r2_noise_pr, r2_interference_pr]).T)])
r2 = r2.melt()
r2['corrected'] = np.tile(np.concatenate([np.zeros(len(r2_sig), dtype=bool),
                                          np.ones(len(r2_sig), dtype=bool)]), 3)
r2 = r2.rename(columns={'value': r'$cvR^2$', 'variable': 'Regressor'})

# model coefficients
coefs = pd.DataFrame(columns=[r"$\Delta$ Signal"+"\nmagnitude", 
                                r"$\Delta$ Shared"+"\nnoise variance", 
                                r"$\Delta$ Noise" +"\ninterference"], data=coefs[:, :-1]) 
coefs = coefs.melt()
coefs = coefs.rename(columns={'value': 'Coefficient', 'variable': 'Regressor'})

# stats for r2 for each predictor across sites. Is significant?
r2_raw = r2[r2.corrected==False]
x = r2_raw[r2_raw.Regressor==r"$\Delta$ Signal magnitude"][r"$cvR^2$"]
U, pval = ss.ranksums(x, np.zeros(x.shape[0]))
m = x.mean()
print(f"R2 for signal magnitude, pval: {pval}, U: {U}, mean: {m}\n")

x = r2_raw[r2_raw.Regressor==r"$\Delta$ Shared noise variance"][r"$cvR^2$"]
U, pval = ss.ranksums(x, np.zeros(x.shape[0]))
m = x.mean()
print(f"R2 for shared noise variance, pval: {pval}, U: {U}, mean: {m}\n")

x = r2_raw[r2_raw.Regressor==r"$\Delta$ Noise interference"][r"$cvR^2$"]
U, pval = ss.ranksums(x, np.zeros(x.shape[0]))
m = x.mean()
print(f"R2 for noise interference, pval: {pval}, U: {U}, mean: {m}\n")

# same for corrected 
r2_raw = r2[r2.corrected==True]
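
The three blocks above repeat one check; a sketch of a small helper that factors it out (the helper name is ours; note that scipy's ranksums returns a normal-approximation z statistic, not a Mann-Whitney U, even though the snippet calls it U):

import numpy as np
import scipy.stats as ss

def report_r2_vs_zero(label, values):
    # Rank-sum test of a sample against an equal-length zero vector,
    # mirroring the repeated blocks above.
    x = np.asarray(values)
    stat, pval = ss.ranksums(x, np.zeros(x.shape[0]))
    print(f"R2 for {label}, pval: {pval}, statistic: {stat}, mean: {x.mean()}\n")

report_r2_vs_zero("signal magnitude", [0.10, 0.20, 0.05, 0.15])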
Beispiel #46
0
        ranks.append(rankdata(ms).tolist())
    ranks = np.array(ranks)
    mean_ranks = np.mean(ranks, axis=0)
    best_clusters.append(np.argmax(mean_ranks) + 2)
    # print("\nRanks:\n", ranks)
    # print("\nMean ranks:\n", )

    alpha = .05
    length = len(clfs)

    s = np.zeros((length, length))
    p = np.zeros((length, length))

    for i in range(length):
        for j in range(length):
            s[i, j], p[i, j] = ranksums(ranks.T[i], ranks.T[j])
    sig_better = np.where((p < alpha) * (s > 0))
    conclusions = [list(1 + sig_better[1][sig_better[0] == i]) for i in range(length)]

    t.append(["%s" % div] + ["%.3f" % v for v in mean_ranks])

    # t.append([''] + [", ".join(["%i" % i for i in c])
    #                  if len(c) > 0 else nc
    #                  for c in conclusions])
    t.append([''] + [
        ", ".join("%i" % i for i in c) if 0 < len(c) < len(clfs) - 1
        else ("all" if len(c) == len(clfs) - 1 else "---")
        for c in conclusions
    ])

    # print(t)
Beispiel #47
0
def calc_p_value(a_vec, b_vec, is_normal=True):
    """Two-sample p-value: independent t-test if the data are assumed
    normal, otherwise the Wilcoxon rank-sum test."""
    if is_normal:
        _, p_val = stats.ttest_ind(a_vec, b_vec)
    else:
        _, p_val = stats.ranksums(a_vec, b_vec)
    return p_val
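
A usage sketch for calc_p_value, gating on a Shapiro-Wilk normality check of both samples (the 0.05 threshold is a common convention, not something the snippet mandates):

import numpy as np
from scipy import stats

rng = np.random.default_rng(2)
a_vec = rng.normal(0.0, 1.0, 30)
b_vec = rng.exponential(1.0, 30)

# Treat the pair as normal only if neither sample rejects normality.
is_normal = (stats.shapiro(a_vec)[1] > 0.05) and (stats.shapiro(b_vec)[1] > 0.05)
print(calc_p_value(a_vec, b_vec, is_normal=is_normal))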
Beispiel #48
0
            else:
                normpop.append(False)
                print "population is NOT normal"
                fdatasize.write("--> population is NOT normal --> "
                                "p-value (Shapiro's test) :" + str(p) + "\n\n")
            plotlabels.append(dirname)
    print normpop
    print sums
    print plotlabels
    if False in normpop:
        """ Non parametric Wilcoxon rank sum test."""
        print "At least one sample does Not have a normal distibution" \
            "--> wilcoxon rank sum test"
        fdatasize.write("At least one sample does Not have a normal "
                        "distibution --> wilcoxon rank sum test" + "\n")
        statrank, prank = stats.ranksums(*sums)
        if prank > 0.05:
            print "--> populations are NOT statiscically different " \
                "--> p-value is " + str(prank)
            fdatasize.write("--> populations are NOT statiscically different "
                            "--> p-value (wilcoxon rank sum test) : " +
                            str(prank) + "\n\n")
        else:
            print "--> populations are statiscically different " \
                "--> p-value is " + str(prank)
            fdatasize.write("--> populations are statiscically different "
                            "--> p-value (wilcoxon rank sum test) : " +
                            str(prank) + "\n\n")

    else:
        """ Bartlett's test for equal variance."""
Beispiel #49
0
verheek_mm_noNN = verheek_merged.copy()
verheek_mm_noNN = verheek_mm_noNN.query("karyotype != 'karyotype: NN'")
verheek_mm_noNN.loc[:, '1007_s_at':
                    'AFFX-TrpnX-M_at'] = min_max_scaler.fit_transform(
                        verheek_mm_noNN.loc[:, '1007_s_at':'AFFX-TrpnX-M_at'])

## loop to look for most significant genes
x = {}
y = 0
for i in verheek_mm.loc[:, '1007_s_at':'AFFX-TrpnX-M_at'].columns:
    y += 1
    gene_high = verheek_mm.loc[verheek_mm.loc[:, i] > 0.7]
    gene_low = verheek_mm.loc[verheek_mm.loc[:, i] < 0.3]
    if gene_high.shape[0] > 20:
        if gene_low.shape[0] < 400:
            pval = ranksums(gene_high.os, gene_low.os)[1]
            if pval < 0.05:
                print(i, y)
                x[i] = [gene_high.shape[0], gene_low.shape[0], pval]

# change to dataframe
sig = pd.DataFrame.from_dict(x, orient='index')
sig.columns = ['high_no', 'low_no', 'sig']
sig = sig.join(probes['Gene Symbol'])
sig.to_csv('sig.csv')
# resume from here no need to test all sig again
sig = pd.read_csv('sig.csv', index_col=0)

gene = '217975_at'  # WBP5
# gene = sig.sort_values('sig').index[4]
symbol = probes.loc[gene]['Gene Symbol']
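
A compact, self-contained sketch of the screening loop above: survival split by high/low expression per gene, rank-sum tested (toy frame; column names are illustrative):

import pandas as pd
from scipy.stats import ranksums

toy = pd.DataFrame({
    'gene_a': [0.90, 0.80, 0.10, 0.20, 0.85, 0.15],
    'gene_b': [0.50, 0.40, 0.60, 0.50, 0.45, 0.55],
    'os': [10, 12, 30, 28, 11, 27],  # overall survival, months
})

hits = {}
for gene in ['gene_a', 'gene_b']:
    high = toy.loc[toy[gene] > 0.7, 'os']
    low = toy.loc[toy[gene] < 0.3, 'os']
    if len(high) and len(low):
        pval = ranksums(high, low)[1]
        if pval < 0.05:
            hits[gene] = [len(high), len(low), pval]
print(hits)  # genes whose high/low expression groups differ in survival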
Beispiel #50
0
    print(delta_class_gain(call_total, 'empirical_reg', 'BS'))
    print(delta_class_gain(call_total, 'DNN', 'empirical_reg'))
    separated_BS = pd.pivot_table(call_total,
                                  values='SSE_DNN',
                                  columns=['delta_class'],
                                  index=['ut'],
                                  aggfunc=np.sum)
    separated_MV = pd.pivot_table(call_total,
                                  values='SSE_empirical_reg',
                                  columns=['delta_class'],
                                  index=['ut'],
                                  aggfunc=np.sum)
    print(1 - separated_BS / separated_MV)
    for idx, data in call_total.groupby('delta_class'):
        print(delta_class_gain(data, 'empirical_reg', 'BS'))
        print(idx, ranksums(data['SSE_empirical_reg'], data['SSE_BS']))

    for idx, data in call_total.groupby('delta_class'):
        print(delta_class_gain(data, 'DNN', 'empirical_reg'))
        print(idx, ranksums(data['SSE_empirical_reg'], data['SSE_DNN']))

    for idx, data in call_total.groupby(['delta_class', 'ut']):
        print(idx, 1 - data['SSE_DNN'].sum() / data['SSE_empirical_reg'].sum(),
              ranksums(data['SSE_empirical_reg'], data['SSE_DNN']))

    put_total = get_backtesting(total_data,
                                start_date=datetime(2015, 1, 15),
                                end_date=datetime(2019, 12, 30),
                                cp=-1,
                                rolling=False,
                                TESTING_PERIOD=240 * 9,
Beispiel #51
0
                                if ith_base is not None:
                                    t_alt_pos_from_end.append(
                                        min(ith_base,
                                            read_i.query_length - ith_base))

                                # Flanking indels:
                                t_alt_flanking_indel.append(flanking_indel_i)

                            # Inconsistent read or 2nd alternate calls:
                            else:
                                t_noise_read_count += 1

                    # Done extracting info from tumor tBAM. Now tally them:
                    t_ref_mq = mean(t_ref_read_mq)
                    t_alt_mq = mean(t_alt_read_mq)
                    t_z_ranksums_mq = stats.ranksums(t_alt_read_mq,
                                                     t_ref_read_mq)[0]

                    t_ref_bq = mean(t_ref_read_bq)
                    t_alt_bq = mean(t_alt_read_bq)
                    t_z_ranksums_bq = stats.ranksums(t_alt_read_bq,
                                                     t_ref_read_bq)[0]

                    t_ref_NM = mean(t_ref_edit_distance)
                    t_alt_NM = mean(t_alt_edit_distance)
                    t_z_ranksums_NM = stats.ranksums(t_alt_edit_distance,
                                                     t_ref_edit_distance)[0]
                    t_NM_Diff = t_alt_NM - t_ref_NM - abs(indel_length)

                    t_concordance_fet = stats.fisher_exact(
                        ((t_ref_concordant_reads, t_alt_concordant_reads),
                         (t_ref_discordant_reads, t_alt_discordant_reads)))[1]
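
A reduced sketch of the quality-comparison step above: indexing the result with [0] keeps only the rank-sum z statistic and discards the p-value (toy read qualities):

from scipy import stats

t_alt_read_mq = [60, 58, 55, 60, 52]  # mapping quality, ALT-supporting reads
t_ref_read_mq = [60, 60, 59, 60, 60]  # mapping quality, REF-supporting reads

# Negative z when ALT reads tend to rank below REF reads
t_z_ranksums_mq = stats.ranksums(t_alt_read_mq, t_ref_read_mq)[0]
print(t_z_ranksums_mq)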
Beispiel #52
0
        noisePval = np.empty(len(db))
        baseRange = [-0.2, 0]
        responseRange = [0, 0.2]
        for indCell, cell in db.iterrows():
            spikeData, eventData = dataloader.get_session_ephys(
                cell, 'noiseburst')
            eventOnsetTimes = eventData.get_event_onset_times()
            alignmentRange = [baseRange[0], responseRange[1]]
            (spikeTimesFromEventOnset, trialIndexForEachSpike,
             indexLimitsEachTrial) = spikesanalysis.eventlocked_spiketimes(
                 spikeData.timestamps, eventOnsetTimes, alignmentRange)
            nspkBase = spikesanalysis.spiketimes_to_spikecounts(
                spikeTimesFromEventOnset, indexLimitsEachTrial, baseRange)
            nspkResp = spikesanalysis.spiketimes_to_spikecounts(
                spikeTimesFromEventOnset, indexLimitsEachTrial, responseRange)
            [zScore, pVal] = stats.ranksums(nspkResp, nspkBase)
            noiseZscore[indCell] = zScore
            noisePval[indCell] = pVal
        db['noiseZscore'] = noiseZscore
        db['noisePval'] = noisePval

        #Laser pulse response
        #NOTE: This does the same thing as the noise burst response, but I am not making a function
        #because things are getting hidden and I want to be more explicit about what I am doing.
        pulseZscore = np.empty(len(db))
        pulsePval = np.empty(len(db))
        baseRange = [-0.1, 0]
        responseRange = [0, 0.1]
        for indCell, cell in db.iterrows():
            spikeData, eventData = dataloader.get_session_ephys(
                cell, 'laserpulse')
Beispiel #53
0
               fontsize=fontSizeLabels)
    plt.ylabel('Number of cells', fontsize=fontSizeLabels)
    extraplots.boxoff(plt.gca())

    # -- Stats: test whether the modulation index distribution for all good cells is centered at zero -- #
    print 'Total number of sound responsive good cells is:', sum(
        soundRespAStr), '\nNumber of cells significantly modulated is:', len(
            sigModIAStr)
    (Z, pVal) = stats.wilcoxon(allModIAStr)
    print 'For AStr: Mean mod index is {:.3f}. Using the Wilcoxon signed-rank test, comparing the modulation index distribution for all good cells to zero yielded a p value of {:.3f}'.format(
        np.mean(allModIAStr), pVal)
    (Z, pVal) = stats.wilcoxon(sigModIAStr)
    print 'For significantly modulated cells in AStr: Mean mod index is {:.3f}. Using the Wilcoxon signed-rank test, comparing the modulation index distribution to zero yielded a p value of {:.3f}'.format(
        np.mean(sigModIAStr), pVal)

    (Z, pValBtAreas) = stats.ranksums(np.abs(allModIAC), np.abs(allModIAStr))
    print 'Using Wilcoxon rank-sum test to compare ABSOLUTE modulation indices between AC and AStr, p value is {:.3f}'.format(
        pValBtAreas)
    print 'Median absolute mod index for AC: {}'.format(
        np.median(np.abs(allModIAC)))
    print 'Median absolute mod index for AStr: {}'.format(
        np.median(np.abs(allModIAStr)))

    #(oddRatio, pValFisher) = stats.fisher_exact([[sum(soundRespAC)-len(sigModIAC), len(sigModIAC)],[sum(soundRespAStr)-len(sigModIAStr), len(sigModIAStr)]])
    #print 'Using Fishers exact test to compare fraction of modulated cells between AC and AStr, p value is {:.3f}'.format(pValFisher)
    (Z, pValBtAreasSig) = stats.ranksums(np.abs(sigModIAC),
                                         np.abs(sigModIAStr))
    print 'Using Wilcoxon rank-sum test to compare ABSOLUTE modulation indices between significantly modulated cells in AC and AStr, p value is {:.3f}'.format(
        pValBtAreasSig)
    print 'Median absolute mod index for modulated cells in AC: {}'.format(
        np.median(np.abs(sigModIAC)))
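
The snippet above mixes two different tests, which is worth spelling out: stats.wilcoxon is a one-sample (or paired) signed-rank test against zero, while stats.ranksums compares two independent samples. A sketch on toy modulation indices (arrays are illustrative):

import numpy as np
from scipy import stats

modIndexA = np.array([0.12, -0.05, 0.30, 0.08, -0.02, 0.25])
modIndexB = np.array([0.02, -0.01, 0.05, -0.03, 0.01, 0.04])

# Is one area's modulation-index distribution centered at zero? (signed-rank)
statW, pW = stats.wilcoxon(modIndexA)

# Do the two areas differ in ABSOLUTE modulation? (rank-sum, independent samples)
statR, pR = stats.ranksums(np.abs(modIndexA), np.abs(modIndexB))
print(pW, pR)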
Beispiel #54
0
        odor_end=odor_end)
    correlation.plot_correlation_across_days(temp,
                                             days,
                                             loop_keys=['mouse', 'odor'],
                                             shuffle=shuffle,
                                             figure_path=figure_path,
                                             reuse=False,
                                             save=True,
                                             analyze=False,
                                             plot_bool=True,
                                             odor_end=odor_end)
    ixa = temp['odor_valence'] == 'CS+'
    ixb = temp['odor_valence'] == 'CS-'
    a = temp['corrcoef'][ixa]
    b = temp['corrcoef'][ixb]
    print(ranksums(a, b))

if condition.name == 'PIR':
    naive_config = statistics.analyze.PIR_NAIVE_Config()
    data_path_ = os.path.join(Config.LOCAL_DATA_PATH,
                              Config.LOCAL_DATA_TIMEPOINT_FOLDER,
                              naive_config.condition.name)
    save_path_ = os.path.join(Config.LOCAL_EXPERIMENT_PATH, 'COUNTING',
                              naive_config.condition.name)
    res_naive = fio.load_pickle(os.path.join(save_path_, 'dict.pkl'))
    learned_day_per_mouse_, last_day_per_mouse_ = get_days_per_mouse(
        data_path_, naive_config.condition)
    #
    res = statistics.analyze.analyze_data(save_path,
                                          condition_config,
                                          m_threshold=.1)
Beispiel #55
0
# extract all predicted driver genes | sift_score

genelist1 = pd.read_csv(
    '/encrypted/e3000/gatkwork/COREAD-ESCA-predicteddriver.tsv',
    header=None,
    skiprows=0,
    sep='\t')
genelist1.columns = ['geneName']

merged_df1 = sift_df.merge(genelist1, how='inner', on=['geneName'])

merged_df1.drop(['geneName'], axis=1, inplace=True)

# calculate p-value for ranksums with SIFT

stat, pvalue = ranksums(merged_df, merged_df1)

print(pvalue)

#################### POLYPHEN ###################

# calculate ranksums for POLYPHEN

polyphen_df = df[['geneName', 'polyphen']]

# extract all non-driver genes | sift_score
genelist = pd.read_csv('/encrypted/e3000/gatkwork/COREAD-ESCA-all-driver.tsv',
                       header=None,
                       skiprows=0,
                       sep='\t')
genelist.columns = ['geneName']
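
The ranksums call above is handed whole DataFrames; extracting the score column as a 1-D array avoids any shape ambiguity. A sketch with toy frames (the column name 'sift' is our assumption about the frames above):

import pandas as pd
from scipy.stats import ranksums

merged_df = pd.DataFrame({'sift': [0.80, 0.60, 0.90, 0.70]})    # non-driver scores
merged_df1 = pd.DataFrame({'sift': [0.10, 0.05, 0.20, 0.15]})   # predicted-driver scores

stat, pvalue = ranksums(merged_df['sift'].values, merged_df1['sift'].values)
print(pvalue)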
Beispiel #56
0
    for i in range(nb):
        # ( actual labels, predicted probabilities )
        fpr[i], tpr[i], _ = roc_curve(test_labels[:, i],
                                      test_prediction[:, i])  # flip here
        roc_auc[i] = auc(fpr[i], tpr[i])

    return [round(roc_auc[x], 3) for x in range(nb)]


Y_pred = F
Y = np_utils.to_categorical(label, nb_classes)
ROC = AUC(Y, Y_pred, nb_classes)
print('AUC =', ROC[1])

import scipy.stats as stat
a = Y_pred[:, 0]
b = Y[:, 0]
groups = [a[b == i] for i in range(2)]
rs = stat.ranksums(groups[0], groups[1])[1]
print('p = ', rs)

score = model.evaluate(X, Y, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

#find indeces where Y is greater than a certain value

idx = Y_pred[:, 0]

indeces = [i for i, v in enumerate(idx >= 0.5) if v]
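
The grouping above splits predicted probabilities by true label, so the rank-sum test asks whether the score separates the classes; that is the same question AUC answers, since for the Mann-Whitney U statistic AUC = U / (n1 * n2). A sketch on toy scores:

import numpy as np
import scipy.stats as stat

y_true = np.array([0, 0, 0, 1, 1, 1])
y_score = np.array([0.2, 0.4, 0.3, 0.8, 0.7, 0.9])

groups = [y_score[y_true == i] for i in range(2)]
print('rank-sum p =', stat.ranksums(groups[0], groups[1])[1])

# Mann-Whitney U on (positives, negatives) recovers the ROC AUC directly
u, _ = stat.mannwhitneyu(groups[1], groups[0], alternative='two-sided')
print('AUC from U =', u / float(len(groups[0]) * len(groups[1])))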
Beispiel #57
0
        ortholog = subdata['ortholog']
        if list(subdata.values())[0] == "E":
            try:
                essential.append(float(conservation[ortholog]))
            except:
                pass
        if list(subdata.values())[0] == "NE":
            try:
                non_essential.append(float(conservation[ortholog]))
            except:
                pass

    print("The number of essential genes: %d" % len(essential))
    print("The number of non-essential genes: %d" % len(non_essential))

    print(ranksums(essential, non_essential))

    # # # https://blog.csdn.net/aijiudu/article/details/89387328
    # print(pd.DataFrame(essential).describe())
    # print(pd.DataFrame(non_essential).describe())
    # print("-------------------------------------")

# Results:

# This organism is: S cerevisiae
# The number of essential genes: 1033
# The number of non-essential genes: 4301
# RanksumsResult(statistic=4.84696083548837, pvalue=1.2536716457740872e-06)
# This organism is: S pombe
# The number of essential genes: 1140
# The number of non-essential genes: 2600
Beispiel #58
0
	print(dic['Question_Text'][mask].iloc[i])
	tags = tag_matrix.ix[:, mask].ix[:, i]
	print("Tags: "+'%s, '*tags.sum() % tuple(tags.index.str.lower()[tags]))
	
	# Checks Ordinal data, which is used in rating questions (rate from 1-6)
	if dic['Data_type'][mask].iloc[i] == 'Ordinal':
		print(dic['Data_values'][mask].iloc[i])
		print("Category\t\tn\tMean\t1\t2\t3\t4\t5\t6\t(4-6)\tpWilc.\tpBinom.")
		# Rates data by each demographic
		for j in range(len(category_names)):
			width = np.zeros(6)
			total = subframe.ix[:, i].ix[categories.ix[:, j]].valid().count()
			width = np.histogram(subframe.ix[:, i].ix[categories.ix[:, j]],
							bins=np.arange(1, 8), range=(1,7),
							normed=True)[0]
			pval = stats.ranksums(
				subframe.ix[:, i].ix[categories.ix[:, j]].dropna(),
				subframe.ix[:, i].ix[~categories.ix[:, j]].dropna())[1]
			yes = (subframe.ix[:, i].ix[categories.ix[:, j]].dropna() > 3).sum()
			no = (subframe.ix[:, i].ix[categories.ix[:, j]].dropna() <= 3).sum()
			p0 = (subframe.ix[:, i].ix[~categories.ix[:, j]].dropna() > 3).mean()
			pval_binom = stats.binom_test((yes, no), p=p0)
			pval_comb = stats.combine_pvalues((pval, pval_binom))[1]
			print('%21s\t%i' % (category_names[j], total)
			      + '\t%2.1f' % subframe.ix[:, i].ix[categories.ix[:, j]].mean()
			      + '\t%3.1f%%' * 6 % tuple(width * 100)
			      + '\t%3.1f%%' % (width[3:].sum() * 100)
			      + '\t%3.2f' % pval + '*' * (pval < 0.05)
			      + '\t%3.2f' % pval_binom + '*' * (pval_binom < 0.05))
			#print '%21s\t%i' % (category_names[j], total) + '\t%2.1f' % (subframe.ix[:, i].ix[categories.ix[:, j]]).mean() + '\t%3.1f%%'*6 % tuple(width*100) +'\t%3.1f%%' % (width[3:].sum()*100)+'\t%3.2f' % (pval_binom)+'*'*(pval < 0.05)
		print
	elif dic['Data_type'][mask].iloc[i] == 'Binary':
		responsetypes = dic['Data_values'][mask].iloc[i].split(';')
		print("Category\t\t n\t"+ '%s\t'*len(responsetypes) % tuple(responsetypes)+"p-value")
		for j in range(len(category_names)):
			yes = (subframe.ix[:, i].ix[categories.ix[:, j]]==responsetypes[0]).sum()
			no = (subframe.ix[:, i].ix[categories.ix[:, j]]==responsetypes[1]).sum()
			total = (yes+no)*1.
Beispiel #59
0
#print(wilcoxon(rt-np.mean(rt), correction = True))
print('Mu_ge equal 0')

print('***')
GE = pd.read_csv('C:/Users/anivia/Desktop/geDJ.txt',
                 sep="\s+",
                 header=None,
                 names=['date', 'open', 'high', 'low', 'close', 'vol'])
SP = pd.read_csv(
    'https://www.math.ust.hk/~macwyu/MAFS5110_2018-2019/MAFS5110_2018-2019/Chapter_1/sp500.txt',
    sep="\s+")
logreturn_GE = np.diff(np.log(np.array(GE["close"])))
logreturn_sp500 = np.diff(np.log(np.array(SP["close"])))
da2 = pd.concat([pd.DataFrame(logreturn_GE),
                 pd.DataFrame(logreturn_sp500)],
                axis=1)
da2.columns = ["logreturn_GE", "logreturn_sp500"]
da2.boxplot(column=['logreturn_GE', 'logreturn_sp500'])
#plt.show()

print('***')
print(stats.mood(logreturn_sp500, logreturn_GE))
print('H0 can be rejected, the variances are significantly different')
print(ttest_ind(logreturn_sp500, logreturn_GE, equal_var=True))
print('Means are insignificantly different')
#cm=sms.CompareMeans(sms.DescrStatsW(logreturn_sp500),sms.DescrStatsW(logreturn_GE))
#print('C.I. is ',cm.tconfint_diff())
print('so the two distributions are not identical.')
from scipy.stats import ranksums
print(ranksums(logreturn_sp500, logreturn_GE))
print('two groups do not have equal medians')
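
The conclusion strings above print unconditionally; a sketch that ties the verdict to the p-value instead (the 0.05 cutoff is conventional):

import numpy as np
from scipy.stats import ranksums

rng = np.random.default_rng(3)
logreturn_sp500 = rng.normal(0.0003, 0.01, 250)
logreturn_GE = rng.normal(0.0001, 0.02, 250)

stat, p = ranksums(logreturn_sp500, logreturn_GE)
if p < 0.05:
    print('two groups do not have equal medians (p = %.4f)' % p)
else:
    print('no evidence the medians differ (p = %.4f)' % p)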
Beispiel #60
0
# Empirical CDF y-coordinates for each sorted sample
CLIP_y = [z / float(len(sorted_CLIP)) for z in range(len(sorted_CLIP))]

random_y = [z / float(len(sorted_random)) for z in range(len(sorted_random))]

WSN_y = [z / float(len(sorted_WSN)) for z in range(len(sorted_WSN))]

statistic, pvalue_CLIP = stats.ranksums(sorted_total, sorted_CLIP)
print pvalue_CLIP

params = {'mathtext.default': 'regular'}
plt.rcParams.update(params)

plt.scatter(sorted_total,
            total_y,
            s=1,
            color='k',
            alpha=0.5,
            label="not CLIP n=" + str(len(sorted_total)))
plt.scatter(sorted_CLIP,
            CLIP_y,
            s=1,
            color='r',