Example #1
def test_regularity(dataset):
    choice_sets, choices, person_df = dataset.load()

    unique_choice_sets, idx = np.unique(choice_sets, axis=0, return_inverse=True)

    tests = 0
    for i, j in combinations(range(len(unique_choice_sets)), 2):
        c1, c2 = unique_choice_sets[i], unique_choice_sets[j]
        # if c2 is contained in c1, swap so that c1 is always the (potential) subset
        if all(c1 * c2 == c2):
            c1, c2 = c2, c1
            i, j = j, i

        # skip pairs where c1 is not a subset of c2
        if any(c1 * c2 != c1):
            continue

        c1_counts = np.bincount(choices[idx == i, 0], minlength=len(c1))
        c2_counts = np.bincount(choices[idx == j, 0], minlength=len(c1))
        c1_tot = np.sum(c1_counts)
        c2_tot = np.sum(c2_counts)
        c1_prop = c1_counts / c1_tot
        c2_prop = c2_counts / c2_tot

        c1_names = [dataset.item_names[k] for k in range(len(c1)) if c1[k] == 1]
        c2_names = [dataset.item_names[k] for k in range(len(c1)) if c2[k] == 1]

        for k in range(len(c1)):
            if c1[k] == 1:
                tests += 1
                oddsratio, pvalue = stats.fisher_exact(
                    [[c1_counts[k], c1_tot - c1_counts[k]], [c2_counts[k], c2_tot - c2_counts[k]]])

                # regularity is violated when an item is chosen less often from the
                # subset c1 than from its superset c2
                if c1_prop[k] < c2_prop[k] and pvalue < 0.05:
                    print(f'{dataset.item_names[k]} (p={pvalue:.2g})\n'
                          f'\t{c1_prop[k]:.2f} ({c1_tot} samples) in {c1_names} \n'
                          f'\t{c2_prop[k]:.2f} ({c2_tot} samples) in {c2_names} ')
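For reference, a minimal self-contained sketch of the 2x2 test each inner iteration runs (the counts below are made up, not taken from any dataset):

from scipy import stats

# hypothetical counts: the item is chosen 30/100 times in the smaller choice set
# and 45/100 times in the superset, i.e. a potential regularity violation
small_chosen, small_total = 30, 100
large_chosen, large_total = 45, 100

table = [[small_chosen, small_total - small_chosen],
         [large_chosen, large_total - large_chosen]]
oddsratio, pvalue = stats.fisher_exact(table)
print(f'odds ratio {oddsratio:.2f}, p-value {pvalue:.3g}')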
Example #2
def FisherScoreCutoff(posScores, negScores):
    """Set the score cutoff at the top 1% of the negative-score distribution,
    build the 2x2 count table at that cutoff and return its Fisher exact p-value.

    (A variant kept below as unreachable code instead sweeps 100 cutoffs between
    an upper and a lower bound and records the best p-value.)"""
    #posScores=sorted(posScores,reverse=True)
    negScores = sorted(negScores, reverse=True)

    contingencyTable = [[0, 0], [0, 0]]
    scoreCutoff = negScores[len(negScores) // 100]  # score at the top 1% of the descending-sorted negatives
    posCount = 0
    negCount = 0
    for i in range(len(posScores)):
        if posScores[i] >= scoreCutoff:
            posCount += 1
    for j in range(len(negScores)):
        if negScores[j] >= scoreCutoff:
            negCount += 1
    contingencyTable[0][0] = posCount
    contingencyTable[0][1] = len(posScores) - posCount
    contingencyTable[1][0] = negCount
    contingencyTable[1][1] = len(negScores) - negCount
    ob, p_value = stats.fisher_exact(contingencyTable)
    return p_value, scoreCutoff, contingencyTable[0] + contingencyTable[1]
    '''upperBound=negScores[len(negScores)/10000]
	lowerBound=negScores[len(negScores)/100*50]
	print "Lower index",len(negScores)/100*50
	print len(posScores),len(negScores)
	print "UPPER LOWER",upperBound,lowerBound
	intervalnum=100
	interval=(upperBound-lowerBound)/intervalnum
	print "INTERVAL",interval
	bestcuttoff=-100.0
	bestpvalue=1.0
	contingencyTable=[[0,0],[0,0]]
	besttable=contingencyTable
	for i in range(intervalnum):
		scoreCutoff=lowerBound+interval*i
		if scoreCutoff >upperBound:
			break
		posCount=0
		negCount=0
		for j in range(len(posScores)):
			if posScores[j]>=scoreCutoff:
				posCount+=1
		for j in range(len(negScores)):
			if negScores[j]>=scoreCutoff:
				negCount+=1
		contingencyTable[0][0]=posCount
		contingencyTable[0][1]=len(posScores)-posCount
		contingencyTable[1][0]=negCount
		contingencyTable[1][1]=len(negScores)-negCount
		ob,p_value=stats.fisher_exact(contingencyTable)
		#print p_value,scoreCutoff,contingencyTable
		if p_value<bestpvalue:
			bestpvalue=p_value
			bestcuttoff=scoreCutoff
			besttable=contingencyTable[0]+contingencyTable[1]
	return bestpvalue,bestcuttoff,besttable'''

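A hypothetical call (random scores, nothing from a real dataset) showing the return shape of FisherScoreCutoff, assuming the integer-division fix above:

import numpy as np
from scipy import stats  # FisherScoreCutoff expects this module-level import

rng = np.random.default_rng(0)
posScores = list(rng.normal(loc=1.0, size=500))  # synthetic positive-set scores
negScores = list(rng.normal(loc=0.0, size=500))  # synthetic negative-set scores

p_value, cutoff, table = FisherScoreCutoff(posScores, negScores)
print(p_value, cutoff, table)  # p-value, chosen cutoff, flattened 2x2 table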
Example #3
    def generate_rules_for_class(self, general_summary, class_name):
        special_summary = []
        for summary_detail in general_summary:
            if summary_detail[1][class_name] > 0:
                special_summary.append(summary_detail)
                # compute the Fisher exact p-value for this rule
                item_set = string_2_itemset(summary_detail[0])
                satisfy_rule = self.freq_itemset_dict.get_frequency(
                    summary_detail[0])
                no_satisfy_rule = self.freq_itemset_dict.ntransactions - satisfy_rule

                correct_predict = self.lookup_frequency(item_set, class_name)
                incorrect_predict = satisfy_rule - correct_predict

                belong_to_class = self.freq_itemset_dict.get_frequency(
                    class_name)
                no_rule_belong_to_class = belong_to_class - correct_predict
                contingency_matrix = np.array(
                    [[correct_predict, incorrect_predict],
                     [
                         no_rule_belong_to_class,
                         no_satisfy_rule - no_rule_belong_to_class
                     ]])

                _, p_value = stats.fisher_exact(contingency_matrix)
                summary_detail[1]['p-value'] = p_value

        return special_summary
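The contingency matrix above is the usual rule-versus-class table; a small numeric illustration with made-up counts (independent of the freq_itemset_dict API):

import numpy as np
from scipy import stats

ntransactions = 1000   # total transactions (hypothetical)
satisfy_rule = 120     # transactions matching the rule's itemset
correct_predict = 80   # of those, how many carry the class label
belong_to_class = 300  # class frequency over all transactions

no_satisfy_rule = ntransactions - satisfy_rule
incorrect_predict = satisfy_rule - correct_predict
no_rule_belong_to_class = belong_to_class - correct_predict

contingency_matrix = np.array(
    [[correct_predict, incorrect_predict],
     [no_rule_belong_to_class, no_satisfy_rule - no_rule_belong_to_class]])
_, p_value = stats.fisher_exact(contingency_matrix)
print(p_value)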
def main():
	posfile = argv[1]
	negfile = argv[2]
	fastafile = argv[3]
	outfile = argv[4]


	totalseq = len(open(fastafile).read().split(">")) - 1
	posdict = {}
	file = open(posfile)
	file.readline()
	for line in file:
		tmp = line.strip().split('\t')
		if tmp[0] not in posdict:
			posdict[tmp[0]] = {}
		posdict[tmp[0]][tmp[1]] = 1 

	negdict = {}
	file = open(negfile)
	file.readline()
	for line in file:
		tmp = line.strip().split('\t')
		if tmp[0] not in negdict:
			negdict[tmp[0]] = {}
		negdict[tmp[0]][tmp[1]] = 1 

	
	lines = [] 
	pvalues = [] 
	#enrichments = []
	for motif in posdict:
		if motif not in negdict:
			print("ERROR, not the same set of motifs")
			break
		posnum = len(posdict[motif])
		negnum = len(negdict[motif])
		enrichment, pvalue = stats.fisher_exact([[posnum, totalseq - posnum], [negnum, totalseq - negnum]])
		pvalues += [pvalue]
		#enrichments = [enrichment]
		line = [motif, str(posnum), str(totalseq - posnum), str(negnum), str(totalseq - negnum), str(enrichment), str(pvalue)]
		line = '\t'.join(line)
		lines += [line]
	
	sortedindex = sorted(range(len(pvalues)), key = lambda x: pvalues[x])
	lines = [lines[x] for x in sortedindex]
	target = open(outfile,'w')
	for line in lines:
		target.write(line+'\n')
	target.close()

	return
def fisher_exact_two_groups(dataset, target_col, protected_col):
    """
    Performs a Fisher exact test on a 2x2 contingency table via scipy.stats.fisher_exact()

    @param dataset:        dataset object exposing count_classification_and_category()
    @param target_col:      name of the column that contains the classifier results
    @param protected_col:   name of the column that contains the protection status

    @return: odds ratio and related p-value
    """
    positive_protected = dataset.count_classification_and_category(target_col, protected_col, group=1, accepted=1)
    negative_protected = dataset.count_classification_and_category(target_col, protected_col, group=1, accepted=0)
    positive_nonprotected = dataset.count_classification_and_category(target_col, protected_col, group=0, accepted=1)
    negative_nonprotected = dataset.count_classification_and_category(target_col, protected_col, group=0, accepted=0)

    contingency_table = [[positive_protected, negative_protected], [positive_nonprotected, negative_nonprotected]]

    return stats.fisher_exact(contingency_table)
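If the data is a plain pandas DataFrame with 0/1 columns rather than the dataset wrapper above, the same table can be assembled with boolean masks (a sketch under that assumption; the column names are illustrative):

import pandas as pd
from scipy import stats

def fisher_exact_two_groups_df(df, target_col, protected_col):
    """Same 2x2 layout as above, built directly from a DataFrame."""
    pos_prot = len(df[(df[protected_col] == 1) & (df[target_col] == 1)])
    neg_prot = len(df[(df[protected_col] == 1) & (df[target_col] == 0)])
    pos_nonprot = len(df[(df[protected_col] == 0) & (df[target_col] == 1)])
    neg_nonprot = len(df[(df[protected_col] == 0) & (df[target_col] == 0)])
    table = [[pos_prot, neg_prot], [pos_nonprot, neg_nonprot]]
    return stats.fisher_exact(table)

# e.g. fisher_exact_two_groups_df(df, 'accepted', 'protected')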
Example #6
def test_discrete(a, b):
    # multiple classes, Fisher's exact test, followed by Bonferroni correction
    # returns the smallest p-value over all tested classes

    all_categories = set(flatten(a))
    pvalues = []
    for category in all_categories:
        # calculate number of items with this category
        a1, a0 = get_counts(a, category)
        b1, b0 = get_counts(b, category)
        # we are only interested in enrichment, so right_tail
        oddsratio, pvalue = stats.fisher_exact([[a1, a0], [b1, b0]],
                                               alternative="greater")
        pvalues.append((pvalue, category))

    # Fisher's exact test plus Bonferroni correction for the number of tests
    min_pvalue, min_category = min(pvalues)
    min_pvalue = min(min_pvalue * len(pvalues), 1.0)  # cap the corrected p-value at 1
    return min_pvalue, min_category
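If statsmodels is available, the Bonferroni step can also be applied to the whole p-value list at once (a sketch with made-up tables, not part of the snippet above):

from scipy import stats
from statsmodels.stats.multitest import multipletests

# hypothetical 2x2 tables, one per category
tables = {'A': [[12, 88], [5, 95]],
          'B': [[3, 97], [4, 96]]}
categories = list(tables)
pvals = [stats.fisher_exact(tables[c], alternative='greater')[1] for c in categories]

reject, pvals_bonf, _, _ = multipletests(pvals, alpha=0.05, method='bonferroni')
for c, raw, adj in zip(categories, pvals, pvals_bonf):
    print(c, raw, adj)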
Example #8
def TestPatternInNegSeq(posPatternCovLis, posSeqCnt, negSeqFn, allKmerSet):
    patternSet = set()
    patternCnt = len(posPatternCovLis)
    initPatternSet = [x[0] for x in posPatternCovLis]  # list, so it can be logged below
    negSeqLis, negSeqCnt, _, _ = BioinfoComm.loadSinglelineSeq(negSeqFn)
    negKmer2seqIdSet = BioinfoComm.FetchCovInSeqLisMutliKmer(negSeqLis, allKmerSet)
    negKmer2seqIdInt = BioinfoComm.formatCovId(negKmer2seqIdSet, negSeqCnt)

    for pattern, posCov in posPatternCovLis:
        posUncov = posSeqCnt - posCov
        negCov, _, negKmer2seqIdInt = BioinfoComm.FetchPatternCov(pattern, negSeqLis, negKmer2seqIdInt)
        negUncov = negSeqCnt - negCov
        dataTable = [[posCov, posUncov], [negCov, negUncov]]
        _, rawPValue = stats.fisher_exact(dataTable)
        adjustedPvalue = min(rawPValue * patternCnt, 1)
        if adjustedPvalue < 0.05: patternSet.add(pattern)

    INFO('pattern before negative filter')
    INFO(initPatternSet)
    INFO('pattern after negative filter')
    INFO(patternSet)
    return patternSet
Example #9
def pvalue_calculation(infile):

    sample = ((infile.split('_tenmers'))[0]).replace('donor', 'd')
    with open(allseqs_background) as inF:
        for line in inF:
            if 'seqs' in line:
                index_sample = ((line.strip()).split('\t')).index(sample)
            else:
                linea = line.split('\t')
                all_background = float(linea[index_sample])

    with open(allseqs_snatched) as inF:
        for line in inF:
            if 'seqs' in line:
                index_sample = ((line.strip()).split('\t')).index(sample)
            else:
                linea = line.split('\t')
                all_snatched = float(linea[index_sample])

    out = 'fishers_output_%s.txt' % sample
    o = open(out, 'w')
    with open(infile, 'r') as inF:
        for line in inF:
            linea = (line.strip()).split('\t')
            sequence = linea[0]
            snatch = float(linea[2])
            unsnatch = float(linea[1])

            A = snatch
            B = unsnatch
            C = all_snatched - snatch
            D = all_background - unsnatch

            oddsratio, pvalue = stats.fisher_exact([[A, B], [C,D]])

            outlist = [sequence, str(A), str(B), str(C), str(D), str(pvalue), str(oddsratio), '\n']
            output = '\t'.join(outlist)
            o.write(output)
    o.close()
def main():
    usage = 'usage: %prog anchor_results.txt anchor_results_null.txt\n'\
        'Requires two input arguments:\n'\
        '1) Interesting anchor results, output from run_anchor_batch.py\n'\
        '2) Null anchor results, output from run_anchor_batch.py\n'
    parser = OptionParser(usage=usage)
    parser.add_option('-1',
                      '--exon_label1',
                      dest='exon_label1',
                      default='Exon label 1',
                      help='Exon label of anchor_results.txt.')
    parser.add_option('-2',
                      '--exon_label2',
                      dest='exon_label2',
                      default='Exon label 2',
                      help='Exon label of anchor_results_null.txt')
    parser.add_option(
        '-t',
        '--title',
        dest='title',
        default='Fraction of exons with predicted binding regions',
        help='Title of plot.')
    (options, args) = parser.parse_args()
    if len(args) != 2:
        print('Two arguments need to be specified in command line.\n')
        print(usage)
        sys.exit()
    anchor_results_path = args[0]
    anchor_results_null_path = args[1]
    exon_label1 = options.exon_label1
    exon_label2 = options.exon_label2
    mytitle = options.title

    # init dic with keys and empty lists
    anchor_dic = {}
    for key in ['binding', 'non_binding', 'total']:
        anchor_dic[key] = []

    for results in [anchor_results_path, anchor_results_null_path]:
        binding_count, total_count = count_anchor_results(results)
        non_binding_count = total_count - binding_count
        for key, val in zip(['binding', 'non_binding', 'total'],
                            [binding_count, non_binding_count, total_count]):
            anchor_dic[key].append(val)

    oddsratio, pvalue = \
        fisher_exact([anchor_dic['binding'], anchor_dic['non_binding']])

    print('oddsratio: %s\npvalue: %s' % (oddsratio, pvalue))

    # plot distributions (from plot_meme_motif_null_comparison.py)
    mylabels = [exon_label1, exon_label2]
    # Plot bargraphs
    frac_binding = float(anchor_dic['binding'][0]) / anchor_dic['total'][0]
    frac_binding_null = float(
        anchor_dic['binding'][1]) / anchor_dic['total'][1]
    myvals = [frac_binding, frac_binding_null]
    plot_barplot(myvals, mytitle, mylabels,
                 ylabel='Fraction predicted binding regions',
                 mytext1='%i/%i' % (anchor_dic['binding'][0],
                                    anchor_dic['total'][0]),
                 mytext2='%i/%i' % (anchor_dic['binding'][1],
                                    anchor_dic['total'][1]),
                 mytext3="*Fisher's Exact Test\nP-value=%.2e" % pvalue,
                 ymin=0,
                 ymax=1,
                 width=0.5)
    plt.show()