def isCoding(chr, start, end, cds_gf):
    """Report whether any indexed record overlaps ``chr:start-end``.

    Args:
        chr: Chromosome name; normalised with ``chrFill`` before querying.
            (The name shadows the builtin but is kept for callers.)
        start: Region start, formatted with ``%i`` into the query string.
        end: Region end, formatted with ``%i`` into the query string.
        cds_gf: Either a path (``str``), which is opened with
            ``tabix.Tabix``, or an already opened object exposing
            ``fetch(region)``. Presumably a CDS annotation, going by the
            parameter name -- confirm against callers.

    Returns:
        bool: True if ``fetch`` yields at least one record, else False.
    """
    chr = chrFill(chr)
    gf = tabix.Tabix(cds_gf) if isinstance(cds_gf, str) else cds_gf
    # Only existence matters, so stop at the first record instead of
    # loading every overlapping line into a list just to measure it.
    for _ in gf.fetch('%s:%i-%i' % (chr, start, end)):
        return True
    return False
def getGeneFromCoords(
        chr, ex,
        genesFile='/net/crate-04/data/burge/alexrson/finalAnalyses'
                  '/long_short_exons/genelines.sorted.gff.gz'):
    """Pick a gene identifier for the region spanned by ``ex``.

    The region ``chr:min(ex)-max(ex)`` is fetched from the annotation and
    each tab-separated record is inspected by its third column:

    * ``exon`` / ``CDS``: if its start or end (columns 4-5) equals either
      end of the query, its first ``Parent`` attribute is remembered (the
      last matching record wins).
    * ``mRNA``: maps its ``ID`` to its ``Parent`` and adds
      ``(column 2, Parent)`` to the candidate genes.
    * ``gene``: adds ``(column 2, ID)`` to the candidate genes.

    The result is chosen in this order: gene of the matched CDS parent,
    gene of the matched exon parent, the sole candidate, a candidate whose
    column-2 value is one of ``protein_coding``/``rkb``/``liana``, a
    candidate from ``ucsc.knownGene-kgXref-ensGene``, and finally an
    arbitrary candidate.

    Args:
        chr: Chromosome name; normalised with ``chrFill``.
        ex: Iterable of coordinates; only its min and max are used.
        genesFile: Path (``str``) opened with ``tabix.Tabix``, or an
            already opened object exposing ``fetch(region)``.

    Returns:
        The chosen gene identifier, or None (after printing the fetched
        records, ``chr`` and ``ex``) when no candidate gene was found.

    Raises:
        KeyError: If a matched exon/CDS parent has no ``mRNA`` record
            among the fetched lines.
    """
    lo, hi = min(ex), max(ex)
    chr = chrFill(chr)
    if isinstance(genesFile, str):
        annotation = tabix.Tabix(genesFile)
    else:
        annotation = genesFile
    regs = list(annotation.fetch('%s:%i-%i' % (chr, lo, hi)))

    query_edges = (lo, hi)
    candidates = set()
    gene_of_transcript = {}
    exon_parent = False
    cds_parent = False

    for record in regs:
        fields = record.split('\t')
        attrs = readAttributes.readAttributesIntoDict(fields[8])
        feat_start, feat_end = [int(value) for value in fields[3:5]]
        feature = fields[2]
        # True when the feature shares a boundary with the query region.
        shares_edge = feat_start in query_edges or feat_end in query_edges

        if feature == 'exon':
            if shares_edge:
                exon_parent = attrs['Parent'][0]
        elif feature == 'CDS':
            if shares_edge:
                cds_parent = attrs['Parent'][0]
        elif feature == 'mRNA':
            gene_id = attrs['Parent'][0]
            gene_of_transcript[attrs['ID'][0]] = gene_id
            candidates.add((fields[1], gene_id))
        elif feature == 'gene':
            candidates.add((fields[1], attrs['ID'][0]))

    # A boundary-sharing CDS outranks a boundary-sharing exon.
    if cds_parent:
        return gene_of_transcript[cds_parent]
    if exon_parent:
        return gene_of_transcript[exon_parent]
    if len(candidates) == 1:
        return candidates.pop()[1]

    # Otherwise prefer candidates by their column-2 label, in tiers.
    preference_tiers = (
        ['protein_coding', 'rkb', 'liana'],
        ['ucsc.knownGene-kgXref-ensGene'],
    )
    for tier in preference_tiers:
        for label, gene_id in candidates:
            if label in tier:
                return gene_id

    if not candidates:
        # Single-argument form prints the same text under the Python 2
        # print statement and the Python 3 print function.
        print('%s %s %s' % (regs, chr, ex))
        return None
    return candidates.pop()[1]
chrm = 'chr%s' % (chrm) gene = attr.split('"')[1] ref[(chrm, start)].add(gene) ref[(chrm, stop)].add(gene) return ref if __name__ == '__main__': reffiles = sys.argv[1].split(',') indir = sys.argv[2] gene2go, gene2name = get_go_bits_from_david() known2ens = get_known2ens() ref = get_ref(reffiles) tab = tabix.Tabix( '/net/afterthefact/data/jmerkin/Mus_musculus.NCBIM37.67.gtf.gz') types = [] faileds = [] conv_file = open('convert_allevents_id2gene', 'w') for misotype in os.listdir(indir): reffunc, this_ref = get_reffunc(misotype, known2ens, ref) print misotype, reffunc if reffunc is None: continue types.append(misotype) fi = open( '%s/%s/Comparisons/N2ASoma_vs_N2AAxon/bayes-factors/N2ASoma_vs_N2AAxon.miso_bf' % ( indir, misotype,
def setUp(self):
    """Open ``EXAMPLEFILE`` with ``tabix.Tabix`` and keep it on ``self.tb``."""
    reader = tabix.Tabix(EXAMPLEFILE)
    self.tb = reader
import sys, tabix, optparse p = optparse.OptionParser() p.add_option( '-p', '--padded', action='store', dest='pad', help= 'Program will remove this amount of padding when checking for scores, but will remain in the output bed file', default=0) options, args = p.parse_args() # PWM bed bed = open(args[0], 'rU') # PhastCons try: fc = tabix.Tabix(args[2]) except: fc = tabix.Tabix('fastcons44.bed.gz') #Output w = open(args[1], 'w') for line in bed: scores = [] l1 = line.strip().split('\t') chrom = l1[0] #Positive strand if int(l1[1]) > 0: start = int(l1[1]) end = int(l1[2]) #Negative strand else:
def main():
    """Test GO terms for enrichment among events passing the cut-offs.

    Command-line arguments (read from ``sys.argv``):
        1. reference file, handed to ``get_ref``
        2. input directory; each entry name is handed to ``get_reffunc``
        3. minimum value for column index 8 of the ``.miso_bf`` rows
           (presumably the Bayes factor -- confirm against the file format)

    For every entry of the input directory that ``get_reffunc`` accepts,
    the file ``<indir>/<entry>/Comparisons/N2ASoma_vs_N2AAxon/bayes-factors/
    N2ASoma_vs_N2AAxon.miso_bf`` is read. Each row is mapped to a gene; all
    mapped genes form the background, and those with column 8 > ``minbf``
    and ``abs(column 7) > 0.25`` form the foreground. Each GO term with at
    least two genes in both sets gets a one-sided Fisher exact test.

    Files written to the current directory:
        convert_allevents_id2gene: header line(s) plus every mapped row,
            prefixed with the entry name and gene.
        go_analyses_<suffix>: reported GO terms.
        genes_sigdif_<suffix>: foreground genes, one per line.
        genes_all_<suffix>: background genes, one per line.
    """
    reffile = sys.argv[1]
    indir = sys.argv[2]
    minbf = float(sys.argv[3])
    minpsi = .25

    gene2go, gene2name = get_go_bits_from_david()
    go2gene = defaultdict(set)
    # Dicts used as sets; kept as dicts so output ordering is unchanged.
    these, alls = {}, {}
    faileds, types = [], []
    go_fore, go_back = defaultdict(int), defaultdict(int)
    tab = tabix.Tabix(
        '/net/afterthefact/data/jmerkin/Mus_musculus.NCBIM37.67.gtf.gz')
    known2ens = get_known2ens()
    ref = get_ref(reffile)

    with open('convert_allevents_id2gene', 'w') as conv_file:
        for misotype in os.listdir(indir):
            reffunc, this_ref = get_reffunc(misotype, known2ens, ref)
            if reffunc is None:
                continue
            types.append(misotype)
            # The entry names come from listing ``indir``, so the path must
            # be anchored there rather than at the working directory.
            bf_path = os.path.join(
                indir, misotype, 'Comparisons', 'N2ASoma_vs_N2AAxon',
                'bayes-factors', 'N2ASoma_vs_N2AAxon.miso_bf')
            with open(bf_path, 'r') as fi:
                # First line is copied through unchanged (header).
                conv_file.write(fi.readline())
                for line in fi:
                    line = line.split('\t')
                    gene = reffunc(this_ref, line, tab)
                    if gene:
                        if (float(line[8]) > minbf
                                and abs(float(line[7])) > minpsi):
                            these[gene] = True
                        alls[gene] = True
                        conv_file.write(
                            '%s\t%s\t%s' % (misotype, gene, '\t'.join(line)))
                    else:
                        faileds.append(misotype)

    types = '.'.join(types)

    # Per-term gene counts in the background and in the foreground.
    for gene in alls:
        for go in gene2go[gene]:
            go_back[go] += 1
    for gene in these:
        for go in gene2go[gene]:
            go_fore[go] += 1
            go2gene[go].add('%s:%s' % (gene, gene2name[gene]))
    fore = len(these)
    back = len(alls)

    scoreds = []
    for go in go_back:
        back_with = go_back[go]
        fore_with = go_fore[go]
        if min(back_with, fore_with) < 2:
            continue
        back_without = back - back_with
        fore_without = fore - fore_with
        try:
            fold = float(fore_with * back) / float(back_with * fore)
        except ZeroDivisionError:
            # Defensive only: the guard above keeps both factors positive.
            fold = 'NA'
        table = np.array([
            [fore_with, fore_without],
            [back_with, back_without],
        ])
        pval = fisher_exact(table, alternative='greater')[1]
        scoreds.append((pval, go, fold))

    scoreds.sort(key=lambda xx: xx[0])
    if scoreds:
        scores, names, folds = zip(*scoreds)
    else:
        # No term qualified: carry on and emit header-only results instead
        # of failing on the unpacking of an empty zip.
        scores, names, folds = (), (), ()
    names = np.array(names)
    scores = np.array(scores)
    folds = np.array(folds)
    nscores = scores.shape[0]

    # NOTE(review): for the k-th smallest p-value (ties share the rank of
    # their first member) this yields p * n / (n - k + 1), which is not
    # the usual Benjamini-Hochberg adjustment p * n / k -- confirm intent.
    # The value is only written to the output; selection below uses ``Qs``.
    benjamini = []
    oldp, knum, store = 0, 0, 0
    for ii in scores:
        benjamini.append(ii * nscores / (nscores - knum))
        store += 1
        if oldp != ii:
            knum += store
            store = 0
            oldp = ii
    benjamini = np.minimum(np.array(benjamini), 1.)

    # Single-argument print behaves identically as a Python 2 statement
    # and as a Python 3 function call.
    print(faileds)
    print('%s %s' % ('failed', len(faileds)))
    print('%s %s' % ('these', len(these)))
    print('%s %s' % ('all', len(alls)))

    # Per-rank thresholds rank * Q / n for the step-up procedure below.
    iis = np.arange(nscores) + 1
    Q = 0.05
    Qs = iis * Q / max(nscores, 1)

    f_end = 'bf%s_psi%s_%s' % (minbf, minpsi, types)
    with open('go_analyses_%s' % (f_end), 'w') as outf:
        outf.write('term\tp-value\tbenjamini\tfdr\tfold_enrich\tgenes\n')
        passed = False
        # Walk from the largest p-value down; once one enriched term is
        # under its threshold, every remaining enriched term is reported.
        ranked = list(zip(names, scores, benjamini, folds, Qs))
        for go, pvalue, benj, fold, qv in reversed(ranked):
            if fold < 1:
                continue
            if passed or pvalue < qv:
                line = '\t'.join(
                    map(str, [go, pvalue, benj, qv, fold,
                              ';'.join(go2gene[go])]))
                outf.write(line)
                outf.write('\n')
                passed = True

    with open('genes_sigdif_%s' % (f_end), 'w') as fout:
        for gene in these:
            fout.write(gene)
            fout.write('\n')

    with open('genes_all_%s' % (f_end), 'w') as fout:
        for gene in alls:
            fout.write(gene)
            fout.write('\n')
''' Appends the average conservation score of a region to each entry in a bed file usage: python conservationBed.py [OPTIONS] bedfile outputfile <conservationfile> ''' import sys, tabix, optparse p = optparse.OptionParser() p.add_option('-p', '--padded',action = 'store', dest = 'pad', help = 'Program will remove this amount of padding when checking for scores, but will remain in the output bed file', default = 0) options, args = p.parse_args() # PWM bed bed = open(args[0], 'rU') # PhastCons try: fc = tabix.Tabix(args[2]) except: fc = tabix.Tabix('/gen_local/hsuj/ref/PWM/fastcons44.bed.gz') #Output w = open(args[1], 'w') def main(): for line in bed: scores = [] l1 = line.strip().split('\t') chrom = l1[0] start = int(l1[1]) end = int(l1[2]) start += int(options.pad) end -= int(options.pad)