Exemple #1
0
def isCoding(chr, start, end, cds_gf):
    """Report whether a genomic interval overlaps any record of an index.

    chr, start, end -- interval to look up; ``chr`` is normalised with
        ``chrFill`` before the query string is built.
    cds_gf -- either a path (``str``), which is opened with
        ``tabix.Tabix``, or an already-open object exposing ``fetch``.

    Returns True when the fetch yields at least one record, else False.
    """
    contig = chrFill(chr)
    # Accept both a filename and a pre-opened index so callers can reuse
    # one handle across many queries.
    index = tabix.Tabix(cds_gf) if isinstance(cds_gf, str) else cds_gf
    region = '%s:%i-%i' % (contig, start, end)
    hits = list(index.fetch(region))
    return len(hits) > 0
Exemple #2
0
def getGeneFromCoords(
    chr,
    ex,
    genesFile='/net/crate-04/data/burge/alexrson/finalAnalyses'
    '/long_short_exons/genelines.sorted.gff.gz'):
    """Resolve the gene that an exon-like interval belongs to.

    chr -- chromosome name; normalised with ``chrFill``.
    ex -- iterable of coordinates; its min and max define the query span.
    genesFile -- path (``str``) opened with ``tabix.Tabix``, or an
        already-open object exposing ``fetch``. Records are expected to be
        tab-separated GFF-style lines (type in column 3, start/end in
        columns 4-5, attributes in column 9).

    Resolution order:
      1. a CDS record sharing a boundary with the query -> its transcript's
         gene;
      2. an exon record sharing a boundary with the query -> its
         transcript's gene;
      3. the only gene seen in the region, if there is exactly one;
      4. a gene whose source column is 'protein_coding', 'rkb' or 'liana';
      5. a gene whose source column is 'ucsc.knownGene-kgXref-ensGene';
      6. any remaining gene.

    Returns the gene identifier, or None (after printing the query for
    diagnosis) when the region contains no gene/mRNA record at all.
    """
    start_q, end_q = min(ex), max(ex)
    chr = chrFill(chr)
    if isinstance(genesFile, str):
        gf = tabix.Tabix(genesFile)
    else:
        gf = genesFile
    regs = list(gf.fetch('%s:%i-%i' % (chr, start_q, end_q)))

    query_edges = (start_q, end_q)
    genes = set()          # (source column, gene id) pairs seen in region
    trans2gene = {}        # transcript id -> parent gene id
    found_trans = False
    found_CDS = False
    for reg in regs:
        regl = reg.split('\t')
        attri = readAttributes.readAttributesIntoDict(regl[8])
        feat_start, feat_end = map(int, regl[3:5])
        feature = regl[2]
        if feature in ('exon', 'CDS'):
            # A feature "matches" when either of its ends coincides with
            # either end of the query interval.
            if feat_start in query_edges or feat_end in query_edges:
                if feature == 'CDS':
                    found_CDS = attri['Parent'][0]
                else:
                    found_trans = attri['Parent'][0]
        elif feature == 'mRNA':
            gene = attri['Parent'][0]
            trans2gene[attri['ID'][0]] = gene
            genes.add((regl[1], gene))
        elif feature == 'gene':
            genes.add((regl[1], attri['ID'][0]))

    # Prefer the CDS match over the exon match. A matching feature whose
    # parent transcript has no 'mRNA' record in the fetched region used to
    # raise KeyError here; fall through to the gene-level heuristics
    # instead.
    for transcript in (found_CDS, found_trans):
        if transcript and transcript in trans2gene:
            return trans2gene[transcript]

    if len(genes) == 1:
        return genes.pop()[1]
    for preferred_sources in (('protein_coding', 'rkb', 'liana'),
                              ('ucsc.knownGene-kgXref-ensGene',)):
        for esp, gene in genes:
            if esp in preferred_sources:
                return gene
    if not genes:
        # Same output as the Python 2 statement ``print regs, chr, ex``.
        print('%s %s %s' % (regs, chr, ex))
        return None
    return genes.pop()[1]
Exemple #3
0
            chrm = 'chr%s' % (chrm)
            gene = attr.split('"')[1]
            ref[(chrm, start)].add(gene)
            ref[(chrm, stop)].add(gene)
    return ref


if __name__ == '__main__':

    reffiles = sys.argv[1].split(',')
    indir = sys.argv[2]

    gene2go, gene2name = get_go_bits_from_david()
    known2ens = get_known2ens()
    ref = get_ref(reffiles)
    tab = tabix.Tabix(
        '/net/afterthefact/data/jmerkin/Mus_musculus.NCBIM37.67.gtf.gz')
    types = []
    faileds = []

    conv_file = open('convert_allevents_id2gene', 'w')

    for misotype in os.listdir(indir):
        reffunc, this_ref = get_reffunc(misotype, known2ens, ref)
        print misotype, reffunc
        if reffunc is None: continue
        types.append(misotype)
        fi = open(
            '%s/%s/Comparisons/N2ASoma_vs_N2AAxon/bayes-factors/N2ASoma_vs_N2AAxon.miso_bf'
            % (
                indir,
                misotype,
Exemple #4
0
 def setUp(self):
     """Open EXAMPLEFILE with tabix.Tabix and keep the handle on self.tb."""
     handle = tabix.Tabix(EXAMPLEFILE)
     self.tb = handle
Exemple #5
0
import sys, tabix, optparse
# Command-line interface: positional arguments are the input bed file, the
# output file and, optionally, a tabix-indexed conservation file.
p = optparse.OptionParser()
p.add_option(
    '-p',
    '--padded',
    action='store',
    dest='pad',
    help=
    'Program will remove this amount of padding when checking for scores, but will remain in the output bed file',
    default=0)
options, args = p.parse_args()
# PWM bed
bed = open(args[0], 'rU')
# PhastCons: use the file given as third positional argument; if it is
# missing or cannot be opened, fall back to the default file. Catch
# Exception rather than using a bare except so that KeyboardInterrupt and
# SystemExit are not swallowed by the fallback.
try:
    fc = tabix.Tabix(args[2])
except Exception:
    fc = tabix.Tabix('fastcons44.bed.gz')
#Output
w = open(args[1], 'w')

for line in bed:
    scores = []
    l1 = line.strip().split('\t')
    chrom = l1[0]
    #Positive strand
    if int(l1[1]) > 0:
        start = int(l1[1])
        end = int(l1[2])
    #Negative strand
    else:
Exemple #6
0
def main():
    """Run a GO-term enrichment over the N2ASoma-vs-N2AAxon MISO comparisons.

    Reads three command-line arguments from ``sys.argv``:
      1. reference file passed to ``get_ref``;
      2. directory whose entries are per-event-type MISO directories;
      3. minimum value for column 9 of a ``.miso_bf`` row (``minbf``).

    For each event type that ``get_reffunc`` knows how to handle, every row
    of its ``.miso_bf`` file is mapped to a gene. Genes with at least one
    row form the background set; genes with a row whose column 9 exceeds
    ``minbf`` and whose absolute column 8 exceeds 0.25 form the foreground
    set. Each GO term carried by at least two genes in both sets is scored
    with a one-sided Fisher exact test.

    Files written to the current directory:
      convert_allevents_id2gene -- every mapped row prefixed with event
          type and gene (plus each input file's header line);
      go_analyses_<suffix>      -- enriched GO terms;
      genes_sigdif_<suffix>     -- foreground genes, one per line;
      genes_all_<suffix>        -- background genes, one per line.
    """
    reffile = sys.argv[1]
    indir = sys.argv[2]
    minbf = float(sys.argv[3])
    minpsi = .25
    gene2go, gene2name = get_go_bits_from_david()
    go2gene = defaultdict(set)
    these, alls = {}, {}
    faileds, types = [], []
    go_fore, go_back = defaultdict(int), defaultdict(int)
    tab = tabix.Tabix(
        '/net/afterthefact/data/jmerkin/Mus_musculus.NCBIM37.67.gtf.gz')

    known2ens = get_known2ens()
    ref = get_ref(reffile)

    with open('convert_allevents_id2gene', 'w') as conv_file:
        for misotype in os.listdir(indir):
            reffunc, this_ref = get_reffunc(misotype, known2ens, ref)
            if reffunc is None:
                continue
            types.append(misotype)
            # os.listdir returns bare entry names, so the path has to be
            # anchored at indir; otherwise it only resolves when the
            # script happens to be started from inside indir.
            bf_path = ('%s/%s/Comparisons/N2ASoma_vs_N2AAxon/bayes-factors/'
                       'N2ASoma_vs_N2AAxon.miso_bf' % (indir, misotype))
            with open(bf_path, 'r') as fi:
                # First line is copied through unchanged (header).
                conv_file.write(fi.readline())
                for line in fi:
                    line = line.split('\t')
                    gene = reffunc(this_ref, line, tab)
                    if not gene:
                        faileds.append(misotype)
                        continue
                    if (float(line[8]) > minbf
                            and abs(float(line[7])) > minpsi):
                        these[gene] = True
                    alls[gene] = True
                    # The last field still carries the newline, so no
                    # extra terminator is written.
                    conv_file.write('%s\t%s\t%s' %
                                    (misotype, gene, '\t'.join(line)))
    types = '.'.join(types)

    # Count, per GO term, how many background / foreground genes carry it.
    for gene in alls:
        for go in gene2go[gene]:
            go_back[go] += 1
    for gene in these:
        for go in gene2go[gene]:
            go_fore[go] += 1
            go2gene[go].add('%s:%s' % (gene, gene2name[gene]))

    fore = len(these)
    back = len(alls)
    scoreds = []
    for go in go_back:
        back_with = go_back[go]
        fore_with = go_fore[go]
        if min(back_with, fore_with) < 2:
            continue
        back_without = back - back_with
        fore_without = fore - fore_with
        # Safe division: back_with >= 2 here, and fore_with >= 2 implies
        # the foreground set is non-empty.
        fold = float(fore_with * back) / float(back_with * fore)
        table = np.array([[fore_with, fore_without],
                          [back_with, back_without]])
        pval = fisher_exact(table, alternative='greater')[1]
        scoreds.append((pval, go, fold))
    scoreds.sort(key=lambda xx: xx[0])

    # zip(*[]) cannot be unpacked into three names; fall back to empty
    # columns so the output files are still produced when no term
    # qualifies.
    if scoreds:
        scores, names, folds = zip(*scoreds)
    else:
        scores, names, folds = (), (), ()
    names = np.array(names)
    scores = np.array(scores)
    nscores = scores.shape[0]
    folds = np.array(folds)

    # Adjusted p-value per term: p * n / (n - knum), where knum is the
    # number of terms processed before the current run of tied p-values
    # started being counted.
    # NOTE(review): this is not the textbook Benjamini-Hochberg adjustment
    # (p * n / rank) -- confirm the intended formula before relying on the
    # 'benjamini' column.
    benjamini = []
    oldp, knum, store = 0, 0, 0
    for ii in scores:
        benjamini.append(ii * nscores / (nscores - knum))
        store += 1
        if oldp != ii:
            # A new p-value: fold the pending tie count into knum.
            knum += store
            store = 0
        oldp = ii
    benjamini = np.minimum(np.array(benjamini), 1.)

    print(faileds)
    print('failed %s' % len(faileds))
    print('these %s' % len(these))
    print('all %s' % len(alls))

    # Per-rank thresholds rank * Q / n used for the step-up selection.
    Q = 0.05
    iis = np.arange(nscores) + 1
    Qs = iis * Q / max(nscores, 1)

    f_end = 'bf%s_psi%s_%s' % (minbf, minpsi, types)
    with open('go_analyses_%s' % (f_end), 'w') as outf:
        outf.write('term\tp-value\tbenjamini\tfdr\tfold_enrich\tgenes\n')
        passed = False
        rows = list(zip(names, scores, benjamini, folds, Qs))
        # Walk from the largest p-value down: once one term is below its
        # threshold, every term with a smaller p-value is reported too.
        for go, pvalue, benj, fold, qv in reversed(rows):
            if fold < 1:
                continue
            if passed or pvalue < qv:
                fields = [go, pvalue, benj, qv, fold, ';'.join(go2gene[go])]
                outf.write('\t'.join(map(str, fields)))
                outf.write('\n')
                passed = True

    for prefix, gene_set in (('genes_sigdif', these), ('genes_all', alls)):
        with open('%s_%s' % (prefix, f_end), 'w') as fout:
            for gene in gene_set:
                fout.write(gene)
                fout.write('\n')
Exemple #7
0
'''
Appends the average conservation score of a region to each entry in a bed file

usage:
    python conservationBed.py [OPTIONS] bedfile outputfile <conservationfile> 

'''
import sys, tabix, optparse
# Command-line interface: positional arguments are the input bed file, the
# output file and, optionally, a tabix-indexed conservation file.
p = optparse.OptionParser()
p.add_option(
    '-p',
    '--padded',
    action='store',
    dest='pad',
    help='Program will remove this amount of padding when checking for scores, but will remain in the output bed file',
    default=0)
options, args = p.parse_args()
# PWM bed
bed = open(args[0], 'rU')
# PhastCons: use the file given as third positional argument; if it is
# missing or cannot be opened, fall back to the default file. Catch
# Exception rather than using a bare except so that KeyboardInterrupt and
# SystemExit are not swallowed by the fallback.
try:
    fc = tabix.Tabix(args[2])
except Exception:
    fc = tabix.Tabix('/gen_local/hsuj/ref/PWM/fastcons44.bed.gz')
#Output
w = open(args[1], 'w')

def main():
    for line in bed:
        scores = []
        l1 = line.strip().split('\t')
        chrom = l1[0]
        start = int(l1[1])
        end = int(l1[2])
        start += int(options.pad)
        end -= int(options.pad)