Example #1
0
def get_pseudogenes():
    """Load the Sanger pseudogene annotation, grouped by chromosome.

    Reads the hard-coded tab-separated annotation file and builds one
    ``utilityModule.Gene`` per usable row.  Columns are read by position:

        0  name                  1  gene ID
        2  chromosome            3  strand
        4-5  transcript start/end
        6-7  coding start/end
        9  exon starts           10 exon ends

    Column 8 is not read.  The exon columns are comma-separated lists whose
    final (post-last-comma) token is discarded.

    A negative coding end is clamped to 0, and rows whose coding start and
    clamped coding end are both 0 are skipped.  Blank lines are ignored.

    Returns:
        dict mapping chromosome name to a list of ``[txStart, txEnd, Gene]``
        entries, in file order.

    Raises:
        IOError: if the annotation file cannot be opened.
        IndexError: if a non-blank row has too few columns.
        ValueError: if a coordinate field is not an integer.
    """
    filename = '/lab/bartel3_ata/jwnam/c.elgans_may_2008/annotation/sangerPseudoGene.txt'
    geneList = {}
    # The context manager guarantees the handle is closed even when a
    # malformed row raises part-way through the file.
    with open(filename) as handle:
        for line in handle:
            if not line.strip():
                continue
            fields = line.split('\t')
            name = fields[0]
            geneID = fields[1]
            chrom = fields[2]
            sense = fields[3]
            # List comprehensions (rather than map) keep the coordinates
            # indexable on both Python 2 and Python 3.
            txCoords = [int(fields[4]), int(fields[5])]
            cdCoords = [int(fields[6]), max(int(fields[7]), 0)]
            exStarts = [int(tok) for tok in fields[9].split(',')[:-1]]
            exEnds = [int(tok) for tok in fields[10].split(',')[:-1]]
            if cdCoords[0] == 0 and cdCoords[1] == 0:
                continue
            newGene = utilityModule.Gene(name, geneID, chrom, sense, txCoords,
                                         cdCoords, exStarts, exEnds)
            geneList.setdefault(chrom, []).append(
                [txCoords[0], txCoords[1], newGene])
    return geneList
Example #2
0
def getRefFlatbyChrom(genomeName, test=lambda g: True):
    """Load the gene annotation for *genomeName*, grouped by chromosome.

    The annotation file is looked up in a fixed genome-name -> path table
    and parsed as tab-separated rows.  Rows with exactly 10 columns have
    their first column duplicated so that every row is handled with the
    11-column layout:

        0  name                  1  gene ID
        2  chromosome            3  strand
        4-5  transcript start/end
        6-7  coding start/end
        9  exon starts           10 exon ends

    Column 8 is not read.  The exon columns are comma-separated lists whose
    final (post-last-comma) token is discarded.  Every exon start is shifted
    by +1 before the gene is built; exon ends are passed through unchanged.

    A negative coding end is clamped to 0, and rows whose coding start and
    clamped coding end are both 0 are skipped.  Blank lines are ignored.

    Args:
        genomeName: key into the genome table (e.g. ``'hg19'``, ``'ce6'``).
        test: accepted for interface compatibility; this function never
            calls it.

    Returns:
        dict mapping chromosome name to a list of ``[txStart, txEnd, Gene]``
        entries, in file order.

    Raises:
        KeyError: if *genomeName* is not in the genome table.
        IOError: if the annotation file cannot be opened.
        IndexError: if a non-blank row has too few columns.
        ValueError: if a coordinate field is not an integer.
    """
    genomeToFile = {
        'hg18': '/lab/bartel3_ata/jwnam/h.sapiens/refFlat.txt',
        'hg19': 'refFlat.txt',
        'GRCh37.66': '/lab/bartel3_ata/jwnam/h.sapiens/hg19/GRCh37.66.genePred2',
        'ce6': '/lab/bartel3_ata/jwnam/c.elgans_may_2008/annotation/refFlat.txt',
        'dm3': '/lab/bartel3_ata/jwnam/d.melanogaster/annotation/refFlat.txt',
        'mm9': '/lab/bartel6_ata/nam/mouse/annotation/refFlat_mm9.txt',
        'danRer6': '/lab/bartel3_ata/jwnam/zebrafish/refFlat.txt',
        'danRer7': '/lab/bartel3_ata/jwnam/zebrafish/danRer7/refFlat.txt',
        'sangerRna': '/lab/bartel3_ata/jwnam/c.elgans_may_2008/annotation/sangerRnaGene.txt',
        'sangerPseudogene': '/lab/bartel3_ata/jwnam/c.elgans_may_2008/annotation/sangerPseudoGene.txt',
        'sangerGene': '/lab/bartel3_ata/jwnam/c.elgans_may_2008/annotation/sangerGene.txt',
        'denovo_annotation': '/lab/solexa_bartel1/jwnam/RNASeq/Celegans/Combined_TophatOutput/corrected/gtfs/combined_step3.txt',
        'denovo_annotation_v1.3': '/lab/solexa_bartel1/jwnam/RNASeq/Celegans/Combined_TophatOutput/corrected/gtfs/combined_step3_v1.3.txt',
        'combined': '/lab/solexa_bartel1/jwnam/RNASeq/Celegans/Combined_TophatOutput/corrected/gtfs/newTest3P_directCombined.txt',
        'modEncode_Aggregate': '/lab/bartel1_ata/jwnam/Project/Celegans/modEncode/Robert_Aggregate/Aggregate_1003.integrated_transcripts.ucsc_browser.ws190.txt',
    }
    filename = genomeToFile[genomeName]
    geneList = {}
    # The context manager guarantees the handle is closed even when a
    # malformed row raises part-way through the file.
    with open(filename) as handle:
        for line in handle:
            if not line.strip():
                continue
            fields = line.split('\t')
            if len(fields) == 10:
                # 10-column rows lack a separate gene ID: reuse the name.
                fields.insert(1, fields[0])
            name = fields[0]
            geneID = fields[1]
            chrom = fields[2]
            sense = fields[3]
            # List comprehensions (rather than map) keep the coordinates
            # indexable on both Python 2 and Python 3.
            txCoords = [int(fields[4]), int(fields[5])]
            cdCoords = [int(fields[6]), max(int(fields[7]), 0)]
            # Fix the edges to work with the Gene specs: starts move by +1.
            exStarts = [int(tok) + 1 for tok in fields[9].split(',')[:-1]]
            exEnds = [int(tok) for tok in fields[10].split(',')[:-1]]
            if cdCoords[0] == 0 and cdCoords[1] == 0:
                continue
            newGene = utilityModule.Gene(name, geneID, chrom, sense, txCoords,
                                         cdCoords, exStarts, exEnds)
            geneList.setdefault(chrom, []).append(
                [txCoords[0], txCoords[1], newGene])
    return geneList
Example #3
0
def getNewRefFlat(genomeName, method, tpSeqSample, test=lambda g: True):
    """Load the annotation for *genomeName* with updated 3' ends applied.

    ``get_new3UTR(genomeName, method, tpSeqSample)`` supplies a mapping from
    gene ID to a replacement end coordinate.  For every gene with a positive
    replacement, the 3' end is moved: on the '+' strand the transcript end
    and the last exon end become the new value; otherwise the transcript
    start and the first exon start do.

    The annotation file is looked up in a fixed genome-name -> path table
    and parsed as tab-separated rows.  Rows with exactly 10 columns have
    their first column duplicated so every row uses the 11-column layout
    (0 name, 1 gene ID, 2 chromosome, 3 strand, 4-5 transcript start/end,
    6-7 coding start/end, 9 exon starts, 10 exon ends; column 8 is not
    read).  The exon columns are comma-separated lists whose final token is
    discarded.  Every exon end is shifted by -1; exon starts are unchanged.

    Rows with an empty name get a running number as their name; for the
    three ``sanger*`` genomes the gene ID is used as the name.  A negative
    coding end is clamped to 0, and rows whose coding start and clamped
    coding end are both 0 are skipped.  Blank lines are ignored.  When a
    gene ID occurs more than once, the entry with the strictly longest
    transcript span is kept.

    Two summary lines are printed: how many rows had their 3' end moved by
    at least 1 and by at least 100 positions.  Rows are counted before the
    coding filter is applied.

    Args:
        genomeName: key into the genome table (e.g. ``'ce6'``, ``'mm9'``).
        method: forwarded unchanged to ``get_new3UTR``.
        tpSeqSample: forwarded unchanged to ``get_new3UTR``.
        test: accepted for interface compatibility; this function never
            calls it.

    Returns:
        dict mapping gene ID to its ``utilityModule.Gene``.

    Raises:
        KeyError: if *genomeName* is not in the genome table.
        IOError: if the annotation file cannot be opened.
        IndexError: if a non-blank row has too few columns, or an updated
            gene has an empty exon list.
        ValueError: if a coordinate field is not an integer.
    """
    newRefFlat = get_new3UTR(genomeName, method, tpSeqSample)
    genomeToFile = {
        'ce6': '/lab/bartel3_ata/jwnam/c.elgans_may_2008/annotation/refFlat.txt',
        'danRer6': '/lab/bartel3_ata/jwnam/zebrafish/refFlat.txt',
        'danRer7': '/lab/bartel3_ata/jwnam/zebrafish/danRer7/refFlat.txt',
        'dm3': '/lab/bartel3_ata/jwnam/d.melanogaster/annotation/refFlat.txt',
        'mm9': '/lab/bartel6_ata/nam/mouse/annotation/refFlat_mm9.txt',
        'hg18_distinct': '/lab/bartel3_ata/jwnam/h.sapiens/nature_refSeq_distinct.fa',
        'hg19': 'refFlat.txt',
        'GRCh37.66': '/lab/bartel3_ata/jwnam/h.sapiens/hg19/GRCh37.66.genePred2',
        'sangerRna': '/lab/bartel3_ata/jwnam/c.elgans_may_2008/annotation/sangerRnaGene.txt',
        'sangerPseudogene': '/lab/bartel3_ata/jwnam/c.elgans_may_2008/annotation/sangerPseudoGene.txt',
        'sangerGene': '/lab/bartel3_ata/jwnam/c.elgans_may_2008/annotation/sangerGene.txt',
        'denovo_annotation': '/lab/solexa_bartel1/jwnam/RNASeq/Celegans/Combined_TophatOutput/corrected/gtfs/combined_step3.txt',
        'denovo_annotation_v1.3': '/lab/solexa_bartel1/jwnam/RNASeq/Celegans/Combined_TophatOutput/corrected/gtfs/combined_step3_v1.3.txt',
        'combined': '/lab/solexa_bartel1/jwnam/RNASeq/Celegans/Combined_TophatOutput/corrected/gtfs/newTest3P_directCombined.txt',
        'modEncode_Aggregate': '/lab/bartel1_ata/jwnam/Project/Celegans/modEncode/Robert_Aggregate/Aggregate_1003.integrated_transcripts.ucsc_browser.ws190.txt',
    }
    filename = genomeToFile[genomeName]
    nameFromGeneID = genomeName in ('sangerGene', 'sangerPseudogene',
                                    'sangerRna')
    geneList = {}
    nnum = 0
    updatedUTRNum = 0
    updatedUTRNum_100D = 0
    # The context manager guarantees the handle is closed even when a
    # malformed row raises part-way through the file.
    with open(filename) as handle:
        for line in handle:
            if not line.strip():
                continue
            fields = line.split('\t')
            if len(fields) == 10:
                # 10-column rows lack a separate gene ID: reuse the name.
                fields.insert(1, fields[0])
            name = fields[0]
            if not name:
                nnum += 1
                name = str(nnum)
            geneID = fields[1]
            if nameFromGeneID:
                name = geneID
            newEnd = int(newRefFlat.get(geneID, -1))

            chrom = fields[2]
            sense = fields[3]
            # List comprehensions (rather than map) keep the coordinates
            # indexable and assignable on both Python 2 and Python 3.
            txCoords = [int(fields[4]), int(fields[5])]
            exStarts = [int(tok) for tok in fields[9].split(',')[:-1]]
            # Fix the edges to work with the Gene specs: ends move by -1.
            exEnds = [int(tok) - 1 for tok in fields[10].split(',')[:-1]]

            if newEnd > 0:
                # Measure the shift BEFORE overwriting the old boundary;
                # measuring afterwards always yields 0 and the counters
                # below would never advance.
                if sense == '+':
                    shift = abs(exEnds[-1] - newEnd)
                    txCoords = [txCoords[0], newEnd]
                    exEnds[-1] = newEnd
                else:
                    shift = abs(exStarts[0] - newEnd)
                    txCoords = [newEnd, txCoords[1]]
                    exStarts[0] = newEnd
                if shift >= 100:
                    updatedUTRNum_100D += 1
                if shift >= 1:
                    updatedUTRNum += 1

            cdCoords = [int(fields[6]), max(int(fields[7]), 0)]
            if cdCoords[0] == 0 and cdCoords[1] == 0:
                continue
            newGene = utilityModule.Gene(name, geneID, chrom, sense, txCoords,
                                         cdCoords, exStarts, exEnds)
            if geneID not in geneList:
                geneList[geneID] = newGene
            else:
                # Duplicate gene ID: keep whichever transcript is longer.
                preGene = geneList[geneID]
                geneLen = preGene.txLocus().end() - preGene.txLocus().start()
                geneLen2 = txCoords[1] - txCoords[0]
                if geneLen2 > geneLen:
                    geneList[geneID] = newGene
    # Single-argument call form prints identically on Python 2 and 3.
    print('#Updated 3UTR Number: ' + str(updatedUTRNum))
    print('#Updated 3UTR Number(>=100): ' + str(updatedUTRNum_100D))
    return geneList