def get_pseudogenes(filename=None):
    """Load pseudogene annotations grouped by chromosome.

    Each tab-separated line is read positionally: column 0 is the name,
    1 the gene ID, 2 the chromosome, 3 the strand, 4-5 the transcript
    start/end, 6-7 the coding start/end, and 9 / 10 the comma-terminated
    exon start / exon end lists (column 8 is not used).

    Records whose coding start and (clamped) coding end are both 0 are
    skipped. Blank lines and lines starting with '#' are ignored.

    Args:
        filename: Path of the annotation table. Defaults to the lab's
            sangerPseudoGene.txt, which is what the function always read
            before this parameter existed.

    Returns:
        dict mapping chromosome name to a list of
        ``[txStart, txEnd, utilityModule.Gene]`` entries, in file order.

    Raises:
        IOError/OSError: if the file cannot be opened.
        IndexError, ValueError: if a data line has too few columns or a
            non-integer coordinate.
    """
    if filename is None:
        filename = '/lab/bartel3_ata/jwnam/c.elgans_may_2008/annotation/sangerPseudoGene.txt'

    geneList = {}
    # The context manager closes the file even when a malformed line raises.
    with open(filename) as handle:
        for line in handle:
            # Tolerate trailing blank lines and '#' header rows.
            if not line.strip() or line.startswith('#'):
                continue

            fields = line.split('\t')
            name = fields[0]
            geneID = fields[1]
            chrom = fields[2]
            sense = fields[3]
            txCoords = [int(fields[4]), int(fields[5])]
            cdStart = int(fields[6])
            # A negative coding end is clamped to 0.
            cdCoords = [cdStart, max(int(fields[7]), 0)]
            # The exon lists end with a comma, so the last split item is
            # dropped. No coordinate shift is applied in this loader.
            exStarts = [int(v) for v in fields[9].split(',')[:-1]]
            exEnds = [int(v) for v in fields[10].split(',')[:-1]]

            if cdCoords[0] == 0 and cdCoords[1] == 0:
                continue

            newGene = utilityModule.Gene(name, geneID, chrom, sense,
                                         txCoords, cdCoords, exStarts, exEnds)
            geneList.setdefault(chrom, []).append(
                [txCoords[0], txCoords[1], newGene])
    return geneList
def getRefFlatbyChrom(genomeName, test=lambda g: True):
    """Load a gene annotation table and group its genes by chromosome.

    The table for ``genomeName`` is read line by line. Columns are used
    positionally: 0 name, 1 gene ID, 2 chromosome, 3 strand, 4-5
    transcript start/end, 6-7 coding start/end, 9 and 10 the
    comma-terminated exon start / exon end lists (column 8 is not used).
    A line with exactly 10 columns is treated as lacking the gene ID
    column, and the name is duplicated into that slot.

    Every exon start is shifted by +1 before the gene is built; exon ends
    and transcript coordinates are passed through unchanged. Records whose
    coding start and (clamped) coding end are both 0 are skipped. Blank
    lines and lines starting with '#' are ignored.

    Args:
        genomeName: Key selecting the annotation file (e.g. 'hg18', 'ce6',
            'mm9', 'sangerGene', ...).
        test: Accepted for interface compatibility; this function does not
            call it.

    Returns:
        dict mapping chromosome name to a list of
        ``[txStart, txEnd, utilityModule.Gene]`` entries, in file order.

    Raises:
        KeyError: if ``genomeName`` is not a known key.
        IOError/OSError: if the file cannot be opened.
        IndexError, ValueError: if a data line has too few columns or a
            non-integer coordinate.
    """
    genomeToFile = {
        'hg18': '/lab/bartel3_ata/jwnam/h.sapiens/refFlat.txt',
        'hg19': 'refFlat.txt',
        'GRCh37.66': '/lab/bartel3_ata/jwnam/h.sapiens/hg19/GRCh37.66.genePred2',
        'ce6': '/lab/bartel3_ata/jwnam/c.elgans_may_2008/annotation/refFlat.txt',
        'dm3': '/lab/bartel3_ata/jwnam/d.melanogaster/annotation/refFlat.txt',
        'mm9': '/lab/bartel6_ata/nam/mouse/annotation/refFlat_mm9.txt',
        'danRer6': '/lab/bartel3_ata/jwnam/zebrafish/refFlat.txt',
        'danRer7': '/lab/bartel3_ata/jwnam/zebrafish/danRer7/refFlat.txt',
        'sangerRna': '/lab/bartel3_ata/jwnam/c.elgans_may_2008/annotation/sangerRnaGene.txt',
        'sangerPseudogene': '/lab/bartel3_ata/jwnam/c.elgans_may_2008/annotation/sangerPseudoGene.txt',
        'sangerGene': '/lab/bartel3_ata/jwnam/c.elgans_may_2008/annotation/sangerGene.txt',
        'denovo_annotation': '/lab/solexa_bartel1/jwnam/RNASeq/Celegans/Combined_TophatOutput/corrected/gtfs/combined_step3.txt',
        'denovo_annotation_v1.3': '/lab/solexa_bartel1/jwnam/RNASeq/Celegans/Combined_TophatOutput/corrected/gtfs/combined_step3_v1.3.txt',
        'combined': '/lab/solexa_bartel1/jwnam/RNASeq/Celegans/Combined_TophatOutput/corrected/gtfs/newTest3P_directCombined.txt',
        'modEncode_Aggregate': '/lab/bartel1_ata/jwnam/Project/Celegans/modEncode/Robert_Aggregate/Aggregate_1003.integrated_transcripts.ucsc_browser.ws190.txt',
    }
    filename = genomeToFile[genomeName]

    geneList = {}
    # The context manager closes the file even when a malformed line raises.
    with open(filename) as handle:
        for line in handle:
            # Tolerate trailing blank lines and '#' header rows.
            if not line.strip() or line.startswith('#'):
                continue

            fields = line.split('\t')
            if len(fields) == 10:
                # No separate gene ID column: reuse the name as the ID so
                # the remaining column indices line up.
                fields.insert(1, fields[0])

            name = fields[0]
            geneID = fields[1]
            chrom = fields[2]
            sense = fields[3]
            txCoords = [int(fields[4]), int(fields[5])]
            cdStart = int(fields[6])
            # A negative coding end is clamped to 0.
            cdCoords = [cdStart, max(int(fields[7]), 0)]
            # The exon lists end with a comma, so the last split item is
            # dropped. Exon starts are shifted by one to fit the Gene specs.
            exStarts = [int(v) + 1 for v in fields[9].split(',')[:-1]]
            exEnds = [int(v) for v in fields[10].split(',')[:-1]]

            if cdCoords[0] == 0 and cdCoords[1] == 0:
                continue

            newGene = utilityModule.Gene(name, geneID, chrom, sense,
                                         txCoords, cdCoords, exStarts, exEnds)
            geneList.setdefault(chrom, []).append(
                [txCoords[0], txCoords[1], newGene])
    return geneList
def getNewRefFlat(genomeName, method, tpSeqSample, test=lambda g: True):
    """Load gene annotations with 3' ends replaced by re-annotated ends.

    ``get_new3UTR(genomeName, method, tpSeqSample)`` is called first; its
    result is looked up by gene ID with ``.get(geneID, -1)`` and the value
    converted with ``int``. A value greater than 0 is taken as the new end
    coordinate for that gene:

    * '+' strand: the transcript end and the last exon end are set to it.
    * any other strand: the transcript start and the first exon start are
      set to it.

    Columns are used positionally: 0 name, 1 gene ID, 2 chromosome,
    3 strand, 4-5 transcript start/end, 6-7 coding start/end, 9 and 10 the
    comma-terminated exon start / exon end lists. A line with exactly 10
    columns has its name duplicated into the gene ID slot. An empty name is
    replaced by a running number, and for the 'sangerGene',
    'sangerPseudogene' and 'sangerRna' tables the gene ID is used as the
    name. Exon ends are shifted by -1 before any replacement.

    Records whose coding start and (clamped) coding end are both 0 are
    skipped. When several records share a gene ID, the one with the longest
    transcript span is kept (ties keep the earlier record). Blank lines and
    lines starting with '#' are ignored.

    Two summary lines are printed: how many records had their boundary
    moved by at least 1 and by at least 100 positions.

    Args:
        genomeName: Key selecting the annotation file.
        method: Passed through to ``get_new3UTR``.
        tpSeqSample: Passed through to ``get_new3UTR``.
        test: Accepted for interface compatibility; this function does not
            call it.

    Returns:
        dict mapping gene ID to a ``utilityModule.Gene``.

    Raises:
        KeyError: if ``genomeName`` is not a known key.
        IOError/OSError: if the file cannot be opened.
        IndexError, ValueError: if a data line has too few columns, a
            non-integer coordinate, or an empty exon list that needs
            updating.
    """
    newRefFlat = get_new3UTR(genomeName, method, tpSeqSample)

    genomeToFile = {
        'ce6': '/lab/bartel3_ata/jwnam/c.elgans_may_2008/annotation/refFlat.txt',
        'danRer6': '/lab/bartel3_ata/jwnam/zebrafish/refFlat.txt',
        'danRer7': '/lab/bartel3_ata/jwnam/zebrafish/danRer7/refFlat.txt',
        'dm3': '/lab/bartel3_ata/jwnam/d.melanogaster/annotation/refFlat.txt',
        'mm9': '/lab/bartel6_ata/nam/mouse/annotation/refFlat_mm9.txt',
        'hg18_distinct': '/lab/bartel3_ata/jwnam/h.sapiens/nature_refSeq_distinct.fa',
        'hg19': 'refFlat.txt',
        'GRCh37.66': '/lab/bartel3_ata/jwnam/h.sapiens/hg19/GRCh37.66.genePred2',
        'sangerRna': '/lab/bartel3_ata/jwnam/c.elgans_may_2008/annotation/sangerRnaGene.txt',
        'sangerPseudogene': '/lab/bartel3_ata/jwnam/c.elgans_may_2008/annotation/sangerPseudoGene.txt',
        'sangerGene': '/lab/bartel3_ata/jwnam/c.elgans_may_2008/annotation/sangerGene.txt',
        'denovo_annotation': '/lab/solexa_bartel1/jwnam/RNASeq/Celegans/Combined_TophatOutput/corrected/gtfs/combined_step3.txt',
        'denovo_annotation_v1.3': '/lab/solexa_bartel1/jwnam/RNASeq/Celegans/Combined_TophatOutput/corrected/gtfs/combined_step3_v1.3.txt',
        'combined': '/lab/solexa_bartel1/jwnam/RNASeq/Celegans/Combined_TophatOutput/corrected/gtfs/newTest3P_directCombined.txt',
        'modEncode_Aggregate': '/lab/bartel1_ata/jwnam/Project/Celegans/modEncode/Robert_Aggregate/Aggregate_1003.integrated_transcripts.ucsc_browser.ws190.txt',
    }
    filename = genomeToFile[genomeName]
    nameFromGeneID = genomeName in ('sangerGene', 'sangerPseudogene',
                                    'sangerRna')

    geneList = {}
    nnum = 0  # running number used to name records with an empty name
    updatedUTRNum = 0
    updatedUTRNum_100D = 0

    # The context manager closes the file even when a malformed line raises.
    with open(filename) as handle:
        for line in handle:
            # Tolerate trailing blank lines and '#' header rows.
            if not line.strip() or line.startswith('#'):
                continue

            fields = line.split('\t')
            if len(fields) == 10:
                # No separate gene ID column: reuse the name as the ID so
                # the remaining column indices line up.
                fields.insert(1, fields[0])

            name = fields[0]
            if not name:
                nnum += 1
                name = str(nnum)
            geneID = fields[1]
            if nameFromGeneID:
                name = geneID

            newEnd = int(newRefFlat.get(geneID, -1))
            chrom = fields[2]
            sense = fields[3]
            txCoords = [int(fields[4]), int(fields[5])]
            cdStart = int(fields[6])
            # A negative coding end is clamped to 0.
            cdCoords = [cdStart, max(int(fields[7]), 0)]
            # The exon lists end with a comma, so the last split item is
            # dropped. Exon ends are shifted by one to fit the Gene specs.
            exStarts = [int(v) for v in fields[9].split(',')[:-1]]
            exEnds = [int(v) - 1 for v in fields[10].split(',')[:-1]]

            if newEnd > 0:
                # Remember the old boundary BEFORE overwriting it. Measuring
                # the shift afterwards always yields 0, which left both
                # counters permanently at zero.
                if sense == '+':
                    previousEnd = exEnds[-1]
                    txCoords = [txCoords[0], newEnd]
                    exEnds[-1] = newEnd
                else:
                    previousEnd = exStarts[0]
                    txCoords = [newEnd, txCoords[1]]
                    exStarts[0] = newEnd
                shift = abs(previousEnd - newEnd)
                if shift >= 100:
                    updatedUTRNum_100D += 1
                if shift >= 1:
                    updatedUTRNum += 1

            if cdCoords[0] == 0 and cdCoords[1] == 0:
                continue

            newGene = utilityModule.Gene(name, geneID, chrom, sense,
                                         txCoords, cdCoords, exStarts, exEnds)
            if geneID not in geneList:
                geneList[geneID] = newGene
            else:
                # Keep whichever record for this gene ID spans more bases.
                preGene = geneList[geneID]
                geneLen = preGene.txLocus().end() - preGene.txLocus().start()
                geneLen2 = txCoords[1] - txCoords[0]
                if geneLen2 > geneLen:
                    geneList[geneID] = newGene

    # A single parenthesised argument prints identically on Python 2 and 3.
    print('#Updated 3UTR Number: ' + str(updatedUTRNum))
    print('#Updated 3UTR Number(>=100): ' + str(updatedUTRNum_100D))
    return geneList