def parse(exon1,exon2):
    """Summarise exon pairs shared by two comma-separated exon label lists.

    Every label has to match ``<transcript>.exon<number>/<total>``; a label
    that does not match makes ``re.match`` return None and raises
    AttributeError.

    For each (transcript, total) key present in both lists, the exon number
    from ``exon1`` and the one from ``exon2`` form a pair.  Pairs are ordered
    by how many keys produced them (most first) and rendered as
    ``<first+1>-<second-1>``, joined with commas.  Returns '' when no key is
    shared.

    ``mybasic.addHash`` is a project helper; it is presumably "append value
    to the list stored under key" -- confirm against mybasic.
    """
    labelRe = '(.+)\.exon(.+)\/(.+)'

    # (transcript, exon total) -> [exon number from exon1, (exon number from exon2)]
    exonNumH = {}
    for label in exon1.split(','):
        m = re.match(labelRe, label)
        exonNumH[(m.group(1), int(m.group(3)))] = [int(m.group(2))]

    for label in exon2.split(','):
        m = re.match(labelRe, label)
        key = (m.group(1), int(m.group(3)))
        if key in exonNumH:
            exonNumH[key].append(int(m.group(2)))

    # (exon number 1, exon number 2) -> keys that produced exactly this pair
    pairH = {}
    for key in exonNumH:
        exonNumL = exonNumH[key]
        if len(exonNumL) == 2:
            mybasic.addHash(pairH, tuple(exonNumL), key)

    # Stable sort, best-supported pair first.
    ranked = sorted(pairH.items(), key=lambda kv: len(kv[1]), reverse=True)
    return ','.join(['%s-%s' % (first + 1, second - 1) for ((first, second), _) in ranked])
def geneInfoH(geneNameH, geneSetH, refSeqSummaryFileName='/Z/Sequence/ucsc_hg19/annot/refSeqSummary.txt', hugoFileName='/Z/Sequence/geneinfo/hugo.txt',
        censusFileName='/Z/Sequence/geneinfo/cancer_gene_census.txt', biocartaFileName='/Z/Sequence/geneinfo/BIOCARTA.gmt',
        goFileName='/Z/Sequence/geneinfo/hugo.txt', keggFileName='/Z/Sequence/geneinfo/hugo.txt'):
    """Build a gene name -> annotation dict from tab-separated annotation files.

    Args:
        geneNameH: maps a RefSeq id to a gene name; only ids present here
            contribute a 'summary'.
        geneSetH: ``{geneSetDB: {geneSetName: (geneSetDesc, geneNameL)}}``;
            each membership is stored under the ``geneSetDB`` key of the gene.
        refSeqSummaryFileName: 3 columns per line (refSeqId, status, summary).
        hugoFileName: 5 columns per line (geneName, desc, aliases,
            geneCardName, refSeqIds).
        censusFileName: columns 0, 1, 7, 8, 12 and 13 are read; the row whose
            first column is 'Symbol' is skipped as the header.
        biocartaFileName, goFileName, keggFileName: accepted for interface
            compatibility; this function does not read them.

    Returns:
        dict of dicts keyed by gene name with (when available) 'summary',
        'desc', 'aliases', 'refSeqIds', 'census_somatic', 'census_germline',
        'census_mutType', 'census_translocPartners' and one key per gene-set
        database.

    Raises:
        ValueError: if a refSeq/hugo line does not have the expected number
            of columns.
        IndexError: if a census line has fewer than 14 columns.
    """
    # Named differently from the function so the function is not shadowed.
    infoH = {}

    # Files are opened with ``with`` so the handles are closed deterministically.
    # rstrip('\n') instead of line[:-1]: a final line without a trailing newline
    # must not lose its last character.
    with open(refSeqSummaryFileName) as refSeqSummaryFile:
        for line in refSeqSummaryFile:
            (refSeqId, status, summary) = line.rstrip('\n').split('\t')
            if refSeqId in geneNameH:
                infoH.setdefault(geneNameH[refSeqId], {})['summary'] = summary

    with open(hugoFileName) as hugoFile:
        for line in hugoFile:
            (geneName, desc, aliases, geneCardName, refSeqIds) = line.rstrip('\n').split('\t')
            entry = infoH.setdefault(geneName, {})
            entry['desc'] = desc
            entry['aliases'] = aliases
            entry['refSeqIds'] = refSeqIds

    with open(censusFileName) as censusFile:
        for line in censusFile:
            tokL = line.rstrip('\n').split('\t')
            (geneName, desc, somatic, germline, mutType, translocPartners) = \
                (tokL[0], tokL[1], tokL[7], tokL[8], tokL[12], tokL[13])
            if geneName == 'Symbol':  # header row
                continue
            # The census description is only used for genes not seen before.
            entry = infoH.setdefault(geneName, {'desc': desc})
            entry['census_somatic'] = somatic
            entry['census_germline'] = germline
            entry['census_mutType'] = mutType
            entry['census_translocPartners'] = translocPartners

    for geneSetDB in geneSetH:
        for (geneSetName, (geneSetDesc, geneNameL)) in geneSetH[geneSetDB].items():
            for geneName in geneNameL:
                if geneName in infoH:
                    mybasic.addHash(infoH[geneName], geneSetDB, (geneSetName, geneSetDesc))
                else:
                    infoH[geneName] = {geneSetDB: [(geneSetName, geneSetDesc)]}

    return infoH
def main(dirPath):
    """Print, per TCGA individual, the sequencing types of the bam files in dirPath.

    File names are first filtered with the pattern
    '.*TCGA-..-....-0..*\\.bam' (original comment: "normal sample, bam").
    The second pattern extracts the individual id (``TCGA-xx-xxxx``) and a
    one-letter code (D or W); '-SD' is appended when the name contains
    'SOLiD', '-GA' when it contains 'IlluminaGA'.

    One line ``<id>\\tXSeq_<sorted unique codes joined by ','>`` is written to
    stdout per individual.  A name that passes the filter but not the second
    pattern raises AttributeError.
    """
    bamNameL = [fn for fn in os.listdir(dirPath)
                if re.match('.*TCGA-..-....-0..*\.bam', fn)]

    codeH = {}
    for fn in bamNameL:
        m = re.match('.*(TCGA-..-....)-...-..([DW]).*\.bam', fn)
        sN = m.group(1)
        code = m.group(2)
        matched = m.group(0)
        if 'SOLiD' in matched:
            code += '-SD'
        elif 'IlluminaGA' in matched:
            code += '-GA'
        mybasic.addHash(codeH, sN, code)

    for (sN, codeL) in codeH.items():
        uniqueL = sorted(set(codeL))
        sys.stdout.write('%s\tXSeq_%s\n' % (sN, ','.join(uniqueL)))
def parse(exon1, exon2):
    """Return the exon-number pairs shared between two exon label lists.

    ``exon1`` and ``exon2`` are comma-separated labels of the form
    ``<transcript>.exon<number>/<total>`` (a non-matching label raises
    AttributeError because ``re.match`` returns None).

    A (transcript, total) key found in both lists yields the pair
    (number in exon1, number in exon2).  The result lists each distinct pair
    as ``<first+1>-<second-1>``, most frequently produced pair first,
    comma-joined; it is '' when nothing is shared.

    ``mybasic.addHash`` is assumed to append the value to a list kept under
    the key -- verify against the mybasic module.
    """
    pattern = '(.+)\.exon(.+)\/(.+)'

    def split_label(label):
        # -> ((transcript, total), exon number)
        m = re.match(pattern, label)
        return (m.group(1), int(m.group(3))), int(m.group(2))

    byTranscriptH = {}
    for label in exon1.split(','):
        key, exonNum = split_label(label)
        byTranscriptH[key] = [exonNum]

    for label in exon2.split(','):
        key, exonNum = split_label(label)
        if key in byTranscriptH:
            byTranscriptH[key].append(exonNum)

    byPairH = {}
    for key in byTranscriptH:
        if len(byTranscriptH[key]) == 2:
            mybasic.addHash(byPairH, tuple(byTranscriptH[key]), key)

    # Descending by number of supporting keys; the sort is stable.
    pairL = sorted(byPairH.items(), key=lambda item: len(item[1]), reverse=True)
    rangeL = []
    for ((eS, eE), _support) in pairL:
        rangeL.append('%s-%s' % (eS + 1, eE - 1))
    return ','.join(rangeL)
def loadKgByChr(dataFileName='/Z/Sequence/ucsc_hg19/annot/knownGene.txt',h=None):
    """Load a knownGene table and group its records by chromosome.

    Args:
        dataFileName: path of the table; every line is handed to
            ``processKgLine``.
        h: optional dict to add the records to.  A new dict is created when
            omitted.  (The former ``h={}`` default was one shared dict, so
            records piled up across calls.)

    Returns:
        ``h``, with each record filed under ``r['chrom']`` by
        ``mybasic.addHash`` (presumably a list append -- confirm in mybasic).
    """
    if h is None:
        h = {}
    # ``with`` closes the file even if a line fails to parse.
    with open(dataFileName) as dataFile:
        for line in dataFile:
            r = processKgLine(line)
            mybasic.addHash(h, r['chrom'], r)
    return h
def loadLincByChr(dataFileName='/Z/Sequence/ucsc_hg19/annot/lincRNAsTranscripts.txt',h=None):
    """Load a lincRNA transcript table and group its records by chromosome.

    Args:
        dataFileName: path of the table; every line is handed to
            ``processLincLine``.
        h: optional dict to extend.  When omitted a fresh dict is used; the
            previous ``h={}`` default was shared between calls and therefore
            kept growing.

    Returns:
        ``h``, with each record stored under ``r['chrom']`` through
        ``mybasic.addHash`` (presumably a list append -- confirm in mybasic).
    """
    if h is None:
        h = {}
    # Context manager so the handle is released as soon as loading ends.
    with open(dataFileName) as dataFile:
        for line in dataFile:
            r = processLincLine(line)
            mybasic.addHash(h, r['chrom'], r)
    return h
def loadRefFlatByGene(refFlatFileName):
    """Load a refFlat file and group its records by gene name.

    Each line is parsed with ``processRefFlatLine`` and filed under
    ``r['geneName']`` via ``mybasic.addHash`` (presumably a list append --
    confirm in mybasic).

    Returns:
        dict keyed by gene name; empty for an empty file.
    """
    h = {}
    # ``with`` replaces the bare open() in the loop header so the file is
    # closed deterministically.
    with open(refFlatFileName) as refFlatFile:
        for line in refFlatFile:
            r = processRefFlatLine(line)
            mybasic.addHash(h, r['geneName'], r)
    return h
def process_bp(inGsnapFileName):
    """Print the reads of a gsnap file grouped by translocation breakpoint pair.

    For every read the first match is used.  It must be flagged
    '(transloc)' in ``r.pairRel`` and have exactly two segments, otherwise a
    bare ``Exception`` is raised.  From each segment the code reads
    element [1] (a 'start..end' range within the read), element [2] (a
    '<strand><chrom>:<start>..<end>' location) and, for the first segment,
    element [3] (text containing 'dir:<direction>').

    Reads whose direction is not 'sense' are reverse-complemented with
    ``mybasic.rc`` and their breakpoint pair is swapped.  Groups are printed
    largest first; inside a group reads are ordered by descending split
    offset and padded so the split positions line up.
    """
    result = mygsnap.gsnapFile(inGsnapFileName, False)
    #outBpFile = open(outBpFileName, 'w')

    readH = {}
    for r in result:
        match = r.matchL()[0]

        if '(transloc)' not in r.pairRel:
            raise Exception
        if len(match.segL) != 2:
            raise Exception

        firstSeg, secondSeg = match.segL[0], match.segL[1]
        direction = re.search('dir:([^,\t]*)', firstSeg[3]).group(1)
        # first segment contributes its end coordinate, second its start
        bp1 = re.match('[+-]([^:]+):[0-9]+..([0-9]+)', firstSeg[2]).groups()
        bp2 = re.match('[+-]([^:]+):([0-9]+)..[0-9]+', secondSeg[2]).groups()

        # if bp1[0] == bp2[0]:
        #     continue

        if direction == 'sense':
            seq = r.seq()
            offset = int(firstSeg[1].split('..')[1])
            key = (bp1, bp2)
        else:
            seq = mybasic.rc(r.seq(), 'DNA')
            offset = len(seq) - int(firstSeg[1].split('..')[1])
            key = (bp2, bp1)

        mybasic.addHash(readH, key, (offset, seq))

    # Largest group first (stable sort).
    groupL = sorted(readH.items(), key=lambda kv: len(kv[1]), reverse=True)

    for ((bp1, bp2), vL) in groupL:
        vL.sort(key=lambda v: v[0], reverse=True)
        maxOffset = vL[0][0]

        # Same text the statement form "print '\n', bp1, bp2, len(vL), '\n'" emits.
        print('\n%s %s %s \n' % (bp1, bp2, len(vL)))

        for (offset, seq) in vL:
            pad = ' ' * (maxOffset - offset)
            print('%s%s %s' % (pad, seq[:offset], seq[offset:]))
def process_bp(inGsnapFileName, outBpFileName):
    """Write one line per junction found in a gsnap translocation result file.

    For every read the first match is used.  Segment element [1] is read as
    a 'start..end' range within the read and element [2] as a
    '<strand><chrom>:<start>..<end>' location.  Reads on the '-' strand are
    reverse-complemented with ``mybasic.rc`` and their junction pair is
    swapped.

    Each output line has the form
    ``<chrom>:<pos-blockSize>-<pos>,<chrom>:<pos>-<pos+blockSize>,<off>:<seq>|...``
    where ``chrom`` is the junction name up to the first '_', and ``off`` is
    ``blockSize - offset + 1``.  ``blockSize`` is read from module scope.

    Args:
        inGsnapFileName: gsnap result file, read through ``mygsnap.gsnapFile``.
        outBpFileName: path of the text file to (over)write.

    Raises:
        Exception: if a read is not flagged '(transloc)', does not have
            exactly two segments, or its segments are on different strands.
    """
    result = mygsnap.gsnapFile(inGsnapFileName, False)

    # The original never closed the output file; ``with`` guarantees the
    # buffered lines are flushed and the handle released, also on error.
    with open(outBpFileName, 'w') as outBpFile:
        seqH = {}
        for r in result:
            match = r.matchL()[0]

            if '(transloc)' not in r.pairRel:
                raise Exception('not a translocation pair: %s' % (r.pairRel,))
            if len(match.segL) != 2:
                raise Exception('expected 2 segments, found %d' % len(match.segL))

            s1 = match.segL[0][2]
            s2 = match.segL[1][2]
            if s1[0] != s2[0]:
                raise Exception('segments on different strands: %s %s' % (s1, s2))
            strand = s1[0]

            # first segment contributes its end coordinate, second its start
            s1T = re.match(r'[+-]([^:]+):[0-9]+..([0-9]+)', s1).groups()
            s2T = re.match(r'[+-]([^:]+):([0-9]+)..[0-9]+', s2).groups()

            if strand == '+':
                seq = r.seq()
                offset = int(match.segL[0][1].split('..')[1])
                junction = (s1T, s2T)
            else:
                seq = mybasic.rc(r.seq(), 'DNA')
                offset = len(seq) - int(match.segL[0][1].split('..')[1])
                junction = (s2T, s1T)

            mybasic.addHash(seqH, junction, (offset, seq))

        for ((j1, j2), vL) in seqH.items():
            vL.sort(key=lambda v: v[0])  # ascending offset, stable
            vL_mod = ['%s:%s' % (blockSize - offset + 1, seq) for (offset, seq) in vL]
            chrom = j1[0].split('_')[0]
            # NOTE(review): both regions are derived from j1; j2 is never
            # used.  Confirm this is intended.
            outBpFile.write('%s:%s-%s,%s:%s-%s,%s\n' % (
                chrom, int(j1[1]) - blockSize, j1[1],
                chrom, j1[1], int(j1[1]) + blockSize,
                '|'.join(vL_mod)))
def loadRefFlatByChr(refFlatFileName='/data1/Sequence/ucsc_hg19/annot/refFlat_splice_EGFR.txt'):
    """Load a refFlat file and group its records by chromosome.

    Each line is parsed with ``mygenome.processRefFlatLine`` and filed under
    ``r['chrom']`` via ``mybasic.addHash`` (presumably a list append --
    confirm in mybasic).

    Returns:
        dict keyed by chromosome; empty for an empty file.
    """
    h = {}
    # ``with`` closes the handle deterministically instead of leaving it to
    # garbage collection.
    with open(refFlatFileName) as refFlatFile:
        for line in refFlatFile:
            r = mygenome.processRefFlatLine(line)
            mybasic.addHash(h, r['chrom'], r)
    return h
def process_bp(inGsnapFileName,outBpFileName):
    """Write one line per junction found in a gsnap translocation result file.

    For every read the first match is used.  Segment element [1] is read as
    a 'start..end' range within the read and element [2] as a
    '<strand><chrom>:<start>..<end>' location.  Reads on the '-' strand are
    reverse-complemented with ``mybasic.rc`` and their junction pair is
    swapped.

    Output line format: ``<name1>:<pos1>,<name2>:<pos2>,<off>:<seq>|...``
    with the reads of a junction ordered by descending offset.

    Args:
        inGsnapFileName: gsnap result file, read through ``mygsnap.gsnapFile``.
        outBpFileName: path of the text file to (over)write.

    Raises:
        Exception: if a read is not flagged '(transloc)', does not have
            exactly two segments, or its segments are on different strands.
    """
    result = mygsnap.gsnapFile(inGsnapFileName,False)

    # ``with`` makes sure the output is flushed and closed; the original left
    # the handle open.
    with open(outBpFileName, 'w') as outBpFile:
        seqH = {}
        for r in result:
            match = r.matchL()[0]

            if '(transloc)' not in r.pairRel:
                raise Exception('not a translocation pair: %s' % (r.pairRel,))
            if len(match.segL) != 2:
                raise Exception('expected 2 segments, found %d' % len(match.segL))

            s1 = match.segL[0][2]
            s2 = match.segL[1][2]
            if s1[0] != s2[0]:
                raise Exception('segments on different strands: %s %s' % (s1, s2))
            strand = s1[0]

            s1T = re.match(r'[+-]([^:]+):[0-9]+..([0-9]+)',s1).groups()
            s2T = re.match(r'[+-]([^:]+):([0-9]+)..[0-9]+',s2).groups()

            if strand == '+':
                seq = r.seq()
                offset = int(match.segL[0][1].split('..')[1])
                junction = (s1T, s2T)
            else:
                seq = mybasic.rc(r.seq(),'DNA')
                offset = len(seq)-int(match.segL[0][1].split('..')[1])
                junction = (s2T, s1T)

            mybasic.addHash(seqH,junction,(offset,seq))

        for ((k1,k2), v) in seqH.items():
            v.sort(key=lambda t: t[0], reverse=True)  # descending offset, stable

            # The unfinished placeholders that stood here (argument-less
            # re.match() calls feeding k1T/k2T/k*_pos/k*_seq) raised
            # TypeError on the first junction and fed nothing into the
            # output, so they were removed.
            readL = ['%s:%s' % (offset,seq) for (offset,seq) in v]
            outBpFile.write('%s,%s,%s\n' % (':'.join(k1),':'.join(k2),'|'.join(readL)))
def process_bp(inGsnapFileName):
    """Print gsnap translocation reads, aligned at their breakpoint, per breakpoint pair.

    Only the first match of a read is looked at.  A read that is not marked
    '(transloc)' in ``r.pairRel`` or whose match does not consist of exactly
    two segments raises a bare ``Exception``.

    Segment element [1] is parsed as 'start..end' within the read,
    element [2] as '<strand><chrom>:<start>..<end>', and element [3] of the
    first segment supplies 'dir:<direction>'.  For a direction other than
    'sense' the read is reverse-complemented (``mybasic.rc``) and the two
    breakpoints change places.

    Output goes to stdout: groups with the most reads first, reads inside a
    group by descending split offset, left-padded so the split columns align.
    """
    result = mygsnap.gsnapFile(inGsnapFileName,False)
    #outBpFile = open(outBpFileName, 'w')

    seqH = {}

    for r in result:

        match = r.matchL()[0]

        if '(transloc)' not in r.pairRel:
            raise Exception

        if len(match.segL) != 2:
            raise Exception

        seg1 = match.segL[0]
        seg2 = match.segL[1]

        direction = re.search('dir:([^,\t]*)', seg1[3]).group(1)

        bp1 = re.match('[+-]([^:]+):[0-9]+..([0-9]+)',seg1[2]).groups()
        bp2 = re.match('[+-]([^:]+):([0-9]+)..[0-9]+',seg2[2]).groups()

        # if bp1[0] == bp2[0]:
        #     continue

        isSense = (direction == 'sense')

        if isSense:
            seq = r.seq()
        else:
            seq = mybasic.rc(r.seq(),'DNA')

        readEnd = int(seg1[1].split('..')[1])

        if isSense:
            entry = (readEnd, seq)
            bp12 = (bp1, bp2)
        else:
            entry = (len(seq)-readEnd, seq)
            bp12 = (bp2, bp1)

        mybasic.addHash(seqH,bp12,entry)

    seqL = list(seqH.items())
    seqL.sort(key=lambda item: len(item[1]), reverse=True)  # stable, biggest group first

    for ((bp1,bp2), vL) in seqL:

        vL.sort(key=lambda item: item[0], reverse=True)
        maxOffset = vL[0][0]

        # Equivalent to the statement form: print '\n',bp1,bp2,len(vL),'\n'
        print('\n%s %s %s \n' % (bp1, bp2, len(vL)))

        for (offset,seq) in vL:
            print('%s%s %s' % (' ' * (maxOffset-offset),seq[:offset],seq[offset:]))
def loadRefFlatByChr(
        refFlatFileName='/data1/Sequence/ucsc_hg19/annot/refFlat_splice_EGFR.txt'
):
    """Read a refFlat file into a dict of records keyed by chromosome.

    Lines are parsed by ``mygenome.processRefFlatLine``; each record is added
    under ``r['chrom']`` with ``mybasic.addHash`` (presumably a list append
    -- confirm in mybasic).

    Returns:
        dict keyed by chromosome; empty for an empty file.
    """
    h = {}
    # Explicit context manager: the file used to be opened inline in the
    # loop header and was never closed.
    with open(refFlatFileName) as refFlatFile:
        for line in refFlatFile:
            r = mygenome.processRefFlatLine(line)
            mybasic.addHash(h, r['chrom'], r)
    return h
def process_bp(inGsnapFileName,outBpFileName):
    """Collect translocation junctions from a gsnap file and write them out.

    The first match of every read is used.  Segment element [1] is parsed as
    'start..end' within the read, element [2] as
    '<strand><chrom>:<start>..<end>'.  '-' strand reads are
    reverse-complemented (``mybasic.rc``) and their junction pair swapped.

    One line is written per junction:
    ``<chrom>:<pos-blockSize>-<pos>,<chrom>:<pos>-<pos+blockSize>,<off>:<seq>|...``
    with ``chrom`` = junction name up to the first '_' and
    ``off`` = ``blockSize - offset + 1``.  ``blockSize`` comes from module
    scope.

    Args:
        inGsnapFileName: gsnap result file, read through ``mygsnap.gsnapFile``.
        outBpFileName: path of the text file to (over)write.

    Raises:
        Exception: if a read is not flagged '(transloc)', does not have
            exactly two segments, or its segments are on different strands.
    """
    result = mygsnap.gsnapFile(inGsnapFileName,False)

    # The output handle was previously never closed; the context manager
    # flushes and closes it on every exit path.
    with open(outBpFileName, 'w') as outBpFile:

        seqH = {}

        for r in result:

            match = r.matchL()[0]

            if '(transloc)' not in r.pairRel:
                raise Exception('not a translocation pair: %s' % (r.pairRel,))

            if len(match.segL) != 2:
                raise Exception('expected 2 segments, found %d' % len(match.segL))

            s1 = match.segL[0][2]
            s2 = match.segL[1][2]

            if s1[0] != s2[0]:
                raise Exception('segments on different strands: %s %s' % (s1, s2))

            strand = s1[0]

            s1T = re.match(r'[+-]([^:]+):[0-9]+..([0-9]+)',s1).groups()
            s2T = re.match(r'[+-]([^:]+):([0-9]+)..[0-9]+',s2).groups()

            readEnd = int(match.segL[0][1].split('..')[1])

            if strand == '+':
                seq = r.seq()
                offset = readEnd
                junction = (s1T, s2T)
            else:
                seq = mybasic.rc(r.seq(),'DNA')
                offset = len(seq)-readEnd
                junction = (s2T, s1T)

            mybasic.addHash(seqH,junction,(offset,seq))

        for ((j1,j2), vL) in seqH.items():

            vL.sort(key=lambda v: v[0])  # ascending offset, stable

            vL_mod = []
            for (offset,seq) in vL:
                vL_mod.append('%s:%s' % (blockSize-offset+1,seq))

            chrom = j1[0].split('_')[0]
            pos = j1[1]

            # NOTE(review): j2 is unused -- both regions come from j1.
            # Confirm that is the intended output.
            outBpFile.write('%s:%s-%s,%s:%s-%s,%s\n' % (
                chrom,int(pos)-blockSize,pos,
                chrom,pos,int(pos)+blockSize,
                '|'.join(vL_mod)))
def loadRefFlatByChr(refFlatFileName='/Z/Sequence/ucsc_hg19/annot/refFlat.txt'):
    """Load a refFlat file and group its records by chromosome.

    Each line is parsed with ``processRefFlatLine`` and filed under
    ``r['chrom']`` via ``mybasic.addHash`` (presumably a list append --
    confirm in mybasic).  The result always has a 'chrM' key; it maps to an
    empty list when the file contains no chrM record.

    Returns:
        dict keyed by chromosome.
    """
    h = {}
    # ``with`` closes the file deterministically; it used to be left open.
    with open(refFlatFileName) as refFlatFile:
        for line in refFlatFile:
            r = processRefFlatLine(line)
            mybasic.addHash(h, r['chrom'], r)

    # Guarantee the key so callers can index h['chrM'] unconditionally.
    if 'chrM' not in h:
        h['chrM'] = []

    return h
def loadAnnot(geneL=[]):
    """Index exon boundary positions of the refFlat annotation per chromosome.

    The annotation comes from ``mygenome.loadRefFlatByChr()``.  For every
    exon of every transcript one position is taken: the second element of
    the exon on '+' strand transcripts, the first element on others.  Exons
    are numbered in transcript direction (1-based).

    Args:
        geneL: when not equal to ``[]``, only transcripts whose 'geneName'
            is in it are used.  The default list is never modified.

    Returns:
        (eiH, ei_keyH, juncInfoH, ei_cntH), all keyed by chromosome:
          eiH[chrom]       -- {pos: 0}
          ei_keyH[chrom]   -- sorted list of those positions
          juncInfoH[chrom] -- {pos: labels '<strand><gene>:<refSeqId>:<exon>/<nExons>'}
                              filled through ``mybasic.addHash``
          ei_cntH[chrom]   -- {pos: 1-based rank of pos among sorted positions}
    """
    refFlatH = mygenome.loadRefFlatByChr()

    eiH, ei_keyH, juncInfoH = {}, {}, {}

    for chrom in refFlatH.keys():

        posH = eiH[chrom] = {}
        labelH = juncInfoH[chrom] = {}

        for tH in refFlatH[chrom]:

            if geneL!=[] and tH['geneName'] not in geneL:
                continue

            exnList = tH['exnList']
            nExon = len(exnList)

            for (i, exn) in enumerate(exnList):

                if tH['strand'] == '+':
                    pos, e_num = exn[1], i+1
                else:
                    pos, e_num = exn[0], nExon-i

                label = '%s%s:%s:%s/%s' % (tH['strand'], tH['geneName'], tH['refSeqId'], e_num, nExon)
                mybasic.addHash(labelH, pos, label)
                posH[pos] = 0

        ei_keyH[chrom] = sorted(posH.keys())

    ei_cntH = {}

    for chrom in juncInfoH.keys():
        rankH = ei_cntH[chrom] = {}
        for (rank, pos) in enumerate(sorted(juncInfoH[chrom].keys()), 1):
            rankH[pos] = rank

    return eiH,ei_keyH,juncInfoH,ei_cntH
def parse(loc, juncInfo):
    """Group the junction labels of one location by strand-aware position and gene.

    Args:
        loc: '<chrom>:<pos>'.
        juncInfo: comma-separated labels
            '<strand><gene>:<transcript>:<exonIdx>/<exonTot>'.

    Returns:
        list of tuples (locParsed, gene, alias, isLastExon, labels) where
          locParsed  -- '<strand><chrom>:<pos>' on '+', pos+1 on other strands
          alias      -- '<exonIdx>/<exonTot>' of the label with the largest
                        exonTot (the first one wins on ties)
          isLastExon -- True only if exonIdx == exonTot for every label
          labels     -- the group's labels joined with ','

    A label or ``loc`` that does not match its pattern raises AttributeError.
    ``mybasic.addHash`` is assumed to append to a list under the key.
    """
    juncRe = '([+-])([^:]+):[^:]+:(.*)\/(.*)'

    chrom, pos = re.match('([^:]+):([^:]+)', loc).groups()

    groupH = {}
    for junc in juncInfo.split(','):
        strand, geneN, exonIdx, exonTot = re.match(juncRe, junc).groups()
        if strand == '+':
            shownPos = pos
        else:
            shownPos = int(pos) + 1
        mybasic.addHash(groupH, ('%s%s:%s' % (strand, chrom, shownPos), geneN), junc)

    parseL = []
    for ((locParsed, geneN), juncL) in groupH.items():
        maxTrans = 0
        isLastExon = True
        for junc in juncL:
            exonIdx, exonTot = re.match(juncRe, junc).groups()[2:]
            isLastExon = isLastExon and exonIdx == exonTot
            total = int(exonTot)
            if total > maxTrans:
                alias = '%s/%s' % (exonIdx, exonTot)
                maxTrans = total
        parseL.append((locParsed, geneN, alias, isLastExon, ','.join(juncL)))

    return parseL
def parse(loc,juncInfo):
    """Summarise the junction labels attached to a genomic location.

    ``loc`` is '<chrom>:<pos>'; ``juncInfo`` holds comma-separated labels of
    the form '<strand><gene>:<transcript>:<exonIdx>/<exonTot>'.  Labels are
    grouped by (strand-prefixed location, gene); on strands other than '+'
    the reported position is pos+1.

    Returns a list with one tuple per group:
    (location, gene, alias, isLastExon, comma-joined labels).  ``alias`` is
    the '<exonIdx>/<exonTot>' of the label with the highest exonTot (first
    one on ties); ``isLastExon`` is True when every label of the group has
    exonIdx == exonTot.

    Text that does not match the expected patterns raises AttributeError.
    ``mybasic.addHash`` is assumed to append to a list under the key.
    """
    def fields(junc):
        # -> (strand, gene, exonIdx, exonTot)
        return re.match('([+-])([^:]+):[^:]+:(.*)\/(.*)',junc).groups()

    rm = re.match('([^:]+):([^:]+)',loc)
    chrom = rm.group(1)
    pos = rm.group(2)

    h = {}

    for junc in juncInfo.split(','):

        (strand,geneN,exonIdx,exonTot) = fields(junc)

        if strand != '+':
            locParsed = '%s%s:%s' % (strand,chrom,int(pos)+1)
        else:
            locParsed = '%s%s:%s' % (strand,chrom,pos)

        mybasic.addHash(h,(locParsed,geneN),junc)

    parseL = []

    for (key,juncL) in h.items():

        (locParsed,geneN) = key
        maxTrans = 0
        isLastExon = True

        for junc in juncL:

            (strand,geneN,exonIdx,exonTot) = fields(junc)

            if exonIdx != exonTot:
                isLastExon = False

            if maxTrans < int(exonTot):
                alias = '%s/%s' % (exonIdx,exonTot)
                maxTrans = int(exonTot)

        parseL.append((locParsed,geneN,alias,isLastExon,','.join(juncL)))

    return parseL
def loadAnnot(geneL=[]):
    """Index exon boundary positions per chromosome and mirror them into temp_table.

    The annotation comes from ``mygenome.loadRefFlatByChr()``.  For every
    exon of every transcript one position is taken: the second element of
    the exon on '+' strand transcripts, the first element otherwise.  Exons
    are numbered in transcript direction (1-based).  Each (chrom, pos) is
    also written with a 'replace into temp_table' statement through
    ``cursor``, which is taken from module scope.

    Args:
        geneL: when not equal to ``[]``, only transcripts whose 'geneName'
            is in it are used.  The default list is never modified.

    Returns:
        (eiH, ei_keyH, juncInfoH), all keyed by chromosome:
          eiH[chrom]       -- {pos: 0}
          ei_keyH[chrom]   -- sorted list of those positions
          juncInfoH[chrom] -- {pos: labels '<strand><gene>:<refSeqId>:<exon>/<nExons>'}
                              filled through ``mybasic.addHash``
    """
    refFlatH = mygenome.loadRefFlatByChr()

    eiH, ei_keyH, juncInfoH = {}, {}, {}

    for (chrom, refFlatL) in refFlatH.items():

        posH = {}
        labelH = {}
        eiH[chrom] = posH
        juncInfoH[chrom] = labelH

        for tH in refFlatL:

            if geneL!=[] and tH['geneName'] not in geneL:
                continue

            nExon = len(tH['exnList'])
            onPlus = (tH['strand'] == '+')

            for (i, exn) in enumerate(tH['exnList']):

                if onPlus:
                    pos = exn[1]
                    e_num = i+1
                else:
                    pos = exn[0]
                    e_num = nExon-i

                mybasic.addHash(labelH, pos, '%s%s:%s:%s/%s' % (tH['strand'], tH['geneName'], tH['refSeqId'], e_num, nExon))
                posH[pos] = 0

                cursor.execute('replace into temp_table (chrom,pos) values ("%s",%s)' % (chrom,pos))

        ei_keyH[chrom] = sorted(posH.keys())

    return eiH,ei_keyH,juncInfoH
def main(dirPath):
    """List the sequencing types available per TCGA individual in a directory.

    Only file names matching '.*TCGA-..-....-0..*\\.bam' are considered
    (original comment: "normal sample, bam").  From each name the individual
    id ``TCGA-xx-xxxx`` and a one-letter code (D or W) are extracted; the
    code gets the suffix '-SD' if the name contains 'SOLiD', else '-GA' if
    it contains 'IlluminaGA'.

    Writes ``<id>\\tXSeq_<codes>`` to stdout for every individual, where
    <codes> are the distinct codes, sorted and comma-joined.  A name passing
    the first pattern but not the second raises AttributeError.
    """
    # normal sample, bam
    candidateL = []
    for fileName in os.listdir(dirPath):
        if re.match('.*TCGA-..-....-0..*\.bam',fileName):
            candidateL.append(fileName)

    h = {}

    for fileName in candidateL:

        rm = re.match('.*(TCGA-..-....)-...-..([DW]).*\.bam',fileName)

        sN = rm.group(1)
        seqCode = rm.group(2)

        if 'SOLiD' in rm.group(0):
            seqCode = seqCode + '-SD'
        elif 'IlluminaGA' in rm.group(0):
            seqCode = seqCode + '-GA'

        mybasic.addHash(h,sN,seqCode)

    for sN in h:
        codeL = list(set(h[sN]))
        codeL.sort()
        sys.stdout.write('%s\tXSeq_%s\n' % (sN,','.join(codeL)))
def exonSkip_proc_annot(inReportFileName, outReportFileName, inCnaGctFileName=None):
    """Annotate an exon-skipping report with frame, gene, census and gene-set columns.

    Each input line has 8 tab-separated columns
    (sampN, bp1, bp2, t1, t2, nmatch, nseq, nreg).  ``t1`` and ``t2`` are
    comma-separated '<transcript>.exon<N>/<total>' labels.  The 16-column
    output line is: sampN, bp1, bp2, t1, t2, frame info, gene names,
    copy-number info, gene descriptions, census info, GO, KEGG, BioCarta,
    nmatch, nseq, nreg.

    Args:
        inReportFileName: report to read.
        outReportFileName: annotated report to (over)write.
        inCnaGctFileName: optional file for ``mygenome.tcgaCnaDB``; when
            given, the sample name must contain 'TCGA-dd-dddd' and a
            '<gene>:<value>' copy-number entry is added per gene.

    Raises:
        ValueError: if an input line does not have exactly 8 columns.
        AttributeError: if a transcript label (or, with CNA data, the sample
            name) does not match the expected pattern.
    """
    geneDB = mygenome.getGeneDB()
    frameInfoH = mygenome.getFrameInfoH()

    if inCnaGctFileName:
        cnaDB = mygenome.tcgaCnaDB(inCnaGctFileName)
    else:
        cnaDB = None

    # Both handles were previously left open; nested ``with`` blocks flush and
    # close them even when a malformed line raises.
    with open(outReportFileName, 'w') as outReportFile:
        with open(inReportFileName) as inReportFile:
            for line in inReportFile:
                # rstrip('\n') rather than line[:-1] so a last line without a
                # newline keeps its final character.
                (sampN, bp1, bp2, t1, t2, nmatch, nseq, nreg) = line.rstrip('\n').split('\t')

                if inCnaGctFileName:
                    indivId = re.match('.*(TCGA-[0-9]{2}-[0-9]{4}).*', sampN).group(1)

                geneS = set()
                geneH = {}  # transcript id -> exon numbers seen in t1 then t2

                for tL in (t1, t2):
                    for label in tL.split(','):
                        ro = re.match(r'(.*)\.exon([0-9]*)/[0-9]*', label)
                        transId = ro.group(1)
                        mybasic.addHash(geneH, transId, int(ro.group(2)))

                        g = mygenome.gene(transId, geneDB=geneDB)
                        if g.geneName:
                            geneS.add(g.geneName)

                frameL = []
                for transId in geneH:
                    exnList = geneH[transId]
                    if len(exnList) != 2:
                        continue
                    #exnList.sort()
                    cons = mygenome.frameCons(transId, exnList[0], transId, exnList[1], frameInfoH)
                    if cons:
                        frameL.append('%s:%s' % (transId, cons))

                cnaInfo = []
                geneInfo = []
                censusInfo = []
                goInfoS = set()
                keggInfoS = set()
                biocInfoS = set()

                for geneName in geneS:
                    gene = mygenome.gene(geneName, geneDB=geneDB)

                    if cnaDB:
                        cnaInfo.append('%s:%s' % (geneName, cnaDB.query(indivId, geneName)))

                    geneInfo.append('%s:%s:%s' % (geneName, gene.getAttr('desc'), gene.getAttr('summary')))
                    censusInfo.append('%s:%s:%s:%s' % (
                        gene.getAttr('census_somatic'), gene.getAttr('census_germline'),
                        gene.getAttr('census_mutType'), gene.getAttr('census_translocPartners')))

                    goInfoS.update(gene.getAttr('go'))
                    keggInfoS.update(gene.getAttr('kegg'))
                    biocInfoS.update(gene.getAttr('bioc'))

                outReportFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
                    sampN, bp1, bp2, t1, t2, ','.join(frameL), ';'.join(geneS),
                    ','.join(cnaInfo), ';'.join(geneInfo), ';'.join(censusInfo),
                    ';'.join(map(str, goInfoS)), ';'.join(map(str, keggInfoS)), ';'.join(map(str, biocInfoS)),
                    nmatch, nseq, nreg))
def main(inDrugFileName,outDirName,outFileName,geneL='wg',cutoff=0.05, plottype='AUC', plot='FALSE', outPlotDirName='/home/heejin/DrugScreening/figure',seqType='WTS'):
    """Test drug response against in-frame fusion status, gene by gene.

    Steps:
      1. Read the comma-separated drug table (first row: drug names, first
         column: screening sample ids).
      2. Resolve the gene list: 'wg' = all gene names in refFlat_hg19,
         'cs' = gene symbols in cs_gene (both in DB 'common'); any other
         value is used as the list itself.
      3. Write the result header to ``outFileName``.
      4. For every gene with rows in splice_fusion_AF (DB 'ircr1', nPos>2,
         frame like '%:Y%'), write ``<outDirName>/temp4test.txt`` with one
         row per drug and sample, then run drugRespByFus.R on it.  Fusions
         with allele fraction below ``cutoff`` (or without a numeric
         fraction) are ignored.

    Args:
        inDrugFileName: drug response table.
        outDirName: directory for the temporary table.
        outFileName: result file; the header is written here and the path is
            passed to the R script.
        geneL: 'wg', 'cs' or an explicit iterable of gene names.
        cutoff: minimum fusion allele fraction.
        plottype, plot, outPlotDirName: passed through to the R script.
        seqType: 'WES' selects the module-level ``WESidH`` sample-id map,
            anything else ``WTSidH``.
    """
    if seqType == 'WES':
        idH = WESidH
    else:
        idH = WTSidH

    # drug -> {screening sample id -> response value (string)}
    drugH = {}
    with open(inDrugFileName) as inFile:
        # [:-2] removes the last two characters of each line, as before
        # (presumably a '\r\n' line ending -- confirm against the input).
        drugL = inFile.readline()[:-2].split(',')[1:]
        for drug in drugL:
            drugH[drug] = {}
        for line in inFile:
            dataL = line[:-2].split(',')
            sId = dataL[0]
            for (i, drug) in enumerate(drugL):
                drugH[drug][sId] = dataL[i+1]

    con,cursor = mymysql.connectDB(db='common')

    if geneL == 'wg':
        cursor.execute('SELECT distinct geneName FROM refFlat_hg19')
        geneL = [x for (x,) in cursor.fetchall()]
    elif geneL == 'cs':
        cursor.execute('SELECT distinct gene_sym FROM cs_gene')
        geneL = [x for (x,) in cursor.fetchall()]
    # otherwise geneL already is the list of genes to test

    # fusion
    with open(outFileName, 'w') as outFile:
        outFile.write('Drug\tGene\tp_twosided\tp_greater\tp_less\tD\tp_twosided2\tD2\twtN\tmutN\twt_sampN\tmut_sampN\twilcox_p\tttest_p\tmed_z.score\tmean_z.score\tAltInfo\n')

    con,cursor = mymysql.connectDB(db='ircr1')

    cursor.execute('select distinct samp_id from rpkm_gene_expr')
    # Sets: membership is tested once per (gene, drug, sample).
    procSampS = set([x for (x,) in cursor.fetchall()])
    if drugH:
        scrIdS = set(drugH[next(iter(drugH))])
    else:
        scrIdS = set()

    tempFileName = '%s/temp4test.txt' % outDirName

    for gN in geneL:

        # if gN != 'MET':
        #     continue

        # ``with`` closes the temp file on every path; the early ``continue``
        # below used to leak the handle.
        with open(tempFileName, 'w') as tempFile:

            tempFile.write('Drug\tGene\tds_id\tdb_id\tAUC\tAlt\n')

            # NOTE(review): gN is interpolated directly into the SQL text;
            # switch to driver-side parameters if gene names can come from
            # untrusted input.
            cursor.execute('SELECT samp_id,gene_sym1,gene_sym2,nReads/(nReads+nReads_w1) as maf FROM splice_fusion_AF '
                'where (gene_sym1 ="%s" or gene_sym2="%s") and nPos>2 and frame like "%s:Y%s" ' % (gN,gN,'%','%'))
            result = cursor.fetchall()

            if len(result) == 0:
                continue

            # db sample id -> [(fusion name, allele fraction), ...]
            mutH = {}
            for (dbId, gs, gs2, maf) in result:
                try:
                    belowCutoff = float(maf) < cutoff
                except (TypeError, ValueError):
                    # missing or non-numeric allele fraction: skip the row
                    continue
                if belowCutoff:
                    continue
                #mutH[dbId] = (type,maf)
                mybasic.addHash(mutH,dbId,('%s_%s' % (gs,gs2),maf))

            for drug in drugH:
                for scrId in idH:
                    if scrId not in scrIdS:
                        continue
                    dbId = idH[scrId]
                    if dbId not in procSampS:
                        continue
                    if dbId in mutH:
                        alt = '/'.join(map(str,mutH[dbId]))
                    else:
                        alt = 'NA'
                    tempFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (drug, gN, scrId, dbId, drugH[drug][scrId], alt))

        # The temp file is closed (flushed) before the R script reads it.
        os.system('Rscript ~/JK1/NSL/HTS/drugRespByFus.R %s %s %s %s %s' % (tempFileName, outFileName,plot,outPlotDirName,plottype))
def main(inFileDir,outFileName):
    """Collapse TCGA methylation files into per-sample, per-gene mean values.

    Two file families in ``inFileDir`` are processed:
      * '*HumanMethylation*' (platform 'Infinium450k' when the name contains
        'HumanMethylation450', else 'Infinium27k'), visited in descending
        name order.  The sample id is the 28 characters starting at 'TCGA-'
        in the file name; two header lines are skipped; column 3 is the gene
        field and column 2 the value ('NA' rows are skipped).
      * '*OMA002*' (platform 'GoldenGate3k').  The sample id is the second
        tab-separated field of the first line; a second header line is
        skipped; rows are '<gene>_<loc>\\t<value>' ('N/A' rows are skipped).

    Only the first file seen per sample id is used.  For each gene field the
    mean of its values is written once per ';'-separated gene name as
    ``platform, sId, pId, TN, gene, mean`` (tab-separated, mean with two
    decimals).  ``pId`` is ``sId[:12]``; ``TN`` is 'T' when
    ``int(sId[13:15]) < 10`` and 'N' otherwise.

    Args:
        inFileDir: directory holding the input files.
        outFileName: table to (over)write.
    """
    registry = set()  # sample ids already written

    # All files are opened with ``with`` so they are closed deterministically.
    with open(outFileName,'w') as outFile:

        inFileNameL = glob.glob('%s/*HumanMethylation*' % (inFileDir,))
        inFileNameL.sort(reverse=True)

        for inFileName in inFileNameL:

            start = inFileName.index('TCGA-')
            sId = inFileName[start:start+28]

            if sId in registry:
                continue
            registry.add(sId)

            pId = sId[:12]
            if int(sId[13:15])<10:
                TN = 'T'
            else:
                TN = 'N'

            if 'HumanMethylation450' in inFileName:
                platform = 'Infinium450k'
            else:
                platform = 'Infinium27k'

            print('%s %s' % (sId, platform))

            geneH = {}
            with open(inFileName) as inFile:
                # two header lines
                inFile.readline()
                inFile.readline()
                for line in inFile:
                    tokL = line.rstrip().split('\t')
                    geneN = tokL[3]
                    value = tokL[2]
                    if not geneN or value=='NA':
                        continue
                    mybasic.addHash(geneH,geneN,float(value))

            for (geneN,valueL) in geneH.items():
                v = numpy.mean(valueL)
                for g in geneN.split(';'):
                    outFile.write('%s\t%s\t%s\t%s\t%s\t%.2f\n' % (platform,sId,pId,TN,g,v))

        inFileNameL = glob.glob('%s/*OMA002*' % (inFileDir,))
        platform = 'GoldenGate3k'

        for inFileName in inFileNameL:

            with open(inFileName) as inFile:

                sId = inFile.readline().rstrip().split('\t')[1]

                if sId in registry:
                    continue
                registry.add(sId)

                pId = sId[:12]
                if int(sId[13:15])<10:
                    TN = 'T'
                else:
                    TN = 'N'

                inFile.readline()  # second header line

                print('%s %s' % (sId, platform))

                geneH = {}
                for line in inFile:
                    name,value = line.rstrip().split('\t')
                    geneN = name[:name.find('_')]
                    if not geneN or value=='N/A':
                        continue
                    mybasic.addHash(geneH,geneN,float(value))

            for (geneN,valueL) in geneH.items():
                # Bug fix: the mean was never computed in this section, so a
                # stale ``v`` from the Infinium section was written (or a
                # NameError was raised when there was none).
                v = numpy.mean(valueL)
                for g in geneN.split(';'):
                    outFile.write('%s\t%s\t%s\t%s\t%s\t%.2f\n' % (platform,sId,pId,TN,g,v))
def exonSkip_filter(inFileName, outFileName):
    ''' filters-in exon-skipping candidates in splice-mapped gsnap

    A read is considered when it maps to exactly one locus and its first
    match has exactly two segments.  It is dropped when any segment examined
    has more than 2 unmatched positions (span - numMatch), less than 90
    percent match, or a span below 5.  Segment labels
    ('<transcript>.exon<N>/<total>', '|'-separated) are collected per
    transcript; collection stops at the first segment with an empty label.

    The raw text of the read is written when at least one transcript has
    more than one exon number and the smallest exon-number difference among
    transcripts with exactly two exon numbers equals 1.

    outFileName ending in '.gz' is written through gzip.  Prints
    'Results: <written> <considered>' at the end.
    '''
    result = mygsnap.gsnapFile(inFileName, False)

    if outFileName[-3:] == '.gz':
        outFile = gzip.open(outFileName, 'wb')
    else:
        outFile = open(outFileName, 'w')

    count_all = 0
    count_include = 0

    # try/finally: the output was never closed before, which can leave a
    # gzip stream without its trailer and plain files unflushed.
    try:
        for r in result:

            if r.nLoci != 1:
                continue

            match = r.matchL()[0]

            if len(match.segL) != 2:
                continue

            jncH = {}  # transcript id -> exon numbers, in segment order
            skip = False

            for segObj in match.getSegInfo():

                if segObj.span - segObj.numMatch > 2 or segObj.percMatch < 90 or segObj.span < 5:
                    skip = True
                    break

                if segObj.label == '':
                    break

                for b in segObj.label.split('|'):
                    rm2 = re.match(r'(.*)\.exon([0-9]+)\/[0-9]+', b)
                    mybasic.addHash(jncH, rm2.group(1), int(rm2.group(2)))

            if skip:
                continue

            exonNumLL = list(jncH.values())

            if len(exonNumLL) > 0 and max([len(exonNumL) for exonNumL in exonNumLL]) > 1:

                minDist = 100
                for exonNumL in exonNumLL:
                    if len(exonNumL) == 2:
                        minDist = min(minDist, abs(exonNumL[0] - exonNumL[1]))

                if minDist == 1:  # only difference
                    outFile.write(r.rawText() + '\n')
                    count_include += 1

            count_all += 1
    finally:
        outFile.close()

    print('Results: %s %s' % (count_include, count_all))
def exonSkip_proc_annot(inReportFileName,outReportFileName,inCnaGctFileName=None):
    """Add frame, gene, copy-number, census and gene-set columns to an exon-skipping report.

    Input: 8 tab-separated columns per line
    (sampN, bp1, bp2, t1, t2, nmatch, nseq, nreg), where ``t1``/``t2`` hold
    comma-separated '<transcript>.exon<N>/<total>' labels.

    Output: 16 tab-separated columns per line -- sampN, bp1, bp2, t1, t2,
    '<transcript>:<frame>' entries, gene names, '<gene>:<cna>' entries,
    '<gene>:<desc>:<summary>' entries, census entries, GO, KEGG, BioCarta,
    nmatch, nseq, nreg.

    Args:
        inReportFileName: report to read.
        outReportFileName: annotated report to (over)write.
        inCnaGctFileName: optional input for ``mygenome.tcgaCnaDB``.  When
            given, the individual id 'TCGA-dd-dddd' is parsed from the
            sample name and used to query copy number per gene.

    Raises:
        ValueError: if an input line does not have exactly 8 columns.
        AttributeError: if a label (or, with CNA data, the sample name) does
            not match its pattern.
    """
    geneDB = mygenome.getGeneDB()
    frameInfoH = mygenome.getFrameInfoH()

    cnaDB = None
    if inCnaGctFileName:
        cnaDB = mygenome.tcgaCnaDB(inCnaGctFileName)

    outFormat = '\t'.join(['%s'] * 16) + '\n'

    # The report files used to stay open after the function returned; the
    # context managers flush and close them on every exit path.
    with open(outReportFileName,'w') as outReportFile:
        with open(inReportFileName) as inReportFile:

            for line in inReportFile:

                # rstrip('\n') keeps the last character of a final line that
                # has no trailing newline (line[:-1] would drop it).
                (sampN,bp1,bp2,t1,t2,nmatch,nseq,nreg) = line.rstrip('\n').split('\t')

                if inCnaGctFileName:
                    indivId = re.match('.*(TCGA-[0-9]{2}-[0-9]{4}).*',sampN).group(1)

                geneS = set()
                geneH = {}  # transcript id -> exon numbers (t1 first, then t2)

                for t in t1.split(',') + t2.split(','):

                    ro = re.match(r'(.*)\.exon([0-9]*)/[0-9]*',t)
                    transId = ro.group(1)
                    e = int(ro.group(2))

                    mybasic.addHash(geneH,transId,e)

                    g = mygenome.gene(transId,geneDB=geneDB)
                    if g.geneName:
                        geneS.add(g.geneName)

                frameL = []

                for transId in geneH:

                    exnList = geneH[transId]

                    if len(exnList) != 2:
                        continue

                    #exnList.sort()
                    cons = mygenome.frameCons(transId,exnList[0], transId,exnList[1],frameInfoH)

                    if cons:
                        frameL.append('%s:%s' % (transId,cons))

                cnaInfo = []
                geneInfo = []
                censusInfo = []

                goInfoS = set()
                keggInfoS = set()
                biocInfoS = set()

                for geneName in geneS:

                    gene = mygenome.gene(geneName,geneDB=geneDB)

                    if cnaDB:
                        cnaInfo.append('%s:%s' % (geneName,cnaDB.query(indivId,geneName)))

                    geneInfo.append('%s:%s:%s' % (geneName,gene.getAttr('desc'),gene.getAttr('summary')))
                    censusInfo.append('%s:%s:%s:%s' % (gene.getAttr('census_somatic'),gene.getAttr('census_germline'),gene.getAttr('census_mutType'),gene.getAttr('census_translocPartners')))

                    goInfoS.update(gene.getAttr('go'))
                    keggInfoS.update(gene.getAttr('kegg'))
                    biocInfoS.update(gene.getAttr('bioc'))

                outReportFile.write(outFormat % (
                    sampN, bp1,bp2, t1,t2, ','.join(frameL), ';'.join(geneS),
                    ','.join(cnaInfo),';'.join(geneInfo),';'.join(censusInfo),
                    ';'.join(map(str,goInfoS)), ';'.join(map(str,keggInfoS)),';'.join(map(str,biocInfoS)),
                    nmatch,nseq,nreg))
def exonSkip_filter(inFileName,outFileName):
    ''' filters-in exon-skipping candidates in splice-mapped gsnap

    A read is considered when it maps to exactly one locus and its first
    match has exactly two segments.  It is dropped when any segment examined
    has more than 2 unmatched positions (span - numMatch), less than 90
    percent match, or a span below 5.  Segment labels
    ('<transcript>.exon<N>/<total>', '|'-separated) are collected per
    transcript; collection stops at the first segment with an empty label.

    The raw text of the read is written when at least one transcript has
    more than one exon number and the smallest exon-number difference among
    transcripts with exactly two exon numbers is greater than 1 (the value
    stays at 100 when no transcript has exactly two).

    Prints 'Results: <written> <considered>' at the end.
    '''
    result = mygsnap.gsnapFile(inFileName, False)

    count_all = 0
    count_include = 0

    # ``with`` flushes and closes the output; it was never closed before.
    with open(outFileName, 'w') as outFile:

        for r in result:

            if r.nLoci != 1:
                continue

            match = r.matchL()[0]

            if len(match.segL) != 2:
                continue

            segObjL = match.getSegInfo()

            jncH = {}  # transcript id -> exon numbers, in segment order
            skip = False

            for segObj in segObjL:

                if segObj.span - segObj.numMatch > 2 or segObj.percMatch < 90 or segObj.span < 5:
                    skip = True
                    break

                if segObj.label == '':
                    break

                for b in segObj.label.split('|'):
                    rm2 = re.match(r'(.*)\.exon([0-9]+)\/[0-9]+',b)
                    transId = rm2.group(1)
                    exonNum = int(rm2.group(2))
                    mybasic.addHash(jncH,transId,exonNum)

            if skip:
                continue

            exonNumLL = list(jncH.values())

            if len(exonNumLL)>0 and max([len(exonNumL) for exonNumL in exonNumLL])>1:

                minDist = 100

                for exonNumL in exonNumLL:
                    if len(exonNumL) == 2 and abs(exonNumL[0]-exonNumL[1]) < minDist:
                        minDist = abs(exonNumL[0]-exonNumL[1])

                if minDist > 1:
                    outFile.write(r.rawText()+'\n')
                    count_include += 1

            count_all += 1

    print('Results: %s %s' % (count_include, count_all))
def main(inFileDir, outFileName):
    """Write per-sample, per-gene mean methylation values from TCGA files.

    Inputs found in ``inFileDir``:
      * '*HumanMethylation*' files, visited in descending name order.
        Platform is 'Infinium450k' if the name contains
        'HumanMethylation450', otherwise 'Infinium27k'.  The sample id is
        the 28 characters starting at 'TCGA-' in the file name.  Two header
        lines are skipped; per row, column 3 is the gene field and column 2
        the value; rows with an empty gene field or value 'NA' are skipped.
      * '*OMA002*' files, platform 'GoldenGate3k'.  The sample id is the
        second tab-separated field of the first line; one more header line
        is skipped; rows are '<gene>_<loc>\\t<value>'; rows with an empty
        gene or value 'N/A' are skipped.

    A sample id is processed only the first time it is seen.  Output rows
    are ``platform, sId, pId, TN, gene, mean`` (tab-separated, mean with two
    decimals), one per ';'-separated name in the gene field, where
    ``pId = sId[:12]`` and ``TN`` is 'T' if ``int(sId[13:15]) < 10`` else 'N'.

    Args:
        inFileDir: directory holding the input files.
        outFileName: table to (over)write.
    """
    def tumor_or_normal(sId):
        # 'T' for a two-digit code below 10 at sId[13:15], else 'N'
        if int(sId[13:15]) < 10:
            return 'T'
        return 'N'

    def write_means(outFile, platform, sId, geneH):
        pId = sId[:12]
        TN = tumor_or_normal(sId)
        for (geneN, valueL) in geneH.items():
            v = numpy.mean(valueL)
            for g in geneN.split(';'):
                outFile.write('%s\t%s\t%s\t%s\t%s\t%.2f\n' % (platform, sId, pId, TN, g, v))

    seenS = set()  # sample ids already written

    # ``with`` closes every file; none of them was closed before.
    with open(outFileName, 'w') as outFile:

        infiniumL = sorted(glob.glob('%s/*HumanMethylation*' % (inFileDir, )), reverse=True)

        for inFileName in infiniumL:
            at = inFileName.index('TCGA-')
            sId = inFileName[at:at + 28]
            if sId in seenS:
                continue
            seenS.add(sId)

            if 'HumanMethylation450' in inFileName:
                platform = 'Infinium450k'
            else:
                platform = 'Infinium27k'

            # Validate the sample code before reporting, as the original did.
            tumor_or_normal(sId)
            print('%s %s' % (sId, platform))

            geneH = {}
            with open(inFileName) as inFile:
                inFile.readline()
                inFile.readline()
                for line in inFile:
                    tokL = line.rstrip().split('\t')
                    geneN = tokL[3]
                    value = tokL[2]
                    if not geneN or value == 'NA':
                        continue
                    mybasic.addHash(geneH, geneN, float(value))

            write_means(outFile, platform, sId, geneH)

        platform = 'GoldenGate3k'

        for inFileName in glob.glob('%s/*OMA002*' % (inFileDir, )):
            geneH = {}
            with open(inFileName) as inFile:
                sId = inFile.readline().rstrip().split('\t')[1]
                if sId in seenS:
                    continue
                seenS.add(sId)

                tumor_or_normal(sId)
                inFile.readline()
                print('%s %s' % (sId, platform))

                for line in inFile:
                    name, value = line.rstrip().split('\t')
                    geneN = name[:name.find('_')]
                    if not geneN or value == 'N/A':
                        continue
                    mybasic.addHash(geneH, geneN, float(value))

            # Bug fix: this section used to write ``v`` without computing it,
            # i.e. a stale mean left over from the Infinium section (or a
            # NameError when there was none).  write_means computes the mean
            # per gene.
            write_means(outFile, platform, sId, geneH)