def gene_annot(geneN_idx=0, inFileN='', outFileN='', have_header=True):
    """Append gene annotation columns to each row of a tab-delimited table.

    Every input row is echoed, followed by five extra columns (geneInfo,
    census, GO, KEGG, Biocarta) looked up through ``mygenome.gene`` for the
    first comma-separated name found in column ``geneN_idx``.

    Args:
        geneN_idx: zero-based index of the column holding the gene name(s).
        inFileN: input path; the empty string means read ``sys.stdin``.
        outFileN: output path; the empty string means write ``sys.stdout``.
        have_header: if true, the first input line is a header and is
            extended with the new column names.  Otherwise a synthetic
            ``X1..Xn`` header is written and the first line is data.

    Files opened here are always closed, even on error.  The standard
    streams are flushed but left open for the caller.
    """
    geneDB = mygenome.getGeneDB()
    annotHeader = 'geneInfo\tcensus\tGO\tKEGG\tBiocarta'

    inFile = sys.stdin
    outFile = sys.stdout

    try:
        if inFileN != '':
            inFile = open(inFileN, 'r')
        if outFileN != '':
            outFile = open(outFileN, 'w')

        def write_row(line):
            """Write one input row followed by its annotation columns."""
            # rstrip('\n') rather than [:-1]: a final line without a newline
            # must not lose its last character.
            tokL = line.rstrip('\n').split('\t')
            geneName = tokL[geneN_idx].split(',')[0]
            gene = mygenome.gene(geneName, geneDB=geneDB)

            geneInfo = '%s:%s:%s' % (
                geneName, gene.getAttr('desc'), gene.getAttr('summary'))
            censusInfo = '%s:%s:%s:%s' % (
                gene.getAttr('census_somatic'),
                gene.getAttr('census_germline'),
                gene.getAttr('census_mutType'),
                gene.getAttr('census_translocPartners'))
            goInfoS = set(gene.getAttr('go'))
            keggInfoS = set(gene.getAttr('kegg'))
            biocInfoS = set(gene.getAttr('biocarta'))

            outFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (
                '\t'.join(tokL), geneInfo, censusInfo,
                ';'.join(map(str, goInfoS)),
                ';'.join(map(str, keggInfoS)),
                ';'.join(map(str, biocInfoS))))

        firstLine = inFile.readline()

        if have_header:
            outFile.write('%s\t%s\n' % (firstLine.rstrip('\n'), annotHeader))
        else:
            # Count columns from the first line and then process that same
            # line as data.  This avoids tell()/seek(), which fail when the
            # input is a pipe.  Only the line terminator is stripped so that
            # trailing empty columns are still counted.
            ncol = len(firstLine.rstrip('\r\n').split('\t'))
            headerL = ['X%s' % (i + 1) for i in range(ncol)]
            outFile.write('%s\t%s\n' % ('\t'.join(headerL), annotHeader))
            if firstLine:
                write_row(firstLine)

        for line in inFile:
            write_row(line)
    finally:
        try:
            outFile.flush()
        finally:
            if outFile is not sys.stdout:
                outFile.close()
            if inFile is not sys.stdin:
                inFile.close()
def gene_annot(inReportFileName, outReportFileName):
    """Append gene annotation columns to a headed, tab-delimited report.

    The gene column is located by header name: ``geneN``, ``gene_symL`` or
    ``SYMBOL``.  When several are present the last one in that list wins.
    Only the first ``;``-separated name in that column is annotated.  Each
    row is echoed followed by the columns geneInfo, census, GO, KEGG and
    Biocarta obtained from ``mygenome.gene``.

    Args:
        inReportFileName: path of the input report (first line is a header).
        outReportFileName: path of the annotated report to write.

    Raises:
        ValueError: if the report has data rows but its header contains
            none of the recognised gene column names.
    """
    geneDB = mygenome.getGeneDB()

    # The output is opened before the input, as in the original code.
    with open(outReportFileName, 'w') as outReportFile, \
            open(inReportFileName) as inFile:
        header = inFile.readline().rstrip('\n')
        outReportFile.write(
            '%s\tgeneInfo\tcensus\tGO\tKEGG\tBiocarta\n' % header)

        headerL = header.split('\t')
        geneN_idx = None
        for colName in ('geneN', 'gene_symL', 'SYMBOL'):
            if colName in headerL:
                geneN_idx = headerL.index(colName)

        for line in inFile:
            if geneN_idx is None:
                # Previously this surfaced as an UnboundLocalError.
                raise ValueError(
                    'no gene column (geneN, gene_symL or SYMBOL) in header '
                    'of %s' % inReportFileName)

            # rstrip('\n') keeps the last character of an unterminated line.
            tokL = line.rstrip('\n').split('\t')
            geneName = tokL[geneN_idx].split(';')[0]
            gene = mygenome.gene(geneName, geneDB=geneDB)

            geneInfo = '%s:%s:%s' % (
                geneName, gene.getAttr('desc'), gene.getAttr('summary'))
            censusInfo = '%s:%s:%s:%s' % (
                gene.getAttr('census_somatic'),
                gene.getAttr('census_germline'),
                gene.getAttr('census_mutType'),
                gene.getAttr('census_translocPartners'))
            goInfoS = set(gene.getAttr('go'))
            keggInfoS = set(gene.getAttr('kegg'))
            biocInfoS = set(gene.getAttr('biocarta'))

            outReportFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (
                '\t'.join(tokL), geneInfo, censusInfo,
                ';'.join(map(str, goInfoS)),
                ';'.join(map(str, keggInfoS)),
                ';'.join(map(str, biocInfoS))))
def gene_annot(inReportFileName,outReportFileName):
    """Annotate the gene column of a tab-delimited report.

    The header line is copied with ``geneInfo``, ``census``, ``GO``,
    ``KEGG`` and ``Biocarta`` appended.  The gene column is whichever of
    ``geneN``, ``gene_symL``, ``SYMBOL`` appears in the header; later names
    take precedence over earlier ones.  For every data row the first
    ``;``-separated entry of that column is looked up with
    ``mygenome.gene`` and its attributes are appended to the row.

    Args:
        inReportFileName: input report path; its first line is the header.
        outReportFileName: output report path.

    Raises:
        ValueError: if a data row is reached and no recognised gene column
            was found in the header.
    """
    geneDB = mygenome.getGeneDB()
    geneColumnNames = ('geneN', 'gene_symL', 'SYMBOL')

    # Same open order as the original: output first, then input.
    with open(outReportFileName,'w') as outReportFile, \
            open(inReportFileName) as inFile:
        header = inFile.readline().rstrip('\n')
        outReportFile.write('%s\tgeneInfo\tcensus\tGO\tKEGG\tBiocarta\n' % header)

        headerL = header.split('\t')
        foundIdxL = [headerL.index(n) for n in geneColumnNames if n in headerL]
        # The last recognised name wins, matching the original if-chain.
        geneN_idx = foundIdxL[-1] if foundIdxL else None

        for line in inFile:
            if geneN_idx is None:
                # The original failed here with an UnboundLocalError.
                raise ValueError('header of %s has none of the gene columns %s'
                                 % (inReportFileName, ', '.join(geneColumnNames)))

            # Strip only the newline so an unterminated last line stays intact.
            tokL = line.rstrip('\n').split('\t')
            geneName = tokL[geneN_idx].split(';')[0]
            gene = mygenome.gene(geneName,geneDB=geneDB)

            outCols = [
                '\t'.join(tokL),
                '%s:%s:%s' % (geneName,gene.getAttr('desc'),gene.getAttr('summary')),
                '%s:%s:%s:%s' % (gene.getAttr('census_somatic'),
                                 gene.getAttr('census_germline'),
                                 gene.getAttr('census_mutType'),
                                 gene.getAttr('census_translocPartners')),
                ';'.join(map(str,set(gene.getAttr('go')))),
                ';'.join(map(str,set(gene.getAttr('kegg')))),
                ';'.join(map(str,set(gene.getAttr('biocarta')))),
            ]
            outReportFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' % tuple(outCols))
def gene_annot(geneN_idx=0, inFileN="", outFileN="", have_header=True):
    """Annotate the gene column of a tab-delimited stream or file.

    Each row is written back followed by the columns geneInfo, census, GO,
    KEGG and Biocarta.  These are looked up with ``mygenome.gene`` for the
    first comma-separated name in column ``geneN_idx``.

    Args:
        geneN_idx: zero-based index of the gene-name column.
        inFileN: input path, or ``""`` to read from ``sys.stdin``.
        outFileN: output path, or ``""`` to write to ``sys.stdout``.
        have_header: if true, the first line is a header that gets the new
            column names appended.  If false, an ``X1..Xn`` header is
            generated from the width of the first line, which is then
            processed as a normal data row.

    Only files opened by this function are closed.  ``sys.stdin`` and
    ``sys.stdout`` are left open for the caller.
    """
    import itertools

    geneDB = mygenome.getGeneDB()
    annot_header = "geneInfo\tcensus\tGO\tKEGG\tBiocarta"

    inFile = sys.stdin
    outFile = sys.stdout

    try:
        if inFileN != "":
            inFile = open(inFileN, "r")
        if outFileN != "":
            outFile = open(outFileN, "w")

        first_line = inFile.readline()
        rows = inFile

        if have_header:
            outFile.write("%s\t%s\n" % (first_line.rstrip("\n"), annot_header))
        else:
            # No tell()/seek(): stdin may be a pipe, which is not seekable.
            # The first line is measured and then replayed in front of the
            # rest of the input.  Only the line terminator is removed so
            # trailing empty columns are counted too.
            ncol = len(first_line.rstrip("\r\n").split("\t"))
            headerL = ["X%s" % (i + 1) for i in range(ncol)]
            outFile.write("%s\t%s\n" % ("\t".join(headerL), annot_header))
            rows = itertools.chain([first_line] if first_line else [], inFile)

        for line in rows:
            # rstrip("\n") keeps the last character of an unterminated line.
            tokL = line.rstrip("\n").split("\t")
            geneName = tokL[geneN_idx].split(",")[0]
            gene = mygenome.gene(geneName, geneDB=geneDB)

            geneInfo = "%s:%s:%s" % (
                geneName,
                gene.getAttr("desc"),
                gene.getAttr("summary"),
            )
            censusInfo = "%s:%s:%s:%s" % (
                gene.getAttr("census_somatic"),
                gene.getAttr("census_germline"),
                gene.getAttr("census_mutType"),
                gene.getAttr("census_translocPartners"),
            )
            goInfoS = set(gene.getAttr("go"))
            keggInfoS = set(gene.getAttr("kegg"))
            biocInfoS = set(gene.getAttr("biocarta"))

            outFile.write(
                "%s\t%s\t%s\t%s\t%s\t%s\n"
                % (
                    "\t".join(tokL),
                    geneInfo,
                    censusInfo,
                    ";".join(map(str, goInfoS)),
                    ";".join(map(str, keggInfoS)),
                    ";".join(map(str, biocInfoS)),
                )
            )
    finally:
        try:
            outFile.flush()
        finally:
            if outFile is not sys.stdout:
                outFile.close()
            if inFile is not sys.stdin:
                inFile.close()
def exonSkip_proc(inGsnapFileName, outGsnapFileName, outReportFileName, sampN):
    """Group GSNAP splice reads by junction and write a per-junction report.

    Every read from ``mygsnap.gsnapFile`` must be flagged ``(transloc)`` and
    have exactly two alignment segments.  Reads are grouped by their pair of
    breakpoints, ordered according to the splice direction.  Junctions are
    written in decreasing order of the number of distinct
    ``(direction, offset)`` pairs supporting them.

    Args:
        inGsnapFileName: GSNAP result file to read.
        outGsnapFileName: receives the raw text of every grouped read,
            junction by junction.
        outReportFileName: receives one tab-delimited line per junction with
            the columns: type (intra/inter), splice type, sample name,
            breakpoint 1, breakpoint 2, transcripts 1/2, genes 1/2, gene
            info 1/2, census info 1/2, breakpoint-overlapping genes 1/2,
            read count, distinct sequence count, distinct
            (direction, offset) count.
        sampN: sample name copied into every report line.

    Raises:
        Exception: if a read is not a ``(transloc)`` pair, does not have two
            segments, or has a direction other than sense/antisense.
    """
    geneNameH = mygenome.geneNameH()
    geneSetH = mygenome.geneSetH()
    geneInfoH = mygenome.geneInfoH(geneNameH, geneSetH)
    refFlatH = mygenome.loadRefFlatByChr()

    labelRe = re.compile(r'label_[12]:([^,\t]*)')

    # (strand of the aligned segment, splice direction) -> transcript strand
    transStrandH = {
        ('+', 'sense'): '+', ('-', 'antisense'): '+',
        ('+', 'antisense'): '-', ('-', 'sense'): '-',
    }

    def labelled_genes(annot):
        """Return (transcript ids, gene names) named in a segment's label."""
        rm = labelRe.search(annot)
        if not rm:
            return (), set()
        transcripts = tuple(
            [x.split('.exon')[0] for x in rm.group(1).split('|')])
        genes = set()
        for t in transcripts:
            g = mygenome.gene(t, geneNameH, geneSetH, geneInfoH)
            if g.geneName:
                genes.add(g.geneName)
        return transcripts, genes

    def breakpoint_genes(bp, strand):
        """Return the genes overlapping the 1-bp locus ending at a breakpoint."""
        locus = mygenome.locus('%s:%s-%s%s' % (
            bp.group(2), int(bp.group(3)) - 1, bp.group(3), strand))
        return locus.overlappingGeneL(refFlatH=refFlatH, strand_sensitive=True)

    def annotate(geneNames):
        """Return (geneInfo, censusInfo) string lists for the given genes."""
        geneInfo = []
        censusInfo = []
        for geneName in geneNames:
            gene = mygenome.gene(geneName, geneNameH, geneSetH, geneInfoH)
            geneInfo.append('%s:%s:%s' % (
                geneName, gene.getAttr('desc'), gene.getAttr('summary')))
            censusInfo.append('%s:%s:%s:%s' % (
                gene.getAttr('census_somatic'),
                gene.getAttr('census_germline'),
                gene.getAttr('census_mutType'),
                gene.getAttr('census_translocPartners')))
        return geneInfo, censusInfo

    juncHH = {}

    for r in mygsnap.gsnapFile(inGsnapFileName, False):
        match = r.matchL()[0]

        if '(transloc)' not in r.pairRel:
            raise Exception('read is not a (transloc) pair: %s' % (r.pairRel,))
        if len(match.segL) != 2:
            raise Exception(
                'expected 2 alignment segments, found %s' % len(match.segL))

        seg1 = match.segL[0]
        seg2 = match.segL[1]

        splice_type = re.search(r'splice_type:([^,\t]*)', seg1[3]).group(1)
        direction = re.search(r'dir:([^,\t]*)', seg1[3]).group(1)
        offset = int(re.search(r'\.\.([0-9]*)', seg1[1]).group(1))

        transcript1, gene1 = labelled_genes(seg1[3])
        transcript2, gene2 = labelled_genes(seg2[3])

        # Breakpoint 1 is the end coordinate of segment 1.
        # Breakpoint 2 is the start coordinate of segment 2.
        bp1 = re.match('([+-])([^:]+):[0-9]+..([0-9]+)', seg1[2])
        bp2 = re.match('([+-])([^:]+):([0-9]+)..[0-9]+', seg2[2])

        try:
            trans_strand1 = transStrandH[(bp1.group(1), direction)]
            trans_strand2 = transStrandH[(bp2.group(1), direction)]
        except KeyError:
            raise Exception('unexpected splice direction: %s' % (direction,))

        bp_gene1 = breakpoint_genes(bp1, trans_strand1)
        bp_gene2 = breakpoint_genes(bp2, trans_strand2)

        # The direction check above leaves only 'sense' and 'antisense'.
        # Antisense reads are stored with the two sides swapped.
        if direction == 'sense':
            key = (bp1.groups()[1:], bp2.groups()[1:])
            transcript = (transcript1, transcript2)
            gene = (tuple(gene1), tuple(gene2))
            bp_gene = (bp_gene1, bp_gene2)
        else:
            key = (bp2.groups()[1:], bp1.groups()[1:])
            transcript = (transcript2, transcript1)
            gene = (tuple(gene2), tuple(gene1))
            bp_gene = (bp_gene2, bp_gene1)

        if key in juncHH:
            juncHH[key]['match'].append(r)
            juncHH[key]['seq'].append(r.seq())
            juncHH[key]['reg'].append((direction, offset))
        else:
            # Per-junction annotation is taken from the first read seen.
            juncHH[key] = {
                'match': [r],
                'splice_type': splice_type,
                'seq': [r.seq()],
                'reg': [(direction, offset)],
                'transcript': transcript,
                'gene': gene,
                'bp_gene': bp_gene,
            }

    # Most-supported junctions first.  A key function replaces the
    # Python-2-only cmp argument; the reversed sort is still stable.
    juncKH = sorted(juncHH.items(),
                    key=lambda kv: len(set(kv[1]['reg'])), reverse=True)

    with open(outGsnapFileName, 'w') as outGsnapFile, \
            open(outReportFileName, 'w') as outReportFile:
        for (key, juncH) in juncKH:
            juncType = 'intra' if key[0][0] == key[1][0] else 'inter'

            geneInfo1, censusInfo1 = annotate(juncH['gene'][0])
            geneInfo2, censusInfo2 = annotate(juncH['gene'][1])

            fields = (
                juncType, juncH['splice_type'], sampN,
                ':'.join(key[0]), ':'.join(key[1]),
                ';'.join(juncH['transcript'][0]),
                ';'.join(juncH['transcript'][1]),
                ';'.join(juncH['gene'][0]), ';'.join(juncH['gene'][1]),
                ';'.join(geneInfo1), ';'.join(geneInfo2),
                ';'.join(censusInfo1), ';'.join(censusInfo2),
                ','.join(juncH['bp_gene'][0]), ','.join(juncH['bp_gene'][1]),
                len(juncH['match']), len(set(juncH['seq'])),
                len(set(juncH['reg'])),
            )
            outReportFile.write(
                '\t'.join(['%s'] * len(fields)) % fields + '\n')

            for m in juncH['match']:
                outGsnapFile.write(m.rawText() + '\n')
def genCompositeModel(outTextFileName,outFaFileName,intronSize=100):
    """Write composite gene models and their sequences for genes in mygenome.TK.

    For each chromosome (chr1..chr22, chrX, chrY, chrM) the genes returned
    by ``mygenome.loadKgByChr`` whose gene name is in ``mygenome.TK`` are
    selected.  Their transcribed regions are merged per strand with
    ``mygenome.mergeLoci``.  For every merged region, the exons of the
    contributing genes are merged into one composite exon list.

    Args:
        outTextFileName: tab-delimited output with one line per merged
            region: id, chrom, strand, start, end, exon count,
            comma-joined exon starts, comma-joined exon ends.
        outFaFileName: FASTA-style output.  The header is
            ``>id|chrom|strand|start|end``, followed by the concatenated
            ``nibFrag()`` of every composite exon on one line.
        intronSize: maximum number of flanking bases added on each inner
            side of an exon.  The extension is also capped at half the gap
            to the neighbouring exon.  The first exon is not extended at
            its start and the last exon is not extended at its end.
    """
    geneNameH = mygenome.geneNameH()
    geneSetH = mygenome.geneSetH()
    geneInfoH = mygenome.geneInfoH(geneNameH,geneSetH)

    geneH = mygenome.loadKgByChr()

    def by_start_then_end(loc):
        """Sort key equal to the original two stable cmp sorts (end, then start)."""
        return (loc.chrSta, loc.chrEnd)

    with open(outTextFileName, 'w') as outTextFile, \
            open(outFaFileName, 'w') as outFaFile:

        # list(range(...)) keeps the concatenation valid on Python 3 as well.
        for chrNum in list(range(1,23)) + ['X','Y','M']:

            chrom = 'chr%s' % chrNum

            # This must be a real list because it is traversed several times
            # below.  A Python 3 filter() object would be exhausted after
            # the first pass.
            geneH_byChr = [
                h for h in geneH[chrom]
                if mygenome.gene(h['geneId'],geneNameH,geneSetH,geneInfoH).geneName in mygenome.TK
            ]

            txnLocusL_combined = []

            for strand in ['+','-']:
                txnLocusL = [
                    mygenome.locus('%s:%s-%s%s' % (chrom,h['txnSta'],h['txnEnd'],strand),h['geneId'])
                    for h in geneH_byChr if h['strand']==strand
                ]
                txnLocusL_combined += mygenome.mergeLoci(txnLocusL)

            txnLocusL_combined.sort(key=by_start_then_end)

            for txnLoc in txnLocusL_combined:

                exnLocusL = []

                for h in geneH_byChr:
                    if h['geneId'] not in txnLoc.id:
                        continue
                    for (exnSta,exnEnd) in h['exnList']:
                        exnLocusL.append(mygenome.locus('%s:%s-%s%s' % (chrom, exnSta, exnEnd, h['strand'])))

                exnLocusL.sort(key=by_start_then_end)
                exnLocusL = mygenome.mergeLoci(exnLocusL)

                exnStaL = [str(exnLoc.chrSta) for exnLoc in exnLocusL]
                exnEndL = [str(exnLoc.chrEnd) for exnLoc in exnLocusL]

                outTextFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
                    txnLoc.id,txnLoc.chrom,txnLoc.strand,txnLoc.chrSta,txnLoc.chrEnd,
                    len(exnLocusL),','.join(exnStaL),','.join(exnEndL)))
                outFaFile.write('>%s|%s|%s|%s|%s\n' % (
                    txnLoc.id,txnLoc.chrom,txnLoc.strand,txnLoc.chrSta,txnLoc.chrEnd))

                lastIdx = len(exnLocusL) - 1

                for i, exnLoc in enumerate(exnLocusL):

                    # Work on a copy so the merged exon list is not altered.
                    exnLocCopy = copy.deepcopy(exnLoc)
                    # NOTE(review): strand is forced to '+', presumably so
                    # nibFrag() returns forward-strand sequence. Confirm.
                    exnLocCopy.strand = '+'

                    if i > 0:
                        exnLocCopy.chrSta -= min(intronSize, int((exnLoc.chrSta - exnLocusL[i-1].chrEnd)/2))

                    if i < lastIdx:
                        exnLocCopy.chrEnd += min(intronSize, int((exnLocusL[i+1].chrSta - exnLoc.chrEnd)/2))

                    outFaFile.write(exnLocCopy.nibFrag())

                outFaFile.write('\n')
def gsnap_process_junction(inGsnapFileName,outGsnapFileName,outReportFileName,sampN):
    """Group GSNAP translocation reads by junction and write a report.

    Every read from ``mygsnap.gsnapFile`` must be flagged ``(transloc)`` and
    have exactly two alignment segments.  Reads are grouped by their pair of
    breakpoints, ordered according to the splice direction.  Junctions are
    written in decreasing order of the number of distinct
    ``(direction, offset)`` pairs supporting them.

    Args:
        inGsnapFileName: GSNAP result file to read.
        outGsnapFileName: receives the raw text of every grouped read,
            junction by junction.
        outReportFileName: receives one tab-delimited line per junction with
            the columns: type (intra/inter), splice type, sample name,
            breakpoint 1, breakpoint 2, transcript.exon labels 1/2,
            ``genes;breakpoint-genes`` for side 1 and side 2, gene info
            1/2, census info 1/2, read count, distinct sequence count,
            distinct (direction, offset) count.
        sampN: sample name copied into every report line.

    Raises:
        Exception: if a read is not a ``(transloc)`` pair, does not have two
            segments, or has a direction other than sense/antisense.
    """
    geneNameH = mygenome.geneNameH()
    geneSetH = mygenome.geneSetH()
    geneInfoH = mygenome.geneInfoH(geneNameH,geneSetH)
    refFlatH = mygenome.loadRefFlatByChr()

    labelRe = re.compile(r'label_[12]:([^,\t]*)')

    # (strand of the aligned segment, splice direction) -> transcript strand
    transStrandH = {
        ('+','sense'): '+', ('-','antisense'): '+',
        ('+','antisense'): '-', ('-','sense'): '-',
    }

    def labelled_genes(annot):
        """Return (transcript.exon labels, gene names) from a segment's label."""
        rm = labelRe.search(annot)
        if not rm:
            return (), set()
        trans_exon = rm.group(1).split('|')
        genes = set()
        for te in trans_exon:
            g = mygenome.gene(te.split('.exon')[0],geneNameH,geneSetH,geneInfoH)
            if g.geneName:
                genes.add(g.geneName)
        return trans_exon, genes

    def annotate(geneNames):
        """Return (geneInfo, censusInfo) string lists for the given genes."""
        geneInfo = []
        censusInfo = []
        for geneName in geneNames:
            gene = mygenome.gene(geneName,geneNameH,geneSetH,geneInfoH)
            geneInfo.append('%s:%s:%s' % (geneName,gene.getAttr('desc'),gene.getAttr('summary')))
            censusInfo.append('%s:%s:%s:%s' % (
                gene.getAttr('census_somatic'),gene.getAttr('census_germline'),
                gene.getAttr('census_mutType'),gene.getAttr('census_translocPartners')))
        return geneInfo, censusInfo

    juncHH = {}

    for r in mygsnap.gsnapFile(inGsnapFileName,False):

        match = r.matchL()[0]

        if '(transloc)' not in r.pairRel:
            raise Exception('read is not a (transloc) pair: %s' % (r.pairRel,))

        if len(match.segL) != 2:
            raise Exception('expected 2 alignment segments, found %s' % len(match.segL))

        seg1 = match.segL[0]
        seg2 = match.segL[1]

        splice_type = re.search(r'splice_type:([^,\t]*)', seg1[3]).group(1)
        direction = re.search(r'dir:([^,\t]*)', seg1[3]).group(1)
        offset = int(re.search(r'\.\.([0-9]*)', seg1[1]).group(1))

        trans_exon1, gene1 = labelled_genes(seg1[3])
        # Side 2 is read from the second segment.  The earlier code searched
        # segment 1 again, so side 2 always duplicated side 1.
        trans_exon2, gene2 = labelled_genes(seg2[3])

        # Breakpoint 1 is the end coordinate of segment 1.
        # Breakpoint 2 is the start coordinate of segment 2.
        bp1 = re.match('([+-])([^:]+):[0-9]+..([0-9]+)',seg1[2])
        bp2 = re.match('([+-])([^:]+):([0-9]+)..[0-9]+',seg2[2])

        try:
            trans_strand1 = transStrandH[(bp1.group(1),direction)]
            trans_strand2 = transStrandH[(bp2.group(1),direction)]
        except KeyError:
            raise Exception('unexpected splice direction: %s' % (direction,))

        # Genes overlapping each breakpoint that the labels did not name.
        locus1 = mygenome.locus('%s:%s-%s%s' % (bp1.group(2),int(bp1.group(3))-1,bp1.group(3),trans_strand1))
        bp_gene1 = list(set(locus1.overlappingGeneL(refFlatH=refFlatH,strand_sensitive=True)).difference(gene1))

        # NOTE(review): side 2 starts 2 bases before the breakpoint while
        # side 1 uses 1. Kept as found; confirm this is intended.
        locus2 = mygenome.locus('%s:%s-%s%s' % (bp2.group(2),int(bp2.group(3))-2,bp2.group(3),trans_strand2))
        bp_gene2 = list(set(locus2.overlappingGeneL(refFlatH=refFlatH,strand_sensitive=True)).difference(gene2))

        # The direction check above leaves only 'sense' and 'antisense'.
        # Antisense reads are stored with the two sides swapped.
        if direction=='sense':
            key = (bp1.groups()[1:],bp2.groups()[1:])
            trans_exon = (trans_exon1,trans_exon2)
            gene = (list(gene1),list(gene2))
            bp_gene = (bp_gene1,bp_gene2)
        else:
            key = (bp2.groups()[1:],bp1.groups()[1:])
            trans_exon = (trans_exon2,trans_exon1)
            gene = (list(gene2),list(gene1))
            bp_gene = (bp_gene2,bp_gene1)

        if key in juncHH:
            juncHH[key]['match'].append(r)
            juncHH[key]['seq'].append(r.seq())
            juncHH[key]['reg'].append((direction,offset))
        else:
            # Per-junction annotation is taken from the first read seen.
            juncHH[key] = {'match':[r], 'splice_type':splice_type, 'seq':[r.seq()],
                           'reg':[(direction,offset)], 'trans_exon':trans_exon,
                           'gene':gene, 'bp_gene':bp_gene}

    # Most-supported junctions first.  A key function replaces the
    # Python-2-only cmp argument; the reversed sort is still stable.
    juncKH = sorted(juncHH.items(), key=lambda kv: len(set(kv[1]['reg'])), reverse=True)

    with open(outGsnapFileName,'w') as outGsnapFile, \
            open(outReportFileName,'w') as outReportFile:

        for (key, juncH) in juncKH:

            juncType = 'intra' if key[0][0] == key[1][0] else 'inter'

            geneInfo1, censusInfo1 = annotate(juncH['gene'][0]+juncH['bp_gene'][0])
            geneInfo2, censusInfo2 = annotate(juncH['gene'][1]+juncH['bp_gene'][1])

            fields = (
                juncType, juncH['splice_type'], sampN, ':'.join(key[0]), ':'.join(key[1]),
                ','.join(juncH['trans_exon'][0]), ','.join(juncH['trans_exon'][1]),
                '%s;%s' % (','.join(juncH['gene'][0]), ','.join(juncH['bp_gene'][0])),
                '%s;%s' % (','.join(juncH['gene'][1]), ','.join(juncH['bp_gene'][1])),
                ';'.join(geneInfo1), ';'.join(geneInfo2),
                ';'.join(censusInfo1), ';'.join(censusInfo2),
                len(juncH['match']), len(set(juncH['seq'])), len(set(juncH['reg'])),
            )
            outReportFile.write('\t'.join(['%s'] * len(fields)) % fields + '\n')

            for m in juncH['match']:
                outGsnapFile.write(m.rawText()+'\n')
def exonSkip_proc_annot(inReportFileName,outReportFileName,inCnaGctFileName=None):
    """Annotate an exon-skipping report with frame, gene and CNA information.

    Each input line has eight tab-separated fields: sample name, breakpoint
    1, breakpoint 2, transcript.exon list 1, transcript.exon list 2, and
    three counts.  The two lists are comma-separated
    ``<transcript>.exon<N>/<total>`` tokens.

    Each output line has 16 columns: the first five input fields, frame
    consistency (``transcript:cons``) for transcripts with exactly two
    exons recorded on the line, gene names, CNA values, gene info, census
    info, GO, KEGG, Biocarta, and the three input counts.

    Args:
        inReportFileName: path of the report to read.
        outReportFileName: path of the annotated report to write.
        inCnaGctFileName: optional copy-number file for
            ``mygenome.tcgaCnaDB``.  When given, sample names must contain
            a ``TCGA-XX-XXXX`` identifier.

    Transcript tokens that do not match the ``.exon`` pattern are skipped,
    as in ``fusion_proc_annot``.
    """
    geneDB = mygenome.getGeneDB()
    frameInfoH = mygenome.getFrameInfoH()

    if inCnaGctFileName:
        cnaDB = mygenome.tcgaCnaDB(inCnaGctFileName)
    else:
        cnaDB = None

    transExonRe = re.compile(r'(.*)\.exon([0-9]*)/[0-9]*')
    indivIdRe = re.compile('.*(TCGA-[0-9]{2}-[0-9]{4}).*')

    with open(outReportFileName,'w') as outReportFile, \
            open(inReportFileName) as inReportFile:

        for line in inReportFile:

            # rstrip('\n') keeps the last character of an unterminated line.
            (sampN,bp1,bp2,t1,t2,nmatch,nseq,nreg) = line.rstrip('\n').split('\t')

            if inCnaGctFileName:
                indivId = indivIdRe.match(sampN).group(1)

            geneS = set()
            geneH = {}

            for tL in (t1,t2):
                for te in tL.split(','):
                    ro = transExonRe.match(te)
                    if not ro:
                        # For example, an empty transcript field.
                        continue
                    t = ro.group(1)
                    e = int(ro.group(2))
                    # NOTE(review): addHash presumably appends e to the list
                    # stored under t (it is indexed as a list below). Confirm.
                    mybasic.addHash(geneH,t,e)
                    g = mygenome.gene(t,geneDB=geneDB)
                    if g.geneName:
                        geneS.add(g.geneName)

            frameL = []

            for transId in geneH:
                exnList = geneH[transId]
                # Only a transcript with exactly two recorded exons is checked.
                if len(exnList) != 2:
                    continue
                cons = mygenome.frameCons(transId,exnList[0], transId,exnList[1],frameInfoH)
                if cons:
                    frameL.append('%s:%s' % (transId,cons))

            cnaInfo = []
            geneInfo = []
            censusInfo = []
            goInfoS = set()
            keggInfoS = set()
            biocInfoS = set()

            for geneName in geneS:
                gene = mygenome.gene(geneName,geneDB=geneDB)
                if cnaDB:
                    cnaInfo.append('%s:%s' % (geneName,cnaDB.query(indivId,geneName)))
                geneInfo.append('%s:%s:%s' % (geneName,gene.getAttr('desc'),gene.getAttr('summary')))
                censusInfo.append('%s:%s:%s:%s' % (
                    gene.getAttr('census_somatic'),gene.getAttr('census_germline'),
                    gene.getAttr('census_mutType'),gene.getAttr('census_translocPartners')))
                goInfoS.update(gene.getAttr('go'))
                keggInfoS.update(gene.getAttr('kegg'))
                # NOTE(review): other functions in this file read 'biocarta',
                # not 'bioc'. Kept as found; confirm the key with mygenome.
                biocInfoS.update(gene.getAttr('bioc'))

            fields = (
                sampN, bp1, bp2, t1, t2, ','.join(frameL), ';'.join(geneS),
                ','.join(cnaInfo), ';'.join(geneInfo), ';'.join(censusInfo),
                ';'.join(map(str,goInfoS)), ';'.join(map(str,keggInfoS)),
                ';'.join(map(str,biocInfoS)), nmatch, nseq, nreg,
            )
            outReportFile.write('\t'.join(['%s'] * len(fields)) % fields + '\n')
def gsnap_process_junction(inReportFileName,outReportFileName):
    """Add gene and census annotation to a junction report.

    Each input line has nine tab-separated fields: splice type, sample
    name, breakpoint 1, breakpoint 2, transcripts 1 (``;``-separated),
    transcripts 2, and three counts.  The breakpoint fields are
    ``|``-separated.  Parts containing ``uc`` are looked up as transcript
    ids.  The first part containing ``chr`` on each side decides whether
    the junction is ``intra`` (equal) or ``inter``.

    Each output line has 18 columns: type, splice type, sample name,
    breakpoints 1/2, transcripts 1/2, genes 1/2, gene info 1/2, census info
    1/2, breakpoint genes 1/2, and the three input counts.

    Args:
        inReportFileName: path of the report to read.
        outReportFileName: path of the annotated report to write.

    Raises:
        IndexError: if a breakpoint field has no part containing ``chr``.
    """
    geneNameH = mygenome.geneNameH()
    geneSetH = mygenome.geneSetH()
    geneInfoH = mygenome.geneInfoH(geneNameH,geneSetH)

    def gene_names(transcripts):
        """Return the set of gene names found for the given transcript ids."""
        names = set()
        for t in transcripts:
            g = mygenome.gene(t,geneNameH,geneSetH,geneInfoH)
            if g.geneName:
                names.add(g.geneName)
        return names

    def annotate(geneNames):
        """Return (geneInfo, censusInfo) string lists for the given genes."""
        geneInfo = []
        censusInfo = []
        for geneName in geneNames:
            gene = mygenome.gene(geneName,geneNameH,geneSetH,geneInfoH)
            geneInfo.append('%s:%s:%s' % (geneName,gene.getAttr('desc'),gene.getAttr('summary')))
            censusInfo.append('%s:%s:%s:%s' % (
                gene.getAttr('census_somatic'),gene.getAttr('census_germline'),
                gene.getAttr('census_mutType'),gene.getAttr('census_translocPartners')))
        return geneInfo, censusInfo

    with open(outReportFileName,'w') as outReportFile, \
            open(inReportFileName) as inReportFile:

        for line in inReportFile:

            # rstrip('\n') keeps the last character of an unterminated line.
            (spliceType,sampN,bp1,bp2,t1,t2,nmatch,nseq,nreg) = line.rstrip('\n').split('\t')

            # An empty transcript field yields no genes.
            gene1 = gene_names(t1.split(';')) if t1 else set()
            gene2 = gene_names(t2.split(';')) if t2 else set()

            bp_gene1 = gene_names([x for x in bp1.split('|') if "uc" in x])
            bp_gene2 = gene_names([x for x in bp2.split('|') if "uc" in x])

            chrom1 = [x for x in bp1.split('|') if "chr" in x][0]
            chrom2 = [x for x in bp2.split('|') if "chr" in x][0]
            juncType = 'intra' if chrom1 == chrom2 else 'inter'

            geneInfo1, censusInfo1 = annotate(gene1)
            geneInfo2, censusInfo2 = annotate(gene2)

            fields = (
                juncType, spliceType, sampN, bp1, bp2, t1, t2,
                ';'.join(gene1), ';'.join(gene2),
                ';'.join(geneInfo1), ';'.join(geneInfo2),
                ';'.join(censusInfo1), ';'.join(censusInfo2),
                ','.join(bp_gene1), ','.join(bp_gene2),
                nmatch, nseq, nreg,
            )
            outReportFile.write('\t'.join(['%s'] * len(fields)) % fields + '\n')
def exonSkip_proc_annot(inReportFileName, outReportFileName, inCnaGctFileName=None):
    """Annotate an exon-skipping report with frame, gene and CNA information.

    Input lines carry eight tab-separated fields: sample name, breakpoint 1,
    breakpoint 2, two comma-separated lists of
    ``<transcript>.exon<N>/<total>`` tokens, and three counts.

    Output lines carry 16 columns: the first five input fields, frame
    consistency (``transcript:cons``) for every transcript with exactly two
    exons recorded on the line, gene names, CNA values, gene info, census
    info, GO, KEGG, Biocarta, and the three input counts.

    Args:
        inReportFileName: path of the report to read.
        outReportFileName: path of the annotated report to write.
        inCnaGctFileName: optional copy-number file handed to
            ``mygenome.tcgaCnaDB``.  When given, each sample name must
            contain a ``TCGA-XX-XXXX`` identifier.

    Tokens that do not match the ``.exon`` pattern are skipped, which
    matches the handling in ``fusion_proc_annot``.
    """
    geneDB = mygenome.getGeneDB()
    frameInfoH = mygenome.getFrameInfoH()

    cnaDB = mygenome.tcgaCnaDB(inCnaGctFileName) if inCnaGctFileName else None

    transExonRe = re.compile(r'(.*)\.exon([0-9]*)/[0-9]*')
    indivIdRe = re.compile('.*(TCGA-[0-9]{2}-[0-9]{4}).*')

    with open(outReportFileName, 'w') as outReportFile, \
            open(inReportFileName) as inReportFile:

        for line in inReportFile:
            # Strip only the newline so an unterminated last line stays intact.
            (sampN, bp1, bp2, t1, t2, nmatch, nseq,
             nreg) = line.rstrip('\n').split('\t')

            if inCnaGctFileName:
                indivId = indivIdRe.match(sampN).group(1)

            geneS = set()
            geneH = {}

            for te in t1.split(',') + t2.split(','):
                ro = transExonRe.match(te)
                if ro is None:
                    # Not a transcript.exon token, such as an empty field.
                    continue
                transId = ro.group(1)
                exonNum = int(ro.group(2))
                # NOTE(review): addHash presumably appends exonNum to the
                # list kept under transId (indexed as a list below). Confirm.
                mybasic.addHash(geneH, transId, exonNum)
                g = mygenome.gene(transId, geneDB=geneDB)
                if g.geneName:
                    geneS.add(g.geneName)

            # Only a transcript with exactly two recorded exons is checked.
            frameL = []
            for transId in geneH:
                exnList = geneH[transId]
                if len(exnList) == 2:
                    cons = mygenome.frameCons(transId, exnList[0], transId,
                                              exnList[1], frameInfoH)
                    if cons:
                        frameL.append('%s:%s' % (transId, cons))

            cnaInfo = []
            geneInfo = []
            censusInfo = []
            goInfoS = set()
            keggInfoS = set()
            biocInfoS = set()

            for geneName in geneS:
                gene = mygenome.gene(geneName, geneDB=geneDB)
                if cnaDB:
                    cnaInfo.append(
                        '%s:%s' % (geneName, cnaDB.query(indivId, geneName)))
                geneInfo.append('%s:%s:%s' % (
                    geneName, gene.getAttr('desc'), gene.getAttr('summary')))
                censusInfo.append('%s:%s:%s:%s' % (
                    gene.getAttr('census_somatic'),
                    gene.getAttr('census_germline'),
                    gene.getAttr('census_mutType'),
                    gene.getAttr('census_translocPartners')))
                goInfoS |= set(gene.getAttr('go'))
                keggInfoS |= set(gene.getAttr('kegg'))
                # NOTE(review): sibling functions read 'biocarta' rather
                # than 'bioc'. Kept as found; confirm the key with mygenome.
                biocInfoS |= set(gene.getAttr('bioc'))

            fields = (
                sampN, bp1, bp2, t1, t2,
                ','.join(frameL), ';'.join(geneS), ','.join(cnaInfo),
                ';'.join(geneInfo), ';'.join(censusInfo),
                ';'.join(map(str, goInfoS)),
                ';'.join(map(str, keggInfoS)),
                ';'.join(map(str, biocInfoS)),
                nmatch, nseq, nreg,
            )
            outReportFile.write(
                '\t'.join(['%s'] * len(fields)) % fields + '\n')
def fusion_proc_annot(inReportFileName,outReportFileName,inCnaGctFileName=None):
    """Annotate a fusion junction report, one side of the junction at a time.

    Each input line has nine tab-separated fields: splice type, sample
    name, breakpoint 1, breakpoint 2, transcript.exon list 1,
    transcript.exon list 2, and three counts.  Breakpoints look like
    ``<strand>chr<name>:<pos>``.  The lists are comma-separated
    ``<transcript>.exon<N>/<total>`` tokens; tokens that do not match are
    ignored.

    For each side, the genes named by the transcripts are collected, plus
    any other genes overlapping the breakpoint.  The 25 output columns are:
    sample name, splice type, type (intra/inter), breakpoints 1/2,
    transcript lists 1/2, in-frame transcript pairs, CNA info 1/2, then for
    each side ``genes;breakpoint-genes``, gene info, census info, GO, KEGG
    and Biocarta, and finally the three input counts.

    Args:
        inReportFileName: path of the report to read.
        outReportFileName: path of the annotated report to write.
        inCnaGctFileName: optional copy-number file for
            ``mygenome.tcgaCnaDB``.  When given, sample names must contain
            a ``TCGA-XX-XXXX`` identifier.
    """
    geneDB = mygenome.getGeneDB()
    frameInfoH = mygenome.getFrameInfoH()
    refFlatH = mygenome.loadRefFlatByChr()

    if inCnaGctFileName:
        cnaDB = mygenome.tcgaCnaDB(inCnaGctFileName)
    else:
        cnaDB = None

    transExonRe = re.compile(r'(.*)\.exon([0-9]*)/[0-9]*')
    bpRe = re.compile('([+-])(chr[^:]*):([0-9]*)')
    indivIdRe = re.compile('.*(TCGA-[0-9]{2}-[0-9]{4}).*')

    with open(outReportFileName,'w') as outReportFile, \
            open(inReportFileName) as inReportFile:

        for line in inReportFile:

            # rstrip('\n') keeps the last character of an unterminated line.
            (splice_type,sampN,bp1,bp2,teStr1,teStr2,nmatch,nseq,nreg) = line.rstrip('\n').split('\t')

            if inCnaGctFileName:
                indivId = indivIdRe.match(sampN).group(1)

            geneStatL = []

            for (bp,teStr) in ((bp1,teStr1),(bp2,teStr2)):

                geneS = set()
                teL = []

                for te in teStr.split(','):
                    rm = transExonRe.match(te)
                    if rm:
                        t = rm.group(1)
                        e = int(rm.group(2))
                        g = mygenome.gene(t,geneDB=geneDB)
                        if g.geneName:
                            geneS.add(g.geneName)
                        teL.append((t,e))

                bpMatch = bpRe.match(bp)
                # Chromosome of this side.  The earlier code took it from
                # bp1 for both sides, so every junction was 'intra'.
                chrom = bpMatch.group(2)

                # Genes overlapping the breakpoint that the transcripts did
                # not already name.
                bpLocus = mygenome.locus('%s:%s-%s%s' % (
                    bpMatch.group(2),int(bpMatch.group(3))-1,bpMatch.group(3),bpMatch.group(1)))
                bp_geneS = set(bpLocus.overlappingGeneL(refFlatH=refFlatH,strand_sensitive=True))
                bp_geneS = bp_geneS.difference(geneS)

                cnaInfo = []
                geneInfo = []
                censusInfo = []
                goInfoS = set()
                keggInfoS = set()
                biocartaInfoS = set()

                for geneName in list(geneS) + list(bp_geneS):
                    gene = mygenome.gene(geneName,geneDB=geneDB)
                    if cnaDB:
                        cnaInfo.append('%s:%s' % (geneName,cnaDB.query(indivId,geneName)))
                    geneInfo.append('%s:%s:%s' % (geneName,gene.getAttr('desc'),gene.getAttr('summary')))
                    censusInfo.append('%s:%s:%s:%s' % (
                        gene.getAttr('census_somatic'),gene.getAttr('census_germline'),
                        gene.getAttr('census_mutType'),gene.getAttr('census_translocPartners')))
                    goInfoS.update(gene.getAttr('go'))
                    keggInfoS.update(gene.getAttr('kegg'))
                    biocartaInfoS.update(gene.getAttr('biocarta'))

                geneStatL.append((chrom,teL,geneS,bp_geneS,cnaInfo,geneInfo,censusInfo,goInfoS,keggInfoS,biocartaInfoS))

            (chrom1,teL1,geneS1,bp_geneS1,cnaInfo1,geneInfo1,censusInfo1,goInfoS1,keggInfoS1,biocartaInfoS1) = geneStatL[0]
            (chrom2,teL2,geneS2,bp_geneS2,cnaInfo2,geneInfo2,censusInfo2,goInfoS2,keggInfoS2,biocartaInfoS2) = geneStatL[1]

            juncType = 'intra' if chrom1 == chrom2 else 'inter'

            # Keep only transcript/exon pairs that frameCons reports as 'Y'.
            frameL = []
            for (t1,e1) in teL1:
                for (t2,e2) in teL2:
                    cons = mygenome.frameCons(t1,e1, t2,e2, frameInfoH)
                    if cons=='Y':
                        frameL.append('%s.%s-%s.%s:%s' % (t1,e1,t2,e2,cons))

            fields = (
                sampN, splice_type, juncType, bp1, bp2, teStr1, teStr2,
                ','.join(frameL), ','.join(cnaInfo1), ','.join(cnaInfo2),
                '%s;%s' % (','.join(geneS1),','.join(bp_geneS1)),
                ';'.join(geneInfo1), ';'.join(censusInfo1),
                ';'.join(map(str,goInfoS1)), ';'.join(map(str,keggInfoS1)),
                ';'.join(map(str,biocartaInfoS1)),
                '%s;%s' % (','.join(geneS2),','.join(bp_geneS2)),
                ';'.join(geneInfo2), ';'.join(censusInfo2),
                ';'.join(map(str,goInfoS2)), ';'.join(map(str,keggInfoS2)),
                ';'.join(map(str,biocartaInfoS2)),
                nmatch, nseq, nreg,
            )
            outReportFile.write('\t'.join(['%s'] * len(fields)) % fields + '\n')
def gsnap_process_junction(inReportFileName, outReportFileName):
    """Add gene and census annotation to a junction report.

    Input lines carry nine tab-separated fields: splice type, sample name,
    breakpoint 1, breakpoint 2, ``;``-separated transcripts for side 1 and
    side 2, and three counts.  Breakpoint fields are ``|``-separated.
    Parts containing ``uc`` are looked up as transcript ids.  The first
    part containing ``chr`` on each side is compared to classify the
    junction as ``intra`` (equal) or ``inter``.

    Output lines carry 18 columns: type, splice type, sample name,
    breakpoints 1/2, transcripts 1/2, genes 1/2, gene info 1/2, census info
    1/2, breakpoint genes 1/2, and the three input counts.

    Args:
        inReportFileName: path of the report to read.
        outReportFileName: path of the annotated report to write.

    Raises:
        IndexError: if a breakpoint field has no part containing ``chr``.
    """
    geneNameH = mygenome.geneNameH()
    geneSetH = mygenome.geneSetH()
    geneInfoH = mygenome.geneInfoH(geneNameH, geneSetH)

    def lookup(name):
        """Return the mygenome gene object for a transcript id or gene name."""
        return mygenome.gene(name, geneNameH, geneSetH, geneInfoH)

    def gene_names(transcripts):
        """Return the set of gene names found for the given transcript ids."""
        names = set()
        for t in transcripts:
            geneName = lookup(t).geneName
            if geneName:
                names.add(geneName)
        return names

    def bp_parts(bp, marker):
        """Return the '|'-separated parts of a breakpoint containing marker."""
        return [x for x in bp.split('|') if marker in x]

    def annotate(geneNames):
        """Return (geneInfo, censusInfo) string lists for the given genes."""
        geneInfo = []
        censusInfo = []
        for geneName in geneNames:
            gene = lookup(geneName)
            geneInfo.append('%s:%s:%s' % (
                geneName, gene.getAttr('desc'), gene.getAttr('summary')))
            censusInfo.append('%s:%s:%s:%s' % (
                gene.getAttr('census_somatic'),
                gene.getAttr('census_germline'),
                gene.getAttr('census_mutType'),
                gene.getAttr('census_translocPartners')))
        return geneInfo, censusInfo

    with open(outReportFileName, 'w') as outReportFile, \
            open(inReportFileName) as inReportFile:

        for line in inReportFile:
            # Strip only the newline so an unterminated last line stays intact.
            (spliceType, sampN, bp1, bp2, t1, t2, nmatch, nseq,
             nreg) = line.rstrip('\n').split('\t')

            # An empty transcript field yields no genes.
            gene1 = gene_names(t1.split(';')) if t1 else set()
            gene2 = gene_names(t2.split(';')) if t2 else set()

            bp_gene1 = gene_names(bp_parts(bp1, "uc"))
            bp_gene2 = gene_names(bp_parts(bp2, "uc"))

            if bp_parts(bp1, "chr")[0] == bp_parts(bp2, "chr")[0]:
                juncType = 'intra'
            else:
                juncType = 'inter'

            geneInfo1, censusInfo1 = annotate(gene1)
            geneInfo2, censusInfo2 = annotate(gene2)

            fields = (
                juncType, spliceType, sampN, bp1, bp2,
                t1, t2,
                ';'.join(gene1), ';'.join(gene2),
                ';'.join(geneInfo1), ';'.join(geneInfo2),
                ';'.join(censusInfo1), ';'.join(censusInfo2),
                ','.join(bp_gene1), ','.join(bp_gene2),
                nmatch, nseq, nreg,
            )
            outReportFile.write(
                '\t'.join(['%s'] * len(fields)) % fields + '\n')
def fusion_proc_annot(inReportFileName, outReportFileName, inCnaGctFileName=None):
    """Annotate a fusion junction report, one side of the junction at a time.

    Input lines carry nine tab-separated fields: splice type, sample name,
    breakpoint 1, breakpoint 2, transcript.exon list 1, transcript.exon
    list 2, and three counts.  Breakpoints look like
    ``<strand>chr<name>:<pos>``.  The lists are comma-separated
    ``<transcript>.exon<N>/<total>`` tokens; tokens that do not match are
    ignored.

    For each side, the genes named by the transcripts are collected, plus
    any other genes overlapping the breakpoint.  The 25 output columns are:
    sample name, splice type, type (intra/inter), breakpoints 1/2,
    transcript lists 1/2, in-frame transcript pairs, CNA info 1/2, then for
    each side ``genes;breakpoint-genes``, gene info, census info, GO, KEGG
    and Biocarta, and finally the three input counts.

    Args:
        inReportFileName: path of the report to read.
        outReportFileName: path of the annotated report to write.
        inCnaGctFileName: optional copy-number file handed to
            ``mygenome.tcgaCnaDB``.  When given, each sample name must
            contain a ``TCGA-XX-XXXX`` identifier.
    """
    geneDB = mygenome.getGeneDB()
    frameInfoH = mygenome.getFrameInfoH()
    refFlatH = mygenome.loadRefFlatByChr()

    cnaDB = mygenome.tcgaCnaDB(inCnaGctFileName) if inCnaGctFileName else None

    transExonRe = re.compile(r'(.*)\.exon([0-9]*)/[0-9]*')
    bpRe = re.compile('([+-])(chr[^:]*):([0-9]*)')
    indivIdRe = re.compile('.*(TCGA-[0-9]{2}-[0-9]{4}).*')

    def side_stats(bp, teStr, indivId):
        """Collect the annotation of one junction side into a dict."""
        geneS = set()
        teL = []
        for te in teStr.split(','):
            rm = transExonRe.match(te)
            if rm:
                t = rm.group(1)
                e = int(rm.group(2))
                g = mygenome.gene(t, geneDB=geneDB)
                if g.geneName:
                    geneS.add(g.geneName)
                teL.append((t, e))

        bpMatch = bpRe.match(bp)
        bpLocus = mygenome.locus('%s:%s-%s%s' % (
            bpMatch.group(2), int(bpMatch.group(3)) - 1,
            bpMatch.group(3), bpMatch.group(1)))
        # Genes overlapping the breakpoint that the transcripts did not
        # already name.
        bp_geneS = set(bpLocus.overlappingGeneL(
            refFlatH=refFlatH, strand_sensitive=True)).difference(geneS)

        stats = {
            # Chromosome of this side.  The earlier code used bp1 for both
            # sides, which made every junction 'intra'.
            'chrom': bpMatch.group(2),
            'teL': teL, 'geneS': geneS, 'bp_geneS': bp_geneS,
            'cna': [], 'geneInfo': [], 'census': [],
            'go': set(), 'kegg': set(), 'biocarta': set(),
        }

        for geneName in list(geneS) + list(bp_geneS):
            gene = mygenome.gene(geneName, geneDB=geneDB)
            if cnaDB:
                stats['cna'].append(
                    '%s:%s' % (geneName, cnaDB.query(indivId, geneName)))
            stats['geneInfo'].append('%s:%s:%s' % (
                geneName, gene.getAttr('desc'), gene.getAttr('summary')))
            stats['census'].append('%s:%s:%s:%s' % (
                gene.getAttr('census_somatic'),
                gene.getAttr('census_germline'),
                gene.getAttr('census_mutType'),
                gene.getAttr('census_translocPartners')))
            stats['go'].update(gene.getAttr('go'))
            stats['kegg'].update(gene.getAttr('kegg'))
            stats['biocarta'].update(gene.getAttr('biocarta'))

        return stats

    def side_columns(stats):
        """Return the six per-side report columns for one junction side."""
        return (
            '%s;%s' % (','.join(stats['geneS']), ','.join(stats['bp_geneS'])),
            ';'.join(stats['geneInfo']),
            ';'.join(stats['census']),
            ';'.join(map(str, stats['go'])),
            ';'.join(map(str, stats['kegg'])),
            ';'.join(map(str, stats['biocarta'])),
        )

    with open(outReportFileName, 'w') as outReportFile, \
            open(inReportFileName) as inReportFile:

        for line in inReportFile:
            # Strip only the newline so an unterminated last line stays intact.
            (splice_type, sampN, bp1, bp2, teStr1, teStr2, nmatch, nseq,
             nreg) = line.rstrip('\n').split('\t')

            indivId = None
            if inCnaGctFileName:
                indivId = indivIdRe.match(sampN).group(1)

            side1 = side_stats(bp1, teStr1, indivId)
            side2 = side_stats(bp2, teStr2, indivId)

            if side1['chrom'] == side2['chrom']:
                juncType = 'intra'
            else:
                juncType = 'inter'

            # Keep only transcript/exon pairs that frameCons reports as 'Y'.
            frameL = []
            for (t1, e1) in side1['teL']:
                for (t2, e2) in side2['teL']:
                    cons = mygenome.frameCons(t1, e1, t2, e2, frameInfoH)
                    if cons == 'Y':
                        frameL.append(
                            '%s.%s-%s.%s:%s' % (t1, e1, t2, e2, cons))

            fields = (
                (sampN, splice_type, juncType, bp1, bp2, teStr1, teStr2,
                 ','.join(frameL),
                 ','.join(side1['cna']), ','.join(side2['cna']))
                + side_columns(side1)
                + side_columns(side2)
                + (nmatch, nseq, nreg)
            )
            outReportFile.write(
                '\t'.join(['%s'] * len(fields)) % fields + '\n')