def run(self):  ### Main run method
    '''
    Main run method. Interactively records timed pump counts per worm and writes them to self.info['OutFile'].

    After the WormPump menu (choosing 'X' returns immediately), each prompt response is appended to a flat list
    followed by the time.time() at which it was given:
    - a non-empty response starts a new worm with that (upper-cased) ID;
    - an empty response (plain <ENTER>) is one pump for the current worm;
    - "X" ends counting (no timestamp is stored for it).
    The list is then replayed to write one row per event with fields Worm, Count, WormTime (seconds since the worm
    ID was entered) and AbsTime (HH:MM:SS local time).
    << returns None. Any exception is logged with self.errorLog() and re-raised.
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        counter = ['>>']    # Event list: '>>' start marker, then response/timestamp pairs, then 'X'
        menulist = [('F','Change output file name','outfile','OutFile'),('X','Exit','return',''),('R','Run','return','')]
        mchoice = rje_menu.menu(self,'WormPump Menu',menulist,choicetext='Please select:',changecase=True,default='R')
        if mchoice == 'X': return
        self.printLog('#OUT','Output will be to %s' % self.info['OutFile'])
        self.printLog('#START','Initialising counter...')
        ### ~ [2] ~ Perform counts ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        wormid = None
        while counter[-1] != 'X':
            if wormid: counter.append(rje.choice('ID <ENTER> for new worm | X <ENTER> to exit | <ENTER> for "%s" pump count' % wormid,default='').upper())
            else: counter.append(rje.choice('ID <ENTER> for new worm | X <ENTER> to exit',default='').upper())
            if counter[-1]: wormid = counter[-1]
            if wormid == 'X': break     # Exit before a timestamp is stored: 'X' stays the last list element
            self.printLog('#WORM','Worm "%s"' % wormid)
            counter.append(time.time())
        self.deBug(counter)
        ### ~ [3] ~ Output results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        head = ['Worm','Count','WormTime','AbsTime']
        rje.delimitedFileOutput(self,self.info['OutFile'],headers=head,rje_backup=True)
        wormstart = 0.0
        wormid = None
        wtot = 0
        while counter:
            x = counter.pop(0)
            if x in ['>>','X']: continue    # Markers have no timestamp of their own
            if x:   # New worm ID: the timestamp that follows it is time zero / count zero for this worm
                wormid = x
                wormstart = counter[0]
                wx = 0
                wtot += 1
            else:   # Plain <ENTER> = one pump for the current worm
                if not wormid:
                    # Pump entered before any worm ID: drop it together with its timestamp. Leaving the timestamp
                    # in the list would make the next iteration treat it as a worm ID.
                    if counter: counter.pop(0)
                    continue
                wx += 1
            t = counter.pop(0)
            tt = time.localtime(t)
            wdata = {'Worm':wormid,'Count':wx,'WormTime':t-wormstart,
                     #'AbsTime':'%s/%s/%s %s:%s:%s' % (tt[2],tt[1],tt[0],rje.preZero(tt[3],24),rje.preZero(tt[4],60),rje.preZero(tt[5],60))}
                     'AbsTime':'%s:%s:%s' % (rje.preZero(tt[3],24),rje.preZero(tt[4],60),rje.preZero(tt[5],60))}
            rje.delimitedFileOutput(self,self.info['OutFile'],headers=head,datadict=wdata)
        self.printLog('#OUT','Counts for %d worms output to %s' % (wtot,self.info['OutFile']))
        rje.choice('<ENTER> to exit')
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible
def enrichment(self):  ### Performs final enrichment analysis on SLiMDIP and Random datasets.
    '''
    Performs final enrichment analysis on SLiMDIP and Random datasets.

    Needs the "real" predicted DMI from slimDIP() and the randomised PPI datasets from randomisePPI(). The random
    datasets are also meant to go through slimDIP() to give a background distribution for enrichment "p-values" and
    a summary file for plotting with slimdip.R.

    Requirements checked here:
    - slimdip table (else self.slimDIP() is called to make/load it).
    - randbase.XX.tdt files (else self.randomisePPI() is called once, at the first missing file).
    << returns True if the required data are in place, else False (including on any error, which is logged).
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [0a] Check for required data else run preceding step(s) ~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # Real predicted DMI: use the existing table, otherwise let slimDIP() supply it.
        if not (self.db('slimdip') or self.slimDIP()):
            return False
        # Randomised PPI datasets, named randbase.XXX.tdt.
        for randx in range(self.getInt('RandPPI')):
            padded = rje.preZero(randx,self.getInt('RandPPI')-1)
            randfile = '%s.%s.tdt' % (self.getStr('RandBase'),padded)
            if rje.exists(randfile):
                continue
            # First missing file: regenerate the random datasets and stop checking.
            if self.randomisePPI():
                break
            return False
        ### ~ [1] Perform Enrichment Analysis ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] Perform SLiMDIP analysis of each random PPI dataset ~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        return True
    except:
        self.errorLog('%s.enrichment() error' % self.prog())
        return False
def enrichment(self):  ### Performs final enrichment analysis on SLiMDIP and Random datasets.
    '''
    Performs final enrichment analysis on SLiMDIP and Random datasets.

    The "real" predicted DMI come from slimDIP(); the background comes from the randomised PPI datasets made by
    randomisePPI(), which are also intended to be run through slimDIP(). The background is used for enrichment
    "p-values" and for a summary output file suitable for histograms etc. with slimdip.R.

    This method needs:
    - slimdip table (or *.slimdip.tdt output file to load).
    - randbase.XX.tdt files.
    << returns True when both are available, else False. Errors are logged and give False.
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [0a] Check for required data else run preceding step(s) ~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # SLiMDIP table of real predicted DMI: slimDIP() is only called when the table is not already present.
        have_real = self.db('slimdip')
        if not have_real:
            have_real = self.slimDIP()
        if not have_real:
            return False
        # Randomised PPI datasets. These should be named randbase.XXX.tdt.
        for rnum in range(self.getInt('RandPPI')):
            rname = (self.getStr('RandBase'),rje.preZero(rnum,self.getInt('RandPPI')-1))
            if not rje.exists('%s.%s.tdt' % rname):
                made = self.randomisePPI()      # One call regenerates the datasets: no need to check further files
                if not made:
                    return False
                break
        ### ~ [1] Perform Enrichment Analysis ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] Perform SLiMDIP analysis of each random PPI dataset ~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        return True
    except:
        self.errorLog('%s.enrichment() error' % self.prog())
        return False
def makeFlySeq(self):  ### Main run method
    '''
    Main run method. Rebuilds FlyBase gene sequence names with CDS and exon positions relative to each gene.

    Loads the r5.5 gene, CDS and exon fasta files from the hard-coded FlyBase directory, parses the loc=, name=,
    length= and parent= annotation out of each sequence name, links every CDS/exon to its parent gene, converts the
    scaffold coordinates into positions relative to the gene start and saves the renamed genes to
    flybase_DROME.genes.fas.
    << returns None. Any error is logged through self.log.errorLog() and is not re-raised.
    '''
    def _span(pos):     ### Returns (start,end) integers from a CDS/exon location string.
        '''Returns (start,end) integers from a CDS/exon location string. complement(...) locations give start > end.'''
        found = rje.matchExp('^complement\((\d+)\..*\.(\d+)\)',pos)
        if found: (end,start) = found
        else:
            found = rje.matchExp('^join\((\d+)\..*\.(\d+)\)',pos)
            if not found: found = rje.matchExp('^(\d+)\.\.(\d+)',pos)
            (start,end) = found     # Unrecognised location format raises here, as before
        return (string.atoi(start),string.atoi(end))

    def _attach(seq,gene,pos,ltype):    ### Links seq to its parent gene and stores its scaffold position.
        '''Links seq to its parent gene, stores its scaffold position and adds it to gene.list[ltype].'''
        seq.obj['Parent'] = gene
        (start,end) = _span(pos)
        seq.opt['Complement'] = start > end     # Sequence on "lagging" strand
        seq.setStat({'Start':start,'End':end})
        gene.list[ltype].append(seq)

    def _relative(gene,ltype,glen):     ### Returns sorted unique "start-end" positions relative to gene start.
        '''Returns sorted unique zero-padded "start-end" positions of gene.list[ltype] relative to the gene start.'''
        plist = []
        for seq in gene.list[ltype]:
            if gene.opt['Complement']:      # Must subtract from "wrong" end and reverse
                start = gene.stat['Start'] - seq.stat['Start']
                end = gene.stat['Start'] - seq.stat['End']
            else:
                start = seq.stat['Start'] - gene.stat['Start']
                end = seq.stat['End'] - gene.stat['Start']
            plist.append('%s-%s' % (rje.preZero(start,glen),rje.preZero(end,glen)))
        return rje.sortUnique(plist,xreplace=False)

    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        flybase = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/')
        scmd = ['accnr=F','seqnr=F','gnspacc=F']
        genes = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-gene-r5.5.fasta' % flybase]+scmd)
        cds = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-CDS-r5.5.fasta' % flybase]+scmd)
        exons = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-exon-r5.5.fasta' % flybase]+scmd)
        ### ~ [1] ~ Read in full-length gene and note start and end positions in parent scaffold ~~~~~~~~~~~~~~~~ ###
        genedict = {}   # Dictionary of {ID:Sequence object}
        (gx,gtot) = (0.0,genes.seqNum())
        for gene in genes.seq:
            self.log.printLog('\r#GENE','Processing Gene Annotation: %.1f%%' % (gx/gtot),newline=False,log=False)
            gx += 100
            (gid,scaffold,pos,name,glen) = rje.matchExp('^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);',gene.info['Name'])
            if string.atoi(glen) != gene.aaLen(): self.log.errorLog('%s Length mismatch!' % gid, printerror=False)
            genedict[gid] = gene
            gene.setInfo({'Scaffold':scaffold,'Gene':name})
            # Gene locations are simple ranges, optionally wrapped in complement(): no join() handling here
            found = rje.matchExp('^complement\((\d+)\.\.(\d+)\)',pos)
            if found: (end,start) = found
            else: (start,end) = rje.matchExp('^(\d+)\.\.(\d+)',pos)
            (start,end) = (string.atoi(start),string.atoi(end))
            gene.opt['Complement'] = start > end    # Sequence on "lagging" strand
            gene.setStat({'Start':start,'End':end})
            gene.list['CDS'] = []       # Will add CDS sequences here
            gene.list['Exon'] = []      # Will add exon sequences here
        self.log.printLog('\r#GENE','Processing Gene Annotation complete!')
        ### ~ [2] ~ Read in associated CDS sequences and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (cx,ctot) = (0.0,cds.seqNum())
        for seq in cds.seq:
            self.log.printLog('\r#CDS','Processing CDS Annotation: %.1f%%' % (cx/ctot),newline=False,log=False)
            cx += 100
            try: (sid,scaffold,pos,name,glen,parent) = rje.matchExp('^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;',seq.info['Name'])
            except:
                self.log.errorLog(seq.info['Name'])     # Report the unparsable name, then abort via outer handler
                raise
            if string.atoi(glen) != seq.aaLen(): self.log.errorLog('%s Length mismatch!' % sid, printerror=False)
            _attach(seq,genedict[parent],pos,'CDS')
        self.log.printLog('\r#CDS','Processing CDS Annotation complete!')
        ### ~ [3] ~ Read in associated exons and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (ex,etot) = (0.0,exons.seqNum())
        for seq in exons.seq:
            self.log.printLog('\r#EXON','Processing Exon Annotation: %.1f%%' % (ex/etot),newline=False,log=False)
            ex += 100
            try: (sid,scaffold,pos,name,parent) = rje.matchExp('^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+parent=(\S+);',seq.info['Name'])
            except:
                self.log.errorLog(seq.info['Name'])
                raise
            _attach(seq,genedict[string.split(parent,',')[0]],pos,'Exon')    # First listed parent only
        self.log.printLog('\r#EXON','Processing Exon Annotation complete!')
        ### ~ [4] ~ Regenerate output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [4a] ~ Convert to relative positions and store ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        (gx,gtot) = (0.0,genes.seqNum())
        for gene in genes.seq:
            glen = gene.aaLen()
            self.log.printLog('\r#GENE','Generating new Gene Annotation: %.1f%%' % (gx/gtot),newline=False,log=False)
            gx += 100
            clist = _relative(gene,'CDS',glen)
            elist = _relative(gene,'Exon',glen)
            gene.info['Name'] = '%s_%s__%s Length=%d; CDS=%s; Exons=%s;' % (gene.info['Gene'],gene.info['SpecCode'],gene.info['AccNum'],gene.aaLen(),string.join(clist,','),string.join(elist,','))
        self.log.printLog('\r#GENE','Generating new Gene Annotation complete!')
        ## ~ [4b] ~ Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        genes.saveFasta(seqfile='flybase_DROME.genes.fas')
    except: self.log.errorLog(rje_zen.Zen().wisdom())
def alignmentToLocal(self,alignment=[],protqry=False): ### Converts alignment into local hits table
    '''
    Converts alignment into local hits table.

    The alignment text is parsed into one 'local' table entry per "Query:" record, which is then split into one
    entry per exon at each ">>>> Target Intron N >>>>" marker. The combined hit sequence of each record is added to
    self.obj['DNAHits'] and its translation to self.obj['ProtHits']. self.reduceLocal(byqry=True) is then called and
    its table renamed 'unique'. Exon 'Phase' is added to entries of both tables.
    >> alignment:list of alignment text strings parsed from exonerate output. (Copied, not modified.)
    >> protqry:bool[False] = Whether query is protein. If True, query positions are converted to nucleotide
        positions (1+3*(n-1)) for the exon splitting and converted back to protein positions before returning.
    << returns local database table.
    !! raises ValueError for a truncated alignment block, for Qry/Aln/Hit alignment strings of different lengths
        or for an unexpected {} split-codon prefix. Any error is logged with self.errorLog() and re-raised.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        vfields = ['Qry','Hit','AlnID','Score','Expect','Length','Identity','Positives','QryStart','QryEnd','HitStart','HitEnd','QrySeq','HitSeq','AlnSeq','Rank','Phase','HitStrand']
        vdb = self.db().addEmptyTable('local',vfields,['Qry','Hit','AlnID'])
        ### ~ [2] Parse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Example of the text being parsed (layout approximate):
        '''
                 Query: FAXD1_NOTSC (P82807) Venom prothrombin activator notecarin-D1 [Notechis scutatus scutatus]
                Target: ahap_PSETE__EBS10XV2AHAP187 haploidB edges=694320..157489 left=833615 right=281503 ver=1.9 style=4:[revcomp]
                 Model: protein2genome:local
             Raw score: 1170
           Query range: 19 -> 295
          Target range: 12312786 -> 12307250

               20 : AlaGluSerAsnValPheLeuLysSerLysValAlaAsnArgPheLeuGlnArg :       37
                    ..!...||| ||||||||||||||||||||||||||||||||||||||||||
                    CysSerSerLeuValPheLeuLysSerLysValAlaAsnArgPheLeuGlnArg
         12312786 : TGTTCTTCTTTAGTATTCTTAAAAAGCAAAGTGGCAAATAGATTTTTGCAAAGA : 12312735

              264 : {G} >>>> Target Intron 7 >>>> {ly}GluIleAspIleSerArg :      270
                    {|} 1304 bp {||}|||||||||||||||!!!
                    {G}++ ++{ly}GluIleAspIleSerSer
         12308652 : {G}gt.........................ag{GG}GAAATAGACATATCAAGC : 12307328

              289 : ValProProAsnTyrTyrTyr :      295
                    |||||| !!!..||| !!|||
                    ValProAlaThrTyrAspTyr
         12307273 : GTTCCTGCCACGTATGACTAT : 12307251
        '''
        qry = None
        hit = None
        alnx = {}       # {(Qry,Hit):number of alignments seen so far} -> used for AlnID
        ventry = {}     # Entry currently being built
        parsing = alignment[0:]     # Work on a copy: lines are consumed with pop(0)
        rank = 1
        while parsing:
            line = parsing.pop(0)
            #self.bugPrint(line)
            # Query: starts a new entry (the previous one, if any, is stored first)
            if rje.matchExp('Query: (\S+)',line):
                if ventry: vdb.addEntry(ventry)
                ventry = {'Qry':rje.matchExp('Query: (\S+)',line)[0],'QrySeq':'','HitSeq':'','AlnSeq':'','Rank':rank}
                rank += 1
            # Hit
            if rje.matchExp('Target: (\S+)',line):
                ventry['Hit'] = rje.matchExp('Target: (\S+)',line)[0]
                qh = (ventry['Qry'],ventry['Hit'])
                if qh in alnx: alnx[qh] += 1
                else: alnx[qh] = 1
                ventry['AlnID'] = alnx[qh]
            # Score ('core: ' matches the "Raw score:" line)
            if rje.matchExp('core: (\S+)',line): ventry['Score'] = int(rje.matchExp('core: (\S+)',line)[0])
            # Alignment: "start : sequence : end" query line opens a block; following lines are consumed here
            if rje.matchExp('^\s+(\d+) : (.+) :\s+(\d+)',line):
                adata = rje.matchExp('^\s+(\d+) : (.+) :\s+(\d+)',line)
                #self.bugPrint('= new aln: %s -> %s' % (adata[0],adata[2]))
                start = int(adata[0])
                end = int(adata[2])
                aln = adata[1]
                x = line.find(aln)      # Column of the query sequence, used to cut the match line below
                if 'QryStart' not in ventry: ventry['QryStart'] = start
                ventry['QryEnd'] = end
                ventry['QrySeq'] += aln
                #self.bugPrint('^%s$' % ventry['QrySeq'])
                line = parsing.pop(0)   # Match line: same columns as the query sequence
                #self.bugPrint(line)
                #self.bugPrint(']%s[' % aln)
                #self.bugPrint(']%s[' % line[x:x+len(aln)])
                ventry['AlnSeq'] += line[x:x+len(aln)]
                #self.debug('^%s$' % ventry['AlnSeq'])
                #self.bugPrint(parsing[0])
                # Hit line is the next "start : sequence : end" line; one non-matching line (e.g. the translated
                # hit in the example above) is skipped before giving up.
                adata = rje.matchExp('^\s+(\d+) : (.+) :\s+(\d+)',parsing.pop(0))
                if not adata:
                    #self.deBug(parsing[0])
                    adata = rje.matchExp('^\s+(\d+) : (.+) :\s+(\d+)',parsing.pop(0))
                if not adata: raise ValueError('Partial alignment! Truncated output?')
                #self.bugPrint('+ hit aln: %s -> %s' % (adata[0],adata[2]))
                start = int(adata[0])
                end = int(adata[2])
                aln = adata[1]
                if 'HitStart' not in ventry: ventry['HitStart'] = start
                ventry['HitEnd'] = end
                ventry['HitSeq'] += aln
        if ventry: vdb.addEntry(ventry)     # Store the final entry
        ## Seq Check
        for ventry in vdb.entries():
            #self.bugPrint('^%s$' % ventry['QrySeq'])
            #self.bugPrint('^%s$' % ventry['AlnSeq'])
            #self.bugPrint('^%s$' % ventry['HitSeq'])
            if len(ventry['QrySeq']) != len(ventry['AlnSeq']) or len(ventry['QrySeq']) != len(ventry['HitSeq']):
                self.debug(ventry)
                raise ValueError('Alignment sequence length mismatch! Qry:%d ; Aln:%d ; Hit:%d' % (len(ventry['QrySeq']),len(ventry['AlnSeq']),len(ventry['HitSeq'])))
        ### ~ [3] Split on introns ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.obj['DNAHits'] = rje_seqlist.SeqList(self.log,self.cmd_list+['seqin=None','seqmode=tuple','autoload=F','dna=T'])
        self.obj['ProtHits'] = rje_seqlist.SeqList(self.log,self.cmd_list+['seqin=None','seqmode=tuple','autoload=F'])
        #i# Protein Position Conversion
        if protqry:
            for ventry in vdb.entries():
                # 1->1, 2->4, 3->7 = 1+3*(n-1)
                ventry['QryStart'] = 1+3*(ventry['QryStart']-1)
                if ventry['QrySeq'].startswith('{'):    # Alignment starts with a split codon
                    codend = ventry['QrySeq'].find('}')
                    # {X} = phase 2, find = 2
                    if codend == 2: ventry['QryStart'] += 2
                    # {XX} = phase 1, find = 3
                    elif codend == 3: ventry['QryStart'] += 1
                    else: raise ValueError('QrySeq {} bracket mismatch!: %s' % ventry)
                # NOTE(review): nesting reconstructed; QryEnd is assumed to be recalculated for every entry
                ventry['QryEnd'] = ventry['QryStart'] + len(ventry['QrySeq']) - string.count(ventry['QrySeq'],'-') - 1
        vdb.newKey(['Qry','Rank','Hit','AlnID'])
        for vkey in vdb.dataKeys():
            ventry = vdb.data(vkey)
            #i# Make a combined hitseq to output to fasta
            #># phap_PSETE__EBS10XV2PHAP187.FAXD1_NOTSC.XXX
            hitname = '%s.ex%s %s %s-%s' % (ventry['Qry'],ventry['Rank'],ventry['Hit'],rje.iStr(ventry['HitStart']),rje.iStr(ventry['HitEnd']))
            hitseq = ''
            phase = (ventry['QryStart'] + 2) % 3
            alnx = 1    # Reused here as the exon (alignment block) counter for this record
            vkeyentries = [ventry]      # All entries derived from this record, for the AlnID padding below
            dirn = 1
            if ventry['HitEnd'] < ventry['HitStart']:
                dirn = -1
                ventry['HitStrand'] = '-'
            else: ventry['HitStrand'] = '+'
            for seq in ['HitSeq','QrySeq','AlnSeq']:    # Strip the split-codon braces
                ventry[seq] = string.replace(ventry[seq],'}','')
                ventry[seq] = string.replace(ventry[seq],'{','')
            while rje.matchExp('(\s+>>>> Target Intron \d+ >>>>\s+)',ventry['QrySeq']):
                intron = rje.matchExp('(\s+>>>> Target Intron \d+ >>>>\s+)',ventry['QrySeq'])[0]
                x = ventry['QrySeq'].find(intron)
                y = x + len(intron)
                intronlen = int(rje.matchExp('(\d+) bp',ventry['AlnSeq'][x:y])[0])  # "NNN bp" sits in the match line
                #i# Create a new entry of the first exon
                newentry = rje.combineDict({},ventry)
                for seq in ['HitSeq','QrySeq','AlnSeq']: newentry[seq] = newentry[seq][:x]
                newentry['AlnID'] = '%s.%d' % (ventry['AlnID'],alnx); alnx += 1
                newentry['QryEnd'] = newentry['QryStart'] + len(newentry['QrySeq']) - string.count(newentry['QrySeq'],'-') - 1
                newentry['HitEnd'] = newentry['HitStart'] + (len(newentry['HitSeq']) - string.count(newentry['HitSeq'],'-') - 1) * dirn
                newentry['Length'] = x
                newentry['Identity'] = string.count(newentry['AlnSeq'],'|')
                vkeyentries.append(vdb.addEntry(newentry))
                hitseq += newentry['HitSeq']
                #i# Update ventry to be the rest of the hit
                for seq in ['HitSeq','QrySeq','AlnSeq']: ventry[seq] = ventry[seq][y:]
                ventry['QryStart'] = newentry['QryEnd'] + 1
                if protqry: ventry['QryEnd'] = ventry['QryStart'] + len(ventry['QrySeq']) - string.count(ventry['QrySeq'],'-') - 1
                ventry['HitStart'] = newentry['HitEnd'] + intronlen * dirn
            #i# Calculate length and identity of final exon
            ventry['AlnID'] = '%s.%d' % (ventry['AlnID'],alnx)
            ventry['Length'] = len(ventry['AlnSeq'])
            ventry['Identity'] = string.count(ventry['AlnSeq'],'|')
            #i# Add sequence hits
            hitname += ' (%d alignment blocks)' % alnx
            hitseq += ventry['HitSeq']
            hitseq = string.replace(hitseq,'-','')
            protseq = rje_sequence.dna2prot('%s%s' % ('N' * phase,hitseq))  # Pad with N to restore reading frame
            self.obj['ProtHits']._addSeq(hitname,protseq)
            if ventry['HitStart'] > ventry['HitEnd']: hitseq = rje_sequence.reverseComplement(hitseq)
            self.obj['DNAHits']._addSeq(hitname,hitseq)
            #i# Update AlnID for proper float sorting
            for ventry in vkeyentries:
                (vcore,vx) = string.split(ventry['AlnID'],'.')
                ventry['AlnID'] = '%s.%s' % (vcore,rje.preZero(int(vx),alnx))
                #self.debug(ventry)
        vdb.dataFormat({'AlnID':'string'})
        vdb.remakeKeys()
        self.debug(vdb.dataKeys())
        ## Seq Check
        for ventry in vdb.entries():
            #self.bugPrint('^%s$' % ventry['QrySeq'])
            #self.bugPrint('^%s$' % ventry['AlnSeq'])
            #self.bugPrint('^%s$\n' % ventry['HitSeq'])
            if len(ventry['QrySeq']) != len(ventry['AlnSeq']) or len(ventry['QrySeq']) != len(ventry['HitSeq']):
                self.debug(ventry)
                raise ValueError('Alignment sequence length mismatch! Qry:%d ; Aln:%d ; Hit:%d' % (len(ventry['QrySeq']),len(ventry['AlnSeq']),len(ventry['HitSeq'])))
        udb = self.reduceLocal(byqry=True)
        udb.rename('unique')
        udb.newKey(['Qry','Rank','Hit','AlnID'])
        self.debug(vdb.dataKeys())
        #i# Calculate exon phase
        for ventry in vdb.entries() + udb.entries(): ventry['Phase'] = (ventry['QryStart'] - 1) % 3
        #i# Protein Position Conversion (back from nucleotide to protein positions)
        if protqry:
            for ventry in vdb.entries():
                ventry['QryStart'] = (ventry['QryStart']+2)/3
                ventry['QryEnd'] = (ventry['QryEnd']+2)/3
            for ventry in udb.entries():
                ventry['QryStart'] = (ventry['QryStart']+2)/3
                ventry['QryEnd'] = (ventry['QryEnd']+2)/3
        #vdb.remakeKeys()
        return vdb
    except: self.errorLog('%s.alignmentToLocal error' % self.prog()); raise
def seqSubset2(self):  ### Extracts sequence subset from MOUSE cDNA and Peptide libraries
    '''
    Extracts sequence subset from MOUSE cDNA and Peptide libraries.

    [1/2] Loads the Ingolia->MGI->EnsEMBL gene map from <basefile>.map.tdt if present. Otherwise it is built from
    the MGI xref/alias data and the genes of the 'starts' table, unmapped genes are written to ingolia.bad.txt and
    the Ingolia.cdna/pep.all.fa subsets are extracted from the EnsEMBL mouse libraries.
    [2] Adds the ENST IDs found in Ingolia.cdna.all.fa to the map table (when it has no ENST field yet) and saves it.
    [3] For each entry of the 'starts' table, finds the longest peptide among the mapped transcripts whose sequence
    around 'Init Codon [nt]' matches 'Init Context [-3 to +4]' and writes it and its transcript to
    IngExact.pep.all.fa / IngExact.cdna.all.fa. The updated table is saved as <basefile>.mapped_exact.tdt.
    << returns None. Errors are logged with self.errorLog() and not re-raised.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        mapfile = '%s.map.tdt' % self.baseFile()
        if os.path.exists(mapfile):
            mdb = self.db().addTable(mapfile,mainkeys=['Ingolia'],name='map')
        else:
            ### ~ [2] Load Mouse Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            xfile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.data.tdt'
            self.db().addTable(xfile,mainkeys=['Gene'],name='xref')     # Was "db.addTable": db is not defined here
            afile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.alias.tdt'
            self.obj['Map'] = rje_genemap.GeneMap(self.log,self.cmd_list)
            #self.obj['Map'].loadPickle('../../../../../Databases/DBase_120225/MGI/mousemap.120324.pickle')
            self.obj['Map'].loadData(['sourcedata=%s' % xfile,'aliases=%s' % afile])
            ing_genes = string.split(string.join(self.db('starts').index('Gene').keys()).upper())
            genemap = self.obj['Map']
            ing_map = {}
            for gene in ing_genes: ing_map[gene] = genemap.bestMap(gene)
            ing_mgi = rje.sortUnique(ing_map.values())
            self.printLog('#MUSG','%s Ingolia genes mapped onto %s MGI genes' % (rje.iLen(ing_genes),rje.iLen(ing_mgi)))
            xdb = self.db('xref')
            bad_genes = []
            for gene in ing_mgi[0:]:    # Iterate over a copy: ing_mgi is edited in the loop
                if gene not in xdb.data():
                    self.printLog('#MAP','Cannot map gene "%s" from Ingolia data!' % gene)
                    bad_genes.append(gene); ing_mgi.remove(gene)
            self.printLog('#BAD','Failed to map %s genes from Ignolia' % rje.iLen(bad_genes))
            BAD = open('ingolia.bad.txt','w')
            try: BAD.write(string.join(bad_genes))
            finally: BAD.close()
            ### ~ [2] EnsEMBL subset ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ing_musg = xdb.dataList(xdb.entryList(ing_mgi),'EnsEMBL',sortunique=True)
            if '' in ing_musg: ing_musg.remove('')
            self.printLog('#MUSG','%s Ingolia genes mapped onto %s EnsEMBL genes' % (rje.iLen(ing_genes),rje.iLen(ing_musg)))
            if not ing_musg: raise ValueError
            self.deBug(ing_musg[:10])
            for stype in ['cdna','pep']:
                seqfile = '../MOUSE/Mus_musculus.NCBIM37.66.%s.all.fa' % stype
                seqout = 'Ingolia.%s.all.fa' % stype
                # (Re)generate when forced or when the *output* is missing. The previous test looked at the input
                # file, so the subset was only ever attempted when there was nothing to read it from.
                if self.getBool('Force') or not os.path.exists(seqout):
                    # NOTE(review): 'autload=T' looks like a typo for 'autoload=T'; left unchanged - confirm
                    seqcmd = self.cmd_list + ['seqin=%s' % seqfile,'seqout=%s' % seqout,'autofilter=T','autload=T','seqmode=file','gooddesc=%s' % string.join(ing_musg,',')]
                    rje_seqlist.SeqList(self.log,seqcmd)
            # Key must be an existing field: 'Ingolia', as used when the table is loaded from file above
            mdb = self.db().addEmptyTable('map',['Ingolia','Gene','EnsEMBL'],['Ingolia'])
            for gene in ing_map:
                entry = {'Ingolia':gene,'Gene':ing_map[gene]}
                if entry['Gene'] in bad_genes: entry['EnsEMBL'] = ''
                else: entry['EnsEMBL'] = xdb.data()[ing_map[gene]]['EnsEMBL']
                mdb.addEntry(entry)
        ## ~ Map EnsEMBL transcripts onto genes ~ ##
        seqfile = 'Ingolia.cdna.all.fa'
        seqcmd = self.cmd_list + ['seqin=%s' % seqfile,'autofilter=F','autload=T','seqmode=file']
        iseq = rje_seqlist.SeqList(self.log,seqcmd)     # Needed for [3] whether or not the map was reloaded
        if 'ENST' not in mdb.fields():
            # Only build the ENST lists once: a reloaded map that already has them must not be appended to again
            mdb.addField('ENST',evalue='')
            while iseq.nextSeq():
                (iname,icdna) = iseq.getSeq()
                musg = rje.matchExp('gene:(\S+)',iname)[0]
                for entry in mdb.indexEntries('EnsEMBL',musg):
                    if entry['ENST']: entry['ENST'] += ',%s' % string.split(iname)[0]
                    else: entry['ENST'] = string.split(iname)[0]
            mdb.saveToFile()
        ### ~ [3] Generate new start sites from Ignolia Harrington data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        sdb = self.db('starts')
        sdb.dataFormat({'Init Codon [nt]':'int'})
        icod = 'Init Codon [nt]'
        icon = 'Init Context [-3 to +4]'
        sdb.info['Name'] = 'mapped_start'
        sdb.addField('ENST'); sdb.addField('ENSP'); sdb.addField('ENSI')
        ex = 0.0; etot = sdb.entryNum(); sx = 0; fx = 0
        minpep = 20     # Minimum peptide length to output
        ENST = open('IngExact.cdna.all.fa','w')
        ENSP = open('IngExact.pep.all.fa','w')
        try:
            for entry in sdb.entries():
                self.progLog('\r#ING','Mapping Ignolia Harrington Starts: %.2f%%' % (ex/etot)); ex += 100.0
                #self.deBug(entry)
                entry[icon] = entry[icon].upper()
                gene = entry['Gene'].upper()
                mentry = mdb.data(gene)
                entry['ENST'] = entry['ENSI'] = ''
                cdnaseq = peptseq = ''
                if not mentry or not mentry['ENST']: fx += 1; continue
                #self.deBug(mentry)
                mtype = 'fail'
                for trans in string.split(mentry['ENST'],','):
                    (tname,tseq) = iseq.getDictSeq(trans,format='tuple')
                    self.deBug('%s vs %s' % (tseq[entry[icod]-3:][:7],entry[icon]))
                    if tseq[entry[icod]-3:][:7] == entry[icon]:     # 7 nt context around the start must match
                        ipept = string.split(rje_sequence.dna2prot(tseq[entry[icod]:]),'*')[0]
                        self.deBug(ipept)
                        if len(ipept) > len(peptseq):   # Keep the transcript giving the longest peptide
                            entry['ENST'] = trans
                            cdnaseq = tseq
                            peptseq = ipept
                            mtype = 'exact'
                if not entry['ENST']:
                    self.printLog('\r#ING','Unable to find Harrington start for %s %s (%s)' % (gene,entry[icod],entry[icon]),screen=False)
                    fx += 1; continue
                elif len(peptseq) < minpep:
                    self.printLog('\r#ING','Peptide from mapped Harrington start for %s %s (%s) too short!' % (gene,entry[icod],entry[icon]),screen=False)
                    fx += 1; continue
                ingid = rje.preZero(int(ex/100),etot)   # ex was already advanced: 1-based entry number
                entry['ENSI'] = 'ENSINGT%s' % ingid
                entry['ENSP'] = 'ENSINGP%s' % ingid
                ENST.write('>ENSINGT%s mtype:%s enst:%s gene:%s ingolia:%s mgi:%s\n%s\n' % (ingid,mtype,entry['ENST'],mentry['EnsEMBL'],entry['Gene'],mentry['Gene'],cdnaseq))
                ENSP.write('>ENSINGP%s mtype:%s enst:%s gene:%s transcript:ENSINGT%s ingolia:%s mgi:%s\n%s\n' % (ingid,mtype,entry['ENST'],mentry['EnsEMBL'],ingid,entry['Gene'],mentry['Gene'],peptseq))
                sx += 1
            sdb.saveToFile('%s.mapped_exact.tdt' % self.baseFile())
        finally:
            ENST.close(); ENSP.close()      # Always release the output files, also on error
        self.printLog('\r#ING','Output %s Ingolia peptides and transcripts. %s failed.' % (rje.iStr(sx),rje.iStr(fx)))
        return
    except: self.errorLog('%s.method error' % self)
def run(self):  ### Main run method
    '''
    Main run method. Interactively records timed pump counts per worm and writes them to self.info['OutFile'].

    After the WormPump menu (choosing 'X' returns immediately), each prompt response is appended to a flat list
    followed by the time.time() at which it was given:
    - a non-empty response starts a new worm with that (upper-cased) ID;
    - an empty response (plain <ENTER>) is one pump for the current worm;
    - "X" ends counting (no timestamp is stored for it).
    The list is then replayed to write one row per event with fields Worm, Count, WormTime (seconds since the worm
    ID was entered) and AbsTime (HH:MM:SS local time).
    << returns None. Any exception is logged with self.errorLog() and re-raised.
    '''
    try:  ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        counter = ['>>']  # Event list: '>>' start marker, then response/timestamp pairs, then 'X'
        menulist = [('F', 'Change output file name', 'outfile', 'OutFile'),
                    ('X', 'Exit', 'return', ''), ('R', 'Run', 'return', '')]
        mchoice = rje_menu.menu(self, 'WormPump Menu', menulist, choicetext='Please select:',
                                changecase=True, default='R')
        if mchoice == 'X': return
        self.printLog('#OUT', 'Output will be to %s' % self.info['OutFile'])
        self.printLog('#START', 'Initialising counter...')
        ### ~ [2] ~ Perform counts ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        wormid = None
        while counter[-1] != 'X':
            if wormid:
                counter.append(rje.choice(
                    'ID <ENTER> for new worm | X <ENTER> to exit | <ENTER> for "%s" pump count' % wormid,
                    default='').upper())
            else:
                counter.append(rje.choice('ID <ENTER> for new worm | X <ENTER> to exit', default='').upper())
            if counter[-1]: wormid = counter[-1]
            if wormid == 'X': break  # Exit before a timestamp is stored: 'X' stays the last list element
            self.printLog('#WORM', 'Worm "%s"' % wormid)
            counter.append(time.time())
        self.deBug(counter)
        ### ~ [3] ~ Output results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        head = ['Worm', 'Count', 'WormTime', 'AbsTime']
        rje.delimitedFileOutput(self, self.info['OutFile'], headers=head, rje_backup=True)
        wormstart = 0.0
        wormid = None
        wtot = 0
        while counter:
            x = counter.pop(0)
            if x in ['>>', 'X']: continue  # Markers have no timestamp of their own
            if x:  # New worm ID: the timestamp that follows it is time zero / count zero for this worm
                wormid = x
                wormstart = counter[0]
                wx = 0
                wtot += 1
            else:  # Plain <ENTER> = one pump for the current worm
                if not wormid:
                    # Pump entered before any worm ID: drop it together with its timestamp. Leaving the timestamp
                    # in the list would make the next iteration treat it as a worm ID.
                    if counter: counter.pop(0)
                    continue
                wx += 1
            t = counter.pop(0)
            tt = time.localtime(t)
            wdata = {
                'Worm': wormid,
                'Count': wx,
                'WormTime': t - wormstart,
                #'AbsTime':'%s/%s/%s %s:%s:%s' % (tt[2],tt[1],tt[0],rje.preZero(tt[3],24),rje.preZero(tt[4],60),rje.preZero(tt[5],60))}
                'AbsTime': '%s:%s:%s' % (rje.preZero(tt[3], 24), rje.preZero(tt[4], 60), rje.preZero(tt[5], 60))
            }
            rje.delimitedFileOutput(self, self.info['OutFile'], headers=head, datadict=wdata)
        self.printLog('#OUT', 'Counts for %d worms output to %s' % (wtot, self.info['OutFile']))
        rje.choice('<ENTER> to exit')
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise  # Delete this if method error not terrible
def qsub(self):  ### Creates job and calls with qsub
    '''
    Creates job and calls with qsub. Returns qsub job ID or 0 if jobwait=True and job completed.

    A PBS job script <Job>.job is written from the Walltime/Nodes/PPN/VMem settings, optional email and module
    lines, the PreCall commands, a cd to QPath and the program call. It is then submitted with qsub (optionally
    with -S /bin/bash and -W depend=afterany:... for the Depend list) and the queue is queried once after Pause
    seconds. With JobWait, the method blocks until <Job>.o<qid> appears or the job leaves qstat.
    << returns:
        - self.report() if the Report option is set (no job is created);
        - False in test mode (script is printed, not submitted) or if an error was logged;
        - 0 if JobWait and the last line of the job output contains 'Job complete';
        - the qsub job ID otherwise.
    '''
    try:  ### Basics ###
        hr = int(self.stat['Walltime'])
        mins = int((0.5 + (self.stat['Walltime'] - hr) * 60.0))
        if mins >= 60:  # Rounding can reach 60 minutes (e.g. walltime=1.999): carry into the hour
            hr += 1
            mins -= 60
        if self.opt['Report']: return self.report()
        jobstr = string.replace('%s.job' % self.info['Job'], '.job', '')
        jlist = [
            '#!/bin/bash',
            '#PBS -N %s' % jobstr,  #,'#PBS -q batch',
            '#PBS -l nodes=%d:ppn=%d' % (self.stat['Nodes'], self.stat['PPN']),
            '#PBS -l walltime=%d:%s:00' % (hr, rje.preZero(mins, 60)),
            '#PBS -l vmem=%dgb' % self.getInt('VMem'),
            '#PBS -l mem=%dgb' % self.getInt('VMem'),
            ''
        ]  #10
        #if not os.popen('hostname').read().startswith('katana.science.unsw.edu.au'):
        #    jlist[-2] = '#PBS -l mem=%dgb' % self.getInt('VMem')
        if self.getBool('Monitor'):
            if self.getBool('JobWait'):
                self.warnLog('Cannot run with wait=T and monitor=T: switched monitor=F')
                self.setBool({'Monitor': False})
            else:
                jlist += ['#PBS -k oed']
        if self.getStr('Email'):
            jlist += ['#PBS -M %s' % self.getStr('Email'), '#PBS -m ae']
            if self.getBool('MailStart'): jlist[-1] = '#PBS -m bae'
        jlist += [
            '### Define number of processors',
            'NPROCS=`wc -l < $PBS_NODEFILE`',
            'echo Running on host `hostname`',
            'echo Time is `date`',
            'echo Directory is `pwd`',  #2
            'echo This jobs runs on the following processors:',
            'echo `cat $PBS_NODEFILE`',
            '',  #5
            'echo This job has allocated $NPROCS cpus',
            ''
        ]
        self.printLog('#PPN', '%d Node(s) requested: %d PPN.' % (self.getInt('Nodes'), self.getInt('PPN')))
        self.printLog('#VMEM', '%s GB VMem requested.' % (self.getStat('VMem')))
        if self.getBool('ModPurge'):
            jlist.append('module purge')
            self.printLog('#MOD', 'Modules purged (modpurge=T)')
        for mod in self.list['Modules']:
            if mod.lower() not in ['', 'none']: jlist.append('module add %s' % mod)
        if self.list['Modules']:
            self.printLog('#MOD', 'Modules added: %s' % string.join(self.list['Modules'], '; '))
        for pcall in self.list['PreCall']:
            self.printLog('#PCALL', pcall)
            jlist.append(pcall)
        #x#jlist = ['#!/bin/sh']   # New Iridis shell script method!
        ### Directory & Program ###
        jlist.append('cd %s' % self.info['QPath'])
        pcall = self.info['Program']
        if self.opt['RjePy']: pcall = 'python ' + self.info['PyPath'] + pcall
        jlist.append(pcall)
        ### Completion message
        jlist += ['', 'echo ---', 'qstat -f $PBS_JOBID', 'echo ---']
        jlist += ['', 'echo', 'echo Time is `date`', 'echo Job complete']   # Last line is tested for below
        ### Output and call ###
        job = '{0}.job'.format(jobstr)  #string.replace('%s.job' % self.info['Job'],'.job.job','.job')
        JOB = open(job, 'w')
        try: JOB.write(string.join(jlist, '\n'))
        finally: JOB.close()    # Script must be fully written and closed before qsub reads it
        self.printLog('#DIR', self.info['QPath'])
        self.printLog('#RUN', pcall)
        #qsub = 'qsub %s -S /bin/sh -l walltime=%d:%d:00,nodes=%d:ppn=2' % (job,hr,min,self.stat['Nodes'])
        qsub = 'qsub'
        if self.getBool('StartBash'): qsub += ' -S /bin/bash'
        if self.list['Depend']:
            qsub += ' -W depend=afterany'
            #for id in self.list['Depend']: qsub += ':%s.bio-server' % id
            myhost = self.getStr('DependHPC')
            if not self.getStrLC('DependHPC'): myhost = string.split(os.popen('hostname').read())[0]
            for depid in self.list['Depend']: qsub += ':%s.%s' % (depid, myhost)
        qsub += ' %s' % (job)
        self.printLog('#JOB', qsub)
        if self.test():
            self.printLog('#TEST', 'Test mode: will not place job in queue.')
            self.verbose(0, 1, string.join(['>>>>>'] + jlist + ['<<<<<', ''], '\n'))
            return False
        qrun = os.popen(qsub).read()
        self.printLog('#QSUB', qrun)
        qid = string.split(qrun, '.')[0]    # Text before the first '.' of the qsub output
        showstart = 'qstat -T'
        if os.popen('hostname').read().startswith('katana.science.unsw.edu.au'): showstart = 'showstart'
        self.printLog('#SHOW', 'Attempt %s %s in %s sec' % (showstart, qrun, self.stat['Pause']), log=False)
        time.sleep(self.stat['Pause'])
        for qline in os.popen('%s %s' % (showstart, qrun)):  #qid):
            if rje.chomp(qline): self.printLog('#INFO', qline, timeout=False)
        ### Wait for job to be completed
        if self.getBool('JobWait'):
            if self.getBool('Monitor'): raise ValueError('Cannot run with wait=T and monitor=T')
            self.printLog('#WAIT', 'Waiting for job {0} to finish'.format(qid))
            ofile = '{0}.o{1}'.format(string.replace('%s.job' % self.info['Job'], '.job', ''), qid)
            running = False
            while not rje.exists(ofile):
                qstat = string.atoi(os.popen("qstat | grep '^{0}' -c".format(qid)).read().split()[0])
                if not qstat:
                    self.printLog('#QSTAT', 'Job {0} disappeared from qstat'.format(qid))
                    break
                elif not running:
                    try:
                        # Fifth whitespace-delimited field of the job's qstat line.
                        # NOTE(review): assumed to be the job state column - confirm for the local qstat.
                        # Previously wrapped in string.split(), giving a list that never equalled 'R'.
                        qstate = os.popen("qstat | grep '^{0}'".format(qid)).read().split()[4]
                        if qstate == 'R':
                            running = True
                            self.printLog('#QSTAT', 'Job {0} running...'.format(qid))
                    except Exception: pass  # Best effort only: job may vanish from qstat between the two calls
                time.sleep(max(1, self.getInt('Pause')))
            owait = 300     # Allow up to 300 seconds for the output file to appear after the job has gone
            while owait and not rje.exists(ofile):
                owait -= 1
                time.sleep(1)
            if rje.exists(ofile):
                if 'Job complete' in os.popen('tail -n 1 {0}'.format(ofile)).read():
                    self.printLog('#DONE', '{0} job ({1}) complete.'.format(jobstr, qid))
                    return 0
                else:
                    self.printLog('#FAIL', '{0} job ({1}) failed to finish.'.format(jobstr, qid))
                    return qid
            else:
                self.printLog('#FAIL', '{0} job ({1}) failed to generate {2}.'.format(jobstr, qid, ofile))
        return qid
    except:
        self.errorLog('Error in qsub()')
        return False
def makeFlySeq(self):  ### Builds FlyBase gene fasta with relative CDS and exon positions in the names
    '''
    Builds a FlyBase gene fasta file whose sequence names carry relative CDS and exon positions.

    The gene, CDS and exon fasta files (release r5.5) are read from a hard-coded FlyBase directory.
    The "loc=", "name=", "length=" and "parent=" fields of each sequence name are parsed, every
    CDS/exon is attached to its parent gene, and its coordinates are converted into offsets from the
    gene start. Each gene is then renamed to
    "<Gene>_<SpecCode>__<AccNum> Length=N; CDS=...; Exons=...;" and the genes are saved to
    "flybase_DROME.genes.fas".

    Takes no arguments and returns None. Any error is reported through self.log.errorLog() and is
    not re-raised.
    '''
    def parseSpan(pos, patterns):  ### Returns (start,end) integers parsed from a FlyBase location
        '''
        Returns (start,end) integers parsed from a FlyBase location string.
        >> pos:str = location text, e.g. "complement(10..20)"
        >> patterns:list of (regex,swapped) tuples, tried in order. If swapped, the regex groups are
           (end,start) rather than (start,end), so that start > end marks the complementary strand.
        << (start,end) tuple of integers. Raises ValueError if no pattern yields two groups.
        '''
        for (pattern, swapped) in patterns:
            try:
                (first, second) = rje.matchExp(pattern, pos)
            except Exception:
                # The failure mode of matchExp is not visible here: any ordinary failure to deliver
                # two groups means "try the next pattern". Unlike a bare except, this does not
                # swallow KeyboardInterrupt or SystemExit.
                continue
            if swapped:
                return (int(second), int(first))
            return (int(first), int(second))
        raise ValueError('Cannot parse location "%s"' % pos)

    def relativeSpans(gene, parts, glen):  ### Returns sorted unique "start-end" offsets within gene
        '''
        Returns the sorted, unique "start-end" strings of parts, relative to the gene start.
        >> gene:Sequence object with stat['Start'] and opt['Complement'] set
        >> parts:list of Sequence objects (CDS or exons) with stat['Start'] and stat['End'] set
        >> glen:int = gene length, passed to rje.preZero() when formatting each offset
        '''
        origin = gene.stat['Start']
        spans = []
        for part in parts:
            if gene.opt['Complement']:  # Must subtract from the "wrong" end and reverse
                start = origin - part.stat['Start']
                end = origin - part.stat['End']
            else:
                start = part.stat['Start'] - origin
                end = part.stat['End'] - origin
            spans.append('%s-%s' % (rje.preZero(start, glen), rje.preZero(end, glen)))
        return rje.sortUnique(spans, xreplace=False)

    # Location patterns. Raw strings keep the regex text identical without invalid escape sequences.
    simple = (r'^(\d+)\.\.(\d+)', False)
    gene_patterns = [(r'^complement\((\d+)\.\.(\d+)\)', True), simple]
    part_patterns = [(r'^complement\((\d+)\..*\.(\d+)\)', True),
                     (r'^join\((\d+)\..*\.(\d+)\)', False),
                     simple]
    try:
        ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        flybase = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/')
        scmd = ['accnr=F', 'seqnr=F', 'gnspacc=F']
        genes = rje_seq.SeqList(
            self.log, self.cmd_list + ['seqin=%sdmel-all-gene-r5.5.fasta' % flybase] + scmd)
        cds = rje_seq.SeqList(
            self.log, self.cmd_list + ['seqin=%sdmel-all-CDS-r5.5.fasta' % flybase] + scmd)
        exons = rje_seq.SeqList(
            self.log, self.cmd_list + ['seqin=%sdmel-all-exon-r5.5.fasta' % flybase] + scmd)

        ### ~ [1] ~ Read in full-length genes and note start and end positions in parent scaffold ~~~~~~~~~~~~ ###
        genedict = {}  # Dictionary of {ID:Sequence object}
        (gx, gtot) = (0.0, genes.seqNum())
        for gene in genes.seq:
            self.log.printLog('\r#GENE', 'Processing Gene Annotation: %.1f%%' % (gx / gtot),
                              newline=False, log=False)
            gx += 100
            try:
                (geneid, scaffold, pos, name, glen) = rje.matchExp(
                    r'^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);', gene.info['Name'])
            except Exception:
                # Report the offending header, as the CDS and exon loops below do.
                self.log.errorLog(gene.info['Name'])
                raise
            if int(glen) != gene.aaLen():
                self.log.errorLog('%s Length mismatch!' % geneid, printerror=False)
            genedict[geneid] = gene
            gene.setInfo({'Scaffold': scaffold, 'Gene': name})
            (start, end) = parseSpan(pos, gene_patterns)
            gene.opt['Complement'] = start > end  # Sequence on "lagging" strand
            gene.setStat({'Start': start, 'End': end})
            gene.list['CDS'] = []  # Will add CDS sequences here
            gene.list['Exon'] = []  # Will add exon sequences here
        self.log.printLog('\r#GENE', 'Processing Gene Annotation complete!')

        ### ~ [2] ~ Read in associated CDS sequences and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~ ###
        (cx, ctot) = (0.0, cds.seqNum())
        for seq in cds.seq:
            self.log.printLog('\r#CDS', 'Processing CDS Annotation: %.1f%%' % (cx / ctot),
                              newline=False, log=False)
            cx += 100
            try:
                (seqid, scaffold, pos, name, glen, parent) = rje.matchExp(
                    r'^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;',
                    seq.info['Name'])
            except Exception:
                self.log.errorLog(seq.info['Name'])
                raise
            if int(glen) != seq.aaLen():
                self.log.errorLog('%s Length mismatch!' % seqid, printerror=False)
            seq.obj['Parent'] = gene = genedict[parent]
            (start, end) = parseSpan(pos, part_patterns)
            seq.opt['Complement'] = start > end  # Sequence on "lagging" strand
            seq.setStat({'Start': start, 'End': end})
            gene.list['CDS'].append(seq)
        self.log.printLog('\r#CDS', 'Processing CDS Annotation complete!')

        ### ~ [3] ~ Read in associated exons and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (ex, etot) = (0.0, exons.seqNum())
        for seq in exons.seq:
            self.log.printLog('\r#EXON', 'Processing Exon Annotation: %.1f%%' % (ex / etot),
                              newline=False, log=False)
            ex += 100
            try:
                (seqid, scaffold, pos, name, parent) = rje.matchExp(
                    r'^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+parent=(\S+);', seq.info['Name'])
            except Exception:
                self.log.errorLog(seq.info['Name'])
                raise
            # Exons may list several parents: only the first is used.
            seq.obj['Parent'] = gene = genedict[parent.split(',')[0]]
            (start, end) = parseSpan(pos, part_patterns)
            seq.opt['Complement'] = start > end  # Sequence on "lagging" strand
            seq.setStat({'Start': start, 'End': end})
            gene.list['Exon'].append(seq)
        self.log.printLog('\r#EXON', 'Processing Exon Annotation complete!')

        ### ~ [4] ~ Regenerate output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [4a] ~ Convert to relative positions and store ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        (gx, gtot) = (0.0, genes.seqNum())
        for gene in genes.seq:
            glen = gene.aaLen()
            self.log.printLog('\r#GENE', 'Generating new Gene Annotation: %.1f%%' % (gx / gtot),
                              newline=False, log=False)
            gx += 100
            clist = relativeSpans(gene, gene.list['CDS'], glen)
            elist = relativeSpans(gene, gene.list['Exon'], glen)
            gene.info['Name'] = '%s_%s__%s Length=%d; CDS=%s; Exons=%s;' % (
                gene.info['Gene'], gene.info['SpecCode'], gene.info['AccNum'], glen,
                ','.join(clist), ','.join(elist))
        self.log.printLog('\r#GENE', 'Generating new Gene Annotation complete!')
        ## ~ [4b] ~ Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        genes.saveFasta(seqfile='flybase_DROME.genes.fas')
    except:
        # File-wide convention: the method boundary logs everything through errorLog.
        self.log.errorLog(rje_zen.Zen().wisdom())
def seqSubset2(self):  ### Extracts sequence subset from MOUSE cDNA and Peptide libraries
    '''
    Extracts sequence subset from MOUSE cDNA and Peptide libraries.

    1. Loads the 'map' table from "<basefile>.map.tdt" if that file exists. Otherwise the genes of
       the 'starts' table are mapped onto MGI genes (rje_genemap) and EnsEMBL genes ('xref' table),
       the matching cDNA/peptide sequences are extracted into "Ingolia.<type>.all.fa", and a new
       'map' table (Ingolia, Gene, EnsEMBL) is built. Unmappable genes go to "ingolia.bad.txt".
    2. Adds an 'ENST' field to the map (if missing), listing the transcripts of
       "Ingolia.cdna.all.fa" for each EnsEMBL gene, and saves the map.
    3. For each 'starts' entry, finds the transcript whose sequence around the initiation codon
       matches the recorded context and gives the longest peptide. Matched transcripts and peptides
       are written to "IngExact.cdna.all.fa" and "IngExact.pep.all.fa", and the updated table is
       saved as "<basefile>.mapped_exact.tdt".

    Takes no arguments and returns None. Any error is reported through self.errorLog() and is not
    re-raised.
    '''
    def longestExact(seqlist, transcripts, codon, context):  ### Best exact-context transcript
        '''
        Returns (transcript, cDNA, peptide) for the exact-context match with the longest peptide.
        >> seqlist:SeqList object providing getDictSeq()
        >> transcripts:list of transcript identifiers to test
        >> codon:int = initiation codon position used to slice the transcript sequence
        >> context:str = expected sequence from 3 before the codon position (7 characters compared)
        << ('','','') if no transcript matches with a non-empty peptide
        '''
        best = ('', '', '')
        for trans in transcripts:
            (tname, tseq) = seqlist.getDictSeq(trans, format='tuple')
            found = tseq[codon - 3:][:7]
            self.deBug('%s vs %s' % (found, context))
            if found != context:
                continue
            ipept = rje_sequence.dna2prot(tseq[codon:]).split('*')[0]  # Translate to first stop
            self.deBug(ipept)
            if len(ipept) > len(best[2]):
                best = (trans, tseq, ipept)
        return best

    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        mapfile = '%s.map.tdt' % self.baseFile()
        if os.path.exists(mapfile):
            mdb = self.db().addTable(mapfile, mainkeys=['Ingolia'], name='map')
        else:
            ### ~ [2] Load Mouse Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            xfile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.data.tdt'
            afile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.alias.tdt'
            # Was "db.addTable(...)": "db" is not defined in this method, so use the Database object.
            self.db().addTable(xfile, mainkeys=['Gene'], name='xref')
            genemap = self.obj['Map'] = rje_genemap.GeneMap(self.log, self.cmd_list)
            genemap.loadData(['sourcedata=%s' % xfile, 'aliases=%s' % afile])
            # Upper-case gene names (split on whitespace, as before)
            ing_genes = ' '.join(self.db('starts').index('Gene').keys()).upper().split()
            ing_map = {}
            for gene in ing_genes:
                ing_map[gene] = genemap.bestMap(gene)
            ing_mgi = rje.sortUnique(ing_map.values())
            self.printLog('#MUSG', '%s Ingolia genes mapped onto %s MGI genes'
                          % (rje.iLen(ing_genes), rje.iLen(ing_mgi)))
            xdb = self.db('xref')
            bad_genes = []
            for gene in ing_mgi[:]:  # Iterate over a copy: failures are removed from ing_mgi
                if gene not in xdb.data():
                    self.printLog('#MAP', 'Cannot map gene "%s" from Ingolia data!' % gene)
                    bad_genes.append(gene)
                    ing_mgi.remove(gene)
            self.printLog('#BAD', 'Failed to map %s genes from Ignolia' % rje.iLen(bad_genes))
            with open('ingolia.bad.txt', 'w') as badfile:
                badfile.write(' '.join(bad_genes))

            ### ~ [2] EnsEMBL subset ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ing_musg = xdb.dataList(xdb.entryList(ing_mgi), 'EnsEMBL', sortunique=True)
            if '' in ing_musg:
                ing_musg.remove('')
            self.printLog('#MUSG', '%s Ingolia genes mapped onto %s EnsEMBL genes'
                          % (rje.iLen(ing_genes), rje.iLen(ing_musg)))
            if not ing_musg:
                raise ValueError('No Ingolia genes mapped onto EnsEMBL genes')
            self.deBug(ing_musg[:10])
            for stype in ['cdna', 'pep']:
                seqfile = '../MOUSE/Mus_musculus.NCBIM37.66.%s.all.fa' % stype
                seqout = 'Ingolia.%s.all.fa' % stype
                # Regenerate when forced or when the *output* is missing. The original tested the
                # input file, so extraction only ran when there was nothing to read.
                if self.getBool('Force') or not os.path.exists(seqout):
                    # NOTE(review): the option was spelt 'autload=T'; assumed to be a typo for
                    # 'autoload=T' - confirm against the rje_seqlist option list.
                    seqcmd = self.cmd_list + [
                        'seqin=%s' % seqfile, 'seqout=%s' % seqout, 'autofilter=T', 'autoload=T',
                        'seqmode=file', 'gooddesc=%s' % ','.join(ing_musg)
                    ]
                    rje_seqlist.SeqList(self.log, seqcmd)
            # Key field must be one of the table fields ('Ingolia', not 'Ignolia'), matching the
            # mainkeys used when the table is reloaded from file.
            mdb = self.db().addEmptyTable('map', ['Ingolia', 'Gene', 'EnsEMBL'], ['Ingolia'])
            for gene in ing_map:
                mgi = ing_map[gene]
                entry = {'Ingolia': gene, 'Gene': mgi}
                if mgi in bad_genes:
                    entry['EnsEMBL'] = ''
                else:
                    entry['EnsEMBL'] = xdb.data()[mgi]['EnsEMBL']
                mdb.addEntry(entry)

        ## ~ Load extracted cDNA and add transcript IDs to the map ~ ##
        seqcmd = self.cmd_list + [
            'seqin=%s' % 'Ingolia.cdna.all.fa', 'autofilter=F', 'autoload=T', 'seqmode=file'
        ]
        iseq = rje_seqlist.SeqList(self.log, seqcmd)
        if 'ENST' not in mdb.fields():
            # Only populate (and re-save) when the column is absent, so that a map reloaded from
            # file does not have the same transcripts appended a second time.
            mdb.addField('ENST', evalue='')
            while iseq.nextSeq():
                (iname, icdna) = iseq.getSeq()
                musg = rje.matchExp(r'gene:(\S+)', iname)[0]
                enst = iname.split()[0]
                for entry in mdb.indexEntries('EnsEMBL', musg):
                    if entry['ENST']:
                        entry['ENST'] += ',%s' % enst
                    else:
                        entry['ENST'] = enst
            mdb.saveToFile()

        ### ~ [3] Generate new start sites from Ignolia Harrington data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        icod = 'Init Codon [nt]'
        icon = 'Init Context [-3 to +4]'
        sdb = self.db('starts')
        sdb.dataFormat({icod: 'int'})
        sdb.info['Name'] = 'mapped_start'
        for field in ['ENST', 'ENSP', 'ENSI']:
            sdb.addField(field)
        etot = sdb.entryNum()
        sx = 0  # Number of starts output
        fx = 0  # Number of starts that failed
        minpep = 20  # Minimum peptide length for output
        mtype = 'exact'  # Only exact context matches reach the output
        # Context managers close both fasta files even if mapping raises part-way through.
        with open('IngExact.cdna.all.fa', 'w') as cdna_out:
            with open('IngExact.pep.all.fa', 'w') as pep_out:
                for (ecount, entry) in enumerate(sdb.entries(), 1):
                    self.progLog('\r#ING', 'Mapping Ignolia Harrington Starts: %.2f%%'
                                 % (100.0 * (ecount - 1) / etot))
                    entry[icon] = entry[icon].upper()
                    gene = entry['Gene'].upper()
                    mentry = mdb.data(gene)
                    entry['ENST'] = entry['ENSI'] = ''
                    if not mentry or not mentry['ENST']:
                        fx += 1
                        continue
                    (entry['ENST'], cdnaseq, peptseq) = longestExact(
                        iseq, mentry['ENST'].split(','), entry[icod], entry[icon])
                    if not entry['ENST']:
                        self.printLog('\r#ING', 'Unable to find Harrington start for %s %s (%s)'
                                      % (gene, entry[icod], entry[icon]), screen=False)
                        fx += 1
                        continue
                    if len(peptseq) < minpep:
                        self.printLog(
                            '\r#ING',
                            'Peptide from mapped Harrington start for %s %s (%s) too short!'
                            % (gene, entry[icod], entry[icon]), screen=False)
                        fx += 1
                        continue
                    ingid = rje.preZero(ecount, etot)  # Identifier from the 1-based entry number
                    entry['ENSI'] = 'ENSINGT%s' % ingid
                    entry['ENSP'] = 'ENSINGP%s' % ingid
                    cdna_out.write(
                        '>ENSINGT%s mtype:%s enst:%s gene:%s ingolia:%s mgi:%s\n%s\n'
                        % (ingid, mtype, entry['ENST'], mentry['EnsEMBL'], entry['Gene'],
                           mentry['Gene'], cdnaseq))
                    pep_out.write(
                        '>ENSINGP%s mtype:%s enst:%s gene:%s transcript:ENSINGT%s ingolia:%s mgi:%s\n%s\n'
                        % (ingid, mtype, entry['ENST'], mentry['EnsEMBL'], ingid, entry['Gene'],
                           mentry['Gene'], peptseq))
                    sx += 1
        sdb.saveToFile('%s.mapped_exact.tdt' % self.baseFile())
        self.printLog('\r#ING', 'Output %s Ingolia peptides and transcripts. %s failed.'
                      % (rje.iStr(sx), rje.iStr(fx)))
        return
    except:
        # File-wide convention: the method boundary logs everything through errorLog.
        self.errorLog('%s.method error' % self)