def testparallelcompletion(run_name, projectdir):
    """Record that one parallel job finished and flag the stage when all are done.

    Touches ``<projectdir>/status/<run_name>.done`` and then compares the
    number of finished jobs of the stage (``run_name[0:2]``) with the number
    of jobs expected for it.  When they match, ``status/done.<stage>`` is
    touched too.  A ``'<stage>: x of y completed'`` message is reported
    through ``common.printstatus``.

    Expected job counts:
        p0, p2, p5, p7, p9 (exact run name): a single job.
        p1, p3: one per line of ``list/alllist.txt``.
        p4: one per line of ``list/chrlist.txt``.
        p6: one per line of ``list/allpair.txt``.
        p8: lines of ``list/allpair.txt`` times entries of ``list/splitgene``.

    Args:
        run_name: name of the job that just completed.
        projectdir: project root holding the ``status`` and ``list`` folders.
    """

    def touch(path):
        # Same effect as the shell ``touch`` without building a shell command,
        # so paths with spaces or shell metacharacters are handled correctly.
        try:
            with open(path, 'a'):
                os.utime(path, None)
        except (IOError, OSError):
            message = 'Could not touch %s' % path
            common.printstatus(message, 'W', common.func_name())

    def count_lines(path):
        # Close the list file instead of leaking the handle.
        with open(path) as handle:
            return sum(1 for _ in handle)

    stage = run_name[0:2]
    statusdir = '%s/status' % projectdir
    listdir = '%s/list' % projectdir
    touch('%s/%s.done' % (statusdir, run_name))

    numprocs = None
    numdone = None
    if run_name in ['p0', 'p2', 'p5', 'p7', 'p9']:
        numprocs = 1
        numdone = 1
    # Pause before counting the status files of the stage.
    time.sleep(5)

    if stage in ('p1', 'p3'):
        numprocs = count_lines('%s/alllist.txt' % listdir)
    elif stage == 'p4':
        numprocs = count_lines('%s/chrlist.txt' % listdir)
    elif stage == 'p6':
        numprocs = count_lines('%s/allpair.txt' % listdir)
    elif stage == 'p8':
        numprocs = (count_lines('%s/allpair.txt' % listdir) *
                    len(os.listdir('%s/splitgene' % listdir)))
    if stage in ('p1', 'p3', 'p4', 'p6', 'p8'):
        numdone = len(
            [f for f in os.listdir(statusdir) if f.startswith(stage)])

    if numprocs is None:
        # Previously this fell through to a NameError on numdone/numprocs.
        message = '%s: unrecognized run name, completion not checked' % run_name
        common.printstatus(message, 'W', common.func_name())
        return

    if numdone == numprocs:
        touch('%s/done.%s' % (statusdir, stage))
    message = '%s: %d of %d completed' % (stage, numdone, numprocs)
    common.printstatus(message, 'S', common.func_name())
def _getgeneannogrsdict(self): if self.geneannogrsdict == {}: self._num_gagd_accessed += 1 geneannogrsdict_file = '%s/geneannogrsdict.pck' % self.dir if os.path.isfile(geneannogrsdict_file): #print geneannogrsdict_file geneannogrsdict = cPickle.load(open(geneannogrsdict_file)) else: geneannogrsdict = {} islanddict = self.getislanddict() lnum = 0 for gene in islanddict.keys(): lnum += 1 if lnum % 1000 == 1: message = 'Processing Graph Struct construction at gene %d' % lnum common.printstatus(message, 'S', common.func_name()) geneannogrsdict[ gene] = self._genetranscriptdict2graphstruct(gene) cPickle.dump(geneannogrsdict, open(geneannogrsdict_file, 'w')) message = 'Completed Processing Graph Struct construction' common.printstatus(message, 'S', common.func_name()) if self._num_gagd_accessed > 2: self.geneannogrsdict = geneannogrsdict return geneannogrsdict else: return self.geneannogrsdict
def getgeneannodivdict(self):
    """Return the gene -> annotation divergence dictionary.

    If ``self.geneannodivdict`` is already populated it is returned as is.
    Otherwise the dictionary is loaded from
    ``<self.dir>/geneannodivdict.pck`` when that file exists, or built by
    converting ``self.getgene2annographstruct(gene)`` with
    ``common.graphstruct2divdict`` for every gene of
    ``self.getislanddict()`` and then pickled to that file.  The result is
    kept on the instance only once this slow path has been taken more than
    twice (``self._num_gadd_accessed > 2``).
    """
    if self.geneannodivdict != {}:
        return self.geneannodivdict

    self._num_gadd_accessed += 1
    geneannodivdict_file = '%s/geneannodivdict.pck' % self.dir
    if os.path.isfile(geneannodivdict_file):
        # NOTE: unpickling executes arbitrary code; the cache file must be
        # trusted.  The handle is closed explicitly instead of being leaked.
        with open(geneannodivdict_file) as pckfile:
            geneannodivdict = cPickle.load(pckfile)
    else:
        geneannodivdict = {}
        islanddict = self.getislanddict()
        for lnum, gene in enumerate(islanddict.keys(), 1):
            if lnum % 1000 == 1:
                message = 'Processing Divergence Struct construction at gene %d:%s' % (lnum, gene)
                common.printstatus(message, 'S', common.func_name())
            graphstruct = self.getgene2annographstruct(gene)
            geneannodivdict[gene] = common.graphstruct2divdict(graphstruct)
        # Closing the file guarantees the cache is flushed to disk.
        with open(geneannodivdict_file, 'w') as pckfile:
            cPickle.dump(geneannodivdict, pckfile)
        message = 'Completed Processing Divergence Struct construction'
        common.printstatus(message, 'S', common.func_name())
    if self._num_gadd_accessed > 2:
        self.geneannodivdict = geneannodivdict
    return geneannodivdict
def getgeneannodivdict(self):
    """Return the gene -> annotation divergence dictionary.

    If ``self.geneannodivdict`` is already populated it is returned as is.
    Otherwise the dictionary is loaded from
    ``<self.dir>/geneannodivdict.pck`` when that file exists, or built by
    converting ``self.getgene2annographstruct(gene)`` with
    ``common.graphstruct2divdict`` for every gene of
    ``self.getislanddict()`` and then pickled to that file.  The result is
    kept on the instance only once this slow path has been taken more than
    twice (``self._num_gadd_accessed > 2``).
    """
    if self.geneannodivdict != {}:
        return self.geneannodivdict

    self._num_gadd_accessed += 1
    geneannodivdict_file = '%s/geneannodivdict.pck' % self.dir
    if os.path.isfile(geneannodivdict_file):
        # NOTE: unpickling executes arbitrary code; the cache file must be
        # trusted.  The handle is closed explicitly instead of being leaked.
        with open(geneannodivdict_file) as pckfile:
            geneannodivdict = cPickle.load(pckfile)
    else:
        geneannodivdict = {}
        islanddict = self.getislanddict()
        for lnum, gene in enumerate(islanddict.keys(), 1):
            if lnum % 1000 == 1:
                message = 'Processing Divergence Struct construction at gene %d:%s' % (
                    lnum, gene)
                common.printstatus(message, 'S', common.func_name())
            graphstruct = self.getgene2annographstruct(gene)
            geneannodivdict[gene] = common.graphstruct2divdict(graphstruct)
        # Closing the file guarantees the cache is flushed to disk.
        with open(geneannodivdict_file, 'w') as pckfile:
            cPickle.dump(geneannodivdict, pckfile)
        message = 'Completed Processing Divergence Struct construction'
        common.printstatus(message, 'S', common.func_name())
    if self._num_gadd_accessed > 2:
        self.geneannodivdict = geneannodivdict
    return geneannodivdict
def _checkdir(self, dirname, dirpath, function_name): if not (os.path.exists(dirpath)): message = 'Config file %s has errors: ' % self.config_file message += '%s does not exist' % dirname common.printstatus(message, 'F', function_name) else: return dirpath
def createfoldersetup(outputdir, runid, numtypes, numdatasets, configfilename,
                      biasfilename):
    """Create the folder layout of a run and copy its configuration files.

    Creates ``<outputdir>/<runid>`` with ``config``, ``data`` and
    ``metadata`` sub-folders, copies ``configfilename`` and ``biasfilename``
    into ``config`` and creates one data folder per dataset:
    ``T<ds>S01`` when ``numtypes`` is 0, ``T<type>S<ds>`` otherwise (all
    numbers zero-padded to two digits).  A fatal status is reported through
    ``common.printstatus`` when the run folder already exists.

    Args:
        outputdir: parent directory of the run.
        runid: name of the run folder.
        numtypes: number of types (0 means a single, untyped series).
        numdatasets: number of datasets per type.
        configfilename: configuration file to copy into ``config``.
        biasfilename: bias file to copy into ``config``.

    Returns:
        dict with ``'metadata'`` (metadata folder path) and ``'data'``
        (``[numtypes, numdatasets, data folder path]``).
    """
    import shutil  # function-scope import keeps the block self-contained

    def ensure_dir(path):
        # Equivalent of ``mkdir -p`` without going through a shell, so paths
        # containing spaces or metacharacters are handled correctly.
        if not os.path.isdir(path):
            os.makedirs(path)

    rundir = '%s/%s' % (outputdir, runid)
    if os.path.exists(rundir):
        message = '%s/%s already exists' % (outputdir, runid)
        common.printstatus(message, 'F', common.func_name())

    configdir = '%s/config' % rundir
    datadir = '%s/data' % rundir
    metadatadir = '%s/metadata' % rundir
    for path in (rundir, configdir, datadir, metadatadir):
        ensure_dir(path)

    for source in (configfilename, biasfilename):
        # The shell ``cp`` used previously failed silently; keep the copy
        # best-effort but make the failure visible.
        try:
            shutil.copy(source, configdir)
        except (IOError, OSError):
            message = 'Could not copy %s to %s' % (source, configdir)
            common.printstatus(message, 'W', common.func_name())

    if numtypes == 0:
        labels = ['T%02dS01' % ds for ds in range(1, numdatasets + 1)]
    else:
        labels = ['T%02dS%02d' % (typeid, ds)
                  for typeid in range(1, numtypes + 1)
                  for ds in range(1, numdatasets + 1)]
    for label in labels:
        ensure_dir('%s/%s' % (datadir, label))

    outdict = {}
    outdict['metadata'] = metadatadir
    outdict['data'] = [numtypes, numdatasets, datadir]
    return outdict
def _checkfile(self, filename, filepath, function_name): if not (os.path.isfile(filepath)): message = 'Config file %s has errors: ' % self.config_file message += '%s does not exist' % filename common.printstatus(message, 'F', function_name) else: return filepath
def _checkfile(self,filename,filepath,function_name): if not(os.path.isfile(filepath)): message='Config file %s has errors: '%self.config_file message+='%s does not exist'%filename common.printstatus(message,'F',function_name) else: return filepath
def _checkdir(self,dirname,dirpath,function_name): if not(os.path.exists(dirpath)): message='Config file %s has errors: '%self.config_file message+='%s does not exist'%dirname common.printstatus(message,'F',function_name) else: return dirpath
def testparallelcompletion(run_name, projectdir):
    """Record that one parallel job finished and flag the stage when all are done.

    Touches ``<projectdir>/status/<run_name>.done`` and then compares the
    number of finished jobs of the stage (``run_name[0:2]``) with the number
    of jobs expected for it.  When they match, ``status/done.<stage>`` is
    touched too.  A ``'<stage>: x of y completed'`` message is reported
    through ``common.printstatus``.

    Expected job counts:
        p0, p2, p5, p7, p9 (exact run name): a single job.
        p1, p3: one per line of ``list/alllist.txt``.
        p4: one per line of ``list/chrlist.txt``.
        p6: one per line of ``list/allpair.txt``.
        p8: lines of ``list/allpair.txt`` times entries of ``list/splitgene``.

    Args:
        run_name: name of the job that just completed.
        projectdir: project root holding the ``status`` and ``list`` folders.
    """

    def touch(path):
        # Same effect as the shell ``touch`` without building a shell command,
        # so paths with spaces or shell metacharacters are handled correctly.
        try:
            with open(path, 'a'):
                os.utime(path, None)
        except (IOError, OSError):
            message = 'Could not touch %s' % path
            common.printstatus(message, 'W', common.func_name())

    def count_lines(path):
        # Close the list file instead of leaking the handle.
        with open(path) as handle:
            return sum(1 for _ in handle)

    stage = run_name[0:2]
    statusdir = '%s/status' % projectdir
    listdir = '%s/list' % projectdir
    touch('%s/%s.done' % (statusdir, run_name))

    numprocs = None
    numdone = None
    if run_name in ['p0', 'p2', 'p5', 'p7', 'p9']:
        numprocs = 1
        numdone = 1
    # Pause before counting the status files of the stage.
    time.sleep(5)

    if stage in ('p1', 'p3'):
        numprocs = count_lines('%s/alllist.txt' % listdir)
    elif stage == 'p4':
        numprocs = count_lines('%s/chrlist.txt' % listdir)
    elif stage == 'p6':
        numprocs = count_lines('%s/allpair.txt' % listdir)
    elif stage == 'p8':
        numprocs = (count_lines('%s/allpair.txt' % listdir) *
                    len(os.listdir('%s/splitgene' % listdir)))
    if stage in ('p1', 'p3', 'p4', 'p6', 'p8'):
        numdone = len(
            [f for f in os.listdir(statusdir) if f.startswith(stage)])

    if numprocs is None:
        # Previously this fell through to a NameError on numdone/numprocs.
        message = '%s: unrecognized run name, completion not checked' % run_name
        common.printstatus(message, 'W', common.func_name())
        return

    if numdone == numprocs:
        touch('%s/done.%s' % (statusdir, stage))
    message = '%s: %d of %d completed' % (stage, numdone, numprocs)
    common.printstatus(message, 'S', common.func_name())
def _getgeneannogrsdict(self): if self.geneannogrsdict=={}: self._num_gagd_accessed+=1 geneannogrsdict_file='%s/geneannogrsdict.pck'%self.dir if os.path.isfile(geneannogrsdict_file): #print geneannogrsdict_file geneannogrsdict=cPickle.load(open(geneannogrsdict_file)) else: geneannogrsdict={} islanddict=self.getislanddict() lnum=0 for gene in islanddict.keys(): lnum+=1 if lnum%1000==1: message='Processing Graph Struct construction at gene %d'%lnum common.printstatus(message,'S',common.func_name()) geneannogrsdict[gene]=self._genetranscriptdict2graphstruct(gene) cPickle.dump(geneannogrsdict,open(geneannogrsdict_file,'w')) message='Completed Processing Graph Struct construction' common.printstatus(message,'S',common.func_name()) if self._num_gagd_accessed>2: self.geneannogrsdict=geneannogrsdict return geneannogrsdict else: return self.geneannogrsdict
def wgs2sparsegraphdict(wgs):
    '''Convert a weighted graph structure into a sparse adjacency dictionary.

    Assumes a retained-intron edge never coincides with a splice edge.

    ``wgs`` is the 9-item sequence (exonlist, intronlist, splicelist,
    startnodelist, endnodelist, novelnodelist, exonwtlist, intronwtlist,
    splicewtlist).  Exon/intron boundaries lying one position away from an
    existing node are snapped onto that node; this rewrites the exon/intron
    coordinate lists of ``wgs`` in place.

    Returns ``(sparsegraphdict, nodedict)`` where
    ``sparsegraphdict[fromidx][toidx] = [weight, kind]`` (kind 1 for
    exon/intron edges, 2 for splice edges) and
    ``nodedict[idx] = [position, flag]`` (flag 0 novel, 1 otherwise,
    2 at/next to a start node, 3 at/next to an end node; later flags win).
    '''
    (exonlist, intronlist, splicelist, startnodelist, endnodelist,
     novelnodelist, exonwtlist, intronwtlist, splicewtlist) = wgs

    # Splice endpoints define the initial node set.
    nodelist = []
    for edge in splicelist:
        for endpoint in (edge[0], edge[1]):
            if endpoint not in nodelist:
                nodelist.append(endpoint)

    exonictuplist = sorted(zip(exonlist + intronlist,
                               exonwtlist + intronwtlist))
    for exonic, _unused_wt in exonictuplist:
        if exonic[0] not in nodelist:
            if exonic[0] - 1 in nodelist:
                exonic[0] -= 1  # snap the start onto the adjacent node
            else:
                nodelist.append(exonic[0])
        if exonic[1] not in nodelist:
            if exonic[1] + 1 in nodelist:
                exonic[1] += 1  # snap the end onto the adjacent node
            else:
                nodelist.append(exonic[1])
    nodelist.sort()

    nodedict = dict((idx, [pos, 0]) for idx, pos in enumerate(nodelist))
    for entry in nodedict.values():
        pos = entry[0]
        nearby = (pos, pos - 1, pos + 1)
        if pos not in novelnodelist:
            entry[1] = 1
        if any(p in startnodelist for p in nearby):
            entry[1] = 2
        if any(p in endnodelist for p in nearby):
            entry[1] = 3

    sparsegraphdict = {}
    for exonic, wt in exonictuplist:
        fromidx = nodelist.index(exonic[0])
        toidx = nodelist.index(exonic[1])
        sparsegraphdict.setdefault(fromidx, {})[toidx] = [wt, 1]

    for splice, wt in zip(splicelist, splicewtlist):
        node1 = splice[0]
        node2 = splice[1]
        node1idx = nodelist.index(node1)
        node2idx = nodelist.index(node2)
        if node1idx not in sparsegraphdict:
            sparsegraphdict[node1idx] = {}
            message = 'new splice node added %d' % (node1)
            common.printstatus(message, 'W', common.func_name(), 1)
        targets = sparsegraphdict[node1idx]
        if node2idx in targets:
            message = 'Two edges between two nodes %d(%d)-%d(%d)' % (node1, node1idx, node2, node2idx)
            common.printstatus(message, 'W', common.func_name(), 1)
            targets[node2idx][0] += wt
        else:
            targets[node2idx] = [wt, 2]
    return (sparsegraphdict, nodedict)
def Toactfilelines(self, island):
    """Render the graph held in ``self.tuple`` as tab-separated text lines.

    ``island`` supplies the chromosome (index 0), the gene name (index 3)
    and the strand (index 4; ``'+'`` is written as 1, anything else as 0).
    Node lines come first, sorted by position and tagged ``annos`` (start
    node), ``annoe`` (end node), ``novel`` or ``annot``; they are followed
    by the sorted exon, splice and retained-intron edge lines with their
    weights.  An edge found in none of the three edge lists is reported as
    a warning.

    Returns:
        list of str, one entry per output line (no trailing newline).
    """
    chrnm = '%s' % island[0]
    gene = island[3]
    tdir = 1 if island[4] == '+' else 0
    (exonlist, intronlist, splicelist, startnodelist, endnodelist,
     novelnodelist, exonwtlist, intronwtlist, splicewtlist) = self.tuple

    nodelist = []
    for edge in splicelist:
        for endpoint in (edge[0], edge[1]):
            if endpoint not in nodelist:
                nodelist.append(endpoint)
    for edge in exonlist + intronlist:
        # Exon boundaries adjacent to an existing node do not get a node.
        if edge[0] not in nodelist and edge[0] - 1 not in nodelist:
            nodelist.append(edge[0])
        if edge[1] not in nodelist and edge[1] + 1 not in nodelist:
            nodelist.append(edge[1])
    for terminals in (startnodelist, endnodelist):
        for node in terminals:
            if node not in nodelist:
                nodelist.append(node)
    nodelist.sort()

    actlines = []
    for node in nodelist:
        fields = (chrnm, node, node, tdir, gene)
        if node in startnodelist:
            line = '%s\tannos\tnode\t%d\t%d\t0.00\t%d\t.\t%s' % fields
        elif node in endnodelist:
            line = '%s\tannoe\tnode\t%d\t%d\t0.00\t%d\t.\t%s' % fields
        elif node in novelnodelist:
            line = '%s\tnovel\tnode\t%d\t%d\t0.00\t%d\t.\t%s' % fields
        else:
            line = '%s\tannot\tnode\t%d\t%d\t0.00\t%d\t.\t%s' % fields
        actlines.append(line)

    for edge in sorted(exonlist + intronlist + splicelist):
        if edge in exonlist:
            wt = exonwtlist[exonlist.index(edge)]
            actlines.append('%s\tannot\texon\t%d\t%d\t%8.2f\t%d\t.\t%s' % (chrnm, edge[0], edge[1], wt, tdir, gene))
        elif edge in splicelist:
            wt = splicewtlist[splicelist.index(edge)]
            fields = (chrnm, edge[0], edge[1], wt, tdir, gene)
            if edge[0] in novelnodelist or edge[1] in novelnodelist:
                actlines.append('%s\tnovel\tsplice\t%d\t%d\t%8.2f\t%d\t.\t%s' % fields)
            else:
                actlines.append('%s\tannot\tsplice\t%d\t%d\t%8.2f\t%d\t.\t%s' % fields)
        elif edge in intronlist:
            wt = intronwtlist[intronlist.index(edge)]
            actlines.append('%s\tnovel\tretint\t%d\t%d\t%8.2f\t%d\t.\t%s' % (chrnm, edge[0], edge[1], wt, tdir, gene))
        else:
            message = 'Orphan edge: %s' % str(edge)
            common.printstatus(message, 'W', common.func_name())
    return actlines
def _checknum(self, x, type, function_name): if type == 'int': try: int(x) return int(x) except: message = '%s is not integer' % x common.printstatus(message, 'F', function_name) if type == 'float': try: float(x) return float(x) except: message = '%s is not float' % x common.printstatus(message, 'F', function_name)
def _processlist(self,tsliststr): if len(tsliststr.split('->'))==1: tslist=tsliststr.split(',') for ts in tslist: if ts not in self.data_dict.keys(): message='Command Line has error(s): %s does not exist'%ts common.printstatus(message,'F',common.func_name()) elif len(tsliststr.split('->'))==2: tsrange=tsliststr.split('->') alltslist=self.data_dict.keys() tslist=[] for ts in tslist: if ts>=tsrange[0] and ts<=tsrange[1]: tslist.append(ts) return tslist
def _checknum(self,x,type,function_name): if type=='int': try: int(x) return int(x) except: message='%s is not integer'%x common.printstatus(message,'F',function_name) if type=='float': try: float(x) return float(x) except: message='%s is not float'%x common.printstatus(message,'F',function_name)
def _processlist(self, tsliststr): if len(tsliststr.split('->')) == 1: tslist = tsliststr.split(',') for ts in tslist: if ts not in self.data_dict.keys(): message = 'Command Line has error(s): %s does not exist' % ts common.printstatus(message, 'F', common.func_name()) elif len(tsliststr.split('->')) == 2: tsrange = tsliststr.split('->') alltslist = self.data_dict.keys() tslist = [] for ts in tslist: if ts >= tsrange[0] and ts <= tsrange[1]: tslist.append(ts) return tslist
def addwgs(wgs1, wgs2, component1):
    """Blend the weights of two weighted graph structures.

    Both arguments are 9-item sequences (exonlist, intronlist, splicelist,
    startnodelist, endnodelist, novelnodelist, exonwtlist, intronwtlist,
    splicewtlist).  When ``component1`` is exactly 1.0, ``wgs1`` itself is
    returned.  Otherwise, if the six structural lists are equal, a new list
    is returned with the structure of ``wgs1`` and each weight computed as
    ``component1 * wt1 + (1 - component1) * wt2``.  If the structures
    differ a warning is reported and nothing (``None``) is returned.
    """
    if component1 == 1.0:
        return wgs1

    (exonlist1, intronlist1, splicelist1, startnodelist1, endnodelist1,
     novelnodelist1, exonwtlist1, intronwtlist1, splicewtlist1) = wgs1
    (exonlist2, intronlist2, splicelist2, startnodelist2, endnodelist2,
     novelnodelist2, exonwtlist2, intronwtlist2, splicewtlist2) = wgs2

    def blend(weights1, weights2):
        return [component1 * wt1 + (1 - component1) * wt2
                for wt1, wt2 in zip(weights1, weights2)]

    structure1 = [exonlist1, intronlist1, splicelist1,
                  startnodelist1, endnodelist1, novelnodelist1]
    structure2 = [exonlist2, intronlist2, splicelist2,
                  startnodelist2, endnodelist2, novelnodelist2]
    matches = [part1 == part2
               for part1, part2 in zip(structure1, structure2)]

    if not all(matches):
        message = 'Two wgs are different \n%s \n%s\n %d,%d,%d,%d,%d,%d' % (
            (str(wgs1), str(wgs2)) + tuple(matches))
        common.printstatus(message, 'W', common.func_name(), 1)
        return None

    return structure1 + [blend(exonwtlist1, exonwtlist2),
                         blend(intronwtlist1, intronwtlist2),
                         blend(splicewtlist1, splicewtlist2)]
def _checknum(self,x,type,function_name): if type=='int': try: int(x) return int(x) except: message='Config file %s has errors: '%self.config_file message+='%s is not integer'%x common.printstatus(message,'F',function_name) if type=='float': try: float(x) return float(x) except: message='Config file %s has errors: '%self.config_file message+='%s is not float'%x common.printstatus(message,'F',function_name)
def _fixgenetranscriptdict(self):
    """Restrict every gene to a single location and rewrite the pickle.

    Loads ``<self.dir>/genetranscriptdict.pck`` and, for each gene:
      * drops transcripts whose (chromosome, strand) differs from that of
        the first transcript in sorted key order (one warning each);
      * orders the remaining transcripts by their [first exon start,
        last exon end] range and keeps only the leading run of overlapping
        ranges (a warning is issued at the first gap).
    The original pickle is renamed to ``<file>.bk`` and the filtered
    dictionary is written in its place.

    Returns:
        dict mapping gene -> {transcript: record} after filtering.
    """
    genetranscriptdict_file = '%s/genetranscriptdict.pck' % self.dir
    # NOTE: unpickling executes arbitrary code; the file must be trusted.
    with open(genetranscriptdict_file) as pckfile:
        genetranscriptdict = cPickle.load(pckfile)

    newgenetranscriptdict = {}
    for gene in genetranscriptdict:
        trascriptdict = genetranscriptdict[gene]
        allkeys = sorted(trascriptdict.keys())
        if not allkeys:
            # Nothing to choose from; keep the gene with no transcripts.
            newgenetranscriptdict[gene] = {}
            continue

        chrstrand = trascriptdict[allkeys[0]][0:2]
        transcriptkeys = []
        for key in allkeys:
            if trascriptdict[key][0:2] != chrstrand:
                message = 'Gene %s in multiple chromosomes; processing only one location:%s' % (
                    gene, str(chrstrand))
                common.printstatus(message, 'W', common.func_name())
                continue
            transcriptkeys.append(key)

        # Pair each transcript with its genomic range and order by range.
        ztxrangekey = sorted(
            ([trascriptdict[key][2][0][0], trascriptdict[key][2][-1][1]], key)
            for key in transcriptkeys)
        txrangelist = [pair[0] for pair in ztxrangekey]
        transcriptkeys = [pair[1] for pair in ztxrangekey]

        trange = txrangelist[0]
        for i in range(1, len(txrangelist)):
            txrange = txrangelist[i]
            if txrange[0] <= trange[1]:
                trange = [trange[0], max(trange[1], txrange[1])]
            else:
                message = 'Gene %s: multiple non-overlapping transcription regions; processing only one region:%s' % (
                    gene, str(trange))
                common.printstatus(message, 'W', common.func_name())
                transcriptkeys = transcriptkeys[0:i]
                break

        newgenetranscriptdict[gene] = dict(
            (transcript, trascriptdict[transcript])
            for transcript in transcriptkeys)

    # Keep the unfiltered pickle as a backup; os.rename avoids building a
    # shell command from the path.
    os.rename(genetranscriptdict_file, '%s.bk' % genetranscriptdict_file)
    with open(genetranscriptdict_file, 'w') as pckfile:
        cPickle.dump(newgenetranscriptdict, pckfile)
    return newgenetranscriptdict
def _checknum(self, x, type, function_name): if type == 'int': try: int(x) return int(x) except: message = 'Config file %s has errors: ' % self.config_file message += '%s is not integer' % x common.printstatus(message, 'F', function_name) if type == 'float': try: float(x) return float(x) except: message = 'Config file %s has errors: ' % self.config_file message += '%s is not float' % x common.printstatus(message, 'F', function_name)
def _gtf2genetranscriptdict(self):
    """Build, pickle and return the gene -> transcript -> exons dictionary.

    Reads the GTF file ``self.name`` and keeps the ``exon`` records whose
    attributes contain the gene key (``gene_name`` when ``self.type`` is
    ``'E'``, ``gene_id`` when it is ``'F'``).  Every transcript is stored as
    ``(chromosome, strand, [(start, end), ...])`` with strand 1 for ``+``
    and 0 otherwise, and the exon list sorted.  The dictionary is pickled
    to ``<self.dir>/genetranscriptdict.pck`` and then passed through
    ``self._fixgenetranscriptdict()``, whose result is returned.
    """
    if self.type == 'E':
        geneidx = 'gene_name'
    elif self.type == 'F':
        geneidx = 'gene_id'
    # NOTE(review): any other self.type leaves geneidx unbound and fails on
    # the first exon record -- confirm whether more types must be supported.

    genetranscriptdict = {}
    with open(self.name) as gtffile:
        for lnum, linetxt in enumerate(gtffile, 1):
            if lnum % 200000 == 1:
                message = 'Processing GTF file for transcripts at line %d' % lnum
                common.printstatus(message, 'S', common.func_name())
            line = linetxt.rstrip('\n').split('\t')
            # Header/comment lines have fewer than nine columns and used to
            # raise IndexError here.
            if len(line) < 9 or line[2] != 'exon':
                continue
            attrdict = self._attrtxt2attrdict(line[8])
            if geneidx not in attrdict:
                continue
            gene = attrdict[geneidx]
            transcript = attrdict['transcript_id']
            chrnm = line[0]
            strand = 1 if line[6] == '+' else 0
            # Direct dict membership is O(1); ``in d.keys()`` builds and
            # scans a list on Python 2 for every GTF line.
            transcripts = genetranscriptdict.setdefault(gene, {})
            if transcript not in transcripts:
                transcripts[transcript] = (chrnm, strand, [])
            transcripts[transcript][2].append((int(line[3]), int(line[4])))

    for transcripts in genetranscriptdict.values():
        for record in transcripts.values():
            record[2].sort()
    message = 'Completed processing GTF file for transcripts'
    common.printstatus(message, 'S', common.func_name())

    genetranscriptdict_file = '%s/genetranscriptdict.pck' % self.dir
    # Close the pickle before _fixgenetranscriptdict reads it back.
    with open(genetranscriptdict_file, 'w') as pckfile:
        cPickle.dump(genetranscriptdict, pckfile)
    genetranscriptdict = self._fixgenetranscriptdict()
    return genetranscriptdict
def _gtf2genetranscriptdict(self):
    """Build, pickle and return the gene -> transcript -> exons dictionary.

    Reads the GTF file ``self.name`` and keeps the ``exon`` records whose
    attributes contain the gene key (``gene_name`` when ``self.type`` is
    ``'E'``, ``gene_id`` when it is ``'F'``).  Every transcript is stored as
    ``(chromosome, strand, [(start, end), ...])`` with strand 1 for ``+``
    and 0 otherwise, and the exon list sorted.  The dictionary is pickled
    to ``<self.dir>/genetranscriptdict.pck`` and then passed through
    ``self._fixgenetranscriptdict()``, whose result is returned.
    """
    if self.type == 'E':
        geneidx = 'gene_name'
    elif self.type == 'F':
        geneidx = 'gene_id'
    # NOTE(review): any other self.type leaves geneidx unbound and fails on
    # the first exon record -- confirm whether more types must be supported.

    genetranscriptdict = {}
    with open(self.name) as gtffile:
        for lnum, linetxt in enumerate(gtffile, 1):
            if lnum % 200000 == 1:
                message = 'Processing GTF file for transcripts at line %d' % lnum
                common.printstatus(message, 'S', common.func_name())
            line = linetxt.rstrip('\n').split('\t')
            # Header/comment lines have fewer than nine columns and used to
            # raise IndexError here.
            if len(line) < 9 or line[2] != 'exon':
                continue
            attrdict = self._attrtxt2attrdict(line[8])
            if geneidx not in attrdict:
                continue
            gene = attrdict[geneidx]
            transcript = attrdict['transcript_id']
            chrnm = line[0]
            strand = 1 if line[6] == '+' else 0
            # Direct dict membership is O(1); ``in d.keys()`` builds and
            # scans a list on Python 2 for every GTF line.
            transcripts = genetranscriptdict.setdefault(gene, {})
            if transcript not in transcripts:
                transcripts[transcript] = (chrnm, strand, [])
            transcripts[transcript][2].append((int(line[3]), int(line[4])))

    for transcripts in genetranscriptdict.values():
        for record in transcripts.values():
            record[2].sort()
    message = 'Completed processing GTF file for transcripts'
    common.printstatus(message, 'S', common.func_name())

    genetranscriptdict_file = '%s/genetranscriptdict.pck' % self.dir
    # Close the pickle before _fixgenetranscriptdict reads it back.
    with open(genetranscriptdict_file, 'w') as pckfile:
        cPickle.dump(genetranscriptdict, pckfile)
    genetranscriptdict = self._fixgenetranscriptdict()
    return genetranscriptdict
def setfdmfullpair(self, fdm_full_pair):
    """Configure a single full-FDM comparison from a command-line pair.

    ``fdm_full_pair`` has the form ``'a+b::c+d'``: two groups separated by
    ``::``, each group a ``+``-joined list of dataset names.  When the first
    name is ``'BLANK'`` nothing is changed.  Otherwise every name is checked
    against ``self.data_dict`` (a fatal status is reported for unknown
    names), ``self.ffull_run_list`` is set to the single pair of groups and
    all run flags are cleared except ``run_fdm_full_flag``.
    """
    allts = fdm_full_pair.replace('::', ',').replace('+', ',').split(',')
    if allts[0] == 'BLANK':
        return

    for ts in allts:
        if ts not in self.data_dict.keys():
            message = 'Command Line has error(s): %s does not exist' % ts
            common.printstatus(message, 'F', common.func_name())

    groups = fdm_full_pair.split('::')
    self.ffull_run_list = [[groups[0].split('+'), groups[1].split('+')]]

    # Only the full-FDM stage runs for an explicitly requested pair.
    for disabled in ('run_all_flag', 'run_pre_act_flag',
                     'run_splice_collate_flag', 'run_proj_act_flag',
                     'run_extract_flows', 'run_cluster_flag',
                     'run_fdm_fast_flag', 'run_report_flag'):
        setattr(self, disabled, 0)
    self.run_fdm_full_flag = 1
def _gtf2islandlist(self):
    '''Return the sorted list of gene islands found in the GTF file.

    For repeated genes, if regions do not match ignore.

    Every ``gene`` record of ``self.name`` becomes
    ``[chromosome, start, end, gene_id, strand]`` (strand 1 for ``+``,
    0 otherwise).  When a gene id occurs again, its region is replaced by
    ``self._intervalintersect(new, stored)``; if that returns 0 the gene is
    rejected and dropped from the result.
    '''
    islanddict = {}
    rejected = set()  # set membership is O(1); a list was scanned per line
    with open(self.name) as gtffile:
        for linetxt in gtffile:
            line = linetxt.rstrip('\n').split('\t')
            # Header/comment lines have fewer than nine columns and used to
            # raise IndexError here.
            if len(line) < 9 or line[2] != 'gene':
                continue
            attrdict = self._attrtxt2attrdict(line[8])
            gene = attrdict['gene_id']
            if gene in rejected:
                continue
            strand = 1 if line[6] == '+' else 0
            interval = [line[0], int(line[3]), int(line[4]), gene, strand]
            if gene not in islanddict:
                islanddict[gene] = interval
                continue
            intv = self._intervalintersect(interval, islanddict[gene])
            if intv == 0:
                rejected.add(gene)
            else:
                islanddict[gene] = intv

    # Drop rejected genes before reporting, so the "loaded" count no longer
    # includes genes that are about to be discarded.
    for gene in rejected:
        del islanddict[gene]
    message = 'Number of repeated genes rejected: %d' % len(rejected)
    common.printstatus(message, 'W', common.func_name())
    message = 'Number of genes loaded: %d' % (len(islanddict.keys()))
    common.printstatus(message, 'S', common.func_name())

    return sorted(islanddict.values())
def _gtf2islandlist(self):
    '''Return the sorted list of gene islands found in the GTF file.

    For repeated genes, if regions do not match ignore.

    Every ``gene`` record of ``self.name`` becomes
    ``[chromosome, start, end, gene_id, strand]`` (strand 1 for ``+``,
    0 otherwise).  When a gene id occurs again, its region is replaced by
    ``self._intervalintersect(new, stored)``; if that returns 0 the gene is
    rejected and dropped from the result.
    '''
    islanddict = {}
    rejected = set()  # set membership is O(1); a list was scanned per line
    with open(self.name) as gtffile:
        for linetxt in gtffile:
            line = linetxt.rstrip('\n').split('\t')
            # Header/comment lines have fewer than nine columns and used to
            # raise IndexError here.
            if len(line) < 9 or line[2] != 'gene':
                continue
            attrdict = self._attrtxt2attrdict(line[8])
            gene = attrdict['gene_id']
            if gene in rejected:
                continue
            strand = 1 if line[6] == '+' else 0
            interval = [line[0], int(line[3]), int(line[4]), gene, strand]
            if gene not in islanddict:
                islanddict[gene] = interval
                continue
            intv = self._intervalintersect(interval, islanddict[gene])
            if intv == 0:
                rejected.add(gene)
            else:
                islanddict[gene] = intv

    # Drop rejected genes before reporting, so the "loaded" count no longer
    # includes genes that are about to be discarded.
    for gene in rejected:
        del islanddict[gene]
    message = 'Number of repeated genes rejected: %d' % len(rejected)
    common.printstatus(message, 'W', common.func_name())
    message = 'Number of genes loaded: %d' % (len(islanddict.keys()))
    common.printstatus(message, 'S', common.func_name())

    return sorted(islanddict.values())
def setfdmfullpair(self, fdm_full_pair):
    """Configure a single full-FDM comparison from a command-line pair.

    ``fdm_full_pair`` has the form ``'a+b::c+d'``: two groups separated by
    ``::``, each group a ``+``-joined list of dataset names.  When the first
    name is ``'BLANK'`` nothing is changed.  Otherwise every name is checked
    against ``self.data_dict`` (a fatal status is reported for unknown
    names), ``self.ffull_run_list`` is set to the single pair of groups and
    all run flags are cleared except ``run_fdm_full_flag``.
    """
    allts = fdm_full_pair.replace('::', ',').replace('+', ',').split(',')
    if allts[0] == 'BLANK':
        return

    for ts in allts:
        if ts not in self.data_dict.keys():
            message = 'Command Line has error(s): %s does not exist' % ts
            common.printstatus(message, 'F', common.func_name())

    groups = fdm_full_pair.split('::')
    self.ffull_run_list = [[groups[0].split('+'), groups[1].split('+')]]

    # Only the full-FDM stage runs for an explicitly requested pair.
    for disabled in ('run_all_flag', 'run_pre_act_flag',
                     'run_splice_collate_flag', 'run_proj_act_flag',
                     'run_extract_flows', 'run_cluster_flag',
                     'run_fdm_fast_flag', 'run_report_flag'):
        setattr(self, disabled, 0)
    self.run_fdm_full_flag = 1
def _fixgenetranscriptdict(self):
    """Restrict every gene to a single location and rewrite the pickle.

    Loads ``<self.dir>/genetranscriptdict.pck`` and, for each gene:
      * drops transcripts whose (chromosome, strand) differs from that of
        the first transcript in sorted key order (one warning each);
      * orders the remaining transcripts by their [first exon start,
        last exon end] range and keeps only the leading run of overlapping
        ranges (a warning is issued at the first gap).
    The original pickle is renamed to ``<file>.bk`` and the filtered
    dictionary is written in its place.

    Returns:
        dict mapping gene -> {transcript: record} after filtering.
    """
    genetranscriptdict_file = '%s/genetranscriptdict.pck' % self.dir
    # NOTE: unpickling executes arbitrary code; the file must be trusted.
    with open(genetranscriptdict_file) as pckfile:
        genetranscriptdict = cPickle.load(pckfile)

    newgenetranscriptdict = {}
    for gene in genetranscriptdict:
        trascriptdict = genetranscriptdict[gene]
        allkeys = sorted(trascriptdict.keys())
        if not allkeys:
            # Nothing to choose from; keep the gene with no transcripts.
            newgenetranscriptdict[gene] = {}
            continue

        chrstrand = trascriptdict[allkeys[0]][0:2]
        transcriptkeys = []
        for key in allkeys:
            if trascriptdict[key][0:2] != chrstrand:
                message = 'Gene %s in multiple chromosomes; processing only one location:%s' % (gene, str(chrstrand))
                common.printstatus(message, 'W', common.func_name())
                continue
            transcriptkeys.append(key)

        # Pair each transcript with its genomic range and order by range.
        ztxrangekey = sorted(
            ([trascriptdict[key][2][0][0], trascriptdict[key][2][-1][1]], key)
            for key in transcriptkeys)
        txrangelist = [pair[0] for pair in ztxrangekey]
        transcriptkeys = [pair[1] for pair in ztxrangekey]

        trange = txrangelist[0]
        for i in range(1, len(txrangelist)):
            txrange = txrangelist[i]
            if txrange[0] <= trange[1]:
                trange = [trange[0], max(trange[1], txrange[1])]
            else:
                message = 'Gene %s: multiple non-overlapping transcription regions; processing only one region:%s' % (gene, str(trange))
                common.printstatus(message, 'W', common.func_name())
                transcriptkeys = transcriptkeys[0:i]
                break

        newgenetranscriptdict[gene] = dict(
            (transcript, trascriptdict[transcript])
            for transcript in transcriptkeys)

    # Keep the unfiltered pickle as a backup; os.rename avoids building a
    # shell command from the path.
    os.rename(genetranscriptdict_file, '%s.bk' % genetranscriptdict_file)
    with open(genetranscriptdict_file, 'w') as pckfile:
        cPickle.dump(newgenetranscriptdict, pckfile)
    return newgenetranscriptdict
def createfoldersetup(outputdir, runid, numtypes, numdatasets, configfilename,
                      biasfilename):
    """Create the folder layout of a run and copy its configuration files.

    Creates ``<outputdir>/<runid>`` with ``config``, ``data`` and
    ``metadata`` sub-folders, copies ``configfilename`` and ``biasfilename``
    into ``config`` and creates one data folder per dataset:
    ``T<ds>S01`` when ``numtypes`` is 0, ``T<type>S<ds>`` otherwise (all
    numbers zero-padded to two digits).  A fatal status is reported through
    ``common.printstatus`` when the run folder already exists.

    Args:
        outputdir: parent directory of the run.
        runid: name of the run folder.
        numtypes: number of types (0 means a single, untyped series).
        numdatasets: number of datasets per type.
        configfilename: configuration file to copy into ``config``.
        biasfilename: bias file to copy into ``config``.

    Returns:
        dict with ``'metadata'`` (metadata folder path) and ``'data'``
        (``[numtypes, numdatasets, data folder path]``).
    """
    import shutil  # function-scope import keeps the block self-contained

    def ensure_dir(path):
        # Equivalent of ``mkdir -p`` without going through a shell, so paths
        # containing spaces or metacharacters are handled correctly.
        if not os.path.isdir(path):
            os.makedirs(path)

    rundir = '%s/%s' % (outputdir, runid)
    if os.path.exists(rundir):
        message = '%s/%s already exists' % (outputdir, runid)
        common.printstatus(message, 'F', common.func_name())

    configdir = '%s/config' % rundir
    datadir = '%s/data' % rundir
    metadatadir = '%s/metadata' % rundir
    for path in (rundir, configdir, datadir, metadatadir):
        ensure_dir(path)

    for source in (configfilename, biasfilename):
        # The shell ``cp`` used previously failed silently; keep the copy
        # best-effort but make the failure visible.
        try:
            shutil.copy(source, configdir)
        except (IOError, OSError):
            message = 'Could not copy %s to %s' % (source, configdir)
            common.printstatus(message, 'W', common.func_name())

    if numtypes == 0:
        labels = ['T%02dS01' % ds for ds in range(1, numdatasets + 1)]
    else:
        labels = ['T%02dS%02d' % (typeid, ds)
                  for typeid in range(1, numtypes + 1)
                  for ds in range(1, numdatasets + 1)]
    for label in labels:
        ensure_dir('%s/%s' % (datadir, label))

    outdict = {}
    outdict['metadata'] = metadatadir
    outdict['data'] = [numtypes, numdatasets, datadir]
    return outdict
def arrowplot(title, xlabellist, ylabel, y1y2data, sigflglist, wtflglist,
              imagefilename):
    """Draw paired values per category joined by arrows and save the image.

    For every entry ``i`` of ``y1y2data`` a blue circle is drawn at
    ``(i + 1, y1)`` and a red square at ``(i + 1, y2)``; the markers are
    hollow when ``sigflglist[i]`` is 0.  When the two values differ they are
    connected by an arrow, drawn with line width 3 when ``wtflglist[i]`` is
    1 and 1 otherwise.  The y axis is fixed to ``[-0.05, 1.05]``.

    Args:
        title: figure title.
        xlabellist: tick labels, one per category.
        ylabel: currently unused; the y label is always 'Splicing Fraction'.
        y1y2data: sequence of ``(y1, y2)`` pairs.
        sigflglist: per-category flag selecting filled (non-zero) markers.
        wtflglist: per-category flag selecting the thick (1) arrow.
        imagefilename: output path passed to ``plt.savefig``; a warning is
            reported when saving fails.
    """
    fig = plt.figure(figsize=(8, 8))
    try:
        ax = fig.add_subplot(111)
        ax.set_xticks(range(1, len(xlabellist) + 1))
        ax.set_xticklabels(xlabellist)
        for i in range(len(y1y2data)):
            xpos = i + 1
            y1, y2 = y1y2data[i][0], y1y2data[i][1]
            if sigflglist[i] == 0:
                plt.plot(xpos, y1, 'bo', markerfacecolor='none')
                plt.plot(xpos, y2, 'rs', markerfacecolor='none')
            else:
                plt.plot(xpos, y1, 'bo')
                plt.plot(xpos, y2, 'rs')
            if y1 != y2:
                linewidth = 3 if wtflglist[i] == 1 else 1
                plt.annotate("", xy=(xpos, y1), xycoords='data',
                             xytext=(xpos, y2), textcoords='data',
                             arrowprops=dict(arrowstyle="<|-",
                                             linewidth=linewidth,
                                             connectionstyle="arc3"),
                             )
        plt.title(title)
        plt.xlabel('')
        plt.ylabel('Splicing Fraction')
        plt.axis([0, len(xlabellist) + 1, -0.05, 1.05])
        try:
            plt.savefig(imagefilename)
        except Exception:
            # Saving is best-effort, but interpreter-exit signals are no
            # longer swallowed as they were with a bare ``except``.
            message = 'Problem in writing %s' % imagefilename
            common.printstatus(message, 'W', common.func_name())
    finally:
        # Figures stay registered with pyplot until closed; without this
        # every call leaks one figure.
        plt.close(fig)
def divdictToflow(self, divdict):
    '''Turn a divergence dictionary into per-position flow vectors.

    Input, per position:
        divdict[position] = [[outgoingflg, exonstartflg], [exon, flowlist]]
    where outgoingflg is 0/1 (incoming/outgoing) and flowlist is a list of
    exon/splice edges (expected to hold a single edge for transcript
    start/end positions).  exonstartflg values handled here:
        0  no exon start: every flow edge is looked up as a splice
        1  exon start: first flow edge looked up as an exon, the rest as
           splices
        2  transcript start
        3  transcript end
    Other values produce no entry.

    Edge weights come from self.getedgevalue(edge, edgetype) with edgetype
    10 + outgoingflg for exon lookups and 3 for splice lookups.

    Output, for every position with a non-empty flow vector:
        flowdict[position] = [[outgoingflg, exonstartflg],
                              [[wt1, wt2], nflowvec]]
    with wt1 the weight of the position's exon, wt2 the sum of the flow
    vector and nflowvec the result of common.normalize_vector on it.
    '''
    flowdict = {}
    for position in divdict.keys():
        outgoingflg, exonstartflg = divdict[position][0]
        exon, flowlist = divdict[position][1]
        exonedgetype = 10 + outgoingflg
        wt1 = self.getedgevalue(exon, exonedgetype)[0]
        flowvec = []

        if exonstartflg in (2, 3):
            at_start = exonstartflg == 2
            if len(flowlist) != 1:
                if at_start:
                    message = 'Transcript Start Exon has incoming splice; Flowlist : %s, %s' % (str(exon), common.fl2str(flowlist))
                else:
                    message = 'Transcript End Exon has outgoing splice; Flowlist : %s, %s' % (str(exon), common.fl2str(flowlist))
                common.printstatus(message, 'W', common.func_name())
            else:
                # Weight of the neighbouring exon on the other side of the
                # transcript boundary; the remainder is clamped at zero.
                wtoth = self.getedgevalue(flowlist[0], exonedgetype)[0]
                flowvec = [wtoth, max(wt1 - wtoth, 0)]
                if wt1 - wtoth < 0:
                    if at_start:
                        message = 'Flow decreases at transcript start exon: %s; Prev Exon: %s; Weight Start=%10.4f, Before=%10.4f' % (str(exon), common.fl2str(flowlist[0]), wt1, wtoth)
                    else:
                        message = 'Flow increases at transcript end exon %s: Prev Exon: %s; Weight End=%10.4f, After=%10.4f' % (str(exon), common.fl2str(flowlist[0]), wt1, wtoth)
                    common.printstatus(message, 'W', common.func_name())
        elif exonstartflg == 1:
            flowvec = [self.getedgevalue(flowlist[0], exonedgetype)[0]]
            for flowedge in flowlist[1:]:
                flowvec.append(self.getedgevalue(flowedge, 3)[0])
        elif exonstartflg == 0:
            flowvec = [self.getedgevalue(flowedge, 3)[0]
                       for flowedge in flowlist]

        if flowvec:
            wt2 = sum(flowvec)
            nflowvec = common.normalize_vector(flowvec)
            flowdict[position] = [[outgoingflg, exonstartflg],
                                  [[wt1, wt2], nflowvec]]
    return flowdict
def getedgevalue(self, edge, edgetype):
    '''
    Look up the weight of an edge in the weighted graph structure self.tuple.

    edgetype : 10/11 = exon, 20/21 = retained intron, 3 = splice

    Returns (wt, foundflg) with foundflg
        0 = not found (wt is 0.0 and a status message is printed)
        1 = exact match
        2 = one end exact, the other end off by one
        3 = both ends off by one
        4 = fallback: only the start (types 10/20) or only the end
            (types 11/21) is within +-1 of an exon/intron
    ALERT: exon wt finding should be improved: all overlapping exon weights
    should be included
    '''
    (exonlist, intronlist, splicelist, startnodelist, endnodelist,
     novelnodelist, exonwtlist, intronwtlist, splicewtlist) = self.tuple
    edge = list(edge)

    # exons and retained introns are searched as one list of segments
    segments = exonlist + intronlist
    segmentwts = exonwtlist + intronwtlist
    near = (-1, 0, 1)
    wt = 0.0
    foundflg = 0

    if edgetype in (10, 11, 20, 21):
        # first pass: both ends must lie within +-1 of the same segment
        for idx, segment in enumerate(segments):
            startdiff = edge[0] - segment[0]
            if startdiff not in near:
                continue
            enddiff = edge[1] - segment[1]
            if enddiff not in near:
                continue
            wt = segmentwts[idx]
            # 1 = exact, 2 = one end shifted, 3 = both ends shifted
            foundflg = 1 + (startdiff != 0) + (enddiff != 0)
            break
        if foundflg == 0:
            # second pass: match on one end only; types 10/20 anchor on the
            # start coordinate, types 11/21 on the end coordinate
            anchor = 0 if edgetype in (10, 20) else 1
            for idx, segment in enumerate(segments):
                if edge[anchor] - segment[anchor] in near:
                    wt = segmentwts[idx]
                    foundflg = 4
                    break

    if edgetype == 3:
        # splices have to match exactly on both ends
        for idx, splice in enumerate(splicelist):
            if edge[0] - splice[0] == 0 and edge[1] - splice[1] == 0:
                wt = splicewtlist[idx]
                foundflg = 1
                break

    if foundflg == 0:
        message = 'Edge: %s; Type = %d; Edge weight: %10.4f; Found flag: %d' % (
            str(edge), edgetype, wt, foundflg)
        common.printstatus(message, 'S', common.func_name())
    return (wt, foundflg)
def getfragments(txstr, numfragments, strand, cutscoredict, fragmentsizerange, txstartbias, numcutmu, numcutsig):
    """
    Simulate random fragmentation of a transcript and return fragments.

    input:
        txstr: transcript string
        numfragments: number of fragments to output
        strand: strand of transcript; '+' degrades the end of the string,
            anything else degrades its start
        cutscoredict: dict(xmer:score) where score is the cuttability at the
            position where the xmer starts; positions without a matching
            xmer score 1.  May be empty (all positions score 1).
        fragmentsizerange: [min, max]; a piece between two consecutive cut
            positions c0 < c1 is kept when min <= c1 - c0 + 1 <= max
        txstartbias: scales the exponential draw (0.005 * txstartbias) of
            the fraction of the transcript treated as degraded
        numcutmu, numcutsig: mean / sigma of the gaussian draw for the number
            of cuts per try; the draw is never allowed below numcutmu
    output:
        sorted list of numfragments [start, end] pairs sampled from all
        accepted pieces; an empty list when numfragments is 0

    NOTE(review): the loop repeats until enough pieces fall inside
    fragmentsizerange, so an unreachable size range never terminates.
    """
    outfragments = []
    fragmentscount = 0
    if numfragments == 0:
        return outfragments
    sttime = time.time()

    # per-position cut score; first matching xmer wins
    scorelist = [1 for i in range(len(txstr))]
    if cutscoredict:
        # guard: min() of an empty dict would raise ValueError
        minxmer = min([len(xmer) for xmer in cutscoredict])
        for i in range(len(txstr) - minxmer + 1):
            for xmer in cutscoredict:
                if txstr[i:i + len(xmer)] == xmer:
                    scorelist[i] = cutscoredict[xmer]
                    break

    # cumulative cut probability; the total is hoisted out of the
    # comprehension (it used to be recomputed for every element)
    totalscore = sum(scorelist)
    scorecumlist = [x * 1.0 / totalscore for x in np.cumsum(scorelist)]
    scorecumlist.insert(0, 0)

    numcuts = max(numcutmu, int(random.gauss(numcutmu, numcutsig)))
    tries = 0
    while fragmentscount < numfragments:
        tries += 1
        txdegradepoint = np.random.exponential(0.005 * txstartbias)
        if strand == '+':
            cutrange = [0, int(round(len(txstr) * (1 - txdegradepoint)))]
        else:
            cutrange = [
                int(round(len(txstr) * txdegradepoint)),
                len(txstr) - 1
            ]
        cutpoints = list(np.random.uniform(0, 1, numcuts))
        cutpositions = []
        # map every uniform draw to the position whose cumulative-score
        # interval contains it (only inside the non-degraded range)
        for cutpoint in cutpoints:
            for i in range(cutrange[0], cutrange[1]):
                if cutpoint > scorecumlist[i] and cutpoint < scorecumlist[i + 1]:
                    cutpositions.append(i)
        cutpositions.sort()
        cutpositions.insert(0, cutrange[0])
        cutpositions.append(cutrange[1] + 1)
        goodfragments = [
            [x[0], x[1] - 1]
            for x in zip(cutpositions[:-1], cutpositions[1:])
            if fragmentsizerange[0] <= x[1] - x[0] + 1 <= fragmentsizerange[1]
        ]
        outfragments += goodfragments
        fragmentscount += len(goodfragments)

    # debug_flg is a module global set by the script entry point; treat it
    # as "off" when it has not been defined (e.g. module imported, not run)
    if globals().get('debug_flg', 0) == 1:
        eltime = time.time() - sttime
        message = 'Start Time=%s' % sttime
        common.printstatus(message, 'S', common.func_name())
        message = 'Average fragment: %6.2f, Tries per read: %6.2f, Reads per second: %10.4f, Num reads: %d' % (
            len(txstr) / (numcuts + 0.1), tries / (numfragments + 0.1),
            numfragments / eltime, numfragments)
        common.printstatus(message, 'S', common.func_name())

    outfragments = random.sample(outfragments, numfragments)
    outfragments.sort()
    return outfragments
# NOTE(review): this call looks like the tail of a definition whose start is
# not visible in this part of the file (jsdlist, mincovlist, numofgenestr and
# imagefilename are not defined here) -- confirm its enclosing scope.
plot.plotscatterwithhistogram(jsdlist,mincovlist,'jsd','mincoverage','mincov-jsd\n%s'%numofgenestr,imagefilename,markertup=('.',5),logscaleyflg=1,logscalexflg=0)
# Script entry point: read the config named by -c/--config, run the configured
# generate method and set up the output folders.
if __name__ == "__main__":
    # module-level debug switch read by other functions of this file
    debug_flg=2
    function_map={'RNAmetasource2source':RNAmetasource2source,'fragment2read':fragment2read}
    usage = 'usage: %prog [options] arg'
    parser = OptionParser(usage)
    parser.add_option('-c', '--config', dest='cfgfile',default='config/sregen.cfg')
    (options, args) = parser.parse_args()
    config_file=options.cfgfile
    cfg=runparam(config_file)
    cfg.parse()
    message='Program Started'
    common.printstatus(message,'S',common.func_name())
    # inputs of the generate method: the meta source files it names plus its parameters
    metasourcefilelist=[cfg.metasourcedict[x] for x in cfg.generate_method_files]
    parameters=cfg.generate_method_parameters
    parameterlist=[metasourcefilelist,parameters]
    genetxcigardict=runfunc(cfg.generate_method,parameterlist) #chr,strand,start,cigar,size
    folderdict=createfoldersetup(cfg.output_dir,cfg.run_name,cfg.numtypes,cfg.numdatasets,config_file,cfg.cutpreferencefile)
    # number of transcripts per gene
    genetxcountdict=dict([(gene,len(genetxcigardict[gene])) for gene in genetxcigardict])
    if cfg.numtypes==0:
        # one simulated expression profile per dataset
        # NOTE(review): the loop body appears to continue beyond what is visible here
        for sample_id in range(1,cfg.numdatasets+1):
            genetxreaddict=getsimulatedoneexpression(genetxcountdict,cfg.readcount)
            replicate_id=1
def _genetranscriptdict2graphstruct(self, gene):
    '''
    Build the graph structure of a gene from its annotated transcripts.

    The transcripts come from self._getgenetranscriptdict()[gene]; every
    transcript entry is indexed as [chr, strand, exonlist, ...] with
    exonlist a coordinate-ordered list of [start, end] exons.  Only
    transcripts on the chr/strand of the first (sorted) transcript key and
    inside the first run of overlapping transcript ranges are used; a
    warning is printed for anything dropped.

    Returns (exonlist, intronlist, splicelist, startnodelist, endnodelist,
             novelnodelist)
        exonlist      : sorted exon regions, split where annotated exons
                        partially overlap
        intronlist    : gaps between consecutive exon regions; a gap that is
                        an annotated splice is split in two halves
        splicelist    : sorted [exon end, next exon start] pairs
        startnodelist : sorted transcript start coordinates
        endnodelist   : sorted transcript end coordinates
        novelnodelist : left midpoint of every split intron
    Alert: Exon Ends may need correction
    '''
    genetranscriptdict = self._getgenetranscriptdict()
    trascriptdict = genetranscriptdict[gene]
    transcriptkeys = sorted(trascriptdict.keys())

    # keep only transcripts located on the chr/strand of the first one
    chrstrand = trascriptdict[transcriptkeys[0]][0:2]
    removelist = []
    for key in transcriptkeys[1:]:
        if chrstrand != trascriptdict[key][0:2]:
            message = 'Gene %s in multiple chromosomes; processing only one location:%s' % (
                gene, str(chrstrand))
            common.printstatus(message, 'W', common.func_name())
            removelist.append(key)
    for key in removelist:
        transcriptkeys.remove(key)

    # order transcripts by [first exon start, last exon end] and keep the
    # leading run of overlapping transcripts
    ztxrangekey = sorted(
        ([trascriptdict[key][2][0][0], trascriptdict[key][2][-1][1]], key)
        for key in transcriptkeys)
    txrangelist = [x[0] for x in ztxrangekey]
    transcriptkeys = [x[1] for x in ztxrangekey]
    trange = txrangelist[0]
    for i in range(1, len(txrangelist)):
        txrange = txrangelist[i]
        if txrange[0] <= trange[1]:
            trange = [trange[0], max(trange[1], txrange[1])]
        else:
            message = 'Gene %s: multiple non-overlapping transcription regions; processing only one region:%s' % (
                gene, str(trange))
            common.printstatus(message, 'W', common.func_name())
            transcriptkeys = transcriptkeys[0:i]
            break

    # collect distinct exons, splices and transcript start/end nodes
    transcriptexonlist = []
    splicelist = []
    startnodelist = []
    endnodelist = []
    for key in transcriptkeys:
        txexons = trascriptdict[key][2]
        if txexons[0][0] not in startnodelist:
            startnodelist.append(txexons[0][0])
        if txexons[-1][1] not in endnodelist:
            endnodelist.append(txexons[-1][1])
        for exon in txexons:
            if exon not in transcriptexonlist:
                transcriptexonlist.append(exon)
        for i in range(1, len(txexons)):
            splice = [txexons[i - 1][1], txexons[i][0]]
            if splice not in splicelist:
                splicelist.append(splice)
    transcriptexonlist = [list(exon) for exon in sorted(transcriptexonlist)]
    splicelist.sort()
    startnodelist.sort()
    endnodelist.sort()

    # sweep over the sorted exons; exonqueue holds the regions that may
    # still overlap the exons to come
    exonlist = []
    exonqueue = transcriptexonlist[0:1]
    for exon in transcriptexonlist[1:]:
        # regions that end before this exon starts are final
        for exonregion in list(exonqueue):
            if exonregion[1] < exon[0]:
                exonlist.append(exonregion)
                exonqueue.remove(exonregion)
        # split every queued region at exon boundaries falling strictly
        # inside it
        for exonregion in list(exonqueue):
            if exon[0] > exonregion[0] and exon[0] < exonregion[1]:
                if exon[1] > exonregion[0] and exon[1] < exonregion[1]:
                    exonqueue.append([exonregion[0], exon[0] - 1])
                    exonqueue.append([exon[0], exon[1]])
                    exonqueue.append([exon[1] + 1, exonregion[1]])
                    exonqueue.remove(exonregion)
                else:
                    exonqueue.append([exonregion[0], exon[0] - 1])
                    exonqueue.append([exon[0], exonregion[1]])
                    exonqueue.remove(exonregion)
            elif exon[1] > exonregion[0] and exon[1] < exonregion[1]:
                exonqueue.append([exonregion[0], exon[1]])
                exonqueue.append([exon[1] + 1, exonregion[1]])
                exonqueue.remove(exonregion)
        exonqueue.sort()
        if len(exonqueue) > 0:
            # the part of the exon reaching past the queued regions
            if exon[1] > exonqueue[-1][1]:
                exonqueue.append([exonqueue[-1][1] + 1, exon[1]])
        else:
            exonqueue.append(exon)
    # flush the queue.  The original test compared exonregion[0] with the
    # region list itself (always true on Python 2, TypeError on Python 3);
    # start <= end keeps every well-formed region, single-base ones included.
    for exonregion in exonqueue:
        if exonregion[0] <= exonregion[1]:
            exonlist.append(exonregion)
    exonlist.sort()

    # introns are the gaps between consecutive exon regions
    intronlist = []
    novelnodelist = []
    for i in range(len(exonlist) - 1):
        gapstart = exonlist[i][1] + 1
        gapend = exonlist[i + 1][0] - 1
        if gapstart < gapend:
            if [exonlist[i][1], exonlist[i + 1][0]] not in splicelist:
                intronlist.append([gapstart, gapend])
            else:
                # annotated splice: split the intron at its midpoint
                # (floor division keeps the coordinate an integer)
                dummyleftnode = (gapstart + gapend) // 2
                dummyrightnode = dummyleftnode + 1
                intronlist.append([gapstart, dummyleftnode])
                intronlist.append([dummyrightnode, gapend])
                novelnodelist.append(dummyleftnode)
    return (exonlist, intronlist, splicelist, startnodelist, endnodelist,
            novelnodelist)
def parse(self):
    '''
    Read self.config_file and store its settings as attributes on self.

    Sections read: [tools], [reference], [project], [Data],
    [compute_params], [Runflags], [Runpreact] / [Runact] (only when the
    matching run flag is 1), [Extractflows], [Runfastfdm] and [Runfullfdm].
    Problems are reported through common.printstatus: status 'F' for a
    missing data file, an unknown sample or a bad project type, status 'W'
    for a sample used in a run definition that is missing from [Data].
    '''
    config = ConfigParser.SafeConfigParser()
    config.read(self.config_file)

    # tools and reference files
    self.pathsamtools = self._checkdir('samtools path', config.get('tools', 'pathsamtools'), common.func_name())
    self.pathucsctools = self._checkdir('ucsc path', config.get('tools', 'pathucsctools'), common.func_name())
    self.pathbedtools = self._checkdir('bedtools path', config.get('tools', 'pathbedtools'), common.func_name())
    self.chromszfile = self._checkfile('chromosome size file', config.get('reference', 'chromszfile'), common.func_name())
    self.chrfaifile = self._checkfile('chromosome index file', config.get('reference', 'chrfaifile'), common.func_name())
    self.annotation_file = self._checkfile('gene annotation file', config.get('reference', 'annotation_file'), common.func_name())
    self.annotation_file_type = config.get('reference', 'annotation_file_type')
    self.root_dir = self._checkdir('root_dir path', config.get('project', 'root_dir'), common.func_name())

    #Data: sample name (upper-cased) -> data file
    data_dict = {}
    for key, datafile in config.items('Data'):
        data_dict[key.upper()] = datafile
    for sample in data_dict.keys():
        if not (os.path.isfile(data_dict[sample])):
            message = 'Config file has error(s): %s does not exist' % data_dict[sample]
            common.printstatus(message, 'F', common.func_name())
    self.data_dict = data_dict
    datakeys = data_dict.keys()
    # directory part of every data file (everything before the last '/')
    self.datadir_dict = dict(
        (key, '/'.join(data_dict[key].split('/')[:-1])) for key in datakeys)

    #Project: groups are separated by '::', samples inside a group by ','
    self.project_name = config.get('project', 'project_name')
    project_groups = config.get('project', 'project_groups')
    self.project_groups = [group.split(',') for group in project_groups.split('::')]
    proj_tslist = []
    for groups in self.project_groups:
        groups.sort()
        proj_tslist += groups
    for ts in proj_tslist:
        if ts not in datakeys:
            message = 'Config file has error(s) in [Project] Groups: %s does not exist in Data' % ts
            common.printstatus(message, 'F', common.func_name())
    project_type = config.get('project', 'project_type')
    if project_type not in ['1', '2']:
        message = 'Config file has error(s) in Project Type should be 1 or 2'
        common.printstatus(message, 'F', common.func_name())
    self.project_type = int(project_type)
    self.run_type = self._checknum(config.get('project', 'run_type'), 'int', common.func_name())

    # compute parameters (cluster_min_med_cov / cluster_min_med_fdm are
    # currently not read)
    self.ffast_min_cov = self._checknum(config.get('compute_params', 'ffast_min_cov'), 'float', common.func_name())
    self.ffast_min_fdm = self._checknum(config.get('compute_params', 'ffast_min_fdm'), 'float', common.func_name())
    self.ffull_partition = self._checknum(config.get('compute_params', 'ffull_partition'), 'int', common.func_name())
    self.ffull_permutation = self._checknum(config.get('compute_params', 'ffull_permutation'), 'int', common.func_name())
    self.ffull_pvalue = self._checknum(config.get('compute_params', 'ffull_pvalue'), 'float', common.func_name())
    self.cluster_max_dbi = self._checknum(config.get('compute_params', 'cluster_max_dbi'), 'float', common.func_name())
    self.ffull_genesplit_size = self._checknum(config.get('compute_params', 'ffull_genesplit_size'), 'int', common.func_name())
    self.report_top_x = self._checknum(config.get('compute_params', 'report_top_x'), 'int', common.func_name())
    self.graph_top_x = self._checknum(config.get('compute_params', 'graph_top_x'), 'int', common.func_name())

    #Run Flag: run_all_flag switches every stage on
    self.run_all_flag = int(config.get('Runflags', 'run_all_flag'))
    self.run_pre_act_flag = max(self.run_all_flag, int(config.get('Runflags', 'run_pre_act_flag')))
    self.run_splice_collate_flag = max(self.run_all_flag, int(config.get('Runflags', 'run_splice_collate_flag')))
    self.run_proj_act_flag = max(self.run_all_flag, int(config.get('Runflags', 'run_proj_act_flag')))
    self.run_extract_flows = max(self.run_all_flag, int(config.get('Runflags', 'run_extract_flows')))
    self.run_fdm_fast_flag = max(self.run_all_flag, int(config.get('Runflags', 'run_fdm_fast_flag')))  #todo filter
    self.run_fdm_full_flag = max(self.run_all_flag, int(config.get('Runflags', 'run_fdm_full_flag')))  # not necessary
    self.run_cluster_flag = max(self.run_all_flag, int(config.get('Runflags', 'run_cluster_flag')))
    self.run_report_flag = max(self.run_all_flag, int(config.get('Runflags', 'run_report_flag')))

    #Runpreact
    if self.run_pre_act_flag == 1:
        self.pre_act_run_list = self._parse_sampleflags(config, 'Runpreact', datakeys, common.func_name())
    else:
        self.pre_act_run_list = []

    #Runact
    if self.run_proj_act_flag == 1:
        self.act_run_list = self._parse_sampleflags(config, 'Runact', datakeys, common.func_name())
    else:
        self.act_run_list = []

    #Extractflows: 'all' / 'chr...' select by name, anything else is a gene file
    ext_flow_gene_file = config.get('Extractflows', 'ext_flow_gene_file')
    if ext_flow_gene_file.lower() == 'all' or ext_flow_gene_file[0:3].lower() == 'chr':
        self.ext_flow_genelist = [ext_flow_gene_file.lower()]
        self.flow_prefix = self.project_name
    else:
        self.ext_flow_genelist = self._parse_genelist(ext_flow_gene_file)
        self.flow_prefix = config.get('Extractflows', 'flow_prefix')

    #Runfastfdm
    self.ffast_prefix = config.get('Runfastfdm', 'ffast_prefix')
    ffast_gene_file = config.get('Runfastfdm', 'ffast_gene_file')
    if ffast_gene_file.lower() == 'all' or ffast_gene_file[0:3].lower() == 'chr':
        self.ffast_genelist = [ffast_gene_file.lower()]
        self.ffast_prefix = self.project_name
    else:
        self.ffast_genelist = self._parse_genelist(ffast_gene_file)
    self.ffast_run_list = self._parse_runlist(config, 'Runfastfdm', 'ffast_run', datakeys, common.func_name())

    # [RunCluster] settings (cluster_prefix, cluster_gene_file,
    # cluster_unknown_flag) are currently not parsed.

    #Runfullfdm
    self.ffull_prefix = config.get('Runfullfdm', 'ffull_prefix')
    ffull_gene_file = config.get('Runfullfdm', 'ffull_gene_file')
    if ffull_gene_file.lower() == 'all' or ffull_gene_file[0:3].lower() == 'chr':
        self.ffull_genelist = [ffull_gene_file.lower()]
        self.ffull_prefix = self.project_name
    elif ffull_gene_file.lower() == 'none':
        self.ffull_genelist = []
    else:
        self.ffull_genelist = self._parse_genelist(ffull_gene_file)
    ffull_run_list = self._parse_runlist(config, 'Runfullfdm', 'ffull_run', datakeys, common.func_name())
    # NOTE(review): the full-FDM run list is gated on run_fdm_fast_flag, not
    # on run_fdm_full_flag; kept as in the original -- confirm it is intended.
    if self.run_fdm_fast_flag == 1:
        self.ffull_run_list = ffull_run_list
    else:
        self.ffull_run_list = []


def _parse_genelist(self, genefile):
    '''Return the first tab-separated column of every line of genefile.'''
    with open(genefile) as fin:
        return [ln.rstrip('\n').split('\t')[0] for ln in fin]


def _parse_sampleflags(self, config, section, datakeys, function_name):
    '''
    Return the sorted (upper-cased) sample names of section whose flag is 1.

    A sample that is not in datakeys is reported with status 'F' under
    function_name.
    '''
    flags = dict((key.upper(), int(value)) for key, value in config.items(section))
    for ts in flags:
        if ts not in datakeys:
            message = 'Config file has error(s) in [%s]: %s does not exist in Data' % (section, ts)
            common.printstatus(message, 'F', function_name)
    return sorted(key for key in flags if flags[key] == 1)


def _parse_runlist(self, config, section, keyprefix, datakeys, function_name):
    '''
    Build the sorted list of comparisons defined in one config section.

    Every option of section whose name starts with keyprefix holds one run
    definition:
        a,b,c      all pairs within a group     -> [[a],[b]] [[a],[c]] [[b],[c]]
        a:b|c:d    explicitly listed pairs      -> [[a],[b]] [[c],[d]]
        a+b::c+d   one pooled comparison        -> [[a,b],[c,d]]
        a,b::c,d   all pairs across two groups  -> [[a],[c]] [[a],[d]] ...
    A sample that is not in datakeys is reported with status 'W' under
    function_name.
    '''
    run_strs = [value for key, value in config.items(section)
                if key[0:len(keyprefix)] == keyprefix]

    allitems = []
    for run_str in run_strs:
        allitems += run_str.replace('::', ',').replace('+', ',').replace(':', ',').replace('|', ',').split(',')
    for item in allitems:
        if item not in datakeys:
            message = 'Config file has error(s): [%s] has incorrect definition;[Data] does not have %s' % (section, item)
            common.printstatus(message, 'W', function_name)

    run_list = []
    for run_str in run_strs:
        if len(run_str.split('::')) == 1:
            if len(run_str.split(',')) > 1:
                # within group a,b,c,d
                run_items = run_str.split(',')
                for i in range(len(run_items) - 1):
                    for j in range(i + 1, len(run_items)):
                        run_list.append([[run_items[i]], [run_items[j]]])
            else:
                # all pairs listed a:b|c:d; extend instead of replacing the
                # list so that pairs from other run keys are not discarded
                for pair in run_str.split('|'):
                    run_list.append([[pair.split(':')[0]], [pair.split(':')[1]]])
        else:
            run_pair = run_str.split('::')
            if len(run_pair[0].split(',')) == 1:
                run_list.append([run_pair[0].split('+'), run_pair[1].split('+')])
            else:
                for item1 in run_pair[0].split(','):
                    for item2 in run_pair[1].split(','):
                        run_list.append([[item1], [item2]])
    run_list.sort()
    return run_list
def _genetranscriptdict2graphstruct(self, gene):
    '''
    Build the graph structure of a gene from its annotated transcripts.

    The transcripts come from self._getgenetranscriptdict()[gene]; every
    transcript entry is indexed as [chr, strand, exonlist, ...] with
    exonlist a coordinate-ordered list of [start, end] exons.  Only
    transcripts on the chr/strand of the first (sorted) transcript key and
    inside the first run of overlapping transcript ranges are used; a
    warning is printed for anything dropped.

    Returns (exonlist, intronlist, splicelist, startnodelist, endnodelist,
             novelnodelist)
        exonlist      : sorted exon regions, split where annotated exons
                        partially overlap
        intronlist    : gaps between consecutive exon regions; a gap that is
                        an annotated splice is split in two halves
        splicelist    : sorted [exon end, next exon start] pairs
        startnodelist : sorted transcript start coordinates
        endnodelist   : sorted transcript end coordinates
        novelnodelist : left midpoint of every split intron
    Alert: Exon Ends may need correction
    '''
    genetranscriptdict = self._getgenetranscriptdict()
    trascriptdict = genetranscriptdict[gene]
    transcriptkeys = sorted(trascriptdict.keys())

    # keep only transcripts located on the chr/strand of the first one
    chrstrand = trascriptdict[transcriptkeys[0]][0:2]
    removelist = []
    for key in transcriptkeys[1:]:
        if chrstrand != trascriptdict[key][0:2]:
            message = 'Gene %s in multiple chromosomes; processing only one location:%s' % (gene, str(chrstrand))
            common.printstatus(message, 'W', common.func_name())
            removelist.append(key)
    for key in removelist:
        transcriptkeys.remove(key)

    # order transcripts by [first exon start, last exon end] and keep the
    # leading run of overlapping transcripts
    ztxrangekey = sorted(
        ([trascriptdict[key][2][0][0], trascriptdict[key][2][-1][1]], key)
        for key in transcriptkeys)
    txrangelist = [x[0] for x in ztxrangekey]
    transcriptkeys = [x[1] for x in ztxrangekey]
    trange = txrangelist[0]
    for i in range(1, len(txrangelist)):
        txrange = txrangelist[i]
        if txrange[0] <= trange[1]:
            trange = [trange[0], max(trange[1], txrange[1])]
        else:
            message = 'Gene %s: multiple non-overlapping transcription regions; processing only one region:%s' % (gene, str(trange))
            common.printstatus(message, 'W', common.func_name())
            transcriptkeys = transcriptkeys[0:i]
            break

    # collect distinct exons, splices and transcript start/end nodes
    transcriptexonlist = []
    splicelist = []
    startnodelist = []
    endnodelist = []
    for key in transcriptkeys:
        txexons = trascriptdict[key][2]
        if txexons[0][0] not in startnodelist:
            startnodelist.append(txexons[0][0])
        if txexons[-1][1] not in endnodelist:
            endnodelist.append(txexons[-1][1])
        for exon in txexons:
            if exon not in transcriptexonlist:
                transcriptexonlist.append(exon)
        for i in range(1, len(txexons)):
            splice = [txexons[i - 1][1], txexons[i][0]]
            if splice not in splicelist:
                splicelist.append(splice)
    transcriptexonlist = [list(exon) for exon in sorted(transcriptexonlist)]
    splicelist.sort()
    startnodelist.sort()
    endnodelist.sort()

    # sweep over the sorted exons; exonqueue holds the regions that may
    # still overlap the exons to come
    exonlist = []
    exonqueue = transcriptexonlist[0:1]
    for exon in transcriptexonlist[1:]:
        # regions that end before this exon starts are final
        for exonregion in list(exonqueue):
            if exonregion[1] < exon[0]:
                exonlist.append(exonregion)
                exonqueue.remove(exonregion)
        # split every queued region at exon boundaries falling strictly
        # inside it
        for exonregion in list(exonqueue):
            if exon[0] > exonregion[0] and exon[0] < exonregion[1]:
                if exon[1] > exonregion[0] and exon[1] < exonregion[1]:
                    exonqueue.append([exonregion[0], exon[0] - 1])
                    exonqueue.append([exon[0], exon[1]])
                    exonqueue.append([exon[1] + 1, exonregion[1]])
                    exonqueue.remove(exonregion)
                else:
                    exonqueue.append([exonregion[0], exon[0] - 1])
                    exonqueue.append([exon[0], exonregion[1]])
                    exonqueue.remove(exonregion)
            elif exon[1] > exonregion[0] and exon[1] < exonregion[1]:
                exonqueue.append([exonregion[0], exon[1]])
                exonqueue.append([exon[1] + 1, exonregion[1]])
                exonqueue.remove(exonregion)
        exonqueue.sort()
        if len(exonqueue) > 0:
            # the part of the exon reaching past the queued regions
            if exon[1] > exonqueue[-1][1]:
                exonqueue.append([exonqueue[-1][1] + 1, exon[1]])
        else:
            exonqueue.append(exon)
    # flush the queue.  The original test compared exonregion[0] with the
    # region list itself (always true on Python 2, TypeError on Python 3);
    # start <= end keeps every well-formed region, single-base ones included.
    for exonregion in exonqueue:
        if exonregion[0] <= exonregion[1]:
            exonlist.append(exonregion)
    exonlist.sort()

    # introns are the gaps between consecutive exon regions
    intronlist = []
    novelnodelist = []
    for i in range(len(exonlist) - 1):
        gapstart = exonlist[i][1] + 1
        gapend = exonlist[i + 1][0] - 1
        if gapstart < gapend:
            if [exonlist[i][1], exonlist[i + 1][0]] not in splicelist:
                intronlist.append([gapstart, gapend])
            else:
                # annotated splice: split the intron at its midpoint
                # (floor division keeps the coordinate an integer)
                dummyleftnode = (gapstart + gapend) // 2
                dummyrightnode = dummyleftnode + 1
                intronlist.append([gapstart, dummyleftnode])
                intronlist.append([dummyrightnode, gapend])
                novelnodelist.append(dummyleftnode)
    return (exonlist, intronlist, splicelist, startnodelist, endnodelist, novelnodelist)
def _checkfile(self, filename, filepath, function_name):
    '''
    Return filepath when it names an existing file.

    Otherwise report "<filename> does not exist" with status 'F' under
    function_name through common.printstatus; nothing is returned then.
    '''
    if os.path.isfile(filepath):
        return filepath
    common.printstatus('%s does not exist' % filename, 'F', function_name)
    return None
def parse(self):
    '''
    Read self.config_file and store its settings as attributes on self.

    Sections read: [tools], [reference], [project], [Data],
    [compute_params], [Runflags], [Runpreact] / [Runact] (only when the
    matching run flag is 1), [Extractflows], [Runfastfdm] and [Runfullfdm].
    Problems are reported through common.printstatus: status 'F' for a
    missing data file, an unknown sample or a bad project type, status 'W'
    for a sample used in a run definition that is missing from [Data].
    '''
    config = ConfigParser.SafeConfigParser()
    config.read(self.config_file)

    # tools and reference files
    self.pathsamtools=self._checkdir('samtools path',config.get('tools','pathsamtools'),common.func_name())
    self.pathucsctools=self._checkdir('ucsc path',config.get('tools','pathucsctools'),common.func_name())
    self.pathbedtools=self._checkdir('bedtools path',config.get('tools','pathbedtools'),common.func_name())
    self.chromszfile=self._checkfile('chromosome size file',config.get('reference','chromszfile'),common.func_name())
    self.chrfaifile=self._checkfile('chromosome index file',config.get('reference','chrfaifile'),common.func_name())
    self.annotation_file=self._checkfile('gene annotation file',config.get('reference','annotation_file'),common.func_name())
    self.annotation_file_type=config.get('reference','annotation_file_type')
    self.root_dir=self._checkdir('root_dir path',config.get('project','root_dir'),common.func_name())

    #Data: sample name (upper-cased) -> data file
    data_dict={}
    for key,datafile in config.items('Data'):
        data_dict[key.upper()]=datafile
    for sample in data_dict.keys():
        if not(os.path.isfile(data_dict[sample])):
            message='Config file has error(s): %s does not exist'%data_dict[sample]
            common.printstatus(message,'F',common.func_name())
    self.data_dict=data_dict
    datakeys=data_dict.keys()
    # directory part of every data file (everything before the last '/')
    self.datadir_dict=dict((key,'/'.join(data_dict[key].split('/')[:-1])) for key in datakeys)

    #Project: groups are separated by '::', samples inside a group by ','
    self.project_name=config.get('project','project_name')
    project_groups=config.get('project','project_groups')
    self.project_groups=[group.split(',') for group in project_groups.split('::')]
    proj_tslist=[]
    for groups in self.project_groups:
        groups.sort()
        proj_tslist+=groups
    for ts in proj_tslist:
        if ts not in datakeys:
            message='Config file has error(s) in [Project] Groups: %s does not exist in Data'%ts
            common.printstatus(message,'F',common.func_name())
    project_type=config.get('project','project_type')
    if project_type not in ['1','2']:
        message='Config file has error(s) in Project Type should be 1 or 2'
        common.printstatus(message,'F',common.func_name())
    self.project_type=int(project_type)
    self.run_type=self._checknum(config.get('project','run_type'),'int',common.func_name())

    # compute parameters (cluster_min_med_cov / cluster_min_med_fdm are
    # currently not read)
    self.ffast_min_cov=self._checknum(config.get('compute_params','ffast_min_cov'),'float',common.func_name())
    self.ffast_min_fdm=self._checknum(config.get('compute_params','ffast_min_fdm'),'float',common.func_name())
    self.ffull_partition=self._checknum(config.get('compute_params','ffull_partition'),'int',common.func_name())
    self.ffull_permutation=self._checknum(config.get('compute_params','ffull_permutation'),'int',common.func_name())
    self.ffull_pvalue=self._checknum(config.get('compute_params','ffull_pvalue'),'float',common.func_name())
    self.cluster_max_dbi=self._checknum(config.get('compute_params','cluster_max_dbi'),'float',common.func_name())
    self.ffull_genesplit_size=self._checknum(config.get('compute_params','ffull_genesplit_size'),'int',common.func_name())
    self.report_top_x=self._checknum(config.get('compute_params','report_top_x'),'int',common.func_name())
    self.graph_top_x=self._checknum(config.get('compute_params','graph_top_x'),'int',common.func_name())

    #Run Flag: run_all_flag switches every stage on
    self.run_all_flag=int(config.get('Runflags','run_all_flag'))
    self.run_pre_act_flag=max(self.run_all_flag,int(config.get('Runflags','run_pre_act_flag')))
    self.run_splice_collate_flag=max(self.run_all_flag,int(config.get('Runflags','run_splice_collate_flag')))
    self.run_proj_act_flag=max(self.run_all_flag,int(config.get('Runflags','run_proj_act_flag')))
    self.run_extract_flows=max(self.run_all_flag,int(config.get('Runflags','run_extract_flows')))
    self.run_fdm_fast_flag=max(self.run_all_flag,int(config.get('Runflags','run_fdm_fast_flag'))) #todo filter
    self.run_fdm_full_flag=max(self.run_all_flag,int(config.get('Runflags','run_fdm_full_flag'))) # not necessary
    self.run_cluster_flag=max(self.run_all_flag,int(config.get('Runflags','run_cluster_flag')))
    self.run_report_flag=max(self.run_all_flag,int(config.get('Runflags','run_report_flag')))

    #Runpreact
    if self.run_pre_act_flag==1:
        self.pre_act_run_list=self._parse_sampleflags(config,'Runpreact',datakeys,common.func_name())
    else:
        self.pre_act_run_list=[]

    #Runact
    if self.run_proj_act_flag==1:
        self.act_run_list=self._parse_sampleflags(config,'Runact',datakeys,common.func_name())
    else:
        self.act_run_list=[]

    #Extractflows: 'all' / 'chr...' select by name, anything else is a gene file
    ext_flow_gene_file=config.get('Extractflows','ext_flow_gene_file')
    if ext_flow_gene_file.lower()=='all' or ext_flow_gene_file[0:3].lower()=='chr':
        self.ext_flow_genelist=[ext_flow_gene_file.lower()]
        self.flow_prefix=self.project_name
    else:
        self.ext_flow_genelist=self._parse_genelist(ext_flow_gene_file)
        self.flow_prefix=config.get('Extractflows','flow_prefix')

    #Runfastfdm
    self.ffast_prefix=config.get('Runfastfdm','ffast_prefix')
    ffast_gene_file=config.get('Runfastfdm','ffast_gene_file')
    if ffast_gene_file.lower()=='all' or ffast_gene_file[0:3].lower()=='chr':
        self.ffast_genelist=[ffast_gene_file.lower()]
        self.ffast_prefix=self.project_name
    else:
        self.ffast_genelist=self._parse_genelist(ffast_gene_file)
    self.ffast_run_list=self._parse_runlist(config,'Runfastfdm','ffast_run',datakeys,common.func_name())

    # [RunCluster] settings (cluster_prefix, cluster_gene_file,
    # cluster_unknown_flag) are currently not parsed.

    #Runfullfdm
    self.ffull_prefix=config.get('Runfullfdm','ffull_prefix')
    ffull_gene_file=config.get('Runfullfdm','ffull_gene_file')
    if ffull_gene_file.lower()=='all' or ffull_gene_file[0:3].lower()=='chr':
        self.ffull_genelist=[ffull_gene_file.lower()]
        self.ffull_prefix=self.project_name
    elif ffull_gene_file.lower()=='none':
        self.ffull_genelist=[]
    else:
        self.ffull_genelist=self._parse_genelist(ffull_gene_file)
    ffull_run_list=self._parse_runlist(config,'Runfullfdm','ffull_run',datakeys,common.func_name())
    # NOTE(review): the full-FDM run list is gated on run_fdm_fast_flag, not
    # on run_fdm_full_flag; kept as in the original -- confirm it is intended.
    if self.run_fdm_fast_flag==1:
        self.ffull_run_list=ffull_run_list
    else:
        self.ffull_run_list=[]


def _parse_genelist(self,genefile):
    '''Return the first tab-separated column of every line of genefile.'''
    with open(genefile) as fin:
        return [ln.rstrip('\n').split('\t')[0] for ln in fin]


def _parse_sampleflags(self,config,section,datakeys,function_name):
    '''
    Return the sorted (upper-cased) sample names of section whose flag is 1.

    A sample that is not in datakeys is reported with status 'F' under
    function_name.
    '''
    flags=dict((key.upper(),int(value)) for key,value in config.items(section))
    for ts in flags:
        if ts not in datakeys:
            message='Config file has error(s) in [%s]: %s does not exist in Data'%(section,ts)
            common.printstatus(message,'F',function_name)
    return sorted(key for key in flags if flags[key]==1)


def _parse_runlist(self,config,section,keyprefix,datakeys,function_name):
    '''
    Build the sorted list of comparisons defined in one config section.

    Every option of section whose name starts with keyprefix holds one run
    definition:
        a,b,c      all pairs within a group     -> [[a],[b]] [[a],[c]] [[b],[c]]
        a:b|c:d    explicitly listed pairs      -> [[a],[b]] [[c],[d]]
        a+b::c+d   one pooled comparison        -> [[a,b],[c,d]]
        a,b::c,d   all pairs across two groups  -> [[a],[c]] [[a],[d]] ...
    A sample that is not in datakeys is reported with status 'W' under
    function_name.
    '''
    run_strs=[value for key,value in config.items(section)
              if key[0:len(keyprefix)]==keyprefix]

    allitems=[]
    for run_str in run_strs:
        allitems+=run_str.replace('::',',').replace('+',',').replace(':',',').replace('|',',').split(',')
    for item in allitems:
        if item not in datakeys:
            message='Config file has error(s): [%s] has incorrect definition;[Data] does not have %s'%(section,item)
            common.printstatus(message,'W',function_name)

    run_list=[]
    for run_str in run_strs:
        if len(run_str.split('::'))==1:
            if len(run_str.split(','))>1:
                # within group a,b,c,d
                run_items=run_str.split(',')
                for i in range(len(run_items)-1):
                    for j in range(i+1,len(run_items)):
                        run_list.append([[run_items[i]],[run_items[j]]])
            else:
                # all pairs listed a:b|c:d; extend instead of replacing the
                # list so that pairs from other run keys are not discarded
                for pair in run_str.split('|'):
                    run_list.append([[pair.split(':')[0]],[pair.split(':')[1]]])
        else:
            run_pair=run_str.split('::')
            if len(run_pair[0].split(','))==1:
                run_list.append([run_pair[0].split('+'),run_pair[1].split('+')])
            else:
                for item1 in run_pair[0].split(','):
                    for item2 in run_pair[1].split(','):
                        run_list.append([[item1],[item2]])
    run_list.sort()
    return run_list
def __init__(self, config_file):
    """Store the configuration file path after checking that it exists.

    config_file: path of the configuration file.

    When the path is not an existing regular file, a status message of
    type 'F' is sent through common.printstatus and self.config_file is
    left unset.
    """
    if not os.path.isfile(config_file):
        message = 'Config file %s does not exist' % config_file
        common.printstatus(message, 'F', common.func_name())
        return
    self.config_file = config_file
def gensimulatedreadsdata(sample_id,replicate_id,folderdict,genetxreaddict,genetxcigardict,config_object):
    """
    Simulate reads for one sample/replicate and write them as SAM and BAM files.

    input:
        sample_id, replicate_id: integers used to build the T%02dS%02d folder and file names
        folderdict: folderdict['data'][2] is the root of the per-sample alignment folders,
                    folderdict['metadata'] receives the expression_T..S...txt file
        genetxreaddict, genetxcigardict: per gene / transcript dictionaries; they are handed to
                    genetxreads2txgenereads and genetxdict2chrgenedict, and
                    genetxcigardict[gene][tx][2:4] is used as (start base, cigar string)
        config_object: supplies read_extract_parameters, txstartbias, cutpreferencefile,
                    reference_genome_dir, read_extract_method, readqualitydeteriorationrate,
                    indelrate, pathsamtools and chrfaifile
    output:
        1 (always). Side effects: writes the expression metadata file, alignments.sam and
        alignments_with_errors.sam, and converts them to BAM with "samtools view -bt"
        (the error BAM only when an error rate is above zero).

    All files are opened through "with" so that the handles (including the per-chromosome
    FASTA handle) are released even when an exception interrupts the simulation.
    """
    if debug_flg==2:
        funcstarttime=time.time()
    fragmentsizerange=[int(config_object.read_extract_parameters[2]),int(config_object.read_extract_parameters[3])]
    txstartbias=config_object.txstartbias
    with open(config_object.cutpreferencefile) as cutfile:
        cutscoredict=dict([(ln.split('\t')[0],float(ln.rstrip('\n').split('\t')[1])) for ln in cutfile])
    alignsam='%s/T%02dS%02d/alignments.sam'%(folderdict['data'][2],sample_id,replicate_id)
    alignbam='%s/T%02dS%02d/alignments.bam'%(folderdict['data'][2],sample_id,replicate_id)
    alignerrsam='%s/T%02dS%02d/alignments_with_errors.sam'%(folderdict['data'][2],sample_id,replicate_id)
    alignerrbam='%s/T%02dS%02d/alignments_with_errors.bam'%(folderdict['data'][2],sample_id,replicate_id)
    # reads with errors are only produced when at least one error rate is positive
    adderrors=max(config_object.readqualitydeteriorationrate,config_object.indelrate)>0.0
    # both SAM files are created up front, so an (empty) error SAM exists even without errors
    with open(alignsam,'w') as alignfout, open(alignerrsam,'w') as alignerrfout:
        txgenereaddict=genetxreads2txgenereads(genetxreaddict,genetxcigardict)
        metadatafile='%s/expression_T%02dS%02d.txt'%(folderdict['metadata'],sample_id,replicate_id)
        with open(metadatafile,'w') as fout:
            for gene in txgenereaddict:
                for tx in txgenereaddict[gene]:
                    fout.write('%s:%s:%s\t%s\tTX\t%d\t%6.4f\t%d\t%d\t%s\n'%tuple(txgenereaddict[gene][tx]))
        chrgenedict=genetxdict2chrgenedict(genetxcigardict)
        chrlist=sorted(chrgenedict.keys())
        if debug_flg==2:
            tottxstrgettime=0
            totfraglistgettime=0
            totreadlistgettime=0
            toterrorreadgettime=0
        for chrname in chrlist:
            chrfafile='%s/%s.fa'%(config_object.reference_genome_dir,chrname)
            if not(os.path.isfile(chrfafile)):
                message='%s: not found'%chrfafile
                common.printstatus(message,'S',common.func_name())
                continue
            with open(chrfafile) as chrfafileptr:
                for gene in chrgenedict[chrname]:
                    for tx in genetxcigardict[gene]:
                        if debug_flg==2:
                            loopstarttime=time.time()
                        numreads=txgenereaddict[gene][tx][6]
                        startbase,cigarstr=genetxcigardict[gene][tx][2],genetxcigardict[gene][tx][3]
                        txstr=getRNAtranscriptstring(chrfafileptr,startbase,cigarstr)
                        if debug_flg==2:
                            # NOTE(review): the per-transcript status line is treated as part of
                            # the timing-debug output -- confirm against the original layout
                            txstrgettime=time.time()
                            tottxstrgettime+=txstrgettime-loopstarttime
                            message='Gene:%s, Transcript:%s; Transcript Size:%d, Num reads=%d'%(gene,tx,len(txstr),numreads)
                            common.printstatus(message,'S',common.func_name())
                        # transcripts shorter than the smallest fragment cannot yield reads
                        if len(txstr)<fragmentsizerange[0]:
                            continue
                        #empirical number of cuts
                        numcutmu=max(1,int(len(txstr)*4.0/(fragmentsizerange[0]+fragmentsizerange[1])))
                        numcutsig=numcutmu/2.0
                        fragmentlist=getfragments(txstr,numreads,txgenereaddict[gene][tx][2],cutscoredict,fragmentsizerange,txstartbias,numcutmu,numcutsig)
                        if debug_flg==2:
                            fraglistgettime=time.time()
                            totfraglistgettime+=fraglistgettime-txstrgettime
                        allparameterlist=[txstr,cigarstr,fragmentlist,config_object.read_extract_parameters]
                        readlist=runfunc(config_object.read_extract_method,allparameterlist)
                        if debug_flg==2:
                            readlistgettime=time.time()
                            totreadlistgettime+=readlistgettime-fraglistgettime
                        inum=0
                        for read in readlist:
                            rstartbase,readstr,readcigar=read
                            # offset of the read start relative to startbase: sum of the cigar
                            # segment lengths covering the first rstartbase+1 transcript bases
                            readstartlocation=sum([int(x) for x in common.cigarsubstr(cigarstr,0,rstartbase+1).replace('M','N').split('N')[:-1]])-1
                            inum+=1
                            alignfout.write('%s:%d:%d\t1\t%s\t%d\t0\t%s\t*\t0\t0\t%s\t*\n'%
                                            (tx,inum,rstartbase,chrname,startbase+readstartlocation,readcigar,readstr))
                            if adderrors:
                                errreadstr,errcigarstr=adderrortoread(readstr,readcigar,config_object.readqualitydeteriorationrate,config_object.indelrate)
                                alignerrfout.write('%s:%d:%d\t1\t%s\t%d\t0\t%s\t*\t0\t0\t%s\t*\n'%
                                                   (tx,inum,rstartbase,chrname,startbase+readstartlocation,errcigarstr,errreadstr))
                        if debug_flg==2:
                            errorreadgettime=time.time()
                            toterrorreadgettime+=errorreadgettime-readlistgettime
    if debug_flg==2:
        functime=time.time()-funcstarttime
        othertime=functime-(tottxstrgettime+totfraglistgettime+totreadlistgettime+toterrorreadgettime)
        message='\n\nFunc Time=%s,Txstr=%4.2f,Fragtime=%4.2f,Readtime=%4.2f,Errtime=%4.2f,Oth=%4.2f\n\n'% \
                (functime,tottxstrgettime*100/functime,totfraglistgettime*100/functime,
                 totreadlistgettime*100/functime,toterrorreadgettime*100/functime,othertime*100/functime)
        common.printstatus(message,'S',common.func_name())
    cmd='%s/samtools view -bt %s %s >%s'%(config_object.pathsamtools,config_object.chrfaifile,alignsam,alignbam)
    message='Running %s'%cmd
    common.printstatus(message,'S',common.func_name())
    os.system(cmd)
    if adderrors:
        cmd='%s/samtools view -bt %s %s >%s'%(config_object.pathsamtools,config_object.chrfaifile,alignerrsam,alignerrbam)
        message='Running %s'%cmd
        common.printstatus(message,'S',common.func_name())
        os.system(cmd)
    return 1
def getfragments(txstr,numfragments,strand,cutscoredict,fragmentsizerange,txstartbias,numcutmu,numcutsig):
    """
    Randomly cut a transcript and return numfragments fragments of acceptable size.

    input:
        txstr: transcript string
        numfragments: number of fragments output
        strand: strand of transcript; '+' keeps the start of the transcript in the cut range,
                any other value keeps the end
        cutscoredict: dict(xmer:score) giving the cut score of positions where the xmer
                occurs, default 1. May be empty, in which case every position scores 1.
        fragmentsizerange: [min,max] size of fragments output
        txstartbias: scales the exponential that shortens the cut range
        numcutmu,numcutsig: mean and standard deviation of the gaussian number of cuts
                (never fewer than numcutmu cuts are used)
    output:
        fragments: sorted list of [start,end]; [] when numfragments is 0

    The sampling loop repeats until enough fragments are collected, so it does not
    terminate if no fragment can ever fall inside fragmentsizerange.
    """
    outfragments=[]
    fragmentscount=0
    if numfragments==0:
        return outfragments
    sttime=time.time()
    #compute cutpoints and probabilities
    scorelist=[1]*len(txstr)
    # an empty score table leaves every position at the default score
    # (min() of an empty sequence would raise ValueError)
    if cutscoredict:
        minxmer=min([len(xmer) for xmer in cutscoredict])
        for i in range(len(txstr)-minxmer+1):
            for xmer in cutscoredict:
                # the score is stored at the position where the xmer starts
                if txstr[i:i+len(xmer)]==xmer:
                    scorelist[i]=cutscoredict[xmer]
                    break
    # normalising constant computed once instead of once per position
    scoretotal=sum(scorelist)
    scorecumlist=[x*1.0/scoretotal for x in np.cumsum(scorelist)]
    scorecumlist.insert(0,0)
    numcuts=max(numcutmu,int(random.gauss(numcutmu,numcutsig)))
    tries=0
    while fragmentscount<numfragments:
        tries+=1
        txdegradepoint=np.random.exponential(0.005*txstartbias)
        if strand=='+':
            cutrange=[0,int(round(len(txstr)*(1-txdegradepoint)))]
        else:
            cutrange=[int(round(len(txstr)*txdegradepoint)),len(txstr)-1]
        cutpoints=list(np.random.uniform(0,1,numcuts))
        cutpositions=[]
        for cutpoint in cutpoints:
            # map the uniform draw to the position whose cumulative-score bin contains it
            for i in range(cutrange[0],cutrange[1]):
                if cutpoint>scorecumlist[i] and cutpoint<scorecumlist[i+1]:
                    cutpositions.append(i)
        cutpositions.sort()
        cutpositions.insert(0,cutrange[0])
        cutpositions.append(cutrange[1]+1)
        goodfragments=[[x[0],x[1]-1] for x in zip(cutpositions[:-1],cutpositions[1:])
                       if fragmentsizerange[0]<=x[1]-x[0]+1<=fragmentsizerange[1]]
        outfragments+=goodfragments
        fragmentscount+=len(goodfragments)
    if debug_flg==1:
        eltime=time.time()-sttime
        message='Start Time=%s'%sttime
        common.printstatus(message,'S',common.func_name())
        message='Average fragment: %6.2f, Tries per read: %6.2f, Reads per second: %10.4f, Num reads: %d'%(len(txstr)/(numcuts+0.1),tries/(numfragments+0.1),numfragments/eltime,numfragments)
        common.printstatus(message,'S',common.func_name())
    # the loop may overshoot; keep exactly numfragments of the collected fragments
    outfragments=random.sample(outfragments,numfragments)
    outfragments.sort()
    return outfragments
def Toimage(self,gene, imagedir,genomerange=[0,10000000000],highlightnodelist=[],ext='pdf',inwgs=[]):
    """
    Draw the weighted splice graph of a gene as a vertical diagram and save it to a file.

    gene: gene identifier; used in the label/file name and passed to self.getwtgraphstruct
    imagedir: directory the image is written to
    genomerange: [start,end]; exons, introns, splices and nodes outside it are skipped.
                 The default value means "no restriction" and keeps the range out of the label.
    highlightnodelist: node positions whose labels are drawn in red
    ext: file extension used for the saved image
    inwgs: weighted graph structure (9-tuple of exon, intron, splice, start node, end node,
           novel node lists followed by exon, intron and splice weight lists); when left as []
           it is obtained from self.getwtgraphstruct(gene)

    Returns None. The image is saved as <imagedir>/<label>.<ext> where ':' in the label is
    replaced by '__' and '/' by '_'. A warning is printed when saving fails or when fewer
    than two points fall inside genomerange (nothing is drawn in that case).

    The list defaults are only read, never mutated, so sharing them between calls is harmless.
    """
    ts=self.name.split('/')[-1][:-4]
    #image rendering constants
    hconst=30;gwidth=100.0;exonplotdistbasis=10.0;msize=10;hoffset=10;woffset=50
    if inwgs==[]:
        wtgraphstruct=self.getwtgraphstruct(gene)
    else:
        wtgraphstruct=inwgs
    exonlist,intronlist,splicelist,startnodelist,endnodelist,novelnodelist,exonwtlist,intronwtlist,splicewtlist=wtgraphstruct
    if genomerange!=[0,10000000000]:
        glabel='%s:%s:%d-%d'%(ts,gene,genomerange[0],genomerange[1])
    else:
        glabel='%s:%s'%(ts,gene)
    imgsz=len(exonlist)+len(intronlist)
    if imgsz>100:
        message='Image too large for %s. Printing first few exons'%glabel
        common.printstatus(message,'W',common.func_name(),1)
        # restrict the drawing to the span of the first 30 exons
        genomerange=[exonlist[0:30][0][0]-1,exonlist[0:30][-1][1]+1]
        glabel='%s:%s:%d-%d'%(ts,gene,genomerange[0],genomerange[1])

    # collect every genomic position that becomes a node on the vertical axis
    allpoints=[]
    for splice in splicelist:
        if splice[0]<genomerange[0] or splice[1]>genomerange[1]:
            continue
        for point in (splice[0],splice[1]):
            if point not in allpoints:
                allpoints.append(point)
    for nodelist in (startnodelist,endnodelist):
        for node in nodelist:
            if node<genomerange[0] or node>genomerange[1]:
                continue
            if node not in allpoints:
                allpoints.append(node)
    for segmentlist in (exonlist,intronlist):
        for segment in segmentlist:
            if segment[0]<genomerange[0] or segment[1]>genomerange[1]:
                continue
            # an endpoint within one base of an existing point shares that point
            for point in (segment[0],segment[1]):
                if (point not in allpoints) and (point-1 not in allpoints) and (point+1 not in allpoints):
                    allpoints.append(point)
    allpoints.sort()

    pointdistlist=[(allpoints[i]-allpoints[i-1]) for i in range(1,len(allpoints))]
    if not pointdistlist:
        # fewer than two points: min() below would fail and there is nothing to draw
        message='Nothing to draw for %s'%glabel
        common.printstatus(message,'W',common.func_name(),1)
        return
    # distances are log-compressed so long introns do not dominate the picture
    mindist=max(exonplotdistbasis,min(pointdistlist))
    plotdistlist=[math.log(max(x,mindist),mindist) for x in pointdistlist]
    #equidistant
    #plotdistlist=[1 for x in pointdistlist]
    runtotal=0
    allpointsy=[runtotal]
    for point in plotdistlist:
        runtotal+=point
        allpointsy.append(runtotal*hconst)
    pointplotdict=dict(zip(allpoints,allpointsy))
    gheight=sum(plotdistlist)*hconst

    def nearesty(position):
        # y coordinate of the point at position, or of the point one base away it was merged into
        for candidate in (position,position-1,position+1):
            if candidate in pointplotdict:
                return pointplotdict[candidate]
        raise KeyError(position)

    fig=plt.figure(figsize=(4,4*gheight/gwidth))
    try:
        ax=fig.add_subplot(111)
        ax.axis('off')
        #plot exons (blue) and introns (cyan) as vertical segments labelled with their weight
        for segmentlist,weightlist,linestyle in ((exonlist,exonwtlist,'b-'),(intronlist,intronwtlist,'c-')):
            for i in range(len(segmentlist)):
                segment=segmentlist[i]
                if segment[0]<genomerange[0] or segment[1]>genomerange[1]:
                    continue
                y1=nearesty(segment[0])
                y2=nearesty(segment[1])
                ax.plot([woffset,woffset],[y1+hoffset,y2+hoffset],linestyle,markersize=msize)
                ax.text(woffset-1,(y1+y2)/2+hoffset,'%5.1f'%weightlist[i],horizontalalignment='right',verticalalignment='center',rotation=270)

        #plot splices with positive weight as dashed arcs; widths are scaled by the widest arc
        visiblesplices=[]
        for i in range(len(splicelist)):
            splice=splicelist[i]
            if splice[0]<genomerange[0] or splice[1]>genomerange[1]:
                continue
            if splicewtlist[i]>0:
                visiblesplices.append((i,pointplotdict[splice[0]],pointplotdict[splice[1]]))
        maxawidth=0
        for i,y1,y2 in visiblesplices:
            awidth=gwidth/gheight*(y2-y1)
            if awidth>maxawidth:
                maxawidth=awidth
        for i,y1,y2 in visiblesplices:
            center=[woffset,(y1+y2)/2+hoffset]
            awidth=gwidth*2/gheight*(y2-y1)*(gwidth/maxawidth)
            arc=Arc(xy=center, width=awidth, height=y2-y1, angle=0, theta1=270, theta2=90,lw=2,color='green',ls='dashed')
            ax.add_artist(arc)
            ax.text(woffset+awidth/2+1,(y1+y2)/2+hoffset,splicewtlist[i],horizontalalignment='left',verticalalignment='center', rotation=270)

        #plot nodes
        #Todo text for points
        for i in range(len(allpoints)):
            node=allpoints[i]
            if node in startnodelist:
                ax.plot(woffset,allpointsy[i]+hoffset,marker='s',markerfacecolor='k',fillstyle='bottom',markersize=msize)
            elif node in endnodelist:
                ax.plot(woffset,allpointsy[i]+hoffset,marker='s',markerfacecolor='k',fillstyle='top',markersize=msize)
            elif node in novelnodelist:
                ax.plot(woffset,allpointsy[i]+hoffset,marker='o',markerfacecolor='white',markersize=msize)
            else:
                ax.plot(woffset,allpointsy[i]+hoffset,marker='o',markerfacecolor='k',fillstyle='full',markersize=msize)
            if node in highlightnodelist:
                ax.text(woffset-msize,allpointsy[i]+hoffset,node,horizontalalignment='right',verticalalignment='center',color='red')
            else:
                ax.text(woffset-msize,allpointsy[i]+hoffset,node,horizontalalignment='right',verticalalignment='center')
        ax.text(0,max(allpointsy)+hoffset+4*msize,glabel,horizontalalignment='left',verticalalignment='center')
        ax.set_ylim(0, max(allpointsy)+hoffset+100)
        ax.set_xlim(0, gwidth+woffset)
        ax.set_aspect('equal')
        try:
            plt.savefig('%s/%s.%s'%(imagedir,glabel.replace(':','__').replace('/','_'),ext),bbox_inches='tight', pad_inches=0.1)
        except Exception:
            # best effort: a failed save is reported and skipped, but Ctrl-C / SystemExit
            # are no longer swallowed
            message='Image too large for %s'%glabel
            common.printstatus(message,'W',common.func_name(),1)
    finally:
        # pyplot keeps every figure alive until it is closed; release it so that
        # rendering many genes does not accumulate figures in memory
        plt.close(fig)
def _checkdir(self, dirname, dirpath, function_name):
    """Return dirpath when it exists, otherwise report it as missing.

    dirname: label used in the message.
    dirpath: path that is checked with os.path.exists.
    function_name: caller name forwarded to common.printstatus.

    On a missing path a status message of type 'F' is emitted and the
    function falls through, returning None.
    """
    if os.path.exists(dirpath):
        return dirpath
    message = '%s does not exist' % dirname
    common.printstatus(message, 'F', function_name)
def wgs2problem(self,wgs,readsz=100):
    """
    Turn a weighted graph structure into the least-squares system A*X ~ B.

    wgs: 9-tuple (exonlist,intronlist,splicelist,startnodelist,endnodelist,novelnodelist,
         exonwtlist,intronwtlist,splicewtlist)
    readsz: read size forwarded to self.getcorrectionwt

    X (nx1) = description of the variables, one list per variable:
              [2,start,end,wt] for every exon/intron, [3,start,end,wt] for every splice,
              [1,node,node,0.0] for every tx start and [-1,node,node,0.0] for every tx end
    B (mx1) = w*wt for each exon/intron/splice row (w = correction weight),
              0 for each node row comparing inflow and outflow,
              5*txwt for each tx start/end row, where txwt is the non-negative
              outflow-inflow (start) or inflow-outflow (end) difference
    A (mxn) = set of equations, one row per entry of B

    Returns the tuple (A,B,X).
    """
    weq=100.0
    X=[]; B=[]; A=[]
    exonlist,intronlist,splicelist,startnodelist,endnodelist,novelnodelist,exonwtlist,intronwtlist,splicewtlist=wgs
    sparsegraphdict,nodedict=wgs2sparsegraphdict(wgs)
    # node type 2 marks a transcript start, type 3 a transcript end
    txstartlist=sorted([nodedict[x][0] for x in nodedict if nodedict[x][1]==2])
    txendlist=sorted([nodedict[x][0] for x in nodedict if nodedict[x][1]==3])
    # sorted(zip(...)) gives the same ordering as zip(...).sort() and also works when
    # zip returns an iterator
    exonwttuplelist=sorted(zip(exonlist+intronlist,exonwtlist+intronwtlist))
    splicewttuplist=sorted(zip(splicelist,splicewtlist))

    # column index of every exon-type (edge type 1) and splice-type (edge type 2) edge
    exondict={}; splicedict={}
    exonidx=0;spliceidx=0
    for node1 in sorted(sparsegraphdict.keys()):
        for node2 in sorted(sparsegraphdict[node1].keys()):
            if sparsegraphdict[node1][node2][1]==1:
                exondict[(node1,node2)]=exonidx
                exonidx+=1
            if sparsegraphdict[node1][node2][1]==2:
                splicedict[(node1,node2)]=spliceidx
                spliceidx+=1
    # reverse adjacency: incomingedgedict[target][source] = copy of the edge data
    incomingedgedict={}
    for node1 in sorted(sparsegraphdict.keys()):
        for node2 in sorted(sparsegraphdict[node1].keys()):
            if node2 not in incomingedgedict:
                incomingedgedict[node2]={}
            incomingedgedict[node2][node1]=sparsegraphdict[node1][node2][0:]

    numvar=len(exonwttuplelist)+len(splicewttuplist)+len(txstartlist)+len(txendlist)
    # one equation per exon/intron pulling its variable towards the observed weight
    for exon,wt in exonwttuplelist:
        X.append([2,exon[0],exon[1],wt])
        txstend=0
        if exon[0] in txstartlist:
            txstend+=1
        if exon[1] in txendlist:
            txstend+=1
        if exon in exonlist:
            edgetype=1
        else:
            edgetype=2
        w=self.getcorrectionwt(exon[1]-exon[0], txstend, wt, readsz,edgetype)
        row=[0]*numvar
        row[len(X)-1]=w
        A.append(row)
        B.append(w*wt)
        #B.append(w*math.log10(1.0+exon[2]))
    # one equation per splice
    for splice,wt in splicewttuplist:
        X.append([3,splice[0],splice[1],wt])
        edgetype=0
        w=self.getcorrectionwt(0, 0, wt, readsz,edgetype)
        row=[0]*numvar
        row[len(X)-1]=w
        A.append(row)
        B.append(w*wt)
    for node in txstartlist:
        X.append([1,node,node,0.0])
    for node in txendlist:
        X.append([-1,node,node,0.0])

    # flow-conservation equation for every node
    nodelist=sorted(nodedict.keys())
    lastnodeidx=len(nodelist)-1
    for nodeidx,node in enumerate(nodelist):
        # the first and last node are skipped unless they are a tx start/end
        if (nodeidx==0) and (nodedict[node][1] not in [2,3]):
            continue
        if (nodeidx==lastnodeidx) and (nodedict[node][1] not in [2,3]):
            continue
        row=[0]*numvar
        txstflg=0; txendflg=0; inwt=0.0; outwt=0.0
        if nodedict[node][1]==2:
            idx=len(exondict)+len(splicedict)+txstartlist.index(nodedict[node][0])
            row[idx]=-1*weq
            txstflg=1
            txidx=idx
        if nodedict[node][1]==3:
            idx=len(exondict)+len(splicedict)+len(txstartlist)+txendlist.index(nodedict[node][0])
            row[idx]=weq
            txendflg=1
            txidx=idx
        # outgoing edges enter with +weq
        if node in sparsegraphdict:
            for node2 in sparsegraphdict[node]:
                edge=(node,node2)
                if edge in exondict:
                    idx=exondict[edge]
                    row[idx]=weq
                    outwt+=sparsegraphdict[node][node2][0]
                elif edge in splicedict:
                    idx=len(exondict)+splicedict[edge]
                    row[idx]=weq
                    outwt+=sparsegraphdict[node][node2][0]
                else:
                    message='edge not found %d-%d'%(node,node2)
                    common.printstatus(message,'W',common.func_name(),1)
        # incoming edges enter with -weq
        if node in incomingedgedict:
            for node2 in incomingedgedict[node]:
                edge=(node2,node)
                if edge in exondict:
                    idx=exondict[edge]
                    row[idx]=-1*weq
                    inwt+=incomingedgedict[node][node2][0]
                elif edge in splicedict:
                    idx=len(exondict)+splicedict[edge]
                    row[idx]=-1*weq
                    inwt+=incomingedgedict[node][node2][0]
                else:
                    message='edge not found %d-%d'%(node,node2)
                    common.printstatus(message,'W',common.func_name(),1)
        A.append(row)
        B.append(0)
        # Only internal tx start and end need to be closer to the difference
        if txstflg==1:
            txwt=max(0.0,outwt-inwt)
        if txendflg==1:
            txwt=max(0.0,inwt-outwt)
        if txstflg==1 or txendflg==1:
            # weak (weight 5) equation tying the tx start/end variable to the observed imbalance
            row=[0]*numvar
            row[txidx]=5
            A.append(row)
            B.append(5*txwt)
    return((A,B,X))
def _checkfile(self, filename, filepath, function_name):
    """Return filepath when it is an existing file, otherwise report it as missing.

    filename: label used in the message.
    filepath: path that is checked with os.path.isfile.
    function_name: caller name forwarded to common.printstatus.

    On a missing file a status message of type 'F' is emitted and the
    function falls through, returning None.
    """
    if os.path.isfile(filepath):
        return filepath
    message = '%s does not exist' % filename
    common.printstatus(message, 'F', function_name)
def ACT2Corrected(self,gene,num_iterations=5):
    """
    Iteratively correct the weights of a gene's weighted graph structure.

    gene: key into self.wgsdict
    num_iterations: number of correction rounds; must be at least 1 because the
                    error term and variable list used for the return value are
                    only assigned inside the loop

    In round i the input graph and the current result are blended with
    addwgs(inwgs,outwgs,1-i/num_iterations), converted to a least-squares problem
    with self.wgs2problem and solved for non-negative weights.

    Returns (outwgs,error): the corrected 9-tuple and the first round's residual
    norm divided by the number of variables. When the solver fails, the result
    obtained so far is returned with an error of 100.0.

    Next steps:
    Some way to preserve flows at divergence nodes
    One way could be reallocate flows at all divergence nodes in the original ratio and fix it
    Iterate 10 times
    """
    inwgs=self.wgsdict[gene]
    outwgs=inwgs
    for iteri in range(num_iterations):
        # weight of the original graph shrinks from 1.0 towards 0 over the rounds
        component1=1.0-iteri*1.0/num_iterations
        wgs=addwgs(inwgs,outwgs,component1)
        A,B,X=self.wgs2problem(wgs)
        Xvar = cvx.variable(len(X),1)
        A=cvx.matrix(A)
        B=cvx.matrix(B)
        B=B.T
        p = cvx.program(cvx.minimize(cvx.norm2(A*Xvar-B)),[cvx.geq(Xvar,0.0)])
        try:
            p.solve(quiet=1)
        except Exception:
            # solver failures are reported and the best result so far is returned;
            # KeyboardInterrupt/SystemExit are deliberately not caught
            message='Could not solve for %s'%(gene)
            common.printstatus(message,'W',common.func_name(),1)
            return (outwgs,100.0)
        if iteri==0:
            # Get optimal value
            err=cvx.norm2(A*Xvar-B)
        Xval=Xvar.T.value.tolist()[0]
        X_corr= [a[:] for a in X]
        for i in range(len(Xval)):
            # truncate the solved weights to two decimals
            X_corr[i][3]=int(Xval[i]*100)/100.0
        exonlist=[[a[1],a[2]] for a in X_corr if a[0]==2]
        exonwtlist=[a[3] for a in X_corr if a[0]==2]
        splicelist=[[a[1],a[2]] for a in X_corr if a[0]==3]
        splicewtlist=[a[3] for a in X_corr if a[0]==3]
        # drop exon-type entries that coincide with a splice (highest index first so the
        # remaining indices stay valid)
        removelist=[i for i in range(len(exonlist)) if exonlist[i] in splicelist]
        removelist.reverse()
        for i in removelist:
            exonlist.pop(i)
            exonwtlist.pop(i)
        novelnodelist=wgs[5]
        exonwtlist1=[exonwtlist[i] for i in range(len(exonwtlist)) if exonlist[i] in wgs[0]]
        intronwtlist1=[exonwtlist[i] for i in range(len(exonwtlist)) if exonlist[i] in wgs[1]]
        outwgs=(wgs[0],wgs[1],splicelist,wgs[3],wgs[4],novelnodelist,exonwtlist1,intronwtlist1,splicewtlist)
    return (outwgs,err.value/len(X))
def __init__(self, config_file):
    """Keep the configuration file path if the file is present.

    config_file: path of the configuration file.

    A path that is not an existing regular file is reported through
    common.printstatus with type 'F'; self.config_file is then not set.
    """
    if not os.path.isfile(config_file):
        message='Config file %s does not exist'%config_file
        common.printstatus(message,'F',common.func_name())
        return
    self.config_file=config_file
def gensimulatedreadsdata(sample_id, replicate_id, folderdict, genetxreaddict,
                          genetxcigardict, config_object):
    """
    Simulate reads for one sample/replicate and write them as SAM and BAM files.

    input:
        sample_id, replicate_id: integers used to build the T%02dS%02d folder and file names
        folderdict: folderdict['data'][2] is the root of the per-sample alignment folders,
                    folderdict['metadata'] receives the expression_T..S...txt file
        genetxreaddict, genetxcigardict: per gene / transcript dictionaries; they are handed to
                    genetxreads2txgenereads and genetxdict2chrgenedict, and
                    genetxcigardict[gene][tx][2:4] is used as (start base, cigar string)
        config_object: supplies read_extract_parameters, txstartbias, cutpreferencefile,
                    reference_genome_dir, read_extract_method, readqualitydeteriorationrate,
                    indelrate, pathsamtools and chrfaifile
    output:
        1 (always). Side effects: writes the expression metadata file, alignments.sam and
        alignments_with_errors.sam, and converts them to BAM with "samtools view -bt"
        (the error BAM only when an error rate is above zero).

    All files are opened through "with" so that the handles (including the per-chromosome
    FASTA handle) are released even when an exception interrupts the simulation.
    """
    if debug_flg == 2:
        funcstarttime = time.time()
    fragmentsizerange = [
        int(config_object.read_extract_parameters[2]),
        int(config_object.read_extract_parameters[3])
    ]
    txstartbias = config_object.txstartbias
    with open(config_object.cutpreferencefile) as cutfile:
        cutscoredict = dict([(ln.split('\t')[0],
                              float(ln.rstrip('\n').split('\t')[1]))
                             for ln in cutfile])
    alignsam = '%s/T%02dS%02d/alignments.sam' % (folderdict['data'][2],
                                                 sample_id, replicate_id)
    alignbam = '%s/T%02dS%02d/alignments.bam' % (folderdict['data'][2],
                                                 sample_id, replicate_id)
    alignerrsam = '%s/T%02dS%02d/alignments_with_errors.sam' % (
        folderdict['data'][2], sample_id, replicate_id)
    alignerrbam = '%s/T%02dS%02d/alignments_with_errors.bam' % (
        folderdict['data'][2], sample_id, replicate_id)
    # reads with errors are only produced when at least one error rate is positive
    adderrors = max(config_object.readqualitydeteriorationrate,
                    config_object.indelrate) > 0.0
    # both SAM files are created up front, so an (empty) error SAM exists even without errors
    with open(alignsam, 'w') as alignfout, open(alignerrsam, 'w') as alignerrfout:
        txgenereaddict = genetxreads2txgenereads(genetxreaddict,
                                                 genetxcigardict)
        metadatafile = '%s/expression_T%02dS%02d.txt' % (
            folderdict['metadata'], sample_id, replicate_id)
        with open(metadatafile, 'w') as fout:
            for gene in txgenereaddict:
                for tx in txgenereaddict[gene]:
                    fout.write('%s:%s:%s\t%s\tTX\t%d\t%6.4f\t%d\t%d\t%s\n' %
                               tuple(txgenereaddict[gene][tx]))
        chrgenedict = genetxdict2chrgenedict(genetxcigardict)
        chrlist = sorted(chrgenedict.keys())
        if debug_flg == 2:
            tottxstrgettime = 0
            totfraglistgettime = 0
            totreadlistgettime = 0
            toterrorreadgettime = 0
        for chrname in chrlist:
            chrfafile = '%s/%s.fa' % (config_object.reference_genome_dir,
                                      chrname)
            if not (os.path.isfile(chrfafile)):
                message = '%s: not found' % chrfafile
                common.printstatus(message, 'S', common.func_name())
                continue
            with open(chrfafile) as chrfafileptr:
                for gene in chrgenedict[chrname]:
                    for tx in genetxcigardict[gene]:
                        if debug_flg == 2:
                            loopstarttime = time.time()
                        numreads = txgenereaddict[gene][tx][6]
                        startbase, cigarstr = genetxcigardict[gene][tx][
                            2], genetxcigardict[gene][tx][3]
                        txstr = getRNAtranscriptstring(chrfafileptr, startbase,
                                                       cigarstr)
                        if debug_flg == 2:
                            # NOTE(review): the per-transcript status line is treated as part
                            # of the timing-debug output -- confirm against the original layout
                            txstrgettime = time.time()
                            tottxstrgettime += txstrgettime - loopstarttime
                            message = 'Gene:%s, Transcript:%s; Transcript Size:%d, Num reads=%d' % (
                                gene, tx, len(txstr), numreads)
                            common.printstatus(message, 'S',
                                               common.func_name())
                        # transcripts shorter than the smallest fragment cannot yield reads
                        if len(txstr) < fragmentsizerange[0]:
                            continue
                        #empirical number of cuts
                        numcutmu = max(
                            1,
                            int(
                                len(txstr) * 4.0 /
                                (fragmentsizerange[0] + fragmentsizerange[1])))
                        numcutsig = numcutmu / 2.0
                        fragmentlist = getfragments(
                            txstr, numreads, txgenereaddict[gene][tx][2],
                            cutscoredict, fragmentsizerange, txstartbias,
                            numcutmu, numcutsig)
                        if debug_flg == 2:
                            fraglistgettime = time.time()
                            totfraglistgettime += fraglistgettime - txstrgettime
                        allparameterlist = [
                            txstr, cigarstr, fragmentlist,
                            config_object.read_extract_parameters
                        ]
                        readlist = runfunc(config_object.read_extract_method,
                                           allparameterlist)
                        if debug_flg == 2:
                            readlistgettime = time.time()
                            totreadlistgettime += readlistgettime - fraglistgettime
                        inum = 0
                        for read in readlist:
                            rstartbase, readstr, readcigar = read
                            # offset of the read start relative to startbase: sum of the cigar
                            # segment lengths covering the first rstartbase+1 transcript bases
                            readstartlocation = sum([
                                int(x) for x in common.cigarsubstr(
                                    cigarstr, 0, rstartbase + 1).replace(
                                        'M', 'N').split('N')[:-1]
                            ]) - 1
                            inum += 1
                            alignfout.write(
                                '%s:%d:%d\t1\t%s\t%d\t0\t%s\t*\t0\t0\t%s\t*\n' %
                                (tx, inum, rstartbase, chrname,
                                 startbase + readstartlocation, readcigar,
                                 readstr))
                            if adderrors:
                                errreadstr, errcigarstr = adderrortoread(
                                    readstr, readcigar,
                                    config_object.readqualitydeteriorationrate,
                                    config_object.indelrate)
                                alignerrfout.write(
                                    '%s:%d:%d\t1\t%s\t%d\t0\t%s\t*\t0\t0\t%s\t*\n'
                                    % (tx, inum, rstartbase, chrname,
                                       startbase + readstartlocation,
                                       errcigarstr, errreadstr))
                        if debug_flg == 2:
                            errorreadgettime = time.time()
                            toterrorreadgettime += errorreadgettime - readlistgettime
    if debug_flg == 2:
        functime = time.time() - funcstarttime
        othertime = functime - (tottxstrgettime + totfraglistgettime +
                                totreadlistgettime + toterrorreadgettime)
        message='\n\nFunc Time=%s,Txstr=%4.2f,Fragtime=%4.2f,Readtime=%4.2f,Errtime=%4.2f,Oth=%4.2f\n\n'% \
                (functime,tottxstrgettime*100/functime,totfraglistgettime*100/functime,
                 totreadlistgettime*100/functime,toterrorreadgettime*100/functime,othertime*100/functime)
        common.printstatus(message, 'S', common.func_name())
    cmd = '%s/samtools view -bt %s %s >%s' % (config_object.pathsamtools,
                                              config_object.chrfaifile,
                                              alignsam, alignbam)
    message = 'Running %s' % cmd
    common.printstatus(message, 'S', common.func_name())
    os.system(cmd)
    if adderrors:
        cmd = '%s/samtools view -bt %s %s >%s' % (config_object.pathsamtools,
                                                  config_object.chrfaifile,
                                                  alignerrsam, alignerrbam)
        message = 'Running %s' % cmd
        common.printstatus(message, 'S', common.func_name())
        os.system(cmd)
    return 1
def _checkdir(self,dirname,dirpath,function_name):
    """Hand back dirpath if it exists; otherwise report the missing directory.

    dirname: label used in the message.
    dirpath: path that is checked with os.path.exists.
    function_name: caller name forwarded to common.printstatus.

    A missing path produces a status message of type 'F' and the function
    falls through, returning None.
    """
    if os.path.exists(dirpath):
        return dirpath
    message='%s does not exist'%dirname
    common.printstatus(message,'F',function_name)
if options.proj_range != '': cfg.setprojrange(options.proj_range) if options.fdm_fast_pair != '': cfg.setfdmfastpair(options.fdm_fast_pair) if options.analysis_genefile != '': cfg.setfdmgenefile(options.analysis_genefile) if options.fdm_full_pair != '': cfg.setfdmfullpair(options.fdm_full_pair) if options.analysis_genefile != '': cfg.setfdmgenefile(options.analysis_genefile) message = 'Program Started' common.printstatus(message, 'S', common.func_name()) rootdir = cfg.root_dir gtffilename = cfg.annotation_file gtffile = gtffile.gtfFile(gtffilename, 0, cfg.annotation_file_type) gtffile.getgenetranscriptdict() geneannodivdict = gtffile.getgeneannodivdict() annochrjuncdict = gtffile.getchrjuncdict() islanddict = gtffile.getislanddict() genelist = islanddict.keys() chrlist = gtffile.getchrlist() islandlist = gtffile.getislandlist()