Exemple #1
0
def testparallelcompletion(run_name,projectdir):
    '''
    Mark run_name as finished and check whether its whole stage has finished.

    Touches <projectdir>/status/<run_name>.done, sleeps five seconds, then
    compares the number of entries in the status directory whose name starts
    with the two-character stage prefix of run_name against the number of jobs
    expected for that stage.  When both match, <projectdir>/status/done.<stage>
    is touched.  A status line is always printed through common.printstatus.

    Expected number of jobs:
      p0, p2, p5, p7, p9 : 1 (run_name must be exactly the stage name)
      p1, p3             : number of lines in list/alllist.txt
      p4                 : number of lines in list/chrlist.txt
      p6                 : number of lines in list/allpair.txt
      p8                 : lines in list/allpair.txt times entries in list/splitgene

    Any other run_name leaves the counters unset and fails with
    UnboundLocalError at the comparison.
    '''
    def countlines(filename):
        # Close the list file as soon as it is counted instead of leaking the handle.
        with open(filename) as listfile:
            return sum(1 for line in listfile)

    statusdir='%s/status'%projectdir
    listdir='%s/list'%projectdir
    stage=run_name[0:2]
    donefilename='%s/%s.done'%(statusdir,run_name)
    os.system('touch %s'%donefilename)
    if run_name in ['p0','p2','p5','p7','p9']:
        numprocs=1
        numdone=1
    # NOTE(review): presumably gives sibling jobs time to write their status files - confirm.
    time.sleep(5)
    # Stage prefix -> list file that holds one line per expected job.
    stagelists={'p1':'alllist.txt','p3':'alllist.txt','p4':'chrlist.txt',
                'p6':'allpair.txt','p8':'allpair.txt'}
    if stage in stagelists:
        numprocs=countlines('%s/%s'%(listdir,stagelists[stage]))
        if stage=='p8':
            # One job per (pair, split gene file) combination.
            numprocs*=len(os.listdir('%s/splitgene'%listdir))
        numdone=len([f for f in os.listdir(statusdir) if f.startswith(stage)])

    if numdone==numprocs:
        donefilename='%s/done.%s'%(statusdir,stage)
        os.system('touch %s'%donefilename)

    message='%s: %d of %d completed'%(stage,numdone,numprocs)
    common.printstatus(message,'S',common.func_name())
Exemple #2
0
 def _getgeneannogrsdict(self):
     '''
     Return the dictionary mapping each gene to its annotation graph structure.

     If self.geneannogrsdict is already populated it is returned directly.
     Otherwise the dictionary is unpickled from <self.dir>/geneannogrsdict.pck
     when that file exists, or built by calling
     self._genetranscriptdict2graphstruct for every key of
     self.getislanddict() and then pickled to that file.

     self._num_gagd_accessed counts the uncached calls; the result is stored
     on self only once that counter exceeds 2.
     '''
     if self.geneannogrsdict != {}:
         return self.geneannogrsdict
     self._num_gagd_accessed += 1
     geneannogrsdict_file = '%s/geneannogrsdict.pck' % self.dir
     if os.path.isfile(geneannogrsdict_file):
         # Close the handle explicitly rather than relying on garbage collection.
         with open(geneannogrsdict_file) as pckfile:
             geneannogrsdict = cPickle.load(pckfile)
     else:
         geneannogrsdict = {}
         islanddict = self.getislanddict()
         for lnum, gene in enumerate(islanddict.keys(), 1):
             # Progress message for the first gene and every 1000th after it.
             if lnum % 1000 == 1:
                 message = 'Processing Graph Struct construction at gene %d' % lnum
                 common.printstatus(message, 'S', common.func_name())
             geneannogrsdict[gene] = self._genetranscriptdict2graphstruct(gene)
         # The with-block guarantees the pickle is flushed and closed on return.
         with open(geneannogrsdict_file, 'w') as pckfile:
             cPickle.dump(geneannogrsdict, pckfile)
         message = 'Completed Processing Graph Struct construction'
         common.printstatus(message, 'S', common.func_name())
     if self._num_gagd_accessed > 2:
         self.geneannogrsdict = geneannogrsdict
     return geneannogrsdict
Exemple #3
0
 def getgeneannodivdict(self):
     '''
     Return the dictionary mapping each gene to its annotation divergence dict.

     If self.geneannodivdict is already populated it is returned directly.
     Otherwise the dictionary is unpickled from <self.dir>/geneannodivdict.pck
     when that file exists, or built for every key of self.getislanddict() by
     feeding self.getgene2annographstruct(gene) to common.graphstruct2divdict
     and then pickled to that file.

     self._num_gadd_accessed counts the uncached calls; the result is stored
     on self only once that counter exceeds 2.
     '''
     if self.geneannodivdict!={}:
         return self.geneannodivdict
     self._num_gadd_accessed+=1
     geneannodivdict_file='%s/geneannodivdict.pck'%self.dir
     if os.path.isfile(geneannodivdict_file):
         # Close the handle explicitly rather than relying on garbage collection.
         with open(geneannodivdict_file) as pckfile:
             geneannodivdict=cPickle.load(pckfile)
     else:
         geneannodivdict={}
         islanddict=self.getislanddict()
         for lnum,gene in enumerate(islanddict.keys(),1):
             # Progress message for the first gene and every 1000th after it.
             if lnum%1000==1:
                 message='Processing Divergence Struct construction at gene %d:%s'%(lnum,gene)
                 common.printstatus(message,'S',common.func_name())
             graphstruct=self.getgene2annographstruct(gene)
             geneannodivdict[gene]=common.graphstruct2divdict(graphstruct)
         # The with-block guarantees the pickle is flushed and closed on return.
         with open(geneannodivdict_file,'w') as pckfile:
             cPickle.dump(geneannodivdict,pckfile)
         message='Completed Processing Divergence Struct construction'
         common.printstatus(message,'S',common.func_name())
     if self._num_gadd_accessed>2:
         self.geneannodivdict=geneannodivdict
     return geneannodivdict
Exemple #4
0
 def getgeneannodivdict(self):
     '''
     Return the gene -> annotation divergence dictionary, loading or building it.

     A populated self.geneannodivdict is returned as is.  Otherwise the
     dictionary comes from <self.dir>/geneannodivdict.pck when that pickle
     exists; if not, it is computed for every key of self.getislanddict()
     (graph structure from self.getgene2annographstruct, converted with
     common.graphstruct2divdict) and written to that pickle.

     The result is kept on self only after self._num_gadd_accessed, which
     counts uncached calls, has exceeded 2.
     '''
     if self.geneannodivdict != {}:
         return self.geneannodivdict
     self._num_gadd_accessed += 1
     geneannodivdict_file = '%s/geneannodivdict.pck' % self.dir
     if os.path.isfile(geneannodivdict_file):
         # Explicit close instead of leaving the handle to the garbage collector.
         with open(geneannodivdict_file) as pckfile:
             geneannodivdict = cPickle.load(pckfile)
     else:
         geneannodivdict = {}
         islanddict = self.getislanddict()
         for lnum, gene in enumerate(islanddict.keys(), 1):
             # Report progress on gene 1, 1001, 2001, ...
             if lnum % 1000 == 1:
                 message = 'Processing Divergence Struct construction at gene %d:%s' % (
                     lnum, gene)
                 common.printstatus(message, 'S', common.func_name())
             graphstruct = self.getgene2annographstruct(gene)
             geneannodivdict[gene] = common.graphstruct2divdict(graphstruct)
         # Closing the file here makes sure the pickle is complete on disk.
         with open(geneannodivdict_file, 'w') as pckfile:
             cPickle.dump(geneannodivdict, pckfile)
         message = 'Completed Processing Divergence Struct construction'
         common.printstatus(message, 'S', common.func_name())
     if self._num_gadd_accessed > 2:
         self.geneannodivdict = geneannodivdict
     return geneannodivdict
Exemple #5
0
 def _checkdir(self, dirname, dirpath, function_name):
     '''
     Return dirpath when it exists on disk.

     Otherwise a message naming self.config_file and dirname is passed to
     common.printstatus with status 'F' on behalf of function_name, and
     nothing is returned.
     '''
     if os.path.exists(dirpath):
         return dirpath
     problem = ('Config file %s has errors: ' % self.config_file +
                '%s does not exist' % dirname)
     common.printstatus(problem, 'F', function_name)
Exemple #6
0
def createfoldersetup(outputdir,runid,numtypes,numdatasets,configfilename,biasfilename):
    '''
    Create the folder skeleton of a run and copy its configuration files.

    Under <outputdir>/<runid> the config, data and metadata folders are made
    with "mkdir -p", and configfilename and biasfilename are copied into
    config with "cp".  One data folder is made per dataset: T<ds>S01 when
    numtypes is 0, otherwise T<type>S<ds> for every type/dataset pair.  If the
    run folder already exists this is reported through common.printstatus
    with status 'F' before anything is created.

    Returns a dict with 'metadata' (the metadata folder path) and 'data'
    ([numtypes, numdatasets, data folder path]).
    '''
    runids=(outputdir,runid)
    if os.path.exists('%s/%s'%runids):
        common.printstatus('%s/%s already exists'%runids,'F',common.func_name())

    for mkdirtemplate in ('mkdir -p %s/%s',
                          'mkdir -p %s/%s/config',
                          'mkdir -p %s/%s/data',
                          'mkdir -p %s/%s/metadata'):
        os.system(mkdirtemplate%runids)
    for sourcefile in (configfilename,biasfilename):
        os.system('cp %s %s/%s/config'%(sourcefile,outputdir,runid))

    if numtypes==0:
        datafolders=['%s/%s/data/T%02dS01'%(outputdir,runid,ds)
                     for ds in range(1,numdatasets+1)]
    else:
        datafolders=['%s/%s/data/T%02dS%02d'%(outputdir,runid,typeno,ds)
                     for typeno in range(1,numtypes+1)
                     for ds in range(1,numdatasets+1)]
    for foldername in datafolders:
        os.system('mkdir -p %s'%foldername)

    return {'metadata':'%s/%s/metadata'%runids,
            'data':[numtypes,numdatasets,'%s/%s/data'%runids]}
Exemple #7
0
 def _checkfile(self, filename, filepath, function_name):
     '''
     Return filepath when it points to an existing regular file.

     Otherwise a message naming self.config_file and filename is passed to
     common.printstatus with status 'F' on behalf of function_name, and
     nothing is returned.
     '''
     if os.path.isfile(filepath):
         return filepath
     problem = ('Config file %s has errors: ' % self.config_file +
                '%s does not exist' % filename)
     common.printstatus(problem, 'F', function_name)
Exemple #8
0
 def _checkfile(self,filename,filepath,function_name):
     '''
     Validate that filepath is an existing file and hand it back.

     A missing file is reported through common.printstatus (status 'F',
     attributed to function_name) with a message that names self.config_file
     and filename; in that case no value is returned.
     '''
     found=os.path.isfile(filepath)
     if found:
         return filepath
     parts=['Config file %s has errors: '%self.config_file,
            '%s does not exist'%filename]
     common.printstatus(''.join(parts),'F',function_name)
Exemple #9
0
 def _checkdir(self,dirname,dirpath,function_name):
     '''
     Validate that dirpath exists and hand it back.

     A missing path is reported through common.printstatus (status 'F',
     attributed to function_name) with a message that names self.config_file
     and dirname; in that case no value is returned.
     '''
     found=os.path.exists(dirpath)
     if found:
         return dirpath
     parts=['Config file %s has errors: '%self.config_file,
            '%s does not exist'%dirname]
     common.printstatus(''.join(parts),'F',function_name)
Exemple #10
0
def testparallelcompletion(run_name, projectdir):
    '''
    Record completion of run_name and flag the stage once all its jobs are done.

    Steps: touch <projectdir>/status/<run_name>.done, sleep five seconds,
    count the status-directory entries starting with the two-character stage
    prefix, and compare that with the expected number of jobs.  On a match
    <projectdir>/status/done.<stage> is touched.  The progress is always
    printed through common.printstatus.

    Expected number of jobs:
      p0, p2, p5, p7, p9 : 1 (run_name must be exactly the stage name)
      p1, p3             : number of lines in list/alllist.txt
      p4                 : number of lines in list/chrlist.txt
      p6                 : number of lines in list/allpair.txt
      p8                 : lines in list/allpair.txt times entries in list/splitgene

    Any other run_name leaves the counters unset and fails with
    UnboundLocalError at the comparison.
    '''
    def countlines(filename):
        # Count inside a with-block so the handle is closed, not leaked.
        with open(filename) as listfile:
            return sum(1 for line in listfile)

    statusdir = '%s/status' % projectdir
    listdir = '%s/list' % projectdir
    stage = run_name[0:2]
    donefilename = '%s/%s.done' % (statusdir, run_name)
    os.system('touch %s' % donefilename)
    if run_name in ['p0', 'p2', 'p5', 'p7', 'p9']:
        numprocs = 1
        numdone = 1
    # NOTE(review): presumably lets sibling jobs write their status files first - confirm.
    time.sleep(5)
    # Stage prefix -> list file that holds one line per expected job.
    stagelists = {
        'p1': 'alllist.txt',
        'p3': 'alllist.txt',
        'p4': 'chrlist.txt',
        'p6': 'allpair.txt',
        'p8': 'allpair.txt',
    }
    if stage in stagelists:
        numprocs = countlines('%s/%s' % (listdir, stagelists[stage]))
        if stage == 'p8':
            # One job per (pair, split gene file) combination.
            numprocs *= len(os.listdir('%s/splitgene' % listdir))
        numdone = len(
            [f for f in os.listdir(statusdir) if f.startswith(stage)])

    if numdone == numprocs:
        donefilename = '%s/done.%s' % (statusdir, stage)
        os.system('touch %s' % donefilename)

    message = '%s: %d of %d completed' % (stage, numdone, numprocs)
    common.printstatus(message, 'S', common.func_name())
Exemple #11
0
 def _getgeneannogrsdict(self):
     '''
     Return the gene -> annotation graph structure dictionary, loading or building it.

     A populated self.geneannogrsdict is returned as is.  Otherwise the
     dictionary comes from <self.dir>/geneannogrsdict.pck when that pickle
     exists; if not, it is computed with self._genetranscriptdict2graphstruct
     for every key of self.getislanddict() and written to that pickle.

     The result is kept on self only after self._num_gagd_accessed, which
     counts uncached calls, has exceeded 2.
     '''
     if self.geneannogrsdict!={}:
         return self.geneannogrsdict
     self._num_gagd_accessed+=1
     geneannogrsdict_file='%s/geneannogrsdict.pck'%self.dir
     if os.path.isfile(geneannogrsdict_file):
         # Explicit close instead of leaving the handle to the garbage collector.
         with open(geneannogrsdict_file) as pckfile:
             geneannogrsdict=cPickle.load(pckfile)
     else:
         geneannogrsdict={}
         islanddict=self.getislanddict()
         for lnum,gene in enumerate(islanddict.keys(),1):
             # Report progress on gene 1, 1001, 2001, ...
             if lnum%1000==1:
                 message='Processing Graph Struct construction at gene %d'%lnum
                 common.printstatus(message,'S',common.func_name())
             geneannogrsdict[gene]=self._genetranscriptdict2graphstruct(gene)
         # Closing the file here makes sure the pickle is complete on disk.
         with open(geneannogrsdict_file,'w') as pckfile:
             cPickle.dump(geneannogrsdict,pckfile)
         message='Completed Processing Graph Struct construction'
         common.printstatus(message,'S',common.func_name())
     if self._num_gagd_accessed>2:
         self.geneannogrsdict=geneannogrsdict
     return geneannogrsdict
Exemple #12
0
def wgs2sparsegraphdict(wgs):
    '''
    Convert a weighted graph structure into a sparse adjacency dict and a node table.

    wgs is a 9-item sequence: exonlist, intronlist, splicelist, startnodelist,
    endnodelist, novelnodelist, exonwtlist, intronwtlist, splicewtlist.

    Returns (sparsegraphdict, nodedict) where
      nodedict[nodeid] = [coordinate, flag]; flag starts at 0, becomes 1 when
        the coordinate is not in novelnodelist, 2 when it or a neighbouring
        coordinate (+/-1) is in startnodelist and 3 likewise for endnodelist
        (the last matching assignment wins);
      sparsegraphdict[node1idx][node2idx] = [weight, edgetype], edgetype 1 for
        exon/intron edges and 2 for splice edges.

    Side effect: endpoints of exon/intron edges may be shifted by one in
    place, so the edge lists inside the caller's wgs are modified.

    Assumes retained intron edge not = splice edge ever
    '''
    exonlist,intronlist,splicelist,startnodelist,endnodelist,novelnodelist,exonwtlist,intronwtlist,splicewtlist=wgs
    # Splice endpoints define the initial node set.
    nodelist=[]
    for edge in splicelist:
        if edge[0] not in nodelist:
            nodelist.append(edge[0])
        if edge[1] not in nodelist:
            nodelist.append(edge[1])
    # Exon and intron edges are handled together from here on.
    exoniclist=exonlist+intronlist
    exonicwtlist=exonwtlist+intronwtlist
    # The .sort() call needs zip() to return a list (Python 2 behaviour).
    exonictuplist=zip(exoniclist,exonicwtlist)
    exonictuplist.sort()
    for exonictup in exonictuplist:
        # An endpoint that sits one position next to a known node is snapped
        # onto that node; otherwise it becomes a node of its own.  The snap
        # writes into the edge object itself, which must therefore be mutable.
        if exonictup[0][0] not in nodelist:
            if exonictup[0][0]-1 not in nodelist:
                nodelist.append(exonictup[0][0])
            else:
                exonictup[0][0]-=1
        if exonictup[0][1] not in nodelist:
            if exonictup[0][1]+1 not in nodelist:
                nodelist.append(exonictup[0][1])
            else:
                exonictup[0][1]+=1     
    nodelist.sort()
    # Node id = position of the coordinate in the sorted node list.
    nodedict=dict([(id,[val,0]) for (id,val) in enumerate(nodelist)])
    for nodeid in nodedict:
        if nodedict[nodeid][0] not in novelnodelist:
            nodedict[nodeid][1]=1
        if (nodedict[nodeid][0] in startnodelist) or (nodedict[nodeid][0]-1 in startnodelist) or (nodedict[nodeid][0]+1 in startnodelist):
            nodedict[nodeid][1]=2
        if (nodedict[nodeid][0] in endnodelist) or (nodedict[nodeid][0]-1 in endnodelist) or (nodedict[nodeid][0]+1 in endnodelist):
            nodedict[nodeid][1]=3          
    sparsegraphdict={}
    # Exon/intron edges first, tagged with edge type 1.
    for exonic,wt in exonictuplist:
        node1=exonic[0]; node2=exonic[1]
        node1idx=nodelist.index(node1)
        node2idx=nodelist.index(node2)
        if  node1idx not in sparsegraphdict:
            sparsegraphdict[node1idx]={}
        sparsegraphdict[node1idx][node2idx]=[wt,1]
    # Splice edges are tagged with edge type 2.
    for splice,wt in zip(splicelist,splicewtlist):
        node1=splice[0]; node2=splice[1]
        node1idx=nodelist.index(node1)
        node2idx=nodelist.index(node2)
        if  node1idx not in sparsegraphdict:
            sparsegraphdict[node1idx]={}
            message='new splice node added %d'%(node1)
            common.printstatus(message,'W',common.func_name(),1)
        if node2idx not in sparsegraphdict[node1idx]:
            sparsegraphdict[node1idx][node2idx]=[wt,2]
        else:
            # An edge between these nodes already exists: keep its type and
            # add the splice weight to it.
            message='Two edges between two nodes %d(%d)-%d(%d)'%(node1,node1idx,node2,node2idx)
            common.printstatus(message,'W',common.func_name(),1)
            sparsegraphdict[node1idx][node2idx][0]+=wt        
    return (sparsegraphdict,nodedict)
Exemple #13
0
 def Toactfilelines(self,island):
     '''
     Render the graph held in self.tuple as tab-separated activity-file lines.

     island supplies the chromosome (index 0), the gene name (index 3) and the
     strand (index 4; '+' is written as 1, anything else as 0).

     The result lists one "node" line per node in coordinate order, followed
     by one line per edge in sorted order.  Nodes are the splice endpoints,
     exon/intron endpoints that are neither known nor directly adjacent to a
     known node, and all start and end nodes.  Node lines are labelled annos
     (start), annoe (end), novel or annot, in that order of precedence.  Edge
     lines are exon, splice (novel when either endpoint is a novel node) or
     retint; an edge found in none of the three lists is reported through
     common.printstatus and omitted.
     '''
     chrom='%s'%island[0]
     genename=island[3]
     strandflag=1 if island[4]=='+' else 0
     exons,introns,splices,starts,ends,novels,exonwts,intronwts,splicewts=self.tuple

     # Collect node coordinates.
     nodes=[]
     for edge in splices:
         for endpoint in (edge[0],edge[1]):
             if endpoint not in nodes:
                 nodes.append(endpoint)
     for edge in exons+introns:
         # The left end is compared with its left neighbour, the right end
         # with its right neighbour.
         for endpoint,neighbour in ((edge[0],edge[0]-1),(edge[1],edge[1]+1)):
             if endpoint not in nodes and neighbour not in nodes:
                 nodes.append(endpoint)
     for terminals in (starts,ends):
         for node in terminals:
             if node not in nodes:
                 nodes.append(node)
     nodes.sort()

     lines=[]
     for node in nodes:
         if node in starts:
             nodefmt='%s\tannos\tnode\t%d\t%d\t0.00\t%d\t.\t%s'
         elif node in ends:
             nodefmt='%s\tannoe\tnode\t%d\t%d\t0.00\t%d\t.\t%s'
         elif node in novels:
             nodefmt='%s\tnovel\tnode\t%d\t%d\t0.00\t%d\t.\t%s'
         else:
             nodefmt='%s\tannot\tnode\t%d\t%d\t0.00\t%d\t.\t%s'
         lines.append(nodefmt%(chrom,node,node,strandflag,genename))

     for edge in sorted(exons[0:]+introns[0:]+splices[0:]):
         if edge in exons:
             weight=exonwts[exons.index(edge)]
             edgefmt='%s\tannot\texon\t%d\t%d\t%8.2f\t%d\t.\t%s'
         elif edge in splices:
             weight=splicewts[splices.index(edge)]
             if edge[0] in novels or edge[1] in novels:
                 edgefmt='%s\tnovel\tsplice\t%d\t%d\t%8.2f\t%d\t.\t%s'
             else:
                 edgefmt='%s\tannot\tsplice\t%d\t%d\t%8.2f\t%d\t.\t%s'
         elif edge in introns:
             weight=intronwts[introns.index(edge)]
             edgefmt='%s\tnovel\tretint\t%d\t%d\t%8.2f\t%d\t.\t%s'
         else:
             message='Orphan edge: %s'%str(edge)
             common.printstatus(message,'W',common.func_name())
             continue
         lines.append(edgefmt%(chrom,edge[0],edge[1],weight,strandflag,genename))
     return lines
Exemple #14
0
 def _checknum(self, x, type, function_name):
     '''
     Convert x to the numeric type named by type ('int' or 'float').

     Returns the converted value.  When the conversion fails, the problem is
     passed to common.printstatus with status 'F' on behalf of function_name
     and nothing is returned.  Any other value of type returns None without
     reporting.
     '''
     if type == 'int':
         try:
             return int(x)
         # Only conversion failures are reported; a bare except would also
         # swallow KeyboardInterrupt and SystemExit.
         except (TypeError, ValueError, OverflowError):
             # (x,) keeps the formatting valid even when x is a tuple.
             message = '%s is not integer' % (x,)
             common.printstatus(message, 'F', function_name)
     if type == 'float':
         try:
             return float(x)
         except (TypeError, ValueError, OverflowError):
             message = '%s is not float' % (x,)
             common.printstatus(message, 'F', function_name)
Exemple #15
0
 def _processlist(self,tsliststr):
     '''
     Expand a command-line dataset specification into a list of dataset names.

     Two forms are accepted:
       "a,b,c"  - explicit names; each one missing from self.data_dict is
                  reported through common.printstatus with status 'F'.
       "a->b"   - every key of self.data_dict that compares between a and b
                  (both inclusive, string comparison), in sorted order.
     Anything containing more than one "->" is reported the same way and
     yields an empty list.
     '''
     parts=tsliststr.split('->')
     if len(parts)==1:
         tslist=tsliststr.split(',')
         for ts in tslist:
             if ts not in self.data_dict:
                 message='Command Line has error(s): %s does not exist'%ts
                 common.printstatus(message,'F',common.func_name())
     elif len(parts)==2:
         first,last=parts
         # Select from the known datasets; the previous loop iterated the
         # still-empty result list and therefore never selected anything.
         tslist=[ts for ts in sorted(self.data_dict.keys()) if first<=ts<=last]
     else:
         # Previously this case reached the return with tslist unbound.
         message='Command Line has error(s): %s is not a valid list or range'%tsliststr
         common.printstatus(message,'F',common.func_name())
         tslist=[]
     return tslist
Exemple #16
0
 def _checknum(self,x,type,function_name):
     '''
     Convert x to an int or a float, depending on type ('int' or 'float').

     The converted value is returned.  A failed conversion is passed to
     common.printstatus with status 'F' on behalf of function_name and no
     value is returned.  Any other value of type returns None silently.
     '''
     if type=='int':
         try:
             return int(x)
         # Catch conversion failures only; a bare except would also swallow
         # KeyboardInterrupt and SystemExit.
         except (TypeError,ValueError,OverflowError):
             # (x,) keeps the formatting valid even when x is a tuple.
             message='%s is not integer'%(x,)
             common.printstatus(message,'F',function_name)
     if type=='float':
         try:
             return float(x)
         except (TypeError,ValueError,OverflowError):
             message='%s is not float'%(x,)
             common.printstatus(message,'F',function_name)
Exemple #17
0
 def _processlist(self, tsliststr):
     '''
     Turn a dataset specification from the command line into a list of names.

     "a,b,c" is split into its names, and every name missing from
     self.data_dict is reported through common.printstatus with status 'F'.
     "a->b" selects, in sorted order, every key of self.data_dict that
     compares between a and b inclusively (string comparison).  A string with
     more than one "->" is reported the same way and yields an empty list.
     '''
     parts = tsliststr.split('->')
     if len(parts) == 1:
         tslist = tsliststr.split(',')
         for ts in tslist:
             if ts not in self.data_dict:
                 message = 'Command Line has error(s): %s does not exist' % ts
                 common.printstatus(message, 'F', common.func_name())
     elif len(parts) == 2:
         first, last = parts
         # Filter the known datasets; the earlier code looped over the empty
         # result list, so a range never produced any dataset.
         tslist = [
             ts for ts in sorted(self.data_dict.keys()) if first <= ts <= last
         ]
     else:
         # Without this branch the return below hit an unbound tslist.
         message = ('Command Line has error(s): %s is not a valid list or range'
                    % tsliststr)
         common.printstatus(message, 'F', common.func_name())
         tslist = []
     return tslist
Exemple #18
0
def addwgs(wgs1,wgs2,component1):
    '''
    Blend the weights of two weighted graph structures with the same topology.

    wgs1 and wgs2 are 9-item sequences (exon, intron and splice edge lists,
    start, end and novel node lists, then exon, intron and splice weight
    lists).  component1 is the share given to wgs1; wgs2 receives
    1-component1.

    Returns wgs1 itself when component1 is 1.0.  Otherwise, when the six
    structural lists of both inputs are equal, returns a new 9-item list that
    reuses the structure of wgs1 and carries the blended weights.  When the
    structures differ, a warning is sent through common.printstatus and None
    is returned.
    '''
    if component1==1.0:
        return wgs1
    exons_a,introns_a,splices_a,starts_a,ends_a,novels_a,exonwts_a,intronwts_a,splicewts_a=wgs1
    exons_b,introns_b,splices_b,starts_b,ends_b,novels_b,exonwts_b,intronwts_b,splicewts_b=wgs2
    matches=(exons_a==exons_b,introns_a==introns_b,splices_a==splices_b,
             starts_a==starts_b,ends_a==ends_b,novels_a==novels_b)
    if not all(matches):
        message='Two wgs are different \n%s \n%s\n %d,%d,%d,%d,%d,%d'%((str(wgs1),str(wgs2))+matches)
        common.printstatus(message,'W',common.func_name(),1)
        return None

    def blend(weights_a,weights_b):
        return [component1*wt_a+(1-component1)*wt_b for wt_a,wt_b in zip(weights_a,weights_b)]

    return [exons_a,introns_a,splices_a,starts_a,ends_a,novels_a,
            blend(exonwts_a,exonwts_b),
            blend(intronwts_a,intronwts_b),
            blend(splicewts_a,splicewts_b)]
Exemple #19
0
 def _checknum(self,x,type,function_name):
     '''
     Convert a config value x to the numeric type named by type.

     type is 'int' or 'float'.  The converted value is returned.  A failed
     conversion is passed to common.printstatus with status 'F' on behalf of
     function_name, with a message that names self.config_file, and no value
     is returned.  Any other value of type returns None silently.
     '''
     if type=='int':
         try:
             return int(x)
         # Catch conversion failures only; a bare except would also swallow
         # KeyboardInterrupt and SystemExit.
         except (TypeError,ValueError,OverflowError):
             message='Config file %s has errors: '%self.config_file
             # (x,) keeps the formatting valid even when x is a tuple.
             message+='%s is not integer'%(x,)
             common.printstatus(message,'F',function_name)
     if type=='float':
         try:
             return float(x)
         except (TypeError,ValueError,OverflowError):
             message='Config file %s has errors: '%self.config_file
             message+='%s is not float'%(x,)
             common.printstatus(message,'F',function_name)
Exemple #20
0
    def _fixgenetranscriptdict(self):
        '''
        Restrict every gene of the pickled transcript table to one locus.

        Loads <self.dir>/genetranscriptdict.pck and, per gene:
          * drops transcripts whose (chromosome, strand) differs from that of
            the alphabetically first transcript;
          * orders the rest by [first exon start, last exon end] and keeps
            only the leading run of overlapping transcripts.
        A warning is sent through common.printstatus for every dropped
        location or region.  The original pickle is moved to "<file>.bk", the
        reduced table is pickled in its place and returned.
        '''
        genetranscriptdict_file = '%s/genetranscriptdict.pck' % self.dir
        # Close the handle explicitly rather than relying on garbage collection.
        with open(genetranscriptdict_file) as pckfile:
            genetranscriptdict = cPickle.load(pckfile)
        newgenetranscriptdict = {}
        for gene in genetranscriptdict:
            trascriptdict = genetranscriptdict[gene]
            transcriptkeys = sorted(trascriptdict.keys())
            # The first transcript decides which chromosome/strand is kept.
            chrstrand = trascriptdict[transcriptkeys[0]][0:2]
            keptkeys = [transcriptkeys[0]]
            for key in transcriptkeys[1:]:
                if chrstrand != trascriptdict[key][0:2]:
                    message = 'Gene %s in multiple chromosomes; processing only one location:%s' % (
                        gene, str(chrstrand))
                    common.printstatus(message, 'W', common.func_name())
                else:
                    keptkeys.append(key)
            # (transcribed range, transcript) pairs in genomic order.
            ztxrangekey = sorted(
                ([trascriptdict[key][2][0][0], trascriptdict[key][2][-1][1]],
                 key) for key in keptkeys)
            trange = ztxrangekey[0][0]
            for i in range(1, len(ztxrangekey)):
                txrange = ztxrangekey[i][0]
                if txrange[0] <= trange[1]:
                    # Overlap: extend the merged region.
                    trange = [trange[0], max(trange[1], txrange[1])]
                else:
                    message = 'Gene %s: multiple non-overlapping transcription regions; processing only one region:%s' % (
                        gene, str(trange))
                    common.printstatus(message, 'W', common.func_name())
                    ztxrangekey = ztxrangekey[0:i]
                    break
            newgenetranscriptdict[gene] = dict(
                (transcript, trascriptdict[transcript])
                for txrange, transcript in ztxrangekey)

        cmd = 'mv %s %s.bk' % (genetranscriptdict_file,
                               genetranscriptdict_file)
        os.system(cmd)
        # The with-block guarantees the new pickle is flushed and closed.
        with open(genetranscriptdict_file, 'w') as pckfile:
            cPickle.dump(newgenetranscriptdict, pckfile)
        return newgenetranscriptdict
Exemple #21
0
 def _checknum(self, x, type, function_name):
     '''
     Convert a config value x to an int or a float.

     type selects the conversion ('int' or 'float') and the converted value is
     returned.  When the conversion fails, a message naming self.config_file
     is passed to common.printstatus with status 'F' on behalf of
     function_name and nothing is returned.  Any other value of type returns
     None without reporting.
     '''
     if type == 'int':
         try:
             return int(x)
         # Only conversion failures are reported; a bare except would also
         # swallow KeyboardInterrupt and SystemExit.
         except (TypeError, ValueError, OverflowError):
             message = 'Config file %s has errors: ' % self.config_file
             # (x,) keeps the formatting valid even when x is a tuple.
             message += '%s is not integer' % (x,)
             common.printstatus(message, 'F', function_name)
     if type == 'float':
         try:
             return float(x)
         except (TypeError, ValueError, OverflowError):
             message = 'Config file %s has errors: ' % self.config_file
             message += '%s is not float' % (x,)
             common.printstatus(message, 'F', function_name)
Exemple #22
0
    def _gtf2genetranscriptdict(self):
        '''
        Build, persist and return the gene -> transcript -> exon table of the GTF file.

        Reads self.name line by line and keeps only 'exon' records.  The gene
        key is the gene_name attribute when self.type is 'E' and gene_id when
        it is 'F'; records without that attribute are skipped.  Every
        transcript maps to (chromosome, strand, exons), with strand 1 for '+'
        and 0 otherwise and exons a sorted list of (start, end) tuples.

        The table is pickled to <self.dir>/genetranscriptdict.pck and then
        reduced by self._fixgenetranscriptdict(), whose result is returned.
        '''
        if self.type == 'E':
            geneidx = 'gene_name'
        elif self.type == 'F':
            geneidx = 'gene_id'
        # NOTE(review): any other self.type leaves geneidx unbound and fails
        # on the first exon record - confirm whether other types can occur.
        genetranscriptdict = {}
        # The with-block closes the GTF file once it has been read.
        with open(self.name) as gtffile:
            for lnum, linetxt in enumerate(gtffile, 1):
                if lnum % 200000 == 1:
                    message = 'Processing GTF file for transcripts at line %d' % lnum
                    common.printstatus(message, 'S', common.func_name())
                line = linetxt.rstrip('\n').split('\t')
                # Header/comment lines have fewer than nine columns; skip them
                # instead of failing on the column lookups below.
                if len(line) < 9 or line[2] != 'exon':
                    continue
                attrdict = self._attrtxt2attrdict(line[8])
                if geneidx not in attrdict:
                    continue
                gene = attrdict[geneidx]
                transcript = attrdict['transcript_id']
                chrnm = line[0]
                if line[6] == '+':
                    strand = 1
                else:
                    strand = 0
                # Dictionary membership avoids scanning a key list per line.
                transcriptdict = genetranscriptdict.setdefault(gene, {})
                if transcript not in transcriptdict:
                    transcriptdict[transcript] = (chrnm, strand, [])
                transcriptdict[transcript][2].append(
                    (int(line[3]), int(line[4])))
        for transcriptdict in genetranscriptdict.values():
            for record in transcriptdict.values():
                record[2].sort()
        message = 'Completed processing GTF file for transcripts'
        common.printstatus(message, 'S', common.func_name())

        genetranscriptdict_file = '%s/genetranscriptdict.pck' % self.dir
        # Close the pickle before _fixgenetranscriptdict reads it back.
        with open(genetranscriptdict_file, 'w') as pckfile:
            cPickle.dump(genetranscriptdict, pckfile)
        genetranscriptdict = self._fixgenetranscriptdict()
        return genetranscriptdict
Exemple #23
0
 def _gtf2genetranscriptdict(self):
     '''
     Parse the exon records of the GTF file into a gene/transcript table.

     Only 'exon' lines of self.name are used.  Genes are keyed by the
     gene_name attribute when self.type is 'E' and by gene_id when it is 'F';
     lines without that attribute are skipped.  Each transcript is stored as
     (chromosome, strand, exons): strand is 1 for '+' and 0 otherwise, exons
     is a sorted list of (start, end) tuples.

     The table is written to <self.dir>/genetranscriptdict.pck, after which
     self._fixgenetranscriptdict() is called and its result returned.
     '''
     if self.type=='E':
         geneidx='gene_name'
     elif self.type=='F':
         geneidx='gene_id'
     # NOTE(review): any other self.type leaves geneidx unbound and fails on
     # the first exon record - confirm whether other types can occur.
     genetranscriptdict={}
     # The with-block closes the GTF file once it has been read.
     with open(self.name) as gtffile:
         for lnum,linetxt in enumerate(gtffile,1):
             if lnum%200000==1:
                 message='Processing GTF file for transcripts at line %d'%lnum
                 common.printstatus(message,'S',common.func_name())
             line=linetxt.rstrip('\n').split('\t')
             # Header/comment lines have fewer than nine columns; skip them
             # instead of failing on the column lookups below.
             if len(line)<9 or line[2]!='exon':
                 continue
             attrdict=self._attrtxt2attrdict(line[8])
             if geneidx not in attrdict:
                 continue
             gene=attrdict[geneidx]
             transcript=attrdict['transcript_id']
             chrnm=line[0]
             if line[6]=='+':
                 strand=1
             else:
                 strand=0
             # Dictionary membership avoids scanning a key list per line.
             transcriptdict=genetranscriptdict.setdefault(gene,{})
             if transcript not in transcriptdict:
                 transcriptdict[transcript]=(chrnm,strand,[])
             transcriptdict[transcript][2].append((int(line[3]),int(line[4])))
     for transcriptdict in genetranscriptdict.values():
         for record in transcriptdict.values():
             record[2].sort()
     message='Completed processing GTF file for transcripts'
     common.printstatus(message,'S',common.func_name())

     genetranscriptdict_file='%s/genetranscriptdict.pck'%self.dir
     # Close the pickle before _fixgenetranscriptdict reads it back.
     with open(genetranscriptdict_file,'w') as pckfile:
         cPickle.dump(genetranscriptdict,pckfile)
     genetranscriptdict=self._fixgenetranscriptdict()
     return genetranscriptdict
Exemple #24
0
 def setfdmfullpair(self,fdm_full_pair):
     '''
     Configure the run for a single full FDM comparison.

     fdm_full_pair has the form "a+b::c+d": two groups separated by "::",
     each a "+"-joined list of dataset names.  Unless the first name is
     'BLANK', every name missing from self.data_dict is reported through
     common.printstatus with status 'F' and self.ffull_run_list is set to
     [[first group, second group]].

     In all cases the run flags are reset so that only run_fdm_full_flag is 1.
     '''
     names=fdm_full_pair.replace('::',',').replace('+',',').split(',')
     if names[0]!='BLANK':
         missing=[name for name in names if name not in self.data_dict.keys()]
         for name in missing:
             message='Command Line has error(s): %s does not exist'%name
             common.printstatus(message,'F',common.func_name())
         groups=fdm_full_pair.split('::')
         self.ffull_run_list=[[groups[0].split('+'),groups[1].split('+')]]
     # Flags in their original assignment order.
     flagvalues=(('run_all_flag',0),
                 ('run_pre_act_flag',0),
                 ('run_splice_collate_flag',0),
                 ('run_proj_act_flag',0),
                 ('run_extract_flows',0),
                 ('run_cluster_flag',0),
                 ('run_fdm_fast_flag',0),
                 ('run_fdm_full_flag',1),
                 ('run_cluster_flag',0),
                 ('run_report_flag',0))
     for flagname,flagvalue in flagvalues:
         setattr(self,flagname,flagvalue)
Exemple #25
0
 def _gtf2islandlist(self):
     '''
     Return the sorted list of gene islands found in the GTF file.

     Every 'gene' record of self.name yields an island
     [chromosome, start, end, gene_id, strand] with strand 1 for '+' and 0
     otherwise.  When a gene_id occurs again, the stored island is replaced
     by self._intervalintersect(new, stored); if that returns 0 the gene is
     rejected and all its later records are ignored.

     For repeated genes, if regions do not match ignore
     '''
     islanddict = {}
     # A set gives constant-time rejection lookups.
     rejected = set()
     # The with-block closes the GTF file once it has been read.
     with open(self.name) as gtffile:
         for linetxt in gtffile:
             line = linetxt.rstrip('\n').split('\t')
             # Header/comment lines have fewer than nine columns; skip them
             # instead of failing on the column lookups below.
             if len(line) < 9 or line[2] != 'gene':
                 continue
             attrdict = self._attrtxt2attrdict(line[8])
             gene = attrdict['gene_id']
             if gene in rejected:
                 continue
             if line[6] == '+':
                 strand = 1
             else:
                 strand = 0
             island = [line[0], int(line[3]), int(line[4]), gene, strand]
             if gene not in islanddict:
                 islanddict[gene] = island
                 continue
             intv = self._intervalintersect(island, islanddict[gene])
             if intv == 0:
                 rejected.add(gene)
             else:
                 islanddict[gene] = intv
     message = 'Number of repeated genes rejected: %d' % len(rejected)
     common.printstatus(message, 'W', common.func_name())
     # This count is taken before the rejected genes are removed.
     message = 'Number of genes loaded: %d' % (len(islanddict.keys()))
     common.printstatus(message, 'S', common.func_name())
     for gene in rejected:
         del islanddict[gene]
     return sorted(islanddict.values())
Exemple #26
0
 def _gtf2islandlist(self):
     '''
     Collect one island per gene from the GTF file and return them sorted.

     An island is [chromosome, start, end, gene_id, strand] (strand 1 for '+',
     0 otherwise) taken from a 'gene' record of self.name.  A repeated gene_id
     replaces its stored island with self._intervalintersect(new, stored);
     when that returns 0 the gene is rejected and its remaining records are
     ignored.

     For repeated genes, if regions do not match ignore
     '''
     islanddict={}
     # A set gives constant-time rejection lookups.
     rejected=set()
     # The with-block closes the GTF file once it has been read.
     with open(self.name) as gtffile:
         for linetxt in gtffile:
             line=linetxt.rstrip('\n').split('\t')
             # Header/comment lines have fewer than nine columns; skip them
             # instead of failing on the column lookups below.
             if len(line)<9 or line[2]!='gene':
                 continue
             attrdict=self._attrtxt2attrdict(line[8])
             gene=attrdict['gene_id']
             if gene in rejected:
                 continue
             if line[6]=='+':
                 strand=1
             else:
                 strand=0
             island=[line[0],int(line[3]),int(line[4]),gene,strand]
             if gene not in islanddict:
                 islanddict[gene]=island
                 continue
             intv=self._intervalintersect(island,islanddict[gene])
             if intv==0:
                 rejected.add(gene)
             else:
                 islanddict[gene]=intv
     message='Number of repeated genes rejected: %d'%len(rejected)
     common.printstatus(message,'W',common.func_name())
     # This count is taken before the rejected genes are removed.
     message='Number of genes loaded: %d'%(len(islanddict.keys()))
     common.printstatus(message,'S',common.func_name())
     for gene in rejected:
         del islanddict[gene]
     return sorted(islanddict.values())
Exemple #27
0
 def setfdmfullpair(self, fdm_full_pair):
     '''
     Select the full FDM step for one pair of dataset groups.

     fdm_full_pair looks like "a+b::c+d": "::" separates the two groups and
     "+" the dataset names inside a group.  When the first name is not
     'BLANK', names absent from self.data_dict are reported through
     common.printstatus with status 'F' and self.ffull_run_list becomes
     [[first group, second group]].

     Afterwards every run flag is 0 except run_fdm_full_flag, which is 1.
     '''
     names = fdm_full_pair.replace('::', ',').replace('+', ',').split(',')
     if names[0] != 'BLANK':
         for name in names:
             if name in self.data_dict.keys():
                 continue
             message = 'Command Line has error(s): %s does not exist' % name
             common.printstatus(message, 'F', common.func_name())
         groups = fdm_full_pair.split('::')
         first_group = groups[0].split('+')
         second_group = groups[1].split('+')
         self.ffull_run_list = [[first_group, second_group]]
     # Flags in their original assignment order.
     for flagname, flagvalue in (
             ('run_all_flag', 0),
             ('run_pre_act_flag', 0),
             ('run_splice_collate_flag', 0),
             ('run_proj_act_flag', 0),
             ('run_extract_flows', 0),
             ('run_cluster_flag', 0),
             ('run_fdm_fast_flag', 0),
             ('run_fdm_full_flag', 1),
             ('run_cluster_flag', 0),
             ('run_report_flag', 0),
     ):
         setattr(self, flagname, flagvalue)
Exemple #28
0
 def _fixgenetranscriptdict(self):
     '''
     Reduce every gene of the pickled transcript table to a single locus.

     The table is read from <self.dir>/genetranscriptdict.pck.  Per gene,
     transcripts whose (chromosome, strand) differs from that of the
     alphabetically first transcript are dropped; the rest are ordered by
     [first exon start, last exon end] and only the leading run of
     overlapping transcripts is kept.  Each dropped location or region
     produces a warning through common.printstatus.

     The original pickle is moved to "<file>.bk", the reduced table is
     pickled in its place and returned.
     '''
     genetranscriptdict_file='%s/genetranscriptdict.pck'%self.dir
     # Close the handle explicitly rather than relying on garbage collection.
     with open(genetranscriptdict_file) as pckfile:
         genetranscriptdict=cPickle.load(pckfile)
     newgenetranscriptdict={}
     for gene in genetranscriptdict:
         trascriptdict=genetranscriptdict[gene]
         transcriptkeys=sorted(trascriptdict.keys())
         # The first transcript decides which chromosome/strand is kept.
         chrstrand=trascriptdict[transcriptkeys[0]][0:2]
         keptkeys=[transcriptkeys[0]]
         for key in transcriptkeys[1:]:
             if chrstrand!=trascriptdict[key][0:2]:
                 message='Gene %s in multiple chromosomes; processing only one location:%s'%(gene,str(chrstrand))
                 common.printstatus(message,'W',common.func_name())
             else:
                 keptkeys.append(key)
         # (transcribed range, transcript) pairs in genomic order.
         ztxrangekey=sorted(([trascriptdict[key][2][0][0],trascriptdict[key][2][-1][1]],key) for key in keptkeys)
         trange=ztxrangekey[0][0]
         for i in range(1,len(ztxrangekey)):
             txrange=ztxrangekey[i][0]
             if txrange[0]<=trange[1]:
                 # Overlap: extend the merged region.
                 trange=[trange[0],max(trange[1],txrange[1])]
             else:
                 message='Gene %s: multiple non-overlapping transcription regions; processing only one region:%s'%(gene,str(trange))
                 common.printstatus(message,'W',common.func_name())
                 ztxrangekey=ztxrangekey[0:i]
                 break
         newgenetranscriptdict[gene]=dict((transcript,trascriptdict[transcript]) for txrange,transcript in ztxrangekey)

     cmd='mv %s %s.bk'%(genetranscriptdict_file,genetranscriptdict_file)
     os.system(cmd)
     # The with-block guarantees the new pickle is flushed and closed.
     with open(genetranscriptdict_file,'w') as pckfile:
         cPickle.dump(newgenetranscriptdict,pckfile)
     return newgenetranscriptdict
Exemple #29
0
def createfoldersetup(outputdir, runid, numtypes, numdatasets, configfilename,
                      biasfilename):
    """
    Create the folder tree for one run below outputdir/runid.

    Layout (all created through shell commands):
        config/    receives copies of configfilename and biasfilename
        metadata/
        data/      one sub-folder per dataset:
                   numtypes == 0 -> T<dataset>S01 for dataset 1..numdatasets
                   otherwise     -> T<type>S<dataset> for every combination
    If outputdir/runid already exists a status message of kind 'F' is issued
    through common.printstatus before anything is created.

    returns: {'metadata': <metadata path>,
              'data': [numtypes, numdatasets, <data path>]}
    """
    runpath_parts = (outputdir, runid)
    if os.path.exists('%s/%s' % runpath_parts):
        common.printstatus('%s/%s already exists' % runpath_parts, 'F',
                           common.func_name())

    # fixed skeleton of the run folder
    for mkdir_template in ('mkdir -p %s/%s', 'mkdir -p %s/%s/config',
                           'mkdir -p %s/%s/data', 'mkdir -p %s/%s/metadata'):
        os.system(mkdir_template % runpath_parts)

    # keep a copy of the inputs that produced this run
    for inputfile in (configfilename, biasfilename):
        os.system('cp %s %s/%s/config' % (inputfile, outputdir, runid))

    if numtypes == 0:
        datafolders = [
            '%s/%s/data/T%02dS01' % (outputdir, runid, dataset)
            for dataset in range(1, numdatasets + 1)
        ]
    else:
        datafolders = [
            '%s/%s/data/T%02dS%02d' % (outputdir, runid, typeid, dataset)
            for typeid in range(1, numtypes + 1)
            for dataset in range(1, numdatasets + 1)
        ]
    for foldername in datafolders:
        os.system('mkdir -p %s' % foldername)

    return {
        'metadata': '%s/%s/metadata' % runpath_parts,
        'data': [numtypes, numdatasets, '%s/%s/data' % runpath_parts],
    }
Exemple #30
0
def arrowplot(title,xlabellist,ylabel,y1y2data,sigflglist,wtflglist,imagefilename):
    '''
    Plot one pair of values per category, joined by an arrow, and save the image.

    title        : figure title
    xlabellist   : tick labels; category i is drawn at x=i+1
    ylabel       : accepted for interface compatibility only; the y axis label
                   is fixed to 'Splicing Fraction'
    y1y2data     : per category [y1, y2]; y1 is a blue circle, y2 a red square
    sigflglist   : per category flag; 0 draws hollow markers, anything else filled
    wtflglist    : per category flag; 1 draws a thick arrow (linewidth 3),
                   anything else a thin one (linewidth 1)
    imagefilename: output path handed to plt.savefig

    No arrow is drawn when y1 == y2. A failure while saving is reported as a
    warning through common.printstatus instead of being raised. The figure is
    closed before returning so repeated calls do not accumulate open figures.
    '''
    fig = plt.figure(figsize=(8, 8))
    try:
        ax = fig.add_subplot(111)
        ax.set_xticks(list(range(1,len(xlabellist)+1)))
        ax.set_xticklabels(xlabellist)
        for i in range(len(y1y2data)):
            xpos=i+1
            # hollow markers flag entries whose sigflg is 0
            markerargs={}
            if sigflglist[i]==0:
                markerargs['markerfacecolor']='none'
            y1,y2=y1y2data[i][0],y1y2data[i][1]
            plt.plot(xpos,y1,'bo',**markerargs)
            plt.plot(xpos,y2,'rs',**markerargs)
            if y1!=y2:
                arrowwidth=3 if wtflglist[i]==1 else 1
                plt.annotate("",
                            xy=(xpos,y1), xycoords='data',
                            xytext=(xpos,y2), textcoords='data',
                            arrowprops=dict(arrowstyle="<|-",linewidth=arrowwidth,
                            connectionstyle="arc3"),
                            )
        plt.title(title)
        plt.xlabel('')
        plt.ylabel('Splicing Fraction')
        plt.axis([0,len(xlabellist)+1,-0.05,1.05])
        try:
            plt.savefig(imagefilename)
        except Exception:
            # best effort: an unwritable image must not abort the caller
            message='Problem in writing %s'%imagefilename
            common.printstatus(message,'W',common.func_name())
    finally:
        # release the figure; pyplot otherwise keeps every figure alive
        plt.close(fig)
Exemple #31
0
 def divdictToflow(self,divdict):
     '''
     Turn divergence points into weighted, normalised flow vectors.

     divdict[position]=[[outgoingflg,exonstartflg],[exon,flowlist]]
         outgoingflg : 0=incoming, 1=outgoing
         exonstartflg: 0=no exon start, 1=exon start,
                       2=start transcript and exon start,
                       3=end transcript and exon start,
                       4=start transcript and no exon start (in-splice),
                       5=end transcript and no exon start (out-splice)
         flowlist    : [exon/splice, ...]; length 1 expected for flags 2 and 3
     wtgraphstruct=(exonlist,intronlist,splicelist,startnodelist,endnodelist,
                    novelnodelist,exonwtlist,intronwtlist,splicewtlist)
     getedgevalue edgetype: 10/11=exon, 20/21=intron, 3=splice

     returns flowdict[position]=[[outgoingflg,exonstartflg],[[wt1,wt2],nflowvec]]
     Positions whose flow vector stays empty are left out of the result.
     '''
     # warning templates for the transcript start (2) and end (3) cases:
     # (unexpected flowlist length, negative remaining flow)
     boundarywarnings={
         2:('Transcript Start Exon has incoming splice; Flowlist : %s, %s',
            'Flow decreases at transcript start exon: %s; Prev Exon: %s; Weight Start=%10.4f, Before=%10.4f'),
         3:('Transcript End Exon has outgoing splice; Flowlist : %s, %s',
            'Flow increases at transcript end exon %s: Prev Exon: %s; Weight End=%10.4f, After=%10.4f'),
     }
     flowdict={}
     for position in divdict.keys():
         outgoingflg,exonstartflg=divdict[position][0]
         exon,flowlist=divdict[position][1]
         exonedgetype=10+outgoingflg
         wt1=self.getedgevalue(exon,exonedgetype)[0]
         flowvec=[]
         if exonstartflg in (2,3):
             lengthwarning,flowwarning=boundarywarnings[exonstartflg]
             if len(flowlist)==1:
                 # weight of the neighbouring exon; the remainder (never
                 # negative) is the flow that starts/ends at this exon
                 wtoth=self.getedgevalue(flowlist[0],exonedgetype)[0]
                 flowvec=[wtoth,max(wt1-wtoth,0)]
                 if wt1-wtoth<0:
                     message=flowwarning%(str(exon),common.fl2str(flowlist[0]),wt1,wtoth)
                     common.printstatus(message,'W',common.func_name())
             else:
                 message=lengthwarning%(str(exon),common.fl2str(flowlist))
                 common.printstatus(message,'W',common.func_name())
         elif exonstartflg==1:
             # first entry is an exon edge, the rest are splice edges
             flowvec=[self.getedgevalue(flowlist[0],exonedgetype)[0]]
             flowvec+=[self.getedgevalue(flowedge,3)[0] for flowedge in flowlist[1:]]
         elif exonstartflg==0:
             flowvec=[self.getedgevalue(flowedge,3)[0] for flowedge in flowlist]
         if not flowvec:
             continue
         wt2=sum(flowvec)
         nflowvec=common.normalize_vector(flowvec)
         flowdict[position]=[[outgoingflg,exonstartflg],[[wt1,wt2],nflowvec]]
     return flowdict
Exemple #32
0
    def getedgevalue(self,edge,edgetype):
        '''
        Look up the weight of an edge in the weighted graph structure self.tuple.

        edgetype = 10/11,20/21,3 = exon,retained intron,splice
        returns (wt,foundflg) with wt=0.0 when nothing matches and
        foundflg=0 if not found
                =1 if exact match
                =2 if one side matches exactly and the other within +-1
                =3 if both sides are off by one
                =4 if only one side matches within +-1 (start side for
                   edgetype 10/20, end side for 11/21)
        Exons and retained introns are searched together, exons first; the
        first hit wins. Splices must match exactly. A miss is reported through
        common.printstatus.
        ALERT: exon wt finding should be improved: All overlapping exon weight should be included
        '''
        exonlist,intronlist,splicelist,startnodelist,endnodelist,novelnodelist,exonwtlist,intronwtlist,splicewtlist=self.tuple
        edge=list(edge)
        nearby=(-1,0,1)
        wt=0.0
        foundflg=0
        if edgetype in [10,11,20,21]:
            allexonlist=exonlist+intronlist
            allexonwtlist=exonwtlist+intronwtlist
            # pass 1: both ends within one base of a known region
            for idx,exon in enumerate(allexonlist):
                startdiff=edge[0]-exon[0]
                if startdiff not in nearby:
                    continue
                enddiff=edge[1]-exon[1]
                if enddiff not in nearby:
                    continue
                wt=allexonwtlist[idx]
                if startdiff==0 and enddiff==0:
                    foundflg=1
                elif startdiff==0 or enddiff==0:
                    foundflg=2
                else:
                    foundflg=3
                break
            if foundflg==0:
                # pass 2: only the side selected by the edge type has to match
                side=0 if edgetype in [10,20] else 1
                for idx,exon in enumerate(allexonlist):
                    if edge[side]-exon[side] in nearby:
                        wt=allexonwtlist[idx]
                        foundflg=4
                        break
        if edgetype==3:
            for idx,splice in enumerate(splicelist):
                if edge[0]-splice[0]==0 and edge[1]-splice[1]==0:
                    wt=splicewtlist[idx]
                    foundflg=1
                    break
        if foundflg==0:
            message='Edge: %s; Type = %d; Edge weight: %10.4f; Found flag: %d'%(str(edge),edgetype,wt,foundflg)
            common.printstatus(message,'S',common.func_name())
        return (wt,foundflg)
Exemple #33
0
def getfragments(txstr, numfragments, strand, cutscoredict, fragmentsizerange,
                 txstartbias, numcutmu, numcutsig):
    """
    Simulate random fragmentation of a transcript and return fragments.

    input:
        txstr: transcript string
        numfragments: number of fragments output
        strand: strand of transcript; '+' trims the cut range at its end,
            anything else trims it at its start
        cutscoredict: dict(xmer:score); a position where an xmer starts gets
            that score as its relative cut probability, every other position
            scores 1 (an empty dict therefore gives uniform cutting)
        fragmentsizerange: [min, max] bound applied to (next cut - cut + 1)
        txstartbias: scales the exponential draw (mean 0.005 * txstartbias)
            that decides which fraction of the transcript is trimmed
        numcutmu, numcutsig: the number of cuts per round is
            max(numcutmu, int(gauss(numcutmu, numcutsig))), drawn once
    output:
        sorted list of numfragments [start, end] pairs sampled from all
        fragments that passed the size filter

    Rounds of cutting are repeated until at least numfragments fragments have
    been collected, so the call does not return if no fragment can ever pass
    the size filter.
    """
    outfragments = []
    fragmentscount = 0
    if numfragments == 0:
        return outfragments
    sttime = time.time()
    # debug_flg is a module-level flag this function does not define itself;
    # treat it as "off" when the module has not set it instead of failing.
    debug_on = globals().get('debug_flg', 0) == 1

    #compute cutpoints and probabilities
    scorelist = [1] * len(txstr)
    if cutscoredict:
        minxmer = min(len(xmer) for xmer in cutscoredict)
        for i in range(len(txstr) - minxmer + 1):
            for xmer in cutscoredict:
                if txstr[i:i + len(xmer)] == xmer:
                    scorelist[i] = cutscoredict[xmer]
                    break
    # cumulative distribution over positions; the total is computed once
    totalscore = sum(scorelist)
    scorecumlist = [x * 1.0 / totalscore for x in np.cumsum(scorelist)]
    scorecumlist.insert(0, 0)

    numcuts = max(numcutmu, int(random.gauss(numcutmu, numcutsig)))

    tries = 0
    while fragmentscount < numfragments:
        tries += 1
        txdegradepoint = np.random.exponential(0.005 * txstartbias)
        if strand == '+':
            cutrange = [0, int(round(len(txstr) * (1 - txdegradepoint)))]
        else:
            cutrange = [
                int(round(len(txstr) * txdegradepoint)),
                len(txstr) - 1
            ]
        cutpoints = list(np.random.uniform(0, 1, numcuts))
        cutpositions = []
        # map every uniform draw to the position whose cumulative-score
        # interval contains it; draws outside the cut range are discarded
        for cutpoint in cutpoints:
            for i in range(cutrange[0], cutrange[1]):
                if scorecumlist[i] < cutpoint < scorecumlist[i + 1]:
                    cutpositions.append(i)
        cutpositions.sort()
        cutpositions.insert(0, cutrange[0])
        cutpositions.append(cutrange[1] + 1)
        goodfragments = [
            [x[0], x[1] - 1] for x in zip(cutpositions[:-1], cutpositions[1:])
            if fragmentsizerange[0] <= x[1] - x[0] + 1 <= fragmentsizerange[1]
        ]
        outfragments += goodfragments
        fragmentscount += len(goodfragments)
    if debug_on:
        # guard against a zero elapsed time on coarse clocks
        eltime = max(time.time() - sttime, 1e-9)
        message = 'Start Time=%s' % sttime
        common.printstatus(message, 'S', common.func_name())
        message = 'Average fragment: %6.2f, Tries per read: %6.2f, Reads per second: %10.4f, Num reads: %d' % (
            len(txstr) / (numcuts + 0.1), tries /
            (numfragments + 0.1), numfragments / eltime, numfragments)
        common.printstatus(message, 'S', common.func_name())
    outfragments = random.sample(outfragments, numfragments)
    outfragments.sort()
    return outfragments
Exemple #34
0
        plot.plotscatterwithhistogram(jsdlist,mincovlist,'jsd','mincoverage','mincov-jsd\n%s'%numofgenestr,imagefilename,markertup=('.',5),logscaleyflg=1,logscalexflg=0)

# Script entry point: read the run configuration, generate the transcript
# source, create the output folders and start simulating expression.
if __name__ == "__main__":
    # module-level debug switch
    debug_flg=2
    # name -> callable table for the generation/read steps
    # NOTE(review): presumably consumed by runfunc -- confirm
    function_map={'RNAmetasource2source':RNAmetasource2source,'fragment2read':fragment2read}
    usage = 'usage: %prog [options] arg'
    parser = OptionParser(usage)
    # -c/--config selects the configuration file (default config/sregen.cfg)
    parser.add_option('-c', '--config', dest='cfgfile',default='config/sregen.cfg')
    (options, args) = parser.parse_args()
    config_file=options.cfgfile         
    
    # load and parse the run parameters
    cfg=runparam(config_file)
    cfg.parse()
        
    message='Program Started'
    common.printstatus(message,'S',common.func_name())
    
    
    # resolve the metasource files named by the configured generation method
    metasourcefilelist=[cfg.metasourcedict[x] for x in cfg.generate_method_files]
    parameters=cfg.generate_method_parameters
    parameterlist=[metasourcefilelist,parameters]
    genetxcigardict=runfunc(cfg.generate_method,parameterlist) #chr,strand,start,cigar,size
    
    # create the output folder tree; the config file and cut preference file are passed along
    folderdict=createfoldersetup(cfg.output_dir,cfg.run_name,cfg.numtypes,cfg.numdatasets,config_file,cfg.cutpreferencefile)
    
    # number of entries per gene in genetxcigardict
    genetxcountdict=dict([(gene,len(genetxcigardict[gene])) for gene in genetxcigardict])
    
    # numtypes == 0: one simulated expression profile per dataset
    if cfg.numtypes==0:
        for sample_id in range(1,cfg.numdatasets+1):
            genetxreaddict=getsimulatedoneexpression(genetxcountdict,cfg.readcount)
            replicate_id=1
Exemple #35
0
    def _genetranscriptdict2graphstruct(self, gene):
        '''
        get a graph structure for a gene
        Alert: Exon Ends may need correction
        '''
        transcriptexonlist = []
        genetranscriptdict = self._getgenetranscriptdict()
        trascriptdict = genetranscriptdict[gene]
        transcriptkeys = trascriptdict.keys()
        transcriptkeys.sort()
        chrstrand = trascriptdict[transcriptkeys[0]][0:2]
        removelist = []
        for key in transcriptkeys[1:]:
            if chrstrand != trascriptdict[key][0:2]:
                message = 'Gene %s in multiple chromosomes; processing only one location:%s' % (
                    gene, str(chrstrand))
                common.printstatus(message, 'W', common.func_name())
                removelist.append(key)
        for key in removelist:
            transcriptkeys.remove(key)
        txrangelist = [[
            trascriptdict[key][2][0][0], trascriptdict[key][2][-1][1]
        ] for key in transcriptkeys]
        ztxrangekey = zip(txrangelist, transcriptkeys)
        ztxrangekey.sort()
        txrangelist = [x[0] for x in ztxrangekey]
        transcriptkeys = [x[1] for x in ztxrangekey]
        trange = txrangelist[0]
        for i in range(1, len(txrangelist)):
            txrange = txrangelist[i]
            if txrange[0] <= trange[1]:
                trange = [trange[0], max(trange[1], txrange[1])]
            else:
                message = 'Gene %s: multiple non-overlapping transcription regions; processing only one region:%s' % (
                    gene, str(trange))
                common.printstatus(message, 'W', common.func_name())
                transcriptkeys = transcriptkeys[0:i]
                break

        splicelist = []
        startnodelist = []
        endnodelist = []
        for key in transcriptkeys:
            exonlist = trascriptdict[key][2]
            #print exonlist
            if exonlist[0][0] not in startnodelist:
                startnodelist.append(exonlist[0][0])
            if exonlist[-1][1] not in endnodelist:
                endnodelist.append(exonlist[-1][1])
            for exon in exonlist:
                if exon not in transcriptexonlist:
                    transcriptexonlist.append(exon)
            for i in range(1, len(exonlist)):
                splice = [exonlist[i - 1][1], exonlist[i][0]]
                if splice not in splicelist:
                    splicelist.append(splice)
        exonlist = []
        transcriptexonlist.sort()
        transcriptexonlist = [list(exon) for exon in transcriptexonlist]
        splicelist.sort()
        startnodelist.sort()
        endnodelist.sort()
        #print transcriptexonlist
        exonqueue = transcriptexonlist[0:1]
        for i in range(len(transcriptexonlist) - 1):
            exon = transcriptexonlist[i + 1]
            #print exonqueue, exon, exonlist
            tqueue = exonqueue[0:]
            for exonregion in tqueue:
                if exonregion[1] < exon[0]:
                    exonlist.append(exonregion)
                    exonqueue.remove(exonregion)
            found = 0
            tqueue = exonqueue[0:]
            for exonregion in tqueue:
                if exon[0] > exonregion[0] and exon[0] < exonregion[1]:
                    if exon[1] > exonregion[0] and exon[1] < exonregion[1]:
                        exonqueue.append([exonregion[0], exon[0] - 1])
                        exonqueue.append([exon[0], exon[1]])
                        exonqueue.append([exon[1] + 1, exonregion[1]])
                        exonqueue.remove(exonregion)
                    else:
                        exonqueue.append([exonregion[0], exon[0] - 1])
                        exonqueue.append([exon[0], exonregion[1]])
                        exonqueue.remove(exonregion)
                elif exon[1] > exonregion[0] and exon[1] < exonregion[1]:
                    exonqueue.append([exonregion[0], exon[1]])
                    exonqueue.append([exon[1] + 1, exonregion[1]])
                    exonqueue.remove(exonregion)
            exonqueue.sort()
            if len(exonqueue) > 0:
                if exon[1] > exonqueue[-1][1]:
                    exonqueue.append([exonqueue[-1][1] + 1, exon[1]])
            else:
                exonqueue.append(exon)
        for exonregion in exonqueue:
            if exonregion[0] < exonregion:
                exonlist.append(exonregion)
        exonlist.sort()
        intronlist = []
        novelnodelist = []
        for i in range(len(exonlist) - 1):
            if exonlist[i][1] + 1 < exonlist[i + 1][0] - 1:
                if [exonlist[i][1], exonlist[i + 1][0]] not in splicelist:
                    intronlist.append(
                        [exonlist[i][1] + 1, exonlist[i + 1][0] - 1])
                else:
                    dummyleftnode = (exonlist[i][1] + 1 + exonlist[i + 1][0] -
                                     1) / 2
                    dummyrightnode = dummyleftnode + 1
                    intronlist.append([exonlist[i][1] + 1, dummyleftnode])
                    intronlist.append([dummyrightnode, exonlist[i + 1][0] - 1])
                    novelnodelist.append(dummyleftnode)
#        print exonlist
#        print intronlist
#        print splicelist
#        print startnodelist
#        print endnodelist
        return (exonlist, intronlist, splicelist, startnodelist, endnodelist,
                novelnodelist)
Exemple #36
0
    def parse(self):
        config = ConfigParser.SafeConfigParser()
        config.read(self.config_file)

        self.pathsamtools = self._checkdir('samtools path',
                                           config.get('tools', 'pathsamtools'),
                                           common.func_name())
        self.pathucsctools = self._checkdir(
            'ucsc path', config.get('tools', 'pathucsctools'),
            common.func_name())
        self.pathbedtools = self._checkdir('bedtools path',
                                           config.get('tools', 'pathbedtools'),
                                           common.func_name())

        self.chromszfile = self._checkfile(
            'chromosome size file', config.get('reference', 'chromszfile'),
            common.func_name())
        self.chrfaifile = self._checkfile(
            'chromosome index file', config.get('reference', 'chrfaifile'),
            common.func_name())

        self.annotation_file = self._checkfile(
            'gene annotation file', config.get('reference', 'annotation_file'),
            common.func_name())
        self.annotation_file_type = config.get('reference',
                                               'annotation_file_type')

        self.root_dir = self._checkdir('root_dir path',
                                       config.get('project', 'root_dir'),
                                       common.func_name())

        #Data
        data_files = config.items('Data')
        data_dict1 = dict(data_files)
        data_dict = {}
        for key in data_dict1.keys():
            data_dict[key.upper()] = data_dict1[key]
        for sample in data_dict.keys():
            if not (os.path.isfile(data_dict[sample])):
                message = 'Config file has error(s): %s does not exist' % data_dict[
                    sample]
                common.printstatus(message, 'F', common.func_name())
        self.data_dict = data_dict

        datakeys = data_dict.keys()
        self.datadir_dict = dict(
            zip(datakeys,
                ['/'.join(data_dict[key].split('/')[:-1])
                 for key in datakeys]))

        #Project
        self.project_name = config.get('project', 'project_name')
        project_groups = config.get('project', 'project_groups')
        self.project_groups = [
            group.split(',') for group in project_groups.split('::')
        ]
        proj_tslist = []
        for groups in self.project_groups:
            groups.sort()
            proj_tslist += groups
        for ts in proj_tslist:
            if ts not in datakeys:
                message = 'Config file has error(s) in [Project] Groups: %s does not exist in Data' % ts
                common.printstatus(message, 'F', common.func_name())
        project_type = config.get('project', 'project_type')
        if project_type not in ['1', '2']:
            message = 'Config file has error(s) in Project Type should be 1 or 2'
            common.printstatus(message, 'F', common.func_name())
        self.project_type = int(project_type)
        self.run_type = self._checknum(config.get('project', 'run_type'),
                                       'int', common.func_name())

        self.ffast_min_cov = self._checknum(
            config.get('compute_params', 'ffast_min_cov'), 'float',
            common.func_name())
        self.ffast_min_fdm = self._checknum(
            config.get('compute_params', 'ffast_min_fdm'), 'float',
            common.func_name())

        self.ffull_partition = self._checknum(
            config.get('compute_params', 'ffull_partition'), 'int',
            common.func_name())
        self.ffull_permutation = self._checknum(
            config.get('compute_params', 'ffull_permutation'), 'int',
            common.func_name())
        self.ffull_pvalue = self._checknum(
            config.get('compute_params', 'ffull_pvalue'), 'float',
            common.func_name())

        self.cluster_max_dbi = self._checknum(
            config.get('compute_params', 'cluster_max_dbi'), 'float',
            common.func_name())
        #self.cluster_min_med_cov=self._checknum(config.get('compute_params','cluster_min_med_cov'),'float',common.func_name())
        #self.cluster_min_med_fdm=self._checknum(config.get('compute_params','cluster_min_med_fdm'),'float',common.func_name())

        self.ffull_genesplit_size = self._checknum(
            config.get('compute_params', 'ffull_genesplit_size'), 'int',
            common.func_name())
        self.report_top_x = self._checknum(
            config.get('compute_params', 'report_top_x'), 'int',
            common.func_name())
        self.graph_top_x = self._checknum(
            config.get('compute_params', 'graph_top_x'), 'int',
            common.func_name())

        #Run Flag
        self.run_all_flag = int(config.get('Runflags', 'run_all_flag'))
        self.run_pre_act_flag = max(
            self.run_all_flag, int(config.get('Runflags', 'run_pre_act_flag')))
        self.run_splice_collate_flag = max(
            self.run_all_flag,
            int(config.get('Runflags', 'run_splice_collate_flag')))
        self.run_proj_act_flag = max(
            self.run_all_flag, int(config.get('Runflags',
                                              'run_proj_act_flag')))
        self.run_extract_flows = max(
            self.run_all_flag, int(config.get('Runflags',
                                              'run_extract_flows')))
        self.run_fdm_fast_flag = max(
            self.run_all_flag, int(config.get('Runflags',
                                              'run_fdm_fast_flag')))
        #todo filter
        self.run_fdm_full_flag = max(
            self.run_all_flag, int(config.get('Runflags',
                                              'run_fdm_full_flag')))
        # not necessary
        self.run_cluster_flag = max(
            self.run_all_flag, int(config.get('Runflags', 'run_cluster_flag')))
        self.run_report_flag = max(
            self.run_all_flag, int(config.get('Runflags', 'run_report_flag')))

        #Runpreact
        if self.run_pre_act_flag == 1:
            pre_act_run_flags_tdict = dict(config.items('Runpreact'))
            pre_act_run_flags_dict = {}
            for key in pre_act_run_flags_tdict.keys():
                pre_act_run_flags_dict[key.upper()] = int(
                    pre_act_run_flags_tdict[key])
            for ts in pre_act_run_flags_dict.keys():
                if ts not in datakeys:
                    message = 'Config file has error(s) in [Runpreact]: %s does not exist in Data' % ts
                    common.printstatus(message, 'F', common.func_name())
            self.pre_act_run_list = [
                key for key in pre_act_run_flags_dict.keys()
                if pre_act_run_flags_dict[key] == 1
            ]
            self.pre_act_run_list.sort()
        else:
            self.pre_act_run_list = []

        #Runact
        if self.run_proj_act_flag == 1:
            act_run_flags_tdict = dict(config.items('Runact'))
            act_run_flags_dict = {}
            for key in act_run_flags_tdict.keys():
                act_run_flags_dict[key.upper()] = int(act_run_flags_tdict[key])
            for ts in act_run_flags_dict.keys():
                if ts not in datakeys:
                    message = 'Config file has error(s) in [Runact]: %s does not exist in Data' % ts
                    common.printstatus(message, 'F', common.func_name())
            self.act_run_list = [
                key for key in act_run_flags_dict.keys()
                if act_run_flags_dict[key] == 1
            ]
            self.act_run_list.sort()
        else:
            self.act_run_list = []

        #Extractflows
        ext_flow_gene_file = config.get('Extractflows', 'ext_flow_gene_file')
        if ext_flow_gene_file.lower(
        ) == 'all' or ext_flow_gene_file[0:3].lower() == 'chr':
            self.ext_flow_genelist = [ext_flow_gene_file.lower()]
            self.flow_prefix = self.project_name
        else:
            self.ext_flow_genelist = [
                ln.rstrip('\n').split('\t')[0]
                for ln in open(ext_flow_gene_file).readlines()
            ]
        self.flow_prefix = config.get('Extractflows', 'flow_prefix')

        #Runfastfdm
        self.ffast_prefix = config.get('Runfastfdm', 'ffast_prefix')
        ffast_gene_file = config.get('Runfastfdm', 'ffast_gene_file')
        if ffast_gene_file.lower() == 'all' or ffast_gene_file[0:3].lower(
        ) == 'chr':
            self.ffast_genelist = [ffast_gene_file.lower()]
            self.ffast_prefix = self.project_name
        else:
            self.ffast_genelist = [
                ln.rstrip('\n').split('\t')[0]
                for ln in open(ffast_gene_file).readlines()
            ]

        tffast_run_dict = dict(config.items('Runfastfdm'))
        ffast_run_dict = {}
        for key in tffast_run_dict:
            if key[0:9] == 'ffast_run':
                ffast_run_dict[key] = tffast_run_dict[key]
        allitems = []
        for key in ffast_run_dict.keys():
            allitems += ffast_run_dict[key].replace('::', ',').replace(
                '+', ',').replace(':', ',').replace('|', ',').split(',')
        for item in allitems:
            if item not in datakeys:
                message = 'Config file has error(s): [Runfastfdm] has incorrect definition;[Data] does not have %s' % (
                    item)
                common.printstatus(message, 'W', common.func_name())

        ffast_run_list = []
        for key in ffast_run_dict.keys():
            run_str = ffast_run_dict[key]
            if len(run_str.split('::')) == 1:
                if len(run_str.split(',')) > 1:
                    # within group a,b,c,d
                    run_items = run_str.split(',')
                    for i in range(len(run_items) - 1):
                        for j in range(i + 1, len(run_items)):
                            ffast_run_list.append([[run_items[i]],
                                                   [run_items[j]]])
                else:
                    # all pairs listed a:b|c:d
                    splitrunlist = [x for x in run_str.split('|')]
                    ffast_run_list = [[[x.split(':')[0]], [x.split(':')[1]]]
                                      for x in splitrunlist]
            else:
                run_pair = run_str.split('::')
                if len(run_pair[0].split(',')) == 1:
                    ffast_run_list.append(
                        [run_pair[0].split('+'), run_pair[1].split('+')])
                else:
                    for item1 in run_pair[0].split(','):
                        for item2 in run_pair[1].split(','):
                            ffast_run_list.append([[item1], [item2]])
        ffast_run_list.sort()
        self.ffast_run_list = ffast_run_list

        #RunCluster
        #        self.cluster_prefix=config.get('RunCluster','cluster_prefix')
        #        cluster_gene_file=config.get('RunCluster','cluster_gene_file')
        #        if cluster_gene_file.lower() =='all' or cluster_gene_file[0:3].lower()=='chr':
        #            cluster_gene_file=cluster_gene_file.lower()
        #            if cluster_gene_file=='chrx':
        #                self.cluster_genelist=['chrX']
        #            elif cluster_gene_file=='chry':
        #                self.cluster_genelist=['chrY']
        #            else:
        #                self.cluster_genelist=[cluster_gene_file]
        #        else:
        #            self.cluster_genelist=[ln.rstrip('\n') for ln in open(cluster_gene_file).readlines()]
        #self.cluster_unknown_flag=int(config.get('RunCluster','cluster_unknown_flag'))

        #Runfullfdm
        self.ffull_prefix = config.get('Runfullfdm', 'ffull_prefix')
        ffull_gene_file = config.get('Runfullfdm', 'ffull_gene_file')
        if ffull_gene_file.lower() == 'all' or ffull_gene_file[0:3].lower(
        ) == 'chr':
            self.ffull_genelist = [ffull_gene_file.lower()]
            self.ffull_prefix = self.project_name
        elif ffull_gene_file.lower() == 'none':
            self.ffull_genelist = []
        else:
            self.ffull_genelist = [
                ln.rstrip('\n').split('\t')[0]
                for ln in open(ffull_gene_file).readlines()
            ]

        tffull_run_dict = dict(config.items('Runfullfdm'))
        ffull_run_dict = {}
        for key in tffull_run_dict:
            if key[0:9] == 'ffull_run':
                ffull_run_dict[key] = tffull_run_dict[key]
        allitems = []
        for key in ffull_run_dict.keys():
            allitems += ffull_run_dict[key].replace('::', ',').replace(
                '+', ',').replace(':', ',').replace('|', ',').split(',')
        for item in allitems:
            if item not in datakeys:
                message = 'Config file has error(s): [Runfullfdm] has incorrect definition;[Data] does not have %s' % (
                    item)
                common.printstatus(message, 'W', common.func_name())

        ffull_run_list = []
        for key in ffull_run_dict.keys():
            run_str = ffull_run_dict[key]
            if len(run_str.split('::')) == 1:
                if len(run_str.split(',')) > 1:
                    # within group
                    run_items = run_str.split(',')
                    for i in range(len(run_items) - 1):
                        for j in range(i + 1, len(run_items)):
                            ffull_run_list.append([[run_items[i]],
                                                   [run_items[j]]])
                else:
                    # all pairs listed a:b|c:d
                    splitrunlist = [x for x in run_str.split('|')]
                    ffull_run_list = [[[x.split(':')[0]], [x.split(':')[1]]]
                                      for x in splitrunlist]
            else:
                run_pair = run_str.split('::')
                if len(run_pair[0].split(',')) == 1:
                    ffull_run_list.append(
                        [run_pair[0].split('+'), run_pair[1].split('+')])
                else:
                    for item1 in run_pair[0].split(','):
                        for item2 in run_pair[1].split(','):
                            ffull_run_list.append([[item1], [item2]])
        ffull_run_list.sort()
        if self.run_fdm_fast_flag == 1:
            self.ffull_run_list = ffull_run_list
        else:
            self.ffull_run_list = []
Exemple #37
0
    def _genetranscriptdict2graphstruct(self,gene):
        '''
        get a graph structure for a gene
        Alert: Exon Ends may need correction
        ''' 
        transcriptexonlist=[]
        genetranscriptdict=self._getgenetranscriptdict()
        trascriptdict=genetranscriptdict[gene]
        transcriptkeys=trascriptdict.keys()
        transcriptkeys.sort()
        chrstrand=trascriptdict[transcriptkeys[0]][0:2]
        removelist=[]
        for key in transcriptkeys[1:]:
            if chrstrand!=trascriptdict[key][0:2]:
                message='Gene %s in multiple chromosomes; processing only one location:%s'%(gene,str(chrstrand))
                common.printstatus(message,'W',common.func_name())
                removelist.append(key)
        for key in removelist:
            transcriptkeys.remove(key)
        txrangelist=[[trascriptdict[key][2][0][0],trascriptdict[key][2][-1][1]] for key in transcriptkeys]
        ztxrangekey=zip(txrangelist,transcriptkeys)
        ztxrangekey.sort()
        txrangelist=[x[0] for x in ztxrangekey]
        transcriptkeys=[x[1] for x in ztxrangekey]
        trange=txrangelist[0]
        for i in range(1,len(txrangelist)):
            txrange=txrangelist[i]
            if txrange[0]<=trange[1]:
                trange=[trange[0],max(trange[1],txrange[1])]
            else:
                message='Gene %s: multiple non-overlapping transcription regions; processing only one region:%s'%(gene,str(trange))
                common.printstatus(message,'W',common.func_name())
                transcriptkeys=transcriptkeys[0:i]
                break
  
        splicelist=[]
        startnodelist=[]
        endnodelist=[]
        for key in transcriptkeys:
            exonlist=trascriptdict[key][2]
            #print exonlist
            if exonlist[0][0] not in startnodelist:
                startnodelist.append(exonlist[0][0])
            if exonlist[-1][1] not in endnodelist:
                endnodelist.append(exonlist[-1][1])
            for exon in exonlist:
                if exon not in transcriptexonlist:
                    transcriptexonlist.append(exon)
            for i in range(1,len(exonlist)):
                splice = [exonlist[i-1][1],exonlist[i][0]]
                if splice not in splicelist:
                    splicelist.append(splice)
        exonlist=[]
        transcriptexonlist.sort()
        transcriptexonlist=[list(exon) for exon in transcriptexonlist]
        splicelist.sort()
        startnodelist.sort()
        endnodelist.sort()
        #print transcriptexonlist
        exonqueue=transcriptexonlist[0:1]
        for i in range(len(transcriptexonlist)-1):
            exon=transcriptexonlist[i+1]
            #print exonqueue, exon, exonlist
            tqueue=exonqueue[0:]
            for exonregion in tqueue:
                if exonregion[1]<exon[0]:
                    exonlist.append(exonregion)
                    exonqueue.remove(exonregion)
            found=0
            tqueue=exonqueue[0:]
            for exonregion in tqueue:
                if exon[0] > exonregion[0] and exon[0] < exonregion[1]:
                    if exon[1] > exonregion[0] and exon[1] < exonregion[1]:
                        exonqueue.append([exonregion[0],exon[0]-1])
                        exonqueue.append([exon[0],exon[1]])
                        exonqueue.append([exon[1]+1,exonregion[1]])
                        exonqueue.remove(exonregion)
                    else:
                        exonqueue.append([exonregion[0],exon[0]-1])
                        exonqueue.append([exon[0],exonregion[1]])
                        exonqueue.remove(exonregion)
                elif exon[1] > exonregion[0] and exon[1] < exonregion[1]:
                    exonqueue.append([exonregion[0],exon[1]])
                    exonqueue.append([exon[1]+1,exonregion[1]])
                    exonqueue.remove(exonregion)
            exonqueue.sort()     
            if len(exonqueue)>0:   
                if exon[1]>exonqueue[-1][1]:
                    exonqueue.append([exonqueue[-1][1]+1,exon[1]])
            else:
                exonqueue.append(exon)
        for exonregion in exonqueue:
            if exonregion[0]<exonregion:
                exonlist.append(exonregion)
        exonlist.sort()
        intronlist=[]
        novelnodelist=[]
        for i in range(len(exonlist)-1):
            if exonlist[i][1]+1<exonlist[i+1][0]-1:
                if [exonlist[i][1],exonlist[i+1][0]] not in splicelist:
                    intronlist.append([exonlist[i][1]+1,exonlist[i+1][0]-1])
                else:
                    dummyleftnode=(exonlist[i][1]+1+exonlist[i+1][0]-1)/2
                    dummyrightnode=dummyleftnode+1
                    intronlist.append([exonlist[i][1]+1,dummyleftnode])
                    intronlist.append([dummyrightnode,exonlist[i+1][0]-1])
                    novelnodelist.append(dummyleftnode)
#        print exonlist
#        print intronlist
#        print splicelist
#        print startnodelist
#        print endnodelist
        return (exonlist,intronlist,splicelist,startnodelist,endnodelist,novelnodelist)
Exemple #38
0
 def _checkfile(self,filename,filepath,function_name):
     if not(os.path.isfile(filepath)):
         message='%s does not exist'%filename
         common.printstatus(message,'F',function_name) 
     else:
         return filepath       
Exemple #39
0
    def parse(self):
        config = ConfigParser.SafeConfigParser()
        config.read(self.config_file)
        
        self.pathsamtools=self._checkdir('samtools path',config.get('tools','pathsamtools'),common.func_name())
        self.pathucsctools=self._checkdir('ucsc path',config.get('tools','pathucsctools'),common.func_name())
        self.pathbedtools=self._checkdir('bedtools path',config.get('tools','pathbedtools'),common.func_name())
        
        self.chromszfile =self._checkfile('chromosome size file',config.get('reference','chromszfile'),common.func_name())
        self.chrfaifile =self._checkfile('chromosome index file',config.get('reference','chrfaifile'),common.func_name())
        
        self.annotation_file =self._checkfile('gene annotation file',config.get('reference','annotation_file'),common.func_name())
        self.annotation_file_type = config.get('reference','annotation_file_type')
        
        
        self.root_dir=self._checkdir('root_dir path',config.get('project','root_dir'),common.func_name())  
            
        #Data
        data_files=config.items('Data')
        data_dict1=dict(data_files)
        data_dict={}
        for key in data_dict1.keys():
            data_dict[key.upper()]=data_dict1[key]
        for sample in data_dict.keys():
            if not(os.path.isfile(data_dict[sample])):
                message='Config file has error(s): %s does not exist'%data_dict[sample]
                common.printstatus(message,'F',common.func_name())  
        self.data_dict=data_dict
        
        datakeys=data_dict.keys()
        self.datadir_dict=dict(zip(datakeys,['/'.join(data_dict[key].split('/')[:-1]) for key in datakeys]))
        
        #Project
        self.project_name=config.get('project','project_name')
        project_groups=config.get('project','project_groups')
        self.project_groups=[group.split(',') for group in project_groups.split('::')]
        proj_tslist=[]
        for groups in self.project_groups:
            groups.sort()
            proj_tslist+=groups
        for ts in proj_tslist:
            if ts not in datakeys:
                message='Config file has error(s) in [Project] Groups: %s does not exist in Data'%ts
                common.printstatus(message,'F',common.func_name()) 
        project_type=config.get('project','project_type')
        if project_type not in ['1','2']:
            message='Config file has error(s) in Project Type should be 1 or 2'
            common.printstatus(message,'F',common.func_name()) 
        self.project_type=int(project_type)
        self.run_type=self._checknum(config.get('project','run_type'),'int',common.func_name())        

        self.ffast_min_cov=self._checknum(config.get('compute_params','ffast_min_cov'),'float',common.func_name())
        self.ffast_min_fdm=self._checknum(config.get('compute_params','ffast_min_fdm'),'float',common.func_name())
        
        self.ffull_partition=self._checknum(config.get('compute_params','ffull_partition'),'int',common.func_name())
        self.ffull_permutation=self._checknum(config.get('compute_params','ffull_permutation'),'int',common.func_name())
        self.ffull_pvalue=self._checknum(config.get('compute_params','ffull_pvalue'),'float',common.func_name())
        
        self.cluster_max_dbi=self._checknum(config.get('compute_params','cluster_max_dbi'),'float',common.func_name())
        #self.cluster_min_med_cov=self._checknum(config.get('compute_params','cluster_min_med_cov'),'float',common.func_name())
        #self.cluster_min_med_fdm=self._checknum(config.get('compute_params','cluster_min_med_fdm'),'float',common.func_name())
        
        self.ffull_genesplit_size=self._checknum(config.get('compute_params','ffull_genesplit_size'),'int',common.func_name())
        self.report_top_x=self._checknum(config.get('compute_params','report_top_x'),'int',common.func_name())
        self.graph_top_x=self._checknum(config.get('compute_params','graph_top_x'),'int',common.func_name())
        
        
        #Run Flag
        self.run_all_flag=int(config.get('Runflags','run_all_flag'))
        self.run_pre_act_flag=max(self.run_all_flag,int(config.get('Runflags','run_pre_act_flag')))
        self.run_splice_collate_flag=max(self.run_all_flag,int(config.get('Runflags','run_splice_collate_flag')))
        self.run_proj_act_flag=max(self.run_all_flag,int(config.get('Runflags','run_proj_act_flag')))
        self.run_extract_flows=max(self.run_all_flag,int(config.get('Runflags','run_extract_flows')))
        self.run_fdm_fast_flag=max(self.run_all_flag,int(config.get('Runflags','run_fdm_fast_flag')))
        #todo filter
        self.run_fdm_full_flag=max(self.run_all_flag,int(config.get('Runflags','run_fdm_full_flag')))
        # not necessary
        self.run_cluster_flag=max(self.run_all_flag,int(config.get('Runflags','run_cluster_flag')))
        self.run_report_flag=max(self.run_all_flag,int(config.get('Runflags','run_report_flag')))
        
        #Runpreact
        if self.run_pre_act_flag==1:
            pre_act_run_flags_tdict=dict(config.items('Runpreact'))
            pre_act_run_flags_dict={}
            for key in pre_act_run_flags_tdict.keys():
                pre_act_run_flags_dict[key.upper()]=int(pre_act_run_flags_tdict[key])
            for ts in pre_act_run_flags_dict.keys():
                if ts not in datakeys:
                    message='Config file has error(s) in [Runpreact]: %s does not exist in Data'%ts
                    common.printstatus(message,'F',common.func_name())   
            self.pre_act_run_list=[key for key in pre_act_run_flags_dict.keys() if pre_act_run_flags_dict[key]==1]   
            self.pre_act_run_list.sort()
        else:
            self.pre_act_run_list=[]


        #Runact
        if self.run_proj_act_flag==1:
            act_run_flags_tdict=dict(config.items('Runact'))
            act_run_flags_dict={}
            for key in act_run_flags_tdict.keys():
                act_run_flags_dict[key.upper()]=int(act_run_flags_tdict[key])
            for ts in act_run_flags_dict.keys():
                if ts not in datakeys:
                    message='Config file has error(s) in [Runact]: %s does not exist in Data'%ts
                    common.printstatus(message,'F',common.func_name())   
            self.act_run_list=[key for key in act_run_flags_dict.keys() if act_run_flags_dict[key]==1]   
            self.act_run_list.sort()
        else:
            self.act_run_list=[]
        
        #Extractflows
        ext_flow_gene_file=config.get('Extractflows','ext_flow_gene_file')  
        if ext_flow_gene_file.lower() =='all' or ext_flow_gene_file[0:3].lower()=='chr':
            self.ext_flow_genelist=[ext_flow_gene_file.lower()]
            self.flow_prefix=self.project_name
        else:
            self.ext_flow_genelist=[ln.rstrip('\n').split('\t')[0] for ln in open(ext_flow_gene_file).readlines()]
        self.flow_prefix=config.get('Extractflows','flow_prefix') 
        
        #Runfastfdm   
        self.ffast_prefix=config.get('Runfastfdm','ffast_prefix')   
        ffast_gene_file=config.get('Runfastfdm','ffast_gene_file')
        if ffast_gene_file.lower() =='all' or ffast_gene_file[0:3].lower()=='chr':
            self.ffast_genelist=[ffast_gene_file.lower()]
            self.ffast_prefix=self.project_name
        else:
            self.ffast_genelist=[ln.rstrip('\n').split('\t')[0] for ln in open(ffast_gene_file).readlines()]
        
        tffast_run_dict=dict(config.items('Runfastfdm'))
        ffast_run_dict={}
        for key in tffast_run_dict:
            if key[0:9]=='ffast_run':
                ffast_run_dict[key]=tffast_run_dict[key]
        allitems=[]
        for key in ffast_run_dict.keys():
            allitems+=ffast_run_dict[key].replace('::',',').replace('+',',').replace(':',',').replace('|',',').split(',')
        for item in allitems:
            if item not in datakeys:
                message='Config file has error(s): [Runfastfdm] has incorrect definition;[Data] does not have %s'%(item)
                common.printstatus(message,'W',common.func_name()) 
                
        ffast_run_list=[]
        for key in ffast_run_dict.keys():
            run_str=ffast_run_dict[key]
            if len(run_str.split('::'))==1:
                if len(run_str.split(','))>1:
                    # within group a,b,c,d
                    run_items=run_str.split(',')
                    for i in range(len(run_items)-1):
                        for j in range(i+1,len(run_items)):
                            ffast_run_list.append([[run_items[i]],[run_items[j]]])
                else:
                    # all pairs listed a:b|c:d
                    splitrunlist=[x for x in run_str.split('|')]
                    ffast_run_list=[[[x.split(':')[0]],[x.split(':')[1]]] for x in splitrunlist]
            else:
                run_pair=run_str.split('::')
                if len(run_pair[0].split(','))==1:
                    ffast_run_list.append([run_pair[0].split('+'),run_pair[1].split('+')])
                else:
                    for item1 in run_pair[0].split(','):
                        for item2 in run_pair[1].split(','):
                            ffast_run_list.append([[item1],[item2]])
        ffast_run_list.sort()
        self.ffast_run_list=ffast_run_list

        
        #RunCluster     
#        self.cluster_prefix=config.get('RunCluster','cluster_prefix')           
#        cluster_gene_file=config.get('RunCluster','cluster_gene_file')
#        if cluster_gene_file.lower() =='all' or cluster_gene_file[0:3].lower()=='chr':
#            cluster_gene_file=cluster_gene_file.lower()
#            if cluster_gene_file=='chrx':
#                self.cluster_genelist=['chrX']
#            elif cluster_gene_file=='chry':
#                self.cluster_genelist=['chrY']
#            else:
#                self.cluster_genelist=[cluster_gene_file]            
#        else:
#            self.cluster_genelist=[ln.rstrip('\n') for ln in open(cluster_gene_file).readlines()]
        #self.cluster_unknown_flag=int(config.get('RunCluster','cluster_unknown_flag'))

           
        #Runfullfdm
        self.ffull_prefix=config.get('Runfullfdm','ffull_prefix')     
        ffull_gene_file=config.get('Runfullfdm','ffull_gene_file')
        if ffull_gene_file.lower() =='all' or ffull_gene_file[0:3].lower()=='chr':
            self.ffull_genelist=[ffull_gene_file.lower()]
            self.ffull_prefix=self.project_name
        elif ffull_gene_file.lower() =='none':
            self.ffull_genelist=[]
        else:
            self.ffull_genelist=[ln.rstrip('\n').split('\t')[0] for ln in open(ffull_gene_file).readlines()]
        
     
        tffull_run_dict=dict(config.items('Runfullfdm'))
        ffull_run_dict={}
        for key in tffull_run_dict:
            if key[0:9]=='ffull_run':
                ffull_run_dict[key]=tffull_run_dict[key]
        allitems=[]
        for key in ffull_run_dict.keys():
            allitems+=ffull_run_dict[key].replace('::',',').replace('+',',').replace(':',',').replace('|',',').split(',')
        for item in allitems:
            if item not in datakeys:
                message='Config file has error(s): [Runfullfdm] has incorrect definition;[Data] does not have %s'%(item)
                common.printstatus(message,'W',common.func_name()) 
                
        ffull_run_list=[]
        for key in ffull_run_dict.keys():
            run_str=ffull_run_dict[key]
            if len(run_str.split('::'))==1:
                if len(run_str.split(','))>1:
                    # within group
                    run_items=run_str.split(',')
                    for i in range(len(run_items)-1):
                        for j in range(i+1,len(run_items)):
                            ffull_run_list.append([[run_items[i]],[run_items[j]]])
                else:
                    # all pairs listed a:b|c:d
                    splitrunlist=[x for x in run_str.split('|')]
                    ffull_run_list=[[[x.split(':')[0]],[x.split(':')[1]]] for x in splitrunlist]
            else:
                run_pair=run_str.split('::')
                if len(run_pair[0].split(','))==1:
                    ffull_run_list.append([run_pair[0].split('+'),run_pair[1].split('+')])
                else:
                    for item1 in run_pair[0].split(','):
                        for item2 in run_pair[1].split(','):
                            ffull_run_list.append([[item1],[item2]])
        ffull_run_list.sort()
        if self.run_fdm_fast_flag==1:
            self.ffull_run_list=ffull_run_list
        else:
            self.ffull_run_list=[]
Exemple #40
0
 def __init__(self, config_file):
     if os.path.isfile(config_file):
         self.config_file = config_file
     else:
         message = 'Config file %s does not exist' % config_file
         common.printstatus(message, 'F', common.func_name())
Exemple #41
0
def gensimulatedreadsdata(sample_id,replicate_id,folderdict,genetxreaddict,genetxcigardict,config_object):
    """
    Simulate reads for one sample/replicate and write them as SAM and BAM files.

    input:
        sample_id, replicate_id: integers used for the T<sample>S<replicate> names
        folderdict: folderdict['data'][2] is the directory holding the
            T..S.. alignment folders, folderdict['metadata'] receives the
            expression_T..S...txt file
        genetxreaddict, genetxcigardict: per gene/transcript dictionaries;
            genetxcigardict[gene][tx][2] is used as the transcript start base
            and genetxcigardict[gene][tx][3] as its cigar string
        config_object: the attributes read are read_extract_parameters,
            read_extract_method, txstartbias, cutpreferencefile,
            reference_genome_dir, readqualitydeteriorationrate, indelrate,
            pathsamtools and chrfaifile
    output:
        1; as side effects alignments.sam/.bam are written, and
        alignments_with_errors.sam is always created but only filled (and
        converted to .bam) when readqualitydeteriorationrate or indelrate is > 0.
        Chromosomes without a <reference_genome_dir>/<chr>.fa file are skipped.
    """
    if debug_flg==2:
        funcstarttime=time.time()
    fragmentsizerange=[int(config_object.read_extract_parameters[2]),int(config_object.read_extract_parameters[3])]
    txstartbias=config_object.txstartbias
    # xmer -> cut score, one tab-separated pair per line
    with open(config_object.cutpreferencefile) as cutfin:
        cutscoredict=dict([(ln.split('\t')[0],float(ln.rstrip('\n').split('\t')[1])) for ln in cutfin])
    witherrors=max(config_object.readqualitydeteriorationrate,config_object.indelrate)>0.0

    alignsam='%s/T%02dS%02d/alignments.sam'%(folderdict['data'][2],sample_id,replicate_id)
    alignbam='%s/T%02dS%02d/alignments.bam'%(folderdict['data'][2],sample_id,replicate_id)
    alignerrsam='%s/T%02dS%02d/alignments_with_errors.sam'%(folderdict['data'][2],sample_id,replicate_id)
    alignerrbam='%s/T%02dS%02d/alignments_with_errors.bam'%(folderdict['data'][2],sample_id,replicate_id)
    alignfout=open(alignsam,'w')
    alignerrfout=open(alignerrsam,'w')
    try:
        txgenereaddict=genetxreads2txgenereads(genetxreaddict,genetxcigardict)
        metadatafile='%s/expression_T%02dS%02d.txt'%(folderdict['metadata'],sample_id,replicate_id)
        with open(metadatafile,'w') as fout:
            for gene in txgenereaddict:
                for tx in txgenereaddict[gene]:
                    fout.write('%s:%s:%s\t%s\tTX\t%d\t%6.4f\t%d\t%d\t%s\n'%tuple(txgenereaddict[gene][tx]))
        chrgenedict=genetxdict2chrgenedict(genetxcigardict)
        chrlist=sorted(chrgenedict.keys())
        if debug_flg==2:
            tottxstrgettime=0
            totfraglistgettime=0
            totreadlistgettime=0
            toterrorreadgettime=0
        for chrname in chrlist:
            chrfafile='%s/%s.fa'%(config_object.reference_genome_dir,chrname)
            if not(os.path.isfile(chrfafile)):
                message='%s: not found'%chrfafile
                common.printstatus(message,'S',common.func_name())
                continue
            chrfafileptr=open(chrfafile)
            try:
                for gene in chrgenedict[chrname]:
                    for tx in genetxcigardict[gene]:
                        if debug_flg==2:
                            loopstarttime=time.time()
                        numreads=txgenereaddict[gene][tx][6]
                        startbase,cigarstr=genetxcigardict[gene][tx][2],genetxcigardict[gene][tx][3]
                        txstr=getRNAtranscriptstring(chrfafileptr,startbase,cigarstr)
                        if debug_flg==2:
                            txstrgettime=time.time()
                            tottxstrgettime+=txstrgettime-loopstarttime
                            message='Gene:%s, Transcript:%s; Transcript Size:%d, Num reads=%d'%(gene,tx,len(txstr),numreads)
                            common.printstatus(message,'S',common.func_name())
                        # transcripts shorter than the smallest fragment yield no reads
                        if len(txstr)<fragmentsizerange[0]:
                            continue
                        #empirical number of cuts
                        numcutmu=max(1,int(len(txstr)*4.0/(fragmentsizerange[0]+fragmentsizerange[1])))
                        numcutsig=numcutmu/2.0
                        fragmentlist=getfragments(txstr,numreads,txgenereaddict[gene][tx][2],cutscoredict,fragmentsizerange,txstartbias,numcutmu,numcutsig)
                        if debug_flg==2:
                            fraglistgettime=time.time()
                            totfraglistgettime+=fraglistgettime-txstrgettime
                        allparameterlist=[txstr,cigarstr,fragmentlist,config_object.read_extract_parameters]
                        readlist=runfunc(config_object.read_extract_method,allparameterlist)
                        if debug_flg==2:
                            readlistgettime=time.time()
                            totreadlistgettime+=readlistgettime-fraglistgettime
                        inum=0
                        for read in readlist:
                            rstartbase,readstr,readcigar=read
                            # offset of the read start from startbase, from the cigar lengths up to the read
                            readstartlocation=sum([int(x) for x in common.cigarsubstr(cigarstr,0,rstartbase+1).replace('M','N').split('N')[:-1]])-1
                            inum+=1
                            alignfout.write('%s:%d:%d\t1\t%s\t%d\t0\t%s\t*\t0\t0\t%s\t*\n'%
                                            (tx,inum,rstartbase,chrname,startbase+readstartlocation,readcigar,readstr))
                            if witherrors:
                                errreadstr,errcigarstr=adderrortoread(readstr,readcigar,config_object.readqualitydeteriorationrate,config_object.indelrate)
                                alignerrfout.write('%s:%d:%d\t1\t%s\t%d\t0\t%s\t*\t0\t0\t%s\t*\n'%
                                            (tx,inum,rstartbase,chrname,startbase+readstartlocation,errcigarstr,errreadstr))
                        if debug_flg==2:
                            errorreadgettime=time.time()
                            toterrorreadgettime+=errorreadgettime-readlistgettime
            finally:
                # one FASTA handle per chromosome; release it before the next one
                chrfafileptr.close()
    finally:
        # the SAM files must be flushed and closed before samtools reads them
        alignfout.close()
        alignerrfout.close()
    if debug_flg==2:
        functime=time.time()-funcstarttime
        othertime=functime-(tottxstrgettime+totfraglistgettime+totreadlistgettime+toterrorreadgettime)
        message='\n\nFunc Time=%s,Txstr=%4.2f,Fragtime=%4.2f,Readtime=%4.2f,Errtime=%4.2f,Oth=%4.2f\n\n'% \
                 (functime,tottxstrgettime*100/functime,totfraglistgettime*100/functime,
                  totreadlistgettime*100/functime,toterrorreadgettime*100/functime,othertime*100/functime)
        common.printstatus(message,'S',common.func_name())
    cmd='%s/samtools view -bt %s %s >%s'%(config_object.pathsamtools,config_object.chrfaifile,alignsam,alignbam)
    message='Running %s'%cmd
    common.printstatus(message,'S',common.func_name())
    os.system(cmd)
    if witherrors:
        cmd='%s/samtools view -bt %s %s >%s'%(config_object.pathsamtools,config_object.chrfaifile,alignerrsam,alignerrbam)
        message='Running %s'%cmd
        common.printstatus(message,'S',common.func_name())
        os.system(cmd)
    return 1
Exemple #42
0
def getfragments(txstr,numfragments,strand,cutscoredict,fragmentsizerange,txstartbias,numcutmu,numcutsig):
    """
    Randomly cut a transcript into pieces and return numfragments of the pieces
    whose size falls inside fragmentsizerange.

    input:
        txstr: transcript string
        numfragments: number of fragments output
        strand: strand of transcript; '+' trims the cut range at the end of txstr,
                anything else trims it at the start
        cutscoredict: dict(xmer:score) giving the cut score at the position where the xmer
                      starts in txstr, default 1 (an empty dict gives every position score 1);
                      scores are assumed to be non-negative
        fragmentsizerange: [min,max] size of fragments output
        txstartbias: bias of fragments extracted from the start of the transcript
                     (scales the mean of the exponential draw that trims the cut range)
        numcutmu: mean number of cuts, also the minimum number of cuts used
        numcutsig: standard deviation of the number of cuts
    output:
        fragments: sorted list of [start,end]
    note:
        the sampling loop only stops once numfragments fragments of an acceptable size
        have been collected, so the caller must pass a transcript that can yield them
    """
    import bisect
    outfragments=[]; fragmentscount=0
    if numfragments==0:
        return outfragments
    sttime=time.time()
    txlen=len(txstr)
    #compute cutpoints and probabilities
    scorelist=[1]*txlen
    if cutscoredict:
        #an empty dict used to crash in min(); it now simply keeps the default score
        minxmer=min(len(xmer) for xmer in cutscoredict)
        for i in range(txlen-minxmer+1):
            for xmer in cutscoredict:
                if txstr[i:i+len(xmer)]==xmer:
                    scorelist[i]=cutscoredict[xmer]
                    break
    #total is computed once instead of once per position
    totalscore=sum(scorelist)
    scorecumlist=[x*1.0/totalscore for x in np.cumsum(scorelist)]
    scorecumlist.insert(0,0)

    numcuts=max(numcutmu,int(random.gauss(numcutmu,numcutsig)))

    tries=0
    while fragmentscount<numfragments:
        tries+=1
        txdegradepoint=np.random.exponential(0.005*txstartbias)
        if strand=='+':
            cutrange=[0,int(round(txlen*(1-txdegradepoint)))]
        else:
            cutrange=[int(round(txlen*txdegradepoint)),txlen-1]
        cutpositions=[]
        for cutpoint in np.random.uniform(0,1,numcuts):
            #position i with scorecumlist[i] < cutpoint < scorecumlist[i+1];
            #the cumulative list is non-decreasing, so a binary search finds it
            i=bisect.bisect_left(scorecumlist,cutpoint)-1
            if cutrange[0]<=i<cutrange[1] and cutpoint<scorecumlist[i+1]:
                cutpositions.append(i)
        cutpositions.sort()
        cutpositions.insert(0,cutrange[0])
        cutpositions.append(cutrange[1]+1)
        goodfragments=[[x[0],x[1]-1] for x in zip(cutpositions[:-1],cutpositions[1:]) if fragmentsizerange[0]<=x[1]-x[0]+1<= fragmentsizerange[1]]
        outfragments+=goodfragments
        fragmentscount+=len(goodfragments)
    if debug_flg==1:
        eltime=time.time()-sttime
        message='Start Time=%s'%sttime
        common.printstatus(message,'S',common.func_name())
        #guard the rate against a zero elapsed time on coarse clocks
        readspersec=numfragments/eltime if eltime>0 else 0.0
        message='Average fragment: %6.2f, Tries per read: %6.2f, Reads per second: %10.4f, Num reads: %d'%(len(txstr)/(numcuts+0.1),tries/(numfragments+0.1),readspersec,numfragments)
        common.printstatus(message,'S',common.func_name())
    outfragments=random.sample(outfragments,numfragments)
    outfragments.sort()
    return outfragments
Exemple #43
0
    def Toimage(self,gene, imagedir,genomerange=[0,10000000000],highlightnodelist=[],ext='pdf',inwgs=[]):
        """
        Draw the weighted graph of one gene as a vertical diagram and save it under imagedir.

        input:
            gene: gene name; its graph is fetched with self.getwtgraphstruct when inwgs is empty
            imagedir: directory the image file is written to
            genomerange: [start,end]; exons, introns, splices and nodes outside it are not drawn
            highlightnodelist: nodes whose coordinate label is printed in red
            ext: file extension of the saved image
            inwgs: optional graph tuple (exonlist,intronlist,splicelist,startnodelist,endnodelist,
                   novelnodelist,exonwtlist,intronwtlist,splicewtlist) used instead of the stored one
        output:
            None; the image is saved as <imagedir>/<label>.<ext>, the label being built from
            self.name, gene and (when restricted) the genome range
        notes:
            - the list defaults are only read, never mutated
            - when exons+introns exceed 100 the range is narrowed to the first 30 exons
            - the figure is always closed before returning so repeated calls do not pile up
              open pyplot figures
        """
        ts=self.name.split('/')[-1][:-4]
        #image rendering constants
        hconst=30;gwidth=100.0;exonplotdistbasis=10.0;msize=10;hoffset=10;woffset=50

        if inwgs==[]:
            wtgraphstruct=self.getwtgraphstruct(gene)
        else:
            wtgraphstruct=inwgs
        exonlist,intronlist,splicelist,startnodelist,endnodelist,novelnodelist,exonwtlist,intronwtlist,splicewtlist=wtgraphstruct

        if genomerange!=[0,10000000000]:
            glabel='%s:%s:%d-%d'%(ts,gene,genomerange[0],genomerange[1])
        else:
            glabel='%s:%s'%(ts,gene)

        imgsz=len(exonlist)+len(intronlist)
        if imgsz>100:
            message='Image too large for %s. Printing first few exons'%glabel
            common.printstatus(message,'W',common.func_name(),1)
            firstexons=exonlist[0:30]
            genomerange=[firstexons[0][0]-1,firstexons[-1][1]+1]
            glabel='%s:%s:%d-%d'%(ts,gene,genomerange[0],genomerange[1])

        rangelo,rangehi=genomerange[0],genomerange[1]

        def outside(lo,hi):
            #true when the interval [lo,hi] is not fully inside the plotted range
            return lo<rangelo or hi>rangehi

        allpoints=[]; seenpoints=set()

        def addpoint(pt,snap):
            #snap=True: do not add pt when a point one base to either side is already present
            if pt in seenpoints:
                return
            if snap and (pt-1 in seenpoints or pt+1 in seenpoints):
                return
            seenpoints.add(pt)
            allpoints.append(pt)

        #order matters: splice ends and tx start/end nodes first, exon/intron ends snap onto them
        for splice in splicelist:
            if outside(splice[0],splice[1]):
                continue
            addpoint(splice[0],False)
            addpoint(splice[1],False)
        for nodes in (startnodelist,endnodelist):
            for node in nodes:
                if outside(node,node):
                    continue
                addpoint(node,False)
        for segments in (exonlist,intronlist):
            for segment in segments:
                if outside(segment[0],segment[1]):
                    continue
                addpoint(segment[0],True)
                addpoint(segment[1],True)

        allpoints.sort()
        if len(allpoints)<2:
            #nothing to space out; min() over the empty distance list used to raise here
            message='Not enough nodes to draw %s'%glabel
            common.printstatus(message,'W',common.func_name(),1)
            return

        #vertical spacing grows with the log of the genomic distance between neighbouring points
        pointdistlist=[(allpoints[i]-allpoints[i-1]) for i in range(1,len(allpoints))]
        mindist=max(exonplotdistbasis,min(pointdistlist))
        plotdistlist=[math.log(max(x,mindist),mindist) for x in pointdistlist]
        runtotal=0
        allpointsy=[runtotal]
        for point in plotdistlist:
            runtotal+=point
            allpointsy.append(runtotal*hconst)

        pointplotdict=dict(zip(allpoints,allpointsy))

        def ypos(pt):
            #y of pt, or of the neighbour one base away that pt was snapped onto
            for candidate in (pt,pt-1,pt+1):
                if candidate in pointplotdict:
                    return pointplotdict[candidate]
            #not reachable for in-range segments: addpoint guarantees one of the three exists
            raise KeyError(pt)

        gheight=sum(plotdistlist)*hconst
        fig=plt.figure(figsize=(4,4*gheight/gwidth))
        ax=fig.add_subplot(111)
        ax.axis('off')

        try:
            #plot exons (blue) and introns (cyan) with their weights
            for segments,weights,linestyle in ((exonlist,exonwtlist,'b-'),(intronlist,intronwtlist,'c-')):
                for i in range(len(segments)):
                    segment=segments[i]
                    if outside(segment[0],segment[1]):
                        continue
                    y1=ypos(segment[0])
                    y2=ypos(segment[1])
                    ax.plot([woffset,woffset],[y1+hoffset,y2+hoffset],linestyle,markersize=msize)
                    ax.text(woffset-1,(y1+y2)/2+hoffset,'%5.1f'%weights[i],horizontalalignment='right',verticalalignment='center',rotation=270)

            #first pass over splices: collect the drawable ones and the widest arc
            maxawidth=0
            visiblesplices=[]
            for i in range(len(splicelist)):
                splice=splicelist[i]
                if outside(splice[0],splice[1]):
                    continue
                if splicewtlist[i]>0:
                    y1=pointplotdict[splice[0]]
                    y2=pointplotdict[splice[1]]
                    visiblesplices.append((splicewtlist[i],y1,y2))
                    awidth=gwidth/gheight*(y2-y1)
                    if awidth>maxawidth:
                        maxawidth=awidth

            #second pass: draw the arcs scaled against the widest one
            for splicewt,y1,y2 in visiblesplices:
                center=[woffset,(y1+y2)/2+hoffset]
                awidth=gwidth*2/gheight*(y2-y1)*(gwidth/maxawidth)
                arc=Arc(xy=center, width=awidth, height=y2-y1, angle=0, theta1=270, theta2=90,lw=2,color='green',ls='dashed')
                ax.add_artist(arc)
                ax.text(woffset+awidth/2+1,(y1+y2)/2+hoffset,splicewt,horizontalalignment='left',verticalalignment='center', rotation=270)

            #plot nodes
            #Todo text for points
            for node,nodey in zip(allpoints,allpointsy):
                if node in startnodelist:
                    ax.plot(woffset,nodey+hoffset,marker='s',markerfacecolor='k',fillstyle='bottom',markersize=msize)
                elif node in endnodelist:
                    ax.plot(woffset,nodey+hoffset,marker='s',markerfacecolor='k',fillstyle='top',markersize=msize)
                elif node in novelnodelist:
                    ax.plot(woffset,nodey+hoffset,marker='o',markerfacecolor='white',markersize=msize)
                else:
                    ax.plot(woffset,nodey+hoffset,marker='o',markerfacecolor='k',fillstyle='full',markersize=msize)
                if node in highlightnodelist:
                    ax.text(woffset-msize,nodey+hoffset,node,horizontalalignment='right',verticalalignment='center',color='red')
                else:
                    ax.text(woffset-msize,nodey+hoffset,node,horizontalalignment='right',verticalalignment='center')

            ax.text(0,max(allpointsy)+hoffset+4*msize,glabel,horizontalalignment='left',verticalalignment='center')
            ax.set_ylim(0, max(allpointsy)+hoffset+100)
            ax.set_xlim(0, gwidth+woffset)

            ax.set_aspect('equal')
            try:
                plt.savefig('%s/%s.%s'%(imagedir,glabel.replace(':','__').replace('/','_'),ext),bbox_inches='tight', pad_inches=0.1)
            except Exception:
                #a failed save is reported as a warning, not raised
                message='Image too large for %s'%glabel
                common.printstatus(message,'W',common.func_name(),1)
        finally:
            #pyplot keeps every figure alive until it is closed explicitly
            plt.close(fig)
Exemple #44
0
 def _checkdir(self, dirname, dirpath, function_name):
     """Return dirpath when it exists; otherwise report '<dirname> does not exist' with status 'F'."""
     if os.path.exists(dirpath):
         return dirpath
     # missing path: hand the message to the common status printer on behalf of function_name
     common.printstatus('%s does not exist' % dirname, 'F', function_name)
Exemple #45
0
    def wgs2problem(self,wgs,readsz=100):
        """
        Turn a weighted graph structure into the linear system A*X ~ B.

        X (nx1) = Variables are exon wts, splice wts and tx start and end wts; each entry is
                  [kind,start,end,wt] with kind 2=exon or intron, 3=splice, 1=tx start, -1=tx end
        B (mx1)=  given wt for each exon,splice*correction weight
               =  0 for each node comparing inflow and outflow
               =  5 * inflow outflow difference for tx start or end
        A (mxn) = set of equations
        input:
            wgs: (exonlist,intronlist,splicelist,startnodelist,endnodelist,novelnodelist,
                  exonwtlist,intronwtlist,splicewtlist)
            readsz: read size handed to self.getcorrectionwt
        output:
            (A,B,X)
        """
        weq=100.0
        X=[]; B=[]; A=[]

        exonlist,intronlist,splicelist,startnodelist,endnodelist,novelnodelist,exonwtlist,intronwtlist,splicewtlist=wgs
        sparsegraphdict,nodedict=wgs2sparsegraphdict(wgs)

        #node type 2 marks a tx start, 3 a tx end
        txstartlist=sorted(nodedict[x][0] for x in nodedict if nodedict[x][1]==2)
        txendlist=sorted(nodedict[x][0] for x in nodedict if nodedict[x][1]==3)

        #sorted(zip()) instead of zip().sort(): zip has no sort() on Python 3
        exonwttuplelist=sorted(zip(exonlist+intronlist,exonwtlist+intronwtlist))
        splicewttuplist=sorted(zip(splicelist,splicewtlist))

        #column index of every edge (type 1 = exon, type 2 = splice) and the reversed adjacency
        exondict={}; splicedict={}; incomingedgedict={}
        for node1 in sorted(sparsegraphdict.keys()):
            for node2 in sorted(sparsegraphdict[node1].keys()):
                edgeinfo=sparsegraphdict[node1][node2]
                if edgeinfo[1]==1:
                    exondict[(node1,node2)]=len(exondict)
                if edgeinfo[1]==2:
                    splicedict[(node1,node2)]=len(splicedict)
                incomingedgedict.setdefault(node2,{})[node1]=edgeinfo[0:]

        numvar=len(exonwttuplelist)+len(splicewttuplist)+len(txstartlist)+len(txendlist)

        #one weighted equation per exon/intron: w*x = w*wt
        for exon,wt in exonwttuplelist:
            X.append([2,exon[0],exon[1],wt])
            txstend=0
            if exon[0] in txstartlist:
                txstend+=1
            if exon[1] in txendlist:
                txstend+=1
            if exon in exonlist:
                edgetype=1
            else:
                edgetype=2
            w=self.getcorrectionwt(exon[1]-exon[0], txstend, wt, readsz,edgetype)
            row=[0]*numvar
            row[len(X)-1]=w
            A.append(row)
            B.append(w*wt)
        #one weighted equation per splice
        for splice,wt in splicewttuplist:
            X.append([3,splice[0],splice[1],wt])
            edgetype=0
            w=self.getcorrectionwt(0, 0, wt, readsz,edgetype)
            row=[0]*numvar
            row[len(X)-1]=w
            A.append(row)
            B.append(w*wt)
        for node in txstartlist:
            X.append([1,node,node,0.0])
        for node in txendlist:
            X.append([-1,node,node,0.0])

        #NOTE(review): the flow equations below take their columns from exondict/splicedict
        #while X is ordered by sorted coordinates; this assumes both orderings and counts agree - confirm
        edgecolumns=len(exondict)+len(splicedict)
        nodelist=sorted(nodedict.keys())
        lastnodeidx=len(nodelist)-1
        for nodeidx,node in enumerate(nodelist):
            nodetype=nodedict[node][1]
            #the first and last node get a flow equation only when they are a tx start or end
            if nodeidx in (0,lastnodeidx) and nodetype not in [2,3]:
                continue
            row=[0]*numvar
            txstflg=0; txendflg=0; inwt=0.0; outwt=0.0
            if nodetype==2:
                idx=edgecolumns+txstartlist.index(nodedict[node][0])
                row[idx]=-1*weq
                txstflg=1
                txidx=idx
            if nodetype==3:
                idx=edgecolumns+len(txstartlist)+txendlist.index(nodedict[node][0])
                row[idx]=weq
                txendflg=1
                txidx=idx

            #outgoing edges enter the flow equation with +weq
            if node in sparsegraphdict:
                for node2 in sparsegraphdict[node]:
                    edge=(node,node2)
                    if edge in exondict:
                        idx=exondict[edge]
                        row[idx]=weq
                        outwt+=sparsegraphdict[node][node2][0]
                    elif edge in splicedict:
                        idx=len(exondict)+splicedict[edge]
                        row[idx]=weq
                        outwt+=sparsegraphdict[node][node2][0]
                    else:
                        message='edge not found %d-%d'%(node,node2)
                        common.printstatus(message,'W',common.func_name(),1)

            #incoming edges enter the flow equation with -weq
            if node in incomingedgedict:
                for node2 in incomingedgedict[node]:
                    edge=(node2,node)
                    if edge in exondict:
                        idx=exondict[edge]
                        row[idx]=-1*weq
                        inwt+=incomingedgedict[node][node2][0]
                    elif edge in splicedict:
                        idx=len(exondict)+splicedict[edge]
                        row[idx]=-1*weq
                        inwt+=incomingedgedict[node][node2][0]
                    else:
                        message='edge not found %d-%d'%(node,node2)
                        common.printstatus(message,'W',common.func_name(),1)
            A.append(row)
            B.append(0)

            #Only internal tx start and end need to be closer to the flow difference
            if txstflg==1:
                txwt=max(0.0,outwt-inwt)
            if txendflg==1:
                txwt=max(0.0,inwt-outwt)
            if txstflg==1 or txendflg==1:
                row=[0]*numvar
                row[txidx]=5
                A.append(row)
                B.append(5*txwt)
        return((A,B,X))
Exemple #46
0
 def _checkfile(self, filename, filepath, function_name):
     """Return filepath when it is an existing file; otherwise report '<filename> does not exist' with status 'F'."""
     if os.path.isfile(filepath):
         return filepath
     # not a regular file: report through the common status printer on behalf of function_name
     common.printstatus('%s does not exist' % filename, 'F', function_name)
Exemple #47
0
    def ACT2Corrected(self,gene,num_iterations=5):
        """
        Iteratively re-fit the edge weights of a gene's graph with a non-negative least-squares solve.

        Each iteration blends the stored graph with the previous result (addwgs, the blend factor
        goes from 1.0 down towards 0), builds the system with self.wgs2problem, minimises
        norm2(A*X-B) subject to X>=0 and writes the solution back as the new weights.

        input:
            gene: key into self.wgsdict
            num_iterations: number of solve rounds, must be at least 1
        output:
            (outwgs,error): corrected graph tuple and the first iteration's residual norm divided
            by the number of variables; (last graph,100.0) when the solver raises
        raises:
            ValueError when num_iterations<1 (no residual would exist to report)

        Next steps: Some way to preserve flows at divergence nodes
        One way could be reallocate flows at all divergence nodes in the original ratio and fix it
        """
        if num_iterations<1:
            raise ValueError('num_iterations must be at least 1')
        inwgs=self.wgsdict[gene]
        outwgs=inwgs
        for iteri in range(num_iterations):
            component1=1.0-iteri*1.0/num_iterations
            wgs=addwgs(inwgs,outwgs,component1)
            A,B,X=self.wgs2problem(wgs)
            Xvar = cvx.variable(len(X),1)
            A=cvx.matrix(A)
            B=cvx.matrix(B)
            B=B.T
            p = cvx.program(cvx.minimize(cvx.norm2(A*Xvar-B)),[cvx.geq(Xvar,0.0)])
            try:
                p.solve(quiet=1)
            except Exception:
                #solver failure: keep the best graph so far and flag it with a large error
                message='Could not solve for %s'%(gene)
                common.printstatus(message,'W',common.func_name(),1)
                return (outwgs,100.0)
            if iteri==0:                          # Get optimal value
                err=cvx.norm2(A*Xvar-B)
            Xval=Xvar.T.value.tolist()[0]
            X_corr= [a[:] for a in X]
            for i in range(len(Xval)):
                #truncate the solved weight to two decimals
                X_corr[i][3]=int(Xval[i]*100)/100.0

            exonlist=[[a[1],a[2]] for a in X_corr if a[0]==2]
            exonwtlist=[a[3] for a in X_corr if a[0]==2]
            splicelist=[[a[1],a[2]] for a in X_corr if a[0]==3]
            splicewtlist=[a[3] for a in X_corr if a[0]==3]

            #drop exon-type entries whose coordinates coincide with a splice
            keepidx=[i for i in range(len(exonlist)) if exonlist[i] not in splicelist]
            exonlist=[exonlist[i] for i in keepidx]
            exonwtlist=[exonwtlist[i] for i in keepidx]

            novelnodelist=wgs[5]

            #split the solved weights back into the exon and intron lists of the input graph
            exonwtlist1=[exonwtlist[i] for i in range(len(exonwtlist)) if exonlist[i] in wgs[0]]
            intronwtlist1=[exonwtlist[i] for i in range(len(exonwtlist)) if exonlist[i] in wgs[1]]
            outwgs=(wgs[0],wgs[1],splicelist,wgs[3],wgs[4],novelnodelist,exonwtlist1,intronwtlist1,splicewtlist)

        return (outwgs,err.value/len(X))
Exemple #48
0
 def __init__(self, config_file):
     """Remember config_file when it is an existing file; otherwise report it missing with status 'F'."""
     if not os.path.isfile(config_file):
         common.printstatus('Config file %s does not exist'%config_file,'F',common.func_name())
     else:
         self.config_file=config_file
Exemple #49
0
def gensimulatedreadsdata(sample_id, replicate_id, folderdict, genetxreaddict,
                          genetxcigardict, config_object):
    """
    Simulate reads for every transcript of one sample replicate and write them as SAM, then BAM.

    input:
        sample_id, replicate_id: integers used in the T%02dS%02d folder and file names
        folderdict: folderdict['data'][2] is the folder holding the per-replicate alignment
                    folders, folderdict['metadata'] receives expression_T..S...txt
        genetxreaddict, genetxcigardict: per gene and transcript read / cigar information
        config_object: supplies read_extract_parameters (items 2 and 3 are the fragment size
                       range), txstartbias, cutpreferencefile, reference_genome_dir,
                       read_extract_method, the two error rates, pathsamtools and chrfaifile
    output:
        1 (always)
    files written:
        alignments.sam / .bam with the exact reads; alignments_with_errors.sam is always created
        but only filled (and converted to .bam) when one of the error rates is above 0
    notes:
        chromosomes without a <reference_genome_dir>/<chr>.fa file are reported and skipped;
        transcripts shorter than the minimum fragment size are skipped
    """
    if debug_flg == 2:
        funcstarttime = time.time()
    fragmentsizerange = [
        int(config_object.read_extract_parameters[2]),
        int(config_object.read_extract_parameters[3])
    ]
    txstartbias = config_object.txstartbias
    # tab separated xmer<TAB>score lines; the file is closed as soon as it is parsed
    cutscoredict = {}
    with open(config_object.cutpreferencefile) as cutfin:
        for ln in cutfin:
            fields = ln.rstrip('\n').split('\t')
            cutscoredict[fields[0]] = float(fields[1])

    sampledir = '%s/T%02dS%02d' % (folderdict['data'][2], sample_id,
                                   replicate_id)
    alignsam = '%s/alignments.sam' % sampledir
    alignbam = '%s/alignments.bam' % sampledir
    alignerrsam = '%s/alignments_with_errors.sam' % sampledir
    alignerrbam = '%s/alignments_with_errors.bam' % sampledir
    samfmt = '%s:%d:%d\t1\t%s\t%d\t0\t%s\t*\t0\t0\t%s\t*\n'
    # evaluated once instead of once per read
    adderrors = max(config_object.readqualitydeteriorationrate,
                    config_object.indelrate) > 0.0

    # both outputs are closed even when a transcript fails midway
    with open(alignsam, 'w') as alignfout, open(alignerrsam, 'w') as alignerrfout:
        txgenereaddict = genetxreads2txgenereads(genetxreaddict,
                                                 genetxcigardict)
        metadatafile = '%s/expression_T%02dS%02d.txt' % (
            folderdict['metadata'], sample_id, replicate_id)
        with open(metadatafile, 'w') as fout:
            for gene in txgenereaddict:
                for tx in txgenereaddict[gene]:
                    fout.write('%s:%s:%s\t%s\tTX\t%d\t%6.4f\t%d\t%d\t%s\n' %
                               tuple(txgenereaddict[gene][tx]))
        chrgenedict = genetxdict2chrgenedict(genetxcigardict)
        # sorted() works on both list and view results of keys()
        chrlist = sorted(chrgenedict.keys())
        if debug_flg == 2:
            tottxstrgettime = 0
            totfraglistgettime = 0
            totreadlistgettime = 0
            toterrorreadgettime = 0
        for chrname in chrlist:
            chrfafile = '%s/%s.fa' % (config_object.reference_genome_dir,
                                      chrname)
            if not os.path.isfile(chrfafile):
                message = '%s: not found' % chrfafile
                common.printstatus(message, 'S', common.func_name())
                continue
            # the chromosome file used to stay open for the rest of the run
            with open(chrfafile) as chrfafileptr:
                for gene in chrgenedict[chrname]:
                    for tx in genetxcigardict[gene]:
                        if debug_flg == 2:
                            loopstarttime = time.time()
                        numreads = txgenereaddict[gene][tx][6]
                        startbase = genetxcigardict[gene][tx][2]
                        cigarstr = genetxcigardict[gene][tx][3]
                        txstr = getRNAtranscriptstring(chrfafileptr,
                                                       startbase, cigarstr)
                        if debug_flg == 2:
                            txstrgettime = time.time()
                            tottxstrgettime += txstrgettime - loopstarttime
                            message = 'Gene:%s, Transcript:%s; Transcript Size:%d, Num reads=%d' % (
                                gene, tx, len(txstr), numreads)
                            common.printstatus(message, 'S',
                                               common.func_name())
                        if len(txstr) < fragmentsizerange[0]:
                            continue
                        #empirical number of cuts
                        numcutmu = max(
                            1,
                            int(
                                len(txstr) * 4.0 /
                                (fragmentsizerange[0] + fragmentsizerange[1])))
                        numcutsig = numcutmu / 2.0
                        fragmentlist = getfragments(
                            txstr, numreads, txgenereaddict[gene][tx][2],
                            cutscoredict, fragmentsizerange, txstartbias,
                            numcutmu, numcutsig)
                        if debug_flg == 2:
                            fraglistgettime = time.time()
                            totfraglistgettime += fraglistgettime - txstrgettime
                        allparameterlist = [
                            txstr, cigarstr, fragmentlist,
                            config_object.read_extract_parameters
                        ]
                        readlist = runfunc(config_object.read_extract_method,
                                           allparameterlist)
                        if debug_flg == 2:
                            readlistgettime = time.time()
                            totreadlistgettime += readlistgettime - fraglistgettime
                        inum = 0
                        for read in readlist:
                            rstartbase, readstr, readcigar = read
                            # genomic offset of the read start: sum of the cigar
                            # segment lengths up to the read start, minus one
                            readstartlocation = sum([
                                int(x) for x in common.cigarsubstr(
                                    cigarstr, 0, rstartbase +
                                    1).replace('M', 'N').split('N')[:-1]
                            ]) - 1
                            inum += 1
                            alignfout.write(
                                samfmt %
                                (tx, inum, rstartbase, chrname,
                                 startbase + readstartlocation, readcigar,
                                 readstr))
                            if adderrors:
                                errreadstr, errcigarstr = adderrortoread(
                                    readstr, readcigar,
                                    config_object.readqualitydeteriorationrate,
                                    config_object.indelrate)
                                alignerrfout.write(
                                    samfmt %
                                    (tx, inum, rstartbase, chrname,
                                     startbase + readstartlocation,
                                     errcigarstr, errreadstr))
                        if debug_flg == 2:
                            errorreadgettime = time.time()
                            toterrorreadgettime += errorreadgettime - readlistgettime
    if debug_flg == 2:
        functime = time.time() - funcstarttime
        othertime = functime - (tottxstrgettime + totfraglistgettime +
                                totreadlistgettime + toterrorreadgettime)
        message='\n\nFunc Time=%s,Txstr=%4.2f,Fragtime=%4.2f,Readtime=%4.2f,Errtime=%4.2f,Oth=%4.2f\n\n'% \
                 (functime,tottxstrgettime*100/functime,totfraglistgettime*100/functime,
                  totreadlistgettime*100/functime,toterrorreadgettime*100/functime,othertime*100/functime)
        common.printstatus(message, 'S', common.func_name())
    # convert the SAM output(s) to BAM with samtools
    cmd = '%s/samtools view -bt %s %s >%s' % (config_object.pathsamtools,
                                              config_object.chrfaifile,
                                              alignsam, alignbam)
    message = 'Running %s' % cmd
    common.printstatus(message, 'S', common.func_name())
    os.system(cmd)
    if adderrors:
        cmd = '%s/samtools view -bt %s %s >%s' % (config_object.pathsamtools,
                                                  config_object.chrfaifile,
                                                  alignerrsam, alignerrbam)
        message = 'Running %s' % cmd
        common.printstatus(message, 'S', common.func_name())
        os.system(cmd)
    return 1
Exemple #50
0
 def _checkdir(self,dirname,dirpath,function_name):
     """Give back dirpath if the path exists, else print '<dirname> does not exist' with status 'F'."""
     found=os.path.exists(dirpath)
     if found:
         return dirpath
     # path is missing: the status is reported under the caller supplied function_name
     message='%s does not exist'%dirname
     common.printstatus(message,'F',function_name)
Exemple #51
0
    if options.proj_range != '':
        cfg.setprojrange(options.proj_range)

    if options.fdm_fast_pair != '':
        cfg.setfdmfastpair(options.fdm_fast_pair)
        if options.analysis_genefile != '':
            cfg.setfdmgenefile(options.analysis_genefile)

    if options.fdm_full_pair != '':
        cfg.setfdmfullpair(options.fdm_full_pair)
        if options.analysis_genefile != '':
            cfg.setfdmgenefile(options.analysis_genefile)

    message = 'Program Started'
    common.printstatus(message, 'S', common.func_name())

    rootdir = cfg.root_dir
    gtffilename = cfg.annotation_file

    gtffile = gtffile.gtfFile(gtffilename, 0, cfg.annotation_file_type)
    gtffile.getgenetranscriptdict()

    geneannodivdict = gtffile.getgeneannodivdict()

    annochrjuncdict = gtffile.getchrjuncdict()

    islanddict = gtffile.getislanddict()
    genelist = islanddict.keys()
    chrlist = gtffile.getchrlist()
    islandlist = gtffile.getislandlist()