Example #1
def filterSeq(ex, fastqFiles, seqToFilter, gid, grp_name, via='lsf'):
    indexSeqToFilter = {}
    indexFiles = {}
    global bcDelimiter
    for k, f in seqToFilter.iteritems():
        if os.path.getsize(f) == 0: continue
        ex.add(f,
               description=set_file_descr(grp_name + "_" +
                                          k.replace(bcDelimiter, "_") +
                                          "_seqToFilter.fa",
                                          groupId=gid,
                                          step="filtering",
                                          type="fa",
                                          view="admin"))
        if k in fastqFiles:
            indexFiles[k] = bowtie_build.nonblocking(ex, f, via=via)

    unalignedFiles = {}
    futures = []
    bwtarg = ["-a", "-q", "-n", "2", "-l", "20", "--un"]
    for k, f in indexFiles.iteritems():
        unalignedFiles[k] = unique_filename_in()
        touch(ex, unalignedFiles[k])
        futures.append(
            bowtie.nonblocking(ex,
                               f.wait(),
                               fastqFiles[k],
                               bwtarg + [unalignedFiles[k]],
                               via='lsf'))

    for f in futures:
        f.wait()
    return unalignedFiles
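Note: `filterSeq` builds a bowtie index for every non-empty contaminant FASTA (non-blocking), aligns the matching FASTQ against it with bowtie's `--un` option so that unaligned reads land in a freshly named file, and returns those files keyed by sample. Below is a minimal invocation sketch, not taken from the project: it assumes a MiniLIMS named "demo_lims", placeholder input paths, and the usual bein imports.

from bein import execution, MiniLIMS   # assumed import locations

M = MiniLIMS("demo_lims")              # hypothetical MiniLIMS repository
with execution(M, description="filter contaminant reads") as ex:
    unaligned = filterSeq(ex,
                          fastqFiles={"sampleA": "sampleA.fastq"},    # placeholder inputs
                          seqToFilter={"sampleA": "contaminants.fa"},
                          gid=1, grp_name="groupA", via="lsf")
    # unaligned["sampleA"] names the FASTQ of reads that did not align to contaminants.fa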
Example #2
 def test_browse_executions(self):
     ex_desc = "browse_ex_test"
     with execution(M, description=ex_desc) as ex:
         touch(ex,"boris")
     ex_found = M.browse_executions(with_description=ex_desc)
     #self.assertIs(ex.id,ex_found)
     M.delete_execution(ex.id)
Example #3
File: test.py Project: bbcf/bein
 def test_browse_executions(self):
     ex_desc = "browse_ex_test"
     with execution(M, description=ex_desc) as ex:
         touch(ex, "boris")
     ex_found = M.browse_executions(with_description=ex_desc)
     # self.assertIs(ex.id,ex_found)
     M.delete_execution(ex.id)
Example #4
 def test_unique_filename_exact_match(self):
     with execution(None) as ex:
         st = random.getstate()
         f = touch(ex)
         random.setstate(st)
         g = touch(ex)
         self.assertNotEqual(f, g)
Example #5
 def test_unique_filename_exact_match(self):
     with execution(None) as ex:
         st = random.getstate()
         f = touch(ex)
         random.setstate(st)
         g = touch(ex)
         self.assertNotEqual(f, g)
Example #6
 def test_unique_filename_beginnings_match(self):
     with execution(None) as ex:
         st = random.getstate()
         f = unique_filename_in()
         touch(ex, f + 'abcdefg')
         random.setstate(st)
         g = touch(ex)
         self.assertNotEqual(f, g)
Example #7
 def test_unique_filename_beginnings_match(self):
     with execution(None) as ex:
         st = random.getstate()
         f = unique_filename_in()
         touch(ex, f + 'abcdefg')
         random.setstate(st)
         g = touch(ex)
         self.assertNotEqual(f, g)
Example #8
 def test_resolve_alias_with_alias(self):
     with execution(None) as ex:
         f = touch(ex)
         M = MiniLIMS("boris")
         a = M.import_file(f)
         M.add_alias(a, 'hilda')
         self.assertEqual(M.resolve_alias('hilda'), a)
Example #9
 def test_resolve_alias_with_alias(self):
     with execution(None) as ex:
         f = touch(ex)
         M = MiniLIMS("boris")
         a = M.import_file(f)
         M.add_alias(a, 'hilda')
         self.assertEqual(M.resolve_alias('hilda'), a)
Example #10
 def test_associate_with_id(self):
     try:
         fid = M.import_file('test.py')
         with execution(M) as ex:
             touch(ex, "hilda")
             ex.add("hilda", associate_to_id=fid, template="%s.meep")
         hilda_id = M.search_files(source=('execution',ex.id))[0]
         hilda_name = M.fetch_file(hilda_id)['repository_name']
         fid_name = M.fetch_file(fid)['repository_name']
         self.assertEqual("%s.meep" % fid_name, hilda_name)
     finally:
         try:
             M.delete_execution(ex.id)
             M.delete_file(fid)
         except:
             pass
Example #11
 def test_associate_with_id(self):
     try:
         fid = M.import_file('test.py')
         with execution(M) as ex:
             touch(ex, "hilda")
             ex.add("hilda", associate_to_id=fid, template="%s.meep")
         hilda_id = M.search_files(source=('execution',ex.id))[0]
         hilda_name = M.fetch_file(hilda_id)['repository_name']
         fid_name = M.fetch_file(fid)['repository_name']
         self.assertEqual("%s.meep" % fid_name, hilda_name)
     finally:
         try:
             M.delete_execution(ex.id)
             M.delete_file(fid)
         except:
             pass
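The assertion above pins down the naming rule for associated files: the repository name of the associated file is the template applied to the repository name of the file it is associated with. A tiny illustration with hypothetical values:

fid_name = "file_4f3a"              # hypothetical repository name of the imported parent file
template = "%s.meep"
hilda_name = template % fid_name    # -> "file_4f3a.meep", which is exactly what the test asserts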
Example #12
def save_wellington( ex, wellout, chrmeta ):
    bedlist = {}
    for name, wlist in wellout.iteritems():
        wellall = unique_filename_in()
#### Dummy file
        touch( ex, wellall )
        ex.add(wellall,
               description=set_file_descr(name[1]+'_wellington_files', type='none', view='admin',
                                          step='footprints', groupId=name[0]))
#### BED at FDR 1%
        bedlist[name[0]] = wellall+"FDR01.bed.gz"
        bedzip = gzip.open(bedlist[name[0]],'wb')
        bedzip.write("track name='"+name[1]+"_WellingtonFootprints_FDR_0.01'\n")
        for x in wlist:
            with open(os.path.join(*x)+".WellingtonFootprints.FDR.0.01.bed") as _bed:
                [bedzip.write(l) for l in _bed]
        bedzip.close()
        ex.add(wellall+"FDR01.bed.gz",
               description=set_file_descr(name[1]+'_WellingtonFootprintsFDR01.bed.gz',
                                          type='bed', ucsc='1', step='footprints', groupId=name[0]),
               associate_to_filename=wellall, template='%s_WellingtonFootprintsFDR01.bed.gz')
#### BED at p-values [...]
        bedzip = gzip.open(wellall+"PvalCutoffs.bed.gz",'wb')
        for bfile in os.listdir(os.path.join(wlist[0][0],"p_value_cutoffs")):
            cut = os.path.splitext(bfile[:-4])[1][1:] #between . ([1:]) and .bed ([:-4])
            bedzip.write("track name='"+name[1]+"_WellingtonFootprints_Pval_%s'\n" %cut)
            for wdir,wpref in wlist:
                _bedpath = os.path.join(wdir,"p_value_cutoffs",wpref+".WellingtonFootprints."+cut+".bed")
                with open(_bedpath) as _bed:
                    [bedzip.write(l) for l in _bed]
        bedzip.close()
        ex.add(wellall+"PvalCutoffs.bed.gz",
               description=set_file_descr(name[1]+'_WellingtonFootprintsPvalCutoffs.bed.gz',
                                          type='bed', ucsc='1', step='footprints', groupId=name[0]),
               associate_to_filename=wellall, template='%s_WellingtonFootprintsPvalCutoffs.bed.gz')
#### WIG
        cat([os.path.join(*x)+".WellingtonFootprints.wig" for x in wlist], wellall+".wig")
        #convert(wellall+".wig", wellall+".bw", chrmeta=chrmeta)
        #ex.add(wellall+".bw",
        #       description=set_file_descr(name[1]+'_WellingtonFootprints.bw',
        #                                  type='bigWig', ucsc='1', step='footprints', groupId=name[0]),
        #       associate_to_filename=wellall, template='%s_WellingtonFootprints.bw')
        ex.add(wellall+".wig",
               description=set_file_descr(name[1]+'_WellingtonFootprints.wig',
                                          type='wig', ucsc='1', step='footprints', groupId=name[0]),
               associate_to_filename=wellall, template='%s_WellingtonFootprints.wig')
    return bedlist
Example #13
 def test_associate_with_names(self):
     try:
         with execution(M) as ex:
             touch(ex, "boris")
             touch(ex, "hilda")
             ex.add("boris")
             ex.add("hilda", associate_to_filename="boris", template="%s.meep")
         boris_id = M.search_files(source=('execution',ex.id), with_text="boris")[0]
         hilda_id = M.search_files(source=('execution',ex.id), with_text="hilda")[0]
         boris_name = M.fetch_file(boris_id)['repository_name']
         hilda_name = M.fetch_file(hilda_id)['repository_name']
         self.assertEqual("%s.meep" % boris_name, hilda_name)
     finally:
         try:
             M.delete_execution(ex.id)
         except:
             pass
Example #14
 def test_associate_with_names(self):
     try:
         with execution(M) as ex:
             touch(ex, "boris")
             touch(ex, "hilda")
             ex.add("boris")
             ex.add("hilda", associate_to_filename="boris", template="%s.meep")
         boris_id = M.search_files(source=('execution',ex.id), with_text="boris")[0]
         hilda_id = M.search_files(source=('execution',ex.id), with_text="hilda")[0]
         boris_name = M.fetch_file(boris_id)['repository_name']
         hilda_name = M.fetch_file(hilda_id)['repository_name']
         self.assertEqual("%s.meep" % boris_name, hilda_name)
     finally:
         try:
             M.delete_execution(ex.id)
         except:
             pass
Example #15
 def test_path_to_file_on_execution(self):
     with execution(None) as ignoreme:
         f = touch(ignoreme)
         M = MiniLIMS("boris")
         fid = M.import_file(f)
         mpath = M.path_to_file(fid)
         with execution(M) as ex:
             fpath = ex.path_to_file(fid)
     self.assertEqual(mpath, fpath)
Example #16
File: test.py Project: bbcf/bein
 def test_path_to_file_on_execution(self):
     with execution(None) as ignoreme:
         f = touch(ignoreme)
         M = MiniLIMS("boris")
         fid = M.import_file(f)
         mpath = M.path_to_file(fid)
         with execution(M) as ex:
             fpath = ex.path_to_file(fid)
     self.assertEqual(mpath, fpath)
Example #17
def createLibrary(ex, assembly_or_fasta, params, url=GlobalHtsUrl, via='local'):
    """
    Main call to create the library
    """
    if len(params['primary'])<2:
        print('Some parameters are missing, cannot create the library')
        print('primary='+params['primary']+" ; "+'secondary='+params['secondary'])
        return [None,None,None,None]

    if not isinstance(assembly_or_fasta,genrep.Assembly):
        assembly_or_fasta = genrep.Assembly( ex=ex, fasta=assembly_or_fasta )
    chrnames = assembly_or_fasta.chrnames
    chrom_map = dict((v['ac'],k) for k,v in assembly_or_fasta.chrmeta.iteritems())
    allfiles = assembly_or_fasta.fasta_by_chrom  #assembly_or_fasta.untar_genome_fasta()

    libfiles = dict((c, getRestEnzymeOccAndSeq.nonblocking( ex, f,
                                                            params['primary'], params['secondary'],
                                                            params['length'],  params['type'],
                                                            via=via ))
                    for c, f in allfiles.iteritems())
    resfile = unique_filename_in()
    os.mkdir(resfile)
    bedfiles = {}
    for chrom, future in libfiles.iteritems():
        libfiles[chrom] = future.wait()
        if not os.path.getsize(libfiles[chrom][1])>0:
            time.sleep(60)
            touch(ex,libfiles[chrom][1])
        bedfiles[chrom] = parse_fragFile(libfiles[chrom][1],chrom_map)
    rescov = coverageInRepeats(ex, bedfiles, params['species'], outdir=resfile, via=via)
    bedchrom = [os.path.join(resfile,chrom+".bed") for chrom in chrnames]
    cat(bedchrom,out=resfile+".bed")
    gzipfile(ex,[resfile+".bed"]+bedchrom)
#    resfile_sql = resfile+".sql"
#    track.convert((resfile,'bed'),(resfile_sql,'sql'),assembly=params['species'])
    enz_list = []
    infos_lib = { 'assembly_name':  params['species'],
                  'enzyme1_id':     getEnzymeSeqId(params['primary'], True, enz_list, url),
                  'enzyme2_id':     getEnzymeSeqId(params['secondary'], True, enz_list, url),
                  'segment_length': params['length'],
                  'type':           params['type'],
                  'filename':       resfile }
    return [ libfiles, bedfiles, resfile, infos_lib ]
Example #18
 def test_path_to_file_on_execution(self):
     with execution(None) as ignoreme:
         f = touch(ignoreme)
         M = MiniLIMS("boris")
         fid = M.import_file(f)
         mpath = M.path_to_file(fid)
         with execution(M) as ex:
             fpath = ex.path_to_file(fid)
         used_files = M.fetch_execution(ex.id)['used_files']
     self.assertEqual(used_files, [fid])
     self.assertEqual(mpath, fpath)
Example #19
 def test_path_to_file_on_execution(self):
     with execution(None) as ignoreme:
         f = touch(ignoreme)
         M = MiniLIMS("boris")
         fid = M.import_file(f)
         mpath = M.path_to_file(fid)
         with execution(M) as ex:
             fpath = ex.path_to_file(fid)
         used_files = M.fetch_execution(ex.id)['used_files']
     self.assertEqual(used_files, [fid])
     self.assertEqual(mpath, fpath)
Example #20
    def test_immutability_dropped(self):
        executions = []
        with execution(M) as ex:
            touch(ex, "boris")
            ex.add("boris")

        exid1 = ex.id
        borisid = M.search_files(source=('execution',ex.id))[0]
        self.assertFalse(M.fetch_file(borisid)['immutable'])

        with execution(M) as ex:
            ex.use(borisid)

        exid2 = ex.id
        self.assertTrue(M.fetch_file(borisid)['immutable'])

        M.delete_execution(exid2)
        self.assertFalse(M.fetch_file(borisid)['immutable'])

        M.delete_execution(exid1)
        self.assertEqual(M.search_files(source=('execution',exid1)), [])
Example #21
    def test_immutability_dropped(self):
        executions = []
        with execution(M) as ex:
            touch(ex, "boris")
            ex.add("boris")

        exid1 = ex.id
        borisid = M.search_files(source=('execution',ex.id))[0]
        self.assertFalse(M.fetch_file(borisid)['immutable'])
    
        with execution(M) as ex:
            ex.use(borisid)

        exid2 = ex.id
        self.assertTrue(M.fetch_file(borisid)['immutable'])

        M.delete_execution(exid2)
        self.assertFalse(M.fetch_file(borisid)['immutable'])

        M.delete_execution(exid1)
        self.assertEqual(M.search_files(source=('execution',exid1)), [])
Example #22
 def test_hierarchical_association(self):
     try:
         with execution(M) as ex:
             touch(ex, "a")
             touch(ex, "b")
             touch(ex, "c")
             ex.add("a")
             ex.add("b", associate_to_filename="a", template="%s.step")
             ex.add("c", associate_to_filename="b", template="%s.step")
         a_id = M.search_files(source=('execution', ex.id),
                               with_text='a')[0]
         b_id = M.search_files(source=('execution', ex.id),
                               with_text='b')[0]
         c_id = M.search_files(source=('execution', ex.id),
                               with_text='c')[0]
         a_name = M.fetch_file(a_id)['repository_name']
         b_name = M.fetch_file(b_id)['repository_name']
         c_name = M.fetch_file(c_id)['repository_name']
         self.assertEqual("%s.step" % a_name, b_name)
         self.assertEqual("%s.step.step" % a_name, c_name)
     finally:
         try:
             M.delete_execution(ex.id)
         except:
             pass
Example #23
def parse_meme_xml( ex, meme_file, chrmeta ):
    """ Parse meme xml file and convert to track """
    from xml.etree import ElementTree as ET
    touch(ex,meme_file)
    tree = ET.parse(meme_file)
    ncol = {}
    allmatrices = {}
    for motif in tree.find('motifs').findall('motif'):
        mid = motif.attrib['id']
        ncol[mid] = 0
        allmatrices[mid] = unique_filename_in()
        with open(allmatrices[mid],'w') as mat_out:
            for parray in motif.find('probabilities')[0].findall('alphabet_array'):
                ncol[mid] += 1
                m = {'letter_A':0,'letter_C':0,'letter_G':0,'letter_T':0}
                for col in parray:
                    m[col.attrib['letter_id']] = float(col.text)
                mat_out.write("1\t%f\t%f\t%f\t%f\n" %(m['letter_A'],m['letter_C'],m['letter_G'],m['letter_T']))
    def _xmltree(_t):#(_c,_t):
        seq_name = {}
        seq_chr = None
        for it in _t.getiterator():
            if it.tag == 'sequence':
                seq_name[it.attrib['id']] = it.attrib['name']
            if it.tag == 'scanned_sites':
                name = seq_name[it.attrib['sequence_id']]
                name,seq_chr,start,end = re.search(r'(.*)\|(.+):(\d+)-(\d+)',name).groups()
            if it.tag == 'scanned_site':# and _c == seq_chr:
                start = int(start)+int(it.attrib['position'])-1
                end = start+ncol[it.attrib['motif_id']]
                strnd = it.attrib['strand'] == 'plus' and 1 or -1
                score = it.attrib['pvalue']
                yield (seq_chr,str(start),str(end),it.attrib['motif_id'],score,strnd)
    outsql = unique_filename_in()+".sql"
    outtrack = track(outsql, chrmeta=chrmeta, info={'datatype':'qualitative'},
                     fields=['start','end','name','score','strand'])
    outtrack.write(FeatureStream(_xmltree(tree),fields=['chr']+outtrack.fields))
    outtrack.close()
    return {'sql':outsql,'matrices':allmatrices}
Example #24
def filterSeq(ex,fastqFiles,seqToFilter,gid,grp_name,via='lsf'):
    indexSeqToFilter = {}
    indexFiles = {}
    global bcDelimiter
    for k,f in seqToFilter.iteritems():
        if os.path.getsize(f) == 0: continue
        ex.add(f,description=set_file_descr(grp_name+"_"+k.replace(bcDelimiter,"_")+"_seqToFilter.fa",
                                            groupId=gid,step="filtering",
                                            type="fa",view="admin"))
        if k in fastqFiles:
            indexFiles[k] = bowtie_build.nonblocking(ex,f,via=via)

    unalignedFiles = {}
    futures = []
    bwtarg = ["-a","-q","-n","2","-l","20","--un"]
    for k,f in indexFiles.iteritems():
        unalignedFiles[k] = unique_filename_in()
        touch(ex,unalignedFiles[k])
        futures.append(bowtie.nonblocking( ex, f.wait(), fastqFiles[k],
                                           bwtarg+[unalignedFiles[k]], via='lsf'))

    for f in futures: f.wait()
    return unalignedFiles
Example #25
def filterSeq(ex,fastqFiles,seqToFilter,gid,grp_name,via='lsf'):
#seqToFilter=`awk -v primer=${curPrimer} '{if($0 ~ /^>/){n=split($0,a,"|");curPrimer=a[1];gsub(">","",curPrimer); if(curPrimer == primer){seqToFilter="";for(i=5;i<n;i++){if(a[i] !~ /Exclude=/){seqToFilter=seqToFilter""a[i]","}} if(a[n] !~ /Exclude=/){seqToFilter=seqToFilter""a[n]}else{gsub(/,$/,"",seqToFilter)};print seqToFilter}}}' ${primersFile}`
    indexSeqToFilter = {}
    indexFiles = {}
    for k,f in seqToFilter.iteritems():
        if os.path.getsize(f) == 0: continue
        ex.add(f,description=set_file_descr(grp_name+"_"+k+"_seqToFilter.fa",
                                            groupId=gid,step="filtering",
                                            type="fa",view="admin"))
        if k in fastqFiles:
            indexFiles[k] = bowtie_build.nonblocking(ex,f,via=via)

    unalignedFiles = {}
    futures = []
    bwtarg = ["-a","-q","-n","2","-l","20","--un"]
    for k,f in indexFiles.iteritems():
        unalignedFiles[k] = unique_filename_in()
        touch(ex,unalignedFiles[k])
        futures.append(bowtie.nonblocking( ex, f.wait(), fastqFiles[k],
                                           bwtarg+[unalignedFiles[k]], via='lsf'))

    for f in futures: f.wait()
    return unalignedFiles
Example #26
 def test_hierarchical_association(self):
     try:
         with execution(M) as ex:
             touch(ex, "a")
             touch(ex, "b")
             touch(ex, "c")
             ex.add("a")
             ex.add("b", associate_to_filename="a", template="%s.step")
             ex.add("c", associate_to_filename="b", template="%s.step")
         a_id = M.search_files(source=('execution',ex.id), with_text='a')[0]
         b_id = M.search_files(source=('execution',ex.id), with_text='b')[0]
         c_id = M.search_files(source=('execution',ex.id), with_text='c')[0]
         a_name = M.fetch_file(a_id)['repository_name']
         b_name = M.fetch_file(b_id)['repository_name']
         c_name = M.fetch_file(c_id)['repository_name']
         self.assertEqual("%s.step" % a_name, b_name)
         self.assertEqual("%s.step.step" % a_name, c_name)
     finally:
         try:
             M.delete_execution(ex.id)
         except:
             pass
Example #27
def add_file(ex):
    touch(ex, "boris")
    ex.add("boris", description="test")
Example #28
def add_file(ex):
    touch(ex, "boris")
    ex.add("boris", description="test")
Example #29
 def test_resolve_alias_returns_int_if_exists(self):
     with execution(None) as ex:
         f = touch(ex)
         M = MiniLIMS("boris")
         a = M.import_file(f)
         self.assertEqual(M.resolve_alias(a), a)
Example #30
def density_to_countsPerFrag( ex, file_dict, groups, assembly, regToExclude, script_path, via='lsf' ):
    '''
    Main function to compute normalised counts per fragment from a density file.
    '''
    futures = {}
    results = {}
    for gid, group in groups.iteritems():
        reffile = file_dict['lib'][gid]
        futures[gid] = {}
        results[gid] = {}
        for rid,run in group['runs'].iteritems():
            density_file = file_dict['4cseq']['density_files'][gid][rid]
            gm_futures = []
            for ch in assembly.chrnames:
                chref = os.path.join(reffile,ch+".bed.gz")
                if not(os.path.exists(chref)): chref = reffile
    #            features = track(chref,'bed')
    #            outbed.write(gMiner.stream.mean_score_by_feature(
    #                    scores.read(selection=ch),
    #                    features.read(selection=ch)), mode='append')
                bedfile = unique_filename_in()+".bed"
                gfminer_job = {"operation": "score_by_feature",
                               "output": bedfile,
                               "datatype": "qualitative",
                               "args": "'"+json.dumps({"trackScores":density_file,
                                                       "trackFeatures":chref,
                                                       "chromosome":ch})+"'"}
                gm_futures.append((gfminer_run.nonblocking(ex,gfminer_job,via=via),
                                   bedfile))
            outsql = unique_filename_in()+".sql"
            sqlouttr = track( outsql, chrmeta=assembly.chrmeta,
                              info={'datatype':'quantitative'},
                              fields=['start', 'end', 'score'] )
            outbed_all = []
            for n,f in enumerate(gm_futures):
                f[0].wait()
                fout = f[1]
                if not(os.path.exists(fout)):
                    time.sleep(60)
                    touch(ex,fout)
                outbed_all.append(fout)
                outbed = track(fout, chrmeta=assembly.chrmeta)
                sqlouttr.write( outbed.read(fields=['start', 'end', 'score'],
                                            selection={'score':(0.01,sys.maxint)}),
                                chrom=assembly.chrnames[n] )
            sqlouttr.close()
            countsPerFragFile = unique_filename_in()+".bed"
            countsPerFragFile = cat(outbed_all,out=countsPerFragFile)
            results[gid][rid] = [ countsPerFragFile, outsql ]
            FragFile = unique_filename_in()
            touch(ex,FragFile)
            futures[gid][rid] = (FragFile,
                            segToFrag.nonblocking( ex, countsPerFragFile, regToExclude[gid],
                                                   script_path, via=via, stdout=FragFile ,
                                                   memory=4 ))
    def _parse_select_frag(stream):
        for s in stream:
            sr = s.strip().split('\t')
            if 'IsValid' in sr[2] and not any([w in sr[8] for w in ['_and_','BothRepeats','notValid']]):
                patt = re.search(r'([^:]+):(\d+)-(\d+)',sr[1])
                if patt:
                    coord = patt.groups()
#                    if float(sr[11])>0.0:
                    yield (coord[0], int(coord[1])-1, int(coord[2]), float(sr[11]))

    for gid, dict_gid in futures.iteritems():
        for rid, res in dict_gid.iteritems():
            res[1].wait()
            touch(ex,res[0])
            segOut = open(res[0],"r")
            resBedGraph = unique_filename_in()+".sql"
            sqlTr = track( resBedGraph, fields=['start','end','score'],
                           info={'datatype':'quantitative'}, chrmeta=assembly.chrmeta )
            sqlTr.write(_parse_select_frag(segOut),fields=['chr','start','end','score'])
            sqlTr.close()
            segOut.close()
            results[gid][rid].extend([res[0],resBedGraph])
    return results #[countsPerFrag_allBed, countsPerFrag_selectSql, segToFrag_out, segToFrag_sql]
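For reference, the segToFrag lines accepted by `_parse_select_frag` above are tab-separated records in which the 3rd field contains 'IsValid', the 9th field carries none of the exclusion flags, the 2nd field holds the fragment coordinates, and the 12th field holds the score. The record below is hypothetical and built only to satisfy those checks:

fields = [""] * 12
fields[0] = "frag_0001"            # fragment identifier (placeholder)
fields[1] = "chr1:1000-1500"       # coordinates, parsed by the regex in _parse_select_frag
fields[2] = "IsValid"              # must contain 'IsValid'
fields[8] = "uniqueFrag"           # must not contain '_and_', 'BothRepeats' or 'notValid'
fields[11] = "3.5"                 # score, converted to float
line = "\t".join(fields)
# For this line the generator would yield ('chr1', 999, 1500, 3.5).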
Example #31
 def test_syntaxerror_outside_execution(self):
     with execution(M) as ex:
         pass
     M.delete_execution(ex.id)
     with self.assertRaises(SyntaxError):
         touch(ex)
Example #32
def save_wellington(ex, wellout, chrmeta):
    bedlist = {}
    for name, wlist in wellout.iteritems():
        wellall = unique_filename_in()
        #### Dummy file
        touch(ex, wellall)
        ex.add(wellall,
               description=set_file_descr(name[1] + '_wellington_files',
                                          type='none',
                                          view='admin',
                                          step='footprints',
                                          groupId=name[0]))
        #### BED at FDR 1%
        bedlist[name[0]] = wellall + "FDR01.bed.gz"
        bedzip = gzip.open(bedlist[name[0]], 'wb')
        bedzip.write("track name='" + name[1] +
                     "_WellingtonFootprints_FDR_0.01'\n")
        for x in wlist:
            with open(os.path.join(*x) +
                      ".WellingtonFootprints.FDR.0.01.bed") as _bed:
                [bedzip.write(l) for l in _bed]
        bedzip.close()
        ex.add(wellall + "FDR01.bed.gz",
               description=set_file_descr(name[1] +
                                          '_WellingtonFootprintsFDR01.bed.gz',
                                          type='bed',
                                          ucsc='1',
                                          step='footprints',
                                          groupId=name[0]),
               associate_to_filename=wellall,
               template='%s_WellingtonFootprintsFDR01.bed.gz')
        #### BED at p-values [...]
        bedzip = gzip.open(wellall + "PvalCutoffs.bed.gz", 'wb')
        for bfile in os.listdir(os.path.join(wlist[0][0], "p_value_cutoffs")):
            cut = os.path.splitext(
                bfile[:-4])[1][1:]  #between . ([1:]) and .bed ([:-4])
            bedzip.write("track name='" + name[1] +
                         "_WellingtonFootprints_Pval_%s'\n" % cut)
            for wdir, wpref in wlist:
                _bedpath = os.path.join(
                    wdir, "p_value_cutoffs",
                    wpref + ".WellingtonFootprints." + cut + ".bed")
                with open(_bedpath) as _bed:
                    [bedzip.write(l) for l in _bed]
        bedzip.close()
        ex.add(wellall + "PvalCutoffs.bed.gz",
               description=set_file_descr(
                   name[1] + '_WellingtonFootprintsPvalCutoffs.bed.gz',
                   type='bed',
                   ucsc='1',
                   step='footprints',
                   groupId=name[0]),
               associate_to_filename=wellall,
               template='%s_WellingtonFootprintsPvalCutoffs.bed.gz')
        #### WIG
        cat([os.path.join(*x) + ".WellingtonFootprints.wig" for x in wlist],
            wellall + ".wig")
        #convert(wellall+".wig", wellall+".bw", chrmeta=chrmeta)
        #ex.add(wellall+".bw",
        #       description=set_file_descr(name[1]+'_WellingtonFootprints.bw',
        #                                  type='bigWig', ucsc='1', step='footprints', groupId=name[0]),
        #       associate_to_filename=wellall, template='%s_WellingtonFootprints.bw')
        ex.add(wellall + ".wig",
               description=set_file_descr(name[1] +
                                          '_WellingtonFootprints.wig',
                                          type='wig',
                                          ucsc='1',
                                          step='footprints',
                                          groupId=name[0]),
               associate_to_filename=wellall,
               template='%s_WellingtonFootprints.wig')
    return bedlist
Example #33
def dnaseseq_workflow(ex, job, assembly, logfile=sys.stdout, via='lsf'):
    """
    This workflow performs the following steps:

      * BAM files from replicates within the same group are merged
      * MACS is called to identify enriched regions (only the peak summit +/- 300 will be used); this can be bypassed by providing a bed file for any group
      * Wellington is called to identify footprints within these enriched regions
      * If a list of motifs is provided (by group), footprints are scanned and motif occurrences (log-likelihood ratio > 0) are recorded in a bed file
      * Average DNase profiles around motifs are plotted

    """
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    supdir = os.path.split(ex.remote_working_directory)[0]
    for gid, mapped in job.files.iteritems():
        group_name = job.groups[gid]['name']
        if not isinstance(mapped, dict):
            raise TypeError(
                "Files values must be dictionaries with keys *run_ids* or 'bam'."
            )
        if 'bam' in mapped: mapped = {'_': mapped}
        if len(mapped) > 1:
            bamfile = merge_bam(ex, [m['bam'] for m in mapped.values()])
            index = index_bam(ex, bamfile)
        else:
            bamfile = mapped.values()[0]['bam']
        if job.groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid, group_name))
        else:
            if os.path.exists(job.groups[gid].get('bedfile', 'null')):
                bedfile = job.groups[gid]['bedfile']
            elif os.path.exists(
                    os.path.join(supdir,
                                 job.groups[gid].get('bedfile', 'null'))):
                bedfile = os.path.join(supdir, job.groups[gid]['bedfile'])
            else:
                bedfile = None
            tests.append((bedfile, bamfile))
            names['tests'].append((gid, group_name))
    if len(controls) < 1:
        controls = [None]
        names['controls'] = [(0, None)]
    tests = macs_bedfiles(ex, assembly.chrmeta, tests, controls, names,
                          job.options.get('macs_args', ["--keep-dup", "10"]),
                          via, logfile)
    bedlist = run_wellington(ex, tests, names, assembly, via, logfile)
    ######################### Motif scanning / plotting
    if any([
            gr.get('motif') != 'null' and gr.get('motif')
            for gr in job.groups.values()
    ]):
        motifbeds = motif_scan(ex, bedlist, assembly, job.groups, via, logfile)
        siglist = dict((gid[0], []) for gid in names['tests'])
        for gid, mapped in job.files.iteritems():
            wig = []
            suffixes = ["fwd", "rev"]
            merge_strands = int(job.options.get('merge_strands', -1))
            read_extension = int(job.options.get('read_extension') or -1)
            make_wigs = merge_strands >= 0 or read_extension != 1
            for m in mapped.values():
                if make_wigs or not ('wig' in m) or len(m['wig']) < 2:
                    output = mapseq.parallel_density_sql(
                        ex,
                        m["bam"],
                        assembly.chrmeta,
                        nreads=m["stats"]["total"],
                        merge=-1,
                        read_extension=1,
                        convert=False,
                        b2w_args=[],
                        via=via)
                    wig.append(dict(
                        (s, output + s + '.sql') for s in suffixes))
                else:
                    wig.append(m['wig'])
            if len(wig) > 1:
                wig[0] = dict((s, merge_sql(ex, [x[s] for x in wig], via=via))
                              for s in suffixes)
            _trn = job.groups[gid]['name'] + "_%s"
            if job.groups[gid]['control']:
                for s, w in wig[0].iteritems():
                    for _g in siglist.keys():
                        siglist[_g].append(track(w, info={'name': _trn % s}))
            else:
                siglist[gid].extend([
                    track(w, info={'name': _trn % s})
                    for s, w in wig[0].iteritems()
                ])
        plot_files = plot_footprint_profile(ex, motifbeds, siglist,
                                            assembly.chrnames, job.groups,
                                            logfile)
        for gid, flist in plot_files.iteritems():
            gname = job.groups[gid]['name']
            plotall = unique_filename_in()
            touch(ex, plotall)
            ex.add(plotall,
                   description=set_file_descr(gname + '_footprints_plots',
                                              type='none',
                                              view='admin',
                                              step='motifs',
                                              groupId=gid))
            ex.add(flist['pdf'],
                   description=set_file_descr(gname + '_footprints_plots.pdf',
                                              type='pdf',
                                              step='motifs',
                                              groupId=gid),
                   associate_to_filename=plotall,
                   template='%s.pdf')
            tarname = unique_filename_in()
            tarfh = tarfile.open(tarname, "w:gz")
            for mname, matf in flist['mat']:
                tarfh.add(matf, arcname="%s_%s.txt" % (gname, mname))
            tarfh.close()
            ex.add(tarname,
                   description=set_file_descr(gname +
                                              '_footprints_plots.tar.gz',
                                              type='tar',
                                              step='motifs',
                                              groupId=gid),
                   associate_to_filename=plotall,
                   template='%s.tar.gz')
    logfile.write("\nDone.\n ")
    logfile.flush()
    return 0
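One detail worth isolating from the loop over `job.files` above: each value may be either a dictionary of per-run dictionaries or a single mapping that already carries a 'bam' key, and in the latter case it is wrapped into a one-run dictionary before use. A small sketch with hypothetical values:

mapped = {'bam': 'sample.bam', 'stats': {'total': 1000}}   # single-run form (hypothetical values)
if not isinstance(mapped, dict):
    raise TypeError("Files values must be dictionaries with keys *run_ids* or 'bam'.")
if 'bam' in mapped:
    mapped = {'_': mapped}              # wrapped so that mapped.values() yields run dictionaries
bamfile = mapped.values()[0]['bam']     # 'sample.bam', matching the single-file branch above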
Example #34
 def test_syntaxerror_outside_execution(self):
     with execution(M) as ex:
         pass
     M.delete_execution(ex.id)
     with self.assertRaises(SyntaxError):
         touch(ex)
Example #35
def add_macs_results( ex, read_length, genome_size, bamfile,
                      ctrlbam=None, name=None, poisson_threshold=None,
                      alias=None, macs_args=None, via='lsf' ):
    """Calls the ``macs`` function on each possible pair
    of test and control bam files and adds
    the respective outputs to the execution repository.

    ``macs`` options can be controlled with `macs_args`.
    If a dictionary of Poisson thresholds for each sample is given, the enrichment bounds ('-m' option)
    are computed from them; otherwise the default is '-m 10,100'.

    Returns the set of file prefixes.
    """
    if not(isinstance(bamfile,list)):
        bamfile = [bamfile]
    if not(isinstance(ctrlbam,list)):
        ctrlbam = [ctrlbam]
    if poisson_threshold is None:
        poisson_threshold = {}
    if macs_args is None:
        macs_args = []
    futures = {}
    rl = read_length
    for i,bam in enumerate(bamfile):
        n = name['tests'][i]
        if poisson_threshold.get(n)>0:
            low = (poisson_threshold.get(n)+1)*5
            enrich_bounds = str(min(30,low))+","+str(10*low)
        else:
            enrich_bounds = "10,100"
        if not("-m" in macs_args): macs_args += ["-m",enrich_bounds]
        if isinstance(read_length,list): rl = read_length[i]
        for j,cam in enumerate(ctrlbam):
            m = name['controls'][j]
            nm = (n,m)
            futures[nm] = macs.nonblocking( ex, rl, genome_size, bam, cam,
                                            args=macs_args, via=via, memory=12 )
    prefixes = {}
    for n,f in futures.iteritems():
        p = f.wait()
        prefixes[n] = p
        macs_descr0 = {'step':'macs','type':'none','view':'admin','groupId':n[0][0]}
        macs_descr1 = {'step':'macs','type':'xls','groupId':n[0][0]}
        macs_descr2 = {'step':'macs','type':'bed','groupId':n[0][0],'ucsc':'1'}
        filename = "_vs_".join([x[1] for x in n if x[0]])
        touch( ex, p )
        ex.add( p, description=set_file_descr(filename,**macs_descr0),
                alias=alias )
        ex.add( p+"_peaks.xls",
                description=set_file_descr(filename+"_peaks.xls",**macs_descr1),
                associate_to_filename=p, template='%s_peaks.xls' )
        bedzip = gzip.open(p+"_peaks.bed.gz",'wb')
        bedzip.write("track name='"+filename+"_macs_peaks'\n")
        with open(p+"_peaks.bed") as bedinf:
            [bedzip.write(l) for l in bedinf]
        bedzip.close()
        ex.add( p+"_peaks.bed.gz",
                description=set_file_descr(filename+"_peaks.bed.gz",**macs_descr2),
                associate_to_filename=p, template='%s_peaks.bed.gz' )
        bedzip = gzip.open(p+"_summits.bed.gz",'wb')
        bedzip.write("track name='"+filename+"_macs_summits'\n")
        with open(p+"_summits.bed") as bedinf:
            [bedzip.write(l) for l in bedinf]
        bedzip.close()
        ex.add( p+"_summits.bed.gz",
                description=set_file_descr(filename+"_summits.bed.gz",**macs_descr2),
                associate_to_filename=p, template='%s_summits.bed.gz' )
        if n[1][0]:
            ex.add( p+"_negative_peaks.xls",
                    description=set_file_descr(filename+"_negative_peaks.xls",**macs_descr0),
                    associate_to_filename=p, template='%s_negative_peaks.xls' )
    return prefixes
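The '-m' enrichment bounds mentioned in the docstring are derived from the per-sample Poisson threshold exactly as in the loop above; a worked example with a hypothetical threshold value:

threshold = 4                           # hypothetical Poisson threshold for one sample
low = (threshold + 1) * 5               # 25
enrich_bounds = str(min(30, low)) + "," + str(10 * low)
print enrich_bounds                     # "25,250", passed to macs as ["-m", enrich_bounds]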
Example #36
def c4seq_workflow( ex, job, primers_dict, assembly,
                    c4_url=None, script_path='', logfile=sys.stdout, via='lsf' ):
    '''
    Main
    * open the 4C-seq minilims and create execution
    * 0. get/create the library
    * 1. if necessary, calculate the density file from the bam file (mapseq.parallel_density_sql)
    * 2. calculate the counts per fragment for each density file (with gfminer:score_by_feature)
    '''

    mapseq_files = job.files
### outputs
    processed = {'lib': {}, 'density': {}, '4cseq': {}}
    processed['4cseq'] = {'density_files' : {},
                          'countsPerFrag' : {},
                          'countsPerFrag_grp' : {},
                          'norm' : {},
                          'norm_grp' : {},
                          'profileCorrection': {},
                          'profileCorrection_grp' : {},
                          'smooth_grp' : {},
                          'domainogram_grp' : {},
                          'bricks2frags' : {}}
                            # was 'smoothFrag': {}, 'domainogram': {}}
    regToExclude = {}
    new_libs=[]
### options
    run_domainogram = {}
    before_profile_correction = {}
    if not job.options.get('viewpoints_chrs',False):
        out_chromosomes = ','.join([ch for ch in assembly.chrnames])
    else:
        out_chromosomes = ','.join([primers_dict.get(group['name'],{}).get('baitcoord').split(':')[0] for gid,group in job.groups.iteritems()])
    print "out_chromosomes=" + out_chromosomes + "\n"

    sizeExt = job.options.get('norm_reg',1000000)
    print "region considered for normalisation: mid viewpoint +/-" + str(sizeExt) + 'bps'

### do it
    for gid, group in job.groups.iteritems():
        run_domainogram[gid] = group.get('run_domainogram',False)
        if isinstance(run_domainogram[gid],basestring):
            run_domainogram[gid] = (run_domainogram[gid].lower() in ['1','true','on','t'])
        before_profile_correction[gid] = group.get('before_profile_correction',False)
        if isinstance(before_profile_correction[gid],basestring):
            before_profile_correction[gid] = (before_profile_correction[gid].lower() in ['1','true','on','t'])
        processed['lib'][gid] = get_libForGrp(ex, group, assembly,
                                              new_libs, gid, c4_url, via=via)
#reffile='/archive/epfl/bbcf/data/DubouleDaan/library_Nla_30bps/library_Nla_30bps_segmentInfos.bed'
        processed['4cseq']['density_files'][gid] = {}
        regToExclude[gid] = primers_dict.get(group['name'],{}).get('regToExclude',"").replace('\r','')

        # if no regToExclude defined, set it as mid_baitCoord +/-5kb
        if len(regToExclude[gid])==0 :
            baitcoord_mid = int(0.5 * (int(primers_dict.get(group['name'],{}).get('baitcoord').split(':')[1].split('-')[0]) + int(primers_dict.get(group['name'],{}).get('baitcoord').split(':')[1].split('-')[1]) ))
            regToExclude[gid] = primers_dict.get(group['name'],{}).get('baitcoord').split(':')[0] + ':' + str(baitcoord_mid-5000) + '-' + str(baitcoord_mid+5000)

        #print(';'.join([k+"="+v for k,v in primers_dict.get(group['name'],{}).iteritems()]))
        print(primers_dict.get(group['name'],{}))
        print "regToExclude["+str(gid)+"]="+regToExclude[gid]
        for rid,run in group['runs'].iteritems():
            libname = mapseq_files[gid][rid]['libname']
            if job.options.get('merge_strands') != 0 or not('wig' in mapseq_files[gid][rid]):
                density_file=parallel_density_sql( ex, mapseq_files[gid][rid]['bam'],
                                                   assembly.chrmeta,
                                                   nreads=mapseq_files[gid][rid]['stats']["total"],
                                                   merge=0,
                                                   read_extension=mapseq_files[gid][rid]['stats']['read_length'],
                                                   convert=False,
                                                   via=via )
                density_file += "merged.sql"
                ex.add( density_file,
                        description=set_file_descr("density_file_"+libname+".sql",
                                                   groupId=gid,step="density",type="sql",view='admin',gdv="1") )
            else:
                density_file = mapseq_files[gid][rid]['wig']['merged']
            #density_files.append(density_file)
            processed['4cseq']['density_files'][gid][rid]=density_file

        # back to grp level!
        # not anymore:
        # processed['density'][gid] = merge_sql(ex, density_files, via=via)

    processed['4cseq']['countsPerFrag'] = density_to_countsPerFrag( ex, processed, job.groups, assembly, regToExclude, script_path, via )
    ## access per gid+rid

    futures_norm = {}
    countsPerFrags_bedGraph = {}
    futures_merged_raw = {}
    for gid, group in job.groups.iteritems():
        futures_norm[gid] = {}
        countsPerFrags_bedGraph[gid] = {}
        processed['4cseq']['norm'][gid] = {}
        for rid,run in group['runs'].iteritems():
            normfile = unique_filename_in()
            touch(ex, normfile)
            resfile = unique_filename_in()+".bedGraph"
            resfiles = processed['4cseq']['countsPerFrag'][gid][rid] # _all.sql
            convert(resfiles[3],resfile)
            countsPerFrags_bedGraph[gid][rid] = resfile

            print "call normFrags: infiles="+resfile+", normfile="+normfile+"baitCoord="+primers_dict[group['name']]['baitcoord']+", sizeExt=sizeExt, name="+ group['name']+"rep_"+str(rid) + "regToExclude="+regToExclude[gid]+"\n"
            futures_norm[gid][rid] = normFrags.nonblocking( ex, resfile, normfile, baitCoord=primers_dict[group['name']]['baitcoord'], sizeExt=sizeExt, name=group['name']+"rep_"+str(rid) ,regToExclude=regToExclude[gid], script_path=script_path, via=via )
            processed['4cseq']['norm'][gid][rid] = normfile

        if len(group) > 1:
            ## merge replicates before normalisation.
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName=group['name']+"_raw_mergedRep"
            print "gid="+group['name']
            print "call mergeRep for replicates before normalisation: infiles="+",".join([res_rid for rid,res_rid in countsPerFrags_bedGraph[gid].iteritems()])+", mergedfile="+mergefile+", regToExclude="+regToExclude[gid]+"\n"
            futures_merged_raw[gid] = mergeRep.nonblocking( ex, ",".join([res_rid for rid,res_rid in countsPerFrags_bedGraph[gid].iteritems()]), mergefile, regToExclude[gid], name=titleName, script_path=script_path, via=via , memory= 8)
            processed['4cseq']['countsPerFrag_grp'][gid] = mergefile
        else:
            futures_merged_raw[gid] = None
            processed['4cseq']['countsPerFrag_grp'][gid] = countsPerFrags_bedGraph[gid][0] #if no replicates, then the file we want is the 1st one

    print "***** profile correction / sample + merge normalised data"
    futures_merged = {} # per gid
    futures_profcor = {} # per gid, per rid
    for gid, group in job.groups.iteritems():
        ## run profile correction per run then merge them
        futures_profcor[gid] = {}
        processed['4cseq']['profileCorrection'][gid] = {}
        for rid, run in group['runs'].iteritems():
            # wait for normalisation of all replicates to be finished
            futures_norm[gid][rid].wait() ## normalised files, per grp, per rep
            normfile = processed['4cseq']['norm'][gid][rid]
            file1 = unique_filename_in() #track file
            touch(ex,file1)
            file2 = unique_filename_in() #report file
            touch(ex,file2)
            file3 = unique_filename_in() #table file
            touch(ex, file3)
            print "call profileCorrection: normfile="+normfile+", baitCoord="+primers_dict[group['name']]['baitcoord']+", name="+group['name']+", file1="+file1+", file2="+file2+", file3= "+file3+"\n"
            futures_profcor[gid][rid] = profileCorrection.nonblocking( ex, normfile,
                                        primers_dict[group['name']]['baitcoord'],
                                        group['name'], file1, file2, file3, script_path,
                                        via=via )
            processed['4cseq']['profileCorrection'][gid][rid] = [file1, file2, file3]

        ## merge replicates before profile correction. Needs all normalisation for the given grp to be finished, this is why it comes after the rid loop.
        if len(group)>1:
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName=group['name']+"_norm_mergedRep"
            print "gid="+group['name']
            print "call mergeRep: infiles="+",".join([res_rid for rid,res_rid in processed['4cseq']['norm'][gid].iteritems()])+", mergedfile="+mergefile+", regToExclude="+regToExclude[gid]+"\n"
            futures_merged[gid] = mergeRep.nonblocking( ex, ",".join([res_rid for rid,res_rid in processed['4cseq']['norm'][gid].iteritems()]), mergefile, regToExclude[gid], name=titleName, script_path=script_path, via=via , memory= 8)
            processed['4cseq']['norm_grp'][gid] = mergefile
        else:
            futures_merged[gid] = None
            processed['4cseq']['norm_grp'][gid] = processed['4cseq']['norm'][gid][0] ##if no replicates, then the file we want is the 1st one

    print "***** merge profile corrected data"
    futures_profcor_merged = {} # per gid
    for gid, group in job.groups.iteritems():
        processed['4cseq']['profileCorrection_grp'][gid] = {}
        for rid, run in group['runs'].iteritems():
            futures_profcor[gid][rid].wait()   ## wait for ProfileCorrection to be finished

        ## merge replicates after profile correction
        if len(group)>1:
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName=group['name']+"_ProfCor_mergedRep"
            pcfiles = [ processed['4cseq']['profileCorrection'][gid][rid][0] for rid,res_rid in processed['4cseq']['profileCorrection'][gid].iteritems()]
            print "call mergeRep (for PC tables): infiles="+",".join(pcfiles)+", mergedfile="+mergefile+", regToExclude="+regToExclude[gid]+"\n"
            futures_profcor_merged[gid] = mergeRep.nonblocking( ex, ",".join(pcfiles), mergefile, regToExclude[gid], name=titleName, script_path=script_path, via=via , memory= 8)
            processed['4cseq']['profileCorrection_grp'][gid] = mergefile
        else:
            futures_profcor_merged[gid] = None
            processed['4cseq']['profileCorrection_grp'][gid] = processed['4cseq']['profileCorrection'][gid][0] ##if no replicates, then the file we want is the 1st one


    print "***** smooth data"
    futures_smoothed = {}
    for gid, group in job.groups.iteritems():
        file1 = unique_filename_in()
        touch(ex,file1)
        file2 = unique_filename_in()
        touch(ex, file2)
        file3 = unique_filename_in()
        touch(ex, file3)
        nFragsPerWin = group['window_size']
        futures_merged_raw[gid].wait() ## wait for merging of raw_grp to be completed
        futures_smoothed[gid] = ( smoothFragFile.nonblocking( ex, processed['4cseq']['countsPerFrag_grp'][gid], nFragsPerWin, group['name'],
                                                    file1, regToExclude[gid], script_path=script_path, via=via, memory=6 ), )
        futures_merged[gid].wait() ## wait for merging of norm_grp to be completed
        futures_smoothed[gid] += ( smoothFragFile.nonblocking( ex, processed['4cseq']['norm_grp'][gid], nFragsPerWin, group['name']+"_norm",
                                                    file2, regToExclude[gid], script_path=script_path, via=via, memory=6 ), )
        futures_profcor_merged[gid].wait() # wait for the merging of profile corrected data to be done
        futures_smoothed[gid] += ( smoothFragFile.nonblocking( ex, processed['4cseq']['profileCorrection_grp'][gid], nFragsPerWin, group['name']+"_fromProfileCorrected",
                                                    file3, regToExclude[gid], script_path=script_path, via=via, memory=6 ), )
        processed['4cseq']['smooth_grp'][gid] = [file1,file2,file3] #[smoothed_file_before_Norm, smoothed file before PC, smoothed file after PC]

    print "***** Domainograms"
    futures_domainograms = {}
    for gid, group in job.groups.iteritems():
        grName = job.groups[gid]['name']
        if run_domainogram[gid]:
            regCoord = regToExclude[gid] or primers_dict[grName]['baitcoord']
            if before_profile_correction[gid]:
               futures_domainograms[gid] = runDomainogram.nonblocking( ex, processed['4cseq']['norm_grp'][gid],
                                                                            grName, regCoord=regCoord, skip=1,
                                                                            script_path=script_path, via=via, memory=15 )
            else:
                futures_domainograms[gid] = runDomainogram.nonblocking( ex, processed['4cseq']['profileCorrection_grp'][gid],
                                                                            grName, regCoord=regCoord.split(':')[0], skip=1,
                                                                            script_path=script_path, via=via, memory=15 )

    ## prepare tar files for domainogram results (if any)
    ## and create "BRICKS to frags" files
    print "***** BRICKS to Frags"
    futures_BRICKS2Frags = {}
    for gid, f in futures_domainograms.iteritems():
        if run_domainogram[gid]: # if domainogram has been run
            resFiles = []
            logFile = f.wait()
            start = False
            tarname = job.groups[gid]['name']+"_domainogram.tar.gz"
            res_tar = tarfile.open(tarname, "w:gz")
            futures_BRICKS2Frags[gid] = []
            processed['4cseq']['bricks2frags'][gid] = []
            if logFile is None: continue
            with open(logFile) as f:
                for s in f:
                    s = s.strip()
                    if '####resfiles####' in s:
                        start = True
                    elif start and "RData" not in s:
                        resFiles.append(s)
                        res_tar.add(s)
                    if start and "foundBRICKS" in s:
                        bricks2fragsfile = unique_filename_in()+".bedGraph"
                        touch(ex, bricks2fragsfile)
                        futures_BRICKS2Frags[gid] += [ BRICKSToFrag.nonblocking(ex, s, processed['4cseq']['norm_grp'][gid], bricks2fragsfile, script_path=script_path, via=via, memory=4 ) ]
                        processed['4cseq']['bricks2frags'][gid] += [ bricks2fragsfile ]
            res_tar.close()
            processed['4cseq']['domainogram_grp'][gid] = resFiles + [tarname]




############### prepare tables for global results
    print "***** combine results into tables "
    allNames=[]
    allFiles=[]
    allRegToExclude=[]
    for gid, group in job.groups.iteritems():
        for rid,run in group['runs'].iteritems():
            allNames += [ group['name']+"_rep"+str(rid)+"_norm", group['name']+"_rep"+str(rid)+"_fit" ]
            allFiles += [ processed['4cseq']['profileCorrection'][gid][rid][2] ]
            allRegToExclude += [ regToExclude[gid] ]
    tablePC=unique_filename_in()+".txt"
    print("***will call makeTable with:")
    print(",".join(allFiles))
    print("resfile="+tablePC)
    print(",".join(allNames))
    touch(ex,tablePC)

    #regToExclude[gid]

    futures_tables = (makeTable.nonblocking(ex, ",".join(allFiles), tablePC, ",".join(allNames), idCols="4,5", all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8 ), )

    # wait for all smoothing to be done
    for gid, fg in futures_smoothed.iteritems():
        for f in fg: f.wait()

    ## make Table raw/smoothed_raw
    print("** make Table raw/smoothed_raw")
    allNames=[]
    allFiles=[]
    allRegToExclude=[]
    for gid, group in job.groups.iteritems():
        futures_merged_raw[gid].wait()
        allNames += [ group['name']+"_raw", group['name']+"_rawSmoothed" ]
        allFiles += [ processed['4cseq']['countsPerFrag_grp'][gid], processed['4cseq']['smooth_grp'][gid][0] ]
        allRegToExclude += [ 'NA', regToExclude[gid] ]

    tableSmoothedRaw_grp=unique_filename_in()+".txt"
    touch(ex,tableSmoothedRaw_grp)
    futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableSmoothedRaw_grp, ",".join(allNames), idCols="4", out_chromosomes = out_chromosomes, all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8 ), )

    ## make Table norm/smoothed_norm before PC
    print("** make Table norm/smoothed_norm befor PC")
    allNames=[]
    allFiles=[]
    allRegToExclude=[]
    for gid, group in job.groups.iteritems():
        allNames += [ group['name']+"_norm", group['name']+"_smoothed" ]
        allFiles += [ processed['4cseq']['norm_grp'][gid], processed['4cseq']['smooth_grp'][gid][1] ]
        allRegToExclude += [ regToExclude[gid], regToExclude[gid] ]

    tableSmoothed_grp=unique_filename_in()+".txt"
    touch(ex,tableSmoothed_grp)
    futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableSmoothed_grp, ",".join(allNames), idCols="4", out_chromosomes = out_chromosomes, all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8 ), )

    ## make Table norm/smoothed_norm after PC
    print("** make Table norm/smoothed_norm after PC")
    allNames=[]
    allFiles=[]
    allRegToExclude=[]
    for gid, group in job.groups.iteritems():
        allNames += [ group['name']+"_normPC", group['name']+"_smoothedPC" ]
        allFiles += [ processed['4cseq']['profileCorrection_grp'][gid], processed['4cseq']['smooth_grp'][gid][2] ]
        allRegToExclude += [ regToExclude[gid], regToExclude[gid] ]

    tableSmoothedPC_grp=unique_filename_in()+".txt"
    touch(ex,tableSmoothedPC_grp)
    futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableSmoothedPC_grp, ",".join(allNames), idCols="4", out_chromosomes = out_chromosomes, all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8 ), )

    ## combine BRICKS2Frags files
    allNames=[]
    allFiles=[]
    for gid, fg in futures_BRICKS2Frags.iteritems():
        for f in fg: f.wait()
        allNames += [ job.groups[gid]['name']+"_BRICKSpval" ]
        cat_bricks2frags = unique_filename_in()+".txt"
        print ','.join(processed['4cseq']['bricks2frags'][gid])
        cat_bricks2frags = cat(processed['4cseq']['bricks2frags'][gid],out=cat_bricks2frags)
        allFiles += [ cat_bricks2frags ]

    for gid, fg in futures_smoothed.iteritems():
        for f in fg: f.wait()

    tableBRICKS2Frags = unique_filename_in()+".txt"
    touch(ex,tableBRICKS2Frags)
    futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableBRICKS2Frags, ",".join(allNames), idCols="4", out_chromosomes = out_chromosomes, defVal="NA", script_path=script_path, via=via, memory=8 ), )


    for f in futures_tables: f.wait()


################ Add everything to minilims below!
    step = "density"
    for gid in processed['4cseq']['density_files'].keys():
        for rid, sql in processed['4cseq']['density_files'][gid].iteritems():
            fname = "density_file_"+job.groups[gid]['name']+"_merged_rep"+str(rid)
            ex.add( sql, description=set_file_descr( fname+".sql",
                                                 groupId=gid,step=step,type="sql",gdv="1" ) )
            wig = unique_filename_in()+".bw"
            convert( sql, wig )
            ex.add( wig, description=set_file_descr( fname+".bw",
                                                 groupId=gid,step=step,type="bigWig",ucsc="1") )
    step = "counts_per_frag" #was _norm_counts_per_frags # before normalisation process, per replicate
    for gid in processed['4cseq']['countsPerFrag'].keys():
        for rid, resfiles in processed['4cseq']['countsPerFrag'][gid].iteritems():
            fname = "meanScorePerFeature_"+job.groups[gid]['name']+"_rep"+str(rid)
            ex.add( resfiles[1], description=set_file_descr( fname+".sql",
                                                             groupId=gid,step=step,type="sql",view="admin",gdv='1'))
            #gzipfile(ex,resfiles[0])
            #ex.add( resfiles[0]+".gz", description=set_file_descr( fname+".bed.gz",
            #                                                       groupId=gid,step=step,type="bed",view="admin" ))
            fname = "segToFrag_"+job.groups[gid]['name']+"_rep"+str(rid)
            ex.add( resfiles[3], description=set_file_descr( fname+"_all.sql",
                                                             groupId=gid,step=step,type="sql",
                                                             comment="all informative frags - null included" ))
            trsql = track(resfiles[3])
            bwig = unique_filename_in()+".bw"
            trwig = track(bwig,chrmeta=trsql.chrmeta)
            trwig.write(trsql.read(fields=['chr','start','end','score'],
                                   selection={'score':(0.01,sys.maxint)}))
            trwig.close()
            ex.add( bwig, set_file_descr(fname+".bw",groupId=gid,step=step,type="bigWig",ucsc='1'))
        ## add segToFrags before normalisation
        futures_merged_raw[gid].wait()
        trbedgraph = track(removeNA(processed['4cseq']['countsPerFrag_grp'][gid]),format='bedgraph')
        bwig = unique_filename_in()+".bw"
        trwig = track(bwig,chrmeta=assembly.chrmeta)
        trwig.write(trbedgraph.read(fields=['chr','start','end','score'],
                               selection={'score':(0.01,sys.maxint)}))
        trwig.close()
        fname = "segToFrag_"+job.groups[gid]['name']
        ex.add( bwig, description=set_file_descr( fname+".bw",
                                                             groupId=gid,step=step,type="bigWig",
                                                             comment="segToFrag file before normalisation" ))

    step = "norm_counts_per_frags"  # after new normalisation process, combined replicates
    for gid, resfile in processed['4cseq']['norm_grp'].iteritems():
        fname = "normalised_scorePerFeature_"+job.groups[gid]['name']
        gzipfile(ex,resfile)
        ex.add( resfile+".gz", description=set_file_descr( fname+".bedGraph.gz", groupId=gid,step=step, type="bedGraph",ucsc='1'))
    # norm files, per replicates (might be removed)
    for gid, dict_gid in processed['4cseq']['norm'].iteritems():
        for rid, resfile in dict_gid.iteritems():
            fname = "normalised_scorePerFeature_"+job.groups[gid]['name']+"_rep"+str(rid)
            gzipfile(ex,resfile)
            ex.add(resfile+".gz",
                    description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))
    step = "profile_correction" # Profile corrected data, combined replicates
    for gid, profileCorrectedFile in processed['4cseq']['profileCorrection_grp'].iteritems():
        fname = "segToFrag_"+job.groups[gid]['name']+"_profileCorrected"
        gzipfile(ex,profileCorrectedFile)
        ex.add( profileCorrectedFile+".gz",
                description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))
    # Profile corrected, per replicate (might be removed)
    for gid, dict_gid in processed['4cseq']['profileCorrection'].iteritems():
        for rid, resfiles in dict_gid.iteritems():
    #        profileCorrectedFile = resfiles[0]
            reportProfileCorrection = resfiles[1]
            fname = "segToFrag_"+job.groups[gid]['name']+"_profileCorrected_rep"+str(rid)
    #        gzipfile(ex,profileCorrectedFile)
    #        ex.add( profileCorrectedFile+".gz",
    #                description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))
            ex.add( reportProfileCorrection, description=set_file_descr(fname+".pdf",
                                                                    groupId=gid,step=step,type="pdf"))
    step = "smoothing"
    for gid, resfiles in processed['4cseq']['smooth_grp'].iteritems():
        rawSmoothFile = resfiles[0]
        smoothFile = resfiles[1]
        afterProfileCorrection = resfiles[2]
        nFrags = str(job.groups[gid]['window_size'])
        ## smoothed file before normalisation
        fname = "segToFrag_"+job.groups[gid]['name']+"_smoothed_"+nFrags+"FragsPerWin.bedGraph.gz"
        gzipfile(ex,rawSmoothFile)
        ex.add(rawSmoothFile+".gz",
               description=set_file_descr(fname,groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))
        ## smoothed file after normalisation, before Profile correction
        fname = "segToFrag_"+job.groups[gid]['name']+"_norm_smoothed_"+nFrags+"FragsPerWin.bedGraph.gz"
        gzipfile(ex,smoothFile)
        ex.add(smoothFile+".gz",
               description=set_file_descr(fname,groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))
        ## smoothed file after normalisation, after Profile correction
        fname = "segToFrag_"+job.groups[gid]['name']+"_profileCorrected_smoothed_"+nFrags+"FragsPerWin.bedGraph.gz"
        gzipfile(ex,afterProfileCorrection)
        ex.add(afterProfileCorrection+".gz",
               description=set_file_descr(fname,groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))

    step = "domainograms"
    for gid, resfiles in processed['4cseq']['domainogram_grp'].iteritems():
        tarFile = resfiles.pop()
        fname = job.groups[gid]['name']+"_domainogram.tar.gz"
        ex.add(tarFile, description=set_file_descr(fname,
                                                   groupId=gid,step=step,type="tgz"))
        for s in resfiles:
            if s[-8:] == "bedGraph":
                gzipfile(ex,s)
                s += ".gz"
                ex.add( s, description=set_file_descr( s, groupId=gid,step=step,type="bedGraph",ucsc="1",gdv="1"))

    step = "combined_results"
    gzipfile(ex,tableSmoothedRaw_grp)
    ex.add(tableSmoothedRaw_grp+".gz", description=set_file_descr("table_segToFrags_smoothed_combined_replicates.txt.gz",step=step,type="txt"))

    gzipfile(ex,tableSmoothed_grp)
    ex.add(tableSmoothed_grp+".gz", description=set_file_descr("table_normalised_smoothed_combined_replicates.txt.gz",step=step,type="txt"))

    gzipfile(ex,tableSmoothedPC_grp)
    ex.add(tableSmoothedPC_grp+".gz", description=set_file_descr("table_profileCorrected_smoothed_combined_replicates.txt.gz",step=step,type="txt"))

    gzipfile(ex,tablePC)
    ex.add(tablePC+".gz", description=set_file_descr("table_normalised_fit_per_replicates.txt.gz",step=step,type="txt"))

    gzipfile(ex,tableBRICKS2Frags)
    ex.add(tableBRICKS2Frags+".gz", description=set_file_descr("table_frags_in_BRICKS_combined_replicates.txt.gz",step=step,type="txt"))

    return processed
Beispiel #37
0
 def test_resolve_alias_returns_int_if_exists(self):
     with execution(None) as ex:
         f = touch(ex)
         M = MiniLIMS("boris")
         a = M.import_file(f)
         self.assertEqual(M.resolve_alias(a), a)
Beispiel #38
0
def c4seq_workflow(ex,
                   job,
                   primers_dict,
                   assembly,
                   c4_url=None,
                   script_path='',
                   logfile=sys.stdout,
                   via='lsf'):
    '''
    Main 4C-seq workflow:
    * open the 4C-seq minilims and create the execution
    * 0. get/create the library
    * 1. if necessary, calculate the density file from the bam file (mapseq.parallel_density_sql)
    * 2. calculate the counts per fragment for each density file (gfminer score_by_feature)
    '''

    mapseq_files = job.files
    ### outputs
    processed = {'lib': {}, 'density': {}, '4cseq': {}}
    processed['4cseq'] = {
        'density_files': {},
        'countsPerFrag': {},
        'countsPerFrag_grp': {},
        'norm': {},
        'norm_grp': {},
        'profileCorrection': {},
        'profileCorrection_grp': {},
        'smooth_grp': {},
        'domainogram_grp': {},
        'bricks2frags': {}
    }
    # was 'smoothFrag': {}, 'domainogram': {}}
    regToExclude = {}
    new_libs = []
    ### options
    run_domainogram = {}
    before_profile_correction = {}
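    # output tables are restricted to the viewpoint chromosomes when the
    # 'viewpoints_chrs' option is set; otherwise all assembly chromosomes are kept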
    if not job.options.get('viewpoints_chrs', False):
        out_chromosomes = ','.join([ch for ch in assembly.chrnames])
    else:
        out_chromosomes = ','.join([
            primers_dict.get(group['name'], {}).get('baitcoord').split(':')[0]
            for gid, group in job.groups.iteritems()
        ])
    print "out_chromosomes=" + out_chromosomes + "\n"

    sizeExt = job.options.get('norm_reg', 1000000)
    print "region considered for normalisation: mid viewpoint +/-" + str(
        sizeExt) + 'bps'

    ### do it
    for gid, group in job.groups.iteritems():
        run_domainogram[gid] = group.get('run_domainogram', False)
        if isinstance(run_domainogram[gid], basestring):
            run_domainogram[gid] = (run_domainogram[gid].lower()
                                    in ['1', 'true', 'on', 't'])
        before_profile_correction[gid] = group.get('before_profile_correction',
                                                   False)
        if isinstance(before_profile_correction[gid], basestring):
            before_profile_correction[gid] = (
                before_profile_correction[gid].lower()
                in ['1', 'true', 'on', 't'])
        processed['lib'][gid] = get_libForGrp(ex,
                                              group,
                                              assembly,
                                              new_libs,
                                              gid,
                                              c4_url,
                                              via=via)
        #reffile='/archive/epfl/bbcf/data/DubouleDaan/library_Nla_30bps/library_Nla_30bps_segmentInfos.bed'
        processed['4cseq']['density_files'][gid] = {}
        regToExclude[gid] = primers_dict.get(group['name'],
                                             {}).get('regToExclude',
                                                     "").replace('\r', '')

        # if no regToExclude defined, set it as mid_baitCoord +/-5kb
        if len(regToExclude[gid]) == 0:
            baitcoord_mid = int(0.5 * (int(
                primers_dict.get(group['name'], {}).get('baitcoord').split(':')
                [1].split('-')[0]) + int(
                    primers_dict.get(group['name'], {}).get('baitcoord').split(
                        ':')[1].split('-')[1])))
            regToExclude[gid] = primers_dict.get(
                group['name'], {}).get('baitcoord').split(':')[0] + ':' + str(
                    baitcoord_mid - 5000) + '-' + str(baitcoord_mid + 5000)

        #print(';'.join([k+"="+v for k,v in primers_dict.get(group['name'],{}).iteritems()]))
        print(primers_dict.get(group['name'], {}))
        print "regToExclude[" + str(gid) + "]=" + regToExclude[gid]
        for rid, run in group['runs'].iteritems():
            libname = mapseq_files[gid][rid]['libname']
            if job.options.get('merge_strands') != 0 or not (
                    'wig' in mapseq_files[gid][rid]):
                density_file = parallel_density_sql(
                    ex,
                    mapseq_files[gid][rid]['bam'],
                    assembly.chrmeta,
                    nreads=mapseq_files[gid][rid]['stats']["total"],
                    merge=0,
                    read_extension=mapseq_files[gid][rid]['stats']
                    ['read_length'],
                    convert=False,
                    via=via)
                density_file += "merged.sql"
                ex.add(density_file,
                       description=set_file_descr("density_file_" + libname +
                                                  ".sql",
                                                  groupId=gid,
                                                  step="density",
                                                  type="sql",
                                                  view='admin',
                                                  gdv="1"))
            else:
                density_file = mapseq_files[gid][rid]['wig']['merged']
            #density_files.append(density_file)
            processed['4cseq']['density_files'][gid][rid] = density_file

        # back to grp level!
        # not anymore:
        # processed['density'][gid] = merge_sql(ex, density_files, via=via)

    processed['4cseq']['countsPerFrag'] = density_to_countsPerFrag(
        ex, processed, job.groups, assembly, regToExclude, script_path, via)
    ## access per gid+rid

    futures_norm = {}
    countsPerFrags_bedGraph = {}
    futures_merged_raw = {}
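    # per replicate: convert the per-fragment sql track to bedGraph, then normalise
    # around the viewpoint (baitcoord +/- sizeExt) with normFrags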
    for gid, group in job.groups.iteritems():
        futures_norm[gid] = {}
        countsPerFrags_bedGraph[gid] = {}
        processed['4cseq']['norm'][gid] = {}
        for rid, run in group['runs'].iteritems():
            normfile = unique_filename_in()
            touch(ex, normfile)
            resfile = unique_filename_in() + ".bedGraph"
            resfiles = processed['4cseq']['countsPerFrag'][gid][
                rid]  # _all.sql
            convert(resfiles[3], resfile)
            countsPerFrags_bedGraph[gid][rid] = resfile

            print "call normFrags: infiles=" + resfile + ", normfile=" + normfile + "baitCoord=" + primers_dict[
                group['name']][
                    'baitcoord'] + ", sizeExt=sizeExt, name=" + group[
                        'name'] + "rep_" + str(
                            rid) + "regToExclude=" + regToExclude[gid] + "\n"
            futures_norm[gid][rid] = normFrags.nonblocking(
                ex,
                resfile,
                normfile,
                baitCoord=primers_dict[group['name']]['baitcoord'],
                sizeExt=sizeExt,
                name=group['name'] + "rep_" + str(rid),
                regToExclude=regToExclude[gid],
                script_path=script_path,
                via=via)
            processed['4cseq']['norm'][gid][rid] = normfile

        if len(group) > 1:
            ## merge replicates before normalisation.
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName = group['name'] + "_raw_mergedRep"
            print "gid=" + group['name']
            print "call mergeRep for replicates before normalisation: infiles=" + ",".join(
                [
                    res_rid for rid, res_rid in
                    countsPerFrags_bedGraph[gid].iteritems()
                ]
            ) + ", mergedfile=" + mergefile + ", regToExclude=" + regToExclude[
                gid] + "\n"
            futures_merged_raw[gid] = mergeRep.nonblocking(
                ex,
                ",".join([
                    res_rid for rid, res_rid in
                    countsPerFrags_bedGraph[gid].iteritems()
                ]),
                mergefile,
                regToExclude[gid],
                name=titleName,
                script_path=script_path,
                via=via,
                memory=8)
            processed['4cseq']['countsPerFrag_grp'][gid] = mergefile
        else:
            futures_merged_raw[gid] = None
            processed['4cseq']['countsPerFrag_grp'][
                gid] = countsPerFrags_bedGraph[gid][
                    0]  #if no replicates, then the file we want is the 1st one

    print "***** profile correction / sample + merge normalised data"
    futures_merged = {}  # per gid
    futures_profcor = {}  # per gid, per rid
    for gid, group in job.groups.iteritems():
        ## run profile correction per run then merge them
        futures_profcor[gid] = {}
        processed['4cseq']['profileCorrection'][gid] = {}
        for rid, run in group['runs'].iteritems():
            # wait for normalisation of all replicates to be finished
            futures_norm[gid][rid].wait(
            )  ## normalised files, per grp, per rep
            normfile = processed['4cseq']['norm'][gid][rid]
            file1 = unique_filename_in()  #track file
            touch(ex, file1)
            file2 = unique_filename_in()  #report file
            touch(ex, file2)
            file3 = unique_filename_in()  #table file
            touch(ex, file3)
            print "call profileCorrection: normfile=" + normfile + ", baitCoord=" + primers_dict[
                group['name']]['baitcoord'] + ", name=" + group[
                    'name'] + ", file1=" + file1 + ", file2=" + file2 + ", file3= " + file3 + "\n"
            futures_profcor[gid][rid] = profileCorrection.nonblocking(
                ex,
                normfile,
                primers_dict[group['name']]['baitcoord'],
                group['name'],
                file1,
                file2,
                file3,
                script_path,
                via=via)
            processed['4cseq']['profileCorrection'][gid][rid] = [
                file1, file2, file3
            ]

        ## merge replicates before profile correction. Needs all normalisation for the given grp to be finished, this is why it comes after the rid loop.
        if len(group) > 1:
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName = group['name'] + "_norm_mergedRep"
            print "gid=" + group['name']
            print "call mergeRep: infiles=" + ",".join([
                res_rid for rid, res_rid in processed['4cseq']['norm']
                [gid].iteritems()
            ]) + ", mergedfile=" + mergefile + ", regToExclude=" + regToExclude[
                gid] + "\n"
            futures_merged[gid] = mergeRep.nonblocking(
                ex,
                ",".join([
                    res_rid for rid, res_rid in processed['4cseq']['norm']
                    [gid].iteritems()
                ]),
                mergefile,
                regToExclude[gid],
                name=titleName,
                script_path=script_path,
                via=via,
                memory=8)
            processed['4cseq']['norm_grp'][gid] = mergefile
        else:
            futures_merged[gid] = None
            processed['4cseq']['norm_grp'][gid] = processed['4cseq']['norm'][
                gid][
                    0]  ##if no replicates, then the file we want is the 1st one

    print "***** merge profile corrected data"
    futures_profcor_merged = {}  # per gid
    for gid, group in job.groups.iteritems():
        processed['4cseq']['profileCorrection_grp'][gid] = {}
        for rid, run in group['runs'].iteritems():
            futures_profcor[gid][rid].wait(
            )  ## wait for ProfileCorrection to be finished

        ## merge replicates after profile correction
        if len(group) > 1:
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName = group['name'] + "_ProfCor_mergedRep"
            pcfiles = [
                processed['4cseq']['profileCorrection'][gid][rid][0]
                for rid, res_rid in processed['4cseq']['profileCorrection']
                [gid].iteritems()
            ]
            print "call mergeRep (for PC tables): infiles=" + ",".join(
                pcfiles
            ) + ", mergedfile=" + mergefile + ", regToExclude=" + regToExclude[
                gid] + "\n"
            futures_profcor_merged[gid] = mergeRep.nonblocking(
                ex,
                ",".join(pcfiles),
                mergefile,
                regToExclude[gid],
                name=titleName,
                script_path=script_path,
                via=via,
                memory=8)
            processed['4cseq']['profileCorrection_grp'][gid] = mergefile
        else:
            futures_profcor_merged[gid] = None
            processed['4cseq']['profileCorrection_grp'][gid] = processed[
                '4cseq']['profileCorrection'][gid][
                    0]  ##if no replicates, then the file we want is the 1st one

    print "***** smooth data"
    futures_smoothed = {}
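    # three smoothing passes per group (window of 'window_size' fragments):
    # raw merged counts, normalised merged counts, and profile-corrected merged counts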
    for gid, group in job.groups.iteritems():
        file1 = unique_filename_in()
        touch(ex, file1)
        file2 = unique_filename_in()
        touch(ex, file2)
        file3 = unique_filename_in()
        touch(ex, file3)
        nFragsPerWin = group['window_size']
        futures_merged_raw[gid].wait(
        )  ## wait for merging of raw_grp to be completed
        futures_smoothed[gid] = (smoothFragFile.nonblocking(
            ex,
            processed['4cseq']['countsPerFrag_grp'][gid],
            nFragsPerWin,
            group['name'],
            file1,
            regToExclude[gid],
            script_path=script_path,
            via=via,
            memory=6), )
        futures_merged[gid].wait(
        )  ## wait for merging of norm_grp to be completed
        futures_smoothed[gid] += (smoothFragFile.nonblocking(
            ex,
            processed['4cseq']['norm_grp'][gid],
            nFragsPerWin,
            group['name'] + "_norm",
            file2,
            regToExclude[gid],
            script_path=script_path,
            via=via,
            memory=6), )
        futures_profcor_merged[gid].wait(
        )  # wait for the merging of profile corrected data to be done
        futures_smoothed[gid] += (smoothFragFile.nonblocking(
            ex,
            processed['4cseq']['profileCorrection_grp'][gid],
            nFragsPerWin,
            group['name'] + "_fromProfileCorrected",
            file3,
            regToExclude[gid],
            script_path=script_path,
            via=via,
            memory=6), )
        processed['4cseq']['smooth_grp'][gid] = [
            file1, file2, file3
        ]  #[smoothed_file_before_Norm, smoothed file before PC, smoothed file after PC]

    print "***** Domainograms"
    futures_domainograms = {}
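    # domainograms run on the normalised data when 'before_profile_correction' is set,
    # otherwise on the profile-corrected data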
    for gid, group in job.groups.iteritems():
        grName = job.groups[gid]['name']
        if run_domainogram[gid]:
            regCoord = regToExclude[gid] or primers_dict[grName]['baitcoord']
            if before_profile_correction[gid]:
                futures_domainograms[gid] = runDomainogram.nonblocking(
                    ex,
                    processed['4cseq']['norm_grp'][gid],
                    grName,
                    regCoord=regCoord,
                    skip=1,
                    script_path=script_path,
                    via=via,
                    memory=15)
            else:
                futures_domainograms[gid] = runDomainogram.nonblocking(
                    ex,
                    processed['4cseq']['profileCorrection_grp'][gid],
                    grName,
                    regCoord=regCoord.split(':')[0],
                    skip=1,
                    script_path=script_path,
                    via=via,
                    memory=15)

    ## prepare tar files for domainogram results (if any)
    ## and create "BRICKS to frags" files
    print "***** BRICKS to Frags"
    futures_BRICKS2Frags = {}
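    # parse each domainogram log: archive the listed result files into a tar.gz and
    # launch BRICKSToFrag on every 'foundBRICKS' file against the normalised track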
    for gid, f in futures_domainograms.iteritems():
        if run_domainogram[gid]:  # if domainogram has been run
            resFiles = []
            logFile = f.wait()
            start = False
            tarname = job.groups[gid]['name'] + "_domainogram.tar.gz"
            res_tar = tarfile.open(tarname, "w:gz")
            futures_BRICKS2Frags[gid] = []
            processed['4cseq']['bricks2frags'][gid] = []
            if logFile is None: continue
            with open(logFile) as f:
                for s in f:
                    s = s.strip()
                    if '####resfiles####' in s:
                        start = True
                    elif start and "RData" not in s:
                        resFiles.append(s)
                        res_tar.add(s)
                    if start and "foundBRICKS" in s:
                        bricks2fragsfile = unique_filename_in() + ".bedGraph"
                        touch(ex, bricks2fragsfile)
                        futures_BRICKS2Frags[gid] += [
                            BRICKSToFrag.nonblocking(
                                ex,
                                s,
                                processed['4cseq']['norm_grp'][gid],
                                bricks2fragsfile,
                                script_path=script_path,
                                via=via,
                                memory=4)
                        ]
                        processed['4cseq']['bricks2frags'][gid] += [
                            bricks2fragsfile
                        ]
            res_tar.close()
            processed['4cseq']['domainogram_grp'][gid] = resFiles + [tarname]

############### prepare tables for global results
    print "***** combine results into tables "
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        for rid, run in group['runs'].iteritems():
            allNames += [
                group['name'] + "_rep" + str(rid) + "_norm",
                group['name'] + "_rep" + str(rid) + "_fit"
            ]
            allFiles += [processed['4cseq']['profileCorrection'][gid][rid][2]]
            allRegToExclude += [regToExclude[gid]]
    tablePC = unique_filename_in() + ".txt"
    print("***will call makeTable with:")
    print(",".join(allFiles))
    print("resfile=" + tablePC)
    print(",".join(allNames))
    touch(ex, tablePC)

    #regToExclude[gid]

    futures_tables = (makeTable.nonblocking(
        ex,
        ",".join(allFiles),
        tablePC,
        ",".join(allNames),
        idCols="4,5",
        all_regToExclude=','.join(allRegToExclude),
        script_path=script_path,
        via=via,
        memory=8), )

    # wait for all smoothing to be done
    for gid, fg in futures_smoothed.iteritems():
        for f in fg:
            f.wait()

    ## make Table raw/smoothed_raw
    print("** make Table raw/smoothed_raw")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        futures_merged_raw[gid].wait()
        allNames += [group['name'] + "_raw", group['name'] + "_rawSmoothed"]
        allFiles += [
            processed['4cseq']['countsPerFrag_grp'][gid],
            processed['4cseq']['smooth_grp'][gid][0]
        ]
        allRegToExclude += ['NA', regToExclude[gid]]

    tableSmoothedRaw_grp = unique_filename_in() + ".txt"
    touch(ex, tableSmoothedRaw_grp)
    futures_tables += (makeTable.nonblocking(
        ex,
        ",".join(allFiles),
        tableSmoothedRaw_grp,
        ",".join(allNames),
        idCols="4",
        out_chromosomes=out_chromosomes,
        all_regToExclude=','.join(allRegToExclude),
        script_path=script_path,
        via=via,
        memory=8), )

    ## make Table norm/smoothed_norm before PC
    print("** make Table norm/smoothed_norm befor PC")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        allNames += [group['name'] + "_norm", group['name'] + "_smoothed"]
        allFiles += [
            processed['4cseq']['norm_grp'][gid],
            processed['4cseq']['smooth_grp'][gid][1]
        ]
        allRegToExclude += [regToExclude[gid], regToExclude[gid]]

    tableSmoothed_grp = unique_filename_in() + ".txt"
    touch(ex, tableSmoothed_grp)
    futures_tables += (makeTable.nonblocking(
        ex,
        ",".join(allFiles),
        tableSmoothed_grp,
        ",".join(allNames),
        idCols="4",
        out_chromosomes=out_chromosomes,
        all_regToExclude=','.join(allRegToExclude),
        script_path=script_path,
        via=via,
        memory=8), )

    ## make Table norm/smoothed_norm after PC
    print("** make Table norm/smoothed_norm after PC")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        allNames += [group['name'] + "_normPC", group['name'] + "_smoothedPC"]
        allFiles += [
            processed['4cseq']['profileCorrection_grp'][gid],
            processed['4cseq']['smooth_grp'][gid][2]
        ]
        allRegToExclude += [regToExclude[gid], regToExclude[gid]]

    tableSmoothedPC_grp = unique_filename_in() + ".txt"
    touch(ex, tableSmoothedPC_grp)
    futures_tables += (makeTable.nonblocking(
        ex,
        ",".join(allFiles),
        tableSmoothedPC_grp,
        ",".join(allNames),
        idCols="4",
        out_chromosomes=out_chromosomes,
        all_regToExclude=','.join(allRegToExclude),
        script_path=script_path,
        via=via,
        memory=8), )

    ## combine BRICKS2Frags files
    allNames = []
    allFiles = []
    for gid, fg in futures_BRICKS2Frags.iteritems():
        for f in fg:
            f.wait()
        allNames += [job.groups[gid]['name'] + "_BRICKSpval"]
        cat_bricks2frags = unique_filename_in() + ".txt"
        print ','.join(processed['4cseq']['bricks2frags'][gid])
        cat_bricks2frags = cat(processed['4cseq']['bricks2frags'][gid],
                               out=cat_bricks2frags)
        allFiles += [cat_bricks2frags]

    for gid, fg in futures_smoothed.iteritems():
        for f in fg:
            f.wait()

    tableBRICKS2Frags = unique_filename_in() + ".txt"
    touch(ex, tableBRICKS2Frags)
    futures_tables += (makeTable.nonblocking(ex,
                                             ",".join(allFiles),
                                             tableBRICKS2Frags,
                                             ",".join(allNames),
                                             idCols="4",
                                             out_chromosomes=out_chromosomes,
                                             defVal="NA",
                                             script_path=script_path,
                                             via=via,
                                             memory=8), )

    for f in futures_tables:
        f.wait()

    ################ Add everything to minilims below!
    step = "density"
    for gid in processed['4cseq']['density_files'].keys():
        for rid, sql in processed['4cseq']['density_files'][gid].iteritems():
            fname = "density_file_" + job.groups[gid][
                'name'] + "_merged_rep" + str(rid)
            ex.add(sql,
                   description=set_file_descr(fname + ".sql",
                                              groupId=gid,
                                              step=step,
                                              type="sql",
                                              gdv="1"))
            wig = unique_filename_in() + ".bw"
            convert(sql, wig)
            ex.add(wig,
                   description=set_file_descr(fname + ".bw",
                                              groupId=gid,
                                              step=step,
                                              type="bigWig",
                                              ucsc="1"))
    step = "counts_per_frag"  #was _norm_counts_per_frags # before normalisation process, per replicate
    for gid in processed['4cseq']['countsPerFrag'].keys():
        for rid, resfiles in processed['4cseq']['countsPerFrag'][
                gid].iteritems():
            fname = "meanScorePerFeature_" + job.groups[gid][
                'name'] + "_rep" + str(rid)
            ex.add(resfiles[1],
                   description=set_file_descr(fname + ".sql",
                                              groupId=gid,
                                              step=step,
                                              type="sql",
                                              view="admin",
                                              gdv='1'))
            #gzipfile(ex,resfiles[0])
            #ex.add( resfiles[0]+".gz", description=set_file_descr( fname+".bed.gz",
            #                                                       groupId=gid,step=step,type="bed",view="admin" ))
            fname = "segToFrag_" + job.groups[gid]['name'] + "_rep" + str(rid)
            ex.add(resfiles[3],
                   description=set_file_descr(
                       fname + "_all.sql",
                       groupId=gid,
                       step=step,
                       type="sql",
                       comment="all informative frags - null included"))
            trsql = track(resfiles[3])
            bwig = unique_filename_in() + ".bw"
            trwig = track(bwig, chrmeta=trsql.chrmeta)
            trwig.write(
                trsql.read(fields=['chr', 'start', 'end', 'score'],
                           selection={'score': (0.01, sys.maxint)}))
            trwig.close()
            ex.add(
                bwig,
                set_file_descr(fname + ".bw",
                               groupId=gid,
                               step=step,
                               type="bigWig",
                               ucsc='1'))
        ## add segToFrags before normalisation
        futures_merged_raw[gid].wait()
        trbedgraph = track(removeNA(
            processed['4cseq']['countsPerFrag_grp'][gid]),
                           format='bedgraph')
        bwig = unique_filename_in() + ".bw"
        trwig = track(bwig, chrmeta=assembly.chrmeta)
        trwig.write(
            trbedgraph.read(fields=['chr', 'start', 'end', 'score'],
                            selection={'score': (0.01, sys.maxint)}))
        trwig.close()
        fname = "segToFrag_" + job.groups[gid]['name']
        ex.add(bwig,
               description=set_file_descr(
                   fname + ".bw",
                   groupId=gid,
                   step=step,
                   type="bigWig",
                   comment="segToFrag file before normalisation"))

    step = "norm_counts_per_frags"  # after new normalisation process, combined replicates
    for gid, resfile in processed['4cseq']['norm_grp'].iteritems():
        fname = "normalised_scorePerFeature_" + job.groups[gid]['name']
        gzipfile(ex, resfile)
        ex.add(resfile + ".gz",
               description=set_file_descr(fname + ".bedGraph.gz",
                                          groupId=gid,
                                          step=step,
                                          type="bedGraph",
                                          ucsc='1'))
    # norm files, per replicates (might be removed)
    for gid, dict_gid in processed['4cseq']['norm'].iteritems():
        for rid, resfile in dict_gid.iteritems():
            fname = "normalised_scorePerFeature_" + job.groups[gid][
                'name'] + "_rep" + str(rid)
            gzipfile(ex, resfile)
            ex.add(resfile + ".gz",
                   description=set_file_descr(fname + ".bedGraph.gz",
                                              groupId=gid,
                                              step=step,
                                              type="bedGraph",
                                              ucsc='1',
                                              gdv='1'))
    step = "profile_correction"  # Profile corrected data, combined replicates
    for gid, profileCorrectedFile in processed['4cseq'][
            'profileCorrection_grp'].iteritems():
        fname = "segToFrag_" + job.groups[gid]['name'] + "_profileCorrected"
        gzipfile(ex, profileCorrectedFile)
        ex.add(profileCorrectedFile + ".gz",
               description=set_file_descr(fname + ".bedGraph.gz",
                                          groupId=gid,
                                          step=step,
                                          type="bedGraph",
                                          ucsc='1',
                                          gdv='1'))
    # Profile corrected, per replicate (might be removed)
    for gid, dict_gid in processed['4cseq']['profileCorrection'].iteritems():
        for rid, resfiles in dict_gid.iteritems():
            #        profileCorrectedFile = resfiles[0]
            reportProfileCorrection = resfiles[1]
            fname = "segToFrag_" + job.groups[gid][
                'name'] + "_profileCorrected_rep" + str(rid)
            #        gzipfile(ex,profileCorrectedFile)
            #       ex.add( profileCorrectedFile+".gz",
            #              description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))
            ex.add(reportProfileCorrection,
                   description=set_file_descr(fname + ".pdf",
                                              groupId=gid,
                                              step=step,
                                              type="pdf"))
    step = "smoothing"
    for gid, resfiles in processed['4cseq']['smooth_grp'].iteritems():
        rawSmoothFile = resfiles[0]
        smoothFile = resfiles[1]
        afterProfileCorrection = resfiles[2]
        nFrags = str(job.groups[gid]['window_size'])
        ## smoothed file before normalisation
        fname = "segToFrag_" + job.groups[gid][
            'name'] + "_smoothed_" + nFrags + "FragsPerWin.bedGraph.gz"
        gzipfile(ex, rawSmoothFile)
        ex.add(rawSmoothFile + ".gz",
               description=set_file_descr(fname,
                                          groupId=gid,
                                          step=step,
                                          type="bedGraph",
                                          ucsc='1',
                                          gdv='1'))
        ## smoothed file after normalisation, before Profile correction
        fname = "segToFrag_" + job.groups[gid][
            'name'] + "_norm_smoothed_" + nFrags + "FragsPerWin.bedGraph.gz"
        gzipfile(ex, smoothFile)
        ex.add(smoothFile + ".gz",
               description=set_file_descr(fname,
                                          groupId=gid,
                                          step=step,
                                          type="bedGraph",
                                          ucsc='1',
                                          gdv='1'))
        ## smoothed file after normalisation, after Profile correction
        fname = "segToFrag_" + job.groups[gid][
            'name'] + "_profileCorrected_smoothed_" + nFrags + "FragsPerWin.bedGraph.gz"
        gzipfile(ex, afterProfileCorrection)
        ex.add(afterProfileCorrection + ".gz",
               description=set_file_descr(fname,
                                          groupId=gid,
                                          step=step,
                                          type="bedGraph",
                                          ucsc='1',
                                          gdv='1'))

    step = "domainograms"
    for gid, resfiles in processed['4cseq']['domainogram_grp'].iteritems():
        tarFile = resfiles.pop()
        fname = job.groups[gid]['name'] + "_domainogram.tar.gz"
        ex.add(tarFile,
               description=set_file_descr(fname,
                                          groupId=gid,
                                          step=step,
                                          type="tgz"))
        for s in resfiles:
            if s[-8:] == "bedGraph":
                gzipfile(ex, s)
                s += ".gz"
                ex.add(s,
                       description=set_file_descr(s,
                                                  groupId=gid,
                                                  step=step,
                                                  type="bedGraph",
                                                  ucsc="1",
                                                  gdv="1"))

    step = "combined_results"
    gzipfile(ex, tableSmoothedRaw_grp)
    ex.add(tableSmoothedRaw_grp + ".gz",
           description=set_file_descr(
               "table_segToFrags_smoothed_combined_replicates.txt.gz",
               step=step,
               type="txt"))

    gzipfile(ex, tableSmoothed_grp)
    ex.add(tableSmoothed_grp + ".gz",
           description=set_file_descr(
               "table_normalised_smoothed_combined_replicates.txt.gz",
               step=step,
               type="txt"))

    gzipfile(ex, tableSmoothedPC_grp)
    ex.add(tableSmoothedPC_grp + ".gz",
           description=set_file_descr(
               "table_profileCorrected_smoothed_combined_replicates.txt.gz",
               step=step,
               type="txt"))

    gzipfile(ex, tablePC)
    ex.add(tablePC + ".gz",
           description=set_file_descr(
               "table_normalised_fit_per_replicates.txt.gz",
               step=step,
               type="txt"))

    gzipfile(ex, tableBRICKS2Frags)
    ex.add(tableBRICKS2Frags + ".gz",
           description=set_file_descr(
               "table_frags_in_BRICKS_combined_replicates.txt.gz",
               step=step,
               type="txt"))

    return processed
Beispiel #39
0
def density_to_countsPerFrag(ex,
                             file_dict,
                             groups,
                             assembly,
                             regToExclude,
                             script_path,
                             via='lsf'):
    '''
    Main function to compute normalised counts per fragments from a density file.
    '''
    futures = {}
    results = {}
    for gid, group in groups.iteritems():
        reffile = file_dict['lib'][gid]
        futures[gid] = {}
        results[gid] = {}
        for rid, run in group['runs'].iteritems():
            density_file = file_dict['4cseq']['density_files'][gid][rid]
            gm_futures = []
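            # for each chromosome, average the density track over the library fragments
            # with gfminer's score_by_feature (falls back to the whole library file when
            # no per-chromosome bed exists)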
            for ch in assembly.chrnames:
                chref = os.path.join(reffile, ch + ".bed.gz")
                if not (os.path.exists(chref)): chref = reffile
                #            features = track(chref,'bed')
                #            outbed.write(gMiner.stream.mean_score_by_feature(
                #                    scores.read(selection=ch),
                #                    features.read(selection=ch)), mode='append')
                bedfile = unique_filename_in() + ".bed"
                gfminer_job = {
                    "operation":
                    "score_by_feature",
                    "output":
                    bedfile,
                    "datatype":
                    "qualitative",
                    "args":
                    "'" + json.dumps({
                        "trackScores": density_file,
                        "trackFeatures": chref,
                        "chromosome": ch
                    }) + "'"
                }
                gm_futures.append((gfminer_run.nonblocking(ex,
                                                           gfminer_job,
                                                           via=via), bedfile))
            outsql = unique_filename_in() + ".sql"
            sqlouttr = track(outsql,
                             chrmeta=assembly.chrmeta,
                             info={'datatype': 'quantitative'},
                             fields=['start', 'end', 'score'])
            outbed_all = []
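            # collect the per-chromosome bed outputs, keep fragments with score > 0.01,
            # and write them chromosome by chromosome into a single sql track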
            for n, f in enumerate(gm_futures):
                f[0].wait()
                fout = f[1]
                if not (os.path.exists(fout)):
                    time.sleep(60)
                    touch(ex, fout)
                outbed_all.append(fout)
                outbed = track(fout, chrmeta=assembly.chrmeta)
                sqlouttr.write(outbed.read(
                    fields=['start', 'end', 'score'],
                    selection={'score': (0.01, sys.maxint)}),
                               chrom=assembly.chrnames[n])
            sqlouttr.close()
            countsPerFragFile = unique_filename_in() + ".bed"
            countsPerFragFile = cat(outbed_all, out=countsPerFragFile)
            results[gid][rid] = [countsPerFragFile, outsql]
            FragFile = unique_filename_in()
            touch(ex, FragFile)
            futures[gid][rid] = (FragFile,
                                 segToFrag.nonblocking(ex,
                                                       countsPerFragFile,
                                                       regToExclude[gid],
                                                       script_path,
                                                       via=via,
                                                       stdout=FragFile,
                                                       memory=4))

    def _parse_select_frag(stream):
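        # keep only valid, non-repeat fragments from the segToFrag output and yield
        # (chr, start, end, score) tuples (0-based start)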
        for s in stream:
            sr = s.strip().split('\t')
            if 'IsValid' in sr[2] and not any(
                [w in sr[8] for w in ['_and_', 'BothRepeats', 'notValid']]):
                patt = re.search(r'([^:]+):(\d+)-(\d+)', sr[1])
                if patt:
                    coord = patt.groups()
                    #                    if float(sr[11])>0.0:
                    yield (coord[0], int(coord[1]) - 1, int(coord[2]),
                           float(sr[11]))

    for gid, dict_gid in futures.iteritems():
        for rid, res in dict_gid.iteritems():
            res[1].wait()
            touch(ex, res[0])
            segOut = open(res[0], "r")
            resBedGraph = unique_filename_in() + ".sql"
            sqlTr = track(resBedGraph,
                          fields=['start', 'end', 'score'],
                          info={'datatype': 'quantitative'},
                          chrmeta=assembly.chrmeta)
            sqlTr.write(_parse_select_frag(segOut),
                        fields=['chr', 'start', 'end', 'score'])
            sqlTr.close()
            segOut.close()
            results[gid][rid].extend([res[0], resBedGraph])
    return results  #[countsPerFrag_allBed, countsPerFrag_selectSql, segToFrag_out, segToFrag_sql]
Beispiel #40
0
def dnaseseq_workflow( ex, job, assembly, logfile=sys.stdout, via='lsf' ):
    """
    This workflow performs the following steps:

      * BAM files from replicates within the same group are merged
      * MACS is called to identify enriched regions (only peak summit +- 300 will be used); this can be bypassed by providing a bed file for any group
      * Wellington is called to identify footprints within these enriched regions
      * If a list of motifs is provided (by group), footprints are scanned and motif occurrences (log-likelihood ratio > 0) are recorded in a bed file
      * Average DNAse profiles around motifs are plotted

    """
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    supdir = os.path.split(ex.remote_working_directory)[0]
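    # split groups into tests and controls; merge bam files when a group has several
    # runs; a bed file given for a group (absolute path or relative to the parent of
    # the working directory) bypasses MACS for that group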
    for gid,mapped in job.files.iteritems():
        group_name = job.groups[gid]['name']
        if not isinstance(mapped,dict):
            raise TypeError("Files values must be dictionaries with keys *run_ids* or 'bam'.")
        if 'bam' in mapped: mapped = {'_': mapped}
        if len(mapped)>1:
            bamfile = merge_bam(ex, [m['bam'] for m in mapped.values()])
            index = index_bam(ex, bamfile)
        else:
            bamfile = mapped.values()[0]['bam']
        if job.groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid,group_name))
        else:
            if os.path.exists(job.groups[gid].get('bedfile','null')):
                bedfile = job.groups[gid]['bedfile']
            elif os.path.exists(os.path.join(supdir,job.groups[gid].get('bedfile','null'))):
                bedfile = os.path.join(supdir,job.groups[gid]['bedfile'])
            else:
                bedfile = None
            tests.append((bedfile,bamfile))
            names['tests'].append((gid,group_name))
    if len(controls)<1:
        controls = [None]
        names['controls'] = [(0,None)]
    tests = macs_bedfiles( ex, assembly.chrmeta, tests, controls, names, 
                           job.options.get('macs_args',["--keep-dup","10"]), via, logfile )
    bedlist = run_wellington(ex, tests, names, assembly, via, logfile)
######################### Motif scanning / plotting
    if any([gr.get('motif') != 'null' and gr.get('motif') 
            for gr in job.groups.values()]):
        motifbeds = motif_scan( ex, bedlist, assembly, job.groups, via, logfile )
        siglist = dict((gid[0],[]) for gid in names['tests'])
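        # siglist collects, per test group, the signal tracks used for the footprint
        # profile plots; control signals are appended to every test group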
        for gid,mapped in job.files.iteritems():
            wig = []
            suffixes = ["fwd","rev"]
            merge_strands = int(job.options.get('merge_strands',-1))
            read_extension = int(job.options.get('read_extension') or -1)
            make_wigs = merge_strands >= 0 or read_extension != 1
            for m in mapped.values():
                if make_wigs or not('wig' in m) or len(m['wig'])<2:
                    output = mapseq.parallel_density_sql( ex, m["bam"], assembly.chrmeta,
                                                          nreads=m["stats"]["total"],
                                                          merge=-1, read_extension=1,
                                                          convert=False,
                                                          b2w_args=[], via=via )
                    wig.append(dict((s,output+s+'.sql') for s in suffixes))
                else:
                    wig.append(m['wig'])
            if len(wig) > 1:
                wig[0] = dict((s,merge_sql(ex, [x[s] for x in wig], via=via)) 
                              for s in suffixes)
            _trn = job.groups[gid]['name']+"_%s"
            if job.groups[gid]['control']:
                for s,w in wig[0].iteritems():
                    for _g in siglist.keys():
                        siglist[_g].append(track(w,info={'name': _trn%s}))
            else:
                siglist[gid].extend([track(w,info={'name': _trn%s})
                                     for s,w in wig[0].iteritems()])
        plot_files = plot_footprint_profile( ex, motifbeds, siglist, 
                                             assembly.chrnames, 
                                             job.groups, logfile )
        for gid, flist in plot_files.iteritems():
            gname = job.groups[gid]['name']
            plotall = unique_filename_in()
            touch( ex, plotall )
            ex.add(plotall, description=set_file_descr(gname+'_footprints_plots', 
                                                       type='none', view='admin',
                                                       step='motifs', groupId=gid))
            ex.add(flist['pdf'], description=set_file_descr(gname+'_footprints_plots.pdf', 
                                                            type='pdf', step='motifs', 
                                                            groupId=gid),
                   associate_to_filename=plotall, template='%s.pdf')
            tarname = unique_filename_in()
            tarfh = tarfile.open(tarname, "w:gz")
            for mname,matf in flist['mat']:
                tarfh.add(matf, arcname="%s_%s.txt" % (gname,mname))
            tarfh.close()
            ex.add( tarname, description=set_file_descr(gname+'_footprints_plots.tar.gz',
                                                        type='tar', step='motifs', groupId=gid),
                    associate_to_filename=plotall, template='%s.tar.gz')
    logfile.write("\nDone.\n ");logfile.flush()
    return 0
Beispiel #41
0
def add_macs_results(ex,
                     read_length,
                     genome_size,
                     bamfile,
                     ctrlbam=None,
                     name=None,
                     poisson_threshold=None,
                     alias=None,
                     macs_args=None,
                     via='lsf'):
    """Calls the ``macs`` function on each possible pair
    of test and control bam files and adds
    the respective outputs to the execution repository.

    ``macs`` options can be controlled with `macs_args`.
    If a dictionary of Poisson thresholds for each sample is given, then the enrichment bounds ('-m' option)
    are computed from them otherwise the default is '-m 10,100'.

    Returns the set of file prefixes.
    """
    if not (isinstance(bamfile, list)):
        bamfile = [bamfile]
    if not (isinstance(ctrlbam, list)):
        ctrlbam = [ctrlbam]
    if poisson_threshold is None:
        poisson_threshold = {}
    if macs_args is None:
        macs_args = []
    futures = {}
    rl = read_length
    for i, bam in enumerate(bamfile):
        n = name['tests'][i]
        if poisson_threshold.get(n) > 0:
            low = (poisson_threshold.get(n) + 1) * 5
            enrich_bounds = str(min(30, low)) + "," + str(10 * low)
        else:
            enrich_bounds = "10,100"
        if not ("-m" in macs_args): macs_args += ["-m", enrich_bounds]
        if isinstance(read_length, list): rl = read_length[i]
        for j, cam in enumerate(ctrlbam):
            m = name['controls'][j]
            nm = (n, m)
            futures[nm] = macs.nonblocking(ex,
                                           rl,
                                           genome_size,
                                           bam,
                                           cam,
                                           args=macs_args,
                                           via=via,
                                           memory=12)
    prefixes = {}
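    # for each (test, control) pair: wait for MACS, then register the prefix, the
    # peaks .xls, gzipped peaks/summits bed files and, when a control was used,
    # the negative peaks .xls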
    for n, f in futures.iteritems():
        p = f.wait()
        prefixes[n] = p
        macs_descr0 = {
            'step': 'macs',
            'type': 'none',
            'view': 'admin',
            'groupId': n[0][0]
        }
        macs_descr1 = {'step': 'macs', 'type': 'xls', 'groupId': n[0][0]}
        macs_descr2 = {
            'step': 'macs',
            'type': 'bed',
            'groupId': n[0][0],
            'ucsc': '1'
        }
        filename = "_vs_".join([x[1] for x in n if x[0]])
        touch(ex, p)
        ex.add(p,
               description=set_file_descr(filename, **macs_descr0),
               alias=alias)
        ex.add(p + "_peaks.xls",
               description=set_file_descr(filename + "_peaks.xls",
                                          **macs_descr1),
               associate_to_filename=p,
               template='%s_peaks.xls')
        bedzip = gzip.open(p + "_peaks.bed.gz", 'wb')
        bedzip.write("track name='" + filename + "_macs_peaks'\n")
        with open(p + "_peaks.bed") as bedinf:
            [bedzip.write(l) for l in bedinf]
        bedzip.close()
        ex.add(p + "_peaks.bed.gz",
               description=set_file_descr(filename + "_peaks.bed.gz",
                                          **macs_descr2),
               associate_to_filename=p,
               template='%s_peaks.bed.gz')
        bedzip = gzip.open(p + "_summits.bed.gz", 'wb')
        bedzip.write("track name='" + filename + "_macs_summits'\n")
        with open(p + "_summits.bed") as bedinf:
            [bedzip.write(l) for l in bedinf]
        bedzip.close()
        ex.add(p + "_summits.bed.gz",
               description=set_file_descr(filename + "_summits.bed.gz",
                                          **macs_descr2),
               associate_to_filename=p,
               template='%s_summits.bed.gz')
        if n[1][0]:
            ex.add(p + "_negative_peaks.xls",
                   description=set_file_descr(filename + "_negative_peaks.xls",
                                              **macs_descr0),
                   associate_to_filename=p,
                   template='%s_negative_peaks.xls')
    return prefixes