def createLibrary(ex, assembly_or_fasta, params, url=GlobalHtsUrl, via='local'):
    """
    Main call to create the library
    """
    if len(params['primary']) < 2:
        print('Some parameters are missing, cannot create the library')
        print('primary='+params['primary']+" ; "+'secondary='+params['secondary'])
        return [None, None, None, None]

    if not isinstance(assembly_or_fasta, genrep.Assembly):
        assembly_or_fasta = genrep.Assembly( ex=ex, fasta=assembly_or_fasta )
    chrnames = assembly_or_fasta.chrnames
    chrom_map = dict((v['ac'], k) for k, v in assembly_or_fasta.chrmeta.iteritems())
    allfiles = assembly_or_fasta.fasta_by_chrom  # assembly_or_fasta.untar_genome_fasta()

    # Find restriction enzyme occurrences per chromosome (one job per chromosome)
    libfiles = dict((c, getRestEnzymeOccAndSeq.nonblocking( ex, f,
                                                            params['primary'], params['secondary'],
                                                            params['length'], params['type'],
                                                            via=via ))
                    for c, f in allfiles.iteritems())
    resfile = unique_filename_in()
    os.mkdir(resfile)
    bedfiles = {}
    for chrom, future in libfiles.iteritems():
        libfiles[chrom] = future.wait()
        if not os.path.getsize(libfiles[chrom][1]) > 0:
            time.sleep(60)
            touch(ex, libfiles[chrom][1])
        bedfiles[chrom] = parse_fragFile(libfiles[chrom][1], chrom_map)
    rescov = coverageInRepeats(ex, bedfiles, params['species'], outdir=resfile, via=via)
    bedchrom = [os.path.join(resfile, chrom+".bed") for chrom in chrnames]
    cat(bedchrom, out=resfile+".bed")
    gzipfile(ex, [resfile+".bed"]+bedchrom)
#    resfile_sql = resfile+".sql"
#    track.convert((resfile,'bed'),(resfile_sql,'sql'),assembly=params['species'])
    enz_list = []
    infos_lib = { 'assembly_name':  params['species'],
                  'enzyme1_id':     getEnzymeSeqId(params['primary'], True, enz_list, url),
                  'enzyme2_id':     getEnzymeSeqId(params['secondary'], True, enz_list, url),
                  'segment_length': params['length'],
                  'type':           params['type'],
                  'filename':       resfile }
    return [ libfiles, bedfiles, resfile, infos_lib ]
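
# Hedged usage sketch (not part of the pipeline): one way createLibrary might be
# invoked from a bein execution. The MiniLIMS name, enzyme sites, segment length
# and library type below are illustrative assumptions, not values taken from this module.
def _example_createLibrary_call():
    from bein import execution, MiniLIMS
    M = MiniLIMS("c4seq_minilims")               # hypothetical MiniLIMS repository
    params = {'primary':   'AAGCTT',             # primary restriction site (e.g. HindIII)
              'secondary': 'GATC',               # secondary restriction site (e.g. DpnII)
              'length':    30,                   # segment length passed to the library scripts
              'type':      'typeI',              # library type string, forwarded as-is
              'species':   'mm9'}                # assembly name
    with execution(M) as ex:
        libfiles, bedfiles, resfile, infos_lib = createLibrary(ex, 'mm9', params, via='local')
    return infos_lib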
def count_reads(self, bamfiles, gtf):
    self.write_log("* Counting reads")

    # Count reads on genes and transcripts with "rnacounter"
    ncond = len(self.conditions)
    tablenames = [None]*ncond
    futures = [None]*ncond
    max_rlen = 0
    counter_options = ["--nh"]
    for bam in bamfiles:
        sam = pysam.Samfile(bam, 'rb')
        max_rlen = max(max_rlen, sam.next().rlen)
        sam.close()
    counter_options += ["--exon_cutoff", str(max_rlen)]
    bwt_args = self.job.options.get('map_args', {}).get('bwt_args', [])
    #if not "--local" in bwt_args:
    #    counter_options += ["--nh"]
    if hasattr(self.assembly, "fasta_origin") or self.assembly.intype == 2:
        counter_options += ["--type", "transcripts", "--method", "raw"]
    else:
        counter_options += ["--type", "genes,transcripts", "--method", "raw,nnls"]
    if self.stranded:
        counter_options += ["--stranded"]
    for i, c in enumerate(self.conditions):
        tablenames[i] = unique_filename_in()
        futures[i] = rnacounter.nonblocking(self.ex, bamfiles[i], gtf, stdout=tablenames[i],
                                            via=self.via, options=counter_options)

    # Put samples together
    for i, c in enumerate(self.conditions):
        try:
            futures[i].wait()
        except Exception as err:
            self.write_debug("Counting failed: %s." % str(err))
            raise err
        if futures[i] is None:
            self.write_debug("Counting failed.")
            raise ValueError("Counting failed.")
    if len(tablenames) > 1:
        joined = unique_filename_in()
        rnacounter_join.nonblocking(self.ex, tablenames, stdout=joined, via=self.via).wait()
    else:
        joined = tablenames[0]

    # Split genes and transcripts into separate files
    genes_filename = unique_filename_in()
    trans_filename = unique_filename_in()
    genes_file = open(genes_filename, "wb")
    trans_file = open(trans_filename, "wb")
    if self.stranded:
        genes_anti_filename = unique_filename_in()
        trans_anti_filename = unique_filename_in()
        genes_anti_file = open(genes_anti_filename, "wb")
        trans_anti_file = open(trans_anti_filename, "wb")
    with open(joined) as jfile:
        header = jfile.readline()
        hconds = ["counts."+c for c in self.conditions] + ["rpkm."+c for c in self.conditions]
        hinfo = header.strip().split('\t')[2*ncond+1:]
        header = '\t'.join(["ID"] + hconds + hinfo) + '\n'
        genes_file.write(header)
        trans_file.write(header)
        type_idx = header.split('\t').index("Type")
        if self.stranded:
            genes_anti_file.write(header)
            trans_anti_file.write(header)
            sense_idx = header.split('\t').index("Sense")
            for line in jfile:
                L = line.split('\t')
                ftype = L[type_idx].lower()
                sense = L[sense_idx].lower()
                if ftype == 'gene':
                    if sense == 'antisense':
                        genes_anti_file.write(line)
                    else:
                        genes_file.write(line)
                elif ftype == 'transcript':
                    if sense == 'antisense':
                        trans_anti_file.write(line)
                    else:
                        trans_file.write(line)
        else:
            for line in jfile:
                L = line.split('\t')
                ftype = L[type_idx].lower()
                if ftype == 'gene':
                    genes_file.write(line)
                elif ftype == 'transcript':
                    trans_file.write(line)
    genes_file.close()
    trans_file.close()
    if self.stranded:
        genes_anti_file.close()
        trans_anti_file.close()

    # Keep intermediate tables
    for i, c in enumerate(self.conditions):
        #shutil.copy(tablenames[i], "../counts%d.txt"%i)
        descr = set_file_descr(self.conditions[i]+'_'+tablenames[i]+'.gz',
                               type='txt', step='pileup', view='admin')
        gzipfile(self.ex, tablenames[i])
        self.ex.add(tablenames[i]+'.gz', description=descr)

    if self.stranded:
        count_files = {'genes': genes_filename, 'transcripts': trans_filename,
                       'genes_anti': genes_anti_filename, 'transcripts_anti': trans_anti_filename}
    else:
        count_files = {'genes': genes_filename, 'transcripts': trans_filename}
    return count_files
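
# Hedged helper sketch: reading back one of the tables written by count_reads().
# It assumes only what the code above writes: a tab-separated file whose header
# starts with "ID", then one "counts.<condition>" and one "rpkm.<condition>"
# column per condition, followed by annotation columns. Not part of the pipeline.
def _example_read_count_table(filename, conditions):
    counts = {}
    with open(filename) as fh:
        header = fh.readline().rstrip('\n').split('\t')
        cidx = [header.index("counts."+c) for c in conditions]
        for line in fh:
            fields = line.rstrip('\n').split('\t')
            counts[fields[0]] = [float(fields[i]) for i in cidx]
    return counts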
def c4seq_workflow( ex, job, primers_dict, assembly,
                    c4_url=None, script_path='', logfile=sys.stdout, via='lsf' ):
    '''
    Main:

    * open the 4C-seq minilims and create execution
    * 0. get/create the library
    * 1. if necessary, calculate the density file from the bam file (mapseq.parallel_density_sql)
    * 2. calculate the counts per fragment for each density file with gfminer's score_by_feature
    '''
    mapseq_files = job.files
    ### outputs
    processed = {'lib': {}, 'density': {}, '4cseq': {}}
    processed['4cseq'] = {'density_files': {},
                          'countsPerFrag': {},
                          'countsPerFrag_grp': {},
                          'norm': {},
                          'norm_grp': {},
                          'profileCorrection': {},
                          'profileCorrection_grp': {},
                          'smooth_grp': {},
                          'domainogram_grp': {},
                          'bricks2frags': {}}
    # was 'smoothFrag': {}, 'domainogram': {}
    regToExclude = {}
    new_libs = []
    ### options
    run_domainogram = {}
    before_profile_correction = {}
    if not job.options.get('viewpoints_chrs', False):
        out_chromosomes = ','.join([ch for ch in assembly.chrnames])
    else:
        out_chromosomes = ','.join([primers_dict.get(group['name'], {}).get('baitcoord').split(':')[0]
                                    for gid, group in job.groups.iteritems()])
    print "out_chromosomes=" + out_chromosomes + "\n"

    sizeExt = job.options.get('norm_reg', 1000000)
    print "region considered for normalisation: mid viewpoint +/-" + str(sizeExt) + "bps"

    ### do it
    for gid, group in job.groups.iteritems():
        run_domainogram[gid] = group.get('run_domainogram', False)
        if isinstance(run_domainogram[gid], basestring):
            run_domainogram[gid] = (run_domainogram[gid].lower() in ['1', 'true', 'on', 't'])
        before_profile_correction[gid] = group.get('before_profile_correction', False)
        if isinstance(before_profile_correction[gid], basestring):
            before_profile_correction[gid] = (before_profile_correction[gid].lower() in ['1', 'true', 'on', 't'])
        processed['lib'][gid] = get_libForGrp(ex, group, assembly, new_libs, gid, c4_url, via=via)
        #reffile='/archive/epfl/bbcf/data/DubouleDaan/library_Nla_30bps/library_Nla_30bps_segmentInfos.bed'
        processed['4cseq']['density_files'][gid] = {}
        regToExclude[gid] = primers_dict.get(group['name'], {}).get('regToExclude', "").replace('\r', '')
        # if no regToExclude is defined, set it to the bait midpoint +/- 5kb
        if len(regToExclude[gid]) == 0:
            baitcoord_mid = int(0.5 * (int(primers_dict.get(group['name'], {}).get('baitcoord').split(':')[1].split('-')[0])
                                       + int(primers_dict.get(group['name'], {}).get('baitcoord').split(':')[1].split('-')[1])))
            regToExclude[gid] = primers_dict.get(group['name'], {}).get('baitcoord').split(':')[0] \
                                + ':' + str(baitcoord_mid-5000) + '-' + str(baitcoord_mid+5000)
        #print(';'.join([k+"="+v for k,v in primers_dict.get(group['name'],{}).iteritems()]))
        print(primers_dict.get(group['name'], {}))
        print "regToExclude["+str(gid)+"]=" + regToExclude[gid]
        for rid, run in group['runs'].iteritems():
            libname = mapseq_files[gid][rid]['libname']
            if job.options.get('merge_strands') != 0 or not('wig' in mapseq_files[gid][rid]):
                density_file = parallel_density_sql( ex, mapseq_files[gid][rid]['bam'],
                                                     assembly.chrmeta,
                                                     nreads=mapseq_files[gid][rid]['stats']["total"],
                                                     merge=0,
                                                     read_extension=mapseq_files[gid][rid]['stats']['read_length'],
                                                     convert=False,
                                                     via=via )
                density_file += "merged.sql"
                ex.add( density_file,
                        description=set_file_descr("density_file_"+libname+".sql", groupId=gid,
                                                   step="density", type="sql", view='admin', gdv="1") )
            else:
                density_file = mapseq_files[gid][rid]['wig']['merged']
            #density_files.append(density_file)
            processed['4cseq']['density_files'][gid][rid] = density_file

    # back to grp level!
    # not anymore:
    # processed['density'][gid] = merge_sql(ex, density_files, via=via)
    processed['4cseq']['countsPerFrag'] = density_to_countsPerFrag( ex, processed, job.groups, assembly,
                                                                    regToExclude, script_path, via )
    ## access per gid+rid

    futures_norm = {}
    countsPerFrags_bedGraph = {}
    futures_merged_raw = {}
    for gid, group in job.groups.iteritems():
        futures_norm[gid] = {}
        countsPerFrags_bedGraph[gid] = {}
        processed['4cseq']['norm'][gid] = {}
        for rid, run in group['runs'].iteritems():
            normfile = unique_filename_in()
            touch(ex, normfile)
            resfile = unique_filename_in()+".bedGraph"
            resfiles = processed['4cseq']['countsPerFrag'][gid][rid]  # _all.sql
            convert(resfiles[3], resfile)
            countsPerFrags_bedGraph[gid][rid] = resfile

            print "call normFrags: infiles="+resfile+", normfile="+normfile+", baitCoord="+primers_dict[group['name']]['baitcoord']+", sizeExt="+str(sizeExt)+", name="+group['name']+"rep_"+str(rid)+", regToExclude="+regToExclude[gid]+"\n"
            futures_norm[gid][rid] = normFrags.nonblocking( ex, resfile, normfile,
                                                            baitCoord=primers_dict[group['name']]['baitcoord'],
                                                            sizeExt=sizeExt,
                                                            name=group['name']+"rep_"+str(rid),
                                                            regToExclude=regToExclude[gid],
                                                            script_path=script_path, via=via )
            processed['4cseq']['norm'][gid][rid] = normfile

        if len(group) > 1:
            ## merge replicates before normalisation
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName = group['name']+"_raw_mergedRep"
            print "gid="+group['name']
            print "call mergeRep for replicates before normalisation: infiles="+",".join([res_rid for rid, res_rid in countsPerFrags_bedGraph[gid].iteritems()])+", mergedfile="+mergefile+", regToExclude="+regToExclude[gid]+"\n"
            futures_merged_raw[gid] = mergeRep.nonblocking( ex,
                                                            ",".join([res_rid for rid, res_rid in countsPerFrags_bedGraph[gid].iteritems()]),
                                                            mergefile, regToExclude[gid], name=titleName,
                                                            script_path=script_path, via=via, memory=8 )
            processed['4cseq']['countsPerFrag_grp'][gid] = mergefile
        else:
            futures_merged_raw[gid] = None
            processed['4cseq']['countsPerFrag_grp'][gid] = countsPerFrags_bedGraph[gid][0]  # if no replicates, the file we want is the first one

    print "***** profile correction / sample + merge normalised data"
    futures_merged = {}   # per gid
    futures_profcor = {}  # per gid, per rid
    for gid, group in job.groups.iteritems():
        ## run profile correction per run, then merge the runs
        futures_profcor[gid] = {}
        processed['4cseq']['profileCorrection'][gid] = {}
        for rid, run in group['runs'].iteritems():
            # wait for normalisation of all replicates to be finished
            futures_norm[gid][rid].wait()  ## normalised files, per grp, per rep
            normfile = processed['4cseq']['norm'][gid][rid]
            file1 = unique_filename_in()   # track file
            touch(ex, file1)
            file2 = unique_filename_in()   # report file
            touch(ex, file2)
            file3 = unique_filename_in()   # table file
            touch(ex, file3)
            print "call profileCorrection: normfile="+normfile+", baitCoord="+primers_dict[group['name']]['baitcoord']+", name="+group['name']+", file1="+file1+", file2="+file2+", file3="+file3+"\n"
            futures_profcor[gid][rid] = profileCorrection.nonblocking( ex, normfile,
                                                                       primers_dict[group['name']]['baitcoord'],
                                                                       group['name'], file1, file2, file3,
                                                                       script_path, via=via )
            processed['4cseq']['profileCorrection'][gid][rid] = [file1, file2, file3]

        ## merge replicates before profile correction. This needs all normalisations of the given grp
        ## to be finished, which is why it comes after the rid loop.
        if len(group) > 1:
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName = group['name']+"_norm_mergedRep"
            print "gid="+group['name']
            print "call mergeRep: infiles="+",".join([res_rid for rid, res_rid in processed['4cseq']['norm'][gid].iteritems()])+", mergedfile="+mergefile+", regToExclude="+regToExclude[gid]+"\n"
            futures_merged[gid] = mergeRep.nonblocking( ex,
                                                        ",".join([res_rid for rid, res_rid in processed['4cseq']['norm'][gid].iteritems()]),
                                                        mergefile, regToExclude[gid], name=titleName,
                                                        script_path=script_path, via=via, memory=8 )
            processed['4cseq']['norm_grp'][gid] = mergefile
        else:
            futures_merged[gid] = None
            processed['4cseq']['norm_grp'][gid] = processed['4cseq']['norm'][gid][0]  ## if no replicates, the file we want is the first one

    print "***** merge profile corrected data"
    futures_profcor_merged = {}  # per gid
    for gid, group in job.groups.iteritems():
        processed['4cseq']['profileCorrection_grp'][gid] = {}
        for rid, run in group['runs'].iteritems():
            futures_profcor[gid][rid].wait()  ## wait for ProfileCorrection to be finished
        ## merge replicates after profile correction
        if len(group) > 1:
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName = group['name']+"_ProfCor_mergedRep"
            pcfiles = [ processed['4cseq']['profileCorrection'][gid][rid][0]
                        for rid, res_rid in processed['4cseq']['profileCorrection'][gid].iteritems() ]
            print "call mergeRep (for PC tables): infiles="+",".join(pcfiles)+", mergedfile="+mergefile+", regToExclude="+regToExclude[gid]+"\n"
            futures_profcor_merged[gid] = mergeRep.nonblocking( ex,
                                                                ",".join(pcfiles),
                                                                mergefile, regToExclude[gid], name=titleName,
                                                                script_path=script_path, via=via, memory=8 )
            processed['4cseq']['profileCorrection_grp'][gid] = mergefile
        else:
            futures_profcor_merged[gid] = None
            processed['4cseq']['profileCorrection_grp'][gid] = processed['4cseq']['profileCorrection'][gid][0]  ## if no replicates, the file we want is the first one

    print "***** smooth data"
    futures_smoothed = {}
    for gid, group in job.groups.iteritems():
        file1 = unique_filename_in()
        touch(ex, file1)
        file2 = unique_filename_in()
        touch(ex, file2)
        file3 = unique_filename_in()
        touch(ex, file3)
        nFragsPerWin = group['window_size']
        futures_merged_raw[gid].wait()  ## wait for merging of raw_grp to be completed
        futures_smoothed[gid] = ( smoothFragFile.nonblocking( ex, processed['4cseq']['countsPerFrag_grp'][gid],
                                                              nFragsPerWin, group['name'],
                                                              file1, regToExclude[gid],
                                                              script_path=script_path, via=via, memory=6 ), )
        futures_merged[gid].wait()  ## wait for merging of norm_grp to be completed
        futures_smoothed[gid] += ( smoothFragFile.nonblocking( ex, processed['4cseq']['norm_grp'][gid],
                                                               nFragsPerWin, group['name']+"_norm",
                                                               file2, regToExclude[gid],
                                                               script_path=script_path, via=via, memory=6 ), )
        futures_profcor_merged[gid].wait()  # wait for the merging of profile corrected data to be done
        futures_smoothed[gid] += ( smoothFragFile.nonblocking( ex, processed['4cseq']['profileCorrection_grp'][gid],
                                                               nFragsPerWin, group['name']+"_fromProfileCorrected",
                                                               file3, regToExclude[gid],
                                                               script_path=script_path, via=via, memory=6 ), )
        processed['4cseq']['smooth_grp'][gid] = [file1, file2, file3]  # [smoothed before norm, smoothed before PC, smoothed after PC]

    print "***** Domainograms"
    futures_domainograms = {}
    for gid, group in job.groups.iteritems():
        grName = job.groups[gid]['name']
        if run_domainogram[gid]:
            regCoord = regToExclude[gid] or primers_dict[grName]['baitcoord']
            if before_profile_correction[gid]:
                futures_domainograms[gid] = runDomainogram.nonblocking( ex, processed['4cseq']['norm_grp'][gid],
                                                                        grName, regCoord=regCoord, skip=1,
                                                                        script_path=script_path, via=via, memory=15 )
            else:
                futures_domainograms[gid] = runDomainogram.nonblocking( ex, processed['4cseq']['profileCorrection_grp'][gid],
                                                                        grName, regCoord=regCoord.split(':')[0], skip=1,
                                                                        script_path=script_path, via=via, memory=15 )

    ## prepare tar files for domainogram results (if any)
    ## and create "BRICKS to frags" files
    print "***** BRICKS to Frags"
    futures_BRICKS2Frags = {}
    for gid, f in futures_domainograms.iteritems():
        if run_domainogram[gid]:  # if the domainogram has been run
            resFiles = []
            logFile = f.wait()
            start = False
            tarname = job.groups[gid]['name']+"_domainogram.tar.gz"
            res_tar = tarfile.open(tarname, "w:gz")
            futures_BRICKS2Frags[gid] = []
            processed['4cseq']['bricks2frags'][gid] = []
            if logFile is None: continue
            with open(logFile) as f:
                for s in f:
                    s = s.strip()
                    if '####resfiles####' in s:
                        start = True
                    elif start and "RData" not in s:
                        resFiles.append(s)
                        res_tar.add(s)
                    if start and "foundBRICKS" in s:
                        bricks2fragsfile = unique_filename_in()+".bedGraph"
                        touch(ex, bricks2fragsfile)
                        futures_BRICKS2Frags[gid] += [ BRICKSToFrag.nonblocking( ex, s,
                                                                                 processed['4cseq']['norm_grp'][gid],
                                                                                 bricks2fragsfile,
                                                                                 script_path=script_path, via=via, memory=4 ) ]
                        processed['4cseq']['bricks2frags'][gid] += [ bricks2fragsfile ]
            res_tar.close()
            processed['4cseq']['domainogram_grp'][gid] = resFiles + [tarname]

    ############### prepare tables for global results
    print "***** combine results into tables"
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        for rid, run in group['runs'].iteritems():
            allNames += [ group['name']+"_rep"+str(rid)+"_norm", group['name']+"_rep"+str(rid)+"_fit" ]
            allFiles += [ processed['4cseq']['profileCorrection'][gid][rid][2] ]
            allRegToExclude += [ regToExclude[gid] ]
    tablePC = unique_filename_in()+".txt"
    print("***will call makeTable with:")
    print(",".join(allFiles))
    print("resfile="+tablePC)
    print(",".join(allNames))
    touch(ex, tablePC)
    #regToExclude[gid]
    futures_tables = ( makeTable.nonblocking( ex, ",".join(allFiles), tablePC, ",".join(allNames),
                                              idCols="4,5", all_regToExclude=','.join(allRegToExclude),
                                              script_path=script_path, via=via, memory=8 ), )

    # wait for all smoothing to be done
    for gid, fg in futures_smoothed.iteritems():
        for f in fg:
            f.wait()

    ## make Table raw/smoothed_raw
    print("** make Table raw/smoothed_raw")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        futures_merged_raw[gid].wait()
        allNames += [ group['name']+"_raw", group['name']+"_rawSmoothed" ]
        allFiles += [ processed['4cseq']['countsPerFrag_grp'][gid], processed['4cseq']['smooth_grp'][gid][0] ]
        allRegToExclude += [ 'NA', regToExclude[gid] ]
    tableSmoothedRaw_grp = unique_filename_in()+".txt"
    touch(ex, tableSmoothedRaw_grp)
    futures_tables += ( makeTable.nonblocking( ex, ",".join(allFiles), tableSmoothedRaw_grp, ",".join(allNames),
                                               idCols="4", out_chromosomes=out_chromosomes,
                                               all_regToExclude=','.join(allRegToExclude),
                                               script_path=script_path, via=via, memory=8 ), )

    ## make Table norm/smoothed_norm before PC
    print("** make Table norm/smoothed_norm before PC")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        allNames += [ group['name']+"_norm", group['name']+"_smoothed" ]
        allFiles += [ processed['4cseq']['norm_grp'][gid], processed['4cseq']['smooth_grp'][gid][1] ]
        allRegToExclude += [ regToExclude[gid], regToExclude[gid] ]
    tableSmoothed_grp = unique_filename_in()+".txt"
    touch(ex, tableSmoothed_grp)
    futures_tables += ( makeTable.nonblocking( ex, ",".join(allFiles), tableSmoothed_grp, ",".join(allNames),
                                               idCols="4", out_chromosomes=out_chromosomes,
                                               all_regToExclude=','.join(allRegToExclude),
                                               script_path=script_path, via=via, memory=8 ), )

    ## make Table norm/smoothed_norm after PC
    print("** make Table norm/smoothed_norm after PC")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        allNames += [ group['name']+"_normPC", group['name']+"_smoothedPC" ]
        allFiles += [ processed['4cseq']['profileCorrection_grp'][gid], processed['4cseq']['smooth_grp'][gid][2] ]
        allRegToExclude += [ regToExclude[gid], regToExclude[gid] ]
    tableSmoothedPC_grp = unique_filename_in()+".txt"
    touch(ex, tableSmoothedPC_grp)
    futures_tables += ( makeTable.nonblocking( ex, ",".join(allFiles), tableSmoothedPC_grp, ",".join(allNames),
                                               idCols="4", out_chromosomes=out_chromosomes,
                                               all_regToExclude=','.join(allRegToExclude),
                                               script_path=script_path, via=via, memory=8 ), )

    ## combine BRICKS2Frags files
    allNames = []
    allFiles = []
    for gid, fg in futures_BRICKS2Frags.iteritems():
        for f in fg:
            f.wait()
        allNames += [ job.groups[gid]['name']+"_BRICKSpval" ]
        cat_bricks2frags = unique_filename_in()+".txt"
        print ','.join(processed['4cseq']['bricks2frags'][gid])
        cat_bricks2frags = cat(processed['4cseq']['bricks2frags'][gid], out=cat_bricks2frags)
        allFiles += [ cat_bricks2frags ]
    for gid, fg in futures_smoothed.iteritems():
        for f in fg:
            f.wait()
    tableBRICKS2Frags = unique_filename_in()+".txt"
    touch(ex, tableBRICKS2Frags)
    futures_tables += ( makeTable.nonblocking( ex, ",".join(allFiles), tableBRICKS2Frags, ",".join(allNames),
                                               idCols="4", out_chromosomes=out_chromosomes, defVal="NA",
                                               script_path=script_path, via=via, memory=8 ), )

    for f in futures_tables:
        f.wait()

    ################ Add everything to minilims below!
step = "density" for gid in processed['4cseq']['density_files'].keys(): for rid, sql in processed['4cseq']['density_files'][gid].iteritems(): fname = "density_file_"+job.groups[gid]['name']+"_merged_rep"+str(rid) ex.add( sql, description=set_file_descr( fname+".sql", groupId=gid,step=step,type="sql",gdv="1" ) ) wig = unique_filename_in()+".bw" convert( sql, wig ) ex.add( wig, description=set_file_descr( fname+".bw", groupId=gid,step=step,type="bigWig",ucsc="1") ) step = "counts_per_frag" #was _norm_counts_per_frags # before normalisation process, per replicate for gid in processed['4cseq']['countsPerFrag'].keys(): for rid, resfiles in processed['4cseq']['countsPerFrag'][gid].iteritems(): fname = "meanScorePerFeature_"+job.groups[gid]['name']+"_rep"+str(rid) ex.add( resfiles[1], description=set_file_descr( fname+".sql", groupId=gid,step=step,type="sql",view="admin",gdv='1')) #gzipfile(ex,resfiles[0]) #ex.add( resfiles[0]+".gz", description=set_file_descr( fname+".bed.gz", # groupId=gid,step=step,type="bed",view="admin" )) fname = "segToFrag_"+job.groups[gid]['name']+"_rep"+str(rid) ex.add( resfiles[3], description=set_file_descr( fname+"_all.sql", groupId=gid,step=step,type="sql", comment="all informative frags - null included" )) trsql = track(resfiles[3]) bwig = unique_filename_in()+".bw" trwig = track(bwig,chrmeta=trsql.chrmeta) trwig.write(trsql.read(fields=['chr','start','end','score'], selection={'score':(0.01,sys.maxint)})) trwig.close() ex.add( bwig, set_file_descr(fname+".bw",groupId=gid,step=step,type="bigWig",ucsc='1')) ## add segToFrags before normalisation futures_merged_raw[gid].wait() trbedgraph = track(removeNA(processed['4cseq']['countsPerFrag_grp'][gid]),format='bedgraph') bwig = unique_filename_in()+".bw" trwig = track(bwig,chrmeta=assembly.chrmeta) trwig.write(trbedgraph.read(fields=['chr','start','end','score'], selection={'score':(0.01,sys.maxint)})) trwig.close() fname = "segToFrag_"+job.groups[gid]['name'] ex.add( bwig, description=set_file_descr( fname+".bw", groupId=gid,step=step,type="bigWig", comment="segToFrag file before normalisation" )) step = "norm_counts_per_frags" # after new normalisation process, combined replicates for gid, resfile in processed['4cseq']['norm_grp'].iteritems(): fname = "normalised_scorePerFeature_"+job.groups[gid]['name'] gzipfile(ex,resfile) ex.add( resfile+".gz", description=set_file_descr( fname+".bedGraph.gz", groupId=gid,step=step, type="bedGraph",ucsc='1')) # norm files, per replicates (might be removed) for gid, dict_gid in processed['4cseq']['norm'].iteritems(): for rid, resfile in dict_gid.iteritems(): fname = "normalised_scorePerFeature_"+job.groups[gid]['name']+"_rep"+str(rid) gzipfile(ex,resfile) ex.add(resfile+".gz", description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1')) step = "profile_correction" # Profile corrected data, combined replicates for gid, profileCorrectedFile in processed['4cseq']['profileCorrection_grp'].iteritems(): fname = "segToFrag_"+job.groups[gid]['name']+"_profileCorrected" gzipfile(ex,profileCorrectedFile) ex.add( profileCorrectedFile+".gz", description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1')) # Profile corrected, per replicate (might be removed) for gid, dict_gid in processed['4cseq']['profileCorrection'].iteritems(): for rid, resfiles in dict_gid.iteritems(): # profileCorrectedFile = resfiles[0] reportProfileCorrection = resfiles[1] fname = 
"segToFrag_"+job.groups[gid]['name']+"_profileCorrected_rep"+str(rid) # gzipfile(ex,profileCorrectedFile) # ex.add( profileCorrectedFile+".gz", # description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1')) ex.add( reportProfileCorrection, description=set_file_descr(fname+".pdf", groupId=gid,step=step,type="pdf")) step = "smoothing" for gid, resfiles in processed['4cseq']['smooth_grp'].iteritems(): rawSmoothFile = resfiles[0] smoothFile = resfiles[1] afterProfileCorrection = resfiles[2] nFrags = str(job.groups[gid]['window_size']) ## smoothed file before normalisation fname = "segToFrag_"+job.groups[gid]['name']+"_smoothed_"+nFrags+"FragsPerWin.bedGraph.gz" gzipfile(ex,rawSmoothFile) ex.add(rawSmoothFile+".gz", description=set_file_descr(fname,groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1')) ## smoothed file after normalisation, before Profile correction fname = "segToFrag_"+job.groups[gid]['name']+"_norm_smoothed_"+nFrags+"FragsPerWin.bedGraph.gz" gzipfile(ex,smoothFile) ex.add(smoothFile+".gz", description=set_file_descr(fname,groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1')) ## smoothed file after normalisation, after Profile correction fname = "segToFrag_"+job.groups[gid]['name']+"_profileCorrected_smoothed_"+nFrags+"FragsPerWin.bedGraph.gz" gzipfile(ex,afterProfileCorrection) ex.add(afterProfileCorrection+".gz", description=set_file_descr(fname,groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1')) step = "domainograms" for gid, resfiles in processed['4cseq']['domainogram_grp'].iteritems(): tarFile = resfiles.pop() fname = job.groups[gid]['name']+"_domainogram.tar.gz" ex.add(tarFile, description=set_file_descr(fname, groupId=gid,step=step,type="tgz")) for s in resfiles: if s[-8:] == "bedGraph": gzipfile(ex,s) s += ".gz" ex.add( s, description=set_file_descr( s, groupId=gid,step=step,type="bedGraph",ucsc="1",gdv="1")) step = "combined_results" gzipfile(ex,tableSmoothedRaw_grp) ex.add(tableSmoothedRaw_grp+".gz", description=set_file_descr("table_segToFrags_smoothed_combined_replicates.txt.gz",step=step,type="txt")) gzipfile(ex,tableSmoothed_grp) ex.add(tableSmoothed_grp+".gz", description=set_file_descr("table_normalised_smoothed_combined_replicates.txt.gz",step=step,type="txt")) gzipfile(ex,tableSmoothedPC_grp) ex.add(tableSmoothedPC_grp+".gz", description=set_file_descr("table_profileCorrected_smoothed_combined_replicates.txt.gz",step=step,type="txt")) gzipfile(ex,tablePC) ex.add(tablePC+".gz", description=set_file_descr("table_normalised_fit_per_replicates.txt.gz",step=step,type="txt")) gzipfile(ex,tableBRICKS2Frags) ex.add(tableBRICKS2Frags+".gz", description=set_file_descr("table_frags_in_BRICKS_combined_replicates.txt.gz",step=step,type="txt")) return processed
def chipseq_workflow( ex, job_or_dict, assembly, script_path='', logfile=sys.stdout, via='lsf' ):
    """Runs a chipseq workflow over bam files obtained by mapseq.
    Will optionally run ``macs`` and 'run_deconv'.

    :param ex: a 'bein' execution environment to run jobs in,

    :param job_or_dict: a 'Frontend' 'job' object, or a dictionary with keys 'groups',
        'files' and 'options' if applicable,

    :param assembly: a genrep.Assembly object,

    :param script_path: only needed if 'run_deconv' is in the job options; must point to
        the location of the R scripts.

    Default ``macs`` parameters (overridden by ``job_or_dict['options']['macs_args']``) are set as follows:

    * ``'-bw'``: 200 ('bandwidth')

    * ``'-m'``: 10,100 ('minimum and maximum enrichments relative to background or control')

    The enrichment bounds will be computed from a Poisson threshold *T*, if available,
    as *(min(30,5*(T+1)),50*(T+1))*.

    Returns a tuple of a dictionary with keys *group_id* from the job groups, *macs* and
    *deconv* if applicable, with file description dictionaries as values, and a dictionary
    of *group_ids* to *names* used in file descriptions.
    """
    options = {}
    if logfile is None:
        logfile = sys.stdout
    if isinstance(job_or_dict, frontend.Job):
        options = job_or_dict.options
        groups = job_or_dict.groups
        mapseq_files = job_or_dict.files
    elif isinstance(job_or_dict, dict) and 'groups' in job_or_dict:
        if 'options' in job_or_dict:
            options = job_or_dict['options']
        groups = job_or_dict['groups']
        for gid in groups.keys():
            if not('name' in groups[gid]):
                groups[gid]['name'] = gid
        mapseq_files = job_or_dict.get('files', {})
    else:
        raise TypeError("job_or_dict must be a frontend.Job object or a dictionary with key 'groups'.")
    merge_strands = int(options.get('merge_strands', -1))
    suffixes = ["fwd", "rev"]
    peak_deconvolution = options.get('peak_deconvolution', False)
    if isinstance(peak_deconvolution, basestring):
        peak_deconvolution = peak_deconvolution.lower() in ['1', 'true', 't']
    run_meme = options.get('run_meme', False)
    if isinstance(run_meme, basestring):
        run_meme = run_meme.lower() in ['1', 'true', 't']
    macs_args = options.get('macs_args', ["--bw", "200"])
    b2w_args = options.get('b2w_args', [])
    if not(isinstance(mapseq_files, dict)):
        raise TypeError("Mapseq_files must be a dictionary.")
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    read_length = []
    p_thresh = {}
    for gid, mapped in mapseq_files.iteritems():
        group_name = groups[gid]['name']
        if not(isinstance(mapped, dict)):
            raise TypeError("Mapseq_files values must be dictionaries with keys *run_ids* or 'bam'.")
        if 'bam' in mapped:
            mapped = {'_': mapped}
        futures = {}
        ptruns = []
        for k in mapped.keys():
            if not 'libname' in mapped[k]:
                mapped[k]['libname'] = group_name+"_"+str(k)
            if not 'stats' in mapped[k]:
                futures[k] = mapseq.bamstats.nonblocking( ex, mapped[k]["bam"], via=via )
            if mapped[k].get('poisson_threshold', -1) > 0:
                ptruns.append(mapped[k]['poisson_threshold'])
        if len(ptruns) > 0:
            p_thresh[group_name] = sum(ptruns)/len(ptruns)
        for k in futures.keys():
            mapped[k]['stats'] = futures[k].wait()
        if len(mapped) > 1:
            bamfile = mapseq.merge_bam(ex, [m['bam'] for m in mapped.values()])
        else:
            bamfile = mapped.values()[0]['bam']
        if groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid, group_name))
        else:
            tests.append(bamfile)
            names['tests'].append((gid, group_name))
        read_length.append(mapped.values()[0]['stats']['read_length'])
        genome_size = mapped.values()[0]['stats']['genome_size']
    if len(controls) < 1:
        controls = [None]
        names['controls'] = [(0, None)]
    logfile.write("Starting MACS.\n");logfile.flush()
    processed = {'macs': add_macs_results( ex, read_length, genome_size,
                                           tests, ctrlbam=controls, name=names,
                                           poisson_threshold=p_thresh,
                                           macs_args=macs_args, via=via ) }
    logfile.write("Done MACS.\n");logfile.flush()
    peak_list = {}
    chrlist = assembly.chrmeta
    ## select only peaks with p-val <= 1e-0.6 = .25 => score = -10log10(p) >= 6
    _select = {'score': (6, sys.maxint)}
    _fields = ['chr', 'start', 'end', 'name', 'score']
    for i, name in enumerate(names['tests']):
        if len(names['controls']) < 2:
            ctrl = (name, names['controls'][0])
            macsbed = track(processed['macs'][ctrl]+"_summits.bed",
                            chrmeta=chrlist, fields=_fields).read(selection=_select)
        else:
            macsbed = concatenate([apply(track(processed['macs'][(name,x)]+"_summits.bed",
                                               chrmeta=chrlist, fields=_fields).read(selection=_select),
                                         'name', lambda __n, _n=xn: "%s:%i" % (__n,_n))
                                   for xn, x in enumerate(names['controls'])])
        ##############################
        macs_neighb = neighborhood( macsbed, before_start=150, after_end=150 )
        peak_list[name] = unique_filename_in()+".sql"
        macs_final = track( peak_list[name], chrmeta=chrlist,
                            info={'datatype': 'qualitative'},
                            fields=['start', 'end', 'name', 'score'] )
        macs_final.write(fusion(macs_neighb), clip=True)
        macs_final.close()
        ##############################

    merged_wig = {}
    options['read_extension'] = int(options.get('read_extension') or read_length[0])
    if options['read_extension'] < 1:
        options['read_extension'] = read_length[0]
    make_wigs = merge_strands >= 0 or options['read_extension'] > 100
    if options['read_extension'] > 100:
        options['read_extension'] = 50
    for gid, mapped in mapseq_files.iteritems():
        #if groups[gid]['control']: continue
        group_name = groups[gid]['name']
        wig = []
        for m in mapped.values():
            if make_wigs or not('wig' in m) or len(m['wig']) < 2:
                output = mapseq.parallel_density_sql( ex, m["bam"], assembly.chrmeta,
                                                      nreads=m["stats"]["total"],
                                                      merge=-1,
                                                      read_extension=options['read_extension'],
                                                      convert=False,
                                                      b2w_args=b2w_args, via=via )
                wig.append(dict((s, output+s+'.sql') for s in suffixes))
            else:
                wig.append(m['wig'])
        if len(wig) > 1:
            merged_wig[group_name] = dict((s, merge_sql(ex, [x[s] for x in wig], via=via))
                                          for s in suffixes)
        else:
            merged_wig[group_name] = wig[0]

    if peak_deconvolution:
        ##############################
        def _filter_deconv( stream, pval ):
            ferr = re.compile(r';FERR=([\d\.]+)$')
            return FeatureStream( ((x[0],)+((x[2]+x[1])/2-150, (x[2]+x[1])/2+150)+x[3:]
                                   for x in stream
                                   if "FERR=" in x[3] and float(ferr.search(x[3]).groups()[0]) <= pval),
                                  fields=stream.fields )
        ##############################
        processed['deconv'] = {}
        for name in names['tests']:
            logfile.write(name[1]+" deconvolution.\n");logfile.flush()
            if len(names['controls']) < 2:
                ctrl = (name, names['controls'][0])
                macsbed = processed['macs'][ctrl]+"_peaks.bed"
            else:
                macsbed = intersect_many_bed( ex, [processed['macs'][(name,x)]+"_peaks.bed"
                                                   for x in names['controls']], via=via )
            deconv = run_deconv( ex, merged_wig[name[1]], macsbed, assembly.chrmeta,
                                 options['read_extension'], script_path, via=via )
            peak_list[name] = unique_filename_in()+".bed"
            trbed = track(deconv['peaks']).read()
            with track(peak_list[name], chrmeta=chrlist, fields=trbed.fields) as bedfile:
                bedfile.write(fusion(_filter_deconv(trbed, 0.65)))
            ex.add(deconv['peaks'],
                   description=set_file_descr(name[1]+'_peaks.sql', type='sql',
                                              step='deconvolution', groupId=name[0]))
            ex.add(deconv['profile'],
                   description=set_file_descr(name[1]+'_deconv.sql', type='sql',
                                              step='deconvolution', groupId=name[0]))
            bigwig = unique_filename_in()
            try:
                convert(deconv['profile'], (bigwig, "bigWig"))
                ex.add(bigwig,
                       description=set_file_descr(name[1]+'_deconv.bw', type='bigWig',
                                                  ucsc='1', step='deconvolution', groupId=name[0]))
            except OSError as e:
                logfile.write(str(e));logfile.flush()
            ex.add(deconv['pdf'],
                   description=set_file_descr(name[1]+'_deconv.pdf', type='pdf',
                                              step='deconvolution', groupId=name[0]))
            processed['deconv'][name] = deconv

    ##############################
    def _join_macs( stream, xlsl, _f ):
        def _macs_row(_s):
            for _p in _s:
                for _n in _p[3].split("|"):
                    if len(xlsl) == 1:
                        nb = int(_n.split(";")[0][13:]) if _n[:3] == "ID=" else int(_n[10:])
                        yield _p+xlsl[0][nb-1][1:]
                    else:
                        nb = _n.split(";")[0][13:] if _n[:3] == "ID=" else _n[10:]
                        nb = nb.split(":")
                        yield _p+xlsl[int(nb[1])][int(nb[0])-1][1:]
        return FeatureStream( _macs_row(stream), fields=_f )
    ##############################

    peakfile_list = []
    for name, plist in peak_list.iteritems():
        ptrack = track(plist, chrmeta=chrlist, fields=["chr", "start", "end", "name", "score"])
        peakfile = unique_filename_in()
        xlsh, xlsl = parse_MACS_xls([processed['macs'][(name,_c)]+"_peaks.xls"
                                     for _c in names['controls']])
        try:
            ###### if assembly doesn't have annotations, we skip the "getNearestFeature"
            ###### but still go through "_join_macs"
            assembly.gene_track()
            _fields = ['chr','start','end','name','score','gene','location_type','distance']\
                      +["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#"+"\t".join(['chromosome','start','end','info','peak_height',
                                               'gene(s)','location_type','distance']+_fields[8:]))
            for chrom in assembly.chrnames:
                _feat = assembly.gene_track(chrom)
                peakout.write(_join_macs(getNearestFeature(ptrack.read(selection=chrom), _feat),
                                         xlsl, _fields), mode='append')
        except ValueError:
            _fields = ['chr','start','end','name','score']+["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#"+"\t".join(['chromosome','start','end','info','peak_height']+_fields[8:]))
            for chrom in assembly.chrnames:
                peakout.write(_join_macs(ptrack.read(selection=chrom), xlsl, _fields), mode='append')
        peakout.close()
        gzipfile(ex, peakfile)
        peakfile_list.append(track(peakfile+".gz", format='txt', fields=_fields))
        ex.add(peakfile+".gz",
               description=set_file_descr(name[1]+'_annotated_peaks.txt.gz', type='text',
                                          step='annotation', groupId=name[0]))
    stracks = [track(wig, info={'name': name+"_"+st})
               for name, wigdict in merged_wig.iteritems()
               for st, wig in wigdict.iteritems()]
    tablefile = unique_filename_in()
    with open(tablefile, "w") as _tf:
        _pnames = ["MACS_%s_vs_%s" % (_s[1],_c[1]) if _c[1] else "MACS_%s" % _s[1]
                   for _s in names['tests'] for _c in names['controls']]
        _tf.write("\t".join(['#chromosome','start','end',]+_pnames+[s.name for s in stracks])+"\n")
    #### need to do something about peak origin (split names, write to separate columns?)
    for chrom in assembly.chrnames:
        pk_lst = [apply(pt.read(chrom, fields=['chr','start','end','name']),
                        'name', lambda __n, _n=npt: "%s:%i" % (__n,_n))
                  for npt, pt in enumerate(peakfile_list)]
        features = fusion(concatenate(pk_lst, fields=['chr','start','end','name'],
                                      remove_duplicates=True, group_by=['chr','start','end']))
        sread = [sig.read(chrom) for sig in stracks]
        quantifs = score_by_feature(sread, features, method='sum')
        nidx = quantifs.fields.index('name')
        _ns = len(tests)
        _nc = len(controls)
        with open(tablefile, "a") as _tf:
            for row in quantifs:
                pcols = ['']*_ns*_nc
                _rnsplit = row[nidx].split(":")
                _n1 = _rnsplit[0]
                _k = 0
                while ( _k < len(_rnsplit)-1-int(_nc>1) ):
                    if _nc > 1:
                        _k += 2
                        _n2 = _rnsplit[_k-1]
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])*_nc+int(_n2)] = _n1
                    else:
                        _k += 1
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])] = _n1
                    _n1 = "|".join(_n[1:])
                _tf.write("\t".join(str(tt) for tt in row[:nidx]+tuple(pcols)+row[nidx+1:])+"\n")
    gzipfile(ex, tablefile)
    ex.add(tablefile+".gz",
           description=set_file_descr('Combined_peak_quantifications.txt.gz',
                                      type='text', step='summary'))

    if run_meme:
        from bbcflib.motif import parallel_meme
        logfile.write("Starting MEME.\n");logfile.flush()
        processed['meme'] = parallel_meme( ex, assembly, peak_list.values(), name=peak_list.keys(),
                                           chip=True, meme_args=['-meme-nmotifs','4','-meme-mod','zoops'],
                                           via=via )
    return processed
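
# Hedged usage sketch for chipseq_workflow with the dictionary form of job_or_dict
# described in its docstring. Group ids, bam file names and option values are
# illustrative assumptions; when a run has no 'stats' entry, the workflow computes
# them itself with mapseq.bamstats, as shown above.
def _example_chipseq_call(ex, assembly):
    job_dict = {
        'groups': {1: {'name': 'IP',    'control': False},
                   2: {'name': 'Input', 'control': True}},
        'files':  {1: {1: {'bam': 'ip.bam'}},        # {group_id: {run_id: {'bam': ...}}}
                   2: {1: {'bam': 'input.bam'}}},
        'options': {'peak_deconvolution': False, 'macs_args': ["--bw", "200"]},
    }
    return chipseq_workflow(ex, job_dict, assembly, via='local')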
def parallel_meme( ex, assembly, regions, name=None, chip=False, meme_args=None, via='lsf' ):
    """Fetches sequences, then calls ``meme`` on them and finally saves the results in the repository.
    """
    if meme_args is None:
        meme_args = []
    if not(isinstance(regions, list)):
        regions = [regions]
    if not(isinstance(name, list)):
        name = [name or '_']
    futures = {}
    fasta_files = {}
    background = assembly.statistics(unique_filename_in(), frequency=True)
#    genomeRef = assembly.untar_genome_fasta()
    for i, n in enumerate(name):
        (fasta, size) = assembly.fasta_from_regions( regions[i], ex=ex )
        tmpfile = unique_filename_in()
        outdir = unique_filename_in()
        if chip:
            futures[n] = (outdir, memechip.nonblocking( ex, fasta, outdir, background,
                                                        args=meme_args, via=via,
                                                        stderr=tmpfile, memory=6 ))
        else:
            futures[n] = (outdir, meme.nonblocking( ex, fasta, outdir, background,
                                                    maxsize=(size*3)/2, args=meme_args,
                                                    via=via, stderr=tmpfile, memory=6 ))
        fasta_files[n] = fasta
    all_res = {}
    for n, f in futures.iteritems():
        f[1].wait()
        meme_out = f[0]
        archive = unique_filename_in()
        tgz = tarfile.open(archive, "w:gz")
        tgz.add( meme_out, arcname=n[1]+"_meme",
                 exclude=lambda x: os.path.basename(x) in [fasta_files[n], background] )
        tgz.close()
        ex.add( archive, description=set_file_descr(n[1]+"_meme.tgz", step='meme',
                                                    type='tar', groupId=n[0]) )
        gzipfile(ex, fasta_files[n], args=["-f"])
        ex.add( fasta_files[n]+".gz", description=set_file_descr(n[1]+"_sites.fa.gz", step='meme',
                                                                 type='fasta', groupId=n[0]) )
        if not(chip) and os.path.exists(os.path.join(meme_out, "meme.xml")):
            meme_res = parse_meme_xml( ex, os.path.join(meme_out, "meme.xml"), assembly.chrmeta )
            if os.path.exists(os.path.join(meme_out, "meme.html")):
                ex.add( os.path.join(meme_out, "meme.html"),
                        description=set_file_descr(n[1]+"_meme.html", step='meme',
                                                   type='html', groupId=n[0]) )
            ex.add( meme_res['sql'], description=set_file_descr(n[1]+"_meme_sites.sql", step='meme',
                                                                type='sql', groupId=n[0]) )
            for i, motif in enumerate(meme_res['matrices'].keys()):
                ex.add( meme_res['matrices'][motif],
                        description=set_file_descr(n[1]+"_meme_"+motif+".txt", step='meme',
                                                   type='txt', groupId=n[0]) )
                ex.add( os.path.join(meme_out, "logo"+str(i+1)+".png"),
                        description=set_file_descr(n[1]+"_meme_"+motif+".png", step='meme',
                                                   type='png', groupId=n[0]) )
            all_res[n] = meme_res
    return all_res
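
# Hedged usage sketch for parallel_meme. It mirrors how chipseq_workflow calls it
# above: 'regions' holds one peak track per entry of 'name', and each name entry is
# a (groupId, label) tuple, since the code uses n[0] as groupId and n[1] in file
# names. The paths and labels below are illustrative assumptions.
def _example_parallel_meme_call(ex, assembly):
    regions = ['peaks_groupA.sql']               # one region track per name entry
    names = [(1, 'groupA')]                      # (groupId, label) tuples
    return parallel_meme(ex, assembly, regions, name=names, chip=True,
                         meme_args=['-meme-nmotifs', '4'], via='local')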