Example #1
    def pca_rnaseq(self, counts_table_file):
        @program
        def pca(counts_table_file):
            outprefix = unique_filename_in()
            args = ['pca.R', counts_table_file, outprefix, "rpkm"]
            return {"arguments": args, "return_value": outprefix}

        if not program_exists('pca.R'):
            self.write_debug("Skipped PCA: pca.R not found.")
            return
        try:
            self.write_log("* PCA")
            outprefix = pca.nonblocking(self.ex,
                                        counts_table_file,
                                        via=self.via).wait()
        except Exception as err:
            self.write_debug("PCA failed: %s." % str(err))
            return
        if outprefix is None:
            self.write_debug("PCA failed.")
            return
        pca_descr_pdf = set_file_descr('pca.pdf',
                                       type='pdf',
                                       step='pca',
                                       ucsc=0)
        self.ex.add(outprefix + '.pdf', description=pca_descr_pdf)
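The call above relies on bein's @program decorator: the decorated function never runs anything itself, it only describes the command line ("arguments") and the value to hand back once the external script has finished ("return_value", here the output prefix passed to pca.R); as shown above, bein then executes it, e.g. queued with pca.nonblocking(self.ex, ..., via=self.via).wait(). A minimal sketch of the same pattern with a hypothetical gzip wrapper (not part of bbcflib):

from bein import program   # assumes the bein package used throughout bbcflib

@program
def gzip_file(filename):
    # Only a description of the call; bein runs it later (blocking or non-blocking).
    return {"arguments": ["gzip", filename],
            "return_value": filename + ".gz"}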
Example #2
File: snp.py Project: JoseEspinosa/bbcflib
def create_tracks(ex, outall, sample_names, assembly):
    """Write BED tracks showing SNPs found in each sample."""
    infields = ['chromosome','position','reference']+sample_names+['gene','location_type','distance']
    intrack = track(outall, format='text', fields=infields, chrmeta=assembly.chrmeta,
                    intypes={'position':int})
    instream = intrack.read(fields=infields[:-3])
    outtracks = {}
    for sample_name in sample_names:
        out = unique_filename_in()+'.bed.gz'
        t = track(out,fields=['name'])
        t.make_header(name=sample_name+"_SNPs")
        outtracks[sample_name] = (t,out)

    def _row_to_annot(x,ref,n):
        if x[3+n][0] == ref: return None
        else: return "%s>%s"%(ref,x[3+n][0])

    for x in instream:
        coord = (x[0],x[1]-1,x[1])
        ref = x[2]
        snp = dict((name, _row_to_annot(x,ref,n)) for n,name in enumerate(sample_names))
        for name, tr in outtracks.iteritems():
            if snp[name]: tr[0].write([coord+(snp[name],)],mode='append')
    for name, tr in outtracks.iteritems():
        tr[0].close()
        description = set_file_descr(name+"_SNPs.bed.gz",type='bed',step='tracks',gdv='1',ucsc='1')
        ex.add(tr[1], description=description)
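For each input row, _row_to_annot compares the first character of a sample's call column with the reference base and returns either None (no SNP for that sample) or a "ref>alt" string that becomes the name field of the BED entry. A standalone illustration with a hypothetical row layout mirroring the fields above (chromosome, position, reference, then one column per sample):

row = ('chr1', 15820, 'G', 'G (100%)', 'A (57%)')    # hypothetical values
ref = row[2]
for n, sample in enumerate(['sample1', 'sample2']):
    call = row[3 + n][0]                              # first character of the call column
    annot = None if call == ref else "%s>%s" % (ref, call)
    print(sample, annot)
# sample1 None  -> matches the reference, nothing is written for this sample
# sample2 G>A   -> written at (chr1, 15819, 15820) in sample2_SNPs.bed.gz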
Example #3
def motif_scan( ex, bedlist, assembly, groups, via, logfile ):
    logfile.write("Scanning motifs\n");logfile.flush()
    motifbeds = {}
    supdir = os.path.split(ex.remote_working_directory)[0]
    for gid,bedfile in bedlist.iteritems():
        logfile.write("\n%i: "%gid);logfile.flush()
        group = groups[gid]
        motifs = {}
        for mot in group.get('motif',[]):
            if os.path.exists(mot):
                mname = os.path.basename(os.path.splitext(mot)[0])
                motifs[mname] = mot
            elif os.path.exists(os.path.join(supdir,mot)):
                mname = os.path.basename(os.path.splitext(mot)[0])
                motifs[mname] = os.path.join(supdir,mot)
            else:
                _gnid, mname = mot.split(' ')
                motifs[mname] = _gnrp.get_motif_PWM(int(_gnid), mname, output=unique_filename_in())
            logfile.write(mname+", ");logfile.flush()
        _descr = set_file_descr(group['name']+'_motifs.bed',
                                type='bed', ucsc='1', step='motifs', groupId=gid)
        _out = unique_filename_in()
        _hd = "track name='%s_motifs'" %group['name']
        motifbeds[gid] = save_motif_profile( ex, motifs, assembly, bedfile,
                                             keep_max_only=True, output=_out,
                                             header=_hd, description=_descr, via=via )
    return motifbeds
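Each entry of group['motif'] is resolved in one of three ways: an existing path, a path relative to the parent of the execution's working directory, or a "genrep_id motif_name" pair fetched as a PWM through _gnrp.get_motif_PWM. A small sketch of the same dispatch in isolation (the helper and example strings are hypothetical, not bbcflib code):

import os

def resolve_motif(mot, supdir):
    """Mirror the three cases handled above: an existing path, a path relative
    to supdir, or a 'genrep_id motif_name' pair to be fetched as a PWM."""
    if os.path.exists(mot):
        return os.path.basename(os.path.splitext(mot)[0]), mot
    if os.path.exists(os.path.join(supdir, mot)):
        return os.path.basename(os.path.splitext(mot)[0]), os.path.join(supdir, mot)
    gnid, mname = mot.split(' ')         # e.g. "1234 CTCF": the real code calls
    return mname, (int(gnid), mname)     # _gnrp.get_motif_PWM(int(gnid), mname, output=...)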
Example #4
File: dnaseseq.py Project: bbcf/bbcflib
def save_wellington( ex, wellout, chrmeta ):
    bedlist = {}
    for name, wlist in wellout.iteritems():
        wellall = unique_filename_in()
#### Dummy file
        touch( ex, wellall )
        ex.add(wellall,
               description=set_file_descr(name[1]+'_wellington_files', type='none', view='admin',
                                          step='footprints', groupId=name[0]))
#### BED at FDR 1%
        bedlist[name[0]] = wellall+"FDR01.bed.gz"
        bedzip = gzip.open(bedlist[name[0]],'wb')
        bedzip.write("track name='"+name[1]+"_WellingtonFootprints_FDR_0.01'\n")
        for x in wlist:
            with open(os.path.join(*x)+".WellingtonFootprints.FDR.0.01.bed") as _bed:
                [bedzip.write(l) for l in _bed]
        bedzip.close()
        ex.add(wellall+"FDR01.bed.gz",
               description=set_file_descr(name[1]+'_WellingtonFootprintsFDR01.bed.gz',
                                          type='bed', ucsc='1', step='footprints', groupId=name[0]),
               associate_to_filename=wellall, template='%s_WellingtonFootprintsFDR01.bed.gz')
#### BED at p-values [...]
        bedzip = gzip.open(wellall+"PvalCutoffs.bed.gz",'wb')
        for bfile in os.listdir(os.path.join(wlist[0][0],"p_value_cutoffs")):
            cut = os.path.splitext(bfile[:-4])[1][1:] #between . ([1:]) and .bed ([:-4])
            bedzip.write("track name='"+name[1]+"_WellingtonFootprints_Pval_%s'\n" %cut)
            for wdir,wpref in wlist:
                _bedpath = os.path.join(wdir,"p_value_cutoffs",wpref+".WellingtonFootprints."+cut+".bed")
                with open(_bedpath) as _bed:
                    [bedzip.write(l) for l in _bed]
        bedzip.close()
        ex.add(wellall+"PvalCutoffs.bed.gz",
               description=set_file_descr(name[1]+'_WellingtonFootprintsPvalCutoffs.bed.gz',
                                          type='bed', ucsc='1', step='footprints', groupId=name[0]),
               associate_to_filename=wellall, template='%s_WellingtonFootprintsPvalCutoffs.bed.gz')
#### WIG
        cat([os.path.join(*x)+".WellingtonFootprints.wig" for x in wlist], wellall+".wig")
        #convert(wellall+".wig", wellall+".bw", chrmeta=chrmeta)
        #ex.add(wellall+".bw",
        #       description=set_file_descr(name[1]+'_WellingtonFootprints.bw',
        #                                  type='bigWig', ucsc='1', step='footprints', groupId=name[0]),
        #       associate_to_filename=wellall, template='%s_WellingtonFootprints.bw')
        ex.add(wellall+".wig",
               description=set_file_descr(name[1]+'_WellingtonFootprints.wig',
                                          type='wig', ucsc='1', step='footprints', groupId=name[0]),
               associate_to_filename=wellall, template='%s_WellingtonFootprints.wig')
    return bedlist
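The least obvious line above is the extraction of the p-value cutoff from a footprint file name: bfile[:-4] drops the trailing ".bed", os.path.splitext splits off the last dotted component, and [1:] removes the leading dot. A quick standalone check with a hypothetical file name:

import os

bfile = "sample1.WellingtonFootprints.-10.bed"   # hypothetical p-value cutoff file
cut = os.path.splitext(bfile[:-4])[1][1:]
print(cut)                                       # -> "-10", the text between the last '.' and '.bed'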
Example #5
def get_libForGrp(ex, group, fasta_or_assembly, new_libraries, grpId, url=None, lib_dir=None, via='lsf'):
#wd_archive="/archive/epfl/bbcf/mleleu/pipeline_vMarion/pipeline_3Cseq/vWebServer_Bein/"
    def _libfile(id_lib):
        libs_list = json.load(urllib2.urlopen( url+"/libraries.json" ))
        for lib in libs_list:
            if lib['library']['id']==int(id_lib):
                return lib['library']['filename']
        return None

    def _paramsFile(paramsfile):
        """Returns a dictionary with the parameters required for the creation of a new library"""
        paramslib={'name': 'myLibrary', 'length': '30', 'type': 'typeI'}
        with open(paramsfile) as f:
            for s in f:
                s=s.strip().split('=')
                key = None
                if   re.search('Library name',s[0],re.I) and len(s[1])>1:   key='name'
                elif re.search('Genome name',s[0],re.I):                    key='species'
                elif re.search('Primary',s[0],re.I):                        key='primary'
                elif re.search('Secondary',s[0],re.I):                      key='secondary'
                elif re.search('Segment length',s[0],re.I) and len(s[1])>0: key='length'
                elif re.search('Type',s[0],re.I) and len(s[1])>1:           key='type'
                if key: paramslib[key]=s[1].strip()
        return paramslib

    if url is None: url = GlobalHtsUrl
    if lib_dir is None: lib_dir = os.path.split(ex.remote_working_directory)[0]
    if not(group.get('library_param_file','null') in ["null",'', None]):
        library_filename = os.path.join(lib_dir,'group_'+group['name']+"_paramsFileLibrary.txt")
        paramslib = _paramsFile(library_filename)
        lib_id, ex_libfile = lib_exists( paramslib, new_libraries, url )
        if lib_id == 0 and ex_libfile == None:
            libfiles = createLibrary(ex, fasta_or_assembly, paramslib, url, via=via)
            reffile = libfiles[2]
            ex.add( libfiles[2]+".bed.gz",
                    description=set_file_descr( group['name']+"_new_library.bed.gz", groupId=grpId,
                                                step="library", type="bed" ))
#            ex.add(reffile,description=set_file_descr("new_library.sql",groupId=grpId,step="library",type="sql",view='admin'))
            new_libraries.append( {'library': libfiles[3]} )
        elif lib_id > 0:
            reffile = _libfile(lib_id)
        else:
            reffile = ex_libfile
    elif 'library_id' in group and group['library_id']> 0 and not str(group['library_id'])=="":
        reffile = _libfile(group['library_id'])
        if reffile is None:
            raise TypeError("No valid parameter passed for the library.")
        if not(os.path.exists(reffile) or os.path.exists(reffile+'.bed.gz')):
            raise TypeError("library file ("+reffile+") is not valid")
        if not os.path.exists(reffile):
            reffile += '.bed.gz'
    elif 'library_file_url' in group and group['library_file_url'] != "":
        reffile=group['library_file_url']
    else:
        reffile=None
    return reffile
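_paramsFile turns a small key=value parameter file into the dictionary expected by createLibrary, starting from defaults for name, length and type. A minimal sketch of that parsing with hypothetical file contents (the field values below are illustrative only):

import re

defaults = {'name': 'myLibrary', 'length': '30', 'type': 'typeI'}
sample = ["Library name=NlaIII_30bps", "Genome name=mm9", "Primary=CATG",
          "Secondary=GATC", "Segment length=30", "Type=typeI"]
patterns = [('Library name', 'name'), ('Genome name', 'species'), ('Primary', 'primary'),
            ('Secondary', 'secondary'), ('Segment length', 'length'), ('Type', 'type')]
paramslib = dict(defaults)
for line in sample:
    field, _, value = line.partition('=')
    for pat, key in patterns:
        if re.search(pat, field, re.I) and value:
            paramslib[key] = value.strip()
            break
print(paramslib)   # the defaults overridden/extended by the six parsed fields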
Example #6
File: rnaseq.py Project: bbcf/bbcflib
 def differential_analysis(counts_file, feature_type):
     #shutil.copy(counts_file, "../")
     diff_files = DE.differential_analysis(counts_file)
     if diff_files is not None:
         for diff in diff_files:
             # Remove first line
             diff_nohead = unique_filename_in()
             with open(diff) as f:
                 head = f.readline().strip()
                 with open(diff_nohead, "wb") as g:
                     for line in f: g.write(line)
             oname = feature_type + "_differential_"+ head + ".txt"
             desc = set_file_descr(oname, step='stats', type='txt', ucsc=0)
             ex.add(diff_nohead, description=desc)
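Each file returned by DE.differential_analysis is copied without its first line, and that stripped header line is reused to build the report's name, so a hypothetical header such as "groupA-groupB" would yield "genes_differential_groupA-groupB.txt". The copy idiom in isolation (strip_first_line is just an illustrative helper):

def strip_first_line(src, dst):
    """Copy src to dst without its first line and return that line, as above."""
    with open(src) as f, open(dst, "w") as g:
        head = f.readline().strip()
        for line in f:
            g.write(line)
    return head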
Example #7
 def differential_analysis(counts_file, feature_type):
     #shutil.copy(counts_file, "../")
     diff_files = DE.differential_analysis(counts_file)
     if diff_files is not None:
         for diff in diff_files:
             # Remove first line
             diff_nohead = unique_filename_in()
             with open(diff) as f:
                 head = f.readline().strip()
                 with open(diff_nohead, "wb") as g:
                     for line in f:
                         g.write(line)
             oname = feature_type + "_differential_" + head + ".txt"
             desc = set_file_descr(oname, step='stats', type='txt', ucsc=0)
             ex.add(diff_nohead, description=desc)
Example #8
 def gdv_create(self,ex):
     from bbcflib import gdv
     project = gdv.get_project(mail=self.globals['gdv']['email'],
                               key=self.globals['gdv']['key'],
                               project_key=self.job.options['gdv_key'])
     if 'error' in project:
         self.log_write("Creating GDV project.")
         project = gdv.new_project( self.globals['gdv']['email'],
                                    self.globals['gdv']['key'],
                                    self.job.description,
                                    self.job.assembly.id,
                                    self.globals['gdv']['url'] )
         self.debug_write("\nGDV project: "+json.dumps(project))
         add_pickle( ex, project, description=set_file_descr("gdv_json",step='gdv',type='py',view='admin') )
     self.job.options['gdv_project'] = project
     return True
Example #9
File: rnaseq.py Project: bbcf/bbcflib
    def pca_rnaseq(self,counts_table_file):
        @program
        def pca(counts_table_file):
            outprefix = unique_filename_in()
            args = ['pca.R', counts_table_file, outprefix, "rpkm"]
            return {"arguments": args, "return_value": outprefix}

        if not program_exists('pca.R'):
            self.write_debug("Skipped PCA: pca.R not found.")
            return
        try:
            self.write_log("* PCA")
            outprefix = pca.nonblocking(self.ex, counts_table_file, via=self.via).wait()
        except Exception as err:
            self.write_debug("PCA failed: %s." % str(err))
            return
        if outprefix is None:
            self.write_debug("PCA failed.")
            return
        pca_descr_pdf = set_file_descr('pca.pdf', type='pdf', step='pca', ucsc=0)
        self.ex.add(outprefix+'.pdf', description=pca_descr_pdf)
Example #10
def motif_scan(ex, bedlist, assembly, groups, via, logfile):
    logfile.write("Scanning motifs\n")
    logfile.flush()
    motifbeds = {}
    supdir = os.path.split(ex.remote_working_directory)[0]
    for gid, bedfile in bedlist.iteritems():
        logfile.write("\n%i: " % gid)
        logfile.flush()
        group = groups[gid]
        motifs = {}
        for mot in group.get('motif', []):
            if os.path.exists(mot):
                mname = os.path.basename(os.path.splitext(mot)[0])
                motifs[mname] = mot
            elif os.path.exists(os.path.join(supdir, mot)):
                mname = os.path.basename(os.path.splitext(mot)[0])
                motifs[mname] = os.path.join(supdir, mot)
            else:
                _gnid, mname = mot.split(' ')
                motifs[mname] = _gnrp.get_motif_PWM(
                    int(_gnid), mname, output=unique_filename_in())
            logfile.write(mname + ", ")
            logfile.flush()
        _descr = set_file_descr(group['name'] + '_motifs.bed',
                                type='bed',
                                ucsc='1',
                                step='motifs',
                                groupId=gid)
        _out = unique_filename_in()
        _hd = "track name='%s_motifs'" % group['name']
        motifbeds[gid] = save_motif_profile(ex,
                                            motifs,
                                            assembly,
                                            bedfile,
                                            keep_max_only=True,
                                            output=_out,
                                            header=_hd,
                                            description=_descr,
                                            via=via)
    return motifbeds
Example #11
File: snp.py Project: MolbioUnige/bbcflib
def create_tracks(ex, outall, sample_names, assembly):
    """Write BED tracks showing SNPs found in each sample."""
    infields = ['chromosome', 'position', 'reference'
                ] + sample_names + ['gene', 'location_type', 'distance']
    intrack = track(outall,
                    format='text',
                    fields=infields,
                    chrmeta=assembly.chrmeta,
                    intypes={'position': int})
    instream = intrack.read(fields=infields[:-3])
    outtracks = {}
    for sample_name in sample_names:
        out = unique_filename_in() + '.bed.gz'
        t = track(out, fields=['name'])
        t.make_header(name=sample_name + "_SNPs")
        outtracks[sample_name] = (t, out)

    def _row_to_annot(x, ref, n):
        if x[3 + n][0] == ref: return None
        else: return "%s>%s" % (ref, x[3 + n][0])

    for x in instream:
        coord = (x[0], x[1] - 1, x[1])
        ref = x[2]
        snp = dict((name, _row_to_annot(x, ref, n))
                   for n, name in enumerate(sample_names))
        for name, tr in outtracks.iteritems():
            if snp[name]: tr[0].write([coord + (snp[name], )], mode='append')
    for name, tr in outtracks.iteritems():
        tr[0].close()
        description = set_file_descr(name + "_SNPs.bed.gz",
                                     type='bed',
                                     step='tracks',
                                     gdv='1',
                                     ucsc='1')
        ex.add(tr[1], description=description)
Example #12
def c4seq_workflow(ex,
                   job,
                   primers_dict,
                   assembly,
                   c4_url=None,
                   script_path='',
                   logfile=sys.stdout,
                   via='lsf'):
    '''
    Main 4C-seq workflow:
    * open the 4C-seq minilims and create an execution
    * 0. get/create the library
    * 1. if necessary, calculate the density file from the bam file (mapseq.parallel_density_sql)
    * 2. calculate the counts per fragment for each density file (gfminer's score_by_feature)
    '''

    mapseq_files = job.files
    ### outputs
    processed = {'lib': {}, 'density': {}, '4cseq': {}}
    processed['4cseq'] = {
        'density_files': {},
        'countsPerFrag': {},
        'countsPerFrag_grp': {},
        'norm': {},
        'norm_grp': {},
        'profileCorrection': {},
        'profileCorrection_grp': {},
        'smooth_grp': {},
        'domainogram_grp': {},
        'bricks2frags': {}
    }
    # was 'smoothFrag': {}, 'domainogram': {}}
    regToExclude = {}
    new_libs = []
    ### options
    run_domainogram = {}
    before_profile_correction = {}
    if not job.options.get('viewpoints_chrs', False):
        out_chromosomes = ','.join([ch for ch in assembly.chrnames])
    else:
        out_chromosomes = ','.join([
            primers_dict.get(group['name'], {}).get('baitcoord').split(':')[0]
            for gid, group in job.groups.iteritems()
        ])
    print "out_chromosomes=" + out_chromosomes + "\n"

    sizeExt = job.options.get('norm_reg', 1000000)
    print "region considered for normalisation: mid viewpoint +/-" + str(
        sizeExt) + 'bps'

    ### do it
    for gid, group in job.groups.iteritems():
        run_domainogram[gid] = group.get('run_domainogram', False)
        if isinstance(run_domainogram[gid], basestring):
            run_domainogram[gid] = (run_domainogram[gid].lower()
                                    in ['1', 'true', 'on', 't'])
        before_profile_correction[gid] = group.get('before_profile_correction',
                                                   False)
        if isinstance(before_profile_correction[gid], basestring):
            before_profile_correction[gid] = (
                before_profile_correction[gid].lower()
                in ['1', 'true', 'on', 't'])
        processed['lib'][gid] = get_libForGrp(ex,
                                              group,
                                              assembly,
                                              new_libs,
                                              gid,
                                              c4_url,
                                              via=via)
        #reffile='/archive/epfl/bbcf/data/DubouleDaan/library_Nla_30bps/library_Nla_30bps_segmentInfos.bed'
        processed['4cseq']['density_files'][gid] = {}
        regToExclude[gid] = primers_dict.get(group['name'],
                                             {}).get('regToExclude',
                                                     "").replace('\r', '')

        # if no regToExclude defined, set it as mid_baitCoord +/-5kb
        if len(regToExclude[gid]) == 0:
            baitcoord_mid = int(0.5 * (int(
                primers_dict.get(group['name'], {}).get('baitcoord').split(':')
                [1].split('-')[0]) + int(
                    primers_dict.get(group['name'], {}).get('baitcoord').split(
                        ':')[1].split('-')[1])))
            regToExclude[gid] = primers_dict.get(
                group['name'], {}).get('baitcoord').split(':')[0] + ':' + str(
                    baitcoord_mid - 5000) + '-' + str(baitcoord_mid + 5000)

        #print(';'.join([k+"="+v for k,v in primers_dict.get(group['name'],{}).iteritems()]))
        print(primers_dict.get(group['name'], {}))
        print "regToExclude[" + str(gid) + "]=" + regToExclude[gid]
        for rid, run in group['runs'].iteritems():
            libname = mapseq_files[gid][rid]['libname']
            if job.options.get('merge_strands') != 0 or not (
                    'wig' in mapseq_files[gid][rid]):
                density_file = parallel_density_sql(
                    ex,
                    mapseq_files[gid][rid]['bam'],
                    assembly.chrmeta,
                    nreads=mapseq_files[gid][rid]['stats']["total"],
                    merge=0,
                    read_extension=mapseq_files[gid][rid]['stats']
                    ['read_length'],
                    convert=False,
                    via=via)
                density_file += "merged.sql"
                ex.add(density_file,
                       description=set_file_descr("density_file_" + libname +
                                                  ".sql",
                                                  groupId=gid,
                                                  step="density",
                                                  type="sql",
                                                  view='admin',
                                                  gdv="1"))
            else:
                density_file = mapseq_files[gid][rid]['wig']['merged']
            #density_files.append(density_file)
            processed['4cseq']['density_files'][gid][rid] = density_file

        # back to grp level!
        # not anymore:
        # processed['density'][gid] = merge_sql(ex, density_files, via=via)

    processed['4cseq']['countsPerFrag'] = density_to_countsPerFrag(
        ex, processed, job.groups, assembly, regToExclude, script_path, via)
    ## access per gid+rid

    futures_norm = {}
    countsPerFrags_bedGraph = {}
    futures_merged_raw = {}
    for gid, group in job.groups.iteritems():
        futures_norm[gid] = {}
        countsPerFrags_bedGraph[gid] = {}
        processed['4cseq']['norm'][gid] = {}
        for rid, run in group['runs'].iteritems():
            normfile = unique_filename_in()
            touch(ex, normfile)
            resfile = unique_filename_in() + ".bedGraph"
            resfiles = processed['4cseq']['countsPerFrag'][gid][
                rid]  # _all.sql
            convert(resfiles[3], resfile)
            countsPerFrags_bedGraph[gid][rid] = resfile

            print "call normFrags: infiles=" + resfile + ", normfile=" + normfile + "baitCoord=" + primers_dict[
                group['name']][
                    'baitcoord'] + ", sizeExt=sizeExt, name=" + group[
                        'name'] + "rep_" + str(
                            rid) + "regToExclude=" + regToExclude[gid] + "\n"
            futures_norm[gid][rid] = normFrags.nonblocking(
                ex,
                resfile,
                normfile,
                baitCoord=primers_dict[group['name']]['baitcoord'],
                sizeExt=sizeExt,
                name=group['name'] + "rep_" + str(rid),
                regToExclude=regToExclude[gid],
                script_path=script_path,
                via=via)
            processed['4cseq']['norm'][gid][rid] = normfile

        if len(group) > 1:
            ## merge replicates before normalisation.
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName = group['name'] + "_raw_mergedRep"
            print "gid=" + group['name']
            print "call mergeRep for replicates before normalisation: infiles=" + ",".join(
                [
                    res_rid for rid, res_rid in
                    countsPerFrags_bedGraph[gid].iteritems()
                ]
            ) + ", mergedfile=" + mergefile + ", regToExclude=" + regToExclude[
                gid] + "\n"
            futures_merged_raw[gid] = mergeRep.nonblocking(
                ex,
                ",".join([
                    res_rid for rid, res_rid in
                    countsPerFrags_bedGraph[gid].iteritems()
                ]),
                mergefile,
                regToExclude[gid],
                name=titleName,
                script_path=script_path,
                via=via,
                memory=8)
            processed['4cseq']['countsPerFrag_grp'][gid] = mergefile
        else:
            futures_merged_raw[gid] = None
            processed['4cseq']['countsPerFrag_grp'][
                gid] = countsPerFrags_bedGraph[gid][
                    0]  #if no replicates, then the file we want is the 1st one

    print "***** profile correction / sample + merge normalised data"
    futures_merged = {}  # per gid
    futures_profcor = {}  # per gid, per rid
    for gid, group in job.groups.iteritems():
        ## run profile correction per run then merge them
        futures_profcor[gid] = {}
        processed['4cseq']['profileCorrection'][gid] = {}
        for rid, run in group['runs'].iteritems():
            # wait for normalisation of all replicates to be finished
            futures_norm[gid][rid].wait(
            )  ## normalised files, per grp, per rep
            normfile = processed['4cseq']['norm'][gid][rid]
            file1 = unique_filename_in()  #track file
            touch(ex, file1)
            file2 = unique_filename_in()  #report file
            touch(ex, file2)
            file3 = unique_filename_in()  #table file
            touch(ex, file3)
            print "call profileCorrection: normfile=" + normfile + ", baitCoord=" + primers_dict[
                group['name']]['baitcoord'] + ", name=" + group[
                    'name'] + ", file1=" + file1 + ", file2=" + file2 + ", file3= " + file3 + "\n"
            futures_profcor[gid][rid] = profileCorrection.nonblocking(
                ex,
                normfile,
                primers_dict[group['name']]['baitcoord'],
                group['name'],
                file1,
                file2,
                file3,
                script_path,
                via=via)
            processed['4cseq']['profileCorrection'][gid][rid] = [
                file1, file2, file3
            ]

        ## merge replicates before profile correction. Needs all normalisation for the given grp to be finished, this is why it comes after the rid loop.
        if len(group) > 1:
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName = group['name'] + "_norm_mergedRep"
            print "gid=" + group['name']
            print "call mergeRep: infiles=" + ",".join([
                res_rid for rid, res_rid in processed['4cseq']['norm']
                [gid].iteritems()
            ]) + ", mergedfile=" + mergefile + ", regToExclude=" + regToExclude[
                gid] + "\n"
            futures_merged[gid] = mergeRep.nonblocking(
                ex,
                ",".join([
                    res_rid for rid, res_rid in processed['4cseq']['norm']
                    [gid].iteritems()
                ]),
                mergefile,
                regToExclude[gid],
                name=titleName,
                script_path=script_path,
                via=via,
                memory=8)
            processed['4cseq']['norm_grp'][gid] = mergefile
        else:
            futures_merged[gid] = None
            processed['4cseq']['norm_grp'][gid] = processed['4cseq']['norm'][
                gid][
                    0]  ##if no replicates, then the file we want is the 1st one

    print "***** merge profile corrected data"
    futures_profcor_merged = {}  # per gid
    for gid, group in job.groups.iteritems():
        processed['4cseq']['profileCorrection_grp'][gid] = {}
        for rid, run in group['runs'].iteritems():
            futures_profcor[gid][rid].wait(
            )  ## wait for ProfileCorrection to be finished

        ## merge replicates after profile correction
        if len(group) > 1:
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName = group['name'] + "_ProfCor_mergedRep"
            pcfiles = [
                processed['4cseq']['profileCorrection'][gid][rid][0]
                for rid, res_rid in processed['4cseq']['profileCorrection']
                [gid].iteritems()
            ]
            print "call mergeRep (for PC tables): infiles=" + ",".join(
                pcfiles
            ) + ", mergedfile=" + mergefile + ", regToExclude=" + regToExclude[
                gid] + "\n"
            futures_profcor_merged[gid] = mergeRep.nonblocking(
                ex,
                ",".join(pcfiles),
                mergefile,
                regToExclude[gid],
                name=titleName,
                script_path=script_path,
                via=via,
                memory=8)
            processed['4cseq']['profileCorrection_grp'][gid] = mergefile
        else:
            futures_profcor_merged[gid] = None
            processed['4cseq']['profileCorrection_grp'][gid] = processed[
                '4cseq']['profileCorrection'][gid][
                    0]  ##if no replicates, then the file we want is the 1st one

    print "***** smooth data"
    futures_smoothed = {}
    for gid, group in job.groups.iteritems():
        file1 = unique_filename_in()
        touch(ex, file1)
        file2 = unique_filename_in()
        touch(ex, file2)
        file3 = unique_filename_in()
        touch(ex, file3)
        nFragsPerWin = group['window_size']
        futures_merged_raw[gid].wait(
        )  ## wait for merging of raw_grp to be completed
        futures_smoothed[gid] = (smoothFragFile.nonblocking(
            ex,
            processed['4cseq']['countsPerFrag_grp'][gid],
            nFragsPerWin,
            group['name'],
            file1,
            regToExclude[gid],
            script_path=script_path,
            via=via,
            memory=6), )
        futures_merged[gid].wait(
        )  ## wait for merging of norm_grp to be completed
        futures_smoothed[gid] += (smoothFragFile.nonblocking(
            ex,
            processed['4cseq']['norm_grp'][gid],
            nFragsPerWin,
            group['name'] + "_norm",
            file2,
            regToExclude[gid],
            script_path=script_path,
            via=via,
            memory=6), )
        futures_profcor_merged[gid].wait(
        )  # wait for the merging of profile corrected data to be done
        futures_smoothed[gid] += (smoothFragFile.nonblocking(
            ex,
            processed['4cseq']['profileCorrection_grp'][gid],
            nFragsPerWin,
            group['name'] + "_fromProfileCorrected",
            file3,
            regToExclude[gid],
            script_path=script_path,
            via=via,
            memory=6), )
        processed['4cseq']['smooth_grp'][gid] = [
            file1, file2, file3
        ]  #[smoothed_file_before_Norm, smoothed file before PC, smoothed file after PC]

    print "***** Domainograms"
    futures_domainograms = {}
    for gid, group in job.groups.iteritems():
        grName = job.groups[gid]['name']
        if run_domainogram[gid]:
            regCoord = regToExclude[gid] or primers_dict[grName]['baitcoord']
            if before_profile_correction[gid]:
                futures_domainograms[gid] = runDomainogram.nonblocking(
                    ex,
                    processed['4cseq']['norm_grp'][gid],
                    grName,
                    regCoord=regCoord,
                    skip=1,
                    script_path=script_path,
                    via=via,
                    memory=15)
            else:
                futures_domainograms[gid] = runDomainogram.nonblocking(
                    ex,
                    processed['4cseq']['profileCorrection_grp'][gid],
                    grName,
                    regCoord=regCoord.split(':')[0],
                    skip=1,
                    script_path=script_path,
                    via=via,
                    memory=15)

    ## prepare tar files for domainogram results (if any)
    ## and create "BRICKS to frags" files
    print "***** BRICKS to Frags"
    futures_BRICKS2Frags = {}
    for gid, f in futures_domainograms.iteritems():
        if run_domainogram[gid]:  # if domainogram has been run
            resFiles = []
            logFile = f.wait()
            start = False
            tarname = job.groups[gid]['name'] + "_domainogram.tar.gz"
            res_tar = tarfile.open(tarname, "w:gz")
            futures_BRICKS2Frags[gid] = []
            processed['4cseq']['bricks2frags'][gid] = []
            if logFile is None: continue
            with open(logFile) as f:
                for s in f:
                    s = s.strip()
                    if '####resfiles####' in s:
                        start = True
                    elif start and "RData" not in s:
                        resFiles.append(s)
                        res_tar.add(s)
                    if start and "foundBRICKS" in s:
                        bricks2fragsfile = unique_filename_in() + ".bedGraph"
                        touch(ex, bricks2fragsfile)
                        futures_BRICKS2Frags[gid] += [
                            BRICKSToFrag.nonblocking(
                                ex,
                                s,
                                processed['4cseq']['norm_grp'][gid],
                                bricks2fragsfile,
                                script_path=script_path,
                                via=via,
                                memory=4)
                        ]
                        processed['4cseq']['bricks2frags'][gid] += [
                            bricks2fragsfile
                        ]
            res_tar.close()
            processed['4cseq']['domainogram_grp'][gid] = resFiles + [tarname]

############### prepare tables for global results
    print "***** combine results into tables "
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        for rid, run in group['runs'].iteritems():
            allNames += [
                group['name'] + "_rep" + str(rid) + "_norm",
                group['name'] + "_rep" + str(rid) + "_fit"
            ]
            allFiles += [processed['4cseq']['profileCorrection'][gid][rid][2]]
            allRegToExclude += [regToExclude[gid]]
    tablePC = unique_filename_in() + ".txt"
    print("***will call makeTable with:")
    print(",".join(allFiles))
    print("resfile=" + tablePC)
    print(",".join(allNames))
    touch(ex, tablePC)

    #regToExclude[gid]

    futures_tables = (makeTable.nonblocking(
        ex,
        ",".join(allFiles),
        tablePC,
        ",".join(allNames),
        idCols="4,5",
        all_regToExclude=','.join(allRegToExclude),
        script_path=script_path,
        via=via,
        memory=8), )

    # wait for all smoothing to be done
    for gid, fg in futures_smoothed.iteritems():
        for f in fg:
            f.wait()

    ## make Table raw/smoothed_raw
    print("** make Table raw/smoothed_raw")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        futures_merged_raw[gid].wait()
        allNames += [group['name'] + "_raw", group['name'] + "_rawSmoothed"]
        allFiles += [
            processed['4cseq']['countsPerFrag_grp'][gid],
            processed['4cseq']['smooth_grp'][gid][0]
        ]
        allRegToExclude += ['NA', regToExclude[gid]]

    tableSmoothedRaw_grp = unique_filename_in() + ".txt"
    touch(ex, tableSmoothedRaw_grp)
    futures_tables += (makeTable.nonblocking(
        ex,
        ",".join(allFiles),
        tableSmoothedRaw_grp,
        ",".join(allNames),
        idCols="4",
        out_chromosomes=out_chromosomes,
        all_regToExclude=','.join(allRegToExclude),
        script_path=script_path,
        via=via,
        memory=8), )

    ## make Table norm/smoothed_norm before PC
    print("** make Table norm/smoothed_norm befor PC")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        allNames += [group['name'] + "_norm", group['name'] + "_smoothed"]
        allFiles += [
            processed['4cseq']['norm_grp'][gid],
            processed['4cseq']['smooth_grp'][gid][1]
        ]
        allRegToExclude += [regToExclude[gid], regToExclude[gid]]

    tableSmoothed_grp = unique_filename_in() + ".txt"
    touch(ex, tableSmoothed_grp)
    futures_tables += (makeTable.nonblocking(
        ex,
        ",".join(allFiles),
        tableSmoothed_grp,
        ",".join(allNames),
        idCols="4",
        out_chromosomes=out_chromosomes,
        all_regToExclude=','.join(allRegToExclude),
        script_path=script_path,
        via=via,
        memory=8), )

    ## make Table norm/smoothed_norm after PC
    print("** make Table norm/smoothed_norm after PC")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        allNames += [group['name'] + "_normPC", group['name'] + "_smoothedPC"]
        allFiles += [
            processed['4cseq']['profileCorrection_grp'][gid],
            processed['4cseq']['smooth_grp'][gid][2]
        ]
        allRegToExclude += [regToExclude[gid], regToExclude[gid]]

    tableSmoothedPC_grp = unique_filename_in() + ".txt"
    touch(ex, tableSmoothedPC_grp)
    futures_tables += (makeTable.nonblocking(
        ex,
        ",".join(allFiles),
        tableSmoothedPC_grp,
        ",".join(allNames),
        idCols="4",
        out_chromosomes=out_chromosomes,
        all_regToExclude=','.join(allRegToExclude),
        script_path=script_path,
        via=via,
        memory=8), )

    ## combine BRICKS2Frags files
    allNames = []
    allFiles = []
    for gid, fg in futures_BRICKS2Frags.iteritems():
        for f in fg:
            f.wait()
        allNames += [job.groups[gid]['name'] + "_BRICKSpval"]
        cat_bricks2frags = unique_filename_in() + ".txt"
        print ','.join(processed['4cseq']['bricks2frags'][gid])
        cat_bricks2frags = cat(processed['4cseq']['bricks2frags'][gid],
                               out=cat_bricks2frags)
        allFiles += [cat_bricks2frags]

    for gid, fg in futures_smoothed.iteritems():
        for f in fg:
            f.wait()

    tableBRICKS2Frags = unique_filename_in() + ".txt"
    touch(ex, tableBRICKS2Frags)
    futures_tables += (makeTable.nonblocking(ex,
                                             ",".join(allFiles),
                                             tableBRICKS2Frags,
                                             ",".join(allNames),
                                             idCols="4",
                                             out_chromosomes=out_chromosomes,
                                             defVal="NA",
                                             script_path=script_path,
                                             via=via,
                                             memory=8), )

    for f in futures_tables:
        f.wait()

    ################ Add everything to minilims below!
    step = "density"
    for gid in processed['4cseq']['density_files'].keys():
        for rid, sql in processed['4cseq']['density_files'][gid].iteritems():
            fname = "density_file_" + job.groups[gid][
                'name'] + "_merged_rep" + str(rid)
            ex.add(sql,
                   description=set_file_descr(fname + ".sql",
                                              groupId=gid,
                                              step=step,
                                              type="sql",
                                              gdv="1"))
            wig = unique_filename_in() + ".bw"
            convert(sql, wig)
            ex.add(wig,
                   description=set_file_descr(fname + ".bw",
                                              groupId=gid,
                                              step=step,
                                              type="bigWig",
                                              ucsc="1"))
    step = "counts_per_frag"  #was _norm_counts_per_frags # before normalisation process, per replicate
    for gid in processed['4cseq']['countsPerFrag'].keys():
        for rid, resfiles in processed['4cseq']['countsPerFrag'][
                gid].iteritems():
            fname = "meanScorePerFeature_" + job.groups[gid][
                'name'] + "_rep" + str(rid)
            ex.add(resfiles[1],
                   description=set_file_descr(fname + ".sql",
                                              groupId=gid,
                                              step=step,
                                              type="sql",
                                              view="admin",
                                              gdv='1'))
            #gzipfile(ex,resfiles[0])
            #ex.add( resfiles[0]+".gz", description=set_file_descr( fname+".bed.gz",
            #                                                       groupId=gid,step=step,type="bed",view="admin" ))
            fname = "segToFrag_" + job.groups[gid]['name'] + "_rep" + str(rid)
            ex.add(resfiles[3],
                   description=set_file_descr(
                       fname + "_all.sql",
                       groupId=gid,
                       step=step,
                       type="sql",
                       comment="all informative frags - null included"))
            trsql = track(resfiles[3])
            bwig = unique_filename_in() + ".bw"
            trwig = track(bwig, chrmeta=trsql.chrmeta)
            trwig.write(
                trsql.read(fields=['chr', 'start', 'end', 'score'],
                           selection={'score': (0.01, sys.maxint)}))
            trwig.close()
            ex.add(
                bwig,
                set_file_descr(fname + ".bw",
                               groupId=gid,
                               step=step,
                               type="bigWig",
                               ucsc='1'))
        ## add segToFrags before normalisation
        futures_merged_raw[gid].wait()
        trbedgraph = track(removeNA(
            processed['4cseq']['countsPerFrag_grp'][gid]),
                           format='bedgraph')
        bwig = unique_filename_in() + ".bw"
        trwig = track(bwig, chrmeta=assembly.chrmeta)
        trwig.write(
            trbedgraph.read(fields=['chr', 'start', 'end', 'score'],
                            selection={'score': (0.01, sys.maxint)}))
        trwig.close()
        fname = "segToFrag_" + job.groups[gid]['name']
        ex.add(bwig,
               description=set_file_descr(
                   fname + ".bw",
                   groupId=gid,
                   step=step,
                   type="bigWig",
                   comment="segToFrag file before normalisation"))

    step = "norm_counts_per_frags"  # after new normalisation process, combined replicates
    for gid, resfile in processed['4cseq']['norm_grp'].iteritems():
        fname = "normalised_scorePerFeature_" + job.groups[gid]['name']
        gzipfile(ex, resfile)
        ex.add(resfile + ".gz",
               description=set_file_descr(fname + ".bedGraph.gz",
                                          groupId=gid,
                                          step=step,
                                          type="bedGraph",
                                          ucsc='1'))
    # norm files, per replicates (might be removed)
    for gid, dict_gid in processed['4cseq']['norm'].iteritems():
        for rid, resfile in dict_gid.iteritems():
            fname = "normalised_scorePerFeature_" + job.groups[gid][
                'name'] + "_rep" + str(rid)
            gzipfile(ex, resfile)
            ex.add(resfile + ".gz",
                   description=set_file_descr(fname + ".bedGraph.gz",
                                              groupId=gid,
                                              step=step,
                                              type="bedGraph",
                                              ucsc='1',
                                              gdv='1'))
    step = "profile_correction"  # Profile corrected data, combined replicates
    for gid, profileCorrectedFile in processed['4cseq'][
            'profileCorrection_grp'].iteritems():
        fname = "segToFrag_" + job.groups[gid]['name'] + "_profileCorrected"
        gzipfile(ex, profileCorrectedFile)
        ex.add(profileCorrectedFile + ".gz",
               description=set_file_descr(fname + ".bedGraph.gz",
                                          groupId=gid,
                                          step=step,
                                          type="bedGraph",
                                          ucsc='1',
                                          gdv='1'))
    # Profile corrected, per replicate (might be removed)
    for gid, dict_gid in processed['4cseq']['profileCorrection'].iteritems():
        for rid, resfiles in dict_gid.iteritems():
            #        profileCorrectedFile = resfiles[0]
            reportProfileCorrection = resfiles[1]
            fname = "segToFrag_" + job.groups[gid][
                'name'] + "_profileCorrected_rep" + str(rid)
            #        gzipfile(ex,profileCorrectedFile)
            #       ex.add( profileCorrectedFile+".gz",
            #              description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))
            ex.add(reportProfileCorrection,
                   description=set_file_descr(fname + ".pdf",
                                              groupId=gid,
                                              step=step,
                                              type="pdf"))
    step = "smoothing"
    for gid, resfiles in processed['4cseq']['smooth_grp'].iteritems():
        rawSmoothFile = resfiles[0]
        smoothFile = resfiles[1]
        afterProfileCorrection = resfiles[2]
        nFrags = str(job.groups[gid]['window_size'])
        ## smoothed file before normalisation
        fname = "segToFrag_" + job.groups[gid][
            'name'] + "_smoothed_" + nFrags + "FragsPerWin.bedGraph.gz"
        gzipfile(ex, rawSmoothFile)
        ex.add(rawSmoothFile + ".gz",
               description=set_file_descr(fname,
                                          groupId=gid,
                                          step=step,
                                          type="bedGraph",
                                          ucsc='1',
                                          gdv='1'))
        ## smoothed file after normalisation, before Profile correction
        fname = "segToFrag_" + job.groups[gid][
            'name'] + "_norm_smoothed_" + nFrags + "FragsPerWin.bedGraph.gz"
        gzipfile(ex, smoothFile)
        ex.add(smoothFile + ".gz",
               description=set_file_descr(fname,
                                          groupId=gid,
                                          step=step,
                                          type="bedGraph",
                                          ucsc='1',
                                          gdv='1'))
        ## smoothed file after normalisation, after Profile correction
        fname = "segToFrag_" + job.groups[gid][
            'name'] + "_profileCorrected_smoothed_" + nFrags + "FragsPerWin.bedGraph.gz"
        gzipfile(ex, afterProfileCorrection)
        ex.add(afterProfileCorrection + ".gz",
               description=set_file_descr(fname,
                                          groupId=gid,
                                          step=step,
                                          type="bedGraph",
                                          ucsc='1',
                                          gdv='1'))

    step = "domainograms"
    for gid, resfiles in processed['4cseq']['domainogram_grp'].iteritems():
        tarFile = resfiles.pop()
        fname = job.groups[gid]['name'] + "_domainogram.tar.gz"
        ex.add(tarFile,
               description=set_file_descr(fname,
                                          groupId=gid,
                                          step=step,
                                          type="tgz"))
        for s in resfiles:
            if s[-8:] == "bedGraph":
                gzipfile(ex, s)
                s += ".gz"
                ex.add(s,
                       description=set_file_descr(s,
                                                  groupId=gid,
                                                  step=step,
                                                  type="bedGraph",
                                                  ucsc="1",
                                                  gdv="1"))

    step = "combined_results"
    gzipfile(ex, tableSmoothedRaw_grp)
    ex.add(tableSmoothedRaw_grp + ".gz",
           description=set_file_descr(
               "table_segToFrags_smoothed_combined_replicates.txt.gz",
               step=step,
               type="txt"))

    gzipfile(ex, tableSmoothed_grp)
    ex.add(tableSmoothed_grp + ".gz",
           description=set_file_descr(
               "table_normalised_smoothed_combined_replicates.txt.gz",
               step=step,
               type="txt"))

    gzipfile(ex, tableSmoothedPC_grp)
    ex.add(tableSmoothedPC_grp + ".gz",
           description=set_file_descr(
               "table_profileCorrected_smoothed_combined_replicates.txt.gz",
               step=step,
               type="txt"))

    gzipfile(ex, tablePC)
    ex.add(tablePC + ".gz",
           description=set_file_descr(
               "table_normalised_fit_per_replicates.txt.gz",
               step=step,
               type="txt"))

    gzipfile(ex, tableBRICKS2Frags)
    ex.add(tableBRICKS2Frags + ".gz",
           description=set_file_descr(
               "table_frags_in_BRICKS_combined_replicates.txt.gz",
               step=step,
               type="txt"))

    return processed
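All intermediate and final files are collected in the nested `processed` dictionary: per-replicate results are keyed by group id then run id, while the *_grp entries hold one merged file per group. A sketch of how the returned structure can be walked, assuming ex, job, primers_dict and assembly are already set up as elsewhere in the pipeline (Python 2, like the workflow itself):

processed = c4seq_workflow(ex, job, primers_dict, assembly)

for gid, per_run in processed['4cseq']['norm'].iteritems():
    for rid, normfile in per_run.iteritems():
        print "group %s, run %s -> normalised bedGraph %s" % (gid, rid, normfile)
    print "group %s -> merged normalised file: %s" % (gid, processed['4cseq']['norm_grp'][gid])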
Example #13
def dnaseseq_workflow(ex, job, assembly, logfile=sys.stdout, via='lsf'):
    """
    This workflow performs the following steps:

      * BAM files from replicates within the same group are merged
      * MACS is called to identify enriched regions (only the peak summit +/- 300 bp is used); this can be bypassed by providing a bed file for any group
      * Wellington is called to identify footprints within these enriched regions
      * If a list of motifs is provided (by group), footprints are scanned and motif occurrences (log-likelihood ratio > 0) are recorded in a bed file
      * Average DNAse profiles around motifs are plotted

    """
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    supdir = os.path.split(ex.remote_working_directory)[0]
    for gid, mapped in job.files.iteritems():
        group_name = job.groups[gid]['name']
        if not isinstance(mapped, dict):
            raise TypeError(
                "Files values must be dictionaries with keys *run_ids* or 'bam'."
            )
        if 'bam' in mapped: mapped = {'_': mapped}
        if len(mapped) > 1:
            bamfile = merge_bam(ex, [m['bam'] for m in mapped.values()])
            index = index_bam(ex, bamfile)
        else:
            bamfile = mapped.values()[0]['bam']
        if job.groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid, group_name))
        else:
            if os.path.exists(job.groups[gid].get('bedfile', 'null')):
                bedfile = job.groups[gid]['bedfile']
            elif os.path.exists(
                    os.path.join(supdir,
                                 job.groups[gid].get('bedfile', 'null'))):
                bedfile = os.path.join(supdir, job.groups[gid]['bedfile'])
            else:
                bedfile = None
            tests.append((bedfile, bamfile))
            names['tests'].append((gid, group_name))
    if len(controls) < 1:
        controls = [None]
        names['controls'] = [(0, None)]
    tests = macs_bedfiles(ex, assembly.chrmeta, tests, controls, names,
                          job.options.get('macs_args', ["--keep-dup", "10"]),
                          via, logfile)
    bedlist = run_wellington(ex, tests, names, assembly, via, logfile)
    ######################### Motif scanning / plotting
    if any([
            gr.get('motif') != 'null' and gr.get('motif')
            for gr in job.groups.values()
    ]):
        motifbeds = motif_scan(ex, bedlist, assembly, job.groups, via, logfile)
        siglist = dict((gid[0], []) for gid in names['tests'])
        for gid, mapped in job.files.iteritems():
            wig = []
            suffixes = ["fwd", "rev"]
            merge_strands = int(job.options.get('merge_strands', -1))
            read_extension = int(job.options.get('read_extension') or -1)
            make_wigs = merge_strands >= 0 or read_extension != 1
            for m in mapped.values():
                if make_wigs or not ('wig' in m) or len(m['wig']) < 2:
                    output = mapseq.parallel_density_sql(
                        ex,
                        m["bam"],
                        assembly.chrmeta,
                        nreads=m["stats"]["total"],
                        merge=-1,
                        read_extension=1,
                        convert=False,
                        b2w_args=[],
                        via=via)
                    wig.append(dict(
                        (s, output + s + '.sql') for s in suffixes))
                else:
                    wig.append(m['wig'])
            if len(wig) > 1:
                wig[0] = dict((s, merge_sql(ex, [x[s] for x in wig], via=via))
                              for s in suffixes)
            _trn = job.groups[gid]['name'] + "_%s"
            if job.groups[gid]['control']:
                for s, w in wig[0].iteritems():
                    for _g in siglist.keys():
                        siglist[_g].append(track(w, info={'name': _trn % s}))
            else:
                siglist[gid].extend([
                    track(w, info={'name': _trn % s})
                    for s, w in wig[0].iteritems()
                ])
        plot_files = plot_footprint_profile(ex, motifbeds, siglist,
                                            assembly.chrnames, job.groups,
                                            logfile)
        for gid, flist in plot_files.iteritems():
            gname = job.groups[gid]['name']
            plotall = unique_filename_in()
            touch(ex, plotall)
            ex.add(plotall,
                   description=set_file_descr(gname + '_footprints_plots',
                                              type='none',
                                              view='admin',
                                              step='motifs',
                                              groupId=gid))
            ex.add(flist['pdf'],
                   description=set_file_descr(gname + '_footprints_plots.pdf',
                                              type='pdf',
                                              step='motifs',
                                              groupId=gid),
                   associate_to_filename=plotall,
                   template='%s.pdf')
            tarname = unique_filename_in()
            tarfh = tarfile.open(tarname, "w:gz")
            for mname, matf in flist['mat']:
                tarfh.add(matf, arcname="%s_%s.txt" % (gname, mname))
            tarfh.close()
            ex.add(tarname,
                   description=set_file_descr(gname +
                                              '_footprints_plots.tar.gz',
                                              type='tar',
                                              step='motifs',
                                              groupId=gid),
                   associate_to_filename=plotall,
                   template='%s.tar.gz')
    logfile.write("\nDone.\n ")
    logfile.flush()
    return 0
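The first loop collects one merged BAM per group and records (gid, group_name) pairs in the `names` dictionary before MACS, Wellington and the optional motif scan are run. A hypothetical illustration of the bookkeeping structures built before calling macs_bedfiles (all values are made up for illustration):

# One control group and two test groups:
names = {'tests': [(1, 'dnase_rep_pool'), (2, 'dnase_treated')],
         'controls': [(3, 'input_dna')]}
tests = [('peaks_group1.bed', 'group1_merged.bam'),   # (optional bed file, merged bam)
         (None, 'group2_merged.bam')]                 # no bed file -> MACS defines the regions
controls = ['input_merged.bam']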
Example #14
def save_wellington(ex, wellout, chrmeta):
    bedlist = {}
    for name, wlist in wellout.iteritems():
        wellall = unique_filename_in()
        #### Dummy file
        touch(ex, wellall)
        ex.add(wellall,
               description=set_file_descr(name[1] + '_wellington_files',
                                          type='none',
                                          view='admin',
                                          step='footprints',
                                          groupId=name[0]))
        #### BED at FDR 1%
        bedlist[name[0]] = wellall + "FDR01.bed.gz"
        bedzip = gzip.open(bedlist[name[0]], 'wb')
        bedzip.write("track name='" + name[1] +
                     "_WellingtonFootprints_FDR_0.01'\n")
        for x in wlist:
            with open(os.path.join(*x) +
                      ".WellingtonFootprints.FDR.0.01.bed") as _bed:
                for l in _bed:
                    bedzip.write(l)
        bedzip.close()
        ex.add(wellall + "FDR01.bed.gz",
               description=set_file_descr(name[1] +
                                          '_WellingtonFootprintsFDR01.bed.gz',
                                          type='bed',
                                          ucsc='1',
                                          step='footprints',
                                          groupId=name[0]),
               associate_to_filename=wellall,
               template='%s_WellingtonFootprintsFDR01.bed.gz')
        #### BED at p-values [...]
        bedzip = gzip.open(wellall + "PvalCutoffs.bed.gz", 'wb')
        for bfile in os.listdir(os.path.join(wlist[0][0], "p_value_cutoffs")):
            cut = os.path.splitext(bfile[:-4])[1][1:]  # p-value cutoff token: strip '.bed' ([:-4]), then keep what follows the last '.' ([1:])
            bedzip.write("track name='" + name[1] +
                         "_WellingtonFootprints_Pval_%s'\n" % cut)
            for wdir, wpref in wlist:
                _bedpath = os.path.join(
                    wdir, "p_value_cutoffs",
                    wpref + ".WellingtonFootprints." + cut + ".bed")
                with open(_bedpath) as _bed:
                    for l in _bed:
                        bedzip.write(l)
        bedzip.close()
        ex.add(wellall + "PvalCutoffs.bed.gz",
               description=set_file_descr(
                   name[1] + '_WellingtonFootprintsPvalCutoffs.bed.gz',
                   type='bed',
                   ucsc='1',
                   step='footprints',
                   groupId=name[0]),
               associate_to_filename=wellall,
               template='%s_WellingtonFootprintsPvalCutoffs.bed.gz')
        #### WIG
        cat([os.path.join(*x) + ".WellingtonFootprints.wig" for x in wlist],
            wellall + ".wig")
        #convert(wellall+".wig", wellall+".bw", chrmeta=chrmeta)
        #ex.add(wellall+".bw",
        #       description=set_file_descr(name[1]+'_WellingtonFootprints.bw',
        #                                  type='bigWig', ucsc='1', step='footprints', groupId=name[0]),
        #       associate_to_filename=wellall, template='%s_WellingtonFootprints.bw')
        ex.add(wellall + ".wig",
               description=set_file_descr(name[1] +
                                          '_WellingtonFootprints.wig',
                                          type='wig',
                                          ucsc='1',
                                          step='footprints',
                                          groupId=name[0]),
               associate_to_filename=wellall,
               template='%s_WellingtonFootprints.wig')
    return bedlist
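
A minimal sketch of the gzip-and-concatenate pattern used by save_wellington above: several plain BED files are merged into one gzipped track preceded by a single UCSC 'track' line. The file names and the track name below are placeholders.

import gzip

def merge_beds_gzip(bed_paths, out_path, track_name):
    # Write one UCSC track line, then append every input BED verbatim.
    out = gzip.open(out_path, 'wb')
    out.write("track name='%s'\n" % track_name)
    for path in bed_paths:
        with open(path) as bed:
            for line in bed:
                out.write(line)
    out.close()

# e.g. merge_beds_gzip(["rep1.FDR.0.01.bed", "rep2.FDR.0.01.bed"],
#                      "sample_WellingtonFootprintsFDR01.bed.gz",
#                      "sample_WellingtonFootprints_FDR_0.01")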
예제 #15
0
def rnaseq_workflow(ex,
                    job,
                    pileup_level=["genes", "transcripts"],
                    via="lsf",
                    junctions=False,
                    stranded=False,
                    logfile=sys.stdout,
                    debugfile=sys.stderr):
    """Main function of the workflow.

    :rtype: int (0 on success)
    :param ex: a bein execution.
    :param job: a Frontend.Job object (or a dictionary of the same form); the genrep.Assembly is taken from ``job.assembly``.
    :param pileup_level: (list) features to quantify, among "genes" and "transcripts". [["genes","transcripts"]]
    :param junctions: (bool) whether to search for splice junctions using SOAPsplice. [False]
    :param stranded: (bool) whether to report sense and antisense counts separately (strand-specific protocol). [False]
    :param via: (str) send job via 'local' or 'lsf'. ["lsf"]
    """
    group_names = {}
    conditions = []
    groups = job.groups
    assembly = job.assembly
    assert len(groups) > 0, "No groups/runs were given."
    for gid, group in groups.iteritems():
        gname = str(group['name'])
        group_names[gid] = gname
    if isinstance(pileup_level, basestring): pileup_level = [pileup_level]

    # Define conditions as 'group_name.run_id' and store bamfiles in the same order
    bamfiles = []
    for gid, files in job.files.iteritems():
        k = 0
        for rid, f in files.iteritems():
            k += 1
            cond = group_names[gid] + '.' + str(k)
            conditions.append(cond)
            bamfiles.append(f['bam'])
    ncond = len(conditions)

    # Get the assembly's GTF
    # ...from fasta origin
    logfile.write("* Prepare GTF\n")
    logfile.flush()
    if hasattr(assembly, "fasta_origin"):
        logfile.write("  ... from fasta origin\n")
        logfile.flush()
        gtf = gtf_from_bam_header(bamfiles[0])
        descr = set_file_descr(gtf, type='txt', step='pileup', view='admin')
        ex.add(gtf, description=descr)
        pileup_level = ["transcripts"]
        if stranded:
            stranded = False
            logfile.write(
                "  ... Cannot exploit strand information from custom fasta reference.\n"
            )
            logfile.flush()
    # ... or from (wrong) mapping on the transcriptome
    elif assembly.intype == 2:
        logfile.write("  ... from mapping on the transcriptome\n")
        logfile.flush()
        gtf = transcriptome_gtf_from_genrep(assembly)
    # ... or from config file
    else:
        gtf = job.options.get('annot_file')
        if gtf and os.path.exists(os.path.join('..', gtf)):
            gtf = os.path.join('..', gtf)
            logfile.write("  ... from config file: %s\n" % gtf)
            logfile.flush()
        elif gtf and os.path.exists(gtf):
            gtf = os.path.abspath(gtf)
            logfile.write("  ... from config file: %s\n" % gtf)
            logfile.flush()
    # ... or from GenRep
        else:
            logfile.write("  ... from GenRep\n")
            logfile.flush()
            gtf = assembly.create_exome_gtf()
    #shutil.copy(gtf,"../")

    # Build controllers
    rnaseq_args = (ex, via, job, assembly, conditions, debugfile, logfile,
                   pileup_level, junctions, stranded)
    CNT = Counter(*rnaseq_args)
    DE = DE_Analysis(*rnaseq_args)
    PCA = Pca(*rnaseq_args)
    JN = Junctions(*rnaseq_args)

    # Count reads on genes, transcripts with "rnacounter"
    count_files = CNT.count_reads(bamfiles, gtf)

    def differential_analysis(counts_file, feature_type):
        #shutil.copy(counts_file, "../")
        diff_files = DE.differential_analysis(counts_file)
        if diff_files is not None:
            for diff in diff_files:
                # Remove first line
                diff_nohead = unique_filename_in()
                with open(diff) as f:
                    head = f.readline().strip()
                    with open(diff_nohead, "wb") as g:
                        for line in f:
                            g.write(line)
                oname = feature_type + "_differential_" + head + ".txt"
                desc = set_file_descr(oname, step='stats', type='txt', ucsc=0)
                ex.add(diff_nohead, description=desc)

    # DE and PCA
    if "genes" in pileup_level:
        # PCA of groups ~ gene expression
        description = set_file_descr("genes_expression.txt",
                                     step="pileup",
                                     type="txt",
                                     ucsc=0)
        ex.add(count_files['genes'], description=description)
        differential_analysis(count_files['genes'], "genes")
        if stranded:
            description = set_file_descr("genes_antisense_expression.txt",
                                         step="pileup",
                                         type="txt",
                                         ucsc=0)
            ex.add(count_files['genes_anti'], description=description)
            differential_analysis(count_files['genes_anti'], "genes_antisense")
        if ncond > 2:
            PCA.pca_rnaseq(count_files['genes'])

    if "transcripts" in pileup_level:
        description = set_file_descr("transcripts_expression.txt",
                                     step="pileup",
                                     type="txt",
                                     ucsc=0)
        ex.add(count_files['transcripts'], description=description)
        differential_analysis(count_files['transcripts'], "transcripts")
        if stranded:
            description = set_file_descr(
                "transcripts_antisense_expression.txt",
                step="pileup",
                type="txt",
                ucsc=0)
            ex.add(count_files['transcripts_anti'], description=description)
            differential_analysis(count_files['transcripts_anti'],
                                  "transcripts_antisense")

    # Find splice junctions
    if junctions:
        logfile.write("* Search for splice junctions\n")
        logfile.flush()
        JN.find_junctions()

    return 0
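
A hedged usage sketch for rnaseq_workflow above, assuming the bein/bbcflib driver pattern shown elsewhere in this collection (a MiniLIMS repository and a job parsed with frontend.parseConfig); the import paths, repository name and configuration file are assumptions.

import sys
from bein import execution, MiniLIMS       # assumed import paths
from bbcflib import frontend, genrep       # assumed import paths

M = MiniLIMS("rnaseq_minilims")                  # placeholder repository
job, gl = frontend.parseConfig("config.txt")     # placeholder configuration file
job.assembly = genrep.Assembly(assembly=job.assembly_id)
with execution(M, description=job.description) as ex:
    rnaseq_workflow(ex, job, pileup_level=["genes", "transcripts"],
                    via="local", junctions=False, stranded=False,
                    logfile=sys.stdout, debugfile=sys.stderr)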
예제 #16
0
파일: chipseq.py 프로젝트: bbcf/bbcflib
def chipseq_workflow( ex, job_or_dict, assembly, script_path='', logfile=sys.stdout, via='lsf' ):
    """Runs a chipseq workflow over bam files obtained by mapseq. Will optionally run ``macs`` and 'run_deconv'.

    :param ex: a 'bein' execution environment to run jobs in,

    :param job_or_dict: a 'Frontend' 'job' object, or a dictionary with keys 'groups', 'files' and 'options' if applicable,

    :param assembly: a genrep.Assembly object,

    :param script_path: only needed if 'run_deconv' is in the job options, must point to the location of the R scripts.

    Default ``macs`` parameters (overridden by ``job_or_dict['options']['macs_args']``) are set as follows:

    * ``'--bw'``: 200 ('bandwidth')

    * ``'-m'``: 10,100 ('minimum and maximum enrichments relative to background or control')

    The enrichment bounds will be computed from a Poisson threshold *T*, if available, as *(min(30,5*(T+1)),50*(T+1))*.

    Returns a dictionary with key ``'macs'``, plus ``'deconv'`` and ``'meme'`` when the corresponding options are enabled, mapping to the files and results produced at each step.
"""
    options = {}
    if logfile is None: logfile = sys.stdout
    if isinstance(job_or_dict,frontend.Job):
        options = job_or_dict.options
        groups = job_or_dict.groups
        mapseq_files = job_or_dict.files
    elif isinstance(job_or_dict,dict) and 'groups' in job_or_dict:
        if 'options' in job_or_dict:
            options = job_or_dict['options']
        groups = job_or_dict['groups']
        for gid in groups.keys():
            if not('name' in groups[gid]):
                groups[gid]['name'] = gid
        mapseq_files = job_or_dict.get('files',{})
    else:
        raise TypeError("job_or_dict must be a frontend. Job object or a dictionary with key 'groups'.")
    merge_strands = int(options.get('merge_strands',-1))
    suffixes = ["fwd","rev"]
    peak_deconvolution = options.get('peak_deconvolution',False)
    if isinstance(peak_deconvolution,basestring):
        peak_deconvolution = peak_deconvolution.lower() in ['1','true','t']
    run_meme = options.get('run_meme',False)
    if isinstance(run_meme,basestring):
        run_meme = run_meme.lower() in ['1','true','t']
    macs_args = options.get('macs_args',["--bw","200"])
    b2w_args = options.get('b2w_args',[])
    if not(isinstance(mapseq_files,dict)):
        raise TypeError("Mapseq_files must be a dictionary.")
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    read_length = []
    p_thresh = {}
    for gid,mapped in mapseq_files.iteritems():
        group_name = groups[gid]['name']
        if not(isinstance(mapped,dict)):
            raise TypeError("Mapseq_files values must be dictionaries with keys *run_ids* or 'bam'.")
        if 'bam' in mapped:
            mapped = {'_': mapped}
        futures = {}
        ptruns = []
        for k in mapped.keys():
            if not 'libname' in mapped[k]:
                mapped[k]['libname'] = group_name+"_"+str(k)
            if not 'stats' in mapped[k]:
                futures[k] = mapseq.bamstats.nonblocking( ex, mapped[k]["bam"], via=via )
            if mapped[k].get('poisson_threshold',-1)>0:
                ptruns.append(mapped[k]['poisson_threshold'])
        if len(ptruns)>0:
            p_thresh[group_name] = sum(ptruns)/len(ptruns)
        for k,f in futures.iteritems():
            mapped[k]['stats'] = f.wait()
        if len(mapped)>1:
            bamfile = mapseq.merge_bam(ex, [m['bam'] for m in mapped.values()])
        else:
            bamfile = mapped.values()[0]['bam']
        if groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid,group_name))
        else:
            tests.append(bamfile)
            names['tests'].append((gid,group_name))
            read_length.append(mapped.values()[0]['stats']['read_length'])
    genome_size = mapped.values()[0]['stats']['genome_size']
    if len(controls)<1:
        controls = [None]
        names['controls'] = [(0,None)]
    logfile.write("Starting MACS.\n");logfile.flush()
    processed = {'macs': add_macs_results( ex, read_length, genome_size,
                                           tests, ctrlbam=controls, name=names,
                                           poisson_threshold=p_thresh,
                                           macs_args=macs_args, via=via ) }
    logfile.write("Done MACS.\n");logfile.flush()
    peak_list = {}
    chrlist = assembly.chrmeta
## select only peaks with p-value <= 10^(-0.6) ~= 0.25, i.e. score = -10*log10(p) >= 6
    _select = {'score':(6,sys.maxint)}
    _fields = ['chr','start','end','name','score']
    for i,name in enumerate(names['tests']):
        if len(names['controls']) < 2:
            ctrl = (name,names['controls'][0])
            macsbed = track(processed['macs'][ctrl]+"_summits.bed",
                            chrmeta=chrlist, fields=_fields).read(selection=_select)
        else:
            macsbed = concatenate([apply(track(processed['macs'][(name,x)]+"_summits.bed",
                                         chrmeta=chrlist, fields=_fields).read(selection=_select),
                                         'name', lambda __n,_n=xn: "%s:%i" %(__n,_n))
                                   for xn,x in enumerate(names['controls'])])
        ##############################
        macs_neighb = neighborhood( macsbed, before_start=150, after_end=150 )
        peak_list[name] = unique_filename_in()+".sql"
        macs_final = track( peak_list[name], chrmeta=chrlist,
                            info={'datatype':'qualitative'},
                            fields=['start','end','name','score'] )
        macs_final.write(fusion(macs_neighb),clip=True)
        macs_final.close()
        ##############################

    merged_wig = {}
    options['read_extension'] = int(options.get('read_extension') or read_length[0])
    if options['read_extension'] < 1: options['read_extension'] = read_length[0]
    make_wigs = merge_strands >= 0 or options['read_extension']>100
    if options['read_extension'] > 100: options['read_extension'] = 50
    for gid,mapped in mapseq_files.iteritems():
#            if groups[gid]['control']: continue
        group_name = groups[gid]['name']
        wig = []
        for m in mapped.values():
            if make_wigs or not('wig' in m) or len(m['wig'])<2:
                output = mapseq.parallel_density_sql( ex, m["bam"], assembly.chrmeta,
                                                      nreads=m["stats"]["total"],
                                                      merge=-1, read_extension=options['read_extension'],
                                                      convert=False,
                                                      b2w_args=b2w_args, via=via )
                wig.append(dict((s,output+s+'.sql') for s in suffixes))
            else:
                wig.append(m['wig'])
        if len(wig) > 1:
            merged_wig[group_name] = dict((s,merge_sql(ex, [x[s] for x in wig], via=via))
                                          for s in suffixes)
        else:
            merged_wig[group_name] = wig[0]

    if peak_deconvolution:
        ##############################
        def _filter_deconv( stream, pval ):
            ferr = re.compile(r';FERR=([\d\.]+)$')
            return FeatureStream( ((x[0],)+((x[2]+x[1])/2-150,(x[2]+x[1])/2+150)+x[3:] 
                                   for x in stream 
                                   if "FERR=" in x[3] and float(ferr.search(x[3]).groups()[0]) <= pval), 
                                  fields=stream.fields )
        ##############################
        processed['deconv'] = {}
        for name in names['tests']:
            logfile.write(name[1]+" deconvolution.\n");logfile.flush()
            if len(names['controls']) < 2:
                ctrl = (name,names['controls'][0])
                macsbed = processed['macs'][ctrl]+"_peaks.bed"
            else:
                macsbed = intersect_many_bed( ex, [processed['macs'][(name,x)]+"_peaks.bed"
                                                   for x in names['controls']], via=via )
            deconv = run_deconv( ex, merged_wig[name[1]], macsbed, assembly.chrmeta,
                                 options['read_extension'], script_path, via=via )
            peak_list[name] = unique_filename_in()+".bed"
            trbed = track(deconv['peaks']).read()
            with track(peak_list[name], chrmeta=chrlist, fields=trbed.fields) as bedfile:
                bedfile.write(fusion(_filter_deconv(trbed,0.65)))
            ex.add(deconv['peaks'],
                   description=set_file_descr(name[1]+'_peaks.sql', type='sql',
                                              step='deconvolution', groupId=name[0]))
            ex.add(deconv['profile'],
                   description=set_file_descr(name[1]+'_deconv.sql', type='sql',
                                              step='deconvolution',  groupId=name[0]))
            bigwig = unique_filename_in()
            try:
                convert(deconv['profile'],(bigwig,"bigWig"))
                ex.add(bigwig,
                       description=set_file_descr(name[1]+'_deconv.bw', type='bigWig',
                                                  ucsc='1', step='deconvolution',
                                                  groupId=name[0]))
            except OSError as e:
                logfile.write(str(e));logfile.flush()
            ex.add(deconv['pdf'],
                   description=set_file_descr(name[1]+'_deconv.pdf', type='pdf',
                                              step='deconvolution', groupId=name[0]))
            processed['deconv'][name] = deconv

    ##############################
    def _join_macs( stream, xlsl, _f ):
        def _macs_row(_s):
            for _p in _s:
                for _n in _p[3].split("|"):
                    if len(xlsl) == 1:
                        nb = int(_n.split(";")[0][13:]) if _n[:3] == "ID=" else int(_n[10:])
                        yield _p+xlsl[0][nb-1][1:]
                    else:
                        nb = _n.split(";")[0][13:] if _n[:3] == "ID=" else _n[10:]
                        nb = nb.split(":")
                        yield _p+xlsl[int(nb[1])][int(nb[0])-1][1:]
        return FeatureStream( _macs_row(stream), fields=_f )
    ##############################
    peakfile_list = []
    for name, plist in peak_list.iteritems():
        ptrack = track(plist,chrmeta=chrlist,fields=["chr","start","end","name","score"])
        peakfile = unique_filename_in()
        xlsh, xlsl = parse_MACS_xls([processed['macs'][(name,_c)]+"_peaks.xls" for _c in names['controls']])
        try:
###### if assembly doesn't have annotations, we skip the "getNearestFeature" but still go through "_join_macs"
            assembly.gene_track()
            _fields = ['chr','start','end','name','score','gene','location_type','distance']\
                +["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#"+"\t".join(['chromosome','start','end','info','peak_height','gene(s)','location_type','distance']+_fields[8:]))
            for chrom in assembly.chrnames:
                _feat = assembly.gene_track(chrom)
                peakout.write(_join_macs(getNearestFeature(ptrack.read(selection=chrom),_feat),
                                         xlsl, _fields), mode='append')
        except ValueError:
            _fields = ['chr','start','end','name','score']+["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#"+"\t".join(['chromosome','start','end','info','peak_height']+_fields[8:]))
            for chrom in assembly.chrnames:
                peakout.write(_join_macs(ptrack.read(selection=chrom), xlsl, _fields), mode='append')
        peakout.close()
        gzipfile(ex,peakfile)
        peakfile_list.append(track(peakfile+".gz", format='txt', fields=_fields))
        ex.add(peakfile+".gz",
               description=set_file_descr(name[1]+'_annotated_peaks.txt.gz',type='text',
                                          step='annotation',groupId=name[0]))
    stracks = [track(wig,info={'name':name+"_"+st}) 
               for name,wigdict in merged_wig.iteritems() for st,wig in wigdict.iteritems()]
    tablefile = unique_filename_in()
    with open(tablefile,"w") as _tf:
        _pnames = ["MACS_%s_vs_%s" %(_s[1],_c[1]) if _c[1] else "MACS_%s" %_s[1]
                   for _s in names['tests'] for _c in names['controls']]
        _tf.write("\t".join(['#chromosome','start','end',]+_pnames+[s.name for s in stracks])+"\n")
#### need to do something about peak origin (split names, write to separate columns?)
    for chrom in assembly.chrnames:
        pk_lst = [apply(pt.read(chrom,fields=['chr','start','end','name']),
                        'name', lambda __n,_n=npt: "%s:%i" %(__n,_n))
                  for npt,pt in enumerate(peakfile_list)]
        features = fusion(concatenate(pk_lst, fields=['chr','start','end','name'], 
                                      remove_duplicates=True, group_by=['chr','start','end']))
        sread = [sig.read(chrom) for sig in stracks]
        quantifs = score_by_feature(sread, features, method='sum')
        nidx = quantifs.fields.index('name')
        _ns = len(tests)
        _nc = len(controls)
        with open(tablefile,"a") as _tf:
            for row in quantifs:
                pcols = ['']*_ns*_nc
                _rnsplit = row[nidx].split(":")
                _n1 = _rnsplit[0]
                _k = 0
                while ( _k < len(_rnsplit)-1-int(_nc>1) ):
                    if _nc > 1:
                        _k += 2
                        _n2 = _rnsplit[_k-1]
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])*_nc+int(_n2)] = _n1
                    else:
                        _k += 1
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])] = _n1
                    _n1 = "|".join(_n[1:])
                _tf.write("\t".join(str(tt) for tt in row[:nidx]+tuple(pcols)+row[nidx+1:])+"\n")
    gzipfile(ex,tablefile)
    ex.add(tablefile+".gz",
           description=set_file_descr('Combined_peak_quantifications.txt.gz',type='text',
                                      step='summary'))

    if run_meme:
        from bbcflib.motif import parallel_meme
        logfile.write("Starting MEME.\n");logfile.flush()
        processed['meme'] = parallel_meme( ex, assembly,
                                           peak_list.values(), name=peak_list.keys(),
                                           chip=True, meme_args=['-meme-nmotifs','4','-meme-mod','zoops'],
                                           via=via )
    return processed
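
A hedged sketch of the minimal dictionary form accepted for job_or_dict above, limited to the keys the function actually reads ('options', 'groups', 'files', and per-run 'bam'/'stats'); all paths and statistics are placeholders.

job_dict = {
    'options': {'merge_strands': -1, 'peak_deconvolution': False,
                'run_meme': False, 'macs_args': ["--bw", "200"], 'b2w_args': []},
    'groups': {1: {'name': 'IP',    'control': False},
               2: {'name': 'Input', 'control': True}},
    'files':  {1: {'run1': {'bam': 'ip.bam',
                            'stats': {'total': 1000000, 'read_length': 50,
                                      'genome_size': 3000000000}}},
               2: {'run1': {'bam': 'input.bam',
                            'stats': {'total': 900000, 'read_length': 50,
                                      'genome_size': 3000000000}}}},
}
# processed = chipseq_workflow(ex, job_dict, assembly, script_path='', via='local')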
예제 #17
0
파일: rnaseq.py 프로젝트: bbcf/bbcflib
    def count_reads(self, bamfiles, gtf):
        self.write_log("* Counting reads")

        # Count reads on genes, transcripts with "rnacounter"
        ncond = len(self.conditions)
        tablenames = [None]*ncond
        futures = [None]*ncond
        max_rlen = 0
        counter_options = ["--nh"]
        for bam in bamfiles:
            sam = pysam.Samfile(bam,'rb')
            max_rlen = max(max_rlen, sam.next().rlen)
            sam.close()
        counter_options += ["--exon_cutoff", str(max_rlen)]
        bwt_args = self.job.options.get('map_args',{}).get('bwt_args',[])
#        if not "--local" in bwt_args:
#            counter_options += ["--nh"]
        if hasattr(self.assembly,"fasta_origin") or self.assembly.intype==2:
            counter_options += ["--type","transcripts", "--method","raw"]
        else:
            counter_options += ["--type","genes,transcripts", "--method","raw,nnls"]
        if self.stranded:
            counter_options += ["--stranded"]
        for i,c in enumerate(self.conditions):
            tablenames[i] = unique_filename_in()
            futures[i] = rnacounter.nonblocking(self.ex, bamfiles[i], gtf, stdout=tablenames[i], via=self.via,
                               options=counter_options)

        # Put samples together
        for i,c in enumerate(self.conditions):
            try:
                futures[i].wait()
            except Exception as err:
                self.write_debug("Counting failed: %s." % str(err))
                raise err
            if futures[i] is None:
                self.write_debug("Counting failed.")
                raise ValueError("Counting failed.")
        if len(tablenames) > 1:
            joined = unique_filename_in()
            rnacounter_join.nonblocking(self.ex, tablenames, stdout=joined, via=self.via).wait()
        else:
            joined = tablenames[0]

        # Split genes and transcripts into separate files
        genes_filename = unique_filename_in()
        trans_filename = unique_filename_in()
        genes_file = open(genes_filename,"wb")
        trans_file = open(trans_filename,"wb")
        if self.stranded:
            genes_anti_filename = unique_filename_in()
            trans_anti_filename = unique_filename_in()
            genes_anti_file = open(genes_anti_filename,"wb")
            trans_anti_file = open(trans_anti_filename,"wb")
        with open(joined) as jfile:
            header = jfile.readline()
            hconds = ["counts."+c for c in self.conditions] + ["rpkm."+c for c in self.conditions]
            hinfo = header.strip().split('\t')[2*ncond+1:]
            header = '\t'.join(["ID"] + hconds + hinfo)+'\n'
            genes_file.write(header)
            trans_file.write(header)
            type_idx = header.split('\t').index("Type")
            if self.stranded:
                genes_anti_file.write(header)
                trans_anti_file.write(header)
                sense_idx = header.split('\t').index("Sense")
                for line in jfile:
                    L = line.split('\t')
                    ftype = L[type_idx].lower()
                    sense = L[sense_idx].lower()
                    if ftype == 'gene':
                        if sense == 'antisense':
                            genes_anti_file.write(line)
                        else:
                            genes_file.write(line)
                    elif ftype == 'transcript':
                        if sense == 'antisense':
                            trans_anti_file.write(line)
                        else:
                            trans_file.write(line)
            else:
                for line in jfile:
                    L = line.split('\t')
                    ftype = L[type_idx].lower()
                    if ftype == 'gene':
                        genes_file.write(line)
                    elif ftype == 'transcript':
                        trans_file.write(line)
        genes_file.close()
        trans_file.close()

        # Keep intermediate tables
        for i,c in enumerate(self.conditions):
            #shutil.copy(tablenames[i], "../counts%d.txt"%i)
            descr = set_file_descr(self.conditions[i]+'_'+tablenames[i]+'.gz', type='txt', step='pileup', view='admin')
            gzipfile(self.ex, tablenames[i])
            self.ex.add(tablenames[i]+'.gz', description=descr)

        if self.stranded:
            count_files = {'genes':genes_filename, 'transcripts':trans_filename,
                           'genes_anti':genes_anti_filename, 'transcripts_anti':trans_anti_filename}
        else:
            count_files = {'genes':genes_filename, 'transcripts':trans_filename}
        return count_files
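
The tables returned by count_reads start with an 'ID' column followed by one 'counts.<condition>' and one 'rpkm.<condition>' column per condition (see the header construction above). Below is a small, hedged sketch of reading such a table back into a dictionary; the file and identifier names are placeholders.

def read_expression_table(filename):
    # Map feature ID -> {column name: value} for a counts/rpkm table.
    table = {}
    with open(filename) as f:
        header = f.readline().rstrip('\n').split('\t')
        for line in f:
            fields = line.rstrip('\n').split('\t')
            table[fields[0]] = dict(zip(header[1:], fields[1:]))
    return table

# genes = read_expression_table("genes_expression.txt")
# genes['ENSG00000000003']['counts.Group1.1']   # hypothetical gene id and condition name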
예제 #18
0
def main(argv = None):
    via = "lsf"
    limspath = None
    hts_key = ''
    working_dir = None
    config_file = None
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts,args = getopt.getopt(argv[1:],"hu:k:d:w:c:",
                                      ["help","via=","key=","minilims=",
                                       "working-directory=","config="])
        except getopt.error, msg:
            raise Usage(msg)
        for o, a in opts:
            if o in ("-h", "--help"):
                print __doc__
                print usage
                return 0
            elif o in ("-u", "--via"):
                if a=="local":
                    via = "local"
                elif a=="lsf":
                    via = "lsf"
                else:
                    raise Usage("Via (-u) can only be \"local\" or \"lsf\", got %s." % (a,))
            elif o in ("-w", "--working-directory"):
                if os.path.exists(a):
                    os.chdir(a)
                    working_dir = a
                else:
                    raise Usage("Working directory '%s' does not exist." % a)
            elif o in ("-d", "--minilims"):
                limspath = a
            elif o in ("-k", "--key"):
                hts_key = a
            elif o in ("-c", "--config"):
                config_file = a
            else:
                raise Usage("Unhandled option: " + o)
        if not(limspath and os.path.exists(limspath)
               and (hts_key != None or (config_file and os.path.exists(config_file)))):
            raise Usage("Need a minilims and a job key or a configuration file")
        M = MiniLIMS( limspath )
        if len(hts_key)>1:
            gl = use_pickle(M, "global variables")
            htss = frontend.Frontend( url=gl['hts_mapseq']['url'] )
            job = htss.job( hts_key )
            [M.delete_execution(x) for x in M.search_executions(with_description=hts_key,fails=True)]
        elif os.path.exists(config_file):
            (job,gl) = frontend.parseConfig( config_file )
            hts_key = job.description
        else:
            raise ValueError("Need either a job key (-k) or a configuration file (-c).")
        g_rep = genrep.GenRep( url=gl["genrep_url"], root=gl["bwt_root"],
                               intype=job.options.get('input_type_id') or 0 )
        assembly = g_rep.assembly( job.assembly_id )
        if 'lims' in gl:
            dafl = dict((loc,daflims.DAFLIMS( username=gl['lims']['user'], password=pwd ))
                        for loc,pwd in gl['lims']['passwd'].iteritems())
        else:
            dafl = None
        if not('compute_densities' in job.options):
            job.options['compute_densities'] = True
        elif isinstance(job.options['compute_densities'],str):
            job.options['compute_densities'] = job.options['compute_densities'].lower() in ['1','true','t']
        if not('ucsc_bigwig' in job.options):
            job.options['ucsc_bigwig'] = True
        elif isinstance(job.options['ucsc_bigwig'],str):
            job.options['ucsc_bigwig'] = job.options['ucsc_bigwig'].lower() in ['1','true','t']
        job.options['ucsc_bigwig'] = job.options['ucsc_bigwig'] and job.options['compute_densities']
        if not('create_gdv_project' in job.options):
            job.options['create_gdv_project'] = False
        elif isinstance(job.options['create_gdv_project'],str):
            job.options['create_gdv_project'] = job.options['create_gdv_project'].lower() in ['1','true','t']
        if job.options.get('read_extension'):
            job.options['read_extension'] = int(job.options['read_extension'])
        if job.options.get('merge_strands'):
            job.options['merge_strands'] = int(job.options['merge_strands'])
        logfile = open(hts_key+".log",'w')
        with execution( M, description=hts_key, remote_working_directory=working_dir ) as ex:
            logfile.write("Enter execution, fetch fastq files.\n");logfile.flush()
            job = get_fastq_files( job, ex.working_directory, dafl )
            logfile.write("Map reads.\n");logfile.flush()
            mapped_files = map_groups( ex, job, ex.working_directory, assembly, {'via': via} )
            logfile.write("Make stats:\n");logfile.flush()
            for k,v in job.groups.iteritems():
                logfile.write(str(k)+str(v['name'])+"\t");logfile.flush()
                pdf = add_pdf_stats( ex, mapped_files,
                                     {k:v['name']},
                                     gl.get('script_path') or '',
                                     description=set_file_descr(v['name']+"_mapping_report.pdf",groupId=k,step='stats',type='pdf') )
            if job.options['compute_densities']:
                logfile.write("computing densities.\n");logfile.flush()
                if not(job.options.get('read_extension')>0):
                    job.options['read_extension'] = mapped_files.values()[0].values()[0]['stats']['read_length']
                density_files = densities_groups( ex, job, mapped_files, assembly.chromosomes, via=via )
                logfile.write("Finished computing densities.\n");logfile.flush()
                if job.options['create_gdv_project']:
                    logfile.write("Creating GDV project.\n");logfile.flush()
                    gdv_project = gdv.create_gdv_project( gl['gdv']['key'], gl['gdv']['email'],
                                                          job.description,
                                                          assembly.nr_assembly_id,
                                                          gdv_url=gl['gdv']['url'], public=True )
                    logfile.write("GDV project: "+str(gdv_project['project_id']+"\n"));logfile.flush()
                    add_pickle( ex, gdv_project, description=set_file_descr("gdv_json",step='gdv',type='py',view='admin') )
        allfiles = get_files( ex.id, M )
        if job.options['ucsc_bigwig'] and g_rep.intype == 0:
            ucscfiles = get_files( ex.id, M, select_param={'ucsc':'1'} )
            with open(hts_key+".bed",'w') as ucscbed:
                for ftype,fset in ucscfiles.iteritems():
                    for ffile,descr in fset.iteritems():
                        if re.search(r' \(.*\)',descr): continue
                        ucscbed.write(track_header(descr,ftype,gl['hts_mapseq']['download'],ffile))
        if job.options['create_gdv_project']:
            allfiles['url'] = {gdv_project['public_url']: 'GDV view'}
            download_url = gl['hts_mapseq']['download']
            [gdv.add_gdv_track( gl['gdv']['key'], gl['gdv']['email'],
                                gdv_project['project_id'],
                                url=download_url+str(k),
                                name = re.sub('\.sql','',str(f)),
                                gdv_url=gl['gdv']['url'] )
             for k,f in allfiles['sql'].iteritems()]
        logfile.close()
        print json.dumps(allfiles)
        with open(hts_key+".done",'w') as done:
            json.dump(allfiles,done)
        if 'email' in gl:
            r = email.EmailReport( sender=gl['email']['sender'],
                                   to=str(job.email),
                                   subject="Mapseq job "+str(job.description),
                                   smtp_server=gl['email']['smtp'] )
            r.appendBody('''
Your mapseq job has finished.

The description was:
'''+str(job.description)+'''
and its unique key is '''+hts_key+'''.

You can now retrieve the results at this url:
'''+gl['hts_mapseq']['url']+"jobs/"+hts_key+"/get_results")
            r.send()
        return 0
    except Usage, err:
        print >>sys.stderr, err.msg
        print >>sys.stderr, usage
        return 2
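
A hedged illustration of invoking this entry point programmatically; the first list element stands in for the program name (as in sys.argv), and the MiniLIMS path, job key and working directory are placeholders.

import sys

if __name__ == '__main__':
    sys.exit(main(["run_mapseq", "-d", "mapseq_minilims", "-k", "my_job_key",
                   "-w", "./scratch", "-u", "local"]))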
예제 #19
0
    def __call__(self,opts):
        self.opts = opts
        if os.path.exists(self.opts.wdir):
            os.chdir(self.opts.wdir)
        else:
            raise Usage("Working directory '%s' does not exist." %self.opts.wdir)

##### Connect to Minilims, recover global variables, fetch job info
        self.minilims = os.path.join(self.opts.basepath,self.name+"_minilims")
        M = MiniLIMS(self.minilims)
        if not((self.opts.key != None or (self.opts.config and os.path.exists(self.opts.config)))):
            raise Usage("Need a job key or a configuration file")
        if self.opts.key:
            self.globals = use_pickle(M, "global variables")
            htss = frontend.Frontend( url=self.globals['hts_mapseq']['url'] )
            self.job = htss.job( self.opts.key )
            [M.delete_execution(x) for x in \
                 M.search_executions(with_description=self.opts.key,fails=True)]
            if self.job.options.get("config_file"):
                if os.path.exists(self.job.options["config_file"]):
                    self.opts.config = os.path.abspath(self.job.options["config_file"])
                elif os.path.exists("config.txt"):
                    self.opts.config = os.path.abspath("config.txt")
            if self.opts.config and os.path.exists(self.opts.config):
                (self.job,self.globals) = frontend.parseConfig( self.opts.config, self.job, self.globals )
        elif os.path.exists(self.opts.config):
            (self.job,self.globals) = frontend.parseConfig( self.opts.config )
            self.opts.key = self.job.description
        else:
            raise Usage("Need either a job key (-k) or a configuration file (-c).")
##### Genrep instance
        if 'fasta_file' in self.job.options:
            if os.path.exists(self.job.options['fasta_file']):
                self.job.options['fasta_file'] = os.path.abspath(self.job.options['fasta_file'])
            else:
                for ext in (".fa",".fa.gz",".tar.gz"):
                    if os.path.exists("ref_sequence"+ext):
                        self.job.options['fasta_file'] = os.path.abspath("ref_sequence"+ext)
            if not os.path.exists(self.job.options['fasta_file']):
                raise Usage("Don't know where to find fasta file %s." %self.job.options["fasta_file"])
        g_rep = genrep.GenRep( url=self.globals.get("genrep_url"),
                               root=self.globals.get("bwt_root") )
##### Configure facility LIMS
        if 'lims' in self.globals:
            from bbcflib import daflims
            self.job.dafl = dict((loc,daflims.DAFLIMS( username=self.globals['lims']['user'],
                                                       password=pwd ))
                                 for loc,pwd in self.globals['lims']['passwd'].iteritems())
########################################################################
##########################  EXECUTION  #################################
########################################################################
##### Logging
        logfile_name = os.path.abspath(self.opts.key+".log")
        debugfile_name = os.path.abspath(self.opts.key+".debug")
        self.logfile = open(logfile_name,'w')
        self.debugfile = open(debugfile_name,'w')
        self.debug_write(json.dumps(self.globals)+"\n")
        with execution( M, description=self.opts.key,
                        remote_working_directory=self.opts.wdir ) as ex:
            self.log_write("Enter execution. Current working directory: %s" %ex.working_directory)
            self.job.assembly = genrep.Assembly( assembly=self.job.assembly_id,
                                                 genrep=g_rep,
                                                 fasta=self.job.options.get('fasta_file'),
                                                 annot=self.job.options.get('annot_file'),
                                                 intype=self.job.options.get('input_type_id',0),
                                                 ex=ex, via=self.opts.via,
                                                 bowtie2=self.job.options.get("bowtie2",True) )
##### Check all the options
            if not self.check_options():
                raise Usage("Problem with options %s" %self.opts)
            self.debug_write(json.dumps(self.job.options))
            self.init_files( ex )
##### Run workflow
            self.log_write("Starting workflow.")
            self.main_func(ex,**self.main_args)
##### Add logs to the LIMS in admin mode
            self.logfile.flush()
            self.debugfile.flush()
            log_desc = set_file_descr('logfile.txt', step='log', type='txt', view="admin")
            debug_desc = set_file_descr('debug.txt', step='log', type='txt', view="admin")
            ex.add(logfile_name, description=log_desc)
            ex.add(debugfile_name, description=debug_desc)
##### Create GDV project
            if self.job.options['create_gdv_project']: self.gdv_create(ex)

########################################################################
########################  POSTPROCESSING  ##############################
########################################################################
        allfiles = get_files( ex.id, M )
        if self.job.options['create_gdv_project'] and \
                self.job.options['gdv_project'].get('project',{}).get('id',0)>0:
            allfiles['url'] = self.gdv_upload(allfiles.get('sql',{}))
        self.logfile.close()
        self.debugfile.close()
        print json.dumps(allfiles)
        with open(self.opts.key+".done",'w') as done: json.dump(allfiles,done)
        self.send_email()
        return 0
예제 #20
0
def microbiome_workflow(ex, job, assembly, logfile=sys.stdout, via='lsf'):
    '''
    Main:

      * 0. retrieve bam files from mapseq job
      *   0.a. merge bam files (=> 1 bam file per group)
      * 1. for each group:
      *   1.a get counts per group (=> 1 file per group)
      *   1.b get counts per Level (Kingdom, Phylum, Class, Order, Family, Genus and Species) (=> 1 file per level / per group)
      * 2. combine counts
      *   2.a combine counts for all groups (=> 1 combined file)
      *   2.b combine counts per level for all groups (=> 1 combined file per Level)
      * 3. generate barplots (=> 1 plot per group + per level + per combined files)

    '''
    ### params
    levels = [
        'Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species'
    ]
    infosCols = {
        'Kingdom': [0, [1, 2]],
        'Phylum': [[0, 1], [2, 3]],
        'Class': [[0, 1, 2], [3, 4]],
        'Order': [[0, 1, 2, 3], [4, 5]],
        'Family': [[0, 1, 2, 3, 4], [5, 6]],
        'Genus': [[0, 1, 2, 3, 4, 5], [6, 7]],
        'Species': [[0, 1, 2, 3, 4, 5, 6], [7, 8]]
    }
    ### outputs
    processed = {'cnts': {}, 'cnts_level': {}, 'plots': {}}

    ### do it
    mapseq_files = job.files

    # 1.a get counts per group (=> 1 file per group)
    futures = {}
    for gid, group in job.groups.iteritems():
        group_name = group['name']
        bamfiles = [m['bam'] for m in mapseq_files[gid].values()]
        futures[gid] = run_microbiome.nonblocking(ex, [
            "bam_to_annot_counts", bamfiles, assembly.annotations_path,
            group_name
        ],
                                                  via=via,
                                                  memory=8)

    # 1.b get counts per Level (Kingdom, Phylum, Class, Order, Family, Genus and Species) (=> 1 file per level / per group)
    step = 'counts'
    for gid, future in futures.iteritems():
        res = future.wait()
        processed['cnts'][gid] = res  # group_name + "_counts_annot.txt"
        fname = job.groups[gid]['name'] + "_counts_annot.txt"
        ex.add(res,
               description=set_file_descr(fname,
                                          groupId=gid,
                                          step=step,
                                          type="txt"))
        processed['cnts_level'][gid] = [
            run_microbiome.nonblocking(ex, ["getCountsPerLevel", res, level],
                                       via=via,
                                       memory=8) for level in levels
        ]

    # 2.a combine counts for all groups (=> 1 combined file)
    files = processed['cnts'].values()
    combined_out = [
        run_microbiome.nonblocking(ex, ["combine_counts", files, 0, [1, 2]],
                                   via=via,
                                   memory=8)
    ]

    # 2.b combine counts per level for all groups (=> 1 combined file per Level)
    for n, level in enumerate(levels):
        files = dict([(gid, f[n].wait())
                      for gid, f in processed['cnts_level'].iteritems()])
        combined_out.append(
            run_microbiome.nonblocking(
                ex, ["combine_counts", files.values()] +
                infosCols.get(level, [0, [1, 2]]),
                via=via,
                memory=8))
        for gid, f in files.iteritems():
            fname = job.groups[gid]['name'] + "_counts_annot_" + level + ".txt"
            ex.add(f,
                   description=set_file_descr(fname,
                                              groupId=gid,
                                              step=step,
                                              type="txt"))

    step = 'combined'
    ex.add(combined_out[0].wait(),
           description=set_file_descr("combined_counts.txt",
                                      step=step,
                                      type="txt"))
    for nl, level in enumerate(levels):
        ex.add(combined_out[nl + 1].wait(),
               description=set_file_descr("combined_counts" + level + ".txt",
                                          step=step,
                                          type="txt"))
    return 0
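
A hedged illustration of how the infosCols mapping above is meant to be read when combining per-level tables: the first entry lists the annotation (identifier) columns, the second the count columns, e.g. [[0, 1], [2, 3]] for 'Phylum'. The helper below only sketches such a column-wise join and is not the actual run_microbiome script; the file names are placeholders.

def combine_counts_sketch(files, id_cols, count_cols):
    # Join tab-separated count tables on their id columns,
    # keeping one block of count columns per input file.
    if not isinstance(id_cols, (list, tuple)):
        id_cols = [id_cols]
    combined = {}
    for k, path in enumerate(files):
        with open(path) as f:
            f.readline()  # skip header
            for line in f:
                fields = line.rstrip('\n').split('\t')
                key = tuple(fields[i] for i in id_cols)
                row = combined.setdefault(key, [['0'] * len(count_cols) for _ in files])
                row[k] = [fields[i] for i in count_cols]
    return combined

# combined = combine_counts_sketch(["groupA_counts_annot_Phylum.txt",
#                                   "groupB_counts_annot_Phylum.txt"],
#                                  [0, 1], [2, 3])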
예제 #21
0
def parallel_meme( ex, assembly, regions, name=None, chip=False, meme_args=None, via='lsf' ):
    """Fetches sequences, then calls ``meme`` on them and finally saves the results in the repository.
    
    """
    if meme_args is None: meme_args = []
    if not(isinstance(regions,list)): regions = [regions]
    if not(isinstance(name,list)): name = [name or '_']
    futures = {}
    fasta_files = {}
    background = assembly.statistics(unique_filename_in(),frequency=True)
#    genomeRef = assembly.untar_genome_fasta()
    for i,n in enumerate(name):
        (fasta, size) = assembly.fasta_from_regions( regions[i], ex=ex )
        tmpfile = unique_filename_in()
        outdir = unique_filename_in()
        if chip:
            futures[n] = (outdir, memechip.nonblocking( ex, fasta, outdir, background,
                                                        args=meme_args, via=via, 
                                                        stderr=tmpfile, memory=6 ))
        else:
            futures[n] = (outdir, meme.nonblocking( ex, fasta, outdir, background,
                                                    maxsize=(size*3)/2, args=meme_args,
                                                    via=via, stderr=tmpfile, memory=6 ))
        fasta_files[n] = fasta
    all_res = {}
    for n,f in futures.iteritems():
        f[1].wait()
        meme_out = f[0]
        archive = unique_filename_in()
        tgz = tarfile.open(archive, "w:gz")
        tgz.add( meme_out, arcname=n[1]+"_meme",
                 exclude=lambda x: os.path.basename(x) in [fasta_files[n],background] )
        tgz.close()
        ex.add( archive, description=set_file_descr(n[1]+"_meme.tgz",
                                                    step='meme', type='tar',
                                                    groupId=n[0]) )
        gzipfile(ex,fasta_files[n],args=["-f"])
        ex.add( fasta_files[n]+".gz",
                description=set_file_descr(n[1]+"_sites.fa.gz",
                                           step='meme', type='fasta',
                                           groupId=n[0]) )
        if not(chip) and os.path.exists(os.path.join(meme_out, "meme.xml")):
            meme_res = parse_meme_xml( ex, os.path.join(meme_out, "meme.xml"),
                                       assembly.chrmeta )
            if os.path.exists(os.path.join(meme_out, "meme.html")):
                ex.add( os.path.join(meme_out, "meme.html"),
                        description=set_file_descr(n[1]+"_meme.html",
                                                   step='meme', type='html', 
                                                   groupId=n[0]) )
            ex.add( meme_res['sql'], description=set_file_descr(n[1]+"_meme_sites.sql",
                                                                step='meme', type='sql',
                                                                groupId=n[0]) )
            for i,motif in enumerate(meme_res['matrices'].keys()):
                ex.add( meme_res['matrices'][motif],
                        description=set_file_descr(n[1]+"_meme_"+motif+".txt",
                                                   step='meme', type='txt', 
                                                   groupId=n[0]) )
                ex.add( os.path.join(meme_out, "logo"+str(i+1)+".png"),
                        description=set_file_descr(n[1]+"_meme_"+motif+".png",
                                                   step='meme', type='png', 
                                                   groupId=n[0]) )
            all_res[n] = meme_res
    return all_res
예제 #22
0
def dnaseseq_workflow( ex, job, assembly, logfile=sys.stdout, via='lsf' ):
    """
    This workflow performs the following steps:

      * BAM files from replicates within the same group are merged
      * MACS is called to identify enriched regions (only the peak summit +- 300 bp is used); this can be bypassed by providing a bed file for any group
      * Wellington is called to identify footprints within these enriched regions
      * If a list of motifs is provided (by group), footprints are scanned and motif occurrences (log-likelihood ratio > 0) are recorded in a bed file
      * Average DNase profiles around motifs are plotted

    """
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    supdir = os.path.split(ex.remote_working_directory)[0]
    for gid,mapped in job.files.iteritems():
        group_name = job.groups[gid]['name']
        if not isinstance(mapped,dict):
            raise TypeError("Files values must be dictionaries with keys *run_ids* or 'bam'.")
        if 'bam' in mapped: mapped = {'_': mapped}
        if len(mapped)>1:
            bamfile = merge_bam(ex, [m['bam'] for m in mapped.values()])
            index = index_bam(ex, bamfile)
        else:
            bamfile = mapped.values()[0]['bam']
        if job.groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid,group_name))
        else:
            if os.path.exists(job.groups[gid].get('bedfile','null')):
                bedfile = job.groups[gid]['bedfile']
            elif os.path.exists(os.path.join(supdir,job.groups[gid].get('bedfile','null'))):
                bedfile = os.path.join(supdir,job.groups[gid]['bedfile'])
            else:
                bedfile = None
            tests.append((bedfile,bamfile))
            names['tests'].append((gid,group_name))
    if len(controls)<1:
        controls = [None]
        names['controls'] = [(0,None)]
    tests = macs_bedfiles( ex, assembly.chrmeta, tests, controls, names, 
                           job.options.get('macs_args',["--keep-dup","10"]), via, logfile )
    bedlist = run_wellington(ex, tests, names, assembly, via, logfile)
######################### Motif scanning / plotting
    if any([gr.get('motif') != 'null' and gr.get('motif') 
            for gr in job.groups.values()]):
        motifbeds = motif_scan( ex, bedlist, assembly, job.groups, via, logfile )
        siglist = dict((gid[0],[]) for gid in names['tests'])
        for gid,mapped in job.files.iteritems():
            wig = []
            suffixes = ["fwd","rev"]
            merge_strands = int(job.options.get('merge_strands',-1))
            read_extension = int(job.options.get('read_extension') or -1)
            make_wigs = merge_strands >= 0 or read_extension != 1
            for m in mapped.values():
                if make_wigs or not('wig' in m) or len(m['wig'])<2:
                    output = mapseq.parallel_density_sql( ex, m["bam"], assembly.chrmeta,
                                                          nreads=m["stats"]["total"],
                                                          merge=-1, read_extension=1,
                                                          convert=False,
                                                          b2w_args=[], via=via )
                    wig.append(dict((s,output+s+'.sql') for s in suffixes))
                else:
                    wig.append(m['wig'])
            if len(wig) > 1:
                wig[0] = dict((s,merge_sql(ex, [x[s] for x in wig], via=via)) 
                              for s in suffixes)
            _trn = job.groups[gid]['name']+"_%s"
            if job.groups[gid]['control']:
                for s,w in wig[0].iteritems():
                    for _g in siglist.keys():
                        siglist[_g].append(track(w,info={'name': _trn%s}))
            else:
                siglist[gid].extend([track(w,info={'name': _trn%s})
                                     for s,w in wig[0].iteritems()])
        plot_files = plot_footprint_profile( ex, motifbeds, siglist, 
                                             assembly.chrnames, 
                                             job.groups, logfile )
        for gid, flist in plot_files.iteritems():
            gname = job.groups[gid]['name']
            plotall = unique_filename_in()
            touch( ex, plotall )
            ex.add(plotall, description=set_file_descr(gname+'_footprints_plots', 
                                                       type='none', view='admin',
                                                       step='motifs', groupId=gid))
            ex.add(flist['pdf'], description=set_file_descr(gname+'_footprints_plots.pdf', 
                                                            type='pdf', step='motifs', 
                                                            groupId=gid),
                   associate_to_filename=plotall, template='%s.pdf')
            tarname = unique_filename_in()
            tarfh = tarfile.open(tarname, "w:gz")
            for mname,matf in flist['mat']:
                tarfh.add(matf, arcname="%s_%s.txt" % (gname,mname))
            tarfh.close()
            ex.add( tarname, description=set_file_descr(gname+'_footprints_plots.tar.gz',
                                                        type='tar', step='motifs', groupId=gid),
                    associate_to_filename=plotall, template='%s.tar.gz')
    logfile.write("\nDone.\n ");logfile.flush()
    return 0
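
A hedged sketch of the per-group inputs read by dnaseseq_workflow above: each group carries 'name' and 'control', optionally a precomputed 'bedfile' of enriched regions (which bypasses MACS for that group) and a 'motif' list used for footprint scanning; the paths are placeholders.

groups = {
    1: {'name': 'DNase_rep1', 'control': False,
        'bedfile': 'enriched_regions.bed',   # optional: bypasses MACS for this group
        'motif': ['CTCF_pwm.mat']},          # optional: motif file(s) to scan footprints with
    2: {'name': 'gDNA_input', 'control': True},
}
# job.groups.update(groups)
# dnaseseq_workflow(ex, job, assembly, via='local')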
예제 #23
0
파일: rnaseq.py 프로젝트: bbcf/bbcflib
def rnaseq_workflow(ex, job, pileup_level=["genes","transcripts"],
                    via="lsf", junctions=False, stranded=False,
                    logfile=sys.stdout, debugfile=sys.stderr):
    """Main function of the workflow.

    :rtype: int (0 on success)
    :param ex: a bein execution.
    :param job: a Frontend.Job object (or a dictionary of the same form); the genrep.Assembly is taken from ``job.assembly``.
    :param pileup_level: (list) features to quantify, among "genes" and "transcripts". [["genes","transcripts"]]
    :param junctions: (bool) whether to search for splice junctions using SOAPsplice. [False]
    :param stranded: (bool) whether to report sense and antisense counts separately (strand-specific protocol). [False]
    :param via: (str) send job via 'local' or 'lsf'. ["lsf"]
    """
    group_names={}; conditions=[]
    groups = job.groups
    assembly = job.assembly
    assert len(groups) > 0, "No groups/runs were given."
    for gid,group in groups.iteritems():
        gname = str(group['name'])
        group_names[gid] = gname
    if isinstance(pileup_level,basestring): pileup_level=[pileup_level]

    # Define conditions as 'group_name.run_id' and store bamfiles in the same order
    bamfiles = []
    for gid,files in job.files.iteritems():
        k = 0
        for rid,f in files.iteritems():
            k+=1
            cond = group_names[gid]+'.'+str(k)
            conditions.append(cond)
            bamfiles.append(f['bam'])
    ncond = len(conditions)

    # Get the assembly's GTF
    # ...from fasta origin
    logfile.write("* Prepare GTF\n"); logfile.flush()
    if hasattr(assembly,"fasta_origin"):
        logfile.write("  ... from fasta origin\n"); logfile.flush()
        gtf = gtf_from_bam_header(bamfiles[0])
        descr = set_file_descr(gtf, type='txt', step='pileup', view='admin')
        ex.add(gtf, description=descr)
        pileup_level = ["transcripts"]
        if stranded:
            stranded=False
            logfile.write("  ... Cannot exploit strand information from custom fasta reference.\n"); logfile.flush()
    # ... or from (wrong) mapping on the transcriptome
    elif assembly.intype==2:
        logfile.write("  ... from mapping on the transcriptome\n"); logfile.flush()
        gtf = transcriptome_gtf_from_genrep(assembly)
    # ... or from config file
    else:
        gtf = job.options.get('annot_file')
        if gtf and os.path.exists(os.path.join('..', gtf)):
            gtf = os.path.join('..', gtf)
            logfile.write("  ... from config file: %s\n" % gtf); logfile.flush()
        elif gtf and os.path.exists(gtf):
            gtf = os.path.abspath(gtf)
            logfile.write("  ... from config file: %s\n" % gtf); logfile.flush()
        # ... or from GenRep
        else:
            logfile.write("  ... from GenRep\n"); logfile.flush()
            gtf = assembly.create_exome_gtf()
    #shutil.copy(gtf,"../")

    # Build controllers
    rnaseq_args = (ex,via,job,assembly,conditions,debugfile,logfile,
                   pileup_level,junctions,stranded)
    CNT = Counter(*rnaseq_args)
    DE = DE_Analysis(*rnaseq_args)
    PCA = Pca(*rnaseq_args)
    JN = Junctions(*rnaseq_args)

    # Count reads on genes, transcripts with "rnacounter"
    count_files = CNT.count_reads(bamfiles, gtf)

    def differential_analysis(counts_file, feature_type):
        #shutil.copy(counts_file, "../")
        diff_files = DE.differential_analysis(counts_file)
        if diff_files is not None:
            for diff in diff_files:
                # Remove first line
                diff_nohead = unique_filename_in()
                with open(diff) as f:
                    head = f.readline().strip()
                    with open(diff_nohead, "wb") as g:
                        for line in f: g.write(line)
                oname = feature_type + "_differential_"+ head + ".txt"
                desc = set_file_descr(oname, step='stats', type='txt', ucsc=0)
                ex.add(diff_nohead, description=desc)

    # DE and PCA
    if "genes" in pileup_level:
        # PCA of groups ~ gene expression
        description = set_file_descr("genes_expression.txt", step="pileup", type="txt", ucsc=0)
        ex.add(count_files['genes'], description=description)
        differential_analysis(count_files['genes'], "genes")
        if stranded:
            description = set_file_descr("genes_antisense_expression.txt", step="pileup", type="txt", ucsc=0)
            ex.add(count_files['genes_anti'], description=description)
            differential_analysis(count_files['genes_anti'], "genes_antisense")
        if ncond > 2:
            PCA.pca_rnaseq(count_files['genes'])

    if "transcripts" in pileup_level:
        description = set_file_descr("transcripts_expression.txt", step="pileup", type="txt", ucsc=0)
        ex.add(count_files['transcripts'], description=description)
        differential_analysis(count_files['transcripts'], "transcripts")
        if stranded:
            description = set_file_descr("transcripts_antisense_expression.txt", step="pileup", type="txt", ucsc=0)
            ex.add(count_files['transcripts_anti'], description=description)
            differential_analysis(count_files['transcripts_anti'], "transcripts_antisense")

    # Find splice junctions
    if junctions:
        logfile.write("* Search for splice junctions\n"); logfile.flush()
        JN.find_junctions()

    return 0
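
A small, self-contained illustration of the 'group_name.run_id' condition naming used above; the group and file dictionaries here are invented stand-ins for job.groups and job.files, kept only to show the structure the loop expects.

# Hypothetical stand-ins for job.groups and job.files (structure only).
groups = {1: {'name': 'KO'}, 2: {'name': 'WT'}}
files = {1: {11: {'bam': 'ko_run1.bam'}, 12: {'bam': 'ko_run2.bam'}},
         2: {21: {'bam': 'wt_run1.bam'}}}

conditions, bamfiles = [], []
for gid, runs in files.items():
    for k, (rid, f) in enumerate(runs.items(), start=1):
        # Conditions are named '<group_name>.<k>', k counting runs within each group.
        conditions.append("%s.%d" % (groups[gid]['name'], k))
        bamfiles.append(f['bam'])

print(conditions)   # e.g. ['KO.1', 'KO.2', 'WT.1']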
Example #24
0
File: chipseq.py Project: bbcf/bbcflib
def add_macs_results( ex, read_length, genome_size, bamfile,
                      ctrlbam=None, name=None, poisson_threshold=None,
                      alias=None, macs_args=None, via='lsf' ):
    """Calls the ``macs`` function on each possible pair
    of test and control bam files and adds
    the respective outputs to the execution repository.

    ``macs`` options can be controlled with `macs_args`.
    If a dictionary of Poisson thresholds for each sample is given, the enrichment bounds ('-m' option)
    are computed from them; otherwise the default is '-m 10,100'.

    Returns the set of file prefixes.
    """
    if not(isinstance(bamfile,list)):
        bamfile = [bamfile]
    if not(isinstance(ctrlbam,list)):
        ctrlbam = [ctrlbam]
    if poisson_threshold is None:
        poisson_threshold = {}
    if macs_args is None:
        macs_args = []
    futures = {}
    rl = read_length
    for i,bam in enumerate(bamfile):
        n = name['tests'][i]
        if poisson_threshold.get(n)>0:
            low = (poisson_threshold.get(n)+1)*5
            enrich_bounds = str(min(30,low))+","+str(10*low)
        else:
            enrich_bounds = "10,100"
        if not("-m" in macs_args): macs_args += ["-m",enrich_bounds]
        if isinstance(read_length,list): rl = read_length[i]
        for j,cam in enumerate(ctrlbam):
            m = name['controls'][j]
            nm = (n,m)
            futures[nm] = macs.nonblocking( ex, rl, genome_size, bam, cam,
                                            args=macs_args, via=via, memory=12 )
    prefixes = {}
    for n,f in futures.iteritems():
        p = f.wait()
        prefixes[n] = p
        macs_descr0 = {'step':'macs','type':'none','view':'admin','groupId':n[0][0]}
        macs_descr1 = {'step':'macs','type':'xls','groupId':n[0][0]}
        macs_descr2 = {'step':'macs','type':'bed','groupId':n[0][0],'ucsc':'1'}
        filename = "_vs_".join([x[1] for x in n if x[0]])
        touch( ex, p )
        ex.add( p, description=set_file_descr(filename,**macs_descr0),
                alias=alias )
        ex.add( p+"_peaks.xls",
                description=set_file_descr(filename+"_peaks.xls",**macs_descr1),
                associate_to_filename=p, template='%s_peaks.xls' )
        bedzip = gzip.open(p+"_peaks.bed.gz",'wb')
        bedzip.write("track name='"+filename+"_macs_peaks'\n")
        with open(p+"_peaks.bed") as bedinf:
            [bedzip.write(l) for l in bedinf]
        bedzip.close()
        ex.add( p+"_peaks.bed.gz",
                description=set_file_descr(filename+"_peaks.bed.gz",**macs_descr2),
                associate_to_filename=p, template='%s_peaks.bed.gz' )
        bedzip = gzip.open(p+"_summits.bed.gz",'wb')
        bedzip.write("track name='"+filename+"_macs_summits'\n")
        with open(p+"_summits.bed") as bedinf:
            [bedzip.write(l) for l in bedinf]
        bedzip.close()
        ex.add( p+"_summits.bed.gz",
                description=set_file_descr(filename+"_summits.bed.gz",**macs_descr2),
                associate_to_filename=p, template='%s_summits.bed.gz' )
        if n[1][0]:
            ex.add( p+"_negative_peaks.xls",
                    description=set_file_descr(filename+"_negative_peaks.xls",**macs_descr0),
                    associate_to_filename=p, template='%s_negative_peaks.xls' )
    return prefixes
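
The way a per-sample Poisson threshold is turned into MACS '-m' enrichment bounds in the loop above can be checked in isolation; this helper just restates that arithmetic and is not part of the original module.

def enrichment_bounds(poisson_threshold):
    """Mirror the '-m' logic above: low = 5*(T+1), lower bound capped at 30,
    upper bound 10*low; fall back to the MACS default '10,100' otherwise."""
    if poisson_threshold and poisson_threshold > 0:
        low = (poisson_threshold + 1) * 5
        return "%i,%i" % (min(30, low), 10 * low)
    return "10,100"

# A threshold of 4 gives low = 25, hence '-m 25,250'; a threshold of 10 gives low = 55, capped to '30,550'.
assert enrichment_bounds(4) == "25,250"
assert enrichment_bounds(10) == "30,550"
assert enrichment_bounds(None) == "10,100"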
Example #25
0
File: rnaseq.py Project: bbcf/bbcflib
    def find_junctions(self, soapsplice_index=None, path_to_soapsplice=None, soapsplice_options={}):
        """
        Retrieves unmapped reads from a previous mapping and runs SOAPsplice on them.
        Returns the name of a .bed track indicating the junction positions, as well as
        that of a BAM file of the alignments supporting the junctions.

        :param soapsplice_index: (str) path to the SOAPsplice index.
        :param path_to_soapsplice: (str) specify the path to the program if it is not in your $PATH.
        :param soapsplice_options: (dict) SOAPsplice options, e.g. {'-m':2}.
        :rtype: str, str
        """

        @program
        def soapsplice(unmapped_R1, unmapped_R2, index, output=None, path_to_soapsplice=None, options={}):
            """Bind 'soapsplice'. Return a text file containing the list of junctions.

            :param unmapped_R1: (str) path to the fastq file containing the 'left' reads.
            :param unmapped_R2: (str) path to the fastq file containing the 'right' reads.
            :param index: (str) path to the SOAPsplice index.
            :param output: (str) output file name.
            :param path_to_soapsplice: (str) path to the SOAPsplice executable.
                If not specified, the program must be in your $PATH.
            :param options: (dict) SOAPsplice options, given as {opt: value}.
            :rtype: str

            Main options::

            -p: number of threads, <= 20. [1]
            -S: 1: forward strand, 2: reverse strand, 3: both. [3]
            -m: maximum mismatch for one-segment alignment, <= 5. [3]
            -g: maximum indel for one-segment alignment, <= 2. [2]
            -i: length of tail that can be ignored in one-segment alignment. [7]
            -t: longest gap between two segments in two-segment alignment. [500000]
            -a: shortest length of a segment in two-segment alignment. [8]
            -q: input quality type in FASTQ file (0: old Illumina, 1: Sanger). [0]
            -L: maximum distance between paired-end reads. [500000]
            -l: minimum distance between paired-end reads. [50]
            -I: insert length of paired-end reads.
            """
            if not output: output = unique_filename_in()
            path_to_soapsplice = path_to_soapsplice or 'soapsplice'
            args = [path_to_soapsplice,'-d',index,'-1',unmapped_R1,'-2',unmapped_R2,'-o',output,'-f','2']
            opts = []
            for k,v in options.iteritems(): opts.extend([str(k),str(v)])
            return {"arguments": args+opts, "return_value": output}

        if not program_exists('soapsplice'):
            self.write_debug("Skipped junctions search: soapsplice not found.")
            return
        self.assembly.set_index_path(intype=3)
        soapsplice_index = soapsplice_index or self.assembly.index_path
        soapsplice_options.update(self.job.options.get('soapsplice_options',{}))
        soapsplice_options.setdefault('-p',16) # number of threads
        soapsplice_options.setdefault('-q',1)  # Sanger format
        unmapped_fastq = {}
        for gid, group in self.job.groups.iteritems():
            unmapped_fastq[gid] = []
            for rid, run in group['runs'].iteritems():
                unmapped = self.job.files[gid][rid].get('unmapped_fastq')
                if not unmapped:
                    self.write_log("No unmapped reads found for group %s, run %d. Skip." % (gid,rid))
                    continue
                elif not isinstance(unmapped,tuple):
                    self.write_log("Pair-end reads required. Skip.")
                    continue
                unmapped_fastq[gid].append(unmapped)
            if len(unmapped_fastq[gid]) == 0:
                continue
            R1 = cat(zip(*unmapped_fastq[gid])[0])
            R2 = cat(zip(*unmapped_fastq[gid])[1])
            future = soapsplice.nonblocking(self.ex,R1,R2,soapsplice_index,
                                            path_to_soapsplice=path_to_soapsplice,
                                            options=soapsplice_options,
                                            via=self.via, memory=8, threads=soapsplice_options['-p'])
            try:
                template = future.wait()
            except Exception as err:
                self.write_debug("SOAPsplice failed: %s." % str(err))
                return
            if template is None:
                self.write_debug("SOAPsplice failed.")
                return
            junc_file = template+'.junc'
            bed = self.convert_junc_file(junc_file,self.assembly)
            bed_descr = set_file_descr('junctions_%s.bed' % group['name'],
                                       groupId=gid,type='bed',step='junctions', ucsc=1)
            bam_descr = set_file_descr('junctions_%s.bam' % group['name'],
                                       groupId=gid,type='bam',step='junctions', ucsc=0)
            sam = template+'.sam'
            try:
                bam = sam_to_bam(self.ex,sam,reheader=self.assembly.name)
                add_and_index_bam(self.ex, bam, description=bam_descr)
                self.ex.add(bam, description=bam_descr)
            except Exception as e:
                self.write_debug("%s\n(Qualities may be in the wrong format, try with '-q 0'.)" %str(e))
            self.ex.add(bed, description=bed_descr)
        return bed, bam
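
For clarity, this is how the nested soapsplice binding assembles its command line from an options dictionary; the helper below only restates that logic with placeholder paths and is not part of the original class.

def soapsplice_argv(index, unmapped_R1, unmapped_R2, output,
                    options=None, path_to_soapsplice=None):
    """Rebuild the argument list the same way as the binding above."""
    exe = path_to_soapsplice or 'soapsplice'
    args = [exe, '-d', index, '-1', unmapped_R1, '-2', unmapped_R2,
            '-o', output, '-f', '2']
    opts = []
    for k, v in (options or {}).items():
        opts.extend([str(k), str(v)])
    return args + opts

# {'-p': 16, '-q': 1} is appended as ['-p', '16', '-q', '1'] after the fixed arguments.
print(soapsplice_argv('genome.index', 'R1.fastq', 'R2.fastq', 'junc_out',
                      options={'-p': 16, '-q': 1}))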
Example #26
0
File: snp.py Project: JoseEspinosa/bbcflib
def snp_workflow(ex, job, assembly, minsnp=40., mincov=5, path_to_ref=None, via='local',
                 logfile=sys.stdout, debugfile=sys.stderr):
    """Main function of the workflow"""
    ref_genome = assembly.fasta_by_chrom
    sample_names = [job.groups[gid]['name'] for gid in sorted(job.files.keys())]

    logfile.write("\n* Generate vcfs for each chrom/group\n"); logfile.flush()
    vcfs = dict((chrom,{}) for chrom in ref_genome.keys()) # {chr: {}}
    bams = {}
    # Launch the jobs
    for gid in sorted(job.files.keys()):
        # Merge all bams belonging to the same group
        runs = [r['bam'] for r in job.files[gid].itervalues()]
        bam = Samfile(runs[0])
        header = bam.header
        headerfile = unique_filename_in()
        for h in header["SQ"]:
            if h["SN"] in assembly.chrmeta:
                h["SN"] = assembly.chrmeta[h["SN"]]["ac"]
        head = Samfile( headerfile, "wh", header=header )
        head.close()
        if len(runs) > 1:
            _b = merge_bam(ex,runs)
            index_bam(ex,_b)
            bams[gid] = _b
        else:
            bams[gid] = runs[0]
        # Samtools mpileup + bcftools + vcfutils.pl
        for chrom,ref in ref_genome.iteritems():
            vcf = unique_filename_in()
            vcfs[chrom][gid] = (vcf,
                                pileup.nonblocking(ex, bams[gid], ref, header=headerfile,
                                                   via=via, stdout=vcf))
        logfile.write("  ...Group %s running.\n" %job.groups[gid]['name']); logfile.flush()
    # Wait for vcfs to finish and store them in *vcfs[chrom][gid]*
    for gid in sorted(job.files.keys()):
        for chrom,ref in ref_genome.iteritems():
            vcfs[chrom][gid][1].wait()
            vcfs[chrom][gid] = vcfs[chrom][gid][0]
        logfile.write("  ...Group %s done.\n" %job.groups[gid]['name']); logfile.flush()
    # Targz the pileup files (vcf)
    tarname = unique_filename_in()
    tarfh = tarfile.open(tarname, "w:gz")
    for chrom,v in vcfs.iteritems():
        for gid,vcf in v.iteritems():
            tarfh.add(vcf, arcname="%s_%s.vcf" % (job.groups[gid]['name'],chrom))
    tarfh.close()
    ex.add( tarname, description=set_file_descr("vcfs_files.tar.gz",step="pileup",type="tar",view='admin') )

    logfile.write("\n* Merge info from vcf files\n"); logfile.flush()
    outall = unique_filename_in()
    outexons = unique_filename_in()
    with open(outall,"w") as fout:
        fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+ \
                                 ['gene','location_type','distance'])+'\n')
    with open(outexons,"w") as fout:
        fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+['exon','strand','ref_aa'] \
                                  + ['new_aa_'+s for s in sample_names])+'\n')
    msa_table = dict((s,'') for s in [assembly.name]+sample_names)
    for chrom,v in vcfs.iteritems():
        logfile.write("  > Chromosome '%s'\n" % chrom); logfile.flush()
        # Put together info from all vcf files
        logfile.write("  - All SNPs\n"); logfile.flush()
        allsnps = all_snps(ex,chrom,vcfs[chrom],bams,outall,assembly,
                           sample_names,mincov,float(minsnp),logfile,debugfile)
        # Annotate SNPs and check synonymy
        logfile.write("  - Exonic SNPs\n"); logfile.flush()
        exon_snps(chrom,outexons,allsnps,assembly,sample_names,ref_genome,logfile,debugfile)
        for snprow in allsnps:
            for n,k in enumerate([assembly.name]+sample_names):
                msa_table[k] += snprow[3+n][0]
    description = set_file_descr("allSNP.txt",step="SNPs",type="txt")
    ex.add(outall,description=description)
    description = set_file_descr("exonsSNP.txt",step="SNPs",type="txt")
    ex.add(outexons,description=description)
    msafile = unique_filename_in()
    with open(msafile,"w") as msa:
        msa.write(" %i %i\n"%(len(msa_table),len(msa_table.values()[0])))
        for name,seq in msa_table.iteritems():
            msa.write("%s\t%s\n" %(name,seq))
    msa_table = {}
    description = set_file_descr("SNPalignment.txt",step="SNPs",type="txt")
    ex.add(msafile,description=description)
    # Create UCSC bed tracks
    logfile.write("\n* Create tracks\n"); logfile.flush()
    create_tracks(ex,outall,sample_names,assembly)
    # Create quantitative tracks
    logfile.write("\n* Create heteroz. and quality tracks\n"); logfile.flush()

    def _process_pileup(pileups, seq, startpos, endpos):
        atoi = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
        vectors = ([],[],[])
        for pileupcolumn in pileups:
            position = pileupcolumn.pos
            if position < startpos: continue
            if position >= endpos: break
            coverage = pileupcolumn.n
            ref_symbol = seq[position-startpos]
            ref = atoi.get(ref_symbol, 4)
            symbols = [0,0,0,0,0]
            quality = 0
            for pileupread in pileupcolumn.pileups:
                symbols[atoi.get(pileupread.alignment.seq[pileupread.qpos], 4)] += 1
                quality += ord(pileupread.alignment.qual[pileupread.qpos])-33
            quality = float(quality)/coverage
            info = heterozygosity(ref, symbols[0:4])
            if coverage > 0: vectors[0].append((position, position+1, coverage))
            if info > 0: vectors[1].append((position, position+1, info))
            if quality > 0: vectors[2].append((position, position+1, quality))
#            yield (position, position+1, coverage, info, quality)
        return vectors

    if job.options.get('make_bigwigs',False):
        _descr = {'groupId':0,'step':"tracks",'type':"bigWig",'ucsc':'1'}
        for gid,bamfile in bams.iteritems():
            _descr['groupId'] = gid
            bamtr = track(bamfile,format="bam")
            covname = unique_filename_in()+".bw"
            out_cov = track(covname, chrmeta=assembly.chrmeta)
            hetname = unique_filename_in()+".bw"
            out_het = track(hetname, chrmeta=assembly.chrmeta)
            qualname = unique_filename_in()+".bw"
            out_qual = track(qualname, chrmeta=assembly.chrmeta)
            for chrom, cinfo in assembly.chrmeta.iteritems():
                fasta = Fastafile(ref_genome[chrom])
                #process fasta and bam by 10Mb chunks
                for chunk in range(0,cinfo["length"],10**7):
                    fastaseq = fasta.fetch(cinfo['ac'], chunk, chunk+10**7)
                    vecs = _process_pileup(bamtr.pileup(chrom, chunk, chunk+10**7), fastaseq, chunk, chunk+10**7)
                    out_cov.write(vecs[0], fields=['start','end','score'], chrom=chrom)
                    out_het.write(vecs[1], fields=['start','end','score'], chrom=chrom)
                    out_qual.write(vecs[2], fields=['start','end','score'], chrom=chrom)
            out_cov.close()
            out_het.close()
            out_qual.close()
            description = set_file_descr(job.groups[gid]['name']+"_coverage.bw",**_descr)
            ex.add(covname,description=description)
            description = set_file_descr(job.groups[gid]['name']+"_heterozygosity.bw",**_descr)
            ex.add(hetname,description=description)
            description = set_file_descr(job.groups[gid]['name']+"_quality.bw",**_descr)
            ex.add(qualname,description=description)

    return 0
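
The bigWig generation above walks each chromosome in 10 Mb windows; a minimal sketch of that chunking (the workflow relies on pysam clamping fetches at the chromosome end, here the clamp is made explicit):

def chunk_windows(length, size=10**7):
    """Yield the half-open (start, end) windows covering [0, length),
    as used for the per-chromosome pileup/fasta fetches."""
    for start in range(0, length, size):
        yield (start, min(start + size, length))

# A 25 Mb chromosome is processed in three chunks.
assert list(chunk_windows(25 * 10**6)) == [(0, 10**7), (10**7, 2 * 10**7), (2 * 10**7, 25 * 10**6)]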
Example #27
0
File: c4seq.py Project: bbcf/bbcflib
def c4seq_workflow( ex, job, primers_dict, assembly,
                    c4_url=None, script_path='', logfile=sys.stdout, via='lsf' ):
    '''
    Main function of the 4C-seq workflow:

    * open the 4C-seq minilims and create the execution
    * 0. get/create the library
    * 1. if necessary, compute the density file from the bam file (mapseq.parallel_density_sql)
    * 2. compute the counts per fragment for each density file (gfminer's score_by_feature)
    '''

    mapseq_files = job.files
### outputs
    processed = {'lib': {}, 'density': {}, '4cseq': {}}
    processed['4cseq'] = {'density_files' : {},
                          'countsPerFrag' : {},
                          'countsPerFrag_grp' : {},
                          'norm' : {},
                          'norm_grp' : {},
                          'profileCorrection': {},
                          'profileCorrection_grp' : {},
                          'smooth_grp' : {},
                          'domainogram_grp' : {},
                          'bricks2frags' : {}}
                            # was 'smoothFrag': {}, 'domainogram': {}}
    regToExclude = {}
    new_libs=[]
### options
    run_domainogram = {}
    before_profile_correction = {}
    if not job.options.get('viewpoints_chrs',False):
        out_chromosomes = ','.join([ch for ch in assembly.chrnames])
    else:
        out_chromosomes = ','.join([primers_dict.get(group['name'],{}).get('baitcoord').split(':')[0] for gid,group in job.groups.iteritems()])
    print "out_chromosomes=" + out_chromosomes + "\n"

    sizeExt = job.options.get('norm_reg',1000000)
    print "region considered for normalisation: mid viewpoint +/-" + str(sizeExt) + 'bps'

### do it
    for gid, group in job.groups.iteritems():
        run_domainogram[gid] = group.get('run_domainogram',False)
        if isinstance(run_domainogram[gid],basestring):
            run_domainogram[gid] = (run_domainogram[gid].lower() in ['1','true','on','t'])
        before_profile_correction[gid] = group.get('before_profile_correction',False)
        if isinstance(before_profile_correction[gid],basestring):
            before_profile_correction[gid] = (before_profile_correction[gid].lower() in ['1','true','on','t'])
        processed['lib'][gid] = get_libForGrp(ex, group, assembly,
                                              new_libs, gid, c4_url, via=via)
#reffile='/archive/epfl/bbcf/data/DubouleDaan/library_Nla_30bps/library_Nla_30bps_segmentInfos.bed'
        processed['4cseq']['density_files'][gid] = {}
        regToExclude[gid] = primers_dict.get(group['name'],{}).get('regToExclude',"").replace('\r','')

        # if no regToExclude defined, set it as mid_baitCoord +/-5kb
        if len(regToExclude[gid])==0 :
            baitcoord_mid = int(0.5 * (int(primers_dict.get(group['name'],{}).get('baitcoord').split(':')[1].split('-')[0]) + int(primers_dict.get(group['name'],{}).get('baitcoord').split(':')[1].split('-')[1]) ))
            regToExclude[gid] = primers_dict.get(group['name'],{}).get('baitcoord').split(':')[0] + ':' + str(baitcoord_mid-5000) + '-' + str(baitcoord_mid+5000)

        #print(';'.join([k+"="+v for k,v in primers_dict.get(group['name'],{}).iteritems()]))
        print(primers_dict.get(group['name'],{}))
        print "regToExclude["+str(gid)+"]="+regToExclude[gid]
        for rid,run in group['runs'].iteritems():
            libname = mapseq_files[gid][rid]['libname']
            if job.options.get('merge_strands') != 0 or not('wig' in mapseq_files[gid][rid]):
                density_file=parallel_density_sql( ex, mapseq_files[gid][rid]['bam'],
                                                   assembly.chrmeta,
                                                   nreads=mapseq_files[gid][rid]['stats']["total"],
                                                   merge=0,
                                                   read_extension=mapseq_files[gid][rid]['stats']['read_length'],
                                                   convert=False,
                                                   via=via )
                density_file += "merged.sql"
                ex.add( density_file,
                        description=set_file_descr("density_file_"+libname+".sql",
                                                   groupId=gid,step="density",type="sql",view='admin',gdv="1") )
            else:
                density_file = mapseq_files[gid][rid]['wig']['merged']
            #density_files.append(density_file)
            processed['4cseq']['density_files'][gid][rid]=density_file

        # back to grp level!
        # not anymore:
        # processed['density'][gid] = merge_sql(ex, density_files, via=via)

    processed['4cseq']['countsPerFrag'] = density_to_countsPerFrag( ex, processed, job.groups, assembly, regToExclude, script_path, via )
    ## access per gid+rid

    futures_norm = {}
    countsPerFrags_bedGraph = {}
    futures_merged_raw = {}
    for gid, group in job.groups.iteritems():
        futures_norm[gid] = {}
        countsPerFrags_bedGraph[gid] = {}
        processed['4cseq']['norm'][gid] = {}
        for rid,run in group['runs'].iteritems():
            normfile = unique_filename_in()
            touch(ex, normfile)
            resfile = unique_filename_in()+".bedGraph"
            resfiles = processed['4cseq']['countsPerFrag'][gid][rid] # _all.sql
            convert(resfiles[3],resfile)
            countsPerFrags_bedGraph[gid][rid] = resfile

            print "call normFrags: infiles="+resfile+", normfile="+normfile+"baitCoord="+primers_dict[group['name']]['baitcoord']+", sizeExt=sizeExt, name="+ group['name']+"rep_"+str(rid) + "regToExclude="+regToExclude[gid]+"\n"
            futures_norm[gid][rid] = normFrags.nonblocking( ex, resfile, normfile, baitCoord=primers_dict[group['name']]['baitcoord'], sizeExt=sizeExt, name=group['name']+"rep_"+str(rid) ,regToExclude=regToExclude[gid], script_path=script_path, via=via )
            processed['4cseq']['norm'][gid][rid] = normfile

        if len(group) > 1:
            ## merge replicates before normalisation.
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName=group['name']+"_raw_mergedRep"
            print "gid="+group['name']
            print "call mergeRep for replicates before normalisation: infiles="+",".join([res_rid for rid,res_rid in countsPerFrags_bedGraph[gid].iteritems()])+", mergedfile="+mergefile+", regToExclude="+regToExclude[gid]+"\n"
            futures_merged_raw[gid] = mergeRep.nonblocking( ex, ",".join([res_rid for rid,res_rid in countsPerFrags_bedGraph[gid].iteritems()]), mergefile, regToExclude[gid], name=titleName, script_path=script_path, via=via , memory= 8)
            processed['4cseq']['countsPerFrag_grp'][gid] = mergefile
        else:
            futures_merged_raw[gid] = None
            processed['4cseq']['countsPerFrag_grp'][gid] = countsPerFrags_bedGraph[gid][0] #if no replicates, then the file we want is the 1st one

    print "***** profile correction / sample + merge normalised data"
    futures_merged = {} # per gid
    futures_profcor = {} # per gid, per rid
    for gid, group in job.groups.iteritems():
        ## run profile correction per run then merge them
        futures_profcor[gid] = {}
        processed['4cseq']['profileCorrection'][gid] = {}
        for rid, run in group['runs'].iteritems():
            # wait for normalisation of all replicates to be finished
            futures_norm[gid][rid].wait() ## normalised files, per grp, per rep
            normfile = processed['4cseq']['norm'][gid][rid]
            file1 = unique_filename_in() #track file
            touch(ex,file1)
            file2 = unique_filename_in() #report file
            touch(ex,file2)
            file3 = unique_filename_in() #table file
            touch(ex, file3)
            print "call profileCorrection: normfile="+normfile+", baitCoord="+primers_dict[group['name']]['baitcoord']+", name="+group['name']+", file1="+file1+", file2="+file2+", file3= "+file3+"\n"
            futures_profcor[gid][rid] = profileCorrection.nonblocking( ex, normfile,
                                        primers_dict[group['name']]['baitcoord'],
                                        group['name'], file1, file2, file3, script_path,
                                        via=via )
            processed['4cseq']['profileCorrection'][gid][rid] = [file1, file2, file3]

        ## merge replicates before profile correction. Needs all normalisation for the given grp to be finished, this is why it comes after the rid loop.
        if len(group)>1:
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName=group['name']+"_norm_mergedRep"
            print "gid="+group['name']
            print "call mergeRep: infiles="+",".join([res_rid for rid,res_rid in processed['4cseq']['norm'][gid].iteritems()])+", mergedfile="+mergefile+", regToExclude="+regToExclude[gid]+"\n"
            futures_merged[gid] = mergeRep.nonblocking( ex, ",".join([res_rid for rid,res_rid in processed['4cseq']['norm'][gid].iteritems()]), mergefile, regToExclude[gid], name=titleName, script_path=script_path, via=via , memory= 8)
            processed['4cseq']['norm_grp'][gid] = mergefile
        else:
            futures_merged[gid] = None
            processed['4cseq']['norm_grp'][gid] = processed['4cseq']['norm'][gid][0] ##if no replicates, then the file we want is the 1st one

    print "***** merge profile corrected data"
    futures_profcor_merged = {} # per gid
    for gid, group in job.groups.iteritems():
        processed['4cseq']['profileCorrection_grp'][gid] = {}
        for rid, run in group['runs'].iteritems():
            futures_profcor[gid][rid].wait()   ## wait for ProfileCorrection to be finished

        ## merge replicates after profile correction
        if len(group)>1:
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName=group['name']+"_ProfCor_mergedRep"
            pcfiles = [ processed['4cseq']['profileCorrection'][gid][rid][0] for rid,res_rid in processed['4cseq']['profileCorrection'][gid].iteritems()]
            print "call mergeRep (for PC tables): infiles="+",".join(pcfiles)+", mergedfile="+mergefile+", regToExclude="+regToExclude[gid]+"\n"
            futures_profcor_merged[gid] = mergeRep.nonblocking( ex, ",".join(pcfiles), mergefile, regToExclude[gid], name=titleName, script_path=script_path, via=via , memory= 8)
            processed['4cseq']['profileCorrection_grp'][gid] = mergefile
        else:
            futures_profcor_merged[gid] = None
            processed['4cseq']['profileCorrection_grp'][gid] = processed['4cseq']['profileCorrection'][gid][0] ##if no replicates, then the file we want is the 1st one


    print "***** smooth data"
    futures_smoothed = {}
    for gid, group in job.groups.iteritems():
        file1 = unique_filename_in()
        touch(ex,file1)
        file2 = unique_filename_in()
        touch(ex, file2)
        file3 = unique_filename_in()
        touch(ex, file3)
        nFragsPerWin = group['window_size']
        futures_merged_raw[gid].wait() ## wait for merging of raw_grp to be completed
        futures_smoothed[gid] = ( smoothFragFile.nonblocking( ex, processed['4cseq']['countsPerFrag_grp'][gid], nFragsPerWin, group['name'],
                                                    file1, regToExclude[gid], script_path=script_path, via=via, memory=6 ), )
        futures_merged[gid].wait() ## wait for merging of norm_grp to be completed
        futures_smoothed[gid] += ( smoothFragFile.nonblocking( ex, processed['4cseq']['norm_grp'][gid], nFragsPerWin, group['name']+"_norm",
                                                    file2, regToExclude[gid], script_path=script_path, via=via, memory=6 ), )
        futures_profcor_merged[gid].wait() # wait for the merging of profile corrected data to be done
        futures_smoothed[gid] += ( smoothFragFile.nonblocking( ex, processed['4cseq']['profileCorrection_grp'][gid], nFragsPerWin, group['name']+"_fromProfileCorrected",
                                                    file3, regToExclude[gid], script_path=script_path, via=via, memory=6 ), )
        processed['4cseq']['smooth_grp'][gid] = [file1,file2,file3] #[smoothed_file_before_Norm, smoothed file before PC, smoothed file after PC]

    print "***** Domainograms"
    futures_domainograms = {}
    for gid, group in job.groups.iteritems():
        grName = job.groups[gid]['name']
        if run_domainogram[gid]:
            regCoord = regToExclude[gid] or primers_dict[grName]['baitcoord']
            if before_profile_correction[gid]:
                futures_domainograms[gid] = runDomainogram.nonblocking( ex, processed['4cseq']['norm_grp'][gid],
                                                                        grName, regCoord=regCoord, skip=1,
                                                                        script_path=script_path, via=via, memory=15 )
            else:
                futures_domainograms[gid] = runDomainogram.nonblocking( ex, processed['4cseq']['profileCorrection_grp'][gid],
                                                                            grName, regCoord=regCoord.split(':')[0], skip=1,
                                                                            script_path=script_path, via=via, memory=15 )

    ## prepare tar files for domainogram results (if any)
    ## and create "BRICKS to frags" files
    print "***** BRICKS to Frags"
    futures_BRICKS2Frags = {}
    for gid, f in futures_domainograms.iteritems():
        if run_domainogram[gid]: # if domainogram has been run
            resFiles = []
            logFile = f.wait()
            start = False
            tarname = job.groups[gid]['name']+"_domainogram.tar.gz"
            res_tar = tarfile.open(tarname, "w:gz")
            futures_BRICKS2Frags[gid] = []
            processed['4cseq']['bricks2frags'][gid] = []
            if logFile is None: continue
            with open(logFile) as f:
                for s in f:
                    s = s.strip()
                    if '####resfiles####' in s:
                        start = True
                    elif start and "RData" not in s:
                        resFiles.append(s)
                        res_tar.add(s)
                    if start and "foundBRICKS" in s:
                        bricks2fragsfile = unique_filename_in()+".bedGraph"
                        touch(ex, bricks2fragsfile)
                        futures_BRICKS2Frags[gid] += [ BRICKSToFrag.nonblocking(ex, s, processed['4cseq']['norm_grp'][gid], bricks2fragsfile, script_path=script_path, via=via, memory=4 ) ]
                        processed['4cseq']['bricks2frags'][gid] += [ bricks2fragsfile ]
            res_tar.close()
            processed['4cseq']['domainogram_grp'][gid] = resFiles + [tarname]




############### prepare tables for global results
    print "***** combine results into tables "
    allNames=[]
    allFiles=[]
    allRegToExclude=[]
    for gid, group in job.groups.iteritems():
        for rid,run in group['runs'].iteritems():
            allNames += [ group['name']+"_rep"+str(rid)+"_norm", group['name']+"_rep"+str(rid)+"_fit" ]
            allFiles += [ processed['4cseq']['profileCorrection'][gid][rid][2] ]
            allRegToExclude += [ regToExclude[gid] ]
    tablePC=unique_filename_in()+".txt"
    print("***will call makeTable with:")
    print(",".join(allFiles))
    print("resfile="+tablePC)
    print(",".join(allNames))
    touch(ex,tablePC)

    #regToExclude[gid]

    futures_tables = (makeTable.nonblocking(ex, ",".join(allFiles), tablePC, ",".join(allNames), idCols="4,5", all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8 ), )

    # wait for all smoothing to be done
    for gid, fg in futures_smoothed.iteritems():
        for f in fg: f.wait()

    ## make Table raw/smoothed_raw
    print("** make Table raw/smoothed_raw")
    allNames=[]
    allFiles=[]
    allRegToExclude=[]
    for gid, group in job.groups.iteritems():
        futures_merged_raw[gid].wait()
        allNames += [ group['name']+"_raw", group['name']+"_rawSmoothed" ]
        allFiles += [ processed['4cseq']['countsPerFrag_grp'][gid], processed['4cseq']['smooth_grp'][gid][0] ]
        allRegToExclude += [ 'NA', regToExclude[gid] ]

    tableSmoothedRaw_grp=unique_filename_in()+".txt"
    touch(ex,tableSmoothedRaw_grp)
    futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableSmoothedRaw_grp, ",".join(allNames), idCols="4", out_chromosomes = out_chromosomes, all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8 ), )

    ## make Table norm/smoothed_norm before PC
    print("** make Table norm/smoothed_norm befor PC")
    allNames=[]
    allFiles=[]
    allRegToExclude=[]
    for gid, group in job.groups.iteritems():
        allNames += [ group['name']+"_norm", group['name']+"_smoothed" ]
        allFiles += [ processed['4cseq']['norm_grp'][gid], processed['4cseq']['smooth_grp'][gid][1] ]
        allRegToExclude += [ regToExclude[gid], regToExclude[gid] ]

    tableSmoothed_grp=unique_filename_in()+".txt"
    touch(ex,tableSmoothed_grp)
    futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableSmoothed_grp, ",".join(allNames), idCols="4", out_chromosomes = out_chromosomes, all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8 ), )

    ## make Table norm/smoothed_norm after PC
    print("** make Table norm/smoothed_norm after PC")
    allNames=[]
    allFiles=[]
    allRegToExclude=[]
    for gid, group in job.groups.iteritems():
        allNames += [ group['name']+"_normPC", group['name']+"_smoothedPC" ]
        allFiles += [ processed['4cseq']['profileCorrection_grp'][gid], processed['4cseq']['smooth_grp'][gid][2] ]
        allRegToExclude += [ regToExclude[gid], regToExclude[gid] ]

    tableSmoothedPC_grp=unique_filename_in()+".txt"
    touch(ex,tableSmoothedPC_grp)
    futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableSmoothedPC_grp, ",".join(allNames), idCols="4", out_chromosomes = out_chromosomes, all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8 ), )

    ## combine BRICKS2Frags files
    allNames=[]
    allFiles=[]
    for gid, fg in futures_BRICKS2Frags.iteritems():
        for f in fg: f.wait()
        allNames += [ job.groups[gid]['name']+"_BRICKSpval" ]
        cat_bricks2frags = unique_filename_in()+".txt"
        print ','.join(processed['4cseq']['bricks2frags'][gid])
        cat_bricks2frags = cat(processed['4cseq']['bricks2frags'][gid],out=cat_bricks2frags)
        allFiles += [ cat_bricks2frags ]

    for gid, fg in futures_smoothed.iteritems():
        for f in fg: f.wait()

    tableBRICKS2Frags = unique_filename_in()+".txt"
    touch(ex,tableBRICKS2Frags)
    futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableBRICKS2Frags, ",".join(allNames), idCols="4", out_chromosomes = out_chromosomes, defVal="NA", script_path=script_path, via=via, memory=8 ), )


    for f in futures_tables: f.wait()


################ Add everything to minilims below!
    step = "density"
    for gid in processed['4cseq']['density_files'].keys():
        for rid, sql in processed['4cseq']['density_files'][gid].iteritems():
            fname = "density_file_"+job.groups[gid]['name']+"_merged_rep"+str(rid)
            ex.add( sql, description=set_file_descr( fname+".sql",
                                                 groupId=gid,step=step,type="sql",gdv="1" ) )
            wig = unique_filename_in()+".bw"
            convert( sql, wig )
            ex.add( wig, description=set_file_descr( fname+".bw",
                                                 groupId=gid,step=step,type="bigWig",ucsc="1") )
    step = "counts_per_frag" #was _norm_counts_per_frags # before normalisation process, per replicate
    for gid in processed['4cseq']['countsPerFrag'].keys():
        for rid, resfiles in processed['4cseq']['countsPerFrag'][gid].iteritems():
            fname = "meanScorePerFeature_"+job.groups[gid]['name']+"_rep"+str(rid)
            ex.add( resfiles[1], description=set_file_descr( fname+".sql",
                                                             groupId=gid,step=step,type="sql",view="admin",gdv='1'))
            #gzipfile(ex,resfiles[0])
            #ex.add( resfiles[0]+".gz", description=set_file_descr( fname+".bed.gz",
            #                                                       groupId=gid,step=step,type="bed",view="admin" ))
            fname = "segToFrag_"+job.groups[gid]['name']+"_rep"+str(rid)
            ex.add( resfiles[3], description=set_file_descr( fname+"_all.sql",
                                                             groupId=gid,step=step,type="sql",
                                                             comment="all informative frags - null included" ))
            trsql = track(resfiles[3])
            bwig = unique_filename_in()+".bw"
            trwig = track(bwig,chrmeta=trsql.chrmeta)
            trwig.write(trsql.read(fields=['chr','start','end','score'],
                                   selection={'score':(0.01,sys.maxint)}))
            trwig.close()
            ex.add( bwig, set_file_descr(fname+".bw",groupId=gid,step=step,type="bigWig",ucsc='1'))
        ## add segToFrags before normalisation
        futures_merged_raw[gid].wait()
        trbedgraph = track(removeNA(processed['4cseq']['countsPerFrag_grp'][gid]),format='bedgraph')
        bwig = unique_filename_in()+".bw"
        trwig = track(bwig,chrmeta=assembly.chrmeta)
        trwig.write(trbedgraph.read(fields=['chr','start','end','score'],
                               selection={'score':(0.01,sys.maxint)}))
        trwig.close()
        fname = "segToFrag_"+job.groups[gid]['name']
        ex.add( bwig, description=set_file_descr( fname+".bw",
                                                             groupId=gid,step=step,type="bigWig",
                                                             comment="segToFrag file before normalisation" ))

    step = "norm_counts_per_frags"  # after new normalisation process, combined replicates
    for gid, resfile in processed['4cseq']['norm_grp'].iteritems():
        fname = "normalised_scorePerFeature_"+job.groups[gid]['name']
        gzipfile(ex,resfile)
        ex.add( resfile+".gz", description=set_file_descr( fname+".bedGraph.gz", groupId=gid,step=step, type="bedGraph",ucsc='1'))
    # norm files, per replicates (might be removed)
    for gid, dict_gid in processed['4cseq']['norm'].iteritems():
        for rid, resfile in dict_gid.iteritems():
            fname = "normalised_scorePerFeature_"+job.groups[gid]['name']+"_rep"+str(rid)
            gzipfile(ex,resfile)
            ex.add(resfile+".gz",
                    description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))
    step = "profile_correction" # Profile corrected data, combined replicates
    for gid, profileCorrectedFile in processed['4cseq']['profileCorrection_grp'].iteritems():
        fname = "segToFrag_"+job.groups[gid]['name']+"_profileCorrected"
        gzipfile(ex,profileCorrectedFile)
        ex.add( profileCorrectedFile+".gz",
                description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))
    # Profile corrected, per replicate (might be removed)
    for gid, dict_gid in processed['4cseq']['profileCorrection'].iteritems():
        for rid, resfiles in dict_gid.iteritems():
    #        profileCorrectedFile = resfiles[0]
            reportProfileCorrection = resfiles[1]
            fname = "segToFrag_"+job.groups[gid]['name']+"_profileCorrected_rep"+str(rid)
    #        gzipfile(ex,profileCorrectedFile)
     #       ex.add( profileCorrectedFile+".gz",
      #              description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))
            ex.add( reportProfileCorrection, description=set_file_descr(fname+".pdf",
                                                                    groupId=gid,step=step,type="pdf"))
    step = "smoothing"
    for gid, resfiles in processed['4cseq']['smooth_grp'].iteritems():
        rawSmoothFile = resfiles[0]
        smoothFile = resfiles[1]
        afterProfileCorrection = resfiles[2]
        nFrags = str(job.groups[gid]['window_size'])
        ## smoothed file before normalisation
        fname = "segToFrag_"+job.groups[gid]['name']+"_smoothed_"+nFrags+"FragsPerWin.bedGraph.gz"
        gzipfile(ex,rawSmoothFile)
        ex.add(rawSmoothFile+".gz",
               description=set_file_descr(fname,groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))
        ## smoothed file after normalisation, before Profile correction
        fname = "segToFrag_"+job.groups[gid]['name']+"_norm_smoothed_"+nFrags+"FragsPerWin.bedGraph.gz"
        gzipfile(ex,smoothFile)
        ex.add(smoothFile+".gz",
               description=set_file_descr(fname,groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))
        ## smoothed file after normalisation, after Profile correction
        fname = "segToFrag_"+job.groups[gid]['name']+"_profileCorrected_smoothed_"+nFrags+"FragsPerWin.bedGraph.gz"
        gzipfile(ex,afterProfileCorrection)
        ex.add(afterProfileCorrection+".gz",
               description=set_file_descr(fname,groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))

    step = "domainograms"
    for gid, resfiles in processed['4cseq']['domainogram_grp'].iteritems():
        tarFile = resfiles.pop()
        fname = job.groups[gid]['name']+"_domainogram.tar.gz"
        ex.add(tarFile, description=set_file_descr(fname,
                                                   groupId=gid,step=step,type="tgz"))
        for s in resfiles:
            if s[-8:] == "bedGraph":
                gzipfile(ex,s)
                s += ".gz"
                ex.add( s, description=set_file_descr( s, groupId=gid,step=step,type="bedGraph",ucsc="1",gdv="1"))

    step = "combined_results"
    gzipfile(ex,tableSmoothedRaw_grp)
    ex.add(tableSmoothedRaw_grp+".gz", description=set_file_descr("table_segToFrags_smoothed_combined_replicates.txt.gz",step=step,type="txt"))

    gzipfile(ex,tableSmoothed_grp)
    ex.add(tableSmoothed_grp+".gz", description=set_file_descr("table_normalised_smoothed_combined_replicates.txt.gz",step=step,type="txt"))

    gzipfile(ex,tableSmoothedPC_grp)
    ex.add(tableSmoothedPC_grp+".gz", description=set_file_descr("table_profileCorrected_smoothed_combined_replicates.txt.gz",step=step,type="txt"))

    gzipfile(ex,tablePC)
    ex.add(tablePC+".gz", description=set_file_descr("table_normalised_fit_per_replicates.txt.gz",step=step,type="txt"))

    gzipfile(ex,tableBRICKS2Frags)
    ex.add(tableBRICKS2Frags+".gz", description=set_file_descr("table_frags_in_BRICKS_combined_replicates.txt.gz",step=step,type="txt"))

    return processed
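
When no regToExclude is configured, the workflow above excludes the bait midpoint ±5 kb; a small standalone restatement of that computation (the bait coordinate format 'chr:start-end' is the one used by primers_dict):

def default_reg_to_exclude(baitcoord, flank=5000):
    """Return the 'chr:start-end' region spanning +/- `flank` bp around the
    midpoint of a bait coordinate such as 'chr2:1000000-1002000'."""
    chrom, span = baitcoord.split(':')
    start, end = (int(x) for x in span.split('-'))
    mid = int(0.5 * (start + end))
    return "%s:%i-%i" % (chrom, mid - flank, mid + flank)

# Example: bait 'chr2:1000000-1002000' -> midpoint 1001000 -> 'chr2:996000-1006000'.
assert default_reg_to_exclude('chr2:1000000-1002000') == 'chr2:996000-1006000'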
Example #28
0
File: snp.py Project: MolbioUnige/bbcflib
def snp_workflow(ex,
                 job,
                 assembly,
                 minsnp=40.,
                 mincov=5,
                 path_to_ref=None,
                 via='local',
                 logfile=sys.stdout,
                 debugfile=sys.stderr):
    """Main function of the workflow"""
    ref_genome = assembly.fasta_by_chrom
    sample_names = [
        job.groups[gid]['name'] for gid in sorted(job.files.keys())
    ]

    logfile.write("\n* Generate vcfs for each chrom/group\n")
    logfile.flush()
    vcfs = dict((chrom, {}) for chrom in ref_genome.keys())  # {chr: {}}
    bams = {}
    # Launch the jobs
    bam = Samfile(job.files.values()[0].values()[0]['bam'])
    header = bam.header
    headerfile = unique_filename_in()
    for h in header["SQ"]:
        if h["SN"] in assembly.chrmeta:
            h["SN"] = assembly.chrmeta[h["SN"]]["ac"]
    head = Samfile(headerfile, "wh", header=header)
    head.close()
    for gid in job.files.keys():
        # Merge all bams belonging to the same group
        runs = [r['bam'] for r in job.files[gid].itervalues()]
        if len(runs) > 1:
            _b = merge_bam(ex, runs)
            index_bam(ex, _b)
            bams[gid] = _b
        else:
            index_bam(ex, runs[0])
            bams[gid] = runs[0]
        # Samtools mpileup + bcftools + vcfutils.pl
        for chrom, ref in ref_genome.iteritems():
            vcf = unique_filename_in()
            vcfs[chrom][gid] = (vcf,
                                pileup.nonblocking(ex,
                                                   bams[gid],
                                                   ref,
                                                   header=headerfile,
                                                   via=via,
                                                   stdout=vcf))
        logfile.write("  ...Group %s running.\n" % job.groups[gid]['name'])
        logfile.flush()
    # Wait for vcfs to finish and store them in *vcfs[chrom][gid]*
    for gid in job.files.keys():
        for chrom, ref in ref_genome.iteritems():
            vcfs[chrom][gid][1].wait()
            vcfs[chrom][gid] = vcfs[chrom][gid][0]
        logfile.write("  ...Group %s done.\n" % job.groups[gid]['name'])
        logfile.flush()
    # Targz the pileup files (vcf)
    tarname = unique_filename_in()
    tarfh = tarfile.open(tarname, "w:gz")
    for chrom, v in vcfs.iteritems():
        for gid, vcf in v.iteritems():
            tarfh.add(vcf,
                      arcname="%s_%s.vcf" % (job.groups[gid]['name'], chrom))
    tarfh.close()
    ex.add(tarname,
           description=set_file_descr("vcf_files.tar.gz",
                                      step="pileup",
                                      type="tar",
                                      view='admin'))

    logfile.write("\n* Merge info from vcf files\n")
    logfile.flush()
    outall = unique_filename_in()
    outexons = unique_filename_in()
    with open(outall, "w") as fout:
        fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+ \
                                 ['gene','location_type','distance'])+'\n')
    with open(outexons, "w") as fout:
        fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+['exon','strand','ref_aa'] \
                                  + ['new_aa_'+s for s in sample_names])+'\n')
    msa_table = dict((s, '') for s in [assembly.name] + sample_names)
    for chrom, v in vcfs.iteritems():
        logfile.write("  > Chromosome '%s'\n" % chrom)
        logfile.flush()
        # Put together info from all vcf files
        logfile.write("  - All SNPs\n")
        logfile.flush()
        allsnps = all_snps(ex, chrom, vcfs[chrom], bams, outall,
                           assembly, headerfile, sample_names, mincov,
                           float(minsnp), logfile, debugfile, via)
        # Annotate SNPs and check synonymy
        logfile.write("  - Exonic SNPs\n")
        logfile.flush()
        exon_snps(chrom, outexons, allsnps, assembly, sample_names, ref_genome,
                  logfile, debugfile)
        for snprow in allsnps:
            for n, k in enumerate([assembly.name] + sample_names):
                base = snprow[3 + n][0]
                if base == "-": base = snprow[3][0]
                if base not in 'ACGTacgt': base = "N"
                msa_table[k] += base
    description = set_file_descr("allSNP.txt", step="SNPs", type="txt")
    ex.add(outall, description=description)
    description = set_file_descr("exonsSNP.txt", step="SNPs", type="txt")
    ex.add(outexons, description=description)
    msafile = unique_filename_in()
    with open(msafile, "w") as msa:
        msa.write(" %i %i\n" % (len(msa_table), len(msa_table.values()[0])))
        for name, seq in msa_table.iteritems():
            msa.write("%s\t%s\n" % (name, seq))
    msa_table = {}
    description = set_file_descr("SNPalignment.txt", step="SNPs", type="txt")
    ex.add(msafile, description=description)
    # Create UCSC bed tracks
    logfile.write("\n* Create tracks\n")
    logfile.flush()
    create_tracks(ex, outall, sample_names, assembly)
    # Create quantitative tracks
    logfile.write("\n* Create heteroz. and quality tracks\n")
    logfile.flush()

    def _process_pileup(pileups, seq, startpos, endpos):
        atoi = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
        vectors = ([], [], [])
        for pileupcolumn in pileups:
            position = pileupcolumn.pos
            if position < startpos: continue
            if position >= endpos: break
            coverage = pileupcolumn.n
            ref_symbol = seq[position - startpos]
            ref = atoi.get(ref_symbol, 4)
            symbols = [0, 0, 0, 0, 0]
            quality = 0
            for pileupread in pileupcolumn.pileups:
                if pileupread.qpos >= len(pileupread.alignment.seq):
                    coverage -= 1
                else:
                    symbols[atoi.get(pileupread.alignment.seq[pileupread.qpos],
                                     4)] += 1
                    quality += ord(
                        pileupread.alignment.qual[pileupread.qpos]) - 33
            quality = float(quality) / coverage
            info = heterozygosity(ref, symbols[0:4])
            if coverage > 0:
                vectors[0].append((position, position + 1, coverage))
            if info > 0: vectors[1].append((position, position + 1, info))
            if quality > 0:
                vectors[2].append((position, position + 1, quality))
#            yield (position, position+1, coverage, info, quality)
        return vectors

    if job.options.get('make_bigwigs', False):
        _descr = {
            'groupId': 0,
            'step': "tracks",
            'type': "bigWig",
            'ucsc': '1'
        }
        for gid, bamfile in bams.iteritems():
            _descr['groupId'] = gid
            bamtr = track(bamfile, format="bam")
            covname = unique_filename_in() + ".bw"
            out_cov = track(covname, chrmeta=assembly.chrmeta)
            hetname = unique_filename_in() + ".bw"
            out_het = track(hetname, chrmeta=assembly.chrmeta)
            qualname = unique_filename_in() + ".bw"
            out_qual = track(qualname, chrmeta=assembly.chrmeta)
            for chrom, cinfo in assembly.chrmeta.iteritems():
                fasta = Fastafile(ref_genome[chrom])
                #process fasta and bam by 10Mb chunks
                for chunk in range(0, cinfo["length"], 10**7):
                    fastaseq = fasta.fetch(cinfo['ac'], chunk, chunk + 10**7)
                    vecs = _process_pileup(
                        bamtr.pileup(chrom, chunk, chunk + 10**7), fastaseq,
                        chunk, chunk + 10**7)
                    out_cov.write(vecs[0],
                                  fields=['start', 'end', 'score'],
                                  chrom=chrom)
                    out_het.write(vecs[1],
                                  fields=['start', 'end', 'score'],
                                  chrom=chrom)
                    out_qual.write(vecs[2],
                                   fields=['start', 'end', 'score'],
                                   chrom=chrom)
            out_cov.close()
            out_het.close()
            out_qual.close()
            description = set_file_descr(
                job.groups[gid]['name'] + "_coverage.bw", **_descr)
            ex.add(covname, description=description)
            description = set_file_descr(
                job.groups[gid]['name'] + "_heterozygosity.bw", **_descr)
            ex.add(hetname, description=description)
            description = set_file_descr(
                job.groups[gid]['name'] + "_quality.bw", **_descr)
            ex.add(qualname, description=description)

    return 0
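
The per-sample base written into the SNP alignment table is normalised as in the loop above ('-' falls back to the reference base, anything outside ACGT becomes 'N'); a minimal restatement, with invented column values for illustration:

def msa_base(sample_field, reference_field):
    """First character of a sample column, with '-' replaced by the reference
    base and non-ACGT symbols mapped to 'N'."""
    base = sample_field[0]
    if base == "-":
        base = reference_field[0]
    if base not in 'ACGTacgt':
        base = "N"
    return base

assert msa_base("A (60%)", "A") == "A"   # called base kept
assert msa_base("-", "G") == "G"         # no call: use the reference base
assert msa_base("*", "T") == "N"         # anything else becomes N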
Example #29
0
def chipseq_workflow(ex,
                     job_or_dict,
                     assembly,
                     script_path='',
                     logfile=sys.stdout,
                     via='lsf'):
    """Runs a chipseq workflow over bam files obtained by mapseq. Will optionally run ``macs`` and 'run_deconv'.

    :param ex: a 'bein' execution environment to run jobs in,

    :param job_or_dict: a 'Frontend' 'job' object, or a dictionary with key 'groups', 'files' and 'options' if applicable,

    :param assembly: a genrep.Assembly object,

    :param script_path: only needed if 'run_deconv' is in the job options, must point to the location of the R scripts.

    Default ``macs`` parameters (overridden by ``job_or_dict['options']['macs_args']``) are set as follows:

    * ``'--bw'``: 200 ('bandwidth')

    * ``'-m'``: 10,100 ('minimum and maximum enrichments relative to background or control')

    The enrichment bounds will be computed from a Poisson threshold *T*, if available, as *(min(30,5*(T+1)),50*(T+1))*.

    Returns a tuple of: a dictionary with keys *group_id* from the job groups (plus *macs* and *deconv* if applicable) and file description dictionaries as values, and a dictionary mapping *group_ids* to the *names* used in file descriptions.
"""
    options = {}
    if logfile is None: logfile = sys.stdout
    if isinstance(job_or_dict, frontend.Job):
        options = job_or_dict.options
        groups = job_or_dict.groups
        mapseq_files = job_or_dict.files
    elif isinstance(job_or_dict, dict) and 'groups' in job_or_dict:
        if 'options' in job_or_dict:
            options = job_or_dict['options']
        groups = job_or_dict['groups']
        for gid in groups.keys():
            if not ('name' in groups[gid]):
                groups[gid]['name'] = gid
        mapseq_files = job_or_dict.get('files', {})
    else:
        raise TypeError(
            "job_or_dict must be a frontend.Job object or a dictionary with key 'groups'."
        )
    merge_strands = int(options.get('merge_strands', -1))
    suffixes = ["fwd", "rev"]
    peak_deconvolution = options.get('peak_deconvolution', False)
    if isinstance(peak_deconvolution, basestring):
        peak_deconvolution = peak_deconvolution.lower() in ['1', 'true', 't']
    run_meme = options.get('run_meme', False)
    if isinstance(run_meme, basestring):
        run_meme = run_meme.lower() in ['1', 'true', 't']
    macs_args = options.get('macs_args', ["--bw", "200"])
    b2w_args = options.get('b2w_args', [])
    if not (isinstance(mapseq_files, dict)):
        raise TypeError("Mapseq_files must be a dictionary.")
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    read_length = []
    p_thresh = {}
    for gid, mapped in mapseq_files.iteritems():
        group_name = groups[gid]['name']
        if not (isinstance(mapped, dict)):
            raise TypeError(
                "Mapseq_files values must be dictionaries with keys *run_ids* or 'bam'."
            )
        if 'bam' in mapped:
            mapped = {'_': mapped}
        futures = {}
        ptruns = []
        for k in mapped.keys():
            if not 'libname' in mapped[k]:
                mapped[k]['libname'] = group_name + "_" + str(k)
            if not 'stats' in mapped[k]:
                futures[k] = mapseq.bamstats.nonblocking(ex,
                                                         mapped[k]["bam"],
                                                         via=via)
            if mapped[k].get('poisson_threshold', -1) > 0:
                ptruns.append(mapped[k]['poisson_threshold'])
        if len(ptruns) > 0:
            # key on the same (gid, name) tuple used in names['tests']/['controls']
            p_thresh[(gid, group_name)] = sum(ptruns) / len(ptruns)
        for k, f in futures.iteritems():
            mapped[k]['stats'] = f.wait()
        if len(mapped) > 1:
            bamfile = mapseq.merge_bam(ex, [m['bam'] for m in mapped.values()])
        else:
            bamfile = mapped.values()[0]['bam']
        if groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid, group_name))
        else:
            tests.append(bamfile)
            names['tests'].append((gid, group_name))
            read_length.append(mapped.values()[0]['stats']['read_length'])
    genome_size = mapped.values()[0]['stats']['genome_size']
    if len(controls) < 1:
        controls = [None]
        names['controls'] = [(0, None)]
    logfile.write("Starting MACS.\n")
    logfile.flush()
    processed = {
        'macs':
        add_macs_results(ex,
                         read_length,
                         genome_size,
                         tests,
                         ctrlbam=controls,
                         name=names,
                         poisson_threshold=p_thresh,
                         macs_args=macs_args,
                         via=via)
    }
    logfile.write("Done MACS.\n")
    logfile.flush()
    peak_list = {}
    chrlist = assembly.chrmeta
    ## select only peaks with p-value <= 10**-0.6 (~0.25), i.e. MACS score = -10*log10(p) >= 6
    _select = {'score': (6, sys.maxint)}
    _fields = ['chr', 'start', 'end', 'name', 'score']
    for i, name in enumerate(names['tests']):
        if len(names['controls']) < 2:
            ctrl = (name, names['controls'][0])
            macsbed = track(processed['macs'][ctrl] + "_summits.bed",
                            chrmeta=chrlist,
                            fields=_fields).read(selection=_select)
        else:
            macsbed = concatenate([
                apply(track(processed['macs'][(name, x)] + "_summits.bed",
                            chrmeta=chrlist,
                            fields=_fields).read(selection=_select),
                      'name',
                      lambda __n, _n=xn: "%s:%i" % (__n, _n))
                for xn, x in enumerate(names['controls'])
            ])
        ##############################
        macs_neighb = neighborhood(macsbed, before_start=150, after_end=150)
        peak_list[name] = unique_filename_in() + ".sql"
        macs_final = track(peak_list[name],
                           chrmeta=chrlist,
                           info={'datatype': 'qualitative'},
                           fields=['start', 'end', 'name', 'score'])
        macs_final.write(fusion(macs_neighb), clip=True)
        macs_final.close()
        ##############################

    merged_wig = {}
    options['read_extension'] = int(
        options.get('read_extension') or read_length[0])
    if options['read_extension'] < 1:
        options['read_extension'] = read_length[0]
    make_wigs = merge_strands >= 0 or options['read_extension'] > 100
    if options['read_extension'] > 100: options['read_extension'] = 50
    for gid, mapped in mapseq_files.iteritems():
        #            if groups[gid]['control']: continue
        group_name = groups[gid]['name']
        wig = []
        for m in mapped.values():
            if make_wigs or not ('wig' in m) or len(m['wig']) < 2:
                output = mapseq.parallel_density_sql(
                    ex,
                    m["bam"],
                    assembly.chrmeta,
                    nreads=m["stats"]["total"],
                    merge=-1,
                    read_extension=options['read_extension'],
                    convert=False,
                    b2w_args=b2w_args,
                    via=via)
                wig.append(dict((s, output + s + '.sql') for s in suffixes))
            else:
                wig.append(m['wig'])
        if len(wig) > 1:
            merged_wig[group_name] = dict(
                (s, merge_sql(ex, [x[s] for x in wig], via=via))
                for s in suffixes)
        else:
            merged_wig[group_name] = wig[0]

    if peak_deconvolution:
        ##############################
        def _filter_deconv(stream, pval):
            ferr = re.compile(r';FERR=([\d\.]+)$')
            return FeatureStream(
                ((x[0], ) + ((x[2] + x[1]) / 2 - 150,
                             (x[2] + x[1]) / 2 + 150) + x[3:]
                 for x in stream if "FERR=" in x[3]
                 and float(ferr.search(x[3]).groups()[0]) <= pval),
                fields=stream.fields)

        ##############################
        processed['deconv'] = {}
        for name in names['tests']:
            logfile.write(name[1] + " deconvolution.\n")
            logfile.flush()
            if len(names['controls']) < 2:
                ctrl = (name, names['controls'][0])
                macsbed = processed['macs'][ctrl] + "_peaks.bed"
            else:
                macsbed = intersect_many_bed(ex, [
                    processed['macs'][(name, x)] + "_peaks.bed"
                    for x in names['controls']
                ],
                                             via=via)
            deconv = run_deconv(ex,
                                merged_wig[name[1]],
                                macsbed,
                                assembly.chrmeta,
                                options['read_extension'],
                                script_path,
                                via=via)
            peak_list[name] = unique_filename_in() + ".bed"
            trbed = track(deconv['peaks']).read()
            with track(peak_list[name], chrmeta=chrlist,
                       fields=trbed.fields) as bedfile:
                bedfile.write(fusion(_filter_deconv(trbed, 0.65)))
            ex.add(deconv['peaks'],
                   description=set_file_descr(name[1] + '_peaks.sql',
                                              type='sql',
                                              step='deconvolution',
                                              groupId=name[0]))
            ex.add(deconv['profile'],
                   description=set_file_descr(name[1] + '_deconv.sql',
                                              type='sql',
                                              step='deconvolution',
                                              groupId=name[0]))
            bigwig = unique_filename_in()
            try:
                convert(deconv['profile'], (bigwig, "bigWig"))
                ex.add(bigwig,
                       description=set_file_descr(name[1] + '_deconv.bw',
                                                  type='bigWig',
                                                  ucsc='1',
                                                  step='deconvolution',
                                                  groupId=name[0]))
            except OSError as e:
                logfile.write(str(e))
                logfile.flush()
            ex.add(deconv['pdf'],
                   description=set_file_descr(name[1] + '_deconv.pdf',
                                              type='pdf',
                                              step='deconvolution',
                                              groupId=name[0]))
            processed['deconv'][name] = deconv

    ##############################
    def _join_macs(stream, xlsl, _f):
        def _macs_row(_s):
            for _p in _s:
                for _n in _p[3].split("|"):
                    if len(xlsl) == 1:
                        nb = int(
                            _n.split(";")[0][13:]) if _n[:3] == "ID=" else int(
                                _n[10:])
                        yield _p + xlsl[0][nb - 1][1:]
                    else:
                        nb = _n.split(
                            ";")[0][13:] if _n[:3] == "ID=" else _n[10:]
                        nb = nb.split(":")
                        yield _p + xlsl[int(nb[1])][int(nb[0]) - 1][1:]

        return FeatureStream(_macs_row(stream), fields=_f)

    ##############################
    peakfile_list = []
    for name, plist in peak_list.iteritems():
        ptrack = track(plist,
                       chrmeta=chrlist,
                       fields=["chr", "start", "end", "name", "score"])
        peakfile = unique_filename_in()
        xlsh, xlsl = parse_MACS_xls([
            processed['macs'][(name, _c)] + "_peaks.xls"
            for _c in names['controls']
        ])
        try:
            ###### if assembly doesn't have annotations, we skip the "getNearestFeature" but still go through "_join_macs"
            assembly.gene_track()
            _fields = ['chr','start','end','name','score','gene','location_type','distance']\
                +["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            peakout = track(peakfile,
                            format='txt',
                            chrmeta=chrlist,
                            fields=_fields)
            peakout.make_header("#" + "\t".join([
                'chromosome', 'start', 'end', 'info', 'peak_height', 'gene(s)',
                'location_type', 'distance'
            ] + _fields[8:]))
            for chrom in assembly.chrnames:
                _feat = assembly.gene_track(chrom)
                peakout.write(_join_macs(
                    getNearestFeature(ptrack.read(selection=chrom), _feat),
                    xlsl, _fields),
                              mode='append')
        except ValueError:
            _fields = ['chr', 'start', 'end', 'name', 'score'
                       ] + ["MACS_%s" % h for h in xlsh[1:5]] + xlsh[5:]
            peakout = track(peakfile,
                            format='txt',
                            chrmeta=chrlist,
                            fields=_fields)
            peakout.make_header("#" + "\t".join(
                ['chromosome', 'start', 'end', 'info', 'peak_height'] +
                _fields[5:]))
            for chrom in assembly.chrnames:
                peakout.write(_join_macs(ptrack.read(selection=chrom), xlsl,
                                         _fields),
                              mode='append')
        peakout.close()
        gzipfile(ex, peakfile)
        peakfile_list.append(
            track(peakfile + ".gz", format='txt', fields=_fields))
        ex.add(peakfile + ".gz",
               description=set_file_descr(name[1] + '_annotated_peaks.txt.gz',
                                          type='text',
                                          step='annotation',
                                          groupId=name[0]))
    stracks = [
        track(wig, info={'name': name + "_" + st})
        for name, wigdict in merged_wig.iteritems()
        for st, wig in wigdict.iteritems()
    ]
    tablefile = unique_filename_in()
    with open(tablefile, "w") as _tf:
        _pnames = [
            "MACS_%s_vs_%s" % (_s[1], _c[1]) if _c[1] else "MACS_%s" % _s[1]
            for _s in names['tests'] for _c in names['controls']
        ]
        _tf.write("\t".join([
            '#chromosome',
            'start',
            'end',
        ] + _pnames + [s.name for s in stracks]) + "\n")
#### need to do something about peak origin (split names, write to separate columns?)
    for chrom in assembly.chrnames:
        pk_lst = [
            apply(pt.read(chrom, fields=['chr', 'start', 'end', 'name']),
                  'name',
                  lambda __n, _n=npt: "%s:%i" % (__n, _n))
            for npt, pt in enumerate(peakfile_list)
        ]
        features = fusion(
            concatenate(pk_lst,
                        fields=['chr', 'start', 'end', 'name'],
                        remove_duplicates=True,
                        group_by=['chr', 'start', 'end']))
        sread = [sig.read(chrom) for sig in stracks]
        quantifs = score_by_feature(sread, features, method='sum')
        nidx = quantifs.fields.index('name')
        _ns = len(tests)
        _nc = len(controls)
        with open(tablefile, "a") as _tf:
            for row in quantifs:
                pcols = [''] * _ns * _nc
                _rnsplit = row[nidx].split(":")
                _n1 = _rnsplit[0]
                _k = 0
                while (_k < len(_rnsplit) - 1 - int(_nc > 1)):
                    if _nc > 1:
                        _k += 2
                        _n2 = _rnsplit[_k - 1]
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0]) * _nc + int(_n2)] = _n1
                    else:
                        _k += 1
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])] = _n1
                    _n1 = "|".join(_n[1:])
                _tf.write("\t".join(
                    str(tt)
                    for tt in row[:nidx] + tuple(pcols) + row[nidx + 1:]) +
                          "\n")
    gzipfile(ex, tablefile)
    ex.add(tablefile + ".gz",
           description=set_file_descr('Combined_peak_quantifications.txt.gz',
                                      type='text',
                                      step='summary'))

    if run_meme:
        from bbcflib.motif import parallel_meme
        logfile.write("Starting MEME.\n")
        logfile.flush()
        processed['meme'] = parallel_meme(
            ex,
            assembly,
            peak_list.values(),
            name=peak_list.keys(),
            chip=True,
            meme_args=['-meme-nmotifs', '4', '-meme-mod', 'zoops'],
            via=via)
    return processed
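# Note: the `_select` filter above relies on MACS encoding peak significance as
# score = -10*log10(p-value); keeping score >= 6 therefore keeps peaks with
# p-value <= 10**-0.6 (about 0.25). A standalone restatement of that relation
# (the helper name is ours, not part of bbcflib):
import math

def macs_score_cutoff(max_pvalue):
    """Return the minimum MACS score corresponding to a p-value cutoff."""
    return -10 * math.log10(max_pvalue)

# macs_score_cutoff(10 ** -0.6) == 6.0, matching _select = {'score': (6, sys.maxint)}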
Example #30
File: microbiome.py Project: bbcf/bbcflib
def microbiome_workflow(ex, job, assembly, logfile=sys.stdout, via="lsf"):
    """
    Main:

      * 0. retrieve bam files from mapseq job
      *   0.a. merge bam files (=> 1 bam file per group)
      * 1. for each group:
      *   1.a get counts per group (=> 1 file per group)
      *   1.b get counts per Level (Kingdom, Phylum, Class, Order, Family, Genus and Species) (=> 1 file per level / per group)
      * 2. combine counts
      *   2.a combine counts for all groups (=> 1 combined file)
      *   2.b combine counts per level for all groups (=> 1 combined file per Level)
      * 3. generate barplots (=> 1 plot per group + per level + per combined files)

    """
    ### params
    levels = ["Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
    infosCols = {
        "Kingdom": [0, [1, 2]],
        "Phylum": [[0, 1], [2, 3]],
        "Class": [[0, 1, 2], [3, 4]],
        "Order": [[0, 1, 2, 3], [4, 5]],
        "Family": [[0, 1, 2, 3, 4], [5, 6]],
        "Genus": [[0, 1, 2, 3, 4, 5], [6, 7]],
        "Species": [[0, 1, 2, 3, 4, 5, 6], [7, 8]],
    }
    ### outputs
    processed = {"cnts": {}, "cnts_level": {}, "plots": {}}

    ### do it
    mapseq_files = job.files

    # 1.a get counts per group (=> 1 file per group)
    futures = {}
    for gid, group in job.groups.iteritems():
        group_name = group["name"]
        bamfiles = [m["bam"] for m in mapseq_files[gid].values()]
        futures[gid] = run_microbiome.nonblocking(
            ex, ["bam_to_annot_counts", bamfiles, assembly.annotations_path, group_name], via=via, memory=8
        )

    # 1.b get counts per Level (Kingdom, Phylum, Class, Order, Family, Genus and Species) (=> 1 file per level / per group)
    step = "counts"
    for gid, future in futures.iteritems():
        res = future.wait()
        processed["cnts"][gid] = res  # group_name + "_counts_annot.txt"
        fname = job.groups[gid]["name"] + "_counts_annot.txt"
        ex.add(res, description=set_file_descr(fname, groupId=gid, step=step, type="txt"))
        processed["cnts_level"][gid] = [
            run_microbiome.nonblocking(ex, ["getCountsPerLevel", res, level], via=via, memory=8) for level in levels
        ]

    # 2.a combine counts for all groups (=> 1 combined file)
    files = processed["cnts"].values()
    combined_out = [run_microbiome.nonblocking(ex, ["combine_counts", files, 0, [1, 2]], via=via, memory=8)]

    # 2.b combine counts per level for all groups (=> 1 combined file per Level)
    for n, level in enumerate(levels):
        files = dict([(gid, f[n].wait()) for gid, f in processed["cnts_level"].iteritems()])
        combined_out.append(
            run_microbiome.nonblocking(
                ex, ["combine_counts", files.values()] + infosCols.get(level, [0, [1, 2]]), via=via, memory=8
            )
        )
        for gid, f in files.iteritems():
            fname = job.groups[gid]["name"] + "_counts_annot_" + level + ".txt"
            ex.add(f, description=set_file_descr(fname, groupId=gid, step=step, type="txt"))

    step = "combined"
    ex.add(combined_out[0].wait(), description=set_file_descr("combined_counts.txt", step=step, type="txt"))
    for nl, level in enumerate(levels):
        ex.add(
            combined_out[nl + 1].wait(),
            description=set_file_descr("combined_counts" + level + ".txt", step=step, type="txt"),
        )
    return 0
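# A minimal sketch of the merge that "combine_counts" is assumed to perform on
# the per-group count files: rows are keyed on the taxonomy columns listed in
# infosCols (e.g. [0, 1] for the Phylum level) and the per-group counts are
# collected side by side. The real implementation lives in the external
# run_microbiome scripts and may differ.
def combine_counts_sketch(files, id_cols, count_col):
    if isinstance(id_cols, int):
        id_cols = [id_cols]
    combined = {}
    for n, path in enumerate(files):
        with open(path) as f:
            f.readline()                        # skip header
            for line in f:
                fields = line.rstrip('\n').split('\t')
                key = tuple(fields[i] for i in id_cols)
                row = combined.setdefault(key, [0] * len(files))
                row[n] = fields[count_col]
    return combined

# e.g. for the Phylum level: combine_counts_sketch(files.values(), [0, 1], 2)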
Example #31
    def count_reads(self, bamfiles, gtf):
        self.write_log("* Counting reads")

        # Count reads on genes, transcripts with "rnacounter"
        ncond = len(self.conditions)
        tablenames = [None] * ncond
        futures = [None] * ncond
        max_rlen = 0
        counter_options = ["--nh"]
        for bam in bamfiles:
            sam = pysam.Samfile(bam, 'rb')
            max_rlen = max(max_rlen, sam.next().rlen)
        counter_options += ["--exon_cutoff", str(max_rlen)]
        bwt_args = self.job.options.get('map_args', {}).get('bwt_args', [])
        #        if not "--local" in bwt_args:
        #            counter_options += ["--nh"]
        if hasattr(self.assembly, "fasta_origin") or self.assembly.intype == 2:
            counter_options += ["--type", "transcripts", "--method", "raw"]
        else:
            counter_options += [
                "--type", "genes,transcripts", "--method", "raw,nnls"
            ]
        if self.stranded:
            counter_options += ["--stranded"]
        for i, c in enumerate(self.conditions):
            tablenames[i] = unique_filename_in()
            futures[i] = rnacounter.nonblocking(self.ex,
                                                bamfiles[i],
                                                gtf,
                                                stdout=tablenames[i],
                                                via=self.via,
                                                options=counter_options)

        # Put samples together
        for i, c in enumerate(self.conditions):
            try:
                futures[i].wait()
            except Exception as err:
                self.write_debug("Counting failed: %s." % str(err))
                raise err
            if futures[i] is None:
                self.write_debug("Counting failed.")
                raise ValueError("Counting failed.")
        if len(tablenames) > 1:
            joined = unique_filename_in()
            rnacounter_join.nonblocking(self.ex,
                                        tablenames,
                                        stdout=joined,
                                        via=self.via).wait()
        else:
            joined = tablenames[0]

        # Split genes and transcripts into separate files
        genes_filename = unique_filename_in()
        trans_filename = unique_filename_in()
        genes_file = open(genes_filename, "wb")
        trans_file = open(trans_filename, "wb")
        if self.stranded:
            genes_anti_filename = unique_filename_in()
            trans_anti_filename = unique_filename_in()
            genes_anti_file = open(genes_anti_filename, "wb")
            trans_anti_file = open(trans_anti_filename, "wb")
        with open(joined) as jfile:
            header = jfile.readline()
            hconds = ["counts." + c for c in self.conditions
                      ] + ["rpkm." + c for c in self.conditions]
            hinfo = header.strip().split('\t')[2 * ncond + 1:]
            header = '\t'.join(["ID"] + hconds + hinfo) + '\n'
            genes_file.write(header)
            trans_file.write(header)
            type_idx = header.split('\t').index("Type")
            if self.stranded:
                genes_anti_file.write(header)
                trans_anti_file.write(header)
                sense_idx = header.split('\t').index("Sense")
                for line in jfile:
                    L = line.split('\t')
                    ftype = L[type_idx].lower()
                    sense = L[sense_idx].lower()
                    if ftype == 'gene':
                        if sense == 'antisense':
                            genes_anti_file.write(line)
                        else:
                            genes_file.write(line)
                    elif ftype == 'transcript':
                        if sense == 'antisense':
                            trans_anti_file.write(line)
                        else:
                            trans_file.write(line)
            else:
                for line in jfile:
                    L = line.split('\t')
                    ftype = L[type_idx].lower()
                    if ftype == 'gene':
                        genes_file.write(line)
                    elif ftype == 'transcript':
                        trans_file.write(line)
        genes_file.close()
        trans_file.close()

        # Keep intermediate tables
        for i, c in enumerate(self.conditions):
            #shutil.copy(tablenames[i], "../counts%d.txt"%i)
            descr = set_file_descr(self.conditions[i] + '_' + tablenames[i] +
                                   '.gz',
                                   type='txt',
                                   step='pileup',
                                   view='admin')
            gzipfile(self.ex, tablenames[i])
            self.ex.add(tablenames[i] + '.gz', description=descr)

        if self.stranded:
            count_files = {
                'genes': genes_filename,
                'transcripts': trans_filename,
                'genes_anti': genes_anti_filename,
                'transcripts_anti': trans_anti_filename
            }
        else:
            count_files = {
                'genes': genes_filename,
                'transcripts': trans_filename
            }
        return count_files
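# The parsing above assumes the joined rnacounter table starts with an ID
# column, then one raw-count and one RPKM column per condition (2*ncond + 1
# leading columns), followed by annotation columns that include "Type" (and
# "Sense" when --stranded is used). A hypothetical two-condition header; the
# condition names and the trailing annotation columns are illustrative only:
conditions = ["KO.1", "WT.1"]
header = "\t".join(["ID"]
                   + ["counts." + c for c in conditions]
                   + ["rpkm." + c for c in conditions]
                   + ["Start", "End", "GeneName", "Type", "Sense"])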
Example #32
    def find_junctions(self,
                       soapsplice_index=None,
                       path_to_soapsplice=None,
                       soapsplice_options={}):
        """
        Retrieves unmapped reads from a previous mapping and runs SOAPsplice on them.
        Returns the name of a .bed track indicating the junction positions, as well as
        that of a bam file of the alignments supporting the junctions.

        :param soapsplice_index: (str) path to the SOAPsplice index.
        :param path_to_soapsplice: (str) specify the path to the program if it is not in your $PATH.
        :param soapsplice_options: (dict) SOAPsplice options, e.g. {'-m':2}.
        :rtype: str, str
        """
        @program
        def soapsplice(unmapped_R1,
                       unmapped_R2,
                       index,
                       output=None,
                       path_to_soapsplice=None,
                       options={}):
            """Bind 'soapsplice'. Return a text file containing the list of junctions.

            :param unmapped_R1: (str) path to the fastq file containing the 'left' reads.
            :param unmapped_R2: (str) path to the fastq file containing the 'right' reads.
            :param index: (str) path to the SOAPsplice index.
            :param output: (str) output file name.
            :param path_to_soapsplice: (str) path to the SOAPsplice executable.
                If not specified, the program must be in your $PATH.
            :param options: (dict) SOAPsplice options, given as {opt: value}.
            :rtype: str

            Main options::

            -p: number of threads, <= 20. [1]
            -S: 1: forward strand, 2: reverse strand, 3: both. [3]
            -m: maximum mismatch for one-segment alignment, <= 5. [3]
            -g: maximum indel for one-segment alignment, <= 2. [2]
            -i: length of tail that can be ignored in one-segment alignment. [7]
            -t: longest gap between two segments in two-segment alignment. [500000]
            -a: shortest length of a segment in two-segment alignment. [8]
            -q: input quality type in FASTQ file (0: old Illumina, 1: Sanger). [0]
            -L: maximum distance between paired-end reads. [500000]
            -l: minimum distance between paired-end reads. [50]
            -I: insert length of paired-end reads.
            """
            if not output: output = unique_filename_in()
            path_to_soapsplice = path_to_soapsplice or 'soapsplice'
            args = [
                path_to_soapsplice, '-d', index, '-1', unmapped_R1, '-2',
                unmapped_R2, '-o', output, '-f', '2'
            ]
            opts = []
            for k, v in options.iteritems():
                opts.extend([str(k), str(v)])
            return {"arguments": args + opts, "return_value": output}

        if not program_exists('soapsplice'):
            self.write_debug("Skipped junctions search: soapsplice not found.")
            return
        self.assembly.set_index_path(intype=3)
        soapsplice_index = soapsplice_index or self.assembly.index_path
        soapsplice_options.update(
            self.job.options.get('soapsplice_options', {}))
        soapsplice_options.setdefault('-p', 16)  # number of threads
        soapsplice_options.setdefault('-q', 1)  # Sanger format
        unmapped_fastq = {}
        for gid, group in self.job.groups.iteritems():
            unmapped_fastq[gid] = []
            for rid, run in group['runs'].iteritems():
                unmapped = self.job.files[gid][rid].get('unmapped_fastq')
                if not unmapped:
                    self.write_log(
                        "No unmapped reads found for group %s, run %d. Skip." %
                        (gid, rid))
                    continue
                elif not isinstance(unmapped, tuple):
                    self.write_log("Pair-end reads required. Skip.")
                    continue
                unmapped_fastq[gid].append(unmapped)
            if len(unmapped_fastq[gid]) == 0:
                continue
            R1 = cat(zip(*unmapped_fastq[gid])[0])
            R2 = cat(zip(*unmapped_fastq[gid])[1])
            future = soapsplice.nonblocking(
                self.ex,
                R1,
                R2,
                soapsplice_index,
                path_to_soapsplice=path_to_soapsplice,
                options=soapsplice_options,
                via=self.via,
                memory=8,
                threads=soapsplice_options['-p'])
            try:
                template = future.wait()
            except Exception as err:
                self.write_debug("SOAPsplice failed: %s." % str(err))
                return
            if template is None:
                self.write_debug("SOAPsplice failed.")
                return
            junc_file = template + '.junc'
            bed = self.convert_junc_file(junc_file, self.assembly)
            bed_descr = set_file_descr('junctions_%s.bed' % group['name'],
                                       groupId=gid,
                                       type='bed',
                                       step='junctions',
                                       ucsc=1)
            bam_descr = set_file_descr('junctions_%s.bam' % group['name'],
                                       groupId=gid,
                                       type='bam',
                                       step='junctions',
                                       ucsc=0)
            sam = template + '.sam'
            try:
                bam = sam_to_bam(self.ex, sam, reheader=self.assembly.name)
                add_and_index_bam(self.ex, bam, description=bam_descr)
                self.ex.add(bam, description=bam_descr)
            except Exception as e:
                self.write_debug(
                    "%s\n(Qualities may be in the wrong format, try with '-q 0'.)"
                    % str(e))
            self.ex.add(bed, description=bed_descr)
        return bed, bam
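# Hypothetical call from within the workflow object, using only options
# documented in the docstring above: allow at most 2 mismatches in one-segment
# alignments and declare Sanger quality encoding. The SOAPsplice index is taken
# from the assembly when soapsplice_index is not given.
bed, bam = self.find_junctions(soapsplice_options={'-m': 2, '-q': 1})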
Example #33
def add_macs_results(ex,
                     read_length,
                     genome_size,
                     bamfile,
                     ctrlbam=None,
                     name=None,
                     poisson_threshold=None,
                     alias=None,
                     macs_args=None,
                     via='lsf'):
    """Calls the ``macs`` function on each possible pair
    of test and control bam files and adds
    the respective outputs to the execution repository.

    ``macs`` options can be controlled with `macs_args`.
    If a dictionary of Poisson thresholds for each sample is given, then the enrichment bounds ('-m' option)
    are computed from them; otherwise the default is '-m 10,100'.

    Returns the set of file prefixes.
    """
    if not (isinstance(bamfile, list)):
        bamfile = [bamfile]
    if not (isinstance(ctrlbam, list)):
        ctrlbam = [ctrlbam]
    if poisson_threshold is None:
        poisson_threshold = {}
    if macs_args is None:
        macs_args = []
    futures = {}
    rl = read_length
    for i, bam in enumerate(bamfile):
        n = name['tests'][i]
        if poisson_threshold.get(n) > 0:
            low = (poisson_threshold.get(n) + 1) * 5
            enrich_bounds = str(min(30, low)) + "," + str(10 * low)
        else:
            enrich_bounds = "10,100"
        if not ("-m" in macs_args): macs_args += ["-m", enrich_bounds]
        if isinstance(read_length, list): rl = read_length[i]
        for j, cam in enumerate(ctrlbam):
            m = name['controls'][j]
            nm = (n, m)
            futures[nm] = macs.nonblocking(ex,
                                           rl,
                                           genome_size,
                                           bam,
                                           cam,
                                           args=macs_args,
                                           via=via,
                                           memory=12)
    prefixes = {}
    for n, f in futures.iteritems():
        p = f.wait()
        prefixes[n] = p
        macs_descr0 = {
            'step': 'macs',
            'type': 'none',
            'view': 'admin',
            'groupId': n[0][0]
        }
        macs_descr1 = {'step': 'macs', 'type': 'xls', 'groupId': n[0][0]}
        macs_descr2 = {
            'step': 'macs',
            'type': 'bed',
            'groupId': n[0][0],
            'ucsc': '1'
        }
        filename = "_vs_".join([x[1] for x in n if x[0]])
        touch(ex, p)
        ex.add(p,
               description=set_file_descr(filename, **macs_descr0),
               alias=alias)
        ex.add(p + "_peaks.xls",
               description=set_file_descr(filename + "_peaks.xls",
                                          **macs_descr1),
               associate_to_filename=p,
               template='%s_peaks.xls')
        bedzip = gzip.open(p + "_peaks.bed.gz", 'wb')
        bedzip.write("track name='" + filename + "_macs_peaks'\n")
        with open(p + "_peaks.bed") as bedinf:
            for l in bedinf:
                bedzip.write(l)
        bedzip.close()
        ex.add(p + "_peaks.bed.gz",
               description=set_file_descr(filename + "_peaks.bed.gz",
                                          **macs_descr2),
               associate_to_filename=p,
               template='%s_peaks.bed.gz')
        bedzip = gzip.open(p + "_summits.bed.gz", 'wb')
        bedzip.write("track name='" + filename + "_macs_summits'\n")
        with open(p + "_summits.bed") as bedinf:
            for l in bedinf:
                bedzip.write(l)
        bedzip.close()
        ex.add(p + "_summits.bed.gz",
               description=set_file_descr(filename + "_summits.bed.gz",
                                          **macs_descr2),
               associate_to_filename=p,
               template='%s_summits.bed.gz')
        if n[1][0]:
            ex.add(p + "_negative_peaks.xls",
                   description=set_file_descr(filename + "_negative_peaks.xls",
                                              **macs_descr0),
                   associate_to_filename=p,
                   template='%s_negative_peaks.xls')
    return prefixes
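# Restating the '-m' bound computation used at the top of the loop above: with
# a per-sample Poisson threshold T the bounds are (min(30, 5*(T+1)), 50*(T+1)),
# otherwise MACS's default '10,100' is kept. The helper name is ours, not part
# of bbcflib:
def enrichment_bounds(poisson_threshold=None):
    if poisson_threshold is not None and poisson_threshold > 0:
        low = 5 * (poisson_threshold + 1)
        return "%i,%i" % (min(30, low), 10 * low)
    return "10,100"

# enrichment_bounds(3) -> '20,200'; enrichment_bounds() -> '10,100'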