Example #1
def save_wellington( ex, wellout, chrmeta ):
    bedlist = {}
    for name, wlist in wellout.iteritems():
        wellall = unique_filename_in()
        #### Dummy file
        touch( ex, wellall )
        ex.add(wellall,
               description=set_file_descr(name[1]+'_wellington_files', type='none', view='admin',
                                          step='footprints', groupId=name[0]))
        #### BED at FDR 1%
        bedlist[name[0]] = wellall+"FDR01.bed.gz"
        bedzip = gzip.open(bedlist[name[0]],'wb')
        bedzip.write("track name='"+name[1]+"_WellingtonFootprints_FDR_0.01'\n")
        for x in wlist:
            with open(os.path.join(*x)+".WellingtonFootprints.FDR.0.01.bed") as _bed:
                for line in _bed:
                    bedzip.write(line)
        bedzip.close()
        ex.add(wellall+"FDR01.bed.gz",
               description=set_file_descr(name[1]+'_WellingtonFootprintsFDR01.bed.gz',
                                          type='bed', ucsc='1', step='footprints', groupId=name[0]),
               associate_to_filename=wellall, template='%s_WellingtonFootprintsFDR01.bed.gz')
        #### BED at p-values [...]
        bedzip = gzip.open(wellall+"PvalCutoffs.bed.gz",'wb')
        for bfile in os.listdir(os.path.join(wlist[0][0],"p_value_cutoffs")):
            cut = os.path.splitext(bfile[:-4])[1][1:]  # cutoff value: the token between the last '.' and the trailing '.bed'
            bedzip.write("track name='"+name[1]+"_WellingtonFootprints_Pval_%s'\n" %cut)
            for wdir,wpref in wlist:
                _bedpath = os.path.join(wdir,"p_value_cutoffs",wpref+".WellingtonFootprints."+cut+".bed")
                with open(_bedpath) as _bed:
                    for line in _bed:
                        bedzip.write(line)
        bedzip.close()
        ex.add(wellall+"PvalCutoffs.bed.gz",
               description=set_file_descr(name[1]+'_WellingtonFootprintsPvalCutoffs.bed.gz',
                                          type='bed', ucsc='1', step='footprints', groupId=name[0]),
               associate_to_filename=wellall, template='%s_WellingtonFootprintsPvalCutoffs.bed.gz')
        #### WIG
        cat([os.path.join(*x)+".WellingtonFootprints.wig" for x in wlist], wellall+".wig")
        #convert(wellall+".wig", wellall+".bw", chrmeta=chrmeta)
        #ex.add(wellall+".bw",
        #       description=set_file_descr(name[1]+'_WellingtonFootprints.bw',
        #                                  type='bigWig', ucsc='1', step='footprints', groupId=name[0]),
        #       associate_to_filename=wellall, template='%s_WellingtonFootprints.bw')
        ex.add(wellall+".wig",
               description=set_file_descr(name[1]+'_WellingtonFootprints.wig',
                                          type='wig', ucsc='1', step='footprints', groupId=name[0]),
               associate_to_filename=wellall, template='%s_WellingtonFootprints.wig')
    return bedlist
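
The two BED blocks above repeat a single pattern: open a gzip archive, write a UCSC track line, then stream several per-sample BED files into it. A minimal standalone sketch of just that pattern (Python 3; the helper name and arguments are illustrative, not part of the library):

import gzip

def concat_beds_gzip(bed_paths, out_path, track_name):
    # Write a UCSC track line, then append each input BED verbatim.
    with gzip.open(out_path, 'wt') as out:
        out.write("track name='%s'\n" % track_name)
        for path in bed_paths:
            with open(path) as bed:
                for line in bed:
                    out.write(line)
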
Example #2
def createLibrary(ex, assembly_or_fasta, params, url=GlobalHtsUrl, via='local'):
    """
    Main call to create the library
    """
    if len(params['primary'])<2:
        print('Some parameters are missing, cannot create the library')
        print('primary='+params['primary']+" ; "+'secondary='+params['secondary'])
        return [None,None,None,None]

    if not isinstance(assembly_or_fasta,genrep.Assembly):
        assembly_or_fasta = genrep.Assembly( ex=ex, fasta=assembly_or_fasta )
    chrnames = assembly_or_fasta.chrnames
    chrom_map = dict((v['ac'],k) for k,v in assembly_or_fasta.chrmeta.iteritems())
    allfiles = assembly_or_fasta.fasta_by_chrom  #assembly_or_fasta.untar_genome_fasta()

    libfiles = dict((c, getRestEnzymeOccAndSeq.nonblocking( ex, f,
                                                            params['primary'], params['secondary'],
                                                            params['length'],  params['type'],
                                                            via=via ))
                    for c, f in allfiles.iteritems())
    resfile = unique_filename_in()
    os.mkdir(resfile)
    bedfiles = {}
    for chrom, future in libfiles.iteritems():
        libfiles[chrom] = future.wait()
        if not os.path.getsize(libfiles[chrom][1])>0:
            time.sleep(60)
            touch(ex,libfiles[chrom][1])
        bedfiles[chrom] = parse_fragFile(libfiles[chrom][1],chrom_map)
    rescov = coverageInRepeats(ex, bedfiles, params['species'], outdir=resfile, via=via)
    bedchrom = [os.path.join(resfile,chrom+".bed") for chrom in chrnames]
    cat(bedchrom,out=resfile+".bed")
    gzipfile(ex,[resfile+".bed"]+bedchrom)
#    resfile_sql = resfile+".sql"
#    track.convert((resfile,'bed'),(resfile_sql,'sql'),assembly=params['species'])
    enz_list = []
    infos_lib = { 'assembly_name':  params['species'],
                  'enzyme1_id':     getEnzymeSeqId(params['primary'], True, enz_list, url),
                  'enzyme2_id':     getEnzymeSeqId(params['secondary'], True, enz_list, url),
                  'segment_length': params['length'],
                  'type':           params['type'],
                  'filename':       resfile }
    return [ libfiles, bedfiles, resfile, infos_lib ]
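
createLibrary scatters one getRestEnzymeOccAndSeq job per chromosome, then gathers the results with future.wait(). Outside of bein, the same scatter/gather shape can be sketched with the standard library; process_chromosome is a stand-in for the real per-chromosome job:

from concurrent.futures import ProcessPoolExecutor

def process_chromosome(chrom, fasta_path):
    # Stand-in for the real per-chromosome computation.
    return (chrom, fasta_path)

def scatter_gather(allfiles):
    with ProcessPoolExecutor() as pool:
        # Scatter: one submitted job per chromosome, keyed like libfiles above.
        futures = dict((c, pool.submit(process_chromosome, c, f))
                       for c, f in allfiles.items())
        # Gather: block on each result, mirroring the wait() loop.
        return dict((c, fut.result()) for c, fut in futures.items())
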
Example #3
def coverageInRepeats(ex, infile, genomeName='mm9', repeatsPath=GlobalRepbasePath,
                      outdir=None, via='lsf'):
    """
    Completes the segment info bed file with the coverage in repeats of each segment.
    For now, works only for mm9, hg19 and dm3.
    """
    if not(isinstance(infile,dict)):
        infile = {"":infile}
    if outdir is None:
        resfile = unique_filename_in()+".bed"
        outf = open(resfile,'w')
    repeatsFile = os.path.join(repeatsPath, genomeName, genomeName+'_rmsk.bed')
    if not(os.path.exists(repeatsFile)):
        print("coverage in repeats not calculated as file "+repeatsFile+" does not exist.")
        if outdir is None:
            outf.close()
            cat([inf[0] for inf in infile.values()],out=resfile)
        else:
            for chrom,inf in infile.iteritems():
                shutil.copy(inf[0], os.path.join(outdir,chrom+".bed"))
            resfile = outdir
        return resfile
    futures = {}
    for chrom,inf in infile.iteritems():
        tmpfile = unique_filename_in()
        futures[chrom] = (tmpfile,coverageBed.nonblocking(ex,repeatsFile,inf[0],via=via,stdout=tmpfile))
    for chrom,fut in futures.iteritems():
        if not(outdir is None):
            resfile = os.path.join(outdir,chrom+".bed")
            outf = open(resfile,'w')
        fut[1].wait()
        coverout = track(fut[0],format='text',fields=['chr','start','end','name','c1','c2','c3','c4'])
        for s in sorted_stream(coverout.read(),[chrom]):
            s_split = s[3].split('|')
            infos = '|'.join(s_split[0:(len(s_split)-4)]+list(s[4:8]))
            outf.write('\t'.join([str(x) for x in s[0:3]+(infos,)])+'\n')
        if not(outdir is None):
            outf.close()
    if outdir is None: outf.close()
    else: resfile = outdir
    return resfile
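
The write loop rebuilds each BED name field: the last four '|'-separated tokens are dropped and replaced by the four coverageBed output columns. With made-up values, the string manipulation alone works like this:

# one streamed record: (chr, start, end, name, c1, c2, c3, c4)
s = ('chr1', 100, 250, 'frag12|status=valid|0|0|0|0', '3', '150', '150', '1.0')
s_split = s[3].split('|')
infos = '|'.join(s_split[0:(len(s_split)-4)] + list(s[4:8]))
print('\t'.join([str(x) for x in s[0:3] + (infos,)]))
# -> chr1    100    250    frag12|status=valid|3|150|150|1.0
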
Example #4
def save_wellington(ex, wellout, chrmeta):
    bedlist = {}
    for name, wlist in wellout.iteritems():
        wellall = unique_filename_in()
        #### Dummy file
        touch(ex, wellall)
        ex.add(wellall,
               description=set_file_descr(name[1] + '_wellington_files',
                                          type='none',
                                          view='admin',
                                          step='footprints',
                                          groupId=name[0]))
        #### BED at FDR 1%
        bedlist[name[0]] = wellall + "FDR01.bed.gz"
        bedzip = gzip.open(bedlist[name[0]], 'wb')
        bedzip.write("track name='" + name[1] +
                     "_WellingtonFootprints_FDR_0.01'\n")
        for x in wlist:
            with open(os.path.join(*x) +
                      ".WellingtonFootprints.FDR.0.01.bed") as _bed:
                for line in _bed:
                    bedzip.write(line)
        bedzip.close()
        ex.add(wellall + "FDR01.bed.gz",
               description=set_file_descr(name[1] +
                                          '_WellingtonFootprintsFDR01.bed.gz',
                                          type='bed',
                                          ucsc='1',
                                          step='footprints',
                                          groupId=name[0]),
               associate_to_filename=wellall,
               template='%s_WellingtonFootprintsFDR01.bed.gz')
        #### BED at p-values [...]
        bedzip = gzip.open(wellall + "PvalCutoffs.bed.gz", 'wb')
        for bfile in os.listdir(os.path.join(wlist[0][0], "p_value_cutoffs")):
            cut = os.path.splitext(
                bfile[:-4])[1][1:]  # cutoff value: the token between the last '.' and the trailing '.bed'
            bedzip.write("track name='" + name[1] +
                         "_WellingtonFootprints_Pval_%s'\n" % cut)
            for wdir, wpref in wlist:
                _bedpath = os.path.join(
                    wdir, "p_value_cutoffs",
                    wpref + ".WellingtonFootprints." + cut + ".bed")
                with open(_bedpath) as _bed:
                    for line in _bed:
                        bedzip.write(line)
        bedzip.close()
        ex.add(wellall + "PvalCutoffs.bed.gz",
               description=set_file_descr(
                   name[1] + '_WellingtonFootprintsPvalCutoffs.bed.gz',
                   type='bed',
                   ucsc='1',
                   step='footprints',
                   groupId=name[0]),
               associate_to_filename=wellall,
               template='%s_WellingtonFootprintsPvalCutoffs.bed.gz')
        #### WIG
        cat([os.path.join(*x) + ".WellingtonFootprints.wig" for x in wlist],
            wellall + ".wig")
        #convert(wellall+".wig", wellall+".bw", chrmeta=chrmeta)
        #ex.add(wellall+".bw",
        #       description=set_file_descr(name[1]+'_WellingtonFootprints.bw',
        #                                  type='bigWig', ucsc='1', step='footprints', groupId=name[0]),
        #       associate_to_filename=wellall, template='%s_WellingtonFootprints.bw')
        ex.add(wellall + ".wig",
               description=set_file_descr(name[1] +
                                          '_WellingtonFootprints.wig',
                                          type='wig',
                                          ucsc='1',
                                          step='footprints',
                                          groupId=name[0]),
               associate_to_filename=wellall,
               template='%s_WellingtonFootprints.wig')
    return bedlist
Example #5
    def find_junctions(self,
                       soapsplice_index=None,
                       path_to_soapsplice=None,
                       soapsplice_options={}):
        """
        Retrieve unmapped reads from a previous mapping and run SOAPsplice on them.
        Return the name of a .bed track giving the junction positions, as well as
        that of a .bam file of the alignments supporting the junctions.

        :param soapsplice_index: (str) path to the SOAPsplice index.
        :param path_to_soapsplice: (str) specify the path to the program if it is not in your $PATH.
        :param soapsplice_options: (dict) SOAPsplice options, e.g. {'-m':2}.
        :rtype: str, str
        """
        @program
        def soapsplice(unmapped_R1,
                       unmapped_R2,
                       index,
                       output=None,
                       path_to_soapsplice=None,
                       options={}):
            """Bind 'soapsplice'. Return a text file containing the list of junctions.

            :param unmapped_R1: (str) path to the fastq file containing the 'left' reads.
            :param unmapped_R2: (str) path to the fastq file containing the 'right' reads.
            :param index: (str) path to the SOAPsplice index.
            :param output: (str) output file name.
            :param path_to_soapsplice: (str) path to the SOAPsplice executable.
                If not specified, the program must be in your $PATH.
            :param options: (dict) SOAPsplice options, given as {opt: value}.
            :rtype: str

            Main options::

                -p: number of threads, <= 20. [1]
                -S: 1: forward strand, 2: reverse strand, 3: both. [3]
                -m: maximum mismatch for one-segment alignment, <= 5. [3]
                -g: maximum indel for one-segment alignment, <= 2. [2]
                -i: length of tail that can be ignored in one-segment alignment. [7]
                -t: longest gap between two segments in two-segment alignment. [500000]
                -a: shortest length of a segment in two-segment alignment. [8]
                -q: input quality type in FASTQ file (0: old Illumina, 1: Sanger). [0]
                -L: maximum distance between paired-end reads. [500000]
                -l: minimum distance between paired-end reads. [50]
                -I: insert length of paired-end reads.
            """
            if not output: output = unique_filename_in()
            path_to_soapsplice = path_to_soapsplice or 'soapsplice'
            args = [
                path_to_soapsplice, '-d', index, '-1', unmapped_R1, '-2',
                unmapped_R2, '-o', output, '-f', '2'
            ]
            opts = []
            for k, v in options.iteritems():
                opts.extend([str(k), str(v)])
            return {"arguments": args + opts, "return_value": output}

        if not program_exists('soapsplice'):
            self.write_debug("Skipped junctions search: soapsplice not found.")
            return
        self.assembly.set_index_path(intype=3)
        soapsplice_index = soapsplice_index or self.assembly.index_path
        soapsplice_options.update(
            self.job.options.get('soapsplice_options', {}))
        soapsplice_options.setdefault('-p', 16)  # number of threads
        soapsplice_options.setdefault('-q', 1)  # Sanger format
        unmapped_fastq = {}
        for gid, group in self.job.groups.iteritems():
            unmapped_fastq[gid] = []
            for rid, run in group['runs'].iteritems():
                unmapped = self.job.files[gid][rid].get('unmapped_fastq')
                if not unmapped:
                    self.write_log(
                        "No unmapped reads found for group %s, run %d. Skip." %
                        (gid, rid))
                    continue
                elif not isinstance(unmapped, tuple):
                    self.write_log("Pair-end reads required. Skip.")
                    continue
                unmapped_fastq[gid].append(unmapped)
            if len(unmapped_fastq[gid]) == 0:
                continue
            R1 = cat(zip(*unmapped_fastq[gid])[0])
            R2 = cat(zip(*unmapped_fastq[gid])[1])
            future = soapsplice.nonblocking(
                self.ex,
                R1,
                R2,
                soapsplice_index,
                path_to_soapsplice=path_to_soapsplice,
                options=soapsplice_options,
                via=self.via,
                memory=8,
                threads=soapsplice_options['-p'])
            try:
                template = future.wait()
            except Exception as err:
                self.write_debug("SOAPsplice failed: %s." % str(err))
                return
            if template is None:
                self.write_debug("SOAPsplice failed.")
                return
            junc_file = template + '.junc'
            bed = self.convert_junc_file(junc_file, self.assembly)
            bed_descr = set_file_descr('junctions_%s.bed' % group['name'],
                                       groupId=gid,
                                       type='bed',
                                       step='junctions',
                                       ucsc=1)
            bam_descr = set_file_descr('junctions_%s.bam' % group['name'],
                                       groupId=gid,
                                       type='bam',
                                       step='junctions',
                                       ucsc=0)
            sam = template + '.sam'
            try:
                bam = sam_to_bam(self.ex, sam, reheader=self.assembly.name)
                add_and_index_bam(self.ex, bam, description=bam_descr)
                self.ex.add(bam, description=bam_descr)
            except Exception as e:
                self.write_debug(
                    "%s\n(Qualities may be in the wrong format, try with '-q 0'.)"
                    % str(e))
            self.ex.add(bed, description=bed_descr)
        return bed, bam
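
The soapsplice binding shows the general bein pattern: a fixed argv list plus the options dict flattened into extra tokens. The argument construction on its own, with example values (nothing is executed here):

def build_soapsplice_args(index, r1, r2, output, options=None, exe='soapsplice'):
    # Fixed arguments first, then each {flag: value} pair appended as two tokens.
    args = [exe, '-d', index, '-1', r1, '-2', r2, '-o', output, '-f', '2']
    for k, v in (options or {}).items():
        args.extend([str(k), str(v)])
    return args

print(build_soapsplice_args('idx', 'R1.fq', 'R2.fq', 'out', {'-p': 16}))
# -> ['soapsplice', '-d', 'idx', '-1', 'R1.fq', '-2', 'R2.fq', '-o', 'out', '-f', '2', '-p', '16']
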
Example #6
    def find_junctions(self, soapsplice_index=None, path_to_soapsplice=None, soapsplice_options={}):
        """
        Retrieve unmapped reads from a previous mapping and run SOAPsplice on them.
        Return the name of a .bed track giving the junction positions, as well as
        that of a .bam file of the alignments supporting the junctions.

        :param soapsplice_index: (str) path to the SOAPsplice index.
        :param path_to_soapsplice: (str) specify the path to the program if it is not in your $PATH.
        :param soapsplice_options: (dict) SOAPsplice options, e.g. {'-m':2}.
        :rtype: str, str
        """

        @program
        def soapsplice(unmapped_R1, unmapped_R2, index, output=None, path_to_soapsplice=None, options={}):
            """Bind 'soapsplice'. Return a text file containing the list of junctions.

            :param unmapped_R1: (str) path to the fastq file containing the 'left' reads.
            :param unmapped_R2: (str) path to the fastq file containing the 'right' reads.
            :param index: (str) path to the SOAPsplice index.
            :param output: (str) output file name.
            :param path_to_soapsplice: (str) path to the SOAPsplice executable.
                If not specified, the program must be in your $PATH.
            :param options: (dict) SOAPsplice options, given as {opt: value}.
            :rtype: str

            Main options::

                -p: number of threads, <= 20. [1]
                -S: 1: forward strand, 2: reverse strand, 3: both. [3]
                -m: maximum mismatch for one-segment alignment, <= 5. [3]
                -g: maximum indel for one-segment alignment, <= 2. [2]
                -i: length of tail that can be ignored in one-segment alignment. [7]
                -t: longest gap between two segments in two-segment alignment. [500000]
                -a: shortest length of a segment in two-segment alignment. [8]
                -q: input quality type in FASTQ file (0: old Illumina, 1: Sanger). [0]
                -L: maximum distance between paired-end reads. [500000]
                -l: minimum distance between paired-end reads. [50]
                -I: insert length of paired-end reads.
            """
            if not output: output = unique_filename_in()
            path_to_soapsplice = path_to_soapsplice or 'soapsplice'
            args = [path_to_soapsplice,'-d',index,'-1',unmapped_R1,'-2',unmapped_R2,'-o',output,'-f','2']
            opts = []
            for k,v in options.iteritems(): opts.extend([str(k),str(v)])
            return {"arguments": args+opts, "return_value": output}

        if not program_exists('soapsplice'):
            self.write_debug("Skipped junctions search: soapsplice not found.")
            return
        self.assembly.set_index_path(intype=3)
        soapsplice_index = soapsplice_index or self.assembly.index_path
        soapsplice_options.update(self.job.options.get('soapsplice_options',{}))
        soapsplice_options.setdefault('-p',16) # number of threads
        soapsplice_options.setdefault('-q',1)  # Sanger format
        unmapped_fastq = {}
        for gid, group in self.job.groups.iteritems():
            unmapped_fastq[gid] = []
            for rid, run in group['runs'].iteritems():
                unmapped = self.job.files[gid][rid].get('unmapped_fastq')
                if not unmapped:
                    self.write_log("No unmapped reads found for group %s, run %d. Skip." % (gid,rid))
                    continue
                elif not isinstance(unmapped,tuple):
                    self.write_log("Pair-end reads required. Skip.")
                    continue
                unmapped_fastq[gid].append(unmapped)
            if len(unmapped_fastq[gid]) == 0:
                continue
            R1 = cat(zip(*unmapped_fastq[gid])[0])
            R2 = cat(zip(*unmapped_fastq[gid])[1])
            future = soapsplice.nonblocking(self.ex,R1,R2,soapsplice_index,
                                            path_to_soapsplice=path_to_soapsplice,
                                            options=soapsplice_options,
                                            via=self.via, memory=8, threads=soapsplice_options['-p'])
            try:
                template = future.wait()
            except Exception as err:
                self.write_debug("SOAPsplice failed: %s." % str(err))
                return
            if template is None:
                self.write_debug("SOAPsplice failed.")
                return
            junc_file = template+'.junc'
            bed = self.convert_junc_file(junc_file,self.assembly)
            bed_descr = set_file_descr('junctions_%s.bed' % group['name'],
                                       groupId=gid,type='bed',step='junctions', ucsc=1)
            bam_descr = set_file_descr('junctions_%s.bam' % group['name'],
                                       groupId=gid,type='bam',step='junctions', ucsc=0)
            sam = template+'.sam'
            try:
                bam = sam_to_bam(self.ex,sam,reheader=self.assembly.name)
                add_and_index_bam(self.ex, bam, description=bam_descr)
                self.ex.add(bam, description=bam_descr)
            except Exception as e:
                self.write_debug("%s\n(Qualities may be in the wrong format, try with '-q 0'.)" %str(e))
            self.ex.add(bed, description=bed_descr)
        return bed, bam
Example #7
def c4seq_workflow( ex, job, primers_dict, assembly,
                    c4_url=None, script_path='', logfile=sys.stdout, via='lsf' ):
    '''
    Main
    * open the 4C-seq minilims and create execution
    * 0. get/create the library
    * 1. if necessary, calculate the density file from the bam file (mapseq.parallel_density_sql)
    * 2. calculate the count per fragment for each density file (using gfminer's score_by_feature)
    '''

    mapseq_files = job.files
    ### outputs
    processed = {'lib': {}, 'density': {}, '4cseq': {}}
    processed['4cseq'] = {'density_files' : {},
                          'countsPerFrag' : {},
                          'countsPerFrag_grp' : {},
                          'norm' : {},
                          'norm_grp' : {},
                          'profileCorrection': {},
                          'profileCorrection_grp' : {},
                          'smooth_grp' : {},
                          'domainogram_grp' : {},
                          'bricks2frags' : {}}
                            # was 'smoothFrag': {}, 'domainogram': {}}
    regToExclude = {}
    new_libs=[]
    ### options
    run_domainogram = {}
    before_profile_correction = {}
    if not job.options.get('viewpoints_chrs',False):
        out_chromosomes = ','.join([ch for ch in assembly.chrnames])
    else:
        out_chromosomes = ','.join([primers_dict.get(group['name'],{}).get('baitcoord').split(':')[0] for gid,group in job.groups.iteritems()])
    print "out_chromosomes=" + out_chromosomes + "\n"

    sizeExt = job.options.get('norm_reg',1000000)
    print "region considered for normalisation: mid viewpoint +/-" + str(sizeExt) + 'bps'

    ### do it
    for gid, group in job.groups.iteritems():
        run_domainogram[gid] = group.get('run_domainogram',False)
        if isinstance(run_domainogram[gid],basestring):
            run_domainogram[gid] = (run_domainogram[gid].lower() in ['1','true','on','t'])
        before_profile_correction[gid] = group.get('before_profile_correction',False)
        if isinstance(before_profile_correction[gid],basestring):
            before_profile_correction[gid] = (before_profile_correction[gid].lower() in ['1','true','on','t'])
        processed['lib'][gid] = get_libForGrp(ex, group, assembly,
                                              new_libs, gid, c4_url, via=via)
        #reffile='/archive/epfl/bbcf/data/DubouleDaan/library_Nla_30bps/library_Nla_30bps_segmentInfos.bed'
        processed['4cseq']['density_files'][gid] = {}
        regToExclude[gid] = primers_dict.get(group['name'],{}).get('regToExclude',"").replace('\r','')

        # if no regToExclude defined, set it as mid_baitCoord +/-5kb
        if len(regToExclude[gid]) == 0:
            baitcoord = primers_dict.get(group['name'],{}).get('baitcoord')
            bait_chr, bait_range = baitcoord.split(':')
            bait_start, bait_end = bait_range.split('-')
            baitcoord_mid = (int(bait_start) + int(bait_end)) // 2
            regToExclude[gid] = bait_chr + ':' + str(baitcoord_mid-5000) + '-' + str(baitcoord_mid+5000)

        #print(';'.join([k+"="+v for k,v in primers_dict.get(group['name'],{}).iteritems()]))
        print(primers_dict.get(group['name'],{}))
        print "regToExclude["+str(gid)+"]="+regToExclude[gid]
        for rid,run in group['runs'].iteritems():
            libname = mapseq_files[gid][rid]['libname']
            if job.options.get('merge_strands') != 0 or not('wig' in mapseq_files[gid][rid]):
                density_file=parallel_density_sql( ex, mapseq_files[gid][rid]['bam'],
                                                   assembly.chrmeta,
                                                   nreads=mapseq_files[gid][rid]['stats']["total"],
                                                   merge=0,
                                                   read_extension=mapseq_files[gid][rid]['stats']['read_length'],
                                                   convert=False,
                                                   via=via )
                density_file += "merged.sql"
                ex.add( density_file,
                        description=set_file_descr("density_file_"+libname+".sql",
                                                   groupId=gid,step="density",type="sql",view='admin',gdv="1") )
            else:
                density_file = mapseq_files[gid][rid]['wig']['merged']
            #density_files.append(density_file)
            processed['4cseq']['density_files'][gid][rid]=density_file

        # back to grp level!
        # not anymore:
        # processed['density'][gid] = merge_sql(ex, density_files, via=via)

    processed['4cseq']['countsPerFrag'] = density_to_countsPerFrag( ex, processed, job.groups, assembly, regToExclude, script_path, via )
    ## access per gid+rid

    futures_norm = {}
    countsPerFrags_bedGraph = {}
    futures_merged_raw = {}
    for gid, group in job.groups.iteritems():
        futures_norm[gid] = {}
        countsPerFrags_bedGraph[gid] = {}
        processed['4cseq']['norm'][gid] = {}
        for rid,run in group['runs'].iteritems():
            normfile = unique_filename_in()
            touch(ex, normfile)
            resfile = unique_filename_in()+".bedGraph"
            resfiles = processed['4cseq']['countsPerFrag'][gid][rid] # _all.sql
            convert(resfiles[3],resfile)
            countsPerFrags_bedGraph[gid][rid] = resfile

            print "call normFrags: infiles="+resfile+", normfile="+normfile+"baitCoord="+primers_dict[group['name']]['baitcoord']+", sizeExt=sizeExt, name="+ group['name']+"rep_"+str(rid) + "regToExclude="+regToExclude[gid]+"\n"
            futures_norm[gid][rid] = normFrags.nonblocking( ex, resfile, normfile, baitCoord=primers_dict[group['name']]['baitcoord'], sizeExt=sizeExt, name=group['name']+"rep_"+str(rid) ,regToExclude=regToExclude[gid], script_path=script_path, via=via )
            processed['4cseq']['norm'][gid][rid] = normfile

        if len(group['runs']) > 1:
            ## merge replicates before normalisation.
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName=group['name']+"_raw_mergedRep"
            print "gid="+group['name']
            print "call mergeRep for replicates before normalisation: infiles="+",".join([res_rid for rid,res_rid in countsPerFrags_bedGraph[gid].iteritems()])+", mergedfile="+mergefile+", regToExclude="+regToExclude[gid]+"\n"
            futures_merged_raw[gid] = mergeRep.nonblocking( ex, ",".join([res_rid for rid,res_rid in countsPerFrags_bedGraph[gid].iteritems()]), mergefile, regToExclude[gid], name=titleName, script_path=script_path, via=via , memory= 8)
            processed['4cseq']['countsPerFrag_grp'][gid] = mergefile
        else:
            futures_merged_raw[gid] = None
            processed['4cseq']['countsPerFrag_grp'][gid] = countsPerFrags_bedGraph[gid][0] #if no replicates, then the file we want is the 1st one

    print "***** profile correction / sample + merge normalised data"
    futures_merged = {} # per gid
    futures_profcor = {} # per gid, per rid
    for gid, group in job.groups.iteritems():
        ## run profile correction per run then merge them
        futures_profcor[gid] = {}
        processed['4cseq']['profileCorrection'][gid] = {}
        for rid, run in group['runs'].iteritems():
            # wait for normalisation of all replicates to be finished
            futures_norm[gid][rid].wait() ## normalised files, per grp, per rep
            normfile = processed['4cseq']['norm'][gid][rid]
            file1 = unique_filename_in() #track file
            touch(ex,file1)
            file2 = unique_filename_in() #report file
            touch(ex,file2)
            file3 = unique_filename_in() #table file
            touch(ex, file3)
            print "call profileCorrection: normfile="+normfile+", baitCoord="+primers_dict[group['name']]['baitcoord']+", name="+group['name']+", file1="+file1+", file2="+file2+", file3= "+file3+"\n"
            futures_profcor[gid][rid] = profileCorrection.nonblocking( ex, normfile,
                                        primers_dict[group['name']]['baitcoord'],
                                        group['name'], file1, file2, file3, script_path,
                                        via=via )
            processed['4cseq']['profileCorrection'][gid][rid] = [file1, file2, file3]

        ## merge replicates before profile correction. Needs all normalisation for the given grp to be finished, this is why it comes after the rid loop.
        if len(group['runs']) > 1:
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName=group['name']+"_norm_mergedRep"
            print "gid="+group['name']
            print "call mergeRep: infiles="+",".join([res_rid for rid,res_rid in processed['4cseq']['norm'][gid].iteritems()])+", mergedfile="+mergefile+", regToExclude="+regToExclude[gid]+"\n"
            futures_merged[gid] = mergeRep.nonblocking( ex, ",".join([res_rid for rid,res_rid in processed['4cseq']['norm'][gid].iteritems()]), mergefile, regToExclude[gid], name=titleName, script_path=script_path, via=via , memory= 8)
            processed['4cseq']['norm_grp'][gid] = mergefile
        else:
            futures_merged[gid] = None
            processed['4cseq']['norm_grp'][gid] = processed['4cseq']['norm'][gid][0] ##if no replicates, then the file we want is the 1st one

    print "***** merge profile corrected data"
    futures_profcor_merged = {} # per gid
    for gid, group in job.groups.iteritems():
        processed['4cseq']['profileCorrection_grp'][gid] = {}
        for rid, run in group['runs'].iteritems():
            futures_profcor[gid][rid].wait()   ## wait for ProfileCorrection to be finished

        ## merge replicates after profile correction
        if len(group['runs']) > 1:
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName=group['name']+"_ProfCor_mergedRep"
            pcfiles = [ processed['4cseq']['profileCorrection'][gid][rid][0] for rid,res_rid in processed['4cseq']['profileCorrection'][gid].iteritems()]
            print "call mergeRep (for PC tables): infiles="+",".join(pcfiles)+", mergedfile="+mergefile+", regToExclude="+regToExclude[gid]+"\n"
            futures_profcor_merged[gid] = mergeRep.nonblocking( ex, ",".join(pcfiles), mergefile, regToExclude[gid], name=titleName, script_path=script_path, via=via , memory= 8)
            processed['4cseq']['profileCorrection_grp'][gid] = mergefile
        else:
            futures_profcor_merged[gid] = None
            processed['4cseq']['profileCorrection_grp'][gid] = processed['4cseq']['profileCorrection'][gid][0] ##if no replicates, then the file we want is the 1st one


    print "***** smooth data"
    futures_smoothed = {}
    for gid, group in job.groups.iteritems():
        file1 = unique_filename_in()
        touch(ex,file1)
        file2 = unique_filename_in()
        touch(ex, file2)
        file3 = unique_filename_in()
        touch(ex, file3)
        nFragsPerWin = group['window_size']
        if futures_merged_raw[gid]: futures_merged_raw[gid].wait() ## wait for merging of raw_grp (None when the group has a single replicate)
        futures_smoothed[gid] = ( smoothFragFile.nonblocking( ex, processed['4cseq']['countsPerFrag_grp'][gid], nFragsPerWin, group['name'],
                                                    file1, regToExclude[gid], script_path=script_path, via=via, memory=6 ), )
        if futures_merged[gid]: futures_merged[gid].wait() ## wait for merging of norm_grp (None when the group has a single replicate)
        futures_smoothed[gid] += ( smoothFragFile.nonblocking( ex, processed['4cseq']['norm_grp'][gid], nFragsPerWin, group['name']+"_norm",
                                                    file2, regToExclude[gid], script_path=script_path, via=via, memory=6 ), )
        if futures_profcor_merged[gid]: futures_profcor_merged[gid].wait() # wait for merged profile-corrected data (None when the group has a single replicate)
        futures_smoothed[gid] += ( smoothFragFile.nonblocking( ex, processed['4cseq']['profileCorrection_grp'][gid], nFragsPerWin, group['name']+"_fromProfileCorrected",
                                                    file3, regToExclude[gid], script_path=script_path, via=via, memory=6 ), )
        processed['4cseq']['smooth_grp'][gid] = [file1,file2,file3] #[smoothed_file_before_Norm, smoothed file before PC, smoothed file after PC]

    print "***** Domainograms"
    futures_domainograms = {}
    for gid, group in job.groups.iteritems():
        grName = job.groups[gid]['name']
        if run_domainogram[gid]:
            regCoord = regToExclude[gid] or primers_dict[grName]['baitcoord']
            if before_profile_correction[gid]:
                futures_domainograms[gid] = runDomainogram.nonblocking( ex, processed['4cseq']['norm_grp'][gid],
                                                                        grName, regCoord=regCoord, skip=1,
                                                                        script_path=script_path, via=via, memory=15 )
            else:
                futures_domainograms[gid] = runDomainogram.nonblocking( ex, processed['4cseq']['profileCorrection_grp'][gid],
                                                                            grName, regCoord=regCoord.split(':')[0], skip=1,
                                                                            script_path=script_path, via=via, memory=15 )

    ## prepare tar files for domainogram results (if any)
    ## and create "BRICKS to frags" files
    print "***** BRICKS to Frags"
    futures_BRICKS2Frags = {}
    for gid, f in futures_domainograms.iteritems():
        if run_domainogram[gid]: # if domainogram has been run
            resFiles = []
            logFile = f.wait()
            start = False
            tarname = job.groups[gid]['name']+"_domainogram.tar.gz"
            res_tar = tarfile.open(tarname, "w:gz")
            futures_BRICKS2Frags[gid] = []
            processed['4cseq']['bricks2frags'][gid] = []
            if logFile is None: continue
            with open(logFile) as f:
                for s in f:
                    s = s.strip()
                    if '####resfiles####' in s:
                        start = True
                    elif start and "RData" not in s:
                        resFiles.append(s)
                        res_tar.add(s)
                    if start and "foundBRICKS" in s:
                        bricks2fragsfile = unique_filename_in()+".bedGraph"
                        touch(ex, bricks2fragsfile)
                        futures_BRICKS2Frags[gid] += [ BRICKSToFrag.nonblocking(ex, s, processed['4cseq']['norm_grp'][gid], bricks2fragsfile, script_path=script_path, via=via, memory=4 ) ]
                        processed['4cseq']['bricks2frags'][gid] += [ bricks2fragsfile ]
            res_tar.close()
            processed['4cseq']['domainogram_grp'][gid] = resFiles + [tarname]

############### prepare tables for global results
    print "***** combine results into tables "
    allNames=[]
    allFiles=[]
    allRegToExclude=[]
    for gid, group in job.groups.iteritems():
        for rid,run in group['runs'].iteritems():
            allNames += [ group['name']+"_rep"+str(rid)+"_norm", group['name']+"_rep"+str(rid)+"_fit" ]
            allFiles += [ processed['4cseq']['profileCorrection'][gid][rid][2] ]
            allRegToExclude += [ regToExclude[gid] ]
    tablePC=unique_filename_in()+".txt"
    print("***will call makeTable with:")
    print(",".join(allFiles))
    print("resfile="+tablePC)
    print(",".join(allNames))
    touch(ex,tablePC)

    #regToExclude[gid]

    futures_tables = (makeTable.nonblocking(ex, ",".join(allFiles), tablePC, ",".join(allNames), idCols="4,5", all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8 ), )

    # wait for all smoothing to be done
    for gid, fg in futures_smoothed.iteritems():
        for f in fg: f.wait()

    ## make Table raw/smoothed_raw
    print("** make Table raw/smoothed_raw")
    allNames=[]
    allFiles=[]
    allRegToExclude=[]
    for gid, group in job.groups.iteritems():
        if futures_merged_raw[gid]: futures_merged_raw[gid].wait()
        allNames += [ group['name']+"_raw", group['name']+"_rawSmoothed" ]
        allFiles += [ processed['4cseq']['countsPerFrag_grp'][gid], processed['4cseq']['smooth_grp'][gid][0] ]
        allRegToExclude += [ 'NA', regToExclude[gid] ]

    tableSmoothedRaw_grp=unique_filename_in()+".txt"
    touch(ex,tableSmoothedRaw_grp)
    futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableSmoothedRaw_grp, ",".join(allNames), idCols="4", out_chromosomes = out_chromosomes, all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8 ), )

    ## make Table norm/smoothed_norm before PC
    print("** make Table norm/smoothed_norm befor PC")
    allNames=[]
    allFiles=[]
    allRegToExclude=[]
    for gid, group in job.groups.iteritems():
        allNames += [ group['name']+"_norm", group['name']+"_smoothed" ]
        allFiles += [ processed['4cseq']['norm_grp'][gid], processed['4cseq']['smooth_grp'][gid][1] ]
        allRegToExclude += [ regToExclude[gid], regToExclude[gid] ]

    tableSmoothed_grp=unique_filename_in()+".txt"
    touch(ex,tableSmoothed_grp)
    futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableSmoothed_grp, ",".join(allNames), idCols="4", out_chromosomes = out_chromosomes, all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8 ), )

    ## make Table norm/smoothed_norm after PC
    print("** make Table norm/smoothed_norm after PC")
    allNames=[]
    allFiles=[]
    allRegToExclude=[]
    for gid, group in job.groups.iteritems():
        allNames += [ group['name']+"_normPC", group['name']+"_smoothedPC" ]
        allFiles += [ processed['4cseq']['profileCorrection_grp'][gid], processed['4cseq']['smooth_grp'][gid][2] ]
        allRegToExclude += [ regToExclude[gid], regToExclude[gid] ]

    tableSmoothedPC_grp=unique_filename_in()+".txt"
    touch(ex,tableSmoothedPC_grp)
    futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableSmoothedPC_grp, ",".join(allNames), idCols="4", out_chromosomes = out_chromosomes, all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8 ), )

    ## combine BRICKS2Frags files
    allNames=[]
    allFiles=[]
    for gid, fg in futures_BRICKS2Frags.iteritems():
        for f in fg: f.wait()
        allNames += [ job.groups[gid]['name']+"_BRICKSpval" ]
        cat_bricks2frags = unique_filename_in()+".txt"
        print ','.join(processed['4cseq']['bricks2frags'][gid])
        cat_bricks2frags = cat(processed['4cseq']['bricks2frags'][gid],out=cat_bricks2frags)
        allFiles += [ cat_bricks2frags ]

    for gid, fg in futures_smoothed.iteritems():
        for f in fg: f.wait()

    tableBRICKS2Frags = unique_filename_in()+".txt"
    touch(ex,tableBRICKS2Frags)
    futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableBRICKS2Frags, ",".join(allNames), idCols="4", out_chromosomes = out_chromosomes, defVal="NA", script_path=script_path, via=via, memory=8 ), )


    for f in futures_tables: f.wait()


################ Add everything to minilims below!
    step = "density"
    for gid in processed['4cseq']['density_files'].keys():
        for rid, sql in processed['4cseq']['density_files'][gid].iteritems():
            fname = "density_file_"+job.groups[gid]['name']+"_merged_rep"+str(rid)
            ex.add( sql, description=set_file_descr( fname+".sql",
                                                 groupId=gid,step=step,type="sql",gdv="1" ) )
            wig = unique_filename_in()+".bw"
            convert( sql, wig )
            ex.add( wig, description=set_file_descr( fname+".bw",
                                                 groupId=gid,step=step,type="bigWig",ucsc="1") )
    step = "counts_per_frag" #was _norm_counts_per_frags # before normalisation process, per replicate
    for gid in processed['4cseq']['countsPerFrag'].keys():
        for rid, resfiles in processed['4cseq']['countsPerFrag'][gid].iteritems():
            fname = "meanScorePerFeature_"+job.groups[gid]['name']+"_rep"+str(rid)
            ex.add( resfiles[1], description=set_file_descr( fname+".sql",
                                                             groupId=gid,step=step,type="sql",view="admin",gdv='1'))
            #gzipfile(ex,resfiles[0])
            #ex.add( resfiles[0]+".gz", description=set_file_descr( fname+".bed.gz",
            #                                                       groupId=gid,step=step,type="bed",view="admin" ))
            fname = "segToFrag_"+job.groups[gid]['name']+"_rep"+str(rid)
            ex.add( resfiles[3], description=set_file_descr( fname+"_all.sql",
                                                             groupId=gid,step=step,type="sql",
                                                             comment="all informative frags - null included" ))
            trsql = track(resfiles[3])
            bwig = unique_filename_in()+".bw"
            trwig = track(bwig,chrmeta=trsql.chrmeta)
            trwig.write(trsql.read(fields=['chr','start','end','score'],
                                   selection={'score':(0.01,sys.maxint)}))
            trwig.close()
            ex.add( bwig, set_file_descr(fname+".bw",groupId=gid,step=step,type="bigWig",ucsc='1'))
        ## add segToFrags before normalisation
        if futures_merged_raw[gid]: futures_merged_raw[gid].wait()
        trbedgraph = track(removeNA(processed['4cseq']['countsPerFrag_grp'][gid]),format='bedgraph')
        bwig = unique_filename_in()+".bw"
        trwig = track(bwig,chrmeta=assembly.chrmeta)
        trwig.write(trbedgraph.read(fields=['chr','start','end','score'],
                               selection={'score':(0.01,sys.maxint)}))
        trwig.close()
        fname = "segToFrag_"+job.groups[gid]['name']
        ex.add( bwig, description=set_file_descr( fname+".bw",
                                                             groupId=gid,step=step,type="bigWig",
                                                             comment="segToFrag file before normalisation" ))

    step = "norm_counts_per_frags"  # after new normalisation process, combined replicates
    for gid, resfile in processed['4cseq']['norm_grp'].iteritems():
        fname = "normalised_scorePerFeature_"+job.groups[gid]['name']
        gzipfile(ex,resfile)
        ex.add( resfile+".gz", description=set_file_descr( fname+".bedGraph.gz", groupId=gid,step=step, type="bedGraph",ucsc='1'))
    # norm files, per replicates (might be removed)
    for gid, dict_gid in processed['4cseq']['norm'].iteritems():
        for rid, resfile in dict_gid.iteritems():
            fname = "normalised_scorePerFeature_"+job.groups[gid]['name']+"_rep"+str(rid)
            gzipfile(ex,resfile)
            ex.add(resfile+".gz",
                    description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))
    step = "profile_correction" # Profile corrected data, combined replicates
    for gid, profileCorrectedFile in processed['4cseq']['profileCorrection_grp'].iteritems():
        fname = "segToFrag_"+job.groups[gid]['name']+"_profileCorrected"
        gzipfile(ex,profileCorrectedFile)
        ex.add( profileCorrectedFile+".gz",
                description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))
    # Profile corrected, per replicate (might be removed)
    for gid, dict_gid in processed['4cseq']['profileCorrection'].iteritems():
        for rid, resfiles in dict_gid.iteritems():
    #        profileCorrectedFile = resfiles[0]
            reportProfileCorrection = resfiles[1]
            fname = "segToFrag_"+job.groups[gid]['name']+"_profileCorrected_rep"+str(rid)
    #        gzipfile(ex,profileCorrectedFile)
    #        ex.add( profileCorrectedFile+".gz",
    #                description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))
            ex.add( reportProfileCorrection, description=set_file_descr(fname+".pdf",
                                                                    groupId=gid,step=step,type="pdf"))
    step = "smoothing"
    for gid, resfiles in processed['4cseq']['smooth_grp'].iteritems():
        rawSmoothFile = resfiles[0]
        smoothFile = resfiles[1]
        afterProfileCorrection = resfiles[2]
        nFrags = str(job.groups[gid]['window_size'])
        ## smoothed file before normalisation
        fname = "segToFrag_"+job.groups[gid]['name']+"_smoothed_"+nFrags+"FragsPerWin.bedGraph.gz"
        gzipfile(ex,rawSmoothFile)
        ex.add(rawSmoothFile+".gz",
               description=set_file_descr(fname,groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))
        ## smoothed file after normalisation, before Profile correction
        fname = "segToFrag_"+job.groups[gid]['name']+"_norm_smoothed_"+nFrags+"FragsPerWin.bedGraph.gz"
        gzipfile(ex,smoothFile)
        ex.add(smoothFile+".gz",
               description=set_file_descr(fname,groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))
        ## smoothed file after normalisation, after Profile correction
        fname = "segToFrag_"+job.groups[gid]['name']+"_profileCorrected_smoothed_"+nFrags+"FragsPerWin.bedGraph.gz"
        gzipfile(ex,afterProfileCorrection)
        ex.add(afterProfileCorrection+".gz",
               description=set_file_descr(fname,groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))

    step = "domainograms"
    for gid, resfiles in processed['4cseq']['domainogram_grp'].iteritems():
        tarFile = resfiles.pop()
        fname = job.groups[gid]['name']+"_domainogram.tar.gz"
        ex.add(tarFile, description=set_file_descr(fname,
                                                   groupId=gid,step=step,type="tgz"))
        for s in resfiles:
            if s[-8:] == "bedGraph":
                gzipfile(ex,s)
                s += ".gz"
                ex.add( s, description=set_file_descr( s, groupId=gid,step=step,type="bedGraph",ucsc="1",gdv="1"))

    step = "combined_results"
    gzipfile(ex,tableSmoothedRaw_grp)
    ex.add(tableSmoothedRaw_grp+".gz", description=set_file_descr("table_segToFrags_smoothed_combined_replicates.txt.gz",step=step,type="txt"))

    gzipfile(ex,tableSmoothed_grp)
    ex.add(tableSmoothed_grp+".gz", description=set_file_descr("table_normalised_smoothed_combined_replicates.txt.gz",step=step,type="txt"))

    gzipfile(ex,tableSmoothedPC_grp)
    ex.add(tableSmoothedPC_grp+".gz", description=set_file_descr("table_profileCorrected_smoothed_combined_replicates.txt.gz",step=step,type="txt"))

    gzipfile(ex,tablePC)
    ex.add(tablePC+".gz", description=set_file_descr("table_normalised_fit_per_replicates.txt.gz",step=step,type="txt"))

    gzipfile(ex,tableBRICKS2Frags)
    ex.add(tableBRICKS2Frags+".gz", description=set_file_descr("table_frags_in_BRICKS_combined_replicates.txt.gz",step=step,type="txt"))

    return processed
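
Nearly every registration step above ends with the same two calls: gzipfile(ex, f) followed by ex.add(f + ".gz", description=set_file_descr(...)). Assuming gzipfile simply compresses the file in place, the idiom reduces to a helper like this sketch (illustrative, not library code):

import gzip, shutil

def gzip_and_register(ex, path, descr):
    # Compress the result file, then register the .gz in the minilims.
    with open(path, 'rb') as src, gzip.open(path + '.gz', 'wb') as dst:
        shutil.copyfileobj(src, dst)
    ex.add(path + '.gz', description=descr)
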
Example #8
def density_to_countsPerFrag( ex, file_dict, groups, assembly, regToExclude, script_path, via='lsf' ):
    '''
    Main function to compute normalised counts per fragment from a density file.
    '''
    futures = {}
    results = {}
    for gid, group in groups.iteritems():
        reffile = file_dict['lib'][gid]
        futures[gid] = {}
        results[gid] = {}
        for rid,run in group['runs'].iteritems():
            density_file = file_dict['4cseq']['density_files'][gid][rid]
            gm_futures = []
            for ch in assembly.chrnames:
                chref = os.path.join(reffile,ch+".bed.gz")
                if not(os.path.exists(chref)): chref = reffile
    #            features = track(chref,'bed')
    #            outbed.write(gMiner.stream.mean_score_by_feature(
    #                    scores.read(selection=ch),
    #                    features.read(selection=ch)), mode='append')
                bedfile = unique_filename_in()+".bed"
                gfminer_job = {"operation": "score_by_feature",
                               "output": bedfile,
                               "datatype": "qualitative",
                               "args": "'"+json.dumps({"trackScores":density_file,
                                                       "trackFeatures":chref,
                                                       "chromosome":ch})+"'"}
                gm_futures.append((gfminer_run.nonblocking(ex,gfminer_job,via=via),
                                   bedfile))
            outsql = unique_filename_in()+".sql"
            sqlouttr = track( outsql, chrmeta=assembly.chrmeta,
                              info={'datatype':'quantitative'},
                              fields=['start', 'end', 'score'] )
            outbed_all = []
            for n,f in enumerate(gm_futures):
                f[0].wait()
                fout = f[1]
                if not(os.path.exists(fout)):
                    time.sleep(60)
                    touch(ex,fout)
                outbed_all.append(fout)
                outbed = track(fout, chrmeta=assembly.chrmeta)
                sqlouttr.write( outbed.read(fields=['start', 'end', 'score'],
                                            selection={'score':(0.01,sys.maxint)}),
                                chrom=assembly.chrnames[n] )
            sqlouttr.close()
            countsPerFragFile = unique_filename_in()+".bed"
            countsPerFragFile = cat(outbed_all,out=countsPerFragFile)
            results[gid][rid] = [ countsPerFragFile, outsql ]
            FragFile = unique_filename_in()
            touch(ex,FragFile)
            futures[gid][rid] = (FragFile,
                            segToFrag.nonblocking( ex, countsPerFragFile, regToExclude[gid],
                                                   script_path, via=via, stdout=FragFile ,
                                                   memory=4 ))
    def _parse_select_frag(stream):
        for s in stream:
            sr = s.strip().split('\t')
            if 'IsValid' in sr[2] and not any([w in sr[8] for w in ['_and_','BothRepeats','notValid']]):
                patt = re.search(r'([^:]+):(\d+)-(\d+)',sr[1])
                if patt:
                    coord = patt.groups()
#                    if float(sr[11])>0.0:
                    yield (coord[0], int(coord[1])-1, int(coord[2]), float(sr[11]))

    for gid, dict_gid in futures.iteritems():
        for rid, res in dict_gid.iteritems():
            res[1].wait()
            touch(ex,res[0])
            segOut = open(res[0],"r")
            resBedGraph = unique_filename_in()+".sql"
            sqlTr = track( resBedGraph, fields=['start','end','score'],
                           info={'datatype':'quantitative'}, chrmeta=assembly.chrmeta )
            sqlTr.write(_parse_select_frag(segOut),fields=['chr','start','end','score'])
            sqlTr.close()
            segOut.close()
            results[gid][rid].extend([res[0],resBedGraph])
    return results #[countsPerFrag_allBed, countsPerFrag_selectSql, segToFrag_out, segToFrag_sql]
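
_parse_select_frag keeps only 'IsValid' rows with a clean repeat annotation (index 8) and re-derives BED-style coordinates (0-based start) from the chr:start-end token at index 1. Fed one made-up tab-separated line (field layout inferred from the indices used above), it behaves like this:

import re

def _parse_select_frag(stream):
    for s in stream:
        sr = s.strip().split('\t')
        if 'IsValid' in sr[2] and not any([w in sr[8] for w in ['_and_','BothRepeats','notValid']]):
            patt = re.search(r'([^:]+):(\d+)-(\d+)', sr[1])
            if patt:
                coord = patt.groups()
                yield (coord[0], int(coord[1])-1, int(coord[2]), float(sr[11]))

# 12 tab-separated fields; only indices 1, 2, 8 and 11 are inspected.
row = '\t'.join(['f42', 'chr2:1200-1350', 'IsValid', '', '', '', '', '',
                 'noRepeat', '', '', '4.5'])
print(list(_parse_select_frag([row])))   # [('chr2', 1199, 1350, 4.5)]
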
Example #9
def c4seq_workflow(ex,
                   job,
                   primers_dict,
                   assembly,
                   c4_url=None,
                   script_path='',
                   logfile=sys.stdout,
                   via='lsf'):
    '''
    Main
    * open the 4C-seq minilims and create execution
    * 0. get/create the library
    * 1. if necessary, calculate the density file from the bam file (mapseq.parallel_density_sql)
    * 2. calculate the count per fragment for each density file (using gfminer's score_by_feature)
    '''

    mapseq_files = job.files
    ### outputs
    processed = {'lib': {}, 'density': {}, '4cseq': {}}
    processed['4cseq'] = {
        'density_files': {},
        'countsPerFrag': {},
        'countsPerFrag_grp': {},
        'norm': {},
        'norm_grp': {},
        'profileCorrection': {},
        'profileCorrection_grp': {},
        'smooth_grp': {},
        'domainogram_grp': {},
        'bricks2frags': {}
    }
    # was 'smoothFrag': {}, 'domainogram': {}}
    regToExclude = {}
    new_libs = []
    ### options
    run_domainogram = {}
    before_profile_correction = {}
    if not job.options.get('viewpoints_chrs', False):
        out_chromosomes = ','.join([ch for ch in assembly.chrnames])
    else:
        out_chromosomes = ','.join([
            primers_dict.get(group['name'], {}).get('baitcoord').split(':')[0]
            for gid, group in job.groups.iteritems()
        ])
    print "out_chromosomes=" + out_chromosomes + "\n"

    sizeExt = job.options.get('norm_reg', 1000000)
    print "region considered for normalisation: mid viewpoint +/-" + str(
        sizeExt) + 'bps'

    ### do it
    for gid, group in job.groups.iteritems():
        run_domainogram[gid] = group.get('run_domainogram', False)
        if isinstance(run_domainogram[gid], basestring):
            run_domainogram[gid] = (run_domainogram[gid].lower()
                                    in ['1', 'true', 'on', 't'])
        before_profile_correction[gid] = group.get('before_profile_correction',
                                                   False)
        if isinstance(before_profile_correction[gid], basestring):
            before_profile_correction[gid] = (
                before_profile_correction[gid].lower()
                in ['1', 'true', 'on', 't'])
        processed['lib'][gid] = get_libForGrp(ex,
                                              group,
                                              assembly,
                                              new_libs,
                                              gid,
                                              c4_url,
                                              via=via)
        #reffile='/archive/epfl/bbcf/data/DubouleDaan/library_Nla_30bps/library_Nla_30bps_segmentInfos.bed'
        processed['4cseq']['density_files'][gid] = {}
        regToExclude[gid] = primers_dict.get(group['name'],
                                             {}).get('regToExclude',
                                                     "").replace('\r', '')

        # if no regToExclude defined, set it as mid_baitCoord +/-5kb
        if len(regToExclude[gid]) == 0:
            baitcoord = primers_dict.get(group['name'], {}).get('baitcoord')
            bait_chr, bait_range = baitcoord.split(':')
            bait_start, bait_end = bait_range.split('-')
            baitcoord_mid = (int(bait_start) + int(bait_end)) // 2
            regToExclude[gid] = bait_chr + ':' + str(
                baitcoord_mid - 5000) + '-' + str(baitcoord_mid + 5000)

        #print(';'.join([k+"="+v for k,v in primers_dict.get(group['name'],{}).iteritems()]))
        print(primers_dict.get(group['name'], {}))
        print "regToExclude[" + str(gid) + "]=" + regToExclude[gid]
        for rid, run in group['runs'].iteritems():
            libname = mapseq_files[gid][rid]['libname']
            if job.options.get('merge_strands') != 0 or not (
                    'wig' in mapseq_files[gid][rid]):
                density_file = parallel_density_sql(
                    ex,
                    mapseq_files[gid][rid]['bam'],
                    assembly.chrmeta,
                    nreads=mapseq_files[gid][rid]['stats']["total"],
                    merge=0,
                    read_extension=mapseq_files[gid][rid]['stats']['read_length'],
                    convert=False,
                    via=via)
                density_file += "merged.sql"
                ex.add(density_file,
                       description=set_file_descr("density_file_" + libname +
                                                  ".sql",
                                                  groupId=gid,
                                                  step="density",
                                                  type="sql",
                                                  view='admin',
                                                  gdv="1"))
            else:
                density_file = mapseq_files[gid][rid]['wig']['merged']
            #density_files.append(density_file)
            processed['4cseq']['density_files'][gid][rid] = density_file

        # back to grp level!
        # not anymore:
        # processed['density'][gid] = merge_sql(ex, density_files, via=via)

    processed['4cseq']['countsPerFrag'] = density_to_countsPerFrag(
        ex, processed, job.groups, assembly, regToExclude, script_path, via)
    ## access per gid+rid

    futures_norm = {}
    countsPerFrags_bedGraph = {}
    futures_merged_raw = {}
    for gid, group in job.groups.iteritems():
        futures_norm[gid] = {}
        countsPerFrags_bedGraph[gid] = {}
        processed['4cseq']['norm'][gid] = {}
        for rid, run in group['runs'].iteritems():
            normfile = unique_filename_in()
            touch(ex, normfile)
            resfile = unique_filename_in() + ".bedGraph"
            resfiles = processed['4cseq']['countsPerFrag'][gid][rid]
            convert(resfiles[3], resfile)  # resfiles[3] is the _all.sql track
            countsPerFrags_bedGraph[gid][rid] = resfile

            print "call normFrags: infiles=" + resfile + ", normfile=" + normfile + "baitCoord=" + primers_dict[
                group['name']][
                    'baitcoord'] + ", sizeExt=sizeExt, name=" + group[
                        'name'] + "rep_" + str(
                            rid) + "regToExclude=" + regToExclude[gid] + "\n"
            futures_norm[gid][rid] = normFrags.nonblocking(
                ex,
                resfile,
                normfile,
                baitCoord=primers_dict[group['name']]['baitcoord'],
                sizeExt=sizeExt,
                name=group['name'] + "rep_" + str(rid),
                regToExclude=regToExclude[gid],
                script_path=script_path,
                via=via)
            processed['4cseq']['norm'][gid][rid] = normfile

        if len(group['runs']) > 1:  # more than one replicate in this group
            ## merge replicates before normalisation.
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName = group['name'] + "_raw_mergedRep"
            print "gid=" + group['name']
            print "call mergeRep for replicates before normalisation: infiles=" + ",".join(
                [
                    res_rid for rid, res_rid in
                    countsPerFrags_bedGraph[gid].iteritems()
                ]
            ) + ", mergedfile=" + mergefile + ", regToExclude=" + regToExclude[
                gid] + "\n"
            futures_merged_raw[gid] = mergeRep.nonblocking(
                ex,
                ",".join(countsPerFrags_bedGraph[gid].values()),
                mergefile,
                regToExclude[gid],
                name=titleName,
                script_path=script_path,
                via=via,
                memory=8)
            processed['4cseq']['countsPerFrag_grp'][gid] = mergefile
        else:
            futures_merged_raw[gid] = None
            # if no replicates, the file we want is the one of the single run
            processed['4cseq']['countsPerFrag_grp'][gid] = countsPerFrags_bedGraph[gid].values()[0]

    print "***** profile correction / sample + merge normalised data"
    futures_merged = {}  # per gid
    futures_profcor = {}  # per gid, per rid
    for gid, group in job.groups.iteritems():
        ## run profile correction per run then merge them
        futures_profcor[gid] = {}
        processed['4cseq']['profileCorrection'][gid] = {}
        for rid, run in group['runs'].iteritems():
            # wait for normalisation of all replicates to be finished
            futures_norm[gid][rid].wait()  ## normalised files, per grp, per rep
            normfile = processed['4cseq']['norm'][gid][rid]
            file1 = unique_filename_in()  #track file
            touch(ex, file1)
            file2 = unique_filename_in()  #report file
            touch(ex, file2)
            file3 = unique_filename_in()  #table file
            touch(ex, file3)
            print "call profileCorrection: normfile=" + normfile + ", baitCoord=" + primers_dict[
                group['name']]['baitcoord'] + ", name=" + group[
                    'name'] + ", file1=" + file1 + ", file2=" + file2 + ", file3= " + file3 + "\n"
            futures_profcor[gid][rid] = profileCorrection.nonblocking(
                ex,
                normfile,
                primers_dict[group['name']]['baitcoord'],
                group['name'],
                file1,
                file2,
                file3,
                script_path,
                via=via)
            processed['4cseq']['profileCorrection'][gid][rid] = [
                file1, file2, file3
            ]

        ## merge replicates before profile correction. Needs all normalisations of the given grp to be finished, which is why this comes after the rid loop.
        if len(group['runs']) > 1:
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName = group['name'] + "_norm_mergedRep"
            print "gid=" + group['name']
            print "call mergeRep: infiles=" + ",".join([
                res_rid for rid, res_rid in processed['4cseq']['norm']
                [gid].iteritems()
            ]) + ", mergedfile=" + mergefile + ", regToExclude=" + regToExclude[
                gid] + "\n"
            futures_merged[gid] = mergeRep.nonblocking(
                ex,
                ",".join(processed['4cseq']['norm'][gid].values()),
                mergefile,
                regToExclude[gid],
                name=titleName,
                script_path=script_path,
                via=via,
                memory=8)
            processed['4cseq']['norm_grp'][gid] = mergefile
        else:
            futures_merged[gid] = None
            ## if no replicates, the file we want is the one of the single run
            processed['4cseq']['norm_grp'][gid] = processed['4cseq']['norm'][gid].values()[0]

    print "***** merge profile corrected data"
    futures_profcor_merged = {}  # per gid
    for gid, group in job.groups.iteritems():
        processed['4cseq']['profileCorrection_grp'][gid] = {}
        for rid, run in group['runs'].iteritems():
            futures_profcor[gid][rid].wait()  ## wait for profileCorrection to be finished

        ## merge replicates after profile correction
        if len(group['runs']) > 1:
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName = group['name'] + "_ProfCor_mergedRep"
            pcfiles = [res_rid[0] for rid, res_rid in
                       processed['4cseq']['profileCorrection'][gid].iteritems()]
            print "call mergeRep (for PC tables): infiles=" + ",".join(pcfiles) + \
                  ", mergedfile=" + mergefile + ", regToExclude=" + regToExclude[gid] + "\n"
            futures_profcor_merged[gid] = mergeRep.nonblocking(
                ex,
                ",".join(pcfiles),
                mergefile,
                regToExclude[gid],
                name=titleName,
                script_path=script_path,
                via=via,
                memory=8)
            processed['4cseq']['profileCorrection_grp'][gid] = mergefile
        else:
            futures_profcor_merged[gid] = None
            ## if no replicates, use the profile-corrected track of the single run
            processed['4cseq']['profileCorrection_grp'][gid] = \
                processed['4cseq']['profileCorrection'][gid].values()[0][0]

    print "***** smooth data"
    futures_smoothed = {}
    for gid, group in job.groups.iteritems():
        file1 = unique_filename_in()
        touch(ex, file1)
        file2 = unique_filename_in()
        touch(ex, file2)
        file3 = unique_filename_in()
        touch(ex, file3)
        nFragsPerWin = group['window_size']
        if futures_merged_raw[gid] is not None:
            futures_merged_raw[gid].wait()  ## wait for merging of raw_grp to be completed
        futures_smoothed[gid] = (smoothFragFile.nonblocking(
            ex,
            processed['4cseq']['countsPerFrag_grp'][gid],
            nFragsPerWin,
            group['name'],
            file1,
            regToExclude[gid],
            script_path=script_path,
            via=via,
            memory=6), )
        if futures_merged[gid] is not None:
            futures_merged[gid].wait()  ## wait for merging of norm_grp to be completed
        futures_smoothed[gid] += (smoothFragFile.nonblocking(
            ex,
            processed['4cseq']['norm_grp'][gid],
            nFragsPerWin,
            group['name'] + "_norm",
            file2,
            regToExclude[gid],
            script_path=script_path,
            via=via,
            memory=6), )
        if futures_profcor_merged[gid] is not None:
            futures_profcor_merged[gid].wait()  # wait for the merging of profile corrected data to be done
        futures_smoothed[gid] += (smoothFragFile.nonblocking(
            ex,
            processed['4cseq']['profileCorrection_grp'][gid],
            nFragsPerWin,
            group['name'] + "_fromProfileCorrected",
            file3,
            regToExclude[gid],
            script_path=script_path,
            via=via,
            memory=6), )
        processed['4cseq']['smooth_grp'][gid] = [file1, file2, file3]
        # [smoothed file before normalisation, smoothed file before PC, smoothed file after PC]

    print "***** Domainograms"
    futures_domainograms = {}
    for gid, group in job.groups.iteritems():
        grName = job.groups[gid]['name']
        if run_domainogram[gid]:
            regCoord = regToExclude[gid] or primers_dict[grName]['baitcoord']
            if before_profile_correction[gid]:
                futures_domainograms[gid] = runDomainogram.nonblocking(
                    ex,
                    processed['4cseq']['norm_grp'][gid],
                    grName,
                    regCoord=regCoord,
                    skip=1,
                    script_path=script_path,
                    via=via,
                    memory=15)
            else:
                futures_domainograms[gid] = runDomainogram.nonblocking(
                    ex,
                    processed['4cseq']['profileCorrection_grp'][gid],
                    grName,
                    regCoord=regCoord.split(':')[0],
                    skip=1,
                    script_path=script_path,
                    via=via,
                    memory=15)

    ## prepare tar files for domainogram results (if any)
    ## and create "BRICKS to frags" files
    print "***** BRICKS to Frags"
    futures_BRICKS2Frags = {}
    for gid, f in futures_domainograms.iteritems():
        if run_domainogram[gid]:  # if domainogram has been run
            logFile = f.wait()
            futures_BRICKS2Frags[gid] = []
            processed['4cseq']['bricks2frags'][gid] = []
            if logFile is None: continue  # check before opening the tar to avoid leaking the handle
            resFiles = []
            start = False
            tarname = job.groups[gid]['name'] + "_domainogram.tar.gz"
            res_tar = tarfile.open(tarname, "w:gz")
            with open(logFile) as logf:  # do not shadow the loop variable `f`
                for s in logf:
                    s = s.strip()
                    if '####resfiles####' in s:
                        start = True
                    elif start and "RData" not in s:
                        resFiles.append(s)
                        res_tar.add(s)
                    if start and "foundBRICKS" in s:
                        bricks2fragsfile = unique_filename_in() + ".bedGraph"
                        touch(ex, bricks2fragsfile)
                        futures_BRICKS2Frags[gid] += [
                            BRICKSToFrag.nonblocking(
                                ex,
                                s,
                                processed['4cseq']['norm_grp'][gid],
                                bricks2fragsfile,
                                script_path=script_path,
                                via=via,
                                memory=4)
                        ]
                        processed['4cseq']['bricks2frags'][gid] += [
                            bricks2fragsfile
                        ]
            res_tar.close()
            processed['4cseq']['domainogram_grp'][gid] = resFiles + [tarname]

############### prepare tables for global results
    print "***** combine results into tables "
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        for rid, run in group['runs'].iteritems():
            allNames += [
                group['name'] + "_rep" + str(rid) + "_norm",
                group['name'] + "_rep" + str(rid) + "_fit"
            ]
            allFiles += [processed['4cseq']['profileCorrection'][gid][rid][2]]
            allRegToExclude += [regToExclude[gid]]
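    # NB: two column names per replicate but one file each: the profileCorrection
    # table (resfiles[2]) presumably holds both the normalised and the fitted
    # scores, which idCols="4,5" below selects as the two value columns.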
    tablePC = unique_filename_in() + ".txt"
    print("***will call makeTable with:")
    print(",".join(allFiles))
    print("resfile=" + tablePC)
    print(",".join(allNames))
    touch(ex, tablePC)


    futures_tables = (makeTable.nonblocking(
        ex,
        ",".join(allFiles),
        tablePC,
        ",".join(allNames),
        idCols="4,5",
        all_regToExclude=','.join(allRegToExclude),
        script_path=script_path,
        via=via,
        memory=8), )

    # wait for all smoothing to be done
    for gid, fg in futures_smoothed.iteritems():
        for f in fg:
            f.wait()

    ## make Table raw/smoothed_raw
    print("** make Table raw/smoothed_raw")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        if futures_merged_raw[gid] is not None:
            futures_merged_raw[gid].wait()
        allNames += [group['name'] + "_raw", group['name'] + "_rawSmoothed"]
        allFiles += [
            processed['4cseq']['countsPerFrag_grp'][gid],
            processed['4cseq']['smooth_grp'][gid][0]
        ]
        allRegToExclude += ['NA', regToExclude[gid]]

    tableSmoothedRaw_grp = unique_filename_in() + ".txt"
    touch(ex, tableSmoothedRaw_grp)
    futures_tables += (makeTable.nonblocking(
        ex,
        ",".join(allFiles),
        tableSmoothedRaw_grp,
        ",".join(allNames),
        idCols="4",
        out_chromosomes=out_chromosomes,
        all_regToExclude=','.join(allRegToExclude),
        script_path=script_path,
        via=via,
        memory=8), )

    ## make Table norm/smoothed_norm before PC
    print("** make Table norm/smoothed_norm befor PC")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        allNames += [group['name'] + "_norm", group['name'] + "_smoothed"]
        allFiles += [
            processed['4cseq']['norm_grp'][gid],
            processed['4cseq']['smooth_grp'][gid][1]
        ]
        allRegToExclude += [regToExclude[gid], regToExclude[gid]]

    tableSmoothed_grp = unique_filename_in() + ".txt"
    touch(ex, tableSmoothed_grp)
    futures_tables += (makeTable.nonblocking(
        ex,
        ",".join(allFiles),
        tableSmoothed_grp,
        ",".join(allNames),
        idCols="4",
        out_chromosomes=out_chromosomes,
        all_regToExclude=','.join(allRegToExclude),
        script_path=script_path,
        via=via,
        memory=8), )

    ## make Table norm/smoothed_norm after PC
    print("** make Table norm/smoothed_norm after PC")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        allNames += [group['name'] + "_normPC", group['name'] + "_smoothedPC"]
        allFiles += [
            processed['4cseq']['profileCorrection_grp'][gid],
            processed['4cseq']['smooth_grp'][gid][2]
        ]
        allRegToExclude += [regToExclude[gid], regToExclude[gid]]

    tableSmoothedPC_grp = unique_filename_in() + ".txt"
    touch(ex, tableSmoothedPC_grp)
    futures_tables += (makeTable.nonblocking(
        ex,
        ",".join(allFiles),
        tableSmoothedPC_grp,
        ",".join(allNames),
        idCols="4",
        out_chromosomes=out_chromosomes,
        all_regToExclude=','.join(allRegToExclude),
        script_path=script_path,
        via=via,
        memory=8), )

    ## combine BRICKS2Frags files
    allNames = []
    allFiles = []
    for gid, fg in futures_BRICKS2Frags.iteritems():
        for f in fg:
            f.wait()
        allNames += [job.groups[gid]['name'] + "_BRICKSpval"]
        cat_bricks2frags = unique_filename_in() + ".txt"
        print ','.join(processed['4cseq']['bricks2frags'][gid])
        cat_bricks2frags = cat(processed['4cseq']['bricks2frags'][gid],
                               out=cat_bricks2frags)
        allFiles += [cat_bricks2frags]

    for gid, fg in futures_smoothed.iteritems():
        for f in fg:
            f.wait()

    tableBRICKS2Frags = unique_filename_in() + ".txt"
    touch(ex, tableBRICKS2Frags)
    futures_tables += (makeTable.nonblocking(ex,
                                             ",".join(allFiles),
                                             tableBRICKS2Frags,
                                             ",".join(allNames),
                                             idCols="4",
                                             out_chromosomes=out_chromosomes,
                                             defVal="NA",
                                             script_path=script_path,
                                             via=via,
                                             memory=8), )
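    # defVal="NA" presumably fills in fragments that are absent from a group's
    # BRICKS-to-frags file, so all groups align on the same table rows.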

    for f in futures_tables:
        f.wait()

    ################ Add everything to minilims below!
    step = "density"
    for gid in processed['4cseq']['density_files'].keys():
        for rid, sql in processed['4cseq']['density_files'][gid].iteritems():
            fname = "density_file_" + job.groups[gid][
                'name'] + "_merged_rep" + str(rid)
            ex.add(sql,
                   description=set_file_descr(fname + ".sql",
                                              groupId=gid,
                                              step=step,
                                              type="sql",
                                              gdv="1"))
            wig = unique_filename_in() + ".bw"
            convert(sql, wig)
            ex.add(wig,
                   description=set_file_descr(fname + ".bw",
                                              groupId=gid,
                                              step=step,
                                              type="bigWig",
                                              ucsc="1"))
    step = "counts_per_frag"  #was _norm_counts_per_frags # before normalisation process, per replicate
    for gid in processed['4cseq']['countsPerFrag'].keys():
        for rid, resfiles in processed['4cseq']['countsPerFrag'][
                gid].iteritems():
            fname = "meanScorePerFeature_" + job.groups[gid][
                'name'] + "_rep" + str(rid)
            ex.add(resfiles[1],
                   description=set_file_descr(fname + ".sql",
                                              groupId=gid,
                                              step=step,
                                              type="sql",
                                              view="admin",
                                              gdv='1'))
            #gzipfile(ex,resfiles[0])
            #ex.add( resfiles[0]+".gz", description=set_file_descr( fname+".bed.gz",
            #                                                       groupId=gid,step=step,type="bed",view="admin" ))
            fname = "segToFrag_" + job.groups[gid]['name'] + "_rep" + str(rid)
            ex.add(resfiles[3],
                   description=set_file_descr(
                       fname + "_all.sql",
                       groupId=gid,
                       step=step,
                       type="sql",
                       comment="all informative frags - null included"))
            trsql = track(resfiles[3])
            bwig = unique_filename_in() + ".bw"
            trwig = track(bwig, chrmeta=trsql.chrmeta)
            # export only fragments with a positive score to the bigWig
            trwig.write(trsql.read(fields=['chr', 'start', 'end', 'score'],
                                   selection={'score': (0.01, sys.maxint)}))
            trwig.close()
            ex.add(
                bwig,
                set_file_descr(fname + ".bw",
                               groupId=gid,
                               step=step,
                               type="bigWig",
                               ucsc='1'))
        ## add segToFrags before normalisation
        if futures_merged_raw[gid] is not None:
            futures_merged_raw[gid].wait()
        trbedgraph = track(removeNA(
            processed['4cseq']['countsPerFrag_grp'][gid]),
                           format='bedgraph')
        bwig = unique_filename_in() + ".bw"
        trwig = track(bwig, chrmeta=assembly.chrmeta)
        trwig.write(
            trbedgraph.read(fields=['chr', 'start', 'end', 'score'],
                            selection={'score': (0.01, sys.maxint)}))
        trwig.close()
        fname = "segToFrag_" + job.groups[gid]['name']
        ex.add(bwig,
               description=set_file_descr(
                   fname + ".bw",
                   groupId=gid,
                   step=step,
                   type="bigWig",
                   comment="segToFrag file before normalisation"))

    step = "norm_counts_per_frags"  # after new normalisation process, combined replicates
    for gid, resfile in processed['4cseq']['norm_grp'].iteritems():
        fname = "normalised_scorePerFeature_" + job.groups[gid]['name']
        gzipfile(ex, resfile)
        ex.add(resfile + ".gz",
               description=set_file_descr(fname + ".bedGraph.gz",
                                          groupId=gid,
                                          step=step,
                                          type="bedGraph",
                                          ucsc='1'))
    # norm files, per replicates (might be removed)
    for gid, dict_gid in processed['4cseq']['norm'].iteritems():
        for rid, resfile in dict_gid.iteritems():
            fname = "normalised_scorePerFeature_" + job.groups[gid][
                'name'] + "_rep" + str(rid)
            gzipfile(ex, resfile)
            ex.add(resfile + ".gz",
                   description=set_file_descr(fname + ".bedGraph.gz",
                                              groupId=gid,
                                              step=step,
                                              type="bedGraph",
                                              ucsc='1',
                                              gdv='1'))
    step = "profile_correction"  # Profile corrected data, combined replicates
    for gid, profileCorrectedFile in processed['4cseq'][
            'profileCorrection_grp'].iteritems():
        fname = "segToFrag_" + job.groups[gid]['name'] + "_profileCorrected"
        gzipfile(ex, profileCorrectedFile)
        ex.add(profileCorrectedFile + ".gz",
               description=set_file_descr(fname + ".bedGraph.gz",
                                          groupId=gid,
                                          step=step,
                                          type="bedGraph",
                                          ucsc='1',
                                          gdv='1'))
    # Profile corrected, per replicate (might be removed)
    for gid, dict_gid in processed['4cseq']['profileCorrection'].iteritems():
        for rid, resfiles in dict_gid.iteritems():
            # profileCorrectedFile = resfiles[0]
            reportProfileCorrection = resfiles[1]
            fname = "segToFrag_" + job.groups[gid][
                'name'] + "_profileCorrected_rep" + str(rid)
            # gzipfile(ex, profileCorrectedFile)
            # ex.add(profileCorrectedFile + ".gz",
            #        description=set_file_descr(fname + ".bedGraph.gz", groupId=gid, step=step, type="bedGraph", ucsc='1', gdv='1'))
            ex.add(reportProfileCorrection,
                   description=set_file_descr(fname + ".pdf",
                                              groupId=gid,
                                              step=step,
                                              type="pdf"))
    step = "smoothing"
    for gid, resfiles in processed['4cseq']['smooth_grp'].iteritems():
        rawSmoothFile = resfiles[0]
        smoothFile = resfiles[1]
        afterProfileCorrection = resfiles[2]
        nFrags = str(job.groups[gid]['window_size'])
        ## smoothed file before normalisation
        fname = "segToFrag_" + job.groups[gid][
            'name'] + "_smoothed_" + nFrags + "FragsPerWin.bedGraph.gz"
        gzipfile(ex, rawSmoothFile)
        ex.add(rawSmoothFile + ".gz",
               description=set_file_descr(fname,
                                          groupId=gid,
                                          step=step,
                                          type="bedGraph",
                                          ucsc='1',
                                          gdv='1'))
        ## smoothed file after normalisation, before Profile correction
        fname = "segToFrag_" + job.groups[gid][
            'name'] + "_norm_smoothed_" + nFrags + "FragsPerWin.bedGraph.gz"
        gzipfile(ex, smoothFile)
        ex.add(smoothFile + ".gz",
               description=set_file_descr(fname,
                                          groupId=gid,
                                          step=step,
                                          type="bedGraph",
                                          ucsc='1',
                                          gdv='1'))
        ## smoothed file after normalisation, after Profile correction
        fname = "segToFrag_" + job.groups[gid][
            'name'] + "_profileCorrected_smoothed_" + nFrags + "FragsPerWin.bedGraph.gz"
        gzipfile(ex, afterProfileCorrection)
        ex.add(afterProfileCorrection + ".gz",
               description=set_file_descr(fname,
                                          groupId=gid,
                                          step=step,
                                          type="bedGraph",
                                          ucsc='1',
                                          gdv='1'))

    step = "domainograms"
    for gid, resfiles in processed['4cseq']['domainogram_grp'].iteritems():
        tarFile = resfiles.pop()
        fname = job.groups[gid]['name'] + "_domainogram.tar.gz"
        ex.add(tarFile,
               description=set_file_descr(fname,
                                          groupId=gid,
                                          step=step,
                                          type="tgz"))
        for s in resfiles:
            if s[-8:] == "bedGraph":
                gzipfile(ex, s)
                s += ".gz"
                ex.add(s,
                       description=set_file_descr(s,
                                                  groupId=gid,
                                                  step=step,
                                                  type="bedGraph",
                                                  ucsc="1",
                                                  gdv="1"))

    step = "combined_results"
    gzipfile(ex, tableSmoothedRaw_grp)
    ex.add(tableSmoothedRaw_grp + ".gz",
           description=set_file_descr(
               "table_segToFrags_smoothed_combined_replicates.txt.gz",
               step=step,
               type="txt"))

    gzipfile(ex, tableSmoothed_grp)
    ex.add(tableSmoothed_grp + ".gz",
           description=set_file_descr(
               "table_normalised_smoothed_combined_replicates.txt.gz",
               step=step,
               type="txt"))

    gzipfile(ex, tableSmoothedPC_grp)
    ex.add(tableSmoothedPC_grp + ".gz",
           description=set_file_descr(
               "table_profileCorrected_smoothed_combined_replicates.txt.gz",
               step=step,
               type="txt"))

    gzipfile(ex, tablePC)
    ex.add(tablePC + ".gz",
           description=set_file_descr(
               "table_normalised_fit_per_replicates.txt.gz",
               step=step,
               type="txt"))

    gzipfile(ex, tableBRICKS2Frags)
    ex.add(tableBRICKS2Frags + ".gz",
           description=set_file_descr(
               "table_frags_in_BRICKS_combined_replicates.txt.gz",
               step=step,
               type="txt"))

    return processed
Example #10
0
def density_to_countsPerFrag(ex,
                             file_dict,
                             groups,
                             assembly,
                             regToExclude,
                             script_path,
                             via='lsf'):
    '''
    Main function to compute normalised counts per fragment from a density file.
    '''
    futures = {}
    results = {}
    for gid, group in groups.iteritems():
        reffile = file_dict['lib'][gid]
        futures[gid] = {}
        results[gid] = {}
        for rid, run in group['runs'].iteritems():
            density_file = file_dict['4cseq']['density_files'][gid][rid]
            gm_futures = []
            for ch in assembly.chrnames:
                chref = os.path.join(reffile, ch + ".bed.gz")
                if not os.path.exists(chref): chref = reffile
                # features = track(chref, 'bed')
                # outbed.write(gMiner.stream.mean_score_by_feature(
                #         scores.read(selection=ch),
                #         features.read(selection=ch)), mode='append')
                bedfile = unique_filename_in() + ".bed"
                gfminer_job = {
                    "operation": "score_by_feature",
                    "output": bedfile,
                    "datatype": "qualitative",
                    "args": "'" + json.dumps({"trackScores": density_file,
                                              "trackFeatures": chref,
                                              "chromosome": ch}) + "'"
                }
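                # score_by_feature averages the density signal over each library
                # fragment of chref, one chromosome per gfminer job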
                gm_futures.append((gfminer_run.nonblocking(ex,
                                                           gfminer_job,
                                                           via=via), bedfile))
            outsql = unique_filename_in() + ".sql"
            sqlouttr = track(outsql,
                             chrmeta=assembly.chrmeta,
                             info={'datatype': 'quantitative'},
                             fields=['start', 'end', 'score'])
            outbed_all = []
            for n, f in enumerate(gm_futures):
                f[0].wait()
                fout = f[1]
                if not os.path.exists(fout):
                    time.sleep(60)  # give the job's output a chance to appear on the shared filesystem
                    touch(ex, fout)  # otherwise create an empty placeholder
                outbed_all.append(fout)
                outbed = track(fout, chrmeta=assembly.chrmeta)
                sqlouttr.write(outbed.read(fields=['start', 'end', 'score'],
                                           selection={'score': (0.01, sys.maxint)}),
                               chrom=assembly.chrnames[n])
            sqlouttr.close()
            countsPerFragFile = unique_filename_in() + ".bed"
            countsPerFragFile = cat(outbed_all, out=countsPerFragFile)
            results[gid][rid] = [countsPerFragFile, outsql]
            FragFile = unique_filename_in()
            touch(ex, FragFile)
            futures[gid][rid] = (FragFile,
                                 segToFrag.nonblocking(ex,
                                                       countsPerFragFile,
                                                       regToExclude[gid],
                                                       script_path,
                                                       via=via,
                                                       stdout=FragFile,
                                                       memory=4))
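            # segToFrag writes its report to stdout, captured here in FragFile;
            # it is parsed by _parse_select_frag below once the job has finished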

    def _parse_select_frag(stream):
        for s in stream:
            sr = s.strip().split('\t')
            if 'IsValid' in sr[2] and not any(w in sr[8] for w in ['_and_', 'BothRepeats', 'notValid']):
                patt = re.search(r'([^:]+):(\d+)-(\d+)', sr[1])
                if patt:
                    coord = patt.groups()
                    # if float(sr[11]) > 0.0:
                    yield (coord[0], int(coord[1]) - 1, int(coord[2]), float(sr[11]))
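    # Hypothetical report line (tab-separated) accepted by the parser above:
    #   frag_1  chr5:120001-120400  IsValid  ...  <col 9 free of '_and_',
    #   'BothRepeats', 'notValid'>  ...  <col 12: score>
    # which yields ('chr5', 120000, 120400, score)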

    for gid, dict_gid in futures.iteritems():
        for rid, res in dict_gid.iteritems():
            res[1].wait()
            touch(ex, res[0])
            segOut = open(res[0], "r")
            resBedGraph = unique_filename_in() + ".sql"
            sqlTr = track(resBedGraph,
                          fields=['start', 'end', 'score'],
                          info={'datatype': 'quantitative'},
                          chrmeta=assembly.chrmeta)
            sqlTr.write(_parse_select_frag(segOut),
                        fields=['chr', 'start', 'end', 'score'])
            sqlTr.close()
            segOut.close()
            results[gid][rid].extend([res[0], resBedGraph])
    return results  #[countsPerFrag_allBed, countsPerFrag_selectSql, segToFrag_out, segToFrag_sql]