Example #1
def snp_workflow(ex, job, assembly, minsnp=40., mincov=5, path_to_ref=None, via='local',
                 logfile=sys.stdout, debugfile=sys.stderr):
    """Main function of the workflow"""
    ref_genome = assembly.fasta_by_chrom
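    # One sample name per group, in sorted group-id order; the same ordering is
    # used for the per-group columns written further down.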
    sample_names = [job.groups[gid]['name'] for gid in sorted(job.files.keys())]

    logfile.write("\n* Generate vcfs for each chrom/group\n"); logfile.flush()
    vcfs = dict((chrom,{}) for chrom in ref_genome.keys()) # {chr: {}}
    bams = {}
    # Launch the jobs
    for gid in sorted(job.files.keys()):
        # Merge all bams belonging to the same group
        runs = [r['bam'] for r in job.files[gid].itervalues()]
        bam = Samfile(runs[0])
        header = bam.header
        headerfile = unique_filename_in()
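        # Rewrite sequence names (SQ/SN) in the BAM header to the assembly's
        # accession codes; the rewritten header file is passed to the pileup
        # step below.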
        for h in header["SQ"]:
            if h["SN"] in assembly.chrmeta:
                h["SN"] = assembly.chrmeta[h["SN"]]["ac"]
        head = Samfile( headerfile, "wh", header=header )
        head.close()
        if len(runs) > 1:
            _b = merge_bam(ex,runs)
            index_bam(ex,_b)
            bams[gid] = _b
        else:
            bams[gid] = runs[0]
        # Samtools mpileup + bcftools + vcfutils.pl
        for chrom,ref in ref_genome.iteritems():
            vcf = unique_filename_in()
            vcfs[chrom][gid] = (vcf,
                                pileup.nonblocking(ex, bams[gid], ref, header=headerfile,
                                                   via=via, stdout=vcf))
        logfile.write("  ...Group %s running.\n" %job.groups[gid]['name']); logfile.flush()
    # Wait for vcfs to finish and store them in *vcfs[chrom][gid]*
    for gid in sorted(job.files.keys()):
        for chrom,ref in ref_genome.iteritems():
            vcfs[chrom][gid][1].wait()
            vcfs[chrom][gid] = vcfs[chrom][gid][0]
        logfile.write("  ...Group %s done.\n" %job.groups[gid]['name']); logfile.flush()
    # Targz the pileup files (vcf)
    tarname = unique_filename_in()
    tarfh = tarfile.open(tarname, "w:gz")
    for chrom,v in vcfs.iteritems():
        for gid,vcf in v.iteritems():
            tarfh.add(vcf, arcname="%s_%s.vcf" % (job.groups[gid]['name'],chrom))
    tarfh.close()
    ex.add( tarname, description=set_file_descr("vcfs_files.tar.gz",step="pileup",type="tar",view='admin') )

    logfile.write("\n* Merge info from vcf files\n"); logfile.flush()
    outall = unique_filename_in()
    outexons = unique_filename_in()
    with open(outall,"w") as fout:
        fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+ \
                                 ['gene','location_type','distance'])+'\n')
    with open(outexons,"w") as fout:
        fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+['exon','strand','ref_aa'] \
                                  + ['new_aa_'+s for s in sample_names])+'\n')
    msa_table = dict((s,'') for s in [assembly.name]+sample_names)
    for chrom,v in vcfs.iteritems():
        logfile.write("  > Chromosome '%s'\n" % chrom); logfile.flush()
        # Put together info from all vcf files
        logfile.write("  - All SNPs\n"); logfile.flush()
        allsnps = all_snps(ex,chrom,vcfs[chrom],bams,outall,assembly,
                           sample_names,mincov,float(minsnp),logfile,debugfile)
        # Annotate SNPs and check synonymy
        logfile.write("  - Exonic SNPs\n"); logfile.flush()
        exon_snps(chrom,outexons,allsnps,assembly,sample_names,ref_genome,logfile,debugfile)
        for snprow in allsnps:
            for n,k in enumerate([assembly.name]+sample_names):
                msa_table[k] += snprow[3+n][0]
    description = set_file_descr("allSNP.txt",step="SNPs",type="txt")
    ex.add(outall,description=description)
    description = set_file_descr("exonsSNP.txt",step="SNPs",type="txt")
    ex.add(outexons,description=description)
    msafile = unique_filename_in()
    with open(msafile,"w") as msa:
        msa.write(" %i %i\n"%(len(msa_table),len(msa_table.values()[0])))
        for name,seq in msa_table.iteritems():
            msa.write("%s\t%s\n" %(name,seq))
    msa_table = {}
    description = set_file_descr("SNPalignment.txt",step="SNPs",type="txt")
    ex.add(msafile,description=description)
    # Create UCSC bed tracks
    logfile.write("\n* Create tracks\n"); logfile.flush()
    create_tracks(ex,outall,sample_names,assembly)
    # Create quantitative tracks
    logfile.write("\n* Create heteroz. and quality tracks\n"); logfile.flush()

    def _process_pileup(pileups, seq, startpos, endpos):
        atoi = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
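        # A/C/G/T map to indices 0-3; any other symbol (e.g. 'N') falls into
        # the spare bucket at index 4 via atoi.get(..., 4).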
        vectors = ([],[],[])
        for pileupcolumn in pileups:
            position = pileupcolumn.pos
            if position < startpos: continue
            if position >= endpos: break
            coverage = pileupcolumn.n
            ref_symbol = seq[position-startpos]
            ref = atoi.get(ref_symbol, 4)
            symbols = [0,0,0,0,0]
            quality = 0
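            # Count base calls per nucleotide and accumulate Phred qualities
            # (ASCII offset 33) over all reads covering this column.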
            for pileupread in pileupcolumn.pileups:
                symbols[atoi.get(pileupread.alignment.seq[pileupread.qpos], 4)] += 1
                quality += ord(pileupread.alignment.qual[pileupread.qpos])-33
            quality = float(quality)/coverage
            info = heterozygosity(ref, symbols[0:4])
            if coverage > 0: vectors[0].append((position, position+1, coverage))
            if info > 0: vectors[1].append((position, position+1, info))
            if quality > 0: vectors[2].append((position, position+1, quality))
#            yield (position, position+1, coverage, info, quality)
        return vectors

    if job.options.get('make_bigwigs',False):
        _descr = {'groupId':0,'step':"tracks",'type':"bigWig",'ucsc':'1'}
        for gid,bamfile in bams.iteritems():
            _descr['groupId'] = gid
            bamtr = track(bamfile,format="bam")
            covname = unique_filename_in()+".bw"
            out_cov = track(covname, chrmeta=assembly.chrmeta)
            hetname = unique_filename_in()+".bw"
            out_het = track(hetname, chrmeta=assembly.chrmeta)
            qualname = unique_filename_in()+".bw"
            out_qual = track(qualname, chrmeta=assembly.chrmeta)
            for chrom, cinfo in assembly.chrmeta.iteritems():
                fasta = Fastafile(ref_genome[chrom])
                #process fasta and bam by 10Mb chunks
                for chunk in range(0,cinfo["length"],10**7):
                    fastaseq = fasta.fetch(cinfo['ac'], chunk, chunk+10**7)
                    vecs = _process_pileup(bamtr.pileup(chrom, chunk, chunk+10**7), fastaseq, chunk, chunk+10**7)
                    out_cov.write(vecs[0], fields=['start','end','score'], chrom=chrom)
                    out_het.write(vecs[1], fields=['start','end','score'], chrom=chrom)
                    out_qual.write(vecs[2], fields=['start','end','score'], chrom=chrom)
            out_cov.close()
            out_het.close()
            out_qual.close()
            description = set_file_descr(job.groups[gid]['name']+"_coverage.bw",**_descr)
            ex.add(covname,description=description)
            description = set_file_descr(job.groups[gid]['name']+"_heterozygosity.bw",**_descr)
            ex.add(hetname,description=description)
            description = set_file_descr(job.groups[gid]['name']+"_quality.bw",**_descr)
            ex.add(qualname,description=description)

    return 0
Example #2
def dnaseseq_workflow(ex, job, assembly, logfile=sys.stdout, via='lsf'):
    """
    This workflow performs the following steps:

      * BAM files from replicates within the same group are merged
      * MACS is called to identify enriched regions (only the peak summit +/- 300 bp is used); this step can be bypassed by providing a bed file for any group
      * Wellington is called to identify footprints within these enriched regions
      * If a list of motifs is provided (per group), footprints are scanned and motif occurrences (log-likelihood ratio > 0) are recorded in a bed file
      * Average DNase profiles around motifs are plotted

    """
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    supdir = os.path.split(ex.remote_working_directory)[0]
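    # A group-supplied bed file may be given as an absolute path or as a path
    # relative to the directory above the remote working directory; if found,
    # it is used in place of MACS peak calling for that group (see docstring).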
    for gid, mapped in job.files.iteritems():
        group_name = job.groups[gid]['name']
        if not isinstance(mapped, dict):
            raise TypeError(
                "Files values must be dictionaries with keys *run_ids* or 'bam'."
            )
        if 'bam' in mapped: mapped = {'_': mapped}
        if len(mapped) > 1:
            bamfile = merge_bam(ex, [m['bam'] for m in mapped.values()])
            index = index_bam(ex, bamfile)
        else:
            bamfile = mapped.values()[0]['bam']
        if job.groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid, group_name))
        else:
            if os.path.exists(job.groups[gid].get('bedfile', 'null')):
                bedfile = job.groups[gid]['bedfile']
            elif os.path.exists(
                    os.path.join(supdir,
                                 job.groups[gid].get('bedfile', 'null'))):
                bedfile = os.path.join(supdir, job.groups[gid]['bedfile'])
            else:
                bedfile = None
            tests.append((bedfile, bamfile))
            names['tests'].append((gid, group_name))
    if len(controls) < 1:
        controls = [None]
        names['controls'] = [(0, None)]
    tests = macs_bedfiles(ex, assembly.chrmeta, tests, controls, names,
                          job.options.get('macs_args', ["--keep-dup", "10"]),
                          via, logfile)
    bedlist = run_wellington(ex, tests, names, assembly, via, logfile)
    ######################### Motif scanning / plotting
    if any([
            gr.get('motif') != 'null' and gr.get('motif')
            for gr in job.groups.values()
    ]):
        motifbeds = motif_scan(ex, bedlist, assembly, job.groups, via, logfile)
        siglist = dict((gid[0], []) for gid in names['tests'])
        for gid, mapped in job.files.iteritems():
            wig = []
            suffixes = ["fwd", "rev"]
            merge_strands = int(job.options.get('merge_strands', -1))
            read_extension = int(job.options.get('read_extension') or -1)
            make_wigs = merge_strands >= 0 or read_extension != 1
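            # Density tracks are (re)built when strand merging is requested,
            # when the read-extension option differs from 1, or when a run
            # lacks a pair of per-strand wig files.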
            for m in mapped.values():
                if make_wigs or not ('wig' in m) or len(m['wig']) < 2:
                    output = mapseq.parallel_density_sql(
                        ex,
                        m["bam"],
                        assembly.chrmeta,
                        nreads=m["stats"]["total"],
                        merge=-1,
                        read_extension=1,
                        convert=False,
                        b2w_args=[],
                        via=via)
                    wig.append(dict(
                        (s, output + s + '.sql') for s in suffixes))
                else:
                    wig.append(m['wig'])
            if len(wig) > 1:
                wig[0] = dict((s, merge_sql(ex, [x[s] for x in wig], via=via))
                              for s in suffixes)
            _trn = job.groups[gid]['name'] + "_%s"
            if job.groups[gid]['control']:
                for s, w in wig[0].iteritems():
                    for _g in siglist.keys():
                        siglist[_g].append(track(w, info={'name': _trn % s}))
            else:
                siglist[gid].extend([
                    track(w, info={'name': _trn % s})
                    for s, w in wig[0].iteritems()
                ])
        plot_files = plot_footprint_profile(ex, motifbeds, siglist,
                                            assembly.chrnames, job.groups,
                                            logfile)
        for gid, flist in plot_files.iteritems():
            gname = job.groups[gid]['name']
            plotall = unique_filename_in()
            touch(ex, plotall)
            ex.add(plotall,
                   description=set_file_descr(gname + '_footprints_plots',
                                              type='none',
                                              view='admin',
                                              step='motifs',
                                              groupId=gid))
            ex.add(flist['pdf'],
                   description=set_file_descr(gname + '_footprints_plots.pdf',
                                              type='pdf',
                                              step='motifs',
                                              groupId=gid),
                   associate_to_filename=plotall,
                   template='%s.pdf')
            tarname = unique_filename_in()
            tarfh = tarfile.open(tarname, "w:gz")
            for mname, matf in flist['mat']:
                tarfh.add(matf, arcname="%s_%s.txt" % (gname, mname))
            tarfh.close()
            ex.add(tarname,
                   description=set_file_descr(gname +
                                              '_footprints_plots.tar.gz',
                                              type='tar',
                                              step='motifs',
                                              groupId=gid),
                   associate_to_filename=plotall,
                   template='%s.tar.gz')
    logfile.write("\nDone.\n ")
    logfile.flush()
    return 0
Example #3
def dnaseseq_workflow( ex, job, assembly, logfile=sys.stdout, via='lsf' ):
    """
    This workflow performs the following steps:

      * BAM files from replicates within the same group are merged
      * MACS is called to identify enriched regions (only the peak summit +/- 300 bp is used); this step can be bypassed by providing a bed file for any group
      * Wellington is called to identify footprints within these enriched regions
      * If a list of motifs is provided (per group), footprints are scanned and motif occurrences (log-likelihood ratio > 0) are recorded in a bed file
      * Average DNase profiles around motifs are plotted

    """
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    supdir = os.path.split(ex.remote_working_directory)[0]
    for gid,mapped in job.files.iteritems():
        group_name = job.groups[gid]['name']
        if not isinstance(mapped,dict):
            raise TypeError("Files values must be dictionaries with keys *run_ids* or 'bam'.")
        if 'bam' in mapped: mapped = {'_': mapped}
        if len(mapped)>1:
            bamfile = merge_bam(ex, [m['bam'] for m in mapped.values()])
            index = index_bam(ex, bamfile)
        else:
            bamfile = mapped.values()[0]['bam']
        if job.groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid,group_name))
        else:
            if os.path.exists(job.groups[gid].get('bedfile','null')):
                bedfile = job.groups[gid]['bedfile']
            elif os.path.exists(os.path.join(supdir,job.groups[gid].get('bedfile','null'))):
                bedfile = os.path.join(supdir,job.groups[gid]['bedfile'])
            else:
                bedfile = None
            tests.append((bedfile,bamfile))
            names['tests'].append((gid,group_name))
    if len(controls)<1:
        controls = [None]
        names['controls'] = [(0,None)]
    tests = macs_bedfiles( ex, assembly.chrmeta, tests, controls, names, 
                           job.options.get('macs_args',["--keep-dup","10"]), via, logfile )
    bedlist = run_wellington(ex, tests, names, assembly, via, logfile)
######################### Motif scanning / plotting
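    # Motif scanning/plotting only runs if at least one group actually has a
    # motif set (a missing, empty, or 'null' entry counts as absent).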
    if any([gr.get('motif') != 'null' and gr.get('motif') 
            for gr in job.groups.values()]):
        motifbeds = motif_scan( ex, bedlist, assembly, job.groups, via, logfile )
        siglist = dict((gid[0],[]) for gid in names['tests'])
        for gid,mapped in job.files.iteritems():
            wig = []
            suffixes = ["fwd","rev"]
            merge_strands = int(job.options.get('merge_strands',-1))
            read_extension = int(job.options.get('read_extension') or -1)
            make_wigs = merge_strands >= 0 or read_extension != 1
            for m in mapped.values():
                if make_wigs or not('wig' in m) or len(m['wig'])<2:
                    output = mapseq.parallel_density_sql( ex, m["bam"], assembly.chrmeta,
                                                          nreads=m["stats"]["total"],
                                                          merge=-1, read_extension=1,
                                                          convert=False,
                                                          b2w_args=[], via=via )
                    wig.append(dict((s,output+s+'.sql') for s in suffixes))
                else:
                    wig.append(m['wig'])
            if len(wig) > 1:
                wig[0] = dict((s,merge_sql(ex, [x[s] for x in wig], via=via)) 
                              for s in suffixes)
            _trn = job.groups[gid]['name']+"_%s"
            if job.groups[gid]['control']:
                for s,w in wig[0].iteritems():
                    for _g in siglist.keys():
                        siglist[_g].append(track(w,info={'name': _trn%s}))
            else:
                siglist[gid].extend([track(w,info={'name': _trn%s})
                                     for s,w in wig[0].iteritems()])
        plot_files = plot_footprint_profile( ex, motifbeds, siglist, 
                                             assembly.chrnames, 
                                             job.groups, logfile )
        for gid, flist in plot_files.iteritems():
            gname = job.groups[gid]['name']
            plotall = unique_filename_in()
            touch( ex, plotall )
            ex.add(plotall, description=set_file_descr(gname+'_footprints_plots', 
                                                       type='none', view='admin',
                                                       step='motifs', groupId=gid))
            ex.add(flist['pdf'], description=set_file_descr(gname+'_footprints_plots.pdf', 
                                                            type='pdf', step='motifs', 
                                                            groupId=gid),
                   associate_to_filename=plotall, template='%s.pdf')
            tarname = unique_filename_in()
            tarfh = tarfile.open(tarname, "w:gz")
            for mname,matf in flist['mat']:
                tarfh.add(matf, arcname="%s_%s.txt" % (gname,mname))
            tarfh.close()
            ex.add( tarname, description=set_file_descr(gname+'_footprints_plots.tar.gz',
                                                        type='tar', step='motifs', groupId=gid),
                    associate_to_filename=plotall, template='%s.tar.gz')
    logfile.write("\nDone.\n ");logfile.flush()
    return 0
Example #4
def chipseq_workflow( ex, job_or_dict, assembly, script_path='', logfile=sys.stdout, via='lsf' ):
    """Runs a chipseq workflow over bam files obtained by mapseq. Will optionally run ``macs`` and 'run_deconv'.

    :param ex: a 'bein' execution environment to run jobs in,

    :param job_or_dict: a 'Frontend' 'job' object, or a dictionary with key 'groups', 'files' and 'options' if applicable,

    :param assembly: a genrep.Assembly object,

    :param script_path: only needed if 'run_deconv' is in the job options; must point to the location of the R scripts.

    Default ``macs`` parameters (overridden by ``job_or_dict['options']['macs_args']``) are set as follows:

    * ``'-bw'``: 200 ('bandwidth')

    * ``'-m'``: 10,100 ('minimum and maximum enrichments relative to background or control')

    The enrichment bounds will be computed from a Poisson threshold *T*, if available, as *(min(30,5*(T+1)),50*(T+1))*.

    Returns a tuple of (1) a dictionary with keys *group_id* from the job groups, plus *macs* and *deconv* if applicable, whose values are file description dictionaries, and (2) a dictionary mapping *group_ids* to the *names* used in file descriptions.
"""
    options = {}
    if logfile is None: logfile = sys.stdout
    if isinstance(job_or_dict,frontend.Job):
        options = job_or_dict.options
        groups = job_or_dict.groups
        mapseq_files = job_or_dict.files
    elif isinstance(job_or_dict,dict) and 'groups' in job_or_dict:
        if 'options' in job_or_dict:
            options = job_or_dict['options']
        groups = job_or_dict['groups']
        for gid in groups.keys():
            if not('name' in groups[gid]):
                groups[gid]['name'] = gid
        mapseq_files = job_or_dict.get('files',{})
    else:
        raise TypeError("job_or_dict must be a frontend. Job object or a dictionary with key 'groups'.")
    merge_strands = int(options.get('merge_strands',-1))
    suffixes = ["fwd","rev"]
    peak_deconvolution = options.get('peak_deconvolution',False)
    if isinstance(peak_deconvolution,basestring):
        peak_deconvolution = peak_deconvolution.lower() in ['1','true','t']
    run_meme = options.get('run_meme',False)
    if isinstance(run_meme,basestring):
        run_meme = run_meme.lower() in ['1','true','t']
    macs_args = options.get('macs_args',["--bw","200"])
    b2w_args = options.get('b2w_args',[])
    if not(isinstance(mapseq_files,dict)):
        raise TypeError("Mapseq_files must be a dictionary.")
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    read_length = []
    p_thresh = {}
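    # One pass over the mapped files per group: launch bamstats where stats are
    # missing, average any per-run Poisson thresholds, and merge replicate BAMs.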
    for gid,mapped in mapseq_files.iteritems():
        group_name = groups[gid]['name']
        if not(isinstance(mapped,dict)):
            raise TypeError("Mapseq_files values must be dictionaries with keys *run_ids* or 'bam'.")
        if 'bam' in mapped:
            mapped = {'_': mapped}
        futures = {}
        ptruns = []
        for k in mapped.keys():
            if not 'libname' in mapped[k]:
                mapped[k]['libname'] = group_name+"_"+str(k)
            if not 'stats' in mapped[k]:
                futures[k] = mapseq.bamstats.nonblocking( ex, mapped[k]["bam"], via=via )
            if mapped[k].get('poisson_threshold',-1)>0:
                ptruns.append(mapped[k]['poisson_threshold'])
        if len(ptruns)>0:
            p_thresh[group_name] = sum(ptruns)/len(ptruns)
        for k in futures.keys():
            mapped[k]['stats'] = futures[k].wait()
        if len(mapped)>1:
            bamfile = mapseq.merge_bam(ex, [m['bam'] for m in mapped.values()])
        else:
            bamfile = mapped.values()[0]['bam']
        if groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid,group_name))
        else:
            tests.append(bamfile)
            names['tests'].append((gid,group_name))
            read_length.append(mapped.values()[0]['stats']['read_length'])
    genome_size = mapped.values()[0]['stats']['genome_size']
    if len(controls)<1:
        controls = [None]
        names['controls'] = [(0,None)]
    logfile.write("Starting MACS.\n");logfile.flush()
    processed = {'macs': add_macs_results( ex, read_length, genome_size,
                                           tests, ctrlbam=controls, name=names,
                                           poisson_threshold=p_thresh,
                                           macs_args=macs_args, via=via ) }
    logfile.write("Done MACS.\n");logfile.flush()
    peak_list = {}
    chrlist = assembly.chrmeta
## select only peaks with p-value <= 10^-0.6 ~= 0.25, i.e. score = -10*log10(p) >= 6
    _select = {'score':(6,sys.maxint)}
    _fields = ['chr','start','end','name','score']
    for i,name in enumerate(names['tests']):
        if len(names['controls']) < 2:
            ctrl = (name,names['controls'][0])
            macsbed = track(processed['macs'][ctrl]+"_summits.bed",
                            chrmeta=chrlist, fields=_fields).read(selection=_select)
        else:
            macsbed = concatenate([apply(track(processed['macs'][(name,x)]+"_summits.bed",
                                         chrmeta=chrlist, fields=_fields).read(selection=_select),
                                         'name', lambda __n,_n=xn: "%s:%i" %(__n,_n))
                                   for xn,x in enumerate(names['controls'])])
        ##############################
        macs_neighb = neighborhood( macsbed, before_start=150, after_end=150 )
        peak_list[name] = unique_filename_in()+".sql"
        macs_final = track( peak_list[name], chrmeta=chrlist,
                            info={'datatype':'qualitative'},
                            fields=['start','end','name','score'] )
        macs_final.write(fusion(macs_neighb),clip=True)
        macs_final.close()
        ##############################

    merged_wig = {}
    options['read_extension'] = int(options.get('read_extension') or read_length[0])
    if options['read_extension'] < 1: options['read_extension'] = read_length[0]
    make_wigs = merge_strands >= 0 or options['read_extension']>100
    if options['read_extension'] > 100: options['read_extension'] = 50
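    # Strand merging or an extension above 100 bp forces the densities to be
    # rebuilt; extensions above 100 bp are additionally capped down to 50 bp.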
    for gid,mapped in mapseq_files.iteritems():
#            if groups[gid]['control']: continue
        group_name = groups[gid]['name']
        wig = []
        for m in mapped.values():
            if make_wigs or not('wig' in m) or len(m['wig'])<2:
                output = mapseq.parallel_density_sql( ex, m["bam"], assembly.chrmeta,
                                                      nreads=m["stats"]["total"],
                                                      merge=-1, read_extension=options['read_extension'],
                                                      convert=False,
                                                      b2w_args=b2w_args, via=via )
                wig.append(dict((s,output+s+'.sql') for s in suffixes))
            else:
                wig.append(m['wig'])
        if len(wig) > 1:
            merged_wig[group_name] = dict((s,merge_sql(ex, [x[s] for x in wig], via=via))
                                          for s in suffixes)
        else:
            merged_wig[group_name] = wig[0]

    if peak_deconvolution:
        ##############################
        def _filter_deconv( stream, pval ):
            ferr = re.compile(r';FERR=([\d\.]+)$')
            return FeatureStream( ((x[0],)+((x[2]+x[1])/2-150,(x[2]+x[1])/2+150)+x[3:] 
                                   for x in stream 
                                   if "FERR=" in x[3] and float(ferr.search(x[3]).groups()[0]) <= pval), 
                                  fields=stream.fields )
        ##############################
        processed['deconv'] = {}
        for name in names['tests']:
            logfile.write(name[1]+" deconvolution.\n");logfile.flush()
            if len(names['controls']) < 2:
                ctrl = (name,names['controls'][0])
                macsbed = processed['macs'][ctrl]+"_peaks.bed"
            else:
                macsbed = intersect_many_bed( ex, [processed['macs'][(name,x)]+"_peaks.bed"
                                                   for x in names['controls']], via=via )
            deconv = run_deconv( ex, merged_wig[name[1]], macsbed, assembly.chrmeta,
                                 options['read_extension'], script_path, via=via )
            peak_list[name] = unique_filename_in()+".bed"
            trbed = track(deconv['peaks']).read()
            with track(peak_list[name], chrmeta=chrlist, fields=trbed.fields) as bedfile:
                bedfile.write(fusion(_filter_deconv(trbed,0.65)))
            ex.add(deconv['peaks'],
                   description=set_file_descr(name[1]+'_peaks.sql', type='sql',
                                              step='deconvolution', groupId=name[0]))
            ex.add(deconv['profile'],
                   description=set_file_descr(name[1]+'_deconv.sql', type='sql',
                                              step='deconvolution',  groupId=name[0]))
            bigwig = unique_filename_in()
            try:
                convert(deconv['profile'],(bigwig,"bigWig"))
                ex.add(bigwig,
                       description=set_file_descr(name[1]+'_deconv.bw', type='bigWig',
                                                  ucsc='1', step='deconvolution',
                                                  groupId=name[0]))
            except OSError as e:
                logfile.write(str(e));logfile.flush()
            ex.add(deconv['pdf'],
                   description=set_file_descr(name[1]+'_deconv.pdf', type='pdf',
                                              step='deconvolution', groupId=name[0]))
            processed['deconv'][name] = deconv

    ##############################
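    # _join_macs joins each peak row with the matching row(s) of the parsed
    # MACS .xls table(s), using the peak ID embedded in the peak's name field
    # to pick the right row (and, with several controls, the right table).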
    def _join_macs( stream, xlsl, _f ):
        def _macs_row(_s):
            for _p in _s:
                for _n in _p[3].split("|"):
                    if len(xlsl) == 1:
                        nb = int(_n.split(";")[0][13:]) if _n[:3] == "ID=" else int(_n[10:])
                        yield _p+xlsl[0][nb-1][1:]
                    else:
                        nb = _n.split(";")[0][13:] if _n[:3] == "ID=" else _n[10:]
                        nb = nb.split(":")
                        yield _p+xlsl[int(nb[1])][int(nb[0])-1][1:]
        return FeatureStream( _macs_row(stream), fields=_f )
    ##############################
    peakfile_list = []
    for name, plist in peak_list.iteritems():
        ptrack = track(plist,chrmeta=chrlist,fields=["chr","start","end","name","score"])
        peakfile = unique_filename_in()
        xlsh, xlsl = parse_MACS_xls([processed['macs'][(name,_c)]+"_peaks.xls" for _c in names['controls']])
        try:
###### if assembly doesn't have annotations, we skip the "getNearestFeature" but still go through "_join_macs"
            assembly.gene_track()
            _fields = ['chr','start','end','name','score','gene','location_type','distance']\
                +["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#"+"\t".join(['chromosome','start','end','info','peak_height','gene(s)','location_type','distance']+_fields[8:]))
            for chrom in assembly.chrnames:
                _feat = assembly.gene_track(chrom)
                peakout.write(_join_macs(getNearestFeature(ptrack.read(selection=chrom),_feat),
                                         xlsl, _fields), mode='append')
        except ValueError:
            _fields = ['chr','start','end','name','score']+["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#"+"\t".join(['chromosome','start','end','info','peak_height']+_fields[8:]))
            for chrom in assembly.chrnames:
                peakout.write(_join_macs(ptrack.read(selection=chrom), xlsl, _fields), mode='append')
        peakout.close()
        gzipfile(ex,peakfile)
        peakfile_list.append(track(peakfile+".gz", format='txt', fields=_fields))
        ex.add(peakfile+".gz",
               description=set_file_descr(name[1]+'_annotated_peaks.txt.gz',type='text',
                                          step='annotation',groupId=name[0]))
    stracks = [track(wig,info={'name':name+"_"+st}) 
               for name,wigdict in merged_wig.iteritems() for st,wig in wigdict.iteritems()]
    tablefile = unique_filename_in()
    with open(tablefile,"w") as _tf:
        _pnames = ["MACS_%s_vs_%s" %(_s[1],_c[1]) if _c[1] else "MACS_%s" %_s[1]
                   for _s in names['tests'] for _c in names['controls']]
        _tf.write("\t".join(['#chromosome','start','end',]+_pnames+[s.name for s in stracks])+"\n")
#### need to do something about peak origin (split names, write to separate columns?)
    for chrom in assembly.chrnames:
        pk_lst = [apply(pt.read(chrom,fields=['chr','start','end','name']),
                        'name', lambda __n,_n=npt: "%s:%i" %(__n,_n))
                  for npt,pt in enumerate(peakfile_list)]
        features = fusion(concatenate(pk_lst, fields=['chr','start','end','name'], 
                                      remove_duplicates=True, group_by=['chr','start','end']))
        sread = [sig.read(chrom) for sig in stracks]
        quantifs = score_by_feature(sread, features, method='sum')
        nidx = quantifs.fields.index('name')
        _ns = len(tests)
        _nc = len(controls)
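        # Peak names were tagged above with ':'-separated origin indices;
        # decode them here so each name lands in the column of the test
        # (and control) pair it was called in.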
        with open(tablefile,"a") as _tf:
            for row in quantifs:
                pcols = ['']*_ns*_nc
                _rnsplit = row[nidx].split(":")
                _n1 = _rnsplit[0]
                _k = 0
                while ( _k < len(_rnsplit)-1-int(_nc>1) ):
                    if _nc > 1:
                        _k += 2
                        _n2 = _rnsplit[_k-1]
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])*_nc+int(_n2)] = _n1
                    else:
                        _k += 1
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])] = _n1
                    _n1 = "|".join(_n[1:])
                _tf.write("\t".join(str(tt) for tt in row[:nidx]+tuple(pcols)+row[nidx+1:])+"\n")
    gzipfile(ex,tablefile)
    ex.add(tablefile+".gz",
           description=set_file_descr('Combined_peak_quantifications.txt.gz',type='text',
                                      step='summary'))

    if run_meme:
        from bbcflib.motif import parallel_meme
        logfile.write("Starting MEME.\n");logfile.flush()
        processed['meme'] = parallel_meme( ex, assembly,
                                           peak_list.values(), name=peak_list.keys(),
                                           chip=True, meme_args=['-meme-nmotifs','4','-meme-mod','zoops'],
                                           via=via )
    return processed
Example #5
def chipseq_workflow(ex,
                     job_or_dict,
                     assembly,
                     script_path='',
                     logfile=sys.stdout,
                     via='lsf'):
    """Runs a chipseq workflow over bam files obtained by mapseq. Will optionally run ``macs`` and 'run_deconv'.

    :param ex: a 'bein' execution environment to run jobs in,

    :param job_or_dict: a 'Frontend' 'job' object, or a dictionary with key 'groups', 'files' and 'options' if applicable,

    :param assembly: a genrep.Assembly object,

    :param script_path: only needed if 'run_deconv' is in the job options; must point to the location of the R scripts.

    Default ``macs`` parameters (overridden by ``job_or_dict['options']['macs_args']``) are set as follows:

    * ``'-bw'``: 200 ('bandwidth')

    * ``'-m'``: 10,100 ('minimum and maximum enrichments relative to background or control')

    The enrichment bounds will be computed from a Poisson threshold *T*, if available, as *(min(30,5*(T+1)),50*(T+1))*.

    Returns a tuple of (1) a dictionary with keys *group_id* from the job groups, plus *macs* and *deconv* if applicable, whose values are file description dictionaries, and (2) a dictionary mapping *group_ids* to the *names* used in file descriptions.
"""
    options = {}
    if logfile is None: logfile = sys.stdout
    if isinstance(job_or_dict, frontend.Job):
        options = job_or_dict.options
        groups = job_or_dict.groups
        mapseq_files = job_or_dict.files
    elif isinstance(job_or_dict, dict) and 'groups' in job_or_dict:
        if 'options' in job_or_dict:
            options = job_or_dict['options']
        groups = job_or_dict['groups']
        for gid in groups.keys():
            if not ('name' in groups[gid]):
                groups[gid]['name'] = gid
        mapseq_files = job_or_dict.get('files', {})
    else:
        raise TypeError(
            "job_or_dict must be a frontend.Job object or a dictionary with key 'groups'."
        )
    merge_strands = int(options.get('merge_strands', -1))
    suffixes = ["fwd", "rev"]
    peak_deconvolution = options.get('peak_deconvolution', False)
    if isinstance(peak_deconvolution, basestring):
        peak_deconvolution = peak_deconvolution.lower() in ['1', 'true', 't']
    run_meme = options.get('run_meme', False)
    if isinstance(run_meme, basestring):
        run_meme = run_meme.lower() in ['1', 'true', 't']
    macs_args = options.get('macs_args', ["--bw", "200"])
    b2w_args = options.get('b2w_args', [])
    if not (isinstance(mapseq_files, dict)):
        raise TypeError("Mapseq_files must be a dictionary.")
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    read_length = []
    p_thresh = {}
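    # For each group: start bamstats jobs where stats are missing, average the
    # per-run Poisson thresholds if any, and merge replicate BAM files.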
    for gid, mapped in mapseq_files.iteritems():
        group_name = groups[gid]['name']
        if not (isinstance(mapped, dict)):
            raise TypeError(
                "Mapseq_files values must be dictionaries with keys *run_ids* or 'bam'."
            )
        if 'bam' in mapped:
            mapped = {'_': mapped}
        futures = {}
        ptruns = []
        for k in mapped.keys():
            if not 'libname' in mapped[k]:
                mapped[k]['libname'] = group_name + "_" + str(k)
            if not 'stats' in mapped[k]:
                futures[k] = mapseq.bamstats.nonblocking(ex,
                                                         mapped[k]["bam"],
                                                         via=via)
            if mapped[k].get('poisson_threshold', -1) > 0:
                ptruns.append(mapped[k]['poisson_threshold'])
        if len(ptruns) > 0:
            p_thresh[group_name] = sum(ptruns) / len(ptruns)
        for k in futures.keys():
            mapped[k]['stats'] = futures[k].wait()
        if len(mapped) > 1:
            bamfile = mapseq.merge_bam(ex, [m['bam'] for m in mapped.values()])
        else:
            bamfile = mapped.values()[0]['bam']
        if groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid, group_name))
        else:
            tests.append(bamfile)
            names['tests'].append((gid, group_name))
            read_length.append(mapped.values()[0]['stats']['read_length'])
    genome_size = mapped.values()[0]['stats']['genome_size']
    if len(controls) < 1:
        controls = [None]
        names['controls'] = [(0, None)]
    logfile.write("Starting MACS.\n")
    logfile.flush()
    processed = {
        'macs':
        add_macs_results(ex,
                         read_length,
                         genome_size,
                         tests,
                         ctrlbam=controls,
                         name=names,
                         poisson_threshold=p_thresh,
                         macs_args=macs_args,
                         via=via)
    }
    logfile.write("Done MACS.\n")
    logfile.flush()
    peak_list = {}
    chrlist = assembly.chrmeta
    ## select only peaks with p-value <= 10^-0.6 ~= 0.25, i.e. score = -10*log10(p) >= 6
    _select = {'score': (6, sys.maxint)}
    _fields = ['chr', 'start', 'end', 'name', 'score']
    for i, name in enumerate(names['tests']):
        if len(names['controls']) < 2:
            ctrl = (name, names['controls'][0])
            macsbed = track(processed['macs'][ctrl] + "_summits.bed",
                            chrmeta=chrlist,
                            fields=_fields).read(selection=_select)
        else:
            macsbed = concatenate([
                apply(track(processed['macs'][(name, x)] + "_summits.bed",
                            chrmeta=chrlist,
                            fields=_fields).read(selection=_select),
                      'name',
                      lambda __n, _n=xn: "%s:%i" % (__n, _n))
                for xn, x in enumerate(names['controls'])
            ])
        ##############################
        macs_neighb = neighborhood(macsbed, before_start=150, after_end=150)
        peak_list[name] = unique_filename_in() + ".sql"
        macs_final = track(peak_list[name],
                           chrmeta=chrlist,
                           info={'datatype': 'qualitative'},
                           fields=['start', 'end', 'name', 'score'])
        macs_final.write(fusion(macs_neighb), clip=True)
        macs_final.close()
        ##############################

    merged_wig = {}
    options['read_extension'] = int(
        options.get('read_extension') or read_length[0])
    if options['read_extension'] < 1:
        options['read_extension'] = read_length[0]
    make_wigs = merge_strands >= 0 or options['read_extension'] > 100
    if options['read_extension'] > 100: options['read_extension'] = 50
    for gid, mapped in mapseq_files.iteritems():
        #            if groups[gid]['control']: continue
        group_name = groups[gid]['name']
        wig = []
        for m in mapped.values():
            if make_wigs or not ('wig' in m) or len(m['wig']) < 2:
                output = mapseq.parallel_density_sql(
                    ex,
                    m["bam"],
                    assembly.chrmeta,
                    nreads=m["stats"]["total"],
                    merge=-1,
                    read_extension=options['read_extension'],
                    convert=False,
                    b2w_args=b2w_args,
                    via=via)
                wig.append(dict((s, output + s + '.sql') for s in suffixes))
            else:
                wig.append(m['wig'])
        if len(wig) > 1:
            merged_wig[group_name] = dict(
                (s, merge_sql(ex, [x[s] for x in wig], via=via))
                for s in suffixes)
        else:
            merged_wig[group_name] = wig[0]

    if peak_deconvolution:
        ##############################
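        # _filter_deconv keeps deconvolved peaks whose FERR value (parsed from
        # the fourth field) is at most `pval`, recentering each kept peak on
        # its midpoint +/- 150 bp.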
        def _filter_deconv(stream, pval):
            ferr = re.compile(r';FERR=([\d\.]+)$')
            return FeatureStream(
                ((x[0], ) + ((x[2] + x[1]) / 2 - 150,
                             (x[2] + x[1]) / 2 + 150) + x[3:]
                 for x in stream if "FERR=" in x[3]
                 and float(ferr.search(x[3]).groups()[0]) <= pval),
                fields=stream.fields)

        ##############################
        processed['deconv'] = {}
        for name in names['tests']:
            logfile.write(name[1] + " deconvolution.\n")
            logfile.flush()
            if len(names['controls']) < 2:
                ctrl = (name, names['controls'][0])
                macsbed = processed['macs'][ctrl] + "_peaks.bed"
            else:
                macsbed = intersect_many_bed(ex, [
                    processed['macs'][(name, x)] + "_peaks.bed"
                    for x in names['controls']
                ],
                                             via=via)
            deconv = run_deconv(ex,
                                merged_wig[name[1]],
                                macsbed,
                                assembly.chrmeta,
                                options['read_extension'],
                                script_path,
                                via=via)
            peak_list[name] = unique_filename_in() + ".bed"
            trbed = track(deconv['peaks']).read()
            with track(peak_list[name], chrmeta=chrlist,
                       fields=trbed.fields) as bedfile:
                bedfile.write(fusion(_filter_deconv(trbed, 0.65)))
            ex.add(deconv['peaks'],
                   description=set_file_descr(name[1] + '_peaks.sql',
                                              type='sql',
                                              step='deconvolution',
                                              groupId=name[0]))
            ex.add(deconv['profile'],
                   description=set_file_descr(name[1] + '_deconv.sql',
                                              type='sql',
                                              step='deconvolution',
                                              groupId=name[0]))
            bigwig = unique_filename_in()
            try:
                convert(deconv['profile'], (bigwig, "bigWig"))
                ex.add(bigwig,
                       description=set_file_descr(name[1] + '_deconv.bw',
                                                  type='bigWig',
                                                  ucsc='1',
                                                  step='deconvolution',
                                                  groupId=name[0]))
            except OSError as e:
                logfile.write(str(e))
                logfile.flush()
            ex.add(deconv['pdf'],
                   description=set_file_descr(name[1] + '_deconv.pdf',
                                              type='pdf',
                                              step='deconvolution',
                                              groupId=name[0]))
            processed['deconv'][name] = deconv

    ##############################
    def _join_macs(stream, xlsl, _f):
        def _macs_row(_s):
            for _p in _s:
                for _n in _p[3].split("|"):
                    if len(xlsl) == 1:
                        nb = int(
                            _n.split(";")[0][13:]) if _n[:3] == "ID=" else int(
                                _n[10:])
                        yield _p + xlsl[0][nb - 1][1:]
                    else:
                        nb = _n.split(
                            ";")[0][13:] if _n[:3] == "ID=" else _n[10:]
                        nb = nb.split(":")
                        yield _p + xlsl[int(nb[1])][int(nb[0]) - 1][1:]

        return FeatureStream(_macs_row(stream), fields=_f)

    ##############################
    peakfile_list = []
    for name, plist in peak_list.iteritems():
        ptrack = track(plist,
                       chrmeta=chrlist,
                       fields=["chr", "start", "end", "name", "score"])
        peakfile = unique_filename_in()
        xlsh, xlsl = parse_MACS_xls([
            processed['macs'][(name, _c)] + "_peaks.xls"
            for _c in names['controls']
        ])
        try:
            ###### if assembly doesn't have annotations, we skip the "getNearestFeature" but still go through "_join_macs"
            assembly.gene_track()
            _fields = ['chr','start','end','name','score','gene','location_type','distance']\
                +["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            peakout = track(peakfile,
                            format='txt',
                            chrmeta=chrlist,
                            fields=_fields)
            peakout.make_header("#" + "\t".join([
                'chromosome', 'start', 'end', 'info', 'peak_height', 'gene(s)',
                'location_type', 'distance'
            ] + _fields[8:]))
            for chrom in assembly.chrnames:
                _feat = assembly.gene_track(chrom)
                peakout.write(_join_macs(
                    getNearestFeature(ptrack.read(selection=chrom), _feat),
                    xlsl, _fields),
                              mode='append')
        except ValueError:
            _fields = ['chr', 'start', 'end', 'name', 'score'
                       ] + ["MACS_%s" % h for h in xlsh[1:5]] + xlsh[5:]
            peakout = track(peakfile,
                            format='txt',
                            chrmeta=chrlist,
                            fields=_fields)
            peakout.make_header("#" + "\t".join(
                ['chromosome', 'start', 'end', 'info', 'peak_height'] +
                _fields[8:]))
            for chrom in assembly.chrnames:
                peakout.write(_join_macs(ptrack.read(selection=chrom), xlsl,
                                         _fields),
                              mode='append')
        peakout.close()
        gzipfile(ex, peakfile)
        peakfile_list.append(
            track(peakfile + ".gz", format='txt', fields=_fields))
        ex.add(peakfile + ".gz",
               description=set_file_descr(name[1] + '_annotated_peaks.txt.gz',
                                          type='text',
                                          step='annotation',
                                          groupId=name[0]))
    stracks = [
        track(wig, info={'name': name + "_" + st})
        for name, wigdict in merged_wig.iteritems()
        for st, wig in wigdict.iteritems()
    ]
    tablefile = unique_filename_in()
    with open(tablefile, "w") as _tf:
        _pnames = [
            "MACS_%s_vs_%s" % (_s[1], _c[1]) if _c[1] else "MACS_%s" % _s[1]
            for _s in names['tests'] for _c in names['controls']
        ]
        _tf.write("\t".join([
            '#chromosome',
            'start',
            'end',
        ] + _pnames + [s.name for s in stracks]) + "\n")
#### need to do something about peak origin (split names, write to separate columns?)
    for chrom in assembly.chrnames:
        pk_lst = [
            apply(pt.read(chrom, fields=['chr', 'start', 'end', 'name']),
                  'name',
                  lambda __n, _n=npt: "%s:%i" % (__n, _n))
            for npt, pt in enumerate(peakfile_list)
        ]
        features = fusion(
            concatenate(pk_lst,
                        fields=['chr', 'start', 'end', 'name'],
                        remove_duplicates=True,
                        group_by=['chr', 'start', 'end']))
        sread = [sig.read(chrom) for sig in stracks]
        quantifs = score_by_feature(sread, features, method='sum')
        nidx = quantifs.fields.index('name')
        _ns = len(tests)
        _nc = len(controls)
        with open(tablefile, "a") as _tf:
            for row in quantifs:
                pcols = [''] * _ns * _nc
                _rnsplit = row[nidx].split(":")
                _n1 = _rnsplit[0]
                _k = 0
                while (_k < len(_rnsplit) - 1 - int(_nc > 1)):
                    if _nc > 1:
                        _k += 2
                        _n2 = _rnsplit[_k - 1]
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0]) * _nc + int(_n2)] = _n1
                    else:
                        _k += 1
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])] = _n1
                    _n1 = "|".join(_n[1:])
                _tf.write("\t".join(
                    str(tt)
                    for tt in row[:nidx] + tuple(pcols) + row[nidx + 1:]) +
                          "\n")
    gzipfile(ex, tablefile)
    ex.add(tablefile + ".gz",
           description=set_file_descr('Combined_peak_quantifications.txt.gz',
                                      type='text',
                                      step='summary'))

    if run_meme:
        from bbcflib.motif import parallel_meme
        logfile.write("Starting MEME.\n")
        logfile.flush()
        processed['meme'] = parallel_meme(
            ex,
            assembly,
            peak_list.values(),
            name=peak_list.keys(),
            chip=True,
            meme_args=['-meme-nmotifs', '4', '-meme-mod', 'zoops'],
            via=via)
    return processed
Example #6
def snp_workflow(ex,
                 job,
                 assembly,
                 minsnp=40.,
                 mincov=5,
                 path_to_ref=None,
                 via='local',
                 logfile=sys.stdout,
                 debugfile=sys.stderr):
    """Main function of the workflow"""
    ref_genome = assembly.fasta_by_chrom
    sample_names = [
        job.groups[gid]['name'] for gid in sorted(job.files.keys())
    ]

    logfile.write("\n* Generate vcfs for each chrom/group\n")
    logfile.flush()
    vcfs = dict((chrom, {}) for chrom in ref_genome.keys())  # {chr: {}}
    bams = {}
    # Launch the jobs
    bam = Samfile(job.files.values()[0].values()[0]['bam'])
    header = bam.header
    headerfile = unique_filename_in()
    for h in header["SQ"]:
        if h["SN"] in assembly.chrmeta:
            h["SN"] = assembly.chrmeta[h["SN"]]["ac"]
    head = Samfile(headerfile, "wh", header=header)
    head.close()
    for gid in job.files.keys():
        # Merge all bams belonging to the same group
        runs = [r['bam'] for r in job.files[gid].itervalues()]
        if len(runs) > 1:
            _b = merge_bam(ex, runs)
            index_bam(ex, _b)
            bams[gid] = _b
        else:
            index_bam(ex, runs[0])
            bams[gid] = runs[0]
        # Samtools mpileup + bcftools + vcfutils.pl
        for chrom, ref in ref_genome.iteritems():
            vcf = unique_filename_in()
            vcfs[chrom][gid] = (vcf,
                                pileup.nonblocking(ex,
                                                   bams[gid],
                                                   ref,
                                                   header=headerfile,
                                                   via=via,
                                                   stdout=vcf))
        logfile.write("  ...Group %s running.\n" % job.groups[gid]['name'])
        logfile.flush()
    # Wait for vcfs to finish and store them in *vcfs[chrom][gid]*
    for gid in job.files.keys():
        for chrom, ref in ref_genome.iteritems():
            vcfs[chrom][gid][1].wait()
            vcfs[chrom][gid] = vcfs[chrom][gid][0]
        logfile.write("  ...Group %s done.\n" % job.groups[gid]['name'])
        logfile.flush()
    # Targz the pileup files (vcf)
    tarname = unique_filename_in()
    tarfh = tarfile.open(tarname, "w:gz")
    for chrom, v in vcfs.iteritems():
        for gid, vcf in v.iteritems():
            tarfh.add(vcf,
                      arcname="%s_%s.vcf" % (job.groups[gid]['name'], chrom))
    tarfh.close()
    ex.add(tarname,
           description=set_file_descr("vcf_files.tar.gz",
                                      step="pileup",
                                      type="tar",
                                      view='admin'))

    logfile.write("\n* Merge info from vcf files\n")
    logfile.flush()
    outall = unique_filename_in()
    outexons = unique_filename_in()
    with open(outall, "w") as fout:
        fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+ \
                                 ['gene','location_type','distance'])+'\n')
    with open(outexons, "w") as fout:
        fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+['exon','strand','ref_aa'] \
                                  + ['new_aa_'+s for s in sample_names])+'\n')
    msa_table = dict((s, '') for s in [assembly.name] + sample_names)
    for chrom, v in vcfs.iteritems():
        logfile.write("  > Chromosome '%s'\n" % chrom)
        logfile.flush()
        # Put together info from all vcf files
        logfile.write("  - All SNPs\n")
        logfile.flush()
        allsnps = all_snps(ex, chrom, vcfs[chrom], bams, outall,
                           assembly, headerfile, sample_names, mincov,
                           float(minsnp), logfile, debugfile, via)
        # Annotate SNPs and check synonymy
        logfile.write("  - Exonic SNPs\n")
        logfile.flush()
        exon_snps(chrom, outexons, allsnps, assembly, sample_names, ref_genome,
                  logfile, debugfile)
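        # Build one concatenated SNP string per sample for the alignment file:
        # '-' entries fall back to the assembly's base, and anything that is
        # not A/C/G/T (either case) is written as 'N'.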
        for snprow in allsnps:
            for n, k in enumerate([assembly.name] + sample_names):
                base = snprow[3 + n][0]
                if base == "-": base = snprow[3][0]
                if base not in 'ACGTacgt': base = "N"
                msa_table[k] += base
    description = set_file_descr("allSNP.txt", step="SNPs", type="txt")
    ex.add(outall, description=description)
    description = set_file_descr("exonsSNP.txt", step="SNPs", type="txt")
    ex.add(outexons, description=description)
    msafile = unique_filename_in()
    with open(msafile, "w") as msa:
        msa.write(" %i %i\n" % (len(msa_table), len(msa_table.values()[0])))
        for name, seq in msa_table.iteritems():
            msa.write("%s\t%s\n" % (name, seq))
    msa_table = {}
    description = set_file_descr("SNPalignment.txt", step="SNPs", type="txt")
    ex.add(msafile, description=description)
    # Create UCSC bed tracks
    logfile.write("\n* Create tracks\n")
    logfile.flush()
    create_tracks(ex, outall, sample_names, assembly)
    # Create quantitative tracks
    logfile.write("\n* Create heteroz. and quality tracks\n")
    logfile.flush()

    def _process_pileup(pileups, seq, startpos, endpos):
        atoi = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
        vectors = ([], [], [])
        for pileupcolumn in pileups:
            position = pileupcolumn.pos
            if position < startpos: continue
            if position >= endpos: break
            coverage = pileupcolumn.n
            ref_symbol = seq[position - startpos]
            ref = atoi.get(ref_symbol, 4)
            symbols = [0, 0, 0, 0, 0]
            quality = 0
            for pileupread in pileupcolumn.pileups:
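                # Defensively skip reads whose reported query position falls
                # outside the read sequence, removing them from the coverage
                # count rather than indexing past the end.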
                if pileupread.qpos >= len(pileupread.alignment.seq):
                    coverage -= 1
                else:
                    symbols[atoi.get(pileupread.alignment.seq[pileupread.qpos],
                                     4)] += 1
                    quality += ord(
                        pileupread.alignment.qual[pileupread.qpos]) - 33
            quality = float(quality) / coverage
            info = heterozygosity(ref, symbols[0:4])
            if coverage > 0:
                vectors[0].append((position, position + 1, coverage))
            if info > 0: vectors[1].append((position, position + 1, info))
            if quality > 0:
                vectors[2].append((position, position + 1, quality))
#            yield (position, position+1, coverage, info, quality)
        return vectors

    if job.options.get('make_bigwigs', False):
        _descr = {
            'groupId': 0,
            'step': "tracks",
            'type': "bigWig",
            'ucsc': '1'
        }
        for gid, bamfile in bams.iteritems():
            _descr['groupId'] = gid
            bamtr = track(bamfile, format="bam")
            covname = unique_filename_in() + ".bw"
            out_cov = track(covname, chrmeta=assembly.chrmeta)
            hetname = unique_filename_in() + ".bw"
            out_het = track(hetname, chrmeta=assembly.chrmeta)
            qualname = unique_filename_in() + ".bw"
            out_qual = track(qualname, chrmeta=assembly.chrmeta)
            for chrom, cinfo in assembly.chrmeta.iteritems():
                fasta = Fastafile(ref_genome[chrom])
                #process fasta and bam by 10Mb chunks
                for chunk in range(0, cinfo["length"], 10**7):
                    fastaseq = fasta.fetch(cinfo['ac'], chunk, chunk + 10**7)
                    vecs = _process_pileup(
                        bamtr.pileup(chrom, chunk, chunk + 10**7), fastaseq,
                        chunk, chunk + 10**7)
                    out_cov.write(vecs[0],
                                  fields=['start', 'end', 'score'],
                                  chrom=chrom)
                    out_het.write(vecs[1],
                                  fields=['start', 'end', 'score'],
                                  chrom=chrom)
                    out_qual.write(vecs[2],
                                   fields=['start', 'end', 'score'],
                                   chrom=chrom)
            out_cov.close()
            out_het.close()
            out_qual.close()
            description = set_file_descr(
                job.groups[gid]['name'] + "_coverage.bw", **_descr)
            ex.add(covname, description=description)
            description = set_file_descr(
                job.groups[gid]['name'] + "_heterozygosity.bw", **_descr)
            ex.add(hetname, description=description)
            description = set_file_descr(
                job.groups[gid]['name'] + "_quality.bw", **_descr)
            ex.add(qualname, description=description)

    return 0