Esempio n. 1
0
 def __call__(self, **kw):
     assembly_id = kw.get('assembly') or None
     assembly = genrep.Assembly(assembly_id)
     tinput = track(kw.get('track'), chrmeta=assembly.chrmeta)
     try:
         thPromot = int(kw.get("promoter"))
     except (ValueError, TypeError):
         thPromot = prom_def
     try:
         thInter = int(kw.get("intergenic"))
     except (ValueError, TypeError):
         thInter = inter_def
     try:
         thUTR = int(kw.get("UTR"))
     except (ValueError, TypeError):
         thUTR = utr_def
     output = self.temporary_path(fname=tinput.name+'_annotated.txt')
     _fields = tinput.fields+['gene', 'location_type', 'distance']
     tout = track(output, format='txt', fields=_fields)
     tout.make_header("#"+"\t".join(tout.fields))
     for chrom in assembly.chrnames:
         tout.write(getNearestFeature(
                 tinput.read(selection=chrom),
                 assembly.gene_track(chrom),
                 thPromot, thInter, thUTR), mode='append')
     tout.close()
     self.new_file(output, 'table')
     return self.display_time()
Esempio n. 2
0
 def test_getNearestFeature(self):
     features = fstream([('chrII', 14795327, 14798367)],
                        fields=['chr', 'start', 'end'])
     expected = [
         ('chrII', 14795327, 14798367, 'Y54E2A.12|tbc-20_Y54E2A.11|eif-3.B',
          'Promot_Included', '28_0')
     ]
     annotations = self.assembly.gene_track(chromlist=['chrII'])
     res = list(getNearestFeature(features, annotations))
     self.assertItemsEqual(res, expected)
Esempio n. 3
0
def all_snps(ex, chrom, vcfs, bams, outall, assembly, sample_names, mincov, minsnp,
             logfile=sys.stdout, debugfile=sys.stderr):
    """For a given chromosome, returns a summary file containing all SNPs identified
    in at least one of the samples.
    Each row contains: chromosome id, SNP position, reference base, SNP base (with proportions)

    :param chrom: (str) chromosome name.
    :param outall: (str) name of the file that will contain the list of all SNPs.
    :param sample_names: (list of str) list of sample names.
    :param mincov: (int) Minimum number of reads supporting an SNP at a position for it to be considered. [5]
    :param minsnp: (int) Minimum percentage of reads supporting the SNP for it to be returned.
        N.B.: Effectively, half of it on each strand for diploids. [40]
    """
    ploidy = assembly.ploidy
    allsnps = []
    nsamples = len(sample_names)
    sorder = range(len(sample_names))
    current = [None]*nsamples
    bam_tracks = [track(v,format='bam') for k,v in sorted(bams.items())]
    vcf_handles = [open(v) for k,v in sorted(vcfs.items())]
    for i,vh in enumerate(vcf_handles):
        line = '#'
        while line and line[0]=='#':
            line = vh.readline()
        current[i] = parse_vcf(line)
    lastpos = 0
    pos = -1
    while any(current):
        current_pos = [int(x[0][1]) if x else sys.maxint for x in current]
        pos = min(current_pos)
        if pos == sys.maxint: break
        current_snp_idx = set(i for i in range(nsamples) if current_pos[i]==pos)
        current_snps = ["0"]*nsamples
        for i in current_snp_idx:
            general,snp_info,sample_stats = current[i]
            chrbam = general[0]
            ref = general[2]
            current_snps[i] = filter_snp(general,snp_info,sample_stats, mincov,minsnp,ploidy)
        # If there were still snp called at this position after filtering
        if any(current_snps[i] not in ("-","0") for i in current_snp_idx):
            for i in set(range(nsamples))-current_snp_idx:
                for coverage in bam_tracks[sorder[i]].coverage((chrom,pos-1,pos)):
                    if coverage[-1] > 0:
                        current_snps[i] = '-'  # '/'.join([ref]*ploidy)
            if pos != lastpos: # indel can be located at the same position as an SNP
                allsnps.append((chrom,pos-1,pos,ref)+tuple(current_snps))
            lastpos = pos
        for i in current_snp_idx:
            current[i] = parse_vcf(vcf_handles[i].readline())
    for f in vcf_handles: f.close()
    for b in bam_tracks: b.close()

    logfile.write("  Annotate all SNPs\n"); logfile.flush()
    snp_read = FeatureStream(allsnps, fields=['chr','start','end','name']+sample_names)
    try:
        annotation = assembly.gene_track(chrom)
        annotated_stream = gm_stream.getNearestFeature(snp_read,annotation,
            thresholdPromot=3000, thresholdInter=3000, thresholdUTR=10)
    except:
        annotated_stream = snp_read
    logfile.write("  Write all SNPs\n"); logfile.flush()
    with open(outall,"a") as fout:
        for snp in annotated_stream:
            # snp: ('chrV',154529, 154530,'T','A (50% of 10)','A (80% of 10)',
            #       'YER002W|NOP16_YER001W|MNN1','Upstream_Included','2271_1011')
            fout.write('\t'.join(str(x) for x in (snp[0],)+snp[2:])+'\n')  # remove start coord (0-based)
    return allsnps
Esempio n. 4
0
 def test_getNearestFeature(self):
     features = fstream([('chrII',14795327,14798367)], fields=['chr','start','end'])
     expected = [('chrII',14795327, 14798367, 'Y54E2A.12|tbc-20_Y54E2A.11|eif-3.B', 'Promot_Included', '28_0')]
     annotations = self.assembly.gene_track(chromlist=['chrII'])
     res = list(getNearestFeature(features,annotations))
     self.assertItemsEqual(res,expected)
Esempio n. 5
0
File: snp.py Progetto: bbcf/bbcflib
def all_snps(ex, chrom, vcfs, bams, outall, assembly, headerfile, sample_names, mincov, minsnp,
             logfile=sys.stdout, debugfile=sys.stderr, via='local'):
    """For a given chromosome, returns a summary file containing all SNPs identified
    in at least one of the samples.
    Each row contains: chromosome id, SNP position, reference base, SNP base (with proportions)

    :param chrom: (str) chromosome name.
    :param vcfs: (dict) vcf files for each sample, dictionary keys are group ids.
    :param bams: (dict) bamfiles organized like the vcf files.
    :param outall: (str) name of the file that will contain the list of all SNPs.
    :param assembly: (genrep.Assembly) assembly for the fasta files and ploidy value.
    :param headerfile: (string) name of file with substitute bam header to match the fasta files.
    :param sample_names: (list of str) list of sample names.
    :param mincov: (int) minimum number of reads supporting an SNP at a position for it to be considered. [5]
    :param minsnp: (int) minimum percentage of reads supporting the SNP for it to be returned.
        N.B.: Effectively, half of it on each strand for diploids. [40]
    """
    ploidy = assembly.ploidy
    reffasta = assembly.fasta_by_chrom[chrom]
    allsnps = []
    nsamples = len(sample_names)
    sorder = range(len(sample_names))
    current = [None]*nsamples
#####
    if nsamples > 1:
        poslist = set()
        bamchrom = None
        for vf in vcfs.values():
            with open(vf) as vh:
                for line in vh:
                    if (not line) or line[0]=='#': continue
                    line = line.strip().split('\t')
                    if bamchrom is None: bamchrom = line[0]
                    poslist.add( int(line[1]) )
        snplist = unique_filename_in()
        with open(snplist,"w") as snpfh:
            snpfh.write("\n".join("%s\t%i" %(bamchrom,pos) for pos in sorted(poslist)))
        pilejobs = []
        vcfs2 = {}
        for gid, bamfile in bams.iteritems():
            vcfs2[gid] = unique_filename_in()
            pilejobs.append( pileup.nonblocking(ex, bams[gid], reffasta, 
                                                step="list", bedfile=snplist, header=headerfile,
                                                via=via, stdout=vcfs2[gid]) )
        [job.wait() for job in pilejobs]
    else:
        vcfs2 = vcfs
#####
    bam_tracks = [track(v,format='bam') for k,v in sorted(bams.items())]
    vcf_handles = [open(v) for k,v in sorted(vcfs2.items())]
    for i,vh in enumerate(vcf_handles):
        line = '#'
        while line and line[0]=='#':
            line = vh.readline()
        current[i] = parse_vcf(line)
    lastpos = 0
    pos = -1
    while any(current):
        current_pos = [int(x[0][1]) if x else sys.maxint for x in current]
        pos = min(current_pos)
        if pos == sys.maxint: break
        current_snp_idx = set(i for i in range(nsamples) if current_pos[i]==pos)
        current_snps = ["0"]*nsamples
        for i in current_snp_idx:
            general,snp_info,sample_stats = current[i]
            chrbam = general[0]
            ref = general[2]
            current_snps[i] = filter_snp(general,snp_info,sample_stats, mincov,minsnp,ploidy)
        # If there were still snp called at this position after filtering
        if any(current_snps[i] not in ("-","0") for i in current_snp_idx):
            for i in set(range(nsamples))-current_snp_idx:
                for coverage in bam_tracks[sorder[i]].coverage((chrom,pos-1,pos)):
                    if coverage[-1] > 0:
                        current_snps[i] = '-'  # '/'.join([ref]*ploidy)
            if pos != lastpos: # indel can be located at the same position as an SNP
                allsnps.append((chrom,pos-1,pos,ref)+tuple(current_snps))
            lastpos = pos
        for i in current_snp_idx:
            current[i] = parse_vcf(vcf_handles[i].readline())
    for f in vcf_handles: f.close()
    for b in bam_tracks: b.close()

    logfile.write("  Annotate all SNPs\n"); logfile.flush()
    snp_read = FeatureStream(allsnps, fields=['chr','start','end','name']+sample_names)
    try:
        annotation = assembly.gene_track(chrom)
        annotated_stream = gm_stream.getNearestFeature(snp_read,annotation,
            thresholdPromot=3000, thresholdInter=3000, thresholdUTR=10)
    except:
        annotated_stream = snp_read
    logfile.write("  Write all SNPs\n"); logfile.flush()
    with open(outall,"a") as fout:
        for snp in annotated_stream:
            # snp: ('chrV',154529, 154530,'T','A (50% of 10)','A (80% of 10)',
            #       'YER002W|NOP16_YER001W|MNN1','Upstream_Included','2271_1011')
            fout.write('\t'.join(str(x) for x in (snp[0],)+snp[2:])+'\n')  # remove start coord (0-based)
    return allsnps
Esempio n. 6
0
def chipseq_workflow( ex, job_or_dict, assembly, script_path='', logfile=sys.stdout, via='lsf' ):
    """Runs a chipseq workflow over bam files obtained by mapseq. Will optionally run ``macs`` and 'run_deconv'.

    :param ex: a 'bein' execution environment to run jobs in,

    :param job_or_dict: a 'Frontend' 'job' object, or a dictionary with key 'groups', 'files' and 'options' if applicable,

    :param assembly: a genrep.Assembly object,

    :param script_path: only needed if 'run_deconv' is in the job options, must point to the location of the R scripts.

    Defaults ``macs`` parameters (overriden by ``job_or_dict['options']['macs_args']``) are set as follows:

    * ``'-bw'``: 200 ('bandwith')

    * ``'-m'``: 10,100 ('minimum and maximum enrichments relative to background or control')

    The enrichment bounds will be computed from a Poisson threshold *T*, if available, as *(min(30,5*(T+1)),50*(T+1))*.

    Returns a tuple of a dictionary with keys *group_id* from the job groups, *macs* and *deconv* if applicable and values file description dictionaries and a dictionary of *group_ids* to *names* used in file descriptions.
"""
    options = {}
    if logfile is None: logfile = sys.stdout
    if isinstance(job_or_dict,frontend.Job):
        options = job_or_dict.options
        groups = job_or_dict.groups
        mapseq_files = job_or_dict.files
    elif isinstance(job_or_dict,dict) and 'groups' in job_or_dict:
        if 'options' in job_or_dict:
            options = job_or_dict['options']
        groups = job_or_dict['groups']
        for gid in groups.keys():
            if not('name' in groups[gid]):
                groups[gid]['name'] = gid
        mapseq_files = job_or_dict.get('files',{})
    else:
        raise TypeError("job_or_dict must be a frontend. Job object or a dictionary with key 'groups'.")
    merge_strands = int(options.get('merge_strands',-1))
    suffixes = ["fwd","rev"]
    peak_deconvolution = options.get('peak_deconvolution',False)
    if isinstance(peak_deconvolution,basestring):
        peak_deconvolution = peak_deconvolution.lower() in ['1','true','t']
    run_meme = options.get('run_meme',False)
    if isinstance(run_meme,basestring):
        run_meme = run_meme.lower() in ['1','true','t']
    macs_args = options.get('macs_args',["--bw","200"])
    b2w_args = options.get('b2w_args',[])
    if not(isinstance(mapseq_files,dict)):
        raise TypeError("Mapseq_files must be a dictionary.")
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    read_length = []
    p_thresh = {}
    for gid,mapped in mapseq_files.iteritems():
        group_name = groups[gid]['name']
        if not(isinstance(mapped,dict)):
            raise TypeError("Mapseq_files values must be dictionaries with keys *run_ids* or 'bam'.")
        if 'bam' in mapped:
            mapped = {'_': mapped}
        futures = {}
        ptruns = []
        for k in mapped.keys():
            if not 'libname' in mapped[k]:
                mapped[k]['libname'] = group_name+"_"+str(k)
            if not 'stats' in mapped[k]:
                futures[k] = mapseq.bamstats.nonblocking( ex, mapped[k]["bam"], via=via )
            if mapped[k].get('poisson_threshold',-1)>0:
                ptruns.append(mapped[k]['poisson_threshold'])
        if len(ptruns)>0:
            p_thresh['group_name'] = sum(ptruns)/len(ptruns)
        for k in futures.keys():
            mapped[k]['stats'] = f.wait()
        if len(mapped)>1:
            bamfile = mapseq.merge_bam(ex, [m['bam'] for m in mapped.values()])
        else:
            bamfile = mapped.values()[0]['bam']
        if groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid,group_name))
        else:
            tests.append(bamfile)
            names['tests'].append((gid,group_name))
            read_length.append(mapped.values()[0]['stats']['read_length'])
    genome_size = mapped.values()[0]['stats']['genome_size']
    if len(controls)<1:
        controls = [None]
        names['controls'] = [(0,None)]
    logfile.write("Starting MACS.\n");logfile.flush()
    processed = {'macs': add_macs_results( ex, read_length, genome_size,
                                           tests, ctrlbam=controls, name=names,
                                           poisson_threshold=p_thresh,
                                           macs_args=macs_args, via=via ) }
    logfile.write("Done MACS.\n");logfile.flush()
    peak_list = {}
    chrlist = assembly.chrmeta
## select only peaks with p-val <= 1e-0.6 = .25 => score = -10log10(p) >= 6
    _select = {'score':(6,sys.maxint)}
    _fields = ['chr','start','end','name','score']
    for i,name in enumerate(names['tests']):
        if len(names['controls']) < 2:
            ctrl = (name,names['controls'][0])
            macsbed = track(processed['macs'][ctrl]+"_summits.bed",
                            chrmeta=chrlist, fields=_fields).read(selection=_select)
        else:
            macsbed = concatenate([apply(track(processed['macs'][(name,x)]+"_summits.bed",
                                         chrmeta=chrlist, fields=_fields).read(selection=_select),
                                         'name', lambda __n,_n=xn: "%s:%i" %(__n,_n))
                                   for xn,x in enumerate(names['controls'])])
        ##############################
        macs_neighb = neighborhood( macsbed, before_start=150, after_end=150 )
        peak_list[name] = unique_filename_in()+".sql"
        macs_final = track( peak_list[name], chrmeta=chrlist,
                            info={'datatype':'qualitative'},
                            fields=['start','end','name','score'] )
        macs_final.write(fusion(macs_neighb),clip=True)
        macs_final.close()
        ##############################

    merged_wig = {}
    options['read_extension'] = int(options.get('read_extension') or read_length[0])
    if options['read_extension'] < 1: options['read_extension'] = read_length[0]
    make_wigs = merge_strands >= 0 or options['read_extension']>100
    if options['read_extension'] > 100: options['read_extension'] = 50
    for gid,mapped in mapseq_files.iteritems():
#            if groups[gid]['control']: continue
        group_name = groups[gid]['name']
        wig = []
        for m in mapped.values():
            if make_wigs or not('wig' in m) or len(m['wig'])<2:
                output = mapseq.parallel_density_sql( ex, m["bam"], assembly.chrmeta,
                                                      nreads=m["stats"]["total"],
                                                      merge=-1, read_extension=options['read_extension'],
                                                      convert=False,
                                                      b2w_args=b2w_args, via=via )
                wig.append(dict((s,output+s+'.sql') for s in suffixes))
            else:
                wig.append(m['wig'])
        if len(wig) > 1:
            merged_wig[group_name] = dict((s,merge_sql(ex, [x[s] for x in wig], via=via))
                                          for s in suffixes)
        else:
            merged_wig[group_name] = wig[0]

    if peak_deconvolution:
        ##############################
        def _filter_deconv( stream, pval ):
            ferr = re.compile(r';FERR=([\d\.]+)$')
            return FeatureStream( ((x[0],)+((x[2]+x[1])/2-150,(x[2]+x[1])/2+150)+x[3:] 
                                   for x in stream 
                                   if "FERR=" in x[3] and float(ferr.search(x[3]).groups()[0]) <= pval), 
                                  fields=stream.fields )
        ##############################
        processed['deconv'] = {}
        for name in names['tests']:
            logfile.write(name[1]+" deconvolution.\n");logfile.flush()
            if len(names['controls']) < 2:
                ctrl = (name,names['controls'][0])
                macsbed = processed['macs'][ctrl]+"_peaks.bed"
            else:
                macsbed = intersect_many_bed( ex, [processed['macs'][(name,x)]+"_peaks.bed"
                                                   for x in names['controls']], via=via )
            deconv = run_deconv( ex, merged_wig[name[1]], macsbed, assembly.chrmeta,
                                 options['read_extension'], script_path, via=via )
            peak_list[name] = unique_filename_in()+".bed"
            trbed = track(deconv['peaks']).read()
            with track(peak_list[name], chrmeta=chrlist, fields=trbed.fields) as bedfile:
                bedfile.write(fusion(_filter_deconv(trbed,0.65)))
            ex.add(deconv['peaks'],
                   description=set_file_descr(name[1]+'_peaks.sql', type='sql',
                                              step='deconvolution', groupId=name[0]))
            ex.add(deconv['profile'],
                   description=set_file_descr(name[1]+'_deconv.sql', type='sql',
                                              step='deconvolution',  groupId=name[0]))
            bigwig = unique_filename_in()
            try:
                convert(deconv['profile'],(bigwig,"bigWig"))
                ex.add(bigwig,
                       description=set_file_descr(name[1]+'_deconv.bw', type='bigWig',
                                                  ucsc='1', step='deconvolution',
                                                  groupId=name[0]))
            except OSError as e:
                logfile.write(str(e));logfile.flush()
            ex.add(deconv['pdf'],
                   description=set_file_descr(name[1]+'_deconv.pdf', type='pdf',
                                              step='deconvolution', groupId=name[0]))
            processed['deconv'][name] = deconv

    ##############################
    def _join_macs( stream, xlsl, _f ):
        def _macs_row(_s):
            for _p in _s:
                for _n in _p[3].split("|"):
                    if len(xlsl) == 1:
                        nb = int(_n.split(";")[0][13:]) if _n[:3] == "ID=" else int(_n[10:])
                        yield _p+xlsl[0][nb-1][1:]
                    else:
                        nb = _n.split(";")[0][13:] if _n[:3] == "ID=" else _n[10:]
                        nb = nb.split(":")
                        yield _p+xlsl[int(nb[1])][int(nb[0])-1][1:]
        return FeatureStream( _macs_row(stream), fields=_f )
    ##############################
    peakfile_list = []
    for name, plist in peak_list.iteritems():
        ptrack = track(plist,chrmeta=chrlist,fields=["chr","start","end","name","score"])
        peakfile = unique_filename_in()
        xlsh, xlsl = parse_MACS_xls([processed['macs'][(name,_c)]+"_peaks.xls" for _c in names['controls']])
        try:
###### if assembly doesn't have annotations, we skip the "getNearestFeature" but still go through "_join_macs"
            assembly.gene_track()
            _fields = ['chr','start','end','name','score','gene','location_type','distance']\
                +["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#"+"\t".join(['chromosome','start','end','info','peak_height','gene(s)','location_type','distance']+_fields[8:]))
            for chrom in assembly.chrnames:
                _feat = assembly.gene_track(chrom)
                peakout.write(_join_macs(getNearestFeature(ptrack.read(selection=chrom),_feat),
                                         xlsl, _fields), mode='append')
        except ValueError:
            _fields = ['chr','start','end','name','score']+["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#"+"\t".join(['chromosome','start','end','info','peak_height']+_fields[8:]))
            for chrom in assembly.chrnames:
                peakout.write(_join_macs(ptrack.read(selection=chrom), xlsl, _fields), mode='append')
        peakout.close()
        gzipfile(ex,peakfile)
        peakfile_list.append(track(peakfile+".gz", format='txt', fields=_fields))
        ex.add(peakfile+".gz",
               description=set_file_descr(name[1]+'_annotated_peaks.txt.gz',type='text',
                                          step='annotation',groupId=name[0]))
    stracks = [track(wig,info={'name':name+"_"+st}) 
               for name,wigdict in merged_wig.iteritems() for st,wig in wigdict.iteritems()]
    tablefile = unique_filename_in()
    with open(tablefile,"w") as _tf:
        _pnames = ["MACS_%s_vs_%s" %(_s[1],_c[1]) if _c[1] else "MACS_%s" %_s[1]
                   for _s in names['tests'] for _c in names['controls']]
        _tf.write("\t".join(['#chromosome','start','end',]+_pnames+[s.name for s in stracks])+"\n")
#### need to do something about peak origin (split names, write to separate columns?)
    for chrom in assembly.chrnames:
        pk_lst = [apply(pt.read(chrom,fields=['chr','start','end','name']),
                        'name', lambda __n,_n=npt: "%s:%i" %(__n,_n))
                  for npt,pt in enumerate(peakfile_list)]
        features = fusion(concatenate(pk_lst, fields=['chr','start','end','name'], 
                                      remove_duplicates=True, group_by=['chr','start','end']))
        sread = [sig.read(chrom) for sig in stracks]
        quantifs = score_by_feature(sread, features, method='sum')
        nidx = quantifs.fields.index('name')
        _ns = len(tests)
        _nc = len(controls)
        with open(tablefile,"a") as _tf:
            for row in quantifs:
                pcols = ['']*_ns*_nc
                _rnsplit = row[nidx].split(":")
                _n1 = _rnsplit[0]
                _k = 0
                while ( _k < len(_rnsplit)-1-int(_nc>1) ):
                    if _nc > 1:
                        _k += 2
                        _n2 = _rnsplit[_k-1]
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])*_nc+int(_n2)] = _n1
                    else:
                        _k += 1
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])] = _n1
                    _n1 = "|".join(_n[1:])
                _tf.write("\t".join(str(tt) for tt in row[:nidx]+tuple(pcols)+row[nidx+1:])+"\n")
    gzipfile(ex,tablefile)
    ex.add(tablefile+".gz",
           description=set_file_descr('Combined_peak_quantifications.txt.gz',type='text',
                                      step='summary'))

    if run_meme:
        from bbcflib.motif import parallel_meme
        logfile.write("Starting MEME.\n");logfile.flush()
        processed['meme'] = parallel_meme( ex, assembly,
                                           peak_list.values(), name=peak_list.keys(),
                                           chip=True, meme_args=['-meme-nmotifs','4','-meme-mod','zoops'],
                                           via=via )
    return processed
Esempio n. 7
0
def chipseq_workflow(ex,
                     job_or_dict,
                     assembly,
                     script_path='',
                     logfile=sys.stdout,
                     via='lsf'):
    """Runs a chipseq workflow over bam files obtained by mapseq. Will optionally run ``macs`` and 'run_deconv'.

    :param ex: a 'bein' execution environment to run jobs in,

    :param job_or_dict: a 'Frontend' 'job' object, or a dictionary with key 'groups', 'files' and 'options' if applicable,

    :param assembly: a genrep.Assembly object,

    :param script_path: only needed if 'run_deconv' is in the job options, must point to the location of the R scripts.

    Defaults ``macs`` parameters (overriden by ``job_or_dict['options']['macs_args']``) are set as follows:

    * ``'-bw'``: 200 ('bandwith')

    * ``'-m'``: 10,100 ('minimum and maximum enrichments relative to background or control')

    The enrichment bounds will be computed from a Poisson threshold *T*, if available, as *(min(30,5*(T+1)),50*(T+1))*.

    Returns a tuple of a dictionary with keys *group_id* from the job groups, *macs* and *deconv* if applicable and values file description dictionaries and a dictionary of *group_ids* to *names* used in file descriptions.
"""
    options = {}
    if logfile is None: logfile = sys.stdout
    if isinstance(job_or_dict, frontend.Job):
        options = job_or_dict.options
        groups = job_or_dict.groups
        mapseq_files = job_or_dict.files
    elif isinstance(job_or_dict, dict) and 'groups' in job_or_dict:
        if 'options' in job_or_dict:
            options = job_or_dict['options']
        groups = job_or_dict['groups']
        for gid in groups.keys():
            if not ('name' in groups[gid]):
                groups[gid]['name'] = gid
        mapseq_files = job_or_dict.get('files', {})
    else:
        raise TypeError(
            "job_or_dict must be a frontend. Job object or a dictionary with key 'groups'."
        )
    merge_strands = int(options.get('merge_strands', -1))
    suffixes = ["fwd", "rev"]
    peak_deconvolution = options.get('peak_deconvolution', False)
    if isinstance(peak_deconvolution, basestring):
        peak_deconvolution = peak_deconvolution.lower() in ['1', 'true', 't']
    run_meme = options.get('run_meme', False)
    if isinstance(run_meme, basestring):
        run_meme = run_meme.lower() in ['1', 'true', 't']
    macs_args = options.get('macs_args', ["--bw", "200"])
    b2w_args = options.get('b2w_args', [])
    if not (isinstance(mapseq_files, dict)):
        raise TypeError("Mapseq_files must be a dictionary.")
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    read_length = []
    p_thresh = {}
    for gid, mapped in mapseq_files.iteritems():
        group_name = groups[gid]['name']
        if not (isinstance(mapped, dict)):
            raise TypeError(
                "Mapseq_files values must be dictionaries with keys *run_ids* or 'bam'."
            )
        if 'bam' in mapped:
            mapped = {'_': mapped}
        futures = {}
        ptruns = []
        for k in mapped.keys():
            if not 'libname' in mapped[k]:
                mapped[k]['libname'] = group_name + "_" + str(k)
            if not 'stats' in mapped[k]:
                futures[k] = mapseq.bamstats.nonblocking(ex,
                                                         mapped[k]["bam"],
                                                         via=via)
            if mapped[k].get('poisson_threshold', -1) > 0:
                ptruns.append(mapped[k]['poisson_threshold'])
        if len(ptruns) > 0:
            p_thresh['group_name'] = sum(ptruns) / len(ptruns)
        for k in futures.keys():
            mapped[k]['stats'] = f.wait()
        if len(mapped) > 1:
            bamfile = mapseq.merge_bam(ex, [m['bam'] for m in mapped.values()])
        else:
            bamfile = mapped.values()[0]['bam']
        if groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid, group_name))
        else:
            tests.append(bamfile)
            names['tests'].append((gid, group_name))
            read_length.append(mapped.values()[0]['stats']['read_length'])
    genome_size = mapped.values()[0]['stats']['genome_size']
    if len(controls) < 1:
        controls = [None]
        names['controls'] = [(0, None)]
    logfile.write("Starting MACS.\n")
    logfile.flush()
    processed = {
        'macs':
        add_macs_results(ex,
                         read_length,
                         genome_size,
                         tests,
                         ctrlbam=controls,
                         name=names,
                         poisson_threshold=p_thresh,
                         macs_args=macs_args,
                         via=via)
    }
    logfile.write("Done MACS.\n")
    logfile.flush()
    peak_list = {}
    chrlist = assembly.chrmeta
    ## select only peaks with p-val <= 1e-0.6 = .25 => score = -10log10(p) >= 6
    _select = {'score': (6, sys.maxint)}
    _fields = ['chr', 'start', 'end', 'name', 'score']
    for i, name in enumerate(names['tests']):
        if len(names['controls']) < 2:
            ctrl = (name, names['controls'][0])
            macsbed = track(processed['macs'][ctrl] + "_summits.bed",
                            chrmeta=chrlist,
                            fields=_fields).read(selection=_select)
        else:
            macsbed = concatenate([
                apply(track(processed['macs'][(name, x)] + "_summits.bed",
                            chrmeta=chrlist,
                            fields=_fields).read(selection=_select),
                      'name',
                      lambda __n, _n=xn: "%s:%i" % (__n, _n))
                for xn, x in enumerate(names['controls'])
            ])
        ##############################
        macs_neighb = neighborhood(macsbed, before_start=150, after_end=150)
        peak_list[name] = unique_filename_in() + ".sql"
        macs_final = track(peak_list[name],
                           chrmeta=chrlist,
                           info={'datatype': 'qualitative'},
                           fields=['start', 'end', 'name', 'score'])
        macs_final.write(fusion(macs_neighb), clip=True)
        macs_final.close()
        ##############################

    merged_wig = {}
    options['read_extension'] = int(
        options.get('read_extension') or read_length[0])
    if options['read_extension'] < 1:
        options['read_extension'] = read_length[0]
    make_wigs = merge_strands >= 0 or options['read_extension'] > 100
    if options['read_extension'] > 100: options['read_extension'] = 50
    for gid, mapped in mapseq_files.iteritems():
        #            if groups[gid]['control']: continue
        group_name = groups[gid]['name']
        wig = []
        for m in mapped.values():
            if make_wigs or not ('wig' in m) or len(m['wig']) < 2:
                output = mapseq.parallel_density_sql(
                    ex,
                    m["bam"],
                    assembly.chrmeta,
                    nreads=m["stats"]["total"],
                    merge=-1,
                    read_extension=options['read_extension'],
                    convert=False,
                    b2w_args=b2w_args,
                    via=via)
                wig.append(dict((s, output + s + '.sql') for s in suffixes))
            else:
                wig.append(m['wig'])
        if len(wig) > 1:
            merged_wig[group_name] = dict(
                (s, merge_sql(ex, [x[s] for x in wig], via=via))
                for s in suffixes)
        else:
            merged_wig[group_name] = wig[0]

    if peak_deconvolution:
        ##############################
        def _filter_deconv(stream, pval):
            ferr = re.compile(r';FERR=([\d\.]+)$')
            return FeatureStream(
                ((x[0], ) + ((x[2] + x[1]) / 2 - 150,
                             (x[2] + x[1]) / 2 + 150) + x[3:]
                 for x in stream if "FERR=" in x[3]
                 and float(ferr.search(x[3]).groups()[0]) <= pval),
                fields=stream.fields)

        ##############################
        processed['deconv'] = {}
        for name in names['tests']:
            logfile.write(name[1] + " deconvolution.\n")
            logfile.flush()
            if len(names['controls']) < 2:
                ctrl = (name, names['controls'][0])
                macsbed = processed['macs'][ctrl] + "_peaks.bed"
            else:
                macsbed = intersect_many_bed(ex, [
                    processed['macs'][(name, x)] + "_peaks.bed"
                    for x in names['controls']
                ],
                                             via=via)
            deconv = run_deconv(ex,
                                merged_wig[name[1]],
                                macsbed,
                                assembly.chrmeta,
                                options['read_extension'],
                                script_path,
                                via=via)
            peak_list[name] = unique_filename_in() + ".bed"
            trbed = track(deconv['peaks']).read()
            with track(peak_list[name], chrmeta=chrlist,
                       fields=trbed.fields) as bedfile:
                bedfile.write(fusion(_filter_deconv(trbed, 0.65)))
            ex.add(deconv['peaks'],
                   description=set_file_descr(name[1] + '_peaks.sql',
                                              type='sql',
                                              step='deconvolution',
                                              groupId=name[0]))
            ex.add(deconv['profile'],
                   description=set_file_descr(name[1] + '_deconv.sql',
                                              type='sql',
                                              step='deconvolution',
                                              groupId=name[0]))
            bigwig = unique_filename_in()
            try:
                convert(deconv['profile'], (bigwig, "bigWig"))
                ex.add(bigwig,
                       description=set_file_descr(name[1] + '_deconv.bw',
                                                  type='bigWig',
                                                  ucsc='1',
                                                  step='deconvolution',
                                                  groupId=name[0]))
            except OSError as e:
                logfile.write(str(e))
                logfile.flush()
            ex.add(deconv['pdf'],
                   description=set_file_descr(name[1] + '_deconv.pdf',
                                              type='pdf',
                                              step='deconvolution',
                                              groupId=name[0]))
            processed['deconv'][name] = deconv

    ##############################
    def _join_macs(stream, xlsl, _f):
        def _macs_row(_s):
            for _p in _s:
                for _n in _p[3].split("|"):
                    if len(xlsl) == 1:
                        nb = int(
                            _n.split(";")[0][13:]) if _n[:3] == "ID=" else int(
                                _n[10:])
                        yield _p + xlsl[0][nb - 1][1:]
                    else:
                        nb = _n.split(
                            ";")[0][13:] if _n[:3] == "ID=" else _n[10:]
                        nb = nb.split(":")
                        yield _p + xlsl[int(nb[1])][int(nb[0]) - 1][1:]

        return FeatureStream(_macs_row(stream), fields=_f)

    ##############################
    peakfile_list = []
    for name, plist in peak_list.iteritems():
        ptrack = track(plist,
                       chrmeta=chrlist,
                       fields=["chr", "start", "end", "name", "score"])
        peakfile = unique_filename_in()
        xlsh, xlsl = parse_MACS_xls([
            processed['macs'][(name, _c)] + "_peaks.xls"
            for _c in names['controls']
        ])
        try:
            ###### if assembly doesn't have annotations, we skip the "getNearestFeature" but still go through "_join_macs"
            assembly.gene_track()
            _fields = ['chr','start','end','name','score','gene','location_type','distance']\
                +["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            peakout = track(peakfile,
                            format='txt',
                            chrmeta=chrlist,
                            fields=_fields)
            peakout.make_header("#" + "\t".join([
                'chromosome', 'start', 'end', 'info', 'peak_height', 'gene(s)',
                'location_type', 'distance'
            ] + _fields[8:]))
            for chrom in assembly.chrnames:
                _feat = assembly.gene_track(chrom)
                peakout.write(_join_macs(
                    getNearestFeature(ptrack.read(selection=chrom), _feat),
                    xlsl, _fields),
                              mode='append')
        except ValueError:
            _fields = ['chr', 'start', 'end', 'name', 'score'
                       ] + ["MACS_%s" % h for h in xlsh[1:5]] + xlsh[5:]
            peakout = track(peakfile,
                            format='txt',
                            chrmeta=chrlist,
                            fields=_fields)
            peakout.make_header("#" + "\t".join(
                ['chromosome', 'start', 'end', 'info', 'peak_height'] +
                _fields[8:]))
            for chrom in assembly.chrnames:
                peakout.write(_join_macs(ptrack.read(selection=chrom), xlsl,
                                         _fields),
                              mode='append')
        peakout.close()
        gzipfile(ex, peakfile)
        peakfile_list.append(
            track(peakfile + ".gz", format='txt', fields=_fields))
        ex.add(peakfile + ".gz",
               description=set_file_descr(name[1] + '_annotated_peaks.txt.gz',
                                          type='text',
                                          step='annotation',
                                          groupId=name[0]))
    stracks = [
        track(wig, info={'name': name + "_" + st})
        for name, wigdict in merged_wig.iteritems()
        for st, wig in wigdict.iteritems()
    ]
    tablefile = unique_filename_in()
    with open(tablefile, "w") as _tf:
        _pnames = [
            "MACS_%s_vs_%s" % (_s[1], _c[1]) if _c[1] else "MACS_%s" % _s[1]
            for _s in names['tests'] for _c in names['controls']
        ]
        _tf.write("\t".join([
            '#chromosome',
            'start',
            'end',
        ] + _pnames + [s.name for s in stracks]) + "\n")
#### need to do something about peak origin (split names, write to separate columns?)
    for chrom in assembly.chrnames:
        pk_lst = [
            apply(pt.read(chrom, fields=['chr', 'start', 'end', 'name']),
                  'name',
                  lambda __n, _n=npt: "%s:%i" % (__n, _n))
            for npt, pt in enumerate(peakfile_list)
        ]
        features = fusion(
            concatenate(pk_lst,
                        fields=['chr', 'start', 'end', 'name'],
                        remove_duplicates=True,
                        group_by=['chr', 'start', 'end']))
        sread = [sig.read(chrom) for sig in stracks]
        quantifs = score_by_feature(sread, features, method='sum')
        nidx = quantifs.fields.index('name')
        _ns = len(tests)
        _nc = len(controls)
        with open(tablefile, "a") as _tf:
            for row in quantifs:
                pcols = [''] * _ns * _nc
                _rnsplit = row[nidx].split(":")
                _n1 = _rnsplit[0]
                _k = 0
                while (_k < len(_rnsplit) - 1 - int(_nc > 1)):
                    if _nc > 1:
                        _k += 2
                        _n2 = _rnsplit[_k - 1]
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0]) * _nc + int(_n2)] = _n1
                    else:
                        _k += 1
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])] = _n1
                    _n1 = "|".join(_n[1:])
                _tf.write("\t".join(
                    str(tt)
                    for tt in row[:nidx] + tuple(pcols) + row[nidx + 1:]) +
                          "\n")
    gzipfile(ex, tablefile)
    ex.add(tablefile + ".gz",
           description=set_file_descr('Combined_peak_quantifications.txt.gz',
                                      type='text',
                                      step='summary'))

    if run_meme:
        from bbcflib.motif import parallel_meme
        logfile.write("Starting MEME.\n")
        logfile.flush()
        processed['meme'] = parallel_meme(
            ex,
            assembly,
            peak_list.values(),
            name=peak_list.keys(),
            chip=True,
            meme_args=['-meme-nmotifs', '4', '-meme-mod', 'zoops'],
            via=via)
    return processed
Esempio n. 8
0
def all_snps(ex,
             chrom,
             vcfs,
             bams,
             outall,
             assembly,
             headerfile,
             sample_names,
             mincov,
             minsnp,
             logfile=sys.stdout,
             debugfile=sys.stderr,
             via='local'):
    """For a given chromosome, returns a summary file containing all SNPs identified
    in at least one of the samples.
    Each row contains: chromosome id, SNP position, reference base, SNP base (with proportions)

    :param chrom: (str) chromosome name.
    :param vcfs: (dict) vcf files for each sample, dictionary keys are group ids.
    :param bams: (dict) bamfiles organized like the vcf files.
    :param outall: (str) name of the file that will contain the list of all SNPs.
    :param assembly: (genrep.Assembly) assembly for the fasta files and ploidy value.
    :param headerfile: (string) name of file with substitute bam header to match the fasta files.
    :param sample_names: (list of str) list of sample names.
    :param mincov: (int) minimum number of reads supporting an SNP at a position for it to be considered. [5]
    :param minsnp: (int) minimum percentage of reads supporting the SNP for it to be returned.
        N.B.: Effectively, half of it on each strand for diploids. [40]
    """
    ploidy = assembly.ploidy
    reffasta = assembly.fasta_by_chrom[chrom]
    allsnps = []
    nsamples = len(sample_names)
    sorder = range(len(sample_names))
    current = [None] * nsamples
    #####
    if nsamples > 1:
        poslist = set()
        bamchrom = None
        for vf in vcfs.values():
            with open(vf) as vh:
                for line in vh:
                    if (not line) or line[0] == '#': continue
                    line = line.strip().split('\t')
                    if bamchrom is None: bamchrom = line[0]
                    poslist.add(int(line[1]))
        snplist = unique_filename_in()
        with open(snplist, "w") as snpfh:
            snpfh.write("\n".join("%s\t%i" % (bamchrom, pos)
                                  for pos in sorted(poslist)))
        pilejobs = []
        vcfs2 = {}
        for gid, bamfile in bams.iteritems():
            vcfs2[gid] = unique_filename_in()
            pilejobs.append(
                pileup.nonblocking(ex,
                                   bams[gid],
                                   reffasta,
                                   step="list",
                                   bedfile=snplist,
                                   header=headerfile,
                                   via=via,
                                   stdout=vcfs2[gid]))
        [job.wait() for job in pilejobs]
    else:
        vcfs2 = vcfs


#####
    bam_tracks = [track(v, format='bam') for k, v in sorted(bams.items())]
    vcf_handles = [open(v) for k, v in sorted(vcfs2.items())]
    for i, vh in enumerate(vcf_handles):
        line = '#'
        while line and line[0] == '#':
            line = vh.readline()
        current[i] = parse_vcf(line)
    lastpos = 0
    pos = -1
    while any(current):
        current_pos = [int(x[0][1]) if x else sys.maxint for x in current]
        pos = min(current_pos)
        if pos == sys.maxint: break
        current_snp_idx = set(i for i in range(nsamples)
                              if current_pos[i] == pos)
        current_snps = ["0"] * nsamples
        for i in current_snp_idx:
            general, snp_info, sample_stats = current[i]
            chrbam = general[0]
            ref = general[2]
            current_snps[i] = filter_snp(general, snp_info, sample_stats,
                                         mincov, minsnp, ploidy)
        # If there were still snp called at this position after filtering
        if any(current_snps[i] not in ("-", "0") for i in current_snp_idx):
            for i in set(range(nsamples)) - current_snp_idx:
                for coverage in bam_tracks[sorder[i]].coverage(
                    (chrom, pos - 1, pos)):
                    if coverage[-1] > 0:
                        current_snps[i] = '-'  # '/'.join([ref]*ploidy)
            if pos != lastpos:  # indel can be located at the same position as an SNP
                allsnps.append((chrom, pos - 1, pos, ref) +
                               tuple(current_snps))
            lastpos = pos
        for i in current_snp_idx:
            current[i] = parse_vcf(vcf_handles[i].readline())
    for f in vcf_handles:
        f.close()
    for b in bam_tracks:
        b.close()

    logfile.write("  Annotate all SNPs\n")
    logfile.flush()
    snp_read = FeatureStream(allsnps,
                             fields=['chr', 'start', 'end', 'name'] +
                             sample_names)
    try:
        annotation = assembly.gene_track(chrom)
        annotated_stream = gm_stream.getNearestFeature(snp_read,
                                                       annotation,
                                                       thresholdPromot=3000,
                                                       thresholdInter=3000,
                                                       thresholdUTR=10)
    except:
        annotated_stream = snp_read
    logfile.write("  Write all SNPs\n")
    logfile.flush()
    with open(outall, "a") as fout:
        for snp in annotated_stream:
            # snp: ('chrV',154529, 154530,'T','A (50% of 10)','A (80% of 10)',
            #       'YER002W|NOP16_YER001W|MNN1','Upstream_Included','2271_1011')
            fout.write('\t'.join(str(x) for x in (snp[0], ) + snp[2:]) +
                       '\n')  # remove start coord (0-based)
    return allsnps