Beispiel #1
0
 def test_merge_scores(self):
     """Exercise ``merge_scores`` with its default, 'geometric' and 'sum' methods.

     Every case merges one feature spanning 10-20 (score 6) with one
     spanning 5-15 (score 2) and compares the complete merged output
     against a hand-written list.
     """
     def scored_pair():
         # Build fresh streams on every call; each case gets its own pair.
         return [fstream([(10,20,6.)], fields=['start','end','score']),
                 fstream([(5,15,2.)], fields=['start','end','score'])]

     # Default method (arithmetic mean), on features that also carry a
     # chromosome and a name.
     with_names = ('chr','start','end','score','name')
     first = fstream([('chr',10,20,6.,'A')], fields=list(with_names))
     second = fstream([('chr',5,15,2.,'B')], fields=list(with_names))
     self.assertListEqual(
         list(merge_scores([first,second])),
         [('chr',5,10,1.,'B'),('chr',10,15,4.,'A|B'),('chr',15,20,3.,'A')])

     # Explicitly named methods, on bare (start, end, score) features.
     root = math.sqrt
     by_method = [
         ('geometric', [(5,10,root(2)),(10,15,root(12)),(15,20,root(6))]),
         ('sum', [(5,10,2.),(10,15,8.),(15,20,6.)]),
     ]
     for method, expected in by_method:
         res = list(merge_scores(scored_pair(), method=method))
         self.assertListEqual(res, expected)
Beispiel #2
0
def merge(*args,**kw):
    """Merge a forward and a reverse strand density file into one track.

    The forward stream is shifted by ``+shift`` and the reverse stream by
    ``-shift``; both are then combined with ``merge_scores`` and written
    to the output track one chromosome at a time.

    Keyword arguments (positional ``args`` are ignored):

    * ``forward``, ``reverse``: paths of existing density files.
    * ``formatf``, ``formatr``: formats handed to ``track.track`` for them.
    * ``output``: name of the track to write.
    * ``shift``: converted with ``int``. A negative value means the shift
      is derived from ``correlation`` of both strands on the longest
      chromosome.
    * ``method``: forwarded to ``merge_scores`` (default ``"mean"``).

    The whole ``kw`` dictionary is also forwarded to ``_get_chrmeta``.

    Returns 0.

    Raises ``Usage`` when an input file or the output name is missing, or
    when neither input track provides chromosome metadata.
    """
    # Local import: the module's own import list is not touched.
    from contextlib import closing

    if not(kw['forward'] and os.path.exists(kw['forward'])):
        raise Usage("Specify a valid forward strand density file with -f.")
    if not(kw['reverse'] and os.path.exists(kw['reverse'])):
        raise Usage("Specify a valid reverse strand density file with -r.")
    if not(kw['output']):
        raise Usage("Specify the output file name.")

    def _shift(stream,shift):
        # Offset both the 'start' and the 'end' item of every feature.
        istart = stream.fields.index('start')
        iend   = stream.fields.index('end')
        i1 = min(istart,iend)
        i2 = max(istart,iend)
        def _apply_shift(x):
            # Rebuild the tuple around the two coordinate positions.
            return x[:i1]+(x[i1]+shift,)+x[i1+1:i2]+(x[i2]+shift,)+x[i2+1:]
        return track.FeatureStream((_apply_shift(x) for x in stream),
                                    fields=stream.fields)

    fields = ['chr','start','end','score']
    chrmeta = _get_chrmeta(**kw)
    # closing() guarantees close() on every exit path, including the Usage
    # error below and any failure while reading or writing.
    with closing(track.track(kw['forward'],format=kw['formatf'],chrmeta=chrmeta)) as tfwd, \
         closing(track.track(kw['reverse'],format=kw['formatr'],chrmeta=chrmeta)) as trev:
        # Prefer the metadata carried by the tracks themselves.
        if tfwd.chrmeta:
            chrmeta = tfwd.chrmeta
        elif trev.chrmeta:
            chrmeta = trev.chrmeta
        else:
            raise Usage("Specify an assembly with -a.")

        shiftval = int(kw['shift'])
        if shiftval < 0:
            # Negative shift: estimate it on the longest chromosome.
            slim = 300
            chrsize,chrom = max((v['length'],k) for k,v in chrmeta.iteritems())
            xcor = correlation([tfwd.read(chrom),trev.read(chrom)],
                               (1,chrsize),limits=(-slim,slim))
            shiftval = (xcor.argmax()-slim-1)/2
            # Parenthesised single argument: same output under Python 2's
            # print statement, and valid syntax under Python 3.
            print("Autocorrelation shift=%i, correlation is %f." %(shiftval,xcor.max()))

        with closing(track.track(kw['output'],fields=fields,
                                 chrmeta=chrmeta,
                                 info={'datatype':'quantitative'})) as tout:
            # First chromosome creates the output, later ones append to it.
            mode = 'write'
            method = kw.get("method","mean")
            for chrom in chrmeta.keys():
                tout.write(merge_scores([_shift(tfwd.read(chrom), shiftval),
                                         _shift(trev.read(chrom),-shiftval)],
                                        method=method),
                           chrom=chrom,mode=mode,clip=True)
                mode = 'append'
    return 0
Beispiel #3
0
    def __call__(self,**kw):
        """Write the merged score track of two inputs, optionally with a boxplot.

        For every chromosome of the numerator track, both inputs are read
        (smoothed with ``window_smoothing`` when the window size exceeds 1),
        combined by ``merge_scores`` using ``self._divide``, and written to
        a temporary output track registered as ``'ratios'``.

        Keyword arguments:

        * ``numerator``, ``denominator``: inputs opened with ``track``.
        * ``assembly``: passed as ``chrmeta`` (default ``'guess'``).
        * ``output``: used as the extension of the output file name; falls
          back to the numerator track's format.
        * ``window_size``: integer, defaults to ``size_def``.
        * ``log``, ``distribution``: flags; strings are true only when
          spelled '1', 'true', 't' or 'on' (case-insensitive).
        * ``pseudo``, ``threshold``: floats; ``pseudo_def`` and
          ``threshold_def`` are used when absent or not convertible.

        When ``distribution`` is set, the merged stream is passed through
        ``self._sample_stream`` and a boxplot of ``self.ratios`` is
        registered as ``'boxplot'``.

        Returns the value of ``self.display_time()``.
        """
        def _as_flag(value):
            # Non-string values are kept as they are.
            if isinstance(value, basestring):
                return value.lower() in ['1', 'true', 't','on']
            return value

        assembly = kw.get('assembly') or 'guess'
        t1 = track(kw['numerator'],chrmeta=assembly)
        t2 = track(kw['denominator'],chrmeta=assembly)
        # NOTE(review): the output format is read from the 'output' key -
        # confirm this is the intended parameter name.
        fmt = kw.get('output') or t1.format
        wsize = int(kw.get('window_size') or size_def)
        self.log = _as_flag(kw.get('log',False))
        # Catch only conversion failures; a bare except would also swallow
        # KeyboardInterrupt and SystemExit.
        try:
            self.pseudo = float(kw.get('pseudo'))
        except (TypeError, ValueError, OverflowError):
            self.pseudo = pseudo_def
        self.baseline = -log(self.pseudo,2)
        try:
            self.threshold = float(kw.get('threshold'))
        except (TypeError, ValueError, OverflowError):
            self.threshold = threshold_def
        distribution = _as_flag(kw.get('distribution',False))
        if distribution:
            genome_length = sum(v['length'] for v in t1.chrmeta.values())
            self.shifts = list(poisson(float(genome_length)/float(self.sample_num),self.sample_num))
            # Presumably filled by _sample_stream while the output is
            # written - verify against that method.
            self.ratios = []

        output = self.temporary_path(fname='ratios_%s-%s.%s'%(t1.name,t2.name,fmt))
        with track(output, chrmeta=t1.chrmeta, fields=t1.fields,
                   info={'datatype': 'quantitative',
                         'log': self.log,
                         'pseudocounts': self.pseudo,
                         'threshold': self.threshold,
                         'window_size': wsize}) as tout:
            for chrom,vchr in t1.chrmeta.iteritems():
                if wsize > 1:
                    s1 = window_smoothing(t1.read(chrom),window_size=wsize,step_size=1,featurewise=False)
                    s2 = window_smoothing(t2.read(chrom),window_size=wsize,step_size=1,featurewise=False)
                else:
                    s1 = t1.read(chrom)
                    s2 = t2.read(chrom)
                s3 = merge_scores([s1,s2],method=self._divide)
                if distribution:
                    s3 = FeatureStream(self._sample_stream(s3,vchr['length']),fields=s3.fields)
                tout.write(s3, chrom=chrom, clip=True)
        self.new_file(output, 'ratios')

        if distribution:
            pdf = self.temporary_path(fname='%s-%s_ratios_distribution.pdf'%(t1.name,t2.name))
            density_boxplot(self.ratios,output=pdf,
                            name="%s/%s (median=%.2f)" %(t1.name,t2.name,median(self.ratios)))
            self.new_file(pdf, 'boxplot')
        return self.display_time()
Beispiel #4
0
    def __call__(self, **kw):
        """Merge a forward and a reverse strand track into one output file.

        The forward stream is shifted by ``+shift`` and the reverse stream
        by ``-shift``; both are combined with ``merge_scores`` and written
        chromosome by chromosome to a temporary track, which is registered
        as ``'density_merged'``.

        Keyword arguments:

        * ``forward``, ``reverse``: inputs opened with ``track``.
        * ``assembly``: passed as ``chrmeta`` (default ``'guess'``).
        * ``shift``: converted with ``int`` (default 0). A negative value
          requests automatic detection: chromosomes are scanned until
          ``correlation`` peaks above 0.2.
        * ``format``: output extension (default: format of the forward track).
        * ``method``: forwarded to ``merge_scores`` (default ``"mean"``).

        Returns the value of ``self.display_time()``.

        Raises ``ValueError`` when automatic detection finds no chromosome
        with a correlation peak above 0.2.
        """
        # Local import: the module's own import list is not touched.
        from contextlib import closing

        def _shift(stream, shift):
            # Offset both the 'start' and the 'end' item of every feature.
            istart = stream.fields.index('start')
            iend = stream.fields.index('end')
            i1 = min(istart, iend)
            i2 = max(istart, iend)

            def _apply_shift(x):
                # Rebuild the tuple around the two coordinate positions.
                return x[:i1] + (x[i1] + shift,) + x[i1 + 1:i2] + (x[i2] + shift,) + x[i2 + 1:]
            return FeatureStream((_apply_shift(x) for x in stream),
                                       fields=stream.fields)

        assembly = kw.get('assembly') or 'guess'
        # closing() guarantees close() on every exit path, including the
        # ValueError below and any failure while reading or writing.
        with closing(track(kw.get('forward'), chrmeta=assembly)) as tfwd, \
             closing(track(kw.get('reverse'), chrmeta=assembly)) as trev:
            chrmeta = tfwd.chrmeta

            shiftval = int(kw.get('shift', 0))
            if shiftval < 0:  # Determine shift automatically
                shiftval = None
                xcor_lim = 300
                for chrom, v in chrmeta.iteritems():
                    chrsize = v['length']
                    # NOTE(review): this yields a float for chromosomes
                    # shorter than 30000 and never grows back afterwards -
                    # confirm both are intended.
                    xcor_lim = min(xcor_lim, 0.01 * chrsize)
                    xcor = correlation([tfwd.read(chrom), trev.read(chrom)], regions=(1, chrsize),
                                       limits=(-xcor_lim, xcor_lim))
                    max_xcor_idx = xcor.argmax()
                    if xcor[max_xcor_idx] > 0.2:
                        shiftval = (max_xcor_idx - xcor_lim - 1)/2
                        break
                # Compare with None: a detected shift of 0 is a valid result
                # and must not be reported as a detection failure.
                if shiftval is None:
                    raise ValueError("Unable to detect shift automatically. Must specify a shift value.")

            output = self.temporary_path(fname=tfwd.name+'-'+trev.name+'_merged',
                                         ext=kw.get('format',tfwd.format))
            # Keep only the fields present in both inputs, in forward order.
            outfields = [f for f in tfwd.fields if f in trev.fields]
            with closing(track(output, chrmeta=chrmeta, fields=outfields,
                               info={'datatype': 'quantitative', 'shift': shiftval})) as tout:
                # First chromosome creates the output, later ones append.
                mode = 'write'
                method = kw.get("method","mean")
                for chrom in chrmeta.keys():
                    tout.write(merge_scores([_shift(tfwd.read(selection=chrom),  shiftval),
                                             _shift(trev.read(selection=chrom), -shiftval)],
                                            method=method),
                               chrom=chrom, mode=mode, clip=True)
                    mode = 'append'
        self.new_file(output, 'density_merged')
        return self.display_time()
Beispiel #5
0
 def test_merge_scores(self):
     """Verify ``merge_scores`` on two overlapping single-feature streams.

     Three calls are checked against fixed result lists: the default
     method, ``method='geometric'`` and ``method='sum'``.
     """
     def merged(rows_a, rows_b, names, **options):
         # Wrap both row lists in streams, merge them, list the result.
         inputs = [fstream(rows_a, fields=list(names)),
                   fstream(rows_b, fields=list(names))]
         return list(merge_scores(inputs, **options))

     full = ('chr', 'start', 'end', 'score', 'name')
     short = ('start', 'end', 'score')

     # Arithmetic mean (no method given)
     self.assertListEqual(
         merged([('chr', 10, 20, 6., 'A')], [('chr', 5, 15, 2., 'B')], full),
         [('chr', 5, 10, 1., 'B'),
          ('chr', 10, 15, 4., 'A|B'),
          ('chr', 15, 20, 3., 'A')])

     # Geometric mean
     self.assertListEqual(
         merged([(10, 20, 6.)], [(5, 15, 2.)], short, method='geometric'),
         [(5, 10, math.sqrt(2)),
          (10, 15, math.sqrt(12)),
          (15, 20, math.sqrt(6))])

     # Sum
     self.assertListEqual(
         merged([(10, 20, 6.)], [(5, 15, 2.)], short, method='sum'),
         [(5, 10, 2.), (10, 15, 8.), (15, 20, 6.)])
Beispiel #6
0
    def __call__(self, **kw):
        """Merge a forward and a reverse strand track into one output file.

        The forward stream is shifted by ``+shift`` and the reverse stream
        by ``-shift``; both are combined with ``merge_scores`` and written
        chromosome by chromosome to a temporary track, which is registered
        as ``'density_merged'``.

        Keyword arguments:

        * ``forward``, ``reverse``: inputs opened with ``track``.
        * ``assembly``: passed as ``chrmeta`` (default ``'guess'``).
        * ``shift``: converted with ``int`` (default 0). A negative value
          requests automatic detection: chromosomes are scanned until
          ``correlation`` peaks above 0.2.
        * ``format``: output extension (default: format of the forward track).
        * ``method``: forwarded to ``merge_scores`` (default ``"mean"``).

        Returns the value of ``self.display_time()``.

        Raises ``ValueError`` when automatic detection finds no chromosome
        with a correlation peak above 0.2.
        """
        # Local import: the module's own import list is not touched.
        from contextlib import closing

        def _shift(stream, shift):
            # Offset both the 'start' and the 'end' item of every feature.
            istart = stream.fields.index('start')
            iend = stream.fields.index('end')
            i1 = min(istart, iend)
            i2 = max(istart, iend)

            def _apply_shift(x):
                # Rebuild the tuple around the two coordinate positions.
                return x[:i1] + (x[i1] + shift,) + x[i1 + 1:i2] + (x[i2] + shift,) + x[i2 + 1:]
            return FeatureStream((_apply_shift(x) for x in stream),
                                       fields=stream.fields)

        assembly = kw.get('assembly') or 'guess'
        # closing() guarantees close() on every exit path, including the
        # ValueError below and any failure while reading or writing.
        with closing(track(kw.get('forward'), chrmeta=assembly)) as tfwd, \
             closing(track(kw.get('reverse'), chrmeta=assembly)) as trev:
            chrmeta = tfwd.chrmeta

            shiftval = int(kw.get('shift', 0))
            if shiftval < 0:  # Determine shift automatically
                shiftval = None
                xcor_lim = 300
                for chrom, v in chrmeta.iteritems():
                    chrsize = v['length']
                    # NOTE(review): this yields a float for chromosomes
                    # shorter than 30000 and never grows back afterwards -
                    # confirm both are intended.
                    xcor_lim = min(xcor_lim, 0.01 * chrsize)
                    xcor = correlation([tfwd.read(chrom), trev.read(chrom)], regions=(1, chrsize),
                                       limits=(-xcor_lim, xcor_lim))
                    max_xcor_idx = xcor.argmax()
                    if xcor[max_xcor_idx] > 0.2:
                        shiftval = (max_xcor_idx - xcor_lim - 1)/2
                        break
                # Compare with None: a detected shift of 0 is a valid result
                # and must not be reported as a detection failure.
                if shiftval is None:
                    raise ValueError("Unable to detect shift automatically. Must specify a shift value.")

            output = self.temporary_path(fname=tfwd.name+'-'+trev.name+'_merged',
                                         ext=kw.get('format',tfwd.format))
            with closing(track(output, chrmeta=chrmeta,
                               info={'datatype': 'quantitative', 'shift': shiftval})) as tout:
                # First chromosome creates the output, later ones append.
                mode = 'write'
                method = kw.get("method","mean")
                for chrom in chrmeta.keys():
                    tout.write(merge_scores([_shift(tfwd.read(selection=chrom), shiftval),
                                             _shift(trev.read(selection=chrom), -shiftval)],
                                            method=method),
                               chrom=chrom, mode=mode, clip=True)
                    mode = 'append'
        self.new_file(output, 'density_merged')
        return self.display_time()
Beispiel #7
0
 def __call__(self, **kw):
     """Build density tracks from BAM samples and register them as outputs.

     Each existing sample is passed to ``bam_to_density``. For every
     strand suffix the resulting ``.sql`` files are either merged with
     ``merge_scores`` (several files) or used directly (one file),
     optionally converted, and registered as ``'density_<suffix>'``.

     Keyword arguments:

     * ``BamMulti``: mapping whose ``'sample'`` entry is one path or a
       list of paths; paths that do not exist are dropped.
     * ``control``: optional control file, passed on with ``-c``.
     * ``normalization``, ``merge_strands``, ``read_extension``:
       integers; -1 is used when absent or not convertible.
     * ``format``: output format (default ``"sql"``).

     Returns the value of ``self.display_time()``.

     Raises ``AssertionError`` when the control file does not exist.
     """
     def _int_option(key):
         # Missing or non-integer values are reported as -1.
         try:
             return int(kw.get(key))
         except (ValueError, TypeError):
             return -1

     b2wargs = []
     control = None
     samples = kw.get('BamMulti', {}).get('sample', [])
     if not isinstance(samples, list): samples = [samples]
     samples = [os.path.abspath(s) for s in samples if os.path.exists(s)]
     if kw.get('control'):
         control = str(kw['control'])
         # Explicit raise rather than `assert`, so the check is kept under
         # `python -O`; the exception type stays the same for callers.
         if not os.path.exists(control):
             raise AssertionError("Control file not found: '%s'." % control)
         # Hand the absolute path to the tool, as is done for the samples;
         # it was previously computed but never used.
         control = os.path.abspath(control)
         b2wargs = ["-c", control]
     nreads = _int_option('normalization')
     bamfiles = [track(s, format='bam') for s in samples]
     if nreads < 0:
         _nreads = [0] * len(samples)
         if control is not None:
             b2wargs += ["-r"]
     else:
         _nreads = [nreads] * len(samples)
     merge_strands = _int_option('merge_strands')
     read_extension = _int_option('read_extension')
     output = [
         self.temporary_path(fname=b.name + '_density_') for b in bamfiles
     ]
     fmt = kw.get("format", "sql")
     with execution(None) as ex:
         files = [
             bam_to_density(ex,
                            s,
                            output[n],
                            nreads=_nreads[n],
                            merge=merge_strands,
                            read_extension=read_extension,
                            sql=True,
                            args=b2wargs) for n, s in enumerate(samples)
         ]
     info = {'datatype': 'quantitative', 'read_extension': read_extension}
     if merge_strands >= 0:
         suffixes = ["merged"]
         info['shift'] = merge_strands
     else:
         suffixes = ["fwd", "rev"]
     chrmeta = bamfiles[0].chrmeta
     for suf in suffixes:
         # Flatten the per-sample results and keep this suffix's files.
         all_s_files = [
             x for y in files for x in y if x.endswith(suf + ".sql")
         ]
         if len(all_s_files) > 1:
             # Several samples: merge their scores into one new track.
             x = self.temporary_path(fname="Density_average_" + suf +
                                     ".sql")
             tsql = track(x,
                          fields=['start', 'end', 'score'],
                          chrmeta=chrmeta,
                          info={'datatype': 'quantitative'})
             insql = []
             for f in all_s_files:
                 t = track(f, format='sql', chrmeta=chrmeta)
                 t.save()
                 insql.append(t)
             for c in tsql.chrmeta:
                 tsql.write(merge_scores([t.read(c) for t in insql]),
                            chrom=c)
         else:
             # Single sample: reuse its file and attach the metadata.
             x = all_s_files[0]
             tsql = track(x,
                          format='sql',
                          fields=['start', 'end', 'score'],
                          chrmeta=chrmeta,
                          info=info)
             tsql.save()
         if fmt in [None, "sql"]:
             outname = x
         else:
             outname = os.path.splitext(x)[0] + "." + fmt
             convert(x, outname, mode="overwrite")
         self.new_file(outname, 'density_' + suf)
     return self.display_time()
Beispiel #8
0
def merge(*args, **kw):
    """Merge a forward and a reverse strand density file into one track.

    The forward stream is shifted by ``+shift`` and the reverse stream by
    ``-shift``; both are then combined with ``merge_scores`` and written
    to the output track one chromosome at a time.

    Keyword arguments (positional ``args`` are ignored):

    * ``forward``, ``reverse``: paths of existing density files.
    * ``formatf``, ``formatr``: formats handed to ``track.track`` for them.
    * ``output``: name of the track to write.
    * ``shift``: converted with ``int``. A negative value means the shift
      is derived from ``correlation`` of both strands on the longest
      chromosome.
    * ``method``: forwarded to ``merge_scores`` (default ``"mean"``).

    The whole ``kw`` dictionary is also forwarded to ``_get_chrmeta``.

    Returns 0.

    Raises ``Usage`` when an input file or the output name is missing, or
    when neither input track provides chromosome metadata.
    """
    # Local import: the module's own import list is not touched.
    from contextlib import closing

    if not (kw['forward'] and os.path.exists(kw['forward'])):
        raise Usage("Specify a valid forward strand density file with -f.")
    if not (kw['reverse'] and os.path.exists(kw['reverse'])):
        raise Usage("Specify a valid reverse strand density file with -r.")
    if not (kw['output']):
        raise Usage("Specify the output file name.")

    def _shift(stream, shift):
        # Offset both the 'start' and the 'end' item of every feature.
        istart = stream.fields.index('start')
        iend = stream.fields.index('end')
        i1 = min(istart, iend)
        i2 = max(istart, iend)

        def _apply_shift(x):
            # Rebuild the tuple around the two coordinate positions.
            return x[:i1] + (x[i1] + shift, ) + x[i1 + 1:i2] + (
                x[i2] + shift, ) + x[i2 + 1:]

        return track.FeatureStream((_apply_shift(x) for x in stream),
                                   fields=stream.fields)

    fields = ['chr', 'start', 'end', 'score']
    chrmeta = _get_chrmeta(**kw)
    # closing() guarantees close() on every exit path, including the Usage
    # error below and any failure while reading or writing.
    with closing(track.track(kw['forward'], format=kw['formatf'],
                             chrmeta=chrmeta)) as tfwd, \
         closing(track.track(kw['reverse'], format=kw['formatr'],
                             chrmeta=chrmeta)) as trev:
        # Prefer the metadata carried by the tracks themselves.
        if tfwd.chrmeta:
            chrmeta = tfwd.chrmeta
        elif trev.chrmeta:
            chrmeta = trev.chrmeta
        else:
            raise Usage("Specify an assembly with -a.")

        shiftval = int(kw['shift'])
        if shiftval < 0:
            # Negative shift: estimate it on the longest chromosome.
            slim = 300
            chrsize, chrom = max(
                (v['length'], k) for k, v in chrmeta.iteritems())
            xcor = correlation(
                [tfwd.read(chrom), trev.read(chrom)], (1, chrsize),
                limits=(-slim, slim))
            shiftval = (xcor.argmax() - slim - 1) / 2
            # Parenthesised single argument: same output under Python 2's
            # print statement, and valid syntax under Python 3.
            print("Autocorrelation shift=%i, correlation is %f." %
                  (shiftval, xcor.max()))

        with closing(track.track(kw['output'],
                                 fields=fields,
                                 chrmeta=chrmeta,
                                 info={'datatype': 'quantitative'})) as tout:
            # First chromosome creates the output, later ones append to it.
            mode = 'write'
            method = kw.get("method", "mean")
            for chrom in chrmeta.keys():
                tout.write(merge_scores([
                    _shift(tfwd.read(chrom), shiftval),
                    _shift(trev.read(chrom), -shiftval)
                ],
                                        method=method),
                           chrom=chrom,
                           mode=mode,
                           clip=True)
                mode = 'append'
    return 0
Beispiel #9
0
    def __call__(self, **kw):
        """Write the merged score track of two inputs, optionally with a boxplot.

        For every chromosome of the numerator track, both inputs are read
        (smoothed with ``window_smoothing`` when the window size exceeds 1),
        combined by ``merge_scores`` using ``self._divide``, and written to
        a temporary output track registered as ``'ratios'``.

        Keyword arguments:

        * ``numerator``, ``denominator``: inputs opened with ``track``.
        * ``assembly``: passed as ``chrmeta`` (default ``'guess'``).
        * ``format``: extension of the output file name; falls back to the
          numerator track's format.
        * ``window_size``: integer, defaults to ``size_def``.
        * ``log``, ``distribution``: flags; strings are true only when
          spelled '1', 'true', 't' or 'on' (case-insensitive).
        * ``pseudo``, ``threshold``: floats; ``pseudo_def`` and
          ``threshold_def`` are used when absent or not convertible.

        When ``distribution`` is set, the merged stream is passed through
        ``self._sample_stream`` and a boxplot of ``self.ratios`` is
        registered as ``'boxplot'``.

        Returns the value of ``self.display_time()``.
        """
        def _as_flag(value):
            # Non-string values are kept as they are.
            if isinstance(value, basestring):
                return value.lower() in ['1', 'true', 't', 'on']
            return value

        assembly = kw.get('assembly') or 'guess'
        t1 = track(kw['numerator'], chrmeta=assembly)
        t2 = track(kw['denominator'], chrmeta=assembly)
        fmt = kw.get('format') or t1.format
        wsize = int(kw.get('window_size') or size_def)
        self.log = _as_flag(kw.get('log', False))
        # Catch only conversion failures; a bare except would also swallow
        # KeyboardInterrupt and SystemExit.
        try:
            self.pseudo = float(kw.get('pseudo'))
        except (TypeError, ValueError, OverflowError):
            self.pseudo = pseudo_def
        self.baseline = -log(self.pseudo, 2)
        try:
            self.threshold = float(kw.get('threshold'))
        except (TypeError, ValueError, OverflowError):
            self.threshold = threshold_def
        distribution = _as_flag(kw.get('distribution', False))
        if distribution:
            genome_length = sum(v['length'] for v in t1.chrmeta.values())
            self.shifts = list(
                poisson(
                    float(genome_length) / float(self.sample_num),
                    self.sample_num))
            # Presumably filled by _sample_stream while the output is
            # written - verify against that method.
            self.ratios = []

        output = self.temporary_path(fname='ratios_%s-%s.%s' %
                                     (t1.name, t2.name, fmt))
        with track(output,
                   chrmeta=t1.chrmeta,
                   fields=t1.fields,
                   info={
                       'datatype': 'quantitative',
                       'log': self.log,
                       'pseudocounts': self.pseudo,
                       'threshold': self.threshold,
                       'window_size': wsize
                   }) as tout:
            for chrom, vchr in t1.chrmeta.iteritems():
                if wsize > 1:
                    s1 = window_smoothing(t1.read(chrom),
                                          window_size=wsize,
                                          step_size=1,
                                          featurewise=False)
                    s2 = window_smoothing(t2.read(chrom),
                                          window_size=wsize,
                                          step_size=1,
                                          featurewise=False)
                else:
                    s1 = t1.read(chrom)
                    s2 = t2.read(chrom)
                s3 = merge_scores([s1, s2], method=self._divide)
                if distribution:
                    s3 = FeatureStream(self._sample_stream(s3, vchr['length']),
                                       fields=s3.fields)
                tout.write(s3, chrom=chrom, clip=True)
        self.new_file(output, 'ratios')

        if distribution:
            pdf = self.temporary_path(fname='%s-%s_ratios_distribution.pdf' %
                                      (t1.name, t2.name))
            density_boxplot(self.ratios,
                            output=pdf,
                            name="%s/%s (median=%.2f)" %
                            (t1.name, t2.name, median(self.ratios)))
            self.new_file(pdf, 'boxplot')
        return self.display_time()
Beispiel #10
0
 def __call__(self, **kw):
     """Build density tracks from BAM samples and register them as outputs.

     Each existing sample is passed to ``bam_to_density``. For every
     strand suffix the resulting ``.sql`` files are either merged with
     ``merge_scores`` (several files) or used directly (one file),
     optionally converted, and registered as ``'density_<suffix>'``.

     Keyword arguments:

     * ``BamMulti``: mapping whose ``'sample'`` entry is one path or a
       list of paths; paths that do not exist are dropped.
     * ``control``: optional control file, passed on with ``-c``.
     * ``normalization``, ``merge_strands``, ``read_extension``:
       integers; -1 is used when absent or not convertible.
     * ``format``: output format (default ``"sql"``).

     Returns the value of ``self.display_time()``.

     Raises ``AssertionError`` when the control file does not exist.
     """
     def _int_option(key):
         # Missing or non-integer values are reported as -1.
         try:
             return int(kw.get(key))
         except (ValueError, TypeError):
             return -1

     b2wargs = []
     control = None
     samples = kw.get('BamMulti',{}).get('sample', [])
     if not isinstance(samples, list): samples = [samples]
     samples = [os.path.abspath(s) for s in samples if os.path.exists(s)]
     if kw.get('control'):
         control = str(kw['control'])
         # Explicit raise rather than `assert`, so the check is kept under
         # `python -O`; the exception type stays the same for callers.
         if not os.path.exists(control):
             raise AssertionError("Control file not found: '%s'." % control)
         # Hand the absolute path to the tool, as is done for the samples;
         # it was previously computed but never used.
         control = os.path.abspath(control)
         b2wargs = ["-c", control]
     nreads = _int_option('normalization')
     bamfiles = [track(s, format='bam') for s in samples]
     if nreads < 0:
         _nreads = [0]*len(samples)
         if control is not None:
             b2wargs += ["-r"]
     else:
         _nreads = [nreads]*len(samples)
     merge_strands = _int_option('merge_strands')
     read_extension = _int_option('read_extension')
     output = [self.temporary_path(fname=b.name+'_density_') for b in bamfiles]
     fmt = kw.get("format", "sql")
     with execution(None) as ex:
         files = [bam_to_density( ex, s, output[n], nreads=_nreads[n],
                                  merge=merge_strands,
                                  read_extension=read_extension,
                                  sql=True, args=b2wargs )
                  for n,s in enumerate(samples)]
     info = {'datatype': 'quantitative', 'read_extension': read_extension}
     if merge_strands >= 0:
         suffixes = ["merged"]
         info['shift'] = merge_strands
     else:
         suffixes = ["fwd", "rev"]
     chrmeta = bamfiles[0].chrmeta
     for suf in suffixes:
         # Flatten the per-sample results and keep this suffix's files.
         all_s_files = [x for y in files for x in y if x.endswith(suf+".sql")]
         if len(all_s_files) > 1:
             # Several samples: merge their scores into one new track.
             x = self.temporary_path(fname="Density_average_"+suf+".sql")
             tsql = track( x, fields=['start', 'end', 'score'],
                           chrmeta=chrmeta, info={'datatype': 'quantitative'} )
             insql = []
             for f in all_s_files:
                 t = track(f, format='sql', chrmeta=chrmeta)
                 t.save()
                 insql.append(t)
             for c in tsql.chrmeta:
                 tsql.write(merge_scores([t.read(c) for t in insql]),chrom=c)
         else:
             # Single sample: reuse its file and attach the metadata.
             x = all_s_files[0]
             tsql = track( x, format='sql', fields=['start', 'end', 'score'],
                           chrmeta=chrmeta, info=info )
             tsql.save()
         if fmt in [None,"sql"]:
             outname = x
         else:
             outname = os.path.splitext(x)[0]+"."+fmt
             convert(x, outname, mode="overwrite")
         self.new_file(outname, 'density_'+suf)
     return self.display_time()