Example #1
    def run(self):
        if self.output is not None:
            out_file = open(self.output, 'wb')
        else:
            out_file = sys.stdout

        annotation.write_gff3_header(out_file)

        for filename in self.filenames:
            for item in annotation.read_annotations(filename):
                if not selection.matches(self.select, [item.type]): continue

                if 'ID' not in item.attr and 'locus_tag' in item.attr:
                    item.attr['ID'] = item.attr['locus_tag']

                if 'color' not in item.attr:
                    if item.type == 'CDS':
                        item.attr['color'] = '#008800'
                    if item.type == 'rRNA':
                        item.attr['color'] = '#bb0000'
                    if item.type == 'tRNA':
                        item.attr['color'] = '#bb00bb'
                    if item.type == 'misc_feature':
                        item.attr['color'] = '#8888ff'

                print >> out_file, item.as_gff()

        if self.output is not None:
            out_file.close()
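
The chain of if statements above is a fixed lookup from feature type to display color. A minimal sketch of the same mapping as a table, preserving the example's behavior of leaving already-colored and unrecognized features untouched:

    # Feature-type to color table used by the example above.
    TYPE_COLORS = {
        'CDS':          '#008800',
        'rRNA':         '#bb0000',
        'tRNA':         '#bb00bb',
        'misc_feature': '#8888ff',
    }

    def assign_color(attr, feature_type):
        # Only fill in a color when none is present and the type is known.
        if 'color' not in attr and feature_type in TYPE_COLORS:
            attr['color'] = TYPE_COLORS[feature_type]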
Example #2
    def run(self):
        if self.output is not None:
           out_file = open(self.output,'wb')
        else:
           out_file = sys.stdout
    
        annotation.write_gff3_header(out_file)
        
        for filename in self.filenames:
            for item in annotation.read_annotations(filename):
                if not selection.matches(self.select, [item.type]): continue
                
                if 'ID' not in item.attr and 'locus_tag' in item.attr:
                    item.attr['ID'] = item.attr['locus_tag']
                    
                if 'color' not in item.attr:
                    if item.type == 'CDS':
                        item.attr['color'] = '#008800'
                    if item.type == 'rRNA':
                        item.attr['color'] = '#bb0000'
                    if item.type == 'tRNA':
                        item.attr['color'] = '#bb00bb'
                    if item.type == 'misc_feature':
                        item.attr['color'] = '#8888ff'

                print >> out_file, item.as_gff()
        
        if self.output is not None:
            out_file.close()
Example #3
    def set_annotations(self, filenames):
        f = self.open('reference.gff','wb')
        annotation.write_gff3_header(f)
        for filename in filenames:
            for feature in annotation.read_annotations(filename):
                print >> f, feature.as_gff()
        f.close()
Example #4
    def run(self):
        assert self.change_strand in STRAND_CHANGE, 'Unknown way to change strand.'
        strand_changer = STRAND_CHANGE[self.change_strand]

        shift_start_absolute, shift_start_proportion = decode_shift(
            self.shift_start)
        shift_end_absolute, shift_end_proportion = decode_shift(self.shift_end)

        renames = []
        if self.rename:
            for item in self.rename.split(','):
                new, old = item.split('=')
                if new != old:
                    renames.append((new, old))

        out_file = open(self.prefix + '.gff', 'wb')
        annotation.write_gff3_header(out_file)

        for filename in self.filenames:
            for item in annotation.read_annotations(filename):
                if not selection.matches(self.select, [item.type]): continue

                if self.type:
                    item.type = self.type

                length = item.end - item.start
                shift_start = int(
                    math.floor(0.5 + shift_start_absolute +
                               shift_start_proportion * length))
                shift_end = int(
                    math.floor(0.5 + shift_end_absolute +
                               shift_end_proportion * length))

                if item.strand == 1:
                    item.start += shift_start
                    item.end += shift_end
                elif item.strand == -1:
                    item.end -= shift_start
                    item.start -= shift_end
                item.start = max(0, item.start)  #IGV complains

                item.strand = strand_changer[item.strand]

                old_attr = item.attr.copy()
                for new, old in renames:
                    if old in item.attr:
                        del item.attr[old]
                for new, old in renames:
                    if old in old_attr:
                        item.attr[new] = old_attr[old]

                print >> out_file, item.as_gff()

        out_file.close()
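
decode_shift is not defined in this section; from its call sites it evidently returns an (absolute, proportion) pair that is combined as absolute + proportion * length and rounded to the nearest integer with int(math.floor(0.5 + x)) (halves round up). A hypothetical sketch of such a parser, assuming shift specs look like '10' (bases) or '25%' (fraction of feature length); the real syntax may differ:

    import math

    def decode_shift_sketch(spec):
        # Hypothetical parser: '25%' -> (0, 0.25), '10' -> (10, 0.0).
        spec = spec.strip()
        if spec.endswith('%'):
            return 0, float(spec[:-1]) / 100.0
        return int(spec), 0.0

    def shifted(spec, length):
        absolute, proportion = decode_shift_sketch(spec)
        # floor(0.5 + x) rounds to the nearest integer, as in the example above.
        return int(math.floor(0.5 + absolute + proportion * length))

    assert shifted('10', 300) == 10
    assert shifted('25%', 300) == 75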
Example #5
    def run(self):
        annotations = [ ]
        for filename in self.filenames:
            for item in annotation.read_annotations(filename):
                if not selection.matches(self.select, [item.type]): continue
                if self.type:
                    item.type = self.type
                annotations.append(item)
        
        annotations.sort(key=lambda item: (item.seqid, item.strand, item.start))
        
        group = [ ]
        groups = [ ]
        def emit():
            if not group: return
            groups.append(group[:])
            del group[:]        
        seqid = None
        strand = None
        end = 0
        for item in annotations:
            if item.seqid != seqid or item.strand != strand or item.start >= end:
                emit()
                seqid = item.seqid
                strand = item.strand
                end = item.end-self.overlap
            group.append(item)
            end = max(item.end-self.overlap, end)
        emit()

        out_file = open(self.prefix+'.gff','wb')
        annotation.write_gff3_header(out_file)

        for group in groups:
            item = annotation.Annotation()
            item.source = group[0].source
            item.type = join_descriptions( item2.type for item2 in group )
            item.seqid = group[0].seqid
            item.strand = group[0].strand
            item.start = min( item2.start for item2 in group )
            item.end = max( item2.end for item2 in group )
            item.score = None
            item.phase = None
            item.attr = { }
            
            for item2 in group:
                for key in item2.attr:
                    if key in item.attr: continue
                    item.attr[key] = join_descriptions( item3.attr[key] for item3 in group if key in item3.attr )
            
            print >> out_file, item.as_gff()
            
        out_file.close()
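
The grouping pass above is a sort-and-sweep: with annotations ordered by (seqid, strand, start), a new group starts whenever the next feature's start reaches the running end of the current group minus the permitted overlap. A self-contained sketch of the sweep on bare (start, end) tuples, assuming a single seqid and strand:

    def group_overlapping(spans, overlap=0):
        # spans: (start, end) tuples; features overlapping by more than
        # `overlap` bases land in the same group.
        groups, current, boundary = [], [], None
        for start, end in sorted(spans):
            if boundary is None or start >= boundary:
                if current:
                    groups.append(current)
                current = []
                boundary = end - overlap
            current.append((start, end))
            boundary = max(boundary, end - overlap)
        if current:
            groups.append(current)
        return groups

    assert group_overlapping([(0, 10), (5, 15), (20, 30)]) == [
        [(0, 10), (5, 15)], [(20, 30)]]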
Example #6
    def run(self):
        spans = collections.defaultdict(list)
        
        for item in legion.parallel_imap(self._load_bam, self.filenames):
            for key,value in item.items():
                spans[key].extend(value)

        grace.status('Calling peaks')

        f = open(self.prefix+'.gff', 'wb')
        annotation.write_gff3_header(f)
        
        n = 0

        for (rname, strand), span_list in spans.items():
            depth = [ 0.0 ] * (1+max( item[1] for item in span_list ))
            for start, end in span_list:
                depth[start] += 1.0
                depth[end] -= 1.0
            
            for i in xrange(1,len(depth)):
                depth[i] += depth[i-1]

            for start, end in self._find_spans(depth):
                if end-self.lap-start <= 0: continue
                
                n += 1
                
                id = 'peak%d' % n
                
                ann = annotation.Annotation()
                ann.source = 'nesoni'
                ann.type = self.type
                ann.seqid = rname
                ann.start = start
                ann.end = end - self.lap
                ann.strand = strand
                ann.score = None
                ann.phase = None
                ann.attr = { 
                    'id' : id,
                    'color' : '#00ff00' if strand > 0 else '#0000ff' if strand < 0 else '#008080',
                    }
                print >> f, ann.as_gff()
            f.flush()

        f.close()
        
        self.log.datum('-','called peaks',n)
        
        grace.status('')
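
The depth computation above is a difference array plus prefix sum: each span adds +1 at its start and -1 at its (exclusive) end, and one cumulative pass then yields per-base coverage, so the cost is O(spans + length) rather than O(total bases covered). A self-contained sketch:

    def depth_from_spans(span_list):
        # Difference array: +1 at start, -1 at end (end exclusive).
        depth = [0.0] * (1 + max(end for start, end in span_list))
        for start, end in span_list:
            depth[start] += 1.0
            depth[end] -= 1.0
        # Prefix sum turns the deltas into per-base coverage.
        for i in range(1, len(depth)):
            depth[i] += depth[i - 1]
        return depth

    assert depth_from_spans([(0, 4), (2, 6)]) == [1.0, 1.0, 2.0, 2.0, 1.0, 1.0, 0.0]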
Example #7
    def run(self):
        assert self.change_strand in STRAND_CHANGE, 'Unknown way to change strand.'
        strand_changer = STRAND_CHANGE[self.change_strand]
        
        shift_start_absolute, shift_start_proportion = decode_shift(self.shift_start)
        shift_end_absolute, shift_end_proportion = decode_shift(self.shift_end)
        
        renames = [ ]
        if self.rename:
            for item in self.rename.split(','):
                new, old = item.split('=')
                if new != old:
                    renames.append((new,old))
    
        out_file = open(self.prefix+'.gff','wb')    
        annotation.write_gff3_header(out_file)
        
        for filename in self.filenames:
            for item in annotation.read_annotations(filename):
                if not selection.matches(self.select, [item.type]): continue
                
                if self.type:
                    item.type = self.type
                
                length = item.end-item.start
                shift_start = int(math.floor(0.5+shift_start_absolute+shift_start_proportion*length))
                shift_end = int(math.floor(0.5+shift_end_absolute+shift_end_proportion*length))
                
                if item.strand == 1:
                    item.start += shift_start
                    item.end += shift_end
                elif item.strand == -1:
                    item.end -= shift_start
                    item.start -= shift_end
                item.start = max(0, item.start) #IGV complains
                
                item.strand = strand_changer[item.strand]
                
                old_attr = item.attr.copy()
                for new,old in renames:
                    if old in item.attr:
                       del item.attr[old]
                for new,old in renames:
                    if old in old_attr:
                       item.attr[new] = old_attr[old]
            
                print >> out_file, item.as_gff()

        out_file.close()
Example #8
    def run(self):
        features_parent = [
            _Related_feature(item, item.start, item.end, [])
            for item in annotation.read_annotations(self.parent)
            if selection.matches(self.select_parent, [item.type])
        ]
        features_child = [
            _Related_feature(item, item.start, item.end, [])
            for item in annotation.read_annotations(self.child)
            if selection.matches(self.select_child, [item.type])
        ]

        index = {}
        for item in features_child:
            if item.feature.seqid not in index:
                index[item.feature.seqid] = span_index.Span_index()
            index[item.feature.seqid].insert(item)
        for value in index.values():
            value.prepare()

        for item_1 in features_parent:
            if item_1.feature.strand == 1:
                start = item_1.start - self.upstrand
                end = item_1.end + self.downstrand
            elif item_1.feature.strand == -1:
                start = item_1.start - self.downstrand
                end = item_1.end + self.upstrand
            else:
                start = item_1.start - max(self.upstrand, self.downstrand)
                end = item_1.end + max(self.upstrand, self.downstrand)
            if item_1.feature.seqid in index:
                for item_2 in index[item_1.feature.seqid].get(start, end):
                    item_1.relations.append(item_2)
                    item_2.relations.append(item_1)

        for item in features_parent:
            item.modify_with_relations(self.use, self.to_child, self.to_parent)

        with open(self.prefix + '-parent.gff', 'wb') as f:
            annotation.write_gff3_header(f)
            for item in features_parent:
                print >> f, item.feature.as_gff()

        with open(self.prefix + '-child.gff', 'wb') as f:
            annotation.write_gff3_header(f)
            for item in features_child:
                print >> f, item.feature.as_gff()
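
span_index.Span_index is internal to nesoni; behaviorally, items with .start and .end are inserted, prepare() is called once, and get(start, end) yields the items whose spans overlap the query. A naive stand-in showing the same overlap test (a real index would binary-search on the sorted starts rather than scan):

    class NaiveSpanIndex(object):
        # Linear-scan stand-in for span_index.Span_index (assumed behavior).
        def __init__(self):
            self.items = []
        def insert(self, item):
            self.items.append(item)
        def prepare(self):
            self.items.sort(key=lambda item: item.start)
        def get(self, start, end):
            # Half-open spans overlap iff each starts before the other ends.
            return [item for item in self.items
                    if item.start < end and start < item.end]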
Example #9
    def run(self):
        features_parent = [ 
            _Related_feature(item,item.start,item.end,[]) 
            for item in annotation.read_annotations(self.parent) 
            if selection.matches(self.select_parent, [item.type]) 
            ]
        features_child = [ 
            _Related_feature(item,item.start,item.end,[]) 
            for item in annotation.read_annotations(self.child) 
            if selection.matches(self.select_child, [item.type])
            ]
        
        index = { }
        for item in features_child:
            if item.feature.seqid not in index:
                index[item.feature.seqid] = span_index.Span_index()
            index[item.feature.seqid].insert(item)
        for value in index.values():
            value.prepare()
        
        for item_1 in features_parent:
            if item_1.feature.strand == 1:
                start = item_1.start - self.upstrand
                end = item_1.end + self.downstrand
            elif item_1.feature.strand == -1:
                start = item_1.start - self.downstrand
                end = item_1.end + self.upstrand
            else:
                start = item_1.start - max(self.upstrand,self.downstrand)
                end = item_1.end + max(self.upstrand,self.downstrand)
            if item_1.feature.seqid in index:
                for item_2 in index[item_1.feature.seqid].get(start,end):
                    item_1.relations.append(item_2)
                    item_2.relations.append(item_1)

        for item in features_parent:
            item.modify_with_relations(self.use, self.to_child, self.to_parent)
        
        with open(self.prefix + '-parent.gff','wb') as f:
            annotation.write_gff3_header(f)
            for item in features_parent:
                print >> f, item.feature.as_gff()
        
        with open(self.prefix + '-child.gff','wb') as f:
            annotation.write_gff3_header(f)
            for item in features_child:
                print >> f, item.feature.as_gff()
Example #10
    def run(self):
        assert self.change_strand in STRAND_CHANGE, 'Unknown way to change strand.'
        strand_changer = STRAND_CHANGE[self.change_strand]
    
        out_file = open(self.prefix+'.gff','wb')    
        annotation.write_gff3_header(out_file)
        
        for filename in self.filenames:
            for item in annotation.read_annotations(filename):
                if not selection.matches(self.select, [item.type]): continue
                
                if item.strand == 1:
                    item.start += self.shift_start
                    item.end += self.shift_end
                elif item.strand == -1:
                    item.end -= self.shift_start
                    item.start -= self.shift_end
                
                item.strand = strand_changer[item.strand]
            
                print >> out_file, item.as_gff()

        out_file.close()
Example #11
    def run(self):
        assert self.ucsc_name, 'Need a UCSC genome name'
        
        scratch = _ucsc_scratch(self)
        
        # Load annotations
        
        source = 'tt-ucsc-%s-%s' % (self.ucsc_name, self.table)
        
        table = scratch.get_table(self.table)
        get_name = scratch.getter(self.name)
        get_product = scratch.getter(self.product)

        mrnas = [ ]
        
        for item in table:
            ann = annotation.Annotation(
                seqid = item.chrom,
                source = source,
                type = 'mRNA',
                strand = {'+':1, '-':-1}[item.strand],
                start = int(item.txStart),
                end = int(item.txEnd),
                attr = {
                    'ID' : item.name,
                    'Name' : get_name(item),
                    'Product' : get_product(item),
                    #'UCSC_name2' : item.name2,
                    }
                )
            
            ann.record = item
            mrnas.append(ann)

        _uniquify_ids(mrnas)
        
        annotations = [ ]
        
        for group in _grouped_features(mrnas):
            ID = '/'.join(item.attr['ID'] for item in group)
            for item in group:
                item.attr['Parent'] = ID
                item.attr['ID'] = item.attr['ID'] + '-mRNA'
            
            annotations.append(annotation.Annotation(
                source = source,
                type = 'gene',
                seqid = group[0].seqid,
                strand = group[0].strand,
                start = min(item.start for item in group),
                end = max(item.end for item in group),
                attr = {
                    'ID' : ID,
                    'Name' : annotation_tools.join_descriptions([ item.attr['Name'] for item in group ], '/'),
                    'Product' : annotation_tools.join_descriptions([ item.attr['Product'] for item in group ], '/'),
                    #'UCSC_name2' : annotation_tools.join_descriptions([ item.attr['UCSC_name2'] for item in group ], '/'),
                    }
                ))
            for item in group:
                annotations.append(item)
                
                exonStarts = _parse_ints(item.record.exonStarts)
                exonEnds = _parse_ints(item.record.exonEnds)
                cdsStart = int(item.record.cdsStart)
                cdsEnd = int(item.record.cdsEnd)
                for start,end in zip(exonStarts,exonEnds):
                    annotations.append(annotation.Annotation(
                        source = source,
                        type = 'exon',
                        seqid = item.seqid,
                        strand = item.strand,
                        start = start,
                        end = end,
                        attr = {
                            'Parent' : item.attr['ID'],
                            }
                        ))
                    if max(cdsStart,start) < min(cdsEnd,end):
                        annotations.append(annotation.Annotation(
                            source = source,
                            type = 'CDS',
                            seqid = item.seqid,
                            strand = item.strand,
                            start = max(cdsStart,start),
                            end = min(cdsEnd,end),
                            #TODO: phase
                            attr = {
                                'Parent' : item.attr['ID'],
                                }
                            ))

        # Load sequence
        
        if self.download:
            io.execute(['rsync','-P','rsync://hgdownload.cse.ucsc.edu/goldenPath/'+self.ucsc_name+'/bigZips/chromFa.tar.gz',scratch.ucsc/'chromFa.tar.gz'])
        
        with workspace.tempspace() as temp:
            io.execute(['tar','-C',temp.working_dir,'-zxf',scratch.ucsc/'chromFa.tar.gz'])
            sequences = [ temp/item for item in natural_sorted(os.listdir(temp.working_dir)) ]
            
            with open(temp/'reference.gff','wb') as f:
                annotation.write_gff3_header(f)
                for item in annotations:
                    print >> f, item.as_gff()
            
            Make_tt_reference(
                self.output_dir,
                filenames = sequences + [ temp/'reference.gff' ],
                index = self.index,
                ).run()
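
The exon loop emits a CDS sub-feature only where an exon intersects the coding region: two half-open intervals [a1, a2) and [b1, b2) intersect iff max(a1, b1) < min(a2, b2), and that max/min pair is the intersection itself, which is exactly the test and the coordinates used above. A small sketch:

    def intersect(a_start, a_end, b_start, b_end):
        # Intersection of two half-open intervals, or None if disjoint.
        start = max(a_start, b_start)
        end = min(a_end, b_end)
        return (start, end) if start < end else None

    # Exon [100, 200) against coding region [150, 400):
    assert intersect(100, 200, 150, 400) == (150, 200)
    # A wholly non-coding exon produces no CDS piece:
    assert intersect(0, 50, 150, 400) is None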
Example #12
    def run(self):
        work = self.get_workspace()
        work.update_param(remove=['tail_tools_reference_version'])

        nesoni.Make_reference(
            self.output_dir,
            filenames = self.filenames,
            snpeff = False,
            cs = 'ifavailable' if self.index else False,
            ls = False,
            bowtie = 'ifavailable' if self.index else False,
            ).run()

        annotations = list(annotation.read_annotations(work/'reference.gff'))
        annotation.link_up_annotations(annotations)

        with open(work/'utr.gff','wb') as f:
            annotation.write_gff3_header(f)
            for gene in annotations:
                if gene.type != 'gene': continue
                mrnas = [ item for item in gene.children if item.type == 'mRNA' ]

                utr_5primes = [ ]

                for mrna in mrnas:
                    cdss = [ item for item in mrna.children if item.type == 'CDS' ]
                    exons = [ item for item in mrna.children if item.type == 'exon' ]
                    if not cdss or not exons: continue
                    if gene.strand >= 0:
                        cds_3prime = max(item.end for item in cdss)
                        for item in exons:
                            if item.end > cds_3prime:
                                utr_5primes.append(max(item.start,cds_3prime))
                    else:
                        cds_3prime = min(item.start for item in cdss)
                        for item in exons:
                            if item.start < cds_3prime:
                                utr_5primes.append(min(item.end,cds_3prime))

                if gene.strand >= 0:
                    utr_start = max(utr_5primes) if utr_5primes else gene.end
                    utr_end = max(utr_start+1,gene.end)
                else:
                    utr_end = min(utr_5primes) if utr_5primes else gene.start
                    utr_start = min(gene.start,utr_end-1)

                attr = gene.attr.copy()
                attr['Parent'] = attr['ID']
                attr['ID'] = attr['ID']+'-3UTR'
                thing = annotation.Annotation(
                    source = 'tt',
                    type = 'three_prime_utr',
                    seqid = gene.seqid,
                    strand = gene.strand,
                    start = utr_start,
                    end = utr_end,
                    attr = attr,
                    )
                print >> f, thing.as_gff()

        work.update_param(tail_tools_reference_version=work.VERSION)
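
The UTR inference above works per gene: on the forward strand the 3' UTR start is derived from the 3'-most CDS end and the exons extending beyond it, and the UTR runs to the gene end (the reverse strand is the mirror image), with a minimum length of one base enforced. A minimal forward-strand sketch on bare (start, end) tuples:

    def utr3_forward(cdss, exons, gene_end):
        # 3'-most coding base across the mRNA's CDS features.
        cds_3prime = max(end for start, end in cdss)
        # Candidate UTR starts: exon starts clipped to the CDS boundary.
        starts = [max(start, cds_3prime)
                  for start, end in exons if end > cds_3prime]
        utr_start = max(starts) if starts else gene_end
        # Guarantee a non-empty interval, as the example does.
        return utr_start, max(utr_start + 1, gene_end)

    # One intron after the stop codon: the UTR is the final exon's span.
    assert utr3_forward([(0, 300)], [(0, 320), (400, 500)], 500) == (400, 500)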
Example #13
    def run(self):
        annotations = []
        for filename in self.filenames:
            for item in annotation.read_annotations(filename):
                if not selection.matches(self.select, [item.type]): continue
                if self.type:
                    item.type = self.type
                annotations.append(item)

        annotations.sort(
            key=lambda item: (item.type, item.seqid, item.strand, item.start))

        group = []
        groups = []

        def emit():
            if not group: return
            groups.append(group[:])
            del group[:]

        type = None
        seqid = None
        strand = None
        end = 0
        for item in annotations:
            if item.type != type or item.seqid != seqid or item.strand != strand or item.start >= end:
                emit()
                type = item.type
                seqid = item.seqid
                strand = item.strand
                end = item.end - self.overlap
            group.append(item)
            end = max(item.end - self.overlap, end)
        emit()

        items = []

        id_map = {}

        for group in groups:
            item = annotation.Annotation()
            item.source = group[0].source
            item.type = group[0].type
            item.seqid = group[0].seqid
            item.strand = group[0].strand
            item.start = min(item2.start for item2 in group)
            item.end = max(item2.end for item2 in group)
            item.score = None
            item.phase = None
            item.attr = {}

            for item2 in group:
                for key in item2.attr:
                    if key in item.attr: continue
                    item.attr[key] = join_descriptions([
                        item3.attr[key] for item3 in group if key in item3.attr
                    ], self.joiner)

            item.parents = []
            for item2 in group:
                if 'ID' in item2.attr:
                    assert item2.attr[
                        'ID'] not in id_map, 'Duplicate ID: ' + item2.attr['ID']
                    id_map[item2.attr['ID']] = item.attr['ID']
                if 'Parent' in item2.attr:
                    item.parents.append(item2.attr['Parent'])

            items.append(item)

        for item in items:
            if item.parents:
                item.attr['Parent'] = join_descriptions(
                    [id_map.get(parent, parent) for parent in item.parents],
                    ',')

        with open(self.prefix + '.gff', 'wb') as out_file:
            annotation.write_gff3_header(out_file)
            for item in items:
                print >> out_file, item.as_gff()
Example #14
    def run(self):
        spans = { }
        
        #for item in legion.parallel_imap(self._load_bam, self.filenames):
        #    for key,value in item.items():
        for filename in self.filenames:
            self._load_bam(filename, spans)

        grace.status('Calling peaks')

        f = open(self.prefix+'.gff', 'wb')
        annotation.write_gff3_header(f)
        
        n = 0

        for (rname, strand), span_counts in spans.items():
            length = 1+max( item[1] for item in span_counts )
            depth = [ 0.0 ] * length
            AN_total = [ 0.0 ] * length
            AG_total = [ 0.0 ] * length 
            for (start, end, AN, AG), count in span_counts.iteritems():
                depth[start] += 1.0*count
                depth[end] -= 1.0*count
                AN_total[start] += AN*count
                AN_total[end] -= AN*count
                AG_total[start] += AG*count
                AG_total[end] -= AG*count
            
            for i in xrange(1,length):
                depth[i] += depth[i-1]
                AN_total[i] += AN_total[i-1]
                AG_total[i] += AG_total[i-1]

            for start, end in self._find_spans(depth):
                if end-self.lap-start <= 0: continue
                
                n += 1
                
                id = 'peak%d' % n
                
                ann = annotation.Annotation()
                ann.source = 'tailtools'
                ann.type = self.type
                ann.seqid = rname
                ann.start = start
                ann.end = end - self.lap
                
                if ann.end != ann.start+1:
                    self.log.log("%s odd: start %d end %d\n" % (id, ann.start, ann.end))

                ann.strand = strand
                ann.score = None
                ann.phase = None
                ann.attr = { 
                    'id' : id,
                    'n' : str(depth[start+self.lap//2]),
                    'mean_tail' : str(AN_total[start+self.lap//2]/depth[start+self.lap//2]),
                    'mean_genomic' : str(AG_total[start+self.lap//2]/depth[start+self.lap//2]),
                    'color' : '#00ff00' if strand > 0 else '#0000ff' if strand < 0 else '#008080',
                    }
                print >> f, ann.as_gff()
            f.flush()

        f.close()
        
        self.log.datum('-','called peaks',n)
        
        grace.status('')
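
This peak caller extends the difference-array trick from Example #6 to weighted sums: alongside the +1/-1 depth deltas it accumulates tail length (AN) and genomic bases (AG) per span, so after the prefix-sum pass total[i] / depth[i] is the mean of that quantity over the reads covering base i, which is how mean_tail and mean_genomic are computed. A sketch with a single weight:

    def mean_weight_per_base(spans):
        # spans: (start, end, weight) tuples with end exclusive.
        length = 1 + max(end for start, end, weight in spans)
        depth = [0.0] * length
        total = [0.0] * length
        for start, end, weight in spans:
            depth[start] += 1.0
            depth[end] -= 1.0
            total[start] += weight
            total[end] -= weight
        for i in range(1, length):
            depth[i] += depth[i - 1]
            total[i] += total[i - 1]
        # Mean weight across the reads covering each base; None if uncovered.
        return [total[i] / depth[i] if depth[i] else None
                for i in range(length)]

    # Reads with tail lengths 10 and 30 overlap at base 2: mean is 20 there.
    assert mean_weight_per_base([(0, 3, 10), (2, 5, 30)])[2] == 20.0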
Example #15
    def run(self):
        assert self.ucsc_name, 'Need a UCSC genome name'

        scratch = _ucsc_scratch(self)

        # Load annotations

        source = 'tt-ucsc-%s-%s' % (self.ucsc_name, self.table)

        table = scratch.get_table(self.table)
        get_name = scratch.getter(self.name)
        get_product = scratch.getter(self.product)

        mrnas = []

        for item in table:
            ann = annotation.Annotation(
                seqid=item.chrom,
                source=source,
                type='mRNA',
                strand={
                    '+': 1,
                    '-': -1
                }[item.strand],
                start=int(item.txStart),
                end=int(item.txEnd),
                attr={
                    'ID': item.name,
                    'Name': get_name(item),
                    'Product': get_product(item),
                    #'UCSC_name2' : item.name2,
                })

            ann.record = item
            mrnas.append(ann)

        _uniquify_ids(mrnas)

        annotations = []

        for group in _grouped_features(mrnas):
            ID = '/'.join(item.attr['ID'] for item in group)
            for item in group:
                item.attr['Parent'] = ID
                item.attr['ID'] = item.attr['ID'] + '-mRNA'

            annotations.append(
                annotation.Annotation(
                    source=source,
                    type='gene',
                    seqid=group[0].seqid,
                    strand=group[0].strand,
                    start=min(item.start for item in group),
                    end=max(item.end for item in group),
                    attr={
                        'ID':
                        ID,
                        'Name':
                        annotation_tools.join_descriptions(
                            [item.attr['Name'] for item in group], '/'),
                        'Product':
                        annotation_tools.join_descriptions(
                            [item.attr['Product'] for item in group], '/'),
                        #'UCSC_name2' : annotation_tools.join_descriptions([ item.attr['UCSC_name2'] for item in group ], '/'),
                    }))
            for item in group:
                annotations.append(item)

                exonStarts = _parse_ints(item.record.exonStarts)
                exonEnds = _parse_ints(item.record.exonEnds)
                cdsStart = int(item.record.cdsStart)
                cdsEnd = int(item.record.cdsEnd)
                for start, end in zip(exonStarts, exonEnds):
                    annotations.append(
                        annotation.Annotation(source=source,
                                              type='exon',
                                              seqid=item.seqid,
                                              strand=item.strand,
                                              start=start,
                                              end=end,
                                              attr={
                                                  'Parent': item.attr['ID'],
                                              }))
                    if max(cdsStart, start) < min(cdsEnd, end):
                        annotations.append(
                            annotation.Annotation(
                                source=source,
                                type='CDS',
                                seqid=item.seqid,
                                strand=item.strand,
                                start=max(cdsStart, start),
                                end=min(cdsEnd, end),
                                #TODO: phase
                                attr={
                                    'Parent': item.attr['ID'],
                                }))

        # Load sequence

        if self.download:
            io.execute([
                'rsync', '-P', 'rsync://hgdownload.cse.ucsc.edu/goldenPath/' +
                self.ucsc_name + '/bigZips/chromFa.tar.gz',
                scratch.ucsc / 'chromFa.tar.gz'
            ])

        with workspace.tempspace() as temp:
            io.execute([
                'tar', '-C', temp.working_dir, '-zxf',
                scratch.ucsc / 'chromFa.tar.gz'
            ])
            sequences = [
                temp / item
                for item in natural_sorted(os.listdir(temp.working_dir))
            ]

            with open(temp / 'reference.gff', 'wb') as f:
                annotation.write_gff3_header(f)
                for item in annotations:
                    print >> f, item.as_gff()

            Make_tt_reference(
                self.output_dir,
                filenames=sequences + [temp / 'reference.gff'],
                index=self.index,
            ).run()
Example #16
    def run(self):
        work = self.get_workspace()

        data = []
        names = []
        sample_tags = []

        old = grace.status("Loading pickles")

        max_length = 1
        for i, item in enumerate(self.pickles):
            grace.status("Loading " + os.path.basename(item))
            f = io.open_possibly_compressed_file(item)
            name, tags, datum = pickle.load(f)
            f.close()
            data.append(datum)
            names.append(name)
            sample_tags.append(tags)

            try:
                max_length = max(
                    max_length,
                    max(item[0]  #tail_length
                        for feature in datum for item in feature.hits) + 1)
            except ValueError:
                pass

            if i == 0:
                annotations = datum

        grace.status(old)

        self.log.log("Maximum tail length %d\n" % max_length)

        for i in xrange(len(names)):
            n_alignments = 0
            for feature in data[i]:
                feature.total_count = len(feature.hits)
                feature.tail_counts = [0] * max_length

                n_alignments += feature.total_count

                for tail_length, adaptor_bases in feature.hits:
                    if adaptor_bases >= self.adaptor:
                        feature.tail_counts[tail_length] += 1

                del feature.hits

            self.log.datum(names[i], 'Alignments to features', n_alignments)

        counts = []  # [feature][sample](total_count, [taillength])

        for item in data:
            assert len(item) == len(data[0])
        for row in itertools.izip(*data):
            this_counts = [(item.total_count, item.tail_counts)
                           for item in row]
            counts.append(this_counts)

        n_features = len(counts)
        n_samples = len(data)

        sample_n = [[0] * n_samples for i in xrange(n_features)
                    ]  # [feature][sample]  Total count
        sample_n_tail = [[0] * n_samples for i in xrange(n_features)
                         ]  # [feature][sample]  Polya count
        sample_prop = [
            [None] * n_samples for i in xrange(n_features)
        ]  # [feature][sample]  Proportion of reads with tail (deprecated)
        sample_tail = [[None] * n_samples for i in xrange(n_features)
                       ]  # [feature][sample]  Mean tail length in each sample
        sample_sd_tail = [
            [None] * n_samples for i in xrange(n_features)
        ]  # [feature][sample]  Std dev tail length in each sample
        sample_total_tail = [[0] * n_samples for i in xrange(n_features)]

        sample_quantile_tail = collections.OrderedDict(
            (item, [[None] * n_samples for i in xrange(n_features)])
            for item in [25, 50, 75, 100])

        overall_n = [0] * n_features  # [feature]          Overall count
        overall_prop = [
            None
        ] * n_features  # [feature]          Overall proportion with tail
        overall_tail = [
            None
        ] * n_features  # [feature]          Overall mean tail length
        overall_n_tail = [
            0
        ] * n_features  # [feature]          Overall polya count
        for i, row in enumerate(counts):
            for j, (this_this_n, item) in enumerate(row):
                sample_n[i][j] = this_this_n
                sample_n_tail[i][j] = sum(item[self.tail:])
                sample_total_tail[i][j] = sum(
                    item[k] * k for k in xrange(self.tail, max_length))

                if sample_n[i][j] >= 1:
                    sample_prop[i][j] = float(
                        sample_n_tail[i][j]) / sample_n[i][j]

                if sample_n_tail[i][j] >= 1:
                    sample_tail[i][j] = float(
                        sample_total_tail[i][j]) / sample_n_tail[i][j]

                    for quantile in sample_quantile_tail:
                        counter = sample_n_tail[i][j] * quantile / 100.0
                        for k in xrange(self.tail, max_length):
                            counter -= item[k]
                            if counter <= 0: break
                        sample_quantile_tail[quantile][i][j] = k

                if sample_n_tail[i][j] >= 2:
                    sample_sd_tail[i][j] = math.sqrt(
                        float(
                            sum(item[k] * ((k - sample_tail[i][j])**2)
                                for k in xrange(self.tail, max_length))) /
                        (sample_n_tail[i][j] - 1))

            overall_n[i] = sum(sample_n[i])
            overall_n_tail[i] = sum(sample_n_tail[i])
            if overall_n[i] >= 1:
                overall_prop[i] = float(sum(sample_n_tail[i])) / overall_n[i]
            if overall_n_tail[i] >= 1:
                overall_tail[i] = float(sum(
                    sample_total_tail[i])) / overall_n_tail[i]

        for i, name in enumerate(names):
            this_total = sum(item[i] for item in sample_total_tail)
            this_n = sum(item[i] for item in sample_n_tail)
            if this_n:
                self.log.datum(name, 'Average poly-A tail',
                               float(this_total) / this_n)

        for i, name in enumerate(names):
            this_total = sum(item[i] for item in sample_n_tail)
            this_n = sum(item[i] for item in sample_n)
            if this_n:
                self.log.datum(name, 'Average proportion of reads with tail',
                               float(this_total) / this_n)

        with open(work / 'features-with-data.gff', 'wb') as f:
            annotation.write_gff3_header(f)
            for i, item in enumerate(annotations):
                item.attr['reads'] = str(overall_n[i])
                item.attr['reads_with_tail'] = str(overall_n_tail[i])
                item.attr['mean_tail'] = '%.1f' % overall_tail[
                    i] if overall_tail[i] else 'NA'
                item.attr['proportion_with_tail'] = '%.2f' % overall_prop[
                    i] if overall_prop[i] else 'NA'

                if overall_tail[i] is None:
                    item.attr['color'] = '#444444'
                else:
                    a = (overall_tail[i] - self.tail) / max(
                        1, max_length - self.tail)
                    item.attr['color'] = '#%02x%02x%02x' % (int(
                        a * 255), int(
                            (1 - abs(a * 2 - 1)) * 255), 255 - int(a * 255))
                #item.attr['color'] = ...
                print >> f, item.as_gff()

        comments = ['#Counts'] + [
            '#sampleTags=' + ','.join(tags) for tags in sample_tags
        ] + [
            '"Tail_count" group is number of reads with tail',
            '"Tail" group is mean tail per sample',
            '"Proportion" group is proportion of reads with tail',
        ]

        have_biotype = any("Biotype" in item.attr for item in annotations)
        have_parent = any("Parent" in item.attr for item in annotations)
        have_relation = any("Relation" in item.attr for item in annotations)
        have_antisense = any("Antisense_parent" in item.attr
                             for item in annotations)

        def counts_iter():
            for i in xrange(n_features):
                row = collections.OrderedDict()
                row['Feature'] = annotations[i].get_id()
                for j in xrange(n_samples):
                    row[('Count', names[j])] = '%d' % sample_n[i][j]

                row[('Annotation',
                     'Length')] = annotations[i].end - annotations[i].start
                row[('Annotation',
                     'gene')] = annotations[i].attr.get('Name', '')
                row[('Annotation',
                     'product')] = annotations[i].attr.get('Product', '')
                if have_biotype:
                    row[('Annotation',
                         'biotype')] = annotations[i].attr.get('Biotype', '')
                if have_parent:
                    row[('Annotation',
                         'parent')] = annotations[i].attr.get('Parent', '')
                if have_relation:
                    row[('Annotation', 'relation')] = annotations[i].attr.get(
                        'Relation', '')

                if have_antisense:
                    row[('Annotation',
                         'antisense_gene')] = annotations[i].attr.get(
                             'Antisense_name', '')
                    row[('Annotation',
                         'antisense_product')] = annotations[i].attr.get(
                             'Antisense_product', '')
                    row[('Annotation',
                         'antisense_biotype')] = annotations[i].attr.get(
                             'Antisense_biotype', '')
                    row[('Annotation',
                         'antisense_parent')] = annotations[i].attr.get(
                             'Antisense_parent', '')

                row[('Annotation', 'chromosome')] = str(annotations[i].seqid)
                row[('Annotation', 'strand')] = str(annotations[i].strand)
                row[('Annotation', 'start')] = str(annotations[i].start + 1)
                row[('Annotation', 'end')] = str(annotations[i].end)

                row[('Annotation', 'reads')] = str(overall_n[i])
                row[('Annotation', 'reads-with-tail')] = str(overall_n_tail[i])
                row[('Annotation', 'mean-tail')] = str_na(overall_tail[i])
                row[('Annotation',
                     'proportion-with-tail')] = str_na(overall_prop[i])
                for j in xrange(n_samples):
                    row[('Tail_count', names[j])] = '%d' % sample_n_tail[i][j]
                for j in xrange(n_samples):
                    row[('Tail', names[j])] = str_na(sample_tail[i][j])
                for j in xrange(n_samples):
                    row[('Tail_sd', names[j])] = str_na(sample_sd_tail[i][j])

                for quantile in sample_quantile_tail:
                    for j in xrange(n_samples):
                        row[('Tail_quantile_%d' % quantile,
                             names[j])] = str_na(
                                 sample_quantile_tail[quantile][i][j])

                for j in xrange(len(names)):
                    row[('Proportion', names[j])] = str_na(sample_prop[i][j])
                yield row

        io.write_csv(work / 'counts.csv', counts_iter(), comments=comments)

        def write_csv_matrix(filename, matrix):
            def emitter():
                for i in xrange(n_features):
                    row = collections.OrderedDict()
                    row["Feature"] = annotations[i].get_id()
                    for j in xrange(n_samples):
                        row[names[j]] = str_na(matrix[i][j])
                    yield row

            io.write_csv(filename, emitter())

        write_csv_matrix(work / 'read_count.csv', sample_n)
        write_csv_matrix(work / 'tail_count.csv', sample_n_tail)
        write_csv_matrix(work / 'tail.csv', sample_tail)
        write_csv_matrix(work / 'tail_sd.csv', sample_sd_tail)
        for quantile in sample_quantile_tail:
            write_csv_matrix(work / ('tail_quantile_%d.csv' % quantile),
                             sample_quantile_tail[quantile])

        #def raw_columns():
        #    for i in xrange(n_samples):
        #        row = collections.OrderedDict()
        #        row['Sample'] = names[i]
        #        for j in xrange(max_length):
        #            row['length-%d' % j] = str(i*max_length+j+1) #For R+, so 1 based
        #        yield row
        #io.write_csv(work/'raw-columns.csv', raw_columns())
        #
        ##Somewhat inefficient
        #def raw():
        #    for i in xrange(n_features):
        #        row = collections.OrderedDict()
        #        row['Feature'] = annotations[i].get_id()
        #        for j in xrange(n_samples):
        #            for k in xrange(max_length):
        #                row['%d %s' % (k,names[j])] = str( counts[i][j][1][k] )
        #        yield row
        #io.write_csv(work/'raw.csv', raw())

        def pooled():
            for i in xrange(n_features):
                row = collections.OrderedDict()
                row['Feature'] = annotations[i].get_id()
                for j in xrange(max_length):
                    row[str(j)] = str(
                        sum(counts[i][k][1][j] for k in xrange(n_samples)))
                yield row

        io.write_csv(work / 'pooled.csv', pooled())
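
The quantile loop above reads tail-length quantiles directly off the per-length count histogram: it starts a counter at n * quantile / 100 and walks the histogram subtracting counts; the length at which the counter is exhausted is the quantile, with no need to expand the histogram into individual observations. A self-contained sketch (counting from length 0, where the example starts at self.tail):

    def histogram_quantile(counts, quantile):
        # counts[k] = number of observations with value k.
        target = sum(counts) * quantile / 100.0
        for k, count in enumerate(counts):
            target -= count
            if target <= 0:
                return k
        return len(counts) - 1

    # Ten tails of lengths 0..9, one each:
    assert histogram_quantile([1] * 10, 50) == 4
    assert histogram_quantile([1] * 10, 100) == 9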
Example #17
    def run(self):
        annotations = [ ]
        for filename in self.filenames:
            for item in annotation.read_annotations(filename):
                if not selection.matches(self.select, [item.type]): continue
                if self.type:
                    item.type = self.type
                annotations.append(item)
        
        annotations.sort(key=lambda item: (item.type, item.seqid, item.strand, item.start))
        
        group = [ ]
        groups = [ ]
        def emit():
            if not group: return
            groups.append(group[:])
            del group[:]        
        type = None
        seqid = None
        strand = None
        end = 0
        for item in annotations:
            if item.type != type or item.seqid != seqid or item.strand != strand or item.start >= end:
                emit()
                type = item.type
                seqid = item.seqid
                strand = item.strand
                end = item.end-self.overlap
            group.append(item)
            end = max(item.end-self.overlap, end)
        emit()


        items = [ ]
        
        id_map = { }

        for group in groups:
            item = annotation.Annotation()
            item.source = group[0].source
            item.type = group[0].type
            item.seqid = group[0].seqid
            item.strand = group[0].strand
            item.start = min( item2.start for item2 in group )
            item.end = max( item2.end for item2 in group )
            item.score = None
            item.phase = None
            item.attr = { }
            
            for item2 in group:
                for key in item2.attr:
                    if key in item.attr: continue
                    item.attr[key] = join_descriptions([ item3.attr[key] for item3 in group if key in item3.attr ], self.joiner )

            item.parents = [ ]
            for item2 in group:
                if 'ID' in item2.attr:
                    assert item2.attr['ID'] not in id_map, 'Duplicate ID: '+item2.attr['ID']
                    id_map[item2.attr['ID']] = item.attr['ID']
                if 'Parent' in item2.attr:
                    item.parents.append(item2.attr['Parent'])
            
            items.append(item)
        
        for item in items:
            if item.parents:
                item.attr['Parent'] = join_descriptions([ id_map.get(parent,parent) for parent in item.parents ], ',')
        
        with open(self.prefix+'.gff','wb') as out_file:
            annotation.write_gff3_header(out_file)
            for item in items:
                print >> out_file, item.as_gff()
Example #18
    def run(self):
        assert len(self.pickles) > 0, "No samples to count."
        
        work = self.get_workspace()
        
        data = [ ]
        names = [ ]
        sample_tags = [ ]
        
        old = grace.status("Loading pickles")
        
        max_length = 1
        for i, item in enumerate(self.pickles):
            grace.status("Loading "+os.path.basename(item))
            f = io.open_possibly_compressed_file(item)
            name, tags, datum = pickle.load(f)
            f.close()
            data.append(datum)
            names.append(name)
            sample_tags.append(tags)
            
            try:
                max_length = max(max_length, max( 
                    item[0] #tail_length
                    for feature in datum
                    for item in feature.hits
                    ) + 1)
            except ValueError:
                pass
            
            if i == 0:
               annotations = datum
        
        grace.status(old)
        
        self.log.log("Maximum tail length %d\n" % max_length)

        for i in xrange(len(names)):        
            n_alignments = 0
            for feature in data[i]:
                feature.total_count = len(feature.hits)
                feature.tail_counts = [ 0 ] * max_length
                
                n_alignments += feature.total_count
                
                for tail_length, adaptor_bases in feature.hits:
                    if adaptor_bases >= self.adaptor:
                        feature.tail_counts[tail_length] += 1
                
                del feature.hits

            self.log.datum(names[i], 'Alignments to features', n_alignments)
                
        
        counts = [ ]  # [feature][sample](total_count, [taillength])
        
        for item in data: 
            assert len(item) == len(data[0])
        for row in itertools.izip(*data):
            this_counts = [ (item.total_count, item.tail_counts) for item in row ]
            counts.append(this_counts)
        
        n_features = len(counts)
        n_samples = len(data)
        
        sample_n = [ [0]*n_samples for i in xrange(n_features) ]        # [feature][sample]  Total count
        sample_n_tail = [ [0]*n_samples for i in xrange(n_features) ]   # [feature][sample]  Polya count
        sample_prop = [ [None]*n_samples for i in xrange(n_features) ]    # [feature][sample]  Proportion of reads with tail (deprecated)
        sample_tail = [ [None]*n_samples for i in xrange(n_features) ]    # [feature][sample]  Mean tail length in each sample
        sample_sd_tail = [ [None]*n_samples for i in xrange(n_features) ] # [feature][sample]  Std dev tail length in each sample
        sample_total_tail = [ [0]*n_samples for i in xrange(n_features) ]
        
        sample_quantile_tail = collections.OrderedDict( 
            (item, [ [None]*n_samples for i in xrange(n_features) ]) 
            for item in [25,50,75,100]
            )
        
        overall_n = [ 0 ]*n_features       # [feature]          Overall count
        overall_prop = [ None ]*n_features   # [feature]          Overall proportion with tail
        overall_tail = [ None ]*n_features   # [feature]          Overall mean tail length
        overall_n_tail = [ 0 ]*n_features  # [feature]          Overall polya count
        for i, row in enumerate(counts):
            for j, (this_this_n, item) in enumerate(row):
                sample_n[i][j] = this_this_n
                sample_n_tail[i][j] = sum(item[self.tail:])
                sample_total_tail[i][j] = sum( item[k]*k for k in xrange(self.tail,max_length) )

                if sample_n[i][j] >= 1:
                    sample_prop[i][j] = float(sample_n_tail[i][j])/sample_n[i][j]
                
                if sample_n_tail[i][j] >= 1:
                    sample_tail[i][j] = float(sample_total_tail[i][j])/sample_n_tail[i][j]
                
                    for quantile in sample_quantile_tail:
                        counter = sample_n_tail[i][j] * quantile / 100.0
                        for k in xrange(self.tail, max_length):
                            counter -= item[k]
                            if counter <= 0: break
                        sample_quantile_tail[quantile][i][j] = k
                
                if sample_n_tail[i][j] >= 2:
                    sample_sd_tail[i][j] = math.sqrt(
                        float(sum( item[k]*((k-sample_tail[i][j])**2) for k in xrange(self.tail,max_length) ))
                        / (sample_n_tail[i][j]-1)
                        )
                    
            overall_n[i] = sum(sample_n[i])
            overall_n_tail[i] = sum(sample_n_tail[i])
            if overall_n[i] >= 1:
                overall_prop[i] = float(sum(sample_n_tail[i]))/overall_n[i]
            if overall_n_tail[i] >= 1:
                overall_tail[i] = float(sum(sample_total_tail[i]))/overall_n_tail[i]
             
        for i, name in enumerate(names):
            this_total = sum( item[i] for item in sample_total_tail )
            this_n = sum( item[i] for item in sample_n_tail )
            if this_n:
                self.log.datum(name, 'Average poly-A tail', float(this_total)/this_n)
                
        for i, name in enumerate(names):
            this_total = sum( item[i] for item in sample_n_tail )
            this_n = sum( item[i] for item in sample_n )
            if this_n:
                self.log.datum(name, 'Average proportion of reads with tail', float(this_total)/this_n)
        
        with open(work/'features-with-data.gff','wb') as f:
            annotation.write_gff3_header(f)
            for i, item in enumerate(annotations):
                item.attr['reads'] = str(overall_n[i])
                item.attr['reads_with_tail'] = str(overall_n_tail[i])
                item.attr['mean_tail'] = '%.1f'%overall_tail[i] if overall_tail[i] else 'NA'
                item.attr['proportion_with_tail'] = '%.2f'%overall_prop[i] if overall_prop[i] else 'NA'
                
                if overall_tail[i] is None:
                    item.attr['color'] = '#444444'
                else:
                    a = (overall_tail[i]-self.tail)/max(1,max_length-self.tail)
                    item.attr['color'] = '#%02x%02x%02x' % (int(a*255),int((1-abs(a*2-1))*255),255-int(a*255))
                #item.attr['color'] = ...                
                print >> f, item.as_gff()
        
        
        comments = [ '#Counts' ] + [
            '#sampleTags='+','.join(tags)
            for tags in sample_tags
            ] + [
            '"Tail_count" group is number of reads with tail',
            '"Tail" group is mean tail per sample',
            '"Proportion" group is proportion of reads with tail',
            ]
            
        have_biotype = any("Biotype" in item.attr for item in annotations)
        have_parent = any("Parent" in item.attr for item in annotations)
        have_relation = any("Relation" in item.attr for item in annotations)
        have_antisense = any("Antisense_parent" in item.attr for item in annotations)

        def counts_iter():
            for i in xrange(n_features):
                row = collections.OrderedDict()
                row['Feature'] = annotations[i].get_id()
                for j in xrange(n_samples):
                    row[('Count',names[j])] = '%d' % sample_n[i][j]

                row[('Annotation','Length')] = annotations[i].end - annotations[i].start
                row[('Annotation','gene')] = annotations[i].attr.get('Name','')
                row[('Annotation','product')] = annotations[i].attr.get('Product','')
                if have_biotype:
                    row[('Annotation','biotype')] = annotations[i].attr.get('Biotype','')
                if have_parent:
                    row[('Annotation','parent')] = annotations[i].attr.get('Parent','')
                if have_relation:
                    row[('Annotation','relation')] = annotations[i].attr.get('Relation','')
                
                if have_antisense:
                    row[('Annotation','antisense_gene')] = annotations[i].attr.get('Antisense_name','')
                    row[('Annotation','antisense_product')] = annotations[i].attr.get('Antisense_product','')
                    row[('Annotation','antisense_biotype')] = annotations[i].attr.get('Antisense_biotype','')
                    row[('Annotation','antisense_parent')] = annotations[i].attr.get('Antisense_parent','')
                
                row[('Annotation','chromosome')] = str(annotations[i].seqid)
                row[('Annotation','strand')] = str(annotations[i].strand)
                row[('Annotation','start')] = str(annotations[i].start+1)
                row[('Annotation','end')] = str(annotations[i].end)
                
                row[('Annotation','reads')] = str(overall_n[i])
                row[('Annotation','reads-with-tail')] = str(overall_n_tail[i])
                row[('Annotation','mean-tail')] = str_na(overall_tail[i])
                row[('Annotation','proportion-with-tail')] = str_na(overall_prop[i])
                for j in xrange(n_samples):
                    row[('Tail_count',names[j])] = '%d' % sample_n_tail[i][j]
                for j in xrange(n_samples):
                    row[('Tail',names[j])] = str_na(sample_tail[i][j])
                for j in xrange(n_samples):
                    row[('Tail_sd',names[j])] = str_na(sample_sd_tail[i][j])
                
                for quantile in sample_quantile_tail:
                    for j in xrange(n_samples):
                        row[('Tail_quantile_%d'%quantile,names[j])] = str_na(sample_quantile_tail[quantile][i][j])                    
                
                for j in xrange(len(names)):
                    row[('Proportion',names[j])] = str_na(sample_prop[i][j])
                yield row
        io.write_csv(work/'counts.csv', counts_iter(), comments=comments)
        
        
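        # Each per-sample statistic is also written as a plain feature-by-sample matrix CSV.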
        def write_csv_matrix(filename, matrix):
            def emitter():
                for i in xrange(n_features):
                    row = collections.OrderedDict()
                    row["Feature"] = annotations[i].get_id()
                    for j in xrange(n_samples):
                        row[names[j]] = str_na(matrix[i][j])
                    yield row
            io.write_csv(filename, emitter())
            
        write_csv_matrix(work/'read_count.csv', sample_n)
        write_csv_matrix(work/'tail_count.csv', sample_n_tail)
        write_csv_matrix(work/'tail.csv', sample_tail)
        write_csv_matrix(work/'tail_sd.csv', sample_sd_tail)
        for quantile in sample_quantile_tail:
            write_csv_matrix(work/('tail_quantile_%d.csv'%quantile), sample_quantile_tail[quantile])


        
        def pooled():
            for i in xrange(n_features):
                row = collections.OrderedDict()
                row['Feature'] = annotations[i].get_id()
                for j in xrange(max_length):
                    row[str(j)] = str( sum( counts[i][k][1][j] for k in xrange(n_samples) ) )
                yield row
        io.write_csv(work/'pooled.csv', pooled())
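The pooled() writer above sums the per-sample tail-length histograms into one histogram per feature. A minimal standalone sketch of that step, assuming (as the counts[i][k][1][j] indexing suggests) that each per-sample entry is a tuple whose second element is the histogram:

def pool_histograms(feature_counts, max_length):
    # feature_counts: one feature's per-sample entries; entry[1] is assumed
    # to be that sample's tail-length histogram (inferred from the indexing
    # above, not confirmed elsewhere in this example).
    pooled = [0.0] * max_length
    for entry in feature_counts:
        for length, count in enumerate(entry[1]):
            pooled[length] += count
    return pooled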
Example #19
0
    def run(self):
        spans = collections.defaultdict(list)
        
        #for item in legion.parallel_imap(self._load_bam, self.filenames):
        #    for key,value in item.items():
        for filename in self.filenames:
            for key,value in self._load_bam(filename).items():
                spans[key].extend(value)

        grace.status('Calling peaks')

        f = open(self.prefix+'.gff', 'wb')
        annotation.write_gff3_header(f)
        
        n = 0

        for (rname, strand), span_list in spans.items():
            length = 1+max( item[1] for item in span_list )
            depth = [ 0.0 ] * length
            AN_total = [ 0.0 ] * length
            AG_total = [ 0.0 ] * length 
            for start, end, AN, AG in span_list:
                depth[start] += 1.0
                depth[end] -= 1.0
                AN_total[start] += AN
                AN_total[end] -= AN
                AG_total[start] += AG
                AG_total[end] -= AG
            
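            # Prefix sum: integrate the +/- deltas into running per-base totals.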
            for i in xrange(1,length):
                depth[i] += depth[i-1]
                AN_total[i] += AN_total[i-1]
                AG_total[i] += AG_total[i-1]

            for start, end in self._find_spans(depth):
                if end-self.lap-start <= 0: continue
                
                n += 1
                
                peak_id = 'peak%d' % n
                
                ann = annotation.Annotation()
                ann.source = 'tailtools'
                ann.type = self.type
                ann.seqid = rname
                ann.start = start
                ann.end = end - self.lap
                assert ann.end == ann.start+1
                ann.strand = strand
                ann.score = None
                ann.phase = None
                ann.attr = { 
                    'id' : peak_id,
                    'n' : str(depth[start+self.lap//2]),
                    'mean_tail' : str(AN_total[start+self.lap//2]/depth[start+self.lap//2]),
                    'mean_genomic' : str(AG_total[start+self.lap//2]/depth[start+self.lap//2]),
                    'color' : '#00ff00' if strand > 0 else '#0000ff' if strand < 0 else '#008080',
                    }
                print >> f, ann.as_gff()
            f.flush()

        f.close()
        
        self.log.datum('-','called peaks',n)
        
        grace.status('')
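The peak callers in Examples #19 and #20 both build per-base coverage with a difference array: each span adds +1 at its start and -1 at its (exclusive) end, and one prefix-sum pass turns the deltas into depth in O(bases + spans) rather than O(bases × spans). A minimal sketch of the trick in isolation:

def coverage_from_spans(spans, length):
    # spans: (start, end) half-open intervals on a single reference sequence.
    depth = [0.0] * length
    for start, end in spans:
        depth[start] += 1.0     # coverage begins here
        depth[end] -= 1.0       # coverage ends here (end is exclusive)
    for i in range(1, length):
        depth[i] += depth[i-1]  # prefix sum integrates the deltas
    return depth

# coverage_from_spans([(0,3), (2,5)], 6) -> [1.0, 1.0, 2.0, 1.0, 1.0, 0.0]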
Example #20
0
    def run(self):
        assert self.what in ('fragment','5prime','3prime'), 'Unknown option for --what.'
                    
        spans = collections.defaultdict(list)
        
        for item in legion.parallel_imap(self._load_bam, self.filenames):
            for key,value in item.items():
                spans[key].extend(value)
        

        grace.status('Calling peaks')

        f = open(self.prefix+'.gff', 'wb')
        annotation.write_gff3_header(f)
        
        n = 0

        for (rname, strand), span_list in spans.items():
            depth = [ 0.0 ] * (1+max( item[1] for item in span_list ))
            for start, end in span_list:
                depth[start] += 1.0
                depth[end] -= 1.0
            
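            # Optionally subtract a fraction of opposite-strand span deltas to correct strand bleed-through.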
            if self.crosstalk and strand and (rname,-strand) in spans:
                for start, end in spans[(rname,-strand)]:
                    if start < len(depth): depth[start] -= self.crosstalk
                    if end < len(depth): depth[end] += self.crosstalk
            
            for i in xrange(1,len(depth)):
                depth[i] += depth[i-1]

            if self.crosstalk:
                for i in xrange(len(depth)):
                    depth[i] = max(0.0,depth[i])

            for start, end in self._find_spans(depth):
                if end-self.lap-start <= 0: continue
                
                n += 1
                
                peak_id = 'peak%d' % n
                
                ann = annotation.Annotation()
                ann.source = 'nesoni'
                ann.type = self.type
                ann.seqid = rname
                ann.start = start
                ann.end = end - self.lap
                ann.strand = strand
                ann.score = None
                ann.phase = None
                ann.attr = { 
                    'id' : peak_id,
                    'color' : '#00ff00' if strand > 0 else '#0000ff' if strand < 0 else '#008080',
                    }
                print >> f, ann.as_gff()
            f.flush()
            

        f.close()
        
        self.log.datum('-','called peaks',n)
        
        grace.status('')
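Neither example shows _find_spans(). A plausible minimal version, offered purely as a hypothetical sketch, reports maximal runs where depth stays at or above a threshold; the real method may add smoothing or minimum-width rules not reproduced here:

def find_spans(depth, threshold):
    # Hypothetical stand-in for _find_spans(): yields (start, end) half-open
    # intervals for each maximal run with depth >= threshold.
    spans = []
    start = None
    for i, d in enumerate(depth):
        if d >= threshold:
            if start is None:
                start = i
        elif start is not None:
            spans.append((start, i))
            start = None
    if start is not None:
        spans.append((start, len(depth)))
    return spans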
Example #21
0
    def run(self):
        work = self.get_workspace()
        
        data = [ ]
        names = [ ]
        sample_tags = [ ]
        
        for item in self.pickles:
            f = io.open_possibly_compressed_file(item)
            name, tags, datum = pickle.load(f)
            f.close()
            data.append(datum)
            names.append(name)
            sample_tags.append(tags)
        
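        # Every pickle carries the same feature list; use the first as the annotation set.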
        annotations = data[0]
        
        all_lengths = [ 
            item[2]   # tail_length; hits items are (rel_start, rel_end, tail_length, ...)
            for sample in data
            for feature in sample
            for item in feature.hits
            ]
        if all_lengths: 
            max_length = max(all_lengths)+1
        else:
            max_length = 1
        del all_lengths
        
        for i, sample in enumerate(data):
            n_alignments = 0
            n_duplicates = 0
            n_good = 0
            for feature in sample:
                feature.tail_counts = [ 0.0 ] * max_length
                
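                # Group hits by (rel_start, rel_end); positions with more than
                # self.saturation alignments are down-weighted as duplicates.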
                buckets = collections.defaultdict(list)
                for item in feature.hits:
                    rel_start,rel_end,tail_length = item[:3]
                    buckets[ (rel_start,rel_end) ].append(tail_length)
                for bucket in buckets.values():
                    n_alignments += len(bucket)
                    n_good += 1
                    if self.saturation < 1 or len(bucket) <= self.saturation:
                        weight = 1.0
                    else:
                        weight = float(self.saturation) / len(bucket)
                        n_duplicates += len(bucket)
                    for tail_length in bucket:
                        feature.tail_counts[tail_length] += weight

            self.log.datum(names[i], 'Alignments to features', n_alignments)
            if self.saturation >= 1:
                self.log.datum(names[i], 'Proportion of alignments with duplicate start and end position', float(n_duplicates)/max(1,n_alignments))
                self.log.datum(names[i], 'Alignments to features after deduplication', n_good)
                
        
        counts = [ ]  # [feature][sample][taillength]
        
        for item in data: 
            assert len(item) == len(data[0])
        for row in itertools.izip(*data):
            this_counts = [ item.tail_counts for item in row ]
            counts.append(this_counts)
        
        sample_n = [ ]        # [feature][sample]  Total count
        sample_n_tail = [ ]   # [feature][sample]  Polya count
        sample_prop = [ ]     # [feature][sample]  Proportion of reads with tail
        sample_tail = [ ]     # [feature][sample]  Mean tail length in each sample
        sample_total_tail = [ ]
        overall_n = [ ]
        overall_prop = [ ]    # [feature]          Overall proportion with tail
        overall_tail = [ ]    # [feature]          Overall mean tail length
        overall_n_tail = [ ]  # [feature]          Overall polya count
        overall_total_tail = [ ]
        for row in counts:
            this_n = [ ]
            this_n_tail = [ ]
            this_prop = [ ]
            this_tail = [ ]
            this_total_tail = [ ]
            for item in row:
                this_this_n = sum(item)
                this_n.append( this_this_n )

                this_this_n_tail = sum(item[self.tail:])
                this_n_tail.append( this_this_n_tail )

                this_this_total_tail = sum( item[i]*i for i in xrange(self.tail,max_length) )
                this_total_tail.append( this_this_total_tail )

                if this_this_n < 1:
                    this_prop.append(None)
                else:
                    this_prop.append(float(this_this_n_tail)/this_this_n)
                if this_this_n_tail < 1:
                    this_tail.append(None)
                else:
                    this_tail.append(this_this_total_tail/this_this_n_tail)

            sample_n.append(this_n)
            sample_n_tail.append(this_n_tail)
            sample_prop.append(this_prop)
            sample_tail.append(this_tail)
            sample_total_tail.append(this_total_tail)
            overall_n.append(sum(this_n))
            overall_n_tail.append(sum(this_n_tail))
            overall_total_tail.append(sum(this_total_tail))
            if sum(this_n) < 1:
                overall_prop.append(None)
            else:
                overall_prop.append(float(sum(this_n_tail))/sum(this_n))
            if sum(this_n_tail) < 1:
                overall_tail.append(None)
            else:
                overall_tail.append(float(sum(this_total_tail))/sum(this_n_tail))
             
        for i, name in enumerate(names):
            this_total = sum( item[i] for item in sample_total_tail )
            this_n = sum( item[i] for item in sample_n_tail )
            if this_n:
                self.log.datum(name, 'Average poly-A tail', float(this_total)/this_n)
                
        for i, name in enumerate(names):
            this_total = sum( item[i] for item in sample_n_tail )
            this_n = sum( item[i] for item in sample_n )
            if this_n:
                self.log.datum(name, 'Average proportion of reads with tail', float(this_total)/this_n)
            
        
                
        
        with open(work/'features-with-data.gff','wb') as f:
            annotation.write_gff3_header(f)
            for i, item in enumerate(annotations):
                item.attr['reads'] = str(overall_n[i])
                item.attr['reads_with_tail'] = str(overall_n_tail[i])
                item.attr['mean_tail'] = '%.1f'%overall_tail[i] if overall_tail[i] is not None else 'NA'
                item.attr['proportion_with_tail'] = '%.2f'%overall_prop[i] if overall_prop[i] is not None else 'NA'
                
                if overall_tail[i] is None:
                    item.attr['color'] = '#444444'
                else:
                    a = (overall_tail[i]-self.tail)/max(1,max_length-self.tail)
                    item.attr['color'] = '#%02x%02x%02x' % (int(a*255),int((1-abs(a*2-1))*255),255-int(a*255))
                # a in [0,1]: short mean tails map to blue, mid-range to green, long to red.
                print >> f, item.as_gff()
        
        
        comments = [ '#Counts' ] + [
            '#sampleTags='+','.join(tags)
            for tags in sample_tags
            ] + [
            '"Tail_count" group is number of reads with tail',
            '"Tail" group is mean tail per sample',
            '"Proportion" group is proportion of reads with tail',
            ]

        def counts_iter():
            for i in xrange(len(counts)):
                row = collections.OrderedDict()
                row['Feature'] = annotations[i].get_id()
                for j in xrange(len(names)):
                    row[('Count',names[j])] = '%d' % sample_n[i][j]

                row[('Annotation','Length')] = annotations[i].end - annotations[i].start
                row[('Annotation','gene')] = annotations[i].attr.get('Name','')
                row[('Annotation','product')] = annotations[i].attr.get('Product','')
                #row[('Annotation','Strand')] = str(annotations[i].strand)
                row[('Annotation','reads')] = str(overall_n[i])
                row[('Annotation','reads-with-tail')] = str(overall_n_tail[i])
                row[('Annotation','mean-tail')] = str(overall_tail[i]) if overall_tail[i] is not None else 'NA'
                row[('Annotation','proportion-with-tail')] = str(overall_prop[i]) if overall_prop[i] is not None else 'NA'
                for j in xrange(len(names)):
                    row[('Tail_count',names[j])] = '%d' % sample_n_tail[i][j]
                for j in xrange(len(names)):
                    row[('Tail',names[j])] = str(sample_tail[i][j]) if sample_tail[i][j] is not None else 'NA'
                for j in xrange(len(names)):
                    row[('Proportion',names[j])] = str(sample_prop[i][j]) if sample_prop[i][j] is not None else 'NA'
                yield row
        io.write_csv(work/'counts.csv', counts_iter(), comments=comments)

        def raw_columns():
            for i in xrange(len(names)):
                row = collections.OrderedDict()
                row['Sample'] = names[i]
                for j in xrange(max_length):
                    row['length-%d' % j] = str(i*max_length+j+1) #For R+, so 1 based
                yield row
        io.write_csv(work/'raw-columns.csv', raw_columns())

        #Somewhat inefficient        
        def raw():
            for i in xrange(len(counts)):
                row = collections.OrderedDict()
                row['Feature'] = annotations[i].get_id()
                for j in xrange(len(names)):
                    for k in xrange(max_length):
                        row['%d %s' % (k,names[j])] = str( counts[i][j][k] )
                yield row
        io.write_csv(work/'raw.csv', raw())
        
        def pooled():
            for i in xrange(len(counts)):
                row = collections.OrderedDict()
                row['Feature'] = annotations[i].get_id()
                for j in xrange(max_length):
                    row[str(j)] = str( sum( counts[i][k][j] for k in xrange(len(names)) ) )
                yield row
        io.write_csv(work/'pooled.csv', pooled())
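The per-sample statistics in Example #21 reduce each feature's tail-length histogram to four numbers; None stands for the 'NA' written to the CSV. A minimal sketch of that reduction:

def tail_stats(histogram, min_tail):
    # histogram[k] = (possibly weighted) count of reads with tail length k.
    n = sum(histogram)                  # total reads
    n_tail = sum(histogram[min_tail:])  # reads counted as having a tail
    total_tail = sum(count * length
                     for length, count in enumerate(histogram)
                     if length >= min_tail)
    prop = float(n_tail) / n if n >= 1 else None
    mean = float(total_tail) / n_tail if n_tail >= 1 else None
    return n, n_tail, mean, prop

For raw.csv, raw-columns.csv records where each sample's histogram lands: sample i occupies 1-based columns i*max_length+1 through (i+1)*max_length, matching the str(i*max_length+j+1) written above.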