def run(self):
    if self.output is not None:
        out_file = open(self.output, 'wb')
    else:
        out_file = sys.stdout

    annotation.write_gff3_header(out_file)

    for filename in self.filenames:
        for item in annotation.read_annotations(filename):
            if not selection.matches(self.select, [item.type]):
                continue

            if 'ID' not in item.attr and 'locus_tag' in item.attr:
                item.attr['ID'] = item.attr['locus_tag']

            if 'color' not in item.attr:
                if item.type == 'CDS': item.attr['color'] = '#008800'
                if item.type == 'rRNA': item.attr['color'] = '#bb0000'
                if item.type == 'tRNA': item.attr['color'] = '#bb00bb'
                if item.type == 'misc_feature': item.attr['color'] = '#8888ff'

            print >> out_file, item.as_gff()

    if self.output is not None:
        out_file.close()
def set_annotations(self, filenames):
    f = self.open('reference.gff', 'wb')
    annotation.write_gff3_header(f)
    for filename in filenames:
        for feature in annotation.read_annotations(filename):
            print >> f, feature.as_gff()
    f.close()
def run(self):
    assert self.change_strand in STRAND_CHANGE, 'Unknown way to change strand.'
    strand_changer = STRAND_CHANGE[self.change_strand]

    shift_start_absolute, shift_start_proportion = decode_shift(self.shift_start)
    shift_end_absolute, shift_end_proportion = decode_shift(self.shift_end)

    renames = [ ]
    if self.rename:
        for item in self.rename.split(','):
            new, old = item.split('=')
            if new != old:
                renames.append((new, old))

    out_file = open(self.prefix + '.gff', 'wb')
    annotation.write_gff3_header(out_file)

    for filename in self.filenames:
        for item in annotation.read_annotations(filename):
            if not selection.matches(self.select, [item.type]):
                continue
            if self.type:
                item.type = self.type

            length = item.end - item.start
            shift_start = int(math.floor(0.5 + shift_start_absolute + shift_start_proportion * length))
            shift_end = int(math.floor(0.5 + shift_end_absolute + shift_end_proportion * length))

            if item.strand == 1:
                item.start += shift_start
                item.end += shift_end
            elif item.strand == -1:
                item.end -= shift_start
                item.start -= shift_end
            item.start = max(0, item.start)  #IGV complains

            item.strand = strand_changer[item.strand]

            old_attr = item.attr.copy()
            for new, old in renames:
                if old in item.attr:
                    del item.attr[old]
            for new, old in renames:
                if old in old_attr:
                    item.attr[new] = old_attr[old]

            print >> out_file, item.as_gff()

    out_file.close()
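# Illustrative sketch only -- STRAND_CHANGE and decode_shift are defined elsewhere in the
# real module and are not shown in this file.  The versions below are assumptions intended
# to show the shapes that run() above relies on: STRAND_CHANGE maps each allowed
# --change-strand option to a strand translation table, and decode_shift() splits a shift
# given either as an absolute number of bases or as a percentage of feature length into an
# (absolute, proportion) pair.  The option names used as keys here are also assumptions.
STRAND_CHANGE = {
    'no':    {1: 1,  -1: -1, 0: 0, None: None},   # leave strands unchanged
    'flip':  {1: -1, -1: 1,  0: 0, None: None},   # swap forward and reverse
    'clear': {1: 0,  -1: 0,  0: 0, None: None},   # make every feature strandless
}

def decode_shift(text):
    """Parse a shift such as '10' (bases) or '25%' (of feature length)
    into an (absolute, proportion) pair, as consumed by run() above."""
    text = str(text).strip()
    if text.endswith('%'):
        return 0.0, float(text[:-1]) / 100.0
    return float(text), 0.0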
def run(self):
    annotations = [ ]
    for filename in self.filenames:
        for item in annotation.read_annotations(filename):
            if not selection.matches(self.select, [item.type]):
                continue
            if self.type:
                item.type = self.type
            annotations.append(item)

    annotations.sort(key=lambda item: (item.seqid, item.strand, item.start))

    group = [ ]
    groups = [ ]
    def emit():
        if not group: return
        groups.append(group[:])
        del group[:]

    seqid = None
    strand = None
    end = 0
    for item in annotations:
        if item.seqid != seqid or item.strand != strand or item.start >= end:
            emit()
            seqid = item.seqid
            strand = item.strand
            end = item.end - self.overlap
        group.append(item)
        end = max(item.end - self.overlap, end)
    emit()

    out_file = open(self.prefix + '.gff', 'wb')
    annotation.write_gff3_header(out_file)

    for group in groups:
        item = annotation.Annotation()
        item.source = group[0].source
        item.type = join_descriptions( item2.type for item2 in group )
        item.seqid = group[0].seqid
        item.strand = group[0].strand
        item.start = min( item2.start for item2 in group )
        item.end = max( item2.end for item2 in group )
        item.score = None
        item.phase = None
        item.attr = { }
        for item2 in group:
            for key in item2.attr:
                if key in item.attr: continue
                item.attr[key] = join_descriptions( item3.attr[key] for item3 in group if key in item3.attr )
        print >> out_file, item.as_gff()
    out_file.close()
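# Illustrative sketch only -- join_descriptions is imported from elsewhere; this is an
# assumed implementation that matches how it is called above (an iterable of strings,
# plus an optional separator): merge the distinct non-empty values, preserving order,
# so that a group of overlapping features contributes each attribute value once.
def join_descriptions(descriptions, joiner='/'):
    seen = [ ]
    for text in descriptions:
        if text and text not in seen:
            seen.append(text)
    return joiner.join(seen)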
def run(self):
    spans = collections.defaultdict(list)
    for item in legion.parallel_imap(self._load_bam, self.filenames):
        for key, value in item.items():
            spans[key].extend(value)

    grace.status('Calling peaks')

    f = open(self.prefix + '.gff', 'wb')
    annotation.write_gff3_header(f)

    n = 0
    for (rname, strand), span_list in spans.items():
        depth = [ 0.0 ] * (1 + max( item[1] for item in span_list ))
        for start, end in span_list:
            depth[start] += 1.0
            depth[end] -= 1.0
        for i in xrange(1, len(depth)):
            depth[i] += depth[i-1]

        for start, end in self._find_spans(depth):
            if end - self.lap - start <= 0:
                continue
            n += 1
            id = 'peak%d' % n

            ann = annotation.Annotation()
            ann.source = 'nesoni'
            ann.type = self.type
            ann.seqid = rname
            ann.start = start
            ann.end = end - self.lap
            ann.strand = strand
            ann.score = None
            ann.phase = None
            ann.attr = {
                'id' : id,
                'color' : '#00ff00' if strand > 0 else '#0000ff' if strand < 0 else '#008080',
            }
            print >> f, ann.as_gff()
    f.flush()
    f.close()

    self.log.datum('-', 'called peaks', n)
    grace.status('')
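# Illustrative sketch only -- _find_spans is defined on the peak-calling class elsewhere.
# The depth array above is built as a difference array from span endpoints and then
# cumulatively summed; assumed behaviour of _find_spans is to walk that per-base depth
# and yield (start, end) half-open intervals where coverage reaches some cutoff.  The
# attribute name self.min_depth used here is an assumption for illustration.
def _find_spans(self, depth):
    spans = [ ]
    start = None
    for i, value in enumerate(depth):
        if value >= self.min_depth:
            if start is None:
                start = i
        elif start is not None:
            spans.append((start, i))
            start = None
    if start is not None:
        spans.append((start, len(depth)))
    return spans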
def run(self):
    features_parent = [
        _Related_feature(item, item.start, item.end, [])
        for item in annotation.read_annotations(self.parent)
        if selection.matches(self.select_parent, [item.type])
    ]
    features_child = [
        _Related_feature(item, item.start, item.end, [])
        for item in annotation.read_annotations(self.child)
        if selection.matches(self.select_child, [item.type])
    ]

    index = { }
    for item in features_child:
        if item.feature.seqid not in index:
            index[item.feature.seqid] = span_index.Span_index()
        index[item.feature.seqid].insert(item)
    for value in index.values():
        value.prepare()

    for item_1 in features_parent:
        if item_1.feature.strand == 1:
            start = item_1.start - self.upstrand
            end = item_1.end + self.downstrand
        elif item_1.feature.strand == -1:
            start = item_1.start - self.downstrand
            end = item_1.end + self.upstrand
        else:
            start = item_1.start - max(self.upstrand, self.downstrand)
            end = item_1.end + max(self.upstrand, self.downstrand)
        if item_1.feature.seqid in index:
            for item_2 in index[item_1.feature.seqid].get(start, end):
                item_1.relations.append(item_2)
                item_2.relations.append(item_1)

    for item in features_parent:
        item.modify_with_relations(self.use, self.to_child, self.to_parent)

    with open(self.prefix + '-parent.gff', 'wb') as f:
        annotation.write_gff3_header(f)
        for item in features_parent:
            print >> f, item.feature.as_gff()

    with open(self.prefix + '-child.gff', 'wb') as f:
        annotation.write_gff3_header(f)
        for item in features_child:
            print >> f, item.feature.as_gff()
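# Illustrative sketch only -- span_index.Span_index is provided by nesoni and its real
# implementation is not shown here.  run() above only needs insert(), prepare() and
# get(start, end); an assumed, unoptimised equivalent that returns every inserted item
# overlapping the query interval would look like this.
class Span_index(object):
    def __init__(self):
        self.items = [ ]

    def insert(self, item):
        self.items.append(item)

    def prepare(self):
        # Sorting by start position; a real index would also build a lookup structure.
        self.items.sort(key=lambda item: item.start)

    def get(self, start, end):
        # Linear scan for overlap with the half-open interval [start, end).
        return [ item for item in self.items if item.start < end and item.end > start ]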
def run(self):
    assert self.change_strand in STRAND_CHANGE, 'Unknown way to change strand.'
    strand_changer = STRAND_CHANGE[self.change_strand]

    out_file = open(self.prefix + '.gff', 'wb')
    annotation.write_gff3_header(out_file)

    for filename in self.filenames:
        for item in annotation.read_annotations(filename):
            if not selection.matches(self.select, [item.type]):
                continue
            if item.strand == 1:
                item.start += self.shift_start
                item.end += self.shift_end
            elif item.strand == -1:
                item.end -= self.shift_start
                item.start -= self.shift_end
            item.strand = strand_changer[item.strand]
            print >> out_file, item.as_gff()
    out_file.close()
def run(self):
    assert self.ucsc_name, 'Need a UCSC genome name'

    scratch = _ucsc_scratch(self)

    # Load annotations

    source = 'tt-ucsc-%s-%s' % (self.ucsc_name, self.table)

    table = scratch.get_table(self.table)
    get_name = scratch.getter(self.name)
    get_product = scratch.getter(self.product)

    mrnas = [ ]
    for item in table:
        ann = annotation.Annotation(
            seqid = item.chrom,
            source = source,
            type = 'mRNA',
            strand = {'+':1, '-':-1}[item.strand],
            start = int(item.txStart),
            end = int(item.txEnd),
            attr = {
                'ID' : item.name,
                'Name' : get_name(item),
                'Product' : get_product(item),
                #'UCSC_name2' : item.name2,
            }
        )
        ann.record = item
        mrnas.append(ann)

    _uniquify_ids(mrnas)

    annotations = [ ]

    for group in _grouped_features(mrnas):
        ID = '/'.join(item.attr['ID'] for item in group)
        for item in group:
            item.attr['Parent'] = ID
            item.attr['ID'] = item.attr['ID'] + '-mRNA'

        annotations.append(annotation.Annotation(
            source = source,
            type = 'gene',
            seqid = group[0].seqid,
            strand = group[0].strand,
            start = min(item.start for item in group),
            end = max(item.end for item in group),
            attr = {
                'ID' : ID,
                'Name' : annotation_tools.join_descriptions([ item.attr['Name'] for item in group ], '/'),
                'Product' : annotation_tools.join_descriptions([ item.attr['Product'] for item in group ], '/'),
                #'UCSC_name2' : annotation_tools.join_descriptions([ item.attr['UCSC_name2'] for item in group ], '/'),
            }
        ))

        for item in group:
            annotations.append(item)

            exonStarts = _parse_ints(item.record.exonStarts)
            exonEnds = _parse_ints(item.record.exonEnds)
            cdsStart = int(item.record.cdsStart)
            cdsEnd = int(item.record.cdsEnd)
            for start, end in zip(exonStarts, exonEnds):
                annotations.append(annotation.Annotation(
                    source = source,
                    type = 'exon',
                    seqid = item.seqid,
                    strand = item.strand,
                    start = start,
                    end = end,
                    attr = {
                        'Parent' : item.attr['ID'],
                    }
                ))
                if max(cdsStart, start) < min(cdsEnd, end):
                    annotations.append(annotation.Annotation(
                        source = source,
                        type = 'CDS',
                        seqid = item.seqid,
                        strand = item.strand,
                        start = max(cdsStart, start),
                        end = min(cdsEnd, end),
                        #TODO: phase
                        attr = {
                            'Parent' : item.attr['ID'],
                        }
                    ))

    # Load sequence

    if self.download:
        io.execute([
            'rsync', '-P',
            'rsync://hgdownload.cse.ucsc.edu/goldenPath/' + self.ucsc_name + '/bigZips/chromFa.tar.gz',
            scratch.ucsc/'chromFa.tar.gz',
        ])

    with workspace.tempspace() as temp:
        io.execute(['tar', '-C', temp.working_dir, '-zxf', scratch.ucsc/'chromFa.tar.gz'])

        sequences = [ temp/item for item in natural_sorted(os.listdir(temp.working_dir)) ]

        with open(temp/'reference.gff', 'wb') as f:
            annotation.write_gff3_header(f)
            for item in annotations:
                print >> f, item.as_gff()

        Make_tt_reference(
            self.output_dir,
            filenames = sequences + [ temp/'reference.gff' ],
            index = self.index,
        ).run()
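# Illustrative sketch only -- _parse_ints is a small helper defined elsewhere.  UCSC table
# fields such as exonStarts/exonEnds are comma-separated integers with a trailing comma
# (for example "100,200,300,"), so an assumed implementation simply drops empty parts:
def _parse_ints(text):
    return [ int(part) for part in text.split(',') if part.strip() ]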
def run(self):
    work = self.get_workspace()
    work.update_param(remove=['tail_tools_reference_version'])

    nesoni.Make_reference(
        self.output_dir,
        filenames = self.filenames,
        snpeff = False,
        cs = 'ifavailable' if self.index else False,
        ls = False,
        bowtie = 'ifavailable' if self.index else False,
    ).run()

    annotations = list(annotation.read_annotations(work/'reference.gff'))
    annotation.link_up_annotations(annotations)

    with open(work/'utr.gff', 'wb') as f:
        annotation.write_gff3_header(f)
        for gene in annotations:
            if gene.type != 'gene': continue
            mrnas = [ item for item in gene.children if item.type == 'mRNA' ]

            utr_5primes = [ ]

            for mrna in mrnas:
                cdss = [ item for item in mrna.children if item.type == 'CDS' ]
                exons = [ item for item in mrna.children if item.type == 'exon' ]
                if not cdss or not exons: continue
                if gene.strand >= 0:
                    cds_3prime = max(item.end for item in cdss)
                    for item in exons:
                        if item.end > cds_3prime:
                            utr_5primes.append(max(item.start, cds_3prime))
                else:
                    cds_3prime = min(item.start for item in cdss)
                    for item in exons:
                        if item.start < cds_3prime:
                            utr_5primes.append(min(item.end, cds_3prime))

            if gene.strand >= 0:
                utr_start = max(utr_5primes) if utr_5primes else gene.end
                utr_end = max(utr_start + 1, gene.end)
            else:
                utr_end = min(utr_5primes) if utr_5primes else gene.start
                utr_start = min(gene.start, utr_end - 1)

            attr = gene.attr.copy()
            attr['Parent'] = attr['ID']
            attr['ID'] = attr['ID'] + '-3UTR'

            thing = annotation.Annotation(
                source = 'tt',
                type = 'three_prime_utr',
                seqid = gene.seqid,
                strand = gene.strand,
                start = utr_start,
                end = utr_end,
                attr = attr,
            )
            print >> f, thing.as_gff()

    work.update_param(tail_tools_reference_version=work.VERSION)
def run(self):
    annotations = [ ]
    for filename in self.filenames:
        for item in annotation.read_annotations(filename):
            if not selection.matches(self.select, [item.type]):
                continue
            if self.type:
                item.type = self.type
            annotations.append(item)

    annotations.sort(key=lambda item: (item.type, item.seqid, item.strand, item.start))

    group = [ ]
    groups = [ ]
    def emit():
        if not group: return
        groups.append(group[:])
        del group[:]

    type = None
    seqid = None
    strand = None
    end = 0
    for item in annotations:
        if item.type != type or item.seqid != seqid or item.strand != strand or item.start >= end:
            emit()
            type = item.type
            seqid = item.seqid
            strand = item.strand
            end = item.end - self.overlap
        group.append(item)
        end = max(item.end - self.overlap, end)
    emit()

    items = [ ]
    id_map = { }
    for group in groups:
        item = annotation.Annotation()
        item.source = group[0].source
        item.type = group[0].type
        item.seqid = group[0].seqid
        item.strand = group[0].strand
        item.start = min(item2.start for item2 in group)
        item.end = max(item2.end for item2 in group)
        item.score = None
        item.phase = None
        item.attr = { }
        for item2 in group:
            for key in item2.attr:
                if key in item.attr: continue
                item.attr[key] = join_descriptions(
                    [ item3.attr[key] for item3 in group if key in item3.attr ],
                    self.joiner)
        item.parents = [ ]
        for item2 in group:
            if 'ID' in item2.attr:
                assert item2.attr['ID'] not in id_map, 'Duplicate ID: ' + item2.attr['ID']
                id_map[item2.attr['ID']] = item.attr['ID']
            if 'Parent' in item2.attr:
                item.parents.append(item2.attr['Parent'])
        items.append(item)

    for item in items:
        if item.parents:
            item.attr['Parent'] = join_descriptions(
                [ id_map.get(parent, parent) for parent in item.parents ], ',')

    with open(self.prefix + '.gff', 'wb') as out_file:
        annotation.write_gff3_header(out_file)
        for item in items:
            print >> out_file, item.as_gff()
def run(self):
    spans = { }
    #for item in legion.parallel_imap(self._load_bam, self.filenames):
    #    for key,value in item.items():
    for filename in self.filenames:
        self._load_bam(filename, spans)

    grace.status('Calling peaks')

    f = open(self.prefix + '.gff', 'wb')
    annotation.write_gff3_header(f)

    n = 0
    for (rname, strand), span_counts in spans.items():
        length = 1 + max( item[1] for item in span_counts )
        depth = [ 0.0 ] * length
        AN_total = [ 0.0 ] * length
        AG_total = [ 0.0 ] * length
        for (start, end, AN, AG), count in span_counts.iteritems():
            depth[start] += 1.0 * count
            depth[end] -= 1.0 * count
            AN_total[start] += AN * count
            AN_total[end] -= AN * count
            AG_total[start] += AG * count
            AG_total[end] -= AG * count
        for i in xrange(1, length):
            depth[i] += depth[i-1]
            AN_total[i] += AN_total[i-1]
            AG_total[i] += AG_total[i-1]

        for start, end in self._find_spans(depth):
            if end - self.lap - start <= 0:
                continue
            n += 1
            id = 'peak%d' % n

            ann = annotation.Annotation()
            ann.source = 'tailtools'
            ann.type = self.type
            ann.seqid = rname
            ann.start = start
            ann.end = end - self.lap
            if ann.end != ann.start + 1:
                self.log.log("%s odd: start %d end %d\n" % (id, ann.start, ann.end))
            ann.strand = strand
            ann.score = None
            ann.phase = None
            ann.attr = {
                'id' : id,
                'n' : str(depth[start + self.lap//2]),
                'mean_tail' : str(AN_total[start + self.lap//2] / depth[start + self.lap//2]),
                'mean_genomic' : str(AG_total[start + self.lap//2] / depth[start + self.lap//2]),
                'color' : '#00ff00' if strand > 0 else '#0000ff' if strand < 0 else '#008080',
            }
            print >> f, ann.as_gff()
    f.flush()
    f.close()

    self.log.datum('-', 'called peaks', n)
    grace.status('')
def run(self):
    assert len(self.pickles) > 0, "No samples to count."

    work = self.get_workspace()

    data = [ ]
    names = [ ]
    sample_tags = [ ]

    old = grace.status("Loading pickles")

    max_length = 1
    for i, item in enumerate(self.pickles):
        grace.status("Loading " + os.path.basename(item))
        f = io.open_possibly_compressed_file(item)
        name, tags, datum = pickle.load(f)
        f.close()
        data.append(datum)
        names.append(name)
        sample_tags.append(tags)

        try:
            max_length = max(max_length, max(
                item[0]  #tail_length
                for feature in datum
                for item in feature.hits
            ) + 1)
        except ValueError:
            pass

        if i == 0:
            annotations = datum

    grace.status(old)

    self.log.log("Maximum tail length %d\n" % max_length)

    for i in xrange(len(names)):
        n_alignments = 0
        for feature in data[i]:
            feature.total_count = len(feature.hits)
            feature.tail_counts = [ 0 ] * max_length

            n_alignments += feature.total_count

            for tail_length, adaptor_bases in feature.hits:
                if adaptor_bases >= self.adaptor:
                    feature.tail_counts[tail_length] += 1

            del feature.hits

        self.log.datum(names[i], 'Alignments to features', n_alignments)

    counts = [ ]  # [feature][sample](total_count, [taillength])

    for item in data:
        assert len(item) == len(data[0])
    for row in itertools.izip(*data):
        this_counts = [ (item.total_count, item.tail_counts) for item in row ]
        counts.append(this_counts)

    n_features = len(counts)
    n_samples = len(data)

    sample_n = [ [0]*n_samples for i in xrange(n_features) ]           # [feature][sample] Total count
    sample_n_tail = [ [0]*n_samples for i in xrange(n_features) ]      # [feature][sample] Polya count
    sample_prop = [ [None]*n_samples for i in xrange(n_features) ]     # [feature][sample] Proportion of reads with tail (deprecated)
    sample_tail = [ [None]*n_samples for i in xrange(n_features) ]     # [feature][sample] Mean tail length in each sample
    sample_sd_tail = [ [None]*n_samples for i in xrange(n_features) ]  # [feature][sample] Std dev tail length in each sample
    sample_total_tail = [ [0]*n_samples for i in xrange(n_features) ]

    sample_quantile_tail = collections.OrderedDict(
        (item, [ [None]*n_samples for i in xrange(n_features) ])
        for item in [25, 50, 75, 100])

    overall_n = [ 0 ]*n_features        # [feature] Overall count
    overall_prop = [ None ]*n_features  # [feature] Overall proportion with tail
    overall_tail = [ None ]*n_features  # [feature] Overall mean tail length
    overall_n_tail = [ 0 ]*n_features   # [feature] Overall polya count

    for i, row in enumerate(counts):
        for j, (this_this_n, item) in enumerate(row):
            sample_n[i][j] = this_this_n
            sample_n_tail[i][j] = sum(item[self.tail:])
            sample_total_tail[i][j] = sum( item[k]*k for k in xrange(self.tail, max_length) )

            if sample_n[i][j] >= 1:
                sample_prop[i][j] = float(sample_n_tail[i][j]) / sample_n[i][j]

            if sample_n_tail[i][j] >= 1:
                sample_tail[i][j] = float(sample_total_tail[i][j]) / sample_n_tail[i][j]

                for quantile in sample_quantile_tail:
                    counter = sample_n_tail[i][j] * quantile / 100.0
                    for k in xrange(self.tail, max_length):
                        counter -= item[k]
                        if counter <= 0: break
                    sample_quantile_tail[quantile][i][j] = k

            if sample_n_tail[i][j] >= 2:
                sample_sd_tail[i][j] = math.sqrt(
                    float(sum( item[k]*((k-sample_tail[i][j])**2) for k in xrange(self.tail, max_length) ))
                    / (sample_n_tail[i][j]-1))

        overall_n[i] = sum(sample_n[i])
        overall_n_tail[i] = sum(sample_n_tail[i])
        if overall_n[i] >= 1:
            overall_prop[i] = float(sum(sample_n_tail[i])) / overall_n[i]
        if overall_n_tail[i] >= 1:
            overall_tail[i] = float(sum(sample_total_tail[i])) / overall_n_tail[i]

    for i, name in enumerate(names):
        this_total = sum( item[i] for item in sample_total_tail )
        this_n = sum( item[i] for item in sample_n_tail )
        if this_n:
            self.log.datum(name, 'Average poly-A tail', float(this_total)/this_n)

    for i, name in enumerate(names):
        this_total = sum( item[i] for item in sample_n_tail )
        this_n = sum( item[i] for item in sample_n )
        if this_n:
            self.log.datum(name, 'Average proportion of reads with tail', float(this_total)/this_n)

    with open(work/'features-with-data.gff', 'wb') as f:
        annotation.write_gff3_header(f)
        for i, item in enumerate(annotations):
            item.attr['reads'] = str(overall_n[i])
            item.attr['reads_with_tail'] = str(overall_n_tail[i])
            item.attr['mean_tail'] = '%.1f' % overall_tail[i] if overall_tail[i] else 'NA'
            item.attr['proportion_with_tail'] = '%.2f' % overall_prop[i] if overall_prop[i] else 'NA'

            if overall_tail[i] is None:
                item.attr['color'] = '#444444'
            else:
                a = (overall_tail[i]-self.tail) / max(1, max_length-self.tail)
                item.attr['color'] = '#%02x%02x%02x' % (int(a*255), int((1-abs(a*2-1))*255), 255-int(a*255))
            #item.attr['color'] = ...

            print >> f, item.as_gff()

    comments = [ '#Counts' ] + [
        '#sampleTags=' + ','.join(tags)
        for tags in sample_tags
    ] + [
        '"Tail_count" group is number of reads with tail',
        '"Tail" group is mean tail per sample',
        '"Proportion" group is proportion of reads with tail',
    ]

    have_biotype = any("Biotype" in item.attr for item in annotations)
    have_parent = any("Parent" in item.attr for item in annotations)
    have_relation = any("Relation" in item.attr for item in annotations)
    have_antisense = any("Antisense_parent" in item.attr for item in annotations)

    def counts_iter():
        for i in xrange(n_features):
            row = collections.OrderedDict()
            row['Feature'] = annotations[i].get_id()

            for j in xrange(n_samples):
                row[('Count', names[j])] = '%d' % sample_n[i][j]

            row[('Annotation', 'Length')] = annotations[i].end - annotations[i].start
            row[('Annotation', 'gene')] = annotations[i].attr.get('Name', '')
            row[('Annotation', 'product')] = annotations[i].attr.get('Product', '')
            if have_biotype:
                row[('Annotation', 'biotype')] = annotations[i].attr.get('Biotype', '')
            if have_parent:
                row[('Annotation', 'parent')] = annotations[i].attr.get('Parent', '')
            if have_relation:
                row[('Annotation', 'relation')] = annotations[i].attr.get('Relation', '')
            if have_antisense:
                row[('Annotation', 'antisense_gene')] = annotations[i].attr.get('Antisense_name', '')
                row[('Annotation', 'antisense_product')] = annotations[i].attr.get('Antisense_product', '')
                row[('Annotation', 'antisense_biotype')] = annotations[i].attr.get('Antisense_biotype', '')
                row[('Annotation', 'antisense_parent')] = annotations[i].attr.get('Antisense_parent', '')
            row[('Annotation', 'chromosome')] = str(annotations[i].seqid)
            row[('Annotation', 'strand')] = str(annotations[i].strand)
            row[('Annotation', 'start')] = str(annotations[i].start + 1)
            row[('Annotation', 'end')] = str(annotations[i].end)
            row[('Annotation', 'reads')] = str(overall_n[i])
            row[('Annotation', 'reads-with-tail')] = str(overall_n_tail[i])
            row[('Annotation', 'mean-tail')] = str_na(overall_tail[i])
            row[('Annotation', 'proportion-with-tail')] = str_na(overall_prop[i])

            for j in xrange(n_samples):
                row[('Tail_count', names[j])] = '%d' % sample_n_tail[i][j]
            for j in xrange(n_samples):
                row[('Tail', names[j])] = str_na(sample_tail[i][j])
            for j in xrange(n_samples):
                row[('Tail_sd', names[j])] = str_na(sample_sd_tail[i][j])
            for quantile in sample_quantile_tail:
                for j in xrange(n_samples):
                    row[('Tail_quantile_%d' % quantile, names[j])] = str_na(sample_quantile_tail[quantile][i][j])
            for j in xrange(len(names)):
                row[('Proportion', names[j])] = str_na(sample_prop[i][j])
            yield row

    io.write_csv(work/'counts.csv', counts_iter(), comments=comments)

    def write_csv_matrix(filename, matrix):
        def emitter():
            for i in xrange(n_features):
                row = collections.OrderedDict()
                row["Feature"] = annotations[i].get_id()
                for j in xrange(n_samples):
                    row[names[j]] = str_na(matrix[i][j])
                yield row
        io.write_csv(filename, emitter())

    write_csv_matrix(work/'read_count.csv', sample_n)
    write_csv_matrix(work/'tail_count.csv', sample_n_tail)
    write_csv_matrix(work/'tail.csv', sample_tail)
    write_csv_matrix(work/'tail_sd.csv', sample_sd_tail)
    for quantile in sample_quantile_tail:
        write_csv_matrix(work/('tail_quantile_%d.csv' % quantile), sample_quantile_tail[quantile])

    #def raw_columns():
    #    for i in xrange(n_samples):
    #        row = collections.OrderedDict()
    #        row['Sample'] = names[i]
    #        for j in xrange(max_length):
    #            row['length-%d' % j] = str(i*max_length+j+1)  #For R+, so 1 based
    #        yield row
    #io.write_csv(work/'raw-columns.csv', raw_columns())
    #
    ##Somewhat inefficient
    #def raw():
    #    for i in xrange(n_features):
    #        row = collections.OrderedDict()
    #        row['Feature'] = annotations[i].get_id()
    #        for j in xrange(n_samples):
    #            for k in xrange(max_length):
    #                row['%d %s' % (k,names[j])] = str( counts[i][j][1][k] )
    #        yield row
    #io.write_csv(work/'raw.csv', raw())

    def pooled():
        for i in xrange(n_features):
            row = collections.OrderedDict()
            row['Feature'] = annotations[i].get_id()
            for j in xrange(max_length):
                row[str(j)] = str( sum( counts[i][k][1][j] for k in xrange(n_samples) ) )
            yield row
    io.write_csv(work/'pooled.csv', pooled())
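# Illustrative sketch only -- str_na is a small formatting helper defined elsewhere.
# From its use above (values may be None when a feature has no reads or no tails),
# it evidently renders missing values as 'NA' for CSV output:
def str_na(value):
    if value is None:
        return 'NA'
    return str(value)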
def run(self):
    spans = collections.defaultdict(list)
    #for item in legion.parallel_imap(self._load_bam, self.filenames):
    #    for key,value in item.items():
    for filename in self.filenames:
        for key, value in self._load_bam(filename).items():
            spans[key].extend(value)

    grace.status('Calling peaks')

    f = open(self.prefix + '.gff', 'wb')
    annotation.write_gff3_header(f)

    n = 0
    for (rname, strand), span_list in spans.items():
        length = 1 + max( item[1] for item in span_list )
        depth = [ 0.0 ] * length
        AN_total = [ 0.0 ] * length
        AG_total = [ 0.0 ] * length
        for start, end, AN, AG in span_list:
            depth[start] += 1.0
            depth[end] -= 1.0
            AN_total[start] += AN
            AN_total[end] -= AN
            AG_total[start] += AG
            AG_total[end] -= AG
        for i in xrange(1, length):
            depth[i] += depth[i-1]
            AN_total[i] += AN_total[i-1]
            AG_total[i] += AG_total[i-1]

        for start, end in self._find_spans(depth):
            if end - self.lap - start <= 0:
                continue
            n += 1
            id = 'peak%d' % n

            ann = annotation.Annotation()
            ann.source = 'tailtools'
            ann.type = self.type
            ann.seqid = rname
            ann.start = start
            ann.end = end - self.lap
            assert ann.end == ann.start + 1
            ann.strand = strand
            ann.score = None
            ann.phase = None
            ann.attr = {
                'id' : id,
                'n' : str(depth[start + self.lap//2]),
                'mean_tail' : str(AN_total[start + self.lap//2] / depth[start + self.lap//2]),
                'mean_genomic' : str(AG_total[start + self.lap//2] / depth[start + self.lap//2]),
                'color' : '#00ff00' if strand > 0 else '#0000ff' if strand < 0 else '#008080',
            }
            print >> f, ann.as_gff()
    f.flush()
    f.close()

    self.log.datum('-', 'called peaks', n)
    grace.status('')
def run(self):
    assert self.what in ('fragment', '5prime', '3prime'), 'Unknown option for --what.'
    #assert self.moderation > 0.0, '--moderation must be greater than zero.'
    #assert self.power > 0.0, '--power must be greater than zero.'
    #assert self.width_power >= 1.0, '--width-power must be greater than or equal to one.'

    #if self.filter == 'poly':
    #    use_bam_filename = 'alignments.bam'
    #    use_only_top = True
    #    use_only_monogamous = False
    #    expect_multiple_alignments = True
    #elif self.filter == 'mono':
    #    use_bam_filename = 'alignments.bam'
    #    use_only_top = True
    #    use_only_monogamous = True
    #    expect_multiple_alignments = True
    #else:
    #    assert self.filter == 'existing', 'Unrecognized filtering mode'
    #    use_bam_filename = 'alignments_filtered.bam'
    #    use_only_top = False
    #    use_only_monogamous = False
    #    expect_multiple_alignments = False

    spans = collections.defaultdict(list)
    for item in legion.parallel_imap(self._load_bam, self.filenames):
        for key, value in item.items():
            spans[key].extend(value)

    #for i, filename in enumerate(self.filenames):
    #    if os.path.isdir(filename):
    #        filename = os.path.join(filename, use_bam_filename)
    #
    #    n = 0
    #    for read_name, fragment_alignments, unmapped in \
    #            sam.bam_iter_fragments(
    #                filename,
    #                'Scanning sample %d of %d' % (i+1,len(self.filenames))):
    #        if not fragment_alignments:
    #            continue
    #
    #        if use_only_top:
    #            fragment_scores = [ sum( al.get_AS() for al in item ) for item in fragment_alignments ]
    #            best_score = max(fragment_scores)
    #            fragment_alignments = [
    #                item
    #                for item, score in zip(fragment_alignments, fragment_scores)
    #                if score >= best_score ]
    #
    #        for alignments in fragment_alignments:
    #            if self.strand_specific:
    #                strand = -1 if alignments[0].flag&sam.FLAG_REVERSE else 1
    #            else:
    #                strand = 0
    #
    #            start = min(item.pos-1 for item in alignments)
    #            end = max(item.pos+item.length-1 for item in alignments)
    #            if end-start <= self.trim*2: continue
    #
    #            rname = alignments[0].rname
    #            spans[(rname, strand)].append((start+self.trim,end-self.trim))
    #
    #        n += 1
    #        #if n > 100000: break

    #if self.deduplicate:
    #    for key in spans:
    #        spans[key] = list(set(spans[key]))

    grace.status('Calling peaks')

    f = open(self.prefix + '.gff', 'wb')
    annotation.write_gff3_header(f)

    n = 0
    for (rname, strand), span_list in spans.items():
        depth = [ 0.0 ] * (1 + max( item[1] for item in span_list ))
        for start, end in span_list:
            depth[start] += 1.0
            depth[end] -= 1.0

        if self.crosstalk and strand and (rname, -strand) in spans:
            for start, end in spans[(rname, -strand)]:
                if start < len(depth): depth[start] -= self.crosstalk
                if end < len(depth): depth[end] += self.crosstalk

        for i in xrange(1, len(depth)):
            depth[i] += depth[i-1]

        if self.crosstalk:
            for i in xrange(len(depth)):
                depth[i] = max(0.0, depth[i])

        #import pylab
        #pylab.plot(depth)

        for start, end in self._find_spans(depth):
            #pylab.axvspan(start-0.5,end-0.5,alpha=0.25)
            if end - self.lap - start <= 0:
                continue
            n += 1
            id = 'peak%d' % n
            #if strand == -1:
            #    id = '%s-%d..%d' % (rname,start,end+1)
            #elif strand == 0:
            #    id = '%s.%d..%d' % (rname,start+1,end)
            #else:
            #    id = '%s+%d..%d' % (rname,start+1,end)

            ann = annotation.Annotation()
            ann.source = 'nesoni'
            ann.type = self.type
            ann.seqid = rname
            ann.start = start
            ann.end = end - self.lap
            ann.strand = strand
            ann.score = None
            ann.phase = None
            ann.attr = {
                'id' : id,
                'color' : '#00ff00' if strand > 0 else '#0000ff' if strand < 0 else '#008080',
            }
            print >> f, ann.as_gff()
    f.flush()
    #pylab.show()
    f.close()

    self.log.datum('-', 'called peaks', n)
    grace.status('')
def run(self):
    work = self.get_workspace()

    data = [ ]
    names = [ ]
    sample_tags = [ ]

    for item in self.pickles:
        f = io.open_possibly_compressed_file(item)
        name, tags, datum = pickle.load(f)
        f.close()
        data.append(datum)
        names.append(name)
        sample_tags.append(tags)

    annotations = data[0]

    all_lengths = [
        item[2]  #tail_length
        for sample in data
        for feature in sample
        #for rel_start,rel_end,tail_length in feature.hits
        for item in feature.hits
    ]
    if all_lengths:
        max_length = max(all_lengths) + 1
    else:
        max_length = 1
    del all_lengths

    for i, sample in enumerate(data):
        n_alignments = 0
        n_duplicates = 0
        n_good = 0
        for feature in sample:
            feature.tail_counts = [ 0.0 ] * max_length

            buckets = collections.defaultdict(list)
            for item in feature.hits:
                rel_start, rel_end, tail_length = item[:3]
                buckets[(rel_start, rel_end)].append(tail_length)

            for item in buckets.values():
                n_alignments += len(item)
                n_good += 1
                if self.saturation < 1 or len(item) <= self.saturation:
                    weight = 1.0
                else:
                    weight = float(self.saturation) / len(item)
                    n_duplicates += len(item)
                for item2 in item:
                    feature.tail_counts[item2] += weight

        self.log.datum(names[i], 'Alignments to features', n_alignments)
        if self.saturation >= 1:
            self.log.datum(names[i], 'Proportion of alignments with duplicate start and end position', float(n_duplicates)/max(1, n_alignments))
            self.log.datum(names[i], 'Alignments to features after deduplication', n_good)

    counts = [ ]  # [feature][sample][taillength]

    for item in data:
        assert len(item) == len(data[0])
    for row in itertools.izip(*data):
        this_counts = [ item.tail_counts for item in row ]
        counts.append(this_counts)

    sample_n = [ ]           # [feature][sample] Total count
    sample_n_tail = [ ]      # [feature][sample] Polya count
    sample_prop = [ ]        # [feature][sample] Proportion of reads with tail
    sample_tail = [ ]        # [feature][sample] Mean tail length in each sample
    sample_total_tail = [ ]

    overall_n = [ ]
    overall_prop = [ ]       # [feature] Overall proportion with tail
    overall_tail = [ ]       # [feature] Overall mean tail length
    overall_n_tail = [ ]     # [feature] Overall polya count
    overall_total_tail = [ ]

    for row in counts:
        this_n = [ ]
        this_n_tail = [ ]
        this_prop = [ ]
        this_tail = [ ]
        this_total_tail = [ ]
        for item in row:
            this_this_n = sum(item)
            this_n.append( this_this_n )

            this_this_n_tail = sum(item[self.tail:])
            this_n_tail.append( this_this_n_tail )

            this_this_total_tail = sum( item[i]*i for i in xrange(self.tail, max_length) )
            this_total_tail.append( this_this_total_tail )

            if this_this_n < 1:
                this_prop.append(None)
            else:
                this_prop.append(float(this_this_n_tail)/this_this_n)

            if this_this_n_tail < 1:
                this_tail.append(None)
            else:
                this_tail.append(this_this_total_tail/this_this_n_tail)

        sample_n.append(this_n)
        sample_n_tail.append(this_n_tail)
        sample_prop.append(this_prop)
        sample_tail.append(this_tail)
        sample_total_tail.append(this_total_tail)

        overall_n.append(sum(this_n))
        overall_n_tail.append(sum(this_n_tail))
        overall_total_tail.append(sum(this_total_tail))
        if sum(this_n) < 1:
            overall_prop.append(None)
        else:
            overall_prop.append(float(sum(this_n_tail))/sum(this_n))
        if sum(this_n_tail) < 1:
            overall_tail.append(None)
        else:
            overall_tail.append(float(sum(this_total_tail))/sum(this_n_tail))

    for i, name in enumerate(names):
        this_total = sum( item[i] for item in sample_total_tail )
        this_n = sum( item[i] for item in sample_n_tail )
        if this_n:
            self.log.datum(name, 'Average poly-A tail', float(this_total)/this_n)

    for i, name in enumerate(names):
        this_total = sum( item[i] for item in sample_n_tail )
        this_n = sum( item[i] for item in sample_n )
        if this_n:
            self.log.datum(name, 'Average proportion of reads with tail', float(this_total)/this_n)

    #max_length = max(max(len(item) for item in row) for row in counts)
    #
    #for row in counts:
    #    for item in row:
    #        while len(item) < max_length:
    #            item.append(0)

    with open(work/'features-with-data.gff', 'wb') as f:
        annotation.write_gff3_header(f)
        for i, item in enumerate(annotations):
            item.attr['reads'] = str(overall_n[i])
            item.attr['reads_with_tail'] = str(overall_n_tail[i])
            item.attr['mean_tail'] = '%.1f' % overall_tail[i] if overall_tail[i] else 'NA'
            item.attr['proportion_with_tail'] = '%.2f' % overall_prop[i] if overall_prop[i] else 'NA'

            if overall_tail[i] is None:
                item.attr['color'] = '#444444'
            else:
                a = (overall_tail[i]-self.tail) / max(1, max_length-self.tail)
                item.attr['color'] = '#%02x%02x%02x' % (int(a*255), int((1-abs(a*2-1))*255), 255-int(a*255))
            #item.attr['color'] = ...

            print >> f, item.as_gff()

    comments = [ '#Counts' ] + [
        '#sampleTags=' + ','.join(tags)
        for tags in sample_tags
    ] + [
        '"Tail_count" group is number of reads with tail',
        '"Tail" group is mean tail per sample',
        '"Proportion" group is proportion of reads with tail',
    ]

    def counts_iter():
        for i in xrange(len(counts)):
            row = collections.OrderedDict()
            row['Feature'] = annotations[i].get_id()
            for j in xrange(len(names)):
                row[('Count', names[j])] = '%d' % sample_n[i][j]

            row[('Annotation', 'Length')] = annotations[i].end - annotations[i].start
            row[('Annotation', 'gene')] = annotations[i].attr.get('Name', '')
            row[('Annotation', 'product')] = annotations[i].attr.get('Product', '')
            #row[('Annotation','Strand')] = str(annotations[i].strand)
            row[('Annotation', 'reads')] = str(overall_n[i])
            row[('Annotation', 'reads-with-tail')] = str(overall_n_tail[i])
            row[('Annotation', 'mean-tail')] = str(overall_tail[i]) if overall_tail[i] is not None else 'NA'
            row[('Annotation', 'proportion-with-tail')] = str(overall_prop[i]) if overall_prop[i] is not None else 'NA'

            for j in xrange(len(names)):
                row[('Tail_count', names[j])] = '%d' % sample_n_tail[i][j]
            for j in xrange(len(names)):
                row[('Tail', names[j])] = str(sample_tail[i][j]) if sample_tail[i][j] is not None else 'NA'
            for j in xrange(len(names)):
                row[('Proportion', names[j])] = str(sample_prop[i][j]) if sample_prop[i][j] is not None else 'NA'
            yield row

    io.write_csv(work/'counts.csv', counts_iter(), comments=comments)

    def raw_columns():
        for i in xrange(len(names)):
            row = collections.OrderedDict()
            row['Sample'] = names[i]
            for j in xrange(max_length):
                row['length-%d' % j] = str(i*max_length+j+1)  #For R+, so 1 based
            yield row
    io.write_csv(work/'raw-columns.csv', raw_columns())

    #Somewhat inefficient
    def raw():
        for i in xrange(len(counts)):
            row = collections.OrderedDict()
            row['Feature'] = annotations[i].get_id()
            for j in xrange(len(names)):
                for k in xrange(max_length):
                    row['%d %s' % (k, names[j])] = str( counts[i][j][k] )
            yield row
    io.write_csv(work/'raw.csv', raw())

    def pooled():
        for i in xrange(len(counts)):
            row = collections.OrderedDict()
            row['Feature'] = annotations[i].get_id()
            for j in xrange(max_length):
                row[str(j)] = str( sum( counts[i][k][j] for k in xrange(len(names)) ) )
            yield row
    io.write_csv(work/'pooled.csv', pooled())