def write_csv_matrix(filename, matrix):
    """Write one feature-by-sample matrix to `filename` as CSV.

    Rows are streamed lazily: the first column is the feature ID taken
    from the enclosing scope's `annotations`, followed by one column per
    sample name.  Cell values pass through `str_na` so None becomes 'NA'.
    """
    def rows():
        for feature_index in xrange(n_features):
            record = collections.OrderedDict()
            record["Feature"] = annotations[feature_index].get_id()
            for sample_index in xrange(n_samples):
                record[names[sample_index]] = str_na(matrix[feature_index][sample_index])
            yield record
    io.write_csv(filename, rows())
def report_logs(self, name, logs, filter=(lambda sample, field: True), renaming=None):
    """Mine `logs` into a table, optionally save it as CSV, and emit it as HTML.

    Parameters:
      name     - basename for the saved CSV; if falsy, no file is written.
      filter   - predicate (sample, field) -> bool selecting which log fields
                 to include.
      renaming - optional mapping applied to the first cell of each row
                 (e.g. to prettify sample names).  Read-only.

    Side effects: writes a CSV into self.workspace and appends HTML to the
    report via self.p / self.write.
    """
    # Bug fix: the default was a shared mutable dict (renaming={}).  Use the
    # None sentinel instead; behavior for callers is unchanged.
    if renaming is None:
        renaming = {}
    table = mine_logs(logs, filter)
    if name:
        filename = self.workspace / (self.file_prefix + name + '.csv')
        io.write_csv(filename, table)
        self.p(self.href(filename))
    if table:
        # Re-mine with commas=True for human-readable numbers in the HTML view.
        table = mine_logs(logs, filter, commas=True)
        # NOTE(review): keys/values are interpolated into HTML without
        # escaping — assumes log fields never contain '<', '&' etc.; confirm.
        self.write('<table>\n')
        self.write('<tr>\n')
        for key in table[0].keys():
            self.write('<th>'+key+'</th>')
        self.write('</tr>\n')
        for row in table:
            self.write('<tr>\n')
            for i, value in enumerate(row.values()):
                if i == 0:
                    # First column is the sample name; apply any renaming.
                    value = renaming.get(value, value)
                self.write('<td>'+value+'</td>')
            self.write('</tr>\n')
        self.write('</table>\n')
def run(self):
    """Aggregate per-sample tail-count pickles into CSV tables and a GFF file.

    Loads each pickle (name, tags, per-feature hit lists), builds per-feature
    tail-length histograms, then writes into the workspace:
    counts.csv, read_count.csv, tail_count.csv, tail.csv, tail_sd.csv,
    tail_quantile_*.csv, pooled.csv and features-with-data.gff.
    """
    assert len(self.pickles) > 0, "No samples to count."
    work = self.get_workspace()
    data = [ ]
    names = [ ]
    sample_tags = [ ]

    old = grace.status("Loading pickles")

    max_length = 1
    for i, item in enumerate(self.pickles):
        grace.status("Loading "+os.path.basename(item))
        f = io.open_possibly_compressed_file(item)
        name, tags, datum = pickle.load(f)
        f.close()
        data.append(datum)
        names.append(name)
        sample_tags.append(tags)

        # Track the longest tail seen in any sample (+1 so it indexes into
        # the histogram).  ValueError = no hits at all in this sample.
        try:
            max_length = max(max_length, max(
                item[0] #tail_length
                for feature in datum
                for item in feature.hits
                ) + 1)
        except ValueError:
            pass

        # All samples share the same feature list; keep the first as the
        # canonical annotation set.
        if i == 0:
            annotations = datum

    grace.status(old)

    self.log.log("Maximum tail length %d\n" % max_length)

    # Convert raw hit lists into per-feature histograms of tail length,
    # counting only hits with at least self.adaptor adaptor bases.
    for i in xrange(len(names)):
        n_alignments = 0
        for feature in data[i]:
            feature.total_count = len(feature.hits)
            feature.tail_counts = [ 0 ] * max_length
            n_alignments += feature.total_count
            for tail_length, adaptor_bases in feature.hits:
                if adaptor_bases >= self.adaptor:
                    feature.tail_counts[tail_length] += 1
            # Hit lists are large; free them once histogrammed.
            del feature.hits
        self.log.datum(names[i], 'Alignments to features', n_alignments)

    counts = [ ]     # [feature][sample](total_count, [taillength])
    for item in data:
        assert len(item) == len(data[0])
    for row in itertools.izip(*data):
        this_counts = [ (item.total_count, item.tail_counts) for item in row ]
        counts.append(this_counts)

    n_features = len(counts)
    n_samples = len(data)

    # Per-feature, per-sample statistic matrices.
    sample_n = [ [0]*n_samples for i in xrange(n_features) ]                # [feature][sample] Total count
    sample_n_tail = [ [0]*n_samples for i in xrange(n_features) ]           # [feature][sample] Polya count
    sample_prop = [ [None]*n_samples for i in xrange(n_features) ]          # [feature][sample] Proportion of reads with tail (deprecated)
    sample_tail = [ [None]*n_samples for i in xrange(n_features) ]          # [feature][sample] Mean tail length in each sample
    sample_sd_tail = [ [None]*n_samples for i in xrange(n_features) ]       # [feature][sample] Std dev tail length in each sample
    sample_total_tail = [ [0]*n_samples for i in xrange(n_features) ]

    sample_quantile_tail = collections.OrderedDict(
        (item, [ [None]*n_samples for i in xrange(n_features) ])
        for item in [25,50,75,100]
        )

    overall_n = [ 0 ]*n_features            # [feature] Overall count
    overall_prop = [ None ]*n_features      # [feature] Overall proportion with tail
    overall_tail = [ None ]*n_features      # [feature] Overall mean tail length
    overall_n_tail = [ 0 ]*n_features       # [feature] Overall polya count

    for i, row in enumerate(counts):
        for j, (this_this_n, item) in enumerate(row):
            sample_n[i][j] = this_this_n
            # Reads with tail length >= self.tail count as "tailed".
            sample_n_tail[i][j] = sum(item[self.tail:])
            sample_total_tail[i][j] = sum( item[k]*k for k in xrange(self.tail,max_length) )

            if sample_n[i][j] >= 1:
                sample_prop[i][j] = float(sample_n_tail[i][j])/sample_n[i][j]

            if sample_n_tail[i][j] >= 1:
                sample_tail[i][j] = float(sample_total_tail[i][j])/sample_n_tail[i][j]

                # Quantiles: walk the histogram until the cumulative count
                # reaches the quantile's share of tailed reads.
                for quantile in sample_quantile_tail:
                    counter = sample_n_tail[i][j] * quantile / 100.0
                    for k in xrange(self.tail, max_length):
                        counter -= item[k]
                        if counter <= 0: break
                    sample_quantile_tail[quantile][i][j] = k

            # Sample standard deviation needs at least two tailed reads.
            if sample_n_tail[i][j] >= 2:
                sample_sd_tail[i][j] = math.sqrt(
                    float(sum( item[k]*((k-sample_tail[i][j])**2) for k in xrange(self.tail,max_length) ))
                    / (sample_n_tail[i][j]-1)
                    )

        overall_n[i] = sum(sample_n[i])
        overall_n_tail[i] = sum(sample_n_tail[i])
        if overall_n[i] >= 1:
            overall_prop[i] = float(sum(sample_n_tail[i]))/overall_n[i]
        if overall_n_tail[i] >= 1:
            overall_tail[i] = float(sum(sample_total_tail[i]))/overall_n_tail[i]

    for i, name in enumerate(names):
        this_total = sum( item[i] for item in sample_total_tail )
        this_n = sum( item[i] for item in sample_n_tail )
        if this_n:
            self.log.datum(name, 'Average poly-A tail', float(this_total)/this_n)

    for i, name in enumerate(names):
        this_total = sum( item[i] for item in sample_n_tail )
        this_n = sum( item[i] for item in sample_n )
        if this_n:
            self.log.datum(name, 'Average proportion of reads with tail', float(this_total)/this_n)

    # Annotate each feature with overall statistics and a heat color, and
    # write the result as GFF3.
    with open(work/'features-with-data.gff','wb') as f:
        annotation.write_gff3_header(f)
        for i, item in enumerate(annotations):
            item.attr['reads'] = str(overall_n[i])
            item.attr['reads_with_tail'] = str(overall_n_tail[i])
            # NOTE(review): truthiness test — a mean/proportion of exactly 0
            # would also be written as 'NA', not just None.
            item.attr['mean_tail'] = '%.1f'%overall_tail[i] if overall_tail[i] else 'NA'
            item.attr['proportion_with_tail'] = '%.2f'%overall_prop[i] if overall_prop[i] else 'NA'
            if overall_tail[i] is None:
                item.attr['color'] = '#444444'
            else:
                # Map mean tail length onto a red->green->blue gradient.
                a = (overall_tail[i]-self.tail)/max(1,max_length-self.tail)
                item.attr['color'] = '#%02x%02x%02x' % (int(a*255),int((1-abs(a*2-1))*255),255-int(a*255))
            #item.attr['color'] = ...
            print >> f, item.as_gff()

    comments = [ '#Counts' ] + [
        '#sampleTags='+','.join(tags)
        for tags in sample_tags
        ] + [
        '"Tail_count" group is number of reads with tail',
        '"Tail" group is mean tail per sample',
        '"Proportion" group is proportion of reads with tail',
        ]

    # Optional annotation columns, emitted only if any feature has them.
    have_biotype = any("Biotype" in item.attr for item in annotations)
    have_parent = any("Parent" in item.attr for item in annotations)
    have_relation = any("Relation" in item.attr for item in annotations)
    have_antisense = any("Antisense_parent" in item.attr for item in annotations)

    def counts_iter():
        # One row per feature: counts, annotation columns, then per-sample
        # tail statistics grouped by (group, sample) column keys.
        for i in xrange(n_features):
            row = collections.OrderedDict()
            row['Feature'] = annotations[i].get_id()
            for j in xrange(n_samples):
                row[('Count',names[j])] = '%d' % sample_n[i][j]

            row[('Annotation','Length')] = annotations[i].end - annotations[i].start
            row[('Annotation','gene')] = annotations[i].attr.get('Name','')
            row[('Annotation','product')] = annotations[i].attr.get('Product','')
            if have_biotype:
                row[('Annotation','biotype')] = annotations[i].attr.get('Biotype','')
            if have_parent:
                row[('Annotation','parent')] = annotations[i].attr.get('Parent','')
            if have_relation:
                row[('Annotation','relation')] = annotations[i].attr.get('Relation','')
            if have_antisense:
                row[('Annotation','antisense_gene')] = annotations[i].attr.get('Antisense_name','')
                row[('Annotation','antisense_product')] = annotations[i].attr.get('Antisense_product','')
                row[('Annotation','antisense_biotype')] = annotations[i].attr.get('Antisense_biotype','')
                row[('Annotation','antisense_parent')] = annotations[i].attr.get('Antisense_parent','')
            row[('Annotation','chromosome')] = str(annotations[i].seqid)
            row[('Annotation','strand')] = str(annotations[i].strand)
            row[('Annotation','start')] = str(annotations[i].start+1)
            row[('Annotation','end')] = str(annotations[i].end)

            row[('Annotation','reads')] = str(overall_n[i])
            row[('Annotation','reads-with-tail')] = str(overall_n_tail[i])
            row[('Annotation','mean-tail')] = str_na(overall_tail[i])
            row[('Annotation','proportion-with-tail')] = str_na(overall_prop[i])
            for j in xrange(n_samples):
                row[('Tail_count',names[j])] = '%d' % sample_n_tail[i][j]
            for j in xrange(n_samples):
                row[('Tail',names[j])] = str_na(sample_tail[i][j])
            for j in xrange(n_samples):
                row[('Tail_sd',names[j])] = str_na(sample_sd_tail[i][j])
            for quantile in sample_quantile_tail:
                for j in xrange(n_samples):
                    row[('Tail_quantile_%d'%quantile,names[j])] = str_na(sample_quantile_tail[quantile][i][j])
            for j in xrange(len(names)):
                row[('Proportion',names[j])] = str_na(sample_prop[i][j])
            yield row
    io.write_csv(work/'counts.csv', counts_iter(), comments=comments)

    def write_csv_matrix(filename, matrix):
        # Emit one feature-by-sample matrix as a simple CSV.
        def emitter():
            for i in xrange(n_features):
                row = collections.OrderedDict()
                row["Feature"] = annotations[i].get_id()
                for j in xrange(n_samples):
                    row[names[j]] = str_na(matrix[i][j])
                yield row
        io.write_csv(filename, emitter())

    write_csv_matrix(work/'read_count.csv', sample_n)
    write_csv_matrix(work/'tail_count.csv', sample_n_tail)
    write_csv_matrix(work/'tail.csv', sample_tail)
    write_csv_matrix(work/'tail_sd.csv', sample_sd_tail)
    for quantile in sample_quantile_tail:
        write_csv_matrix(work/('tail_quantile_%d.csv'%quantile), sample_quantile_tail[quantile])

    #def raw_columns():
    #    for i in xrange(n_samples):
    #        row = collections.OrderedDict()
    #        row['Sample'] = names[i]
    #        for j in xrange(max_length):
    #            row['length-%d' % j] = str(i*max_length+j+1) #For R+, so 1 based
    #        yield row
    #io.write_csv(work/'raw-columns.csv', raw_columns())
    #
    ##Somewhat inefficient
    #def raw():
    #    for i in xrange(n_features):
    #        row = collections.OrderedDict()
    #        row['Feature'] = annotations[i].get_id()
    #        for j in xrange(n_samples):
    #            for k in xrange(max_length):
    #                row['%d %s' % (k,names[j])] = str( counts[i][j][1][k] )
    #        yield row
    #io.write_csv(work/'raw.csv', raw())

    def pooled():
        # Histogram per feature, summed over all samples.
        for i in xrange(n_features):
            row = collections.OrderedDict()
            row['Feature'] = annotations[i].get_id()
            for j in xrange(max_length):
                row[str(j)] = str( sum( counts[i][k][1][j] for k in xrange(n_samples) ) )
            yield row
    io.write_csv(work/'pooled.csv', pooled())
def report_logs(self, name, logs, filter=lambda sample, field: True):
    """Mine `logs` into a table, save it as a CSV in the report workspace,
    and emit a link to it in the report."""
    csv_basename = self.file_prefix + name + '.csv'
    filename = self.workspace / csv_basename
    table = mine_logs(logs, filter)
    io.write_csv(filename, table)
    self.p(self.href(filename))
def run(self):
    """Run the full batch analysis: process each sample, normalize, produce
    genewise/peakwise expression analyses and differential tests, then
    assemble an HTML report.

    Fixes over the previous revision:
    - `base = os.path.split(self.genome_dir)[1]` referenced a nonexistent
      attribute; it now uses the local `genome_dir` assigned just above
      (would have raised AttributeError whenever include_genome was set).
    - Removed the unused local `interleaved`.
    """
    names = [ sample.output_dir for sample in self.samples ]
        #os.path.splitext(os.path.split(item)[1])[0]
        #for item in self.reads
        #]

    reference = reference_directory.Reference(self.reference, must_exist=True)

    # Workspace layout under the output directory.
    workspace = io.Workspace(self.output_dir, must_exist=False)
    samplespace = io.Workspace(workspace/'samples', must_exist=False)
    plotspace = io.Workspace(workspace/'plots', must_exist=False)
    expressionspace = io.Workspace(workspace/'expression', must_exist=False)
    testspace = io.Workspace(workspace/'test', must_exist=False)
    testspace_dedup = io.Workspace(workspace/'test-dedup', must_exist=False)

    file_prefix = self.file_prefix
    if file_prefix and not file_prefix.endswith('-'):
        file_prefix += '-'

    #dirs = [
    #    workspace/item
    #    for item in names
    #]

    samples = [ ]
    for sample in self.samples:
        samples.append(sample(
            samplespace / sample.output_dir,
            reference = self.reference,
            ))

    dirs = [ item.output_dir for item in samples ]
    polya_dirs = [ item + '-polyA' for item in dirs ]

    clipper_logs = [ join(item.output_dir, 'clipped_reads_log.txt') for item in samples ]
    filter_logs = [ join(item.output_dir, 'filter_log.txt') for item in samples ]
    filter_polya_logs = [ join(item.output_dir + '-polyA', 'filter_log.txt') for item in samples ]

    #filter_logs = [ item.get_filter_action().log_filename() for item in samples ]
    #filter_polya_logs = [ item.get_polya_filter_action().log_filename() for item in samples ]

    analyse_template = tail_lengths.Analyse_tail_counts(
        working_dirs = dirs,
        saturation = 0,
        extension = self.extension,
        annotations = reference/'reference.gff',
        types = 'gene',
        )

    # Stage 1: align/filter each sample.
    with nesoni.Stage() as stage:
        for item in samples:
            item.process_make(stage)

    # Normalization, plus a -polyA variant of the norm table.
    nesoni.Norm_from_samples(
        workspace/'norm',
        working_dirs = dirs
        ).make()

    def writer():
        for row in io.read_table(workspace/'norm.csv'):
            row['Name'] = row['Name']+'-polyA'
            yield row
    io.write_csv(workspace/'norm-polyA.csv', writer(), comments=['Normalization'])

    # Stage 2: plots, genewise expression (with and without dedup), peaks.
    with nesoni.Stage() as stage:
        if self.include_plots:
            for plot_name, directories, norm_filename in [
                    ('all', dirs, workspace/'norm.csv'),
                    ('polyA', polya_dirs, workspace/'norm-polyA.csv'),
                    ]:
                nesoni.IGV_plots(
                    plotspace/plot_name,
                    working_dirs = directories,
                    label_prefix = plot_name+' ',
                    raw = True,
                    norm = True,
                    genome = reference.get_genome_filename(),
                    norm_file = norm_filename,
                    #delete_igv = False,
                    ).process_make(stage)

        analyse_gene_counts_0 = analyse_template(
            output_dir = expressionspace/'genewise',
            saturation = 0,
            extension = self.extension,
            title = 'Genewise expression - ' + self.title,
            file_prefix = file_prefix+'genewise-',
            )
        analyse_gene_counts_0.process_make(stage)

        analyse_gene_counts_1 = analyse_template(
            output_dir = expressionspace/'genewise-dedup',
            saturation = 1,
            title = 'Genewise expression with read deduplication - ' + self.title,
            file_prefix = file_prefix+'genewise-dedup-',
            )
        analyse_gene_counts_1.process_make(stage)

        stage.process(self._run_peaks,
            workspace=workspace,
            expressionspace=expressionspace,
            reference=reference,
            polya_dirs=polya_dirs,
            analyse_template=analyse_template,
            file_prefix=file_prefix,
            )

    # Stage 3: differential tests (plain and deduplicated).
    with nesoni.Stage() as stage:
        for test in self.tests:
            test(
                output_dir = testspace/test.output_dir,
                analysis = self.output_dir
                ).process_make(stage)
            test(
                output_dir = testspace_dedup/test.output_dir,
                analysis = self.output_dir,
                dedup = True,
                ).process_make(stage)

    #===============================================
    #                   Report
    #===============================================

    r = reporting.Reporter(os.path.join(self.output_dir, 'report'), self.title, self.file_prefix)

    r.heading('Alignment to reference')

    r.report_logs('alignment-statistics',
        #[ workspace/'stats.txt' ] +
        clipper_logs + filter_logs + #filter_polya_logs +
        [ expressionspace/('genewise','aggregate-tail-counts_log.txt') ],
        filter=lambda sample, field: (
            field not in [
                'fragments','fragments aligned to the reference','reads kept',
                'average depth of coverage, ambiguous',
                'average depth of coverage, unambiguous',
                ]
            ),
        )

    if self.include_plots:
        r.heading('IGV plots')
        r.p('These files show the depth of coverage. They can be viewed with the IGV genome browser.')

        genome_files = [ ]
        if self.include_genome:
            genome_files.append(reference.get_genome_filename())
            genome_dir = reference.get_genome_dir()
            # Bug fix: previously read self.genome_dir, which is not set
            # anywhere in this class; the local genome_dir was intended.
            base = os.path.split(genome_dir)[1]
            for filename in os.listdir(genome_dir):
                genome_files.append((
                    os.path.join(genome_dir, filename),
                    os.path.join(base, filename)
                    ))

        r.p(r.tar('igv-plots',
            genome_files +
            glob.glob(plotspace/'*.tdf')
            ))

    if self.include_bams:
        r.heading('BAM files')
        r.p('These BAM files contain the alignments of reads to the reference sequences.')
        r.p('Reads with a poly(A) tail have an \'AN\' attribute giving the length of non-templated poly(A) sequence. '
            'Tail-tools only treats a read as having a tail if this length is at least 4.')

        bam_files = [ ]
        for name in names:
            bam_files.append( (samplespace/(name,'alignments_filtered_sorted.bam'),name+'.bam') )
            bam_files.append( (samplespace/(name,'alignments_filtered_sorted.bam.bai'),name+'.bam.bai') )
        r.p(r.tar('bam-files', bam_files))

    r.heading('Genewise expression')

    io.symbolic_link(source=expressionspace/('genewise','report'),link_name=r.workspace/'genewise')
    r.p('<a href="genewise/index.html">&rarr; Genewise expression</a>'.replace('&rarr;','\xe2\x86\x92') if False else '<a href="genewise/index.html">\xe2\x86\x92 Genewise expression</a>')
    io.symbolic_link(source=expressionspace/('genewise-dedup','report'),link_name=r.workspace/'genewise-dedup')
    r.p('<a href="genewise-dedup/index.html">\xe2\x86\x92 Genewise expression with read deduplication</a>')

    r.heading('Peakwise expression')

    web.Geneview_webapp(r.workspace/'view').run()

    peak_filename = expressionspace/('peakwise','features-with-data.gff')
    n_peaks = len(list(annotation.read_annotations(peak_filename)))
    r.p('%d peaks called (%d poly(A) reads were required to call a peak).'
        % (n_peaks, self.peak_min_depth))
    r.p(r.get(peak_filename, name='peaks.gff') + ' - peaks called')

    #if self.groups:
    #    r.subheading('Peak shift between groups')
    #    r.p(r.get(workspace/('peak-shift','grouped.csv')) + ' - genes with a potential peak shift')
    #    r.get(workspace/('peak-shift','grouped.json'))
    #    r.subheading('Peak shift between samples')
    #    r.p(r.get(workspace/('peak-shift','individual.csv')) + ' - genes with a potential peak shift')
    #    r.get(workspace/('peak-shift','individual.json'))

    io.symbolic_link(source=expressionspace/('peakwise','report'),link_name=r.workspace/'peakwise')
    r.p('<a href="peakwise/index.html">\xe2\x86\x92 Peakwise expression</a>')
    io.symbolic_link(source=expressionspace/('peakwise-dedup','report'),link_name=r.workspace/'peakwise-dedup')
    r.p('<a href="peakwise-dedup/index.html">\xe2\x86\x92 Peakwise expression with read deduplication</a>')

    if self.tests:
        r.heading('Differential tests')
        for test in self.tests:
            io.symbolic_link(source=testspace/test.output_dir,link_name=r.workspace/('test-'+test.output_dir))
            io.symbolic_link(source=testspace_dedup/test.output_dir,link_name=r.workspace/('test-dedup-'+test.output_dir))
            r.p('<a href="test-%s">\xe2\x86\x92 %s</a> '
                ' <a href="test-dedup-%s" style="font-size: 66%%">[\xe2\x86\x92 Deduplicated version]</a>'
                % (test.output_dir, test.get_title(), test.output_dir))

    r.heading('Gene viewers')
    r.p('Having identified interesting genes from heatmaps and differential tests above, '
        'these viewers allow specific genes to be examined in detail.')

    if self.groups:
        r.p('<a href="view.html?json=%sgrouped.json">\xe2\x86\x92 Gene viewer, grouped samples</a>' % r.file_prefix)
    r.p('<a href="view.html?json=%sindividual.json">\xe2\x86\x92 Gene viewer, individual samples</a>' % r.file_prefix)

    r.write('<p/><hr>\n')
    r.p('Note: Use deduplicated versions with care. '
        'They may possibly provide more significant results, however they are less quantitative. '
        'Read deduplication involves throwing away a large amount of data, much of which will not be a technical artifact. '
        'Deduplicated versions might best be viewed as a check on data quality.')
    r.p('This set of genes was used in the analysis:')
    r.p(r.get(reference/'reference.gff') + ' - Reference annotations in GFF3 format')
    r.p(r.get(reference/'utr.gff') + ' - 3\' UTR regions')
    r.p('tail-tools version '+tail_tools.VERSION)
    r.p('nesoni version '+nesoni.VERSION)
    #r.p('SHRiMP version '+grace.get_shrimp_2_version())
    r.close()
def run(self):
    """Aggregate per-sample tail-count pickles into CSV tables and a GFF file.

    (Reformatted duplicate of the same aggregation routine elsewhere in this
    file.)  Loads each pickle (name, tags, per-feature hit lists), builds
    per-feature tail-length histograms, then writes counts.csv, per-statistic
    matrix CSVs, pooled.csv and features-with-data.gff into the workspace.
    """
    work = self.get_workspace()
    data = []
    names = []
    sample_tags = []

    old = grace.status("Loading pickles")

    max_length = 1
    for i, item in enumerate(self.pickles):
        grace.status("Loading " + os.path.basename(item))
        f = io.open_possibly_compressed_file(item)
        name, tags, datum = pickle.load(f)
        f.close()
        data.append(datum)
        names.append(name)
        sample_tags.append(tags)

        # Longest tail in any sample (+1 so it is a valid histogram index).
        # ValueError means this sample had no hits at all.
        try:
            max_length = max(
                max_length,
                max(item[0]  #tail_length
                    for feature in datum
                    for item in feature.hits) + 1)
        except ValueError:
            pass

        # All pickles share one feature list; the first is kept as canonical.
        if i == 0:
            annotations = datum

    grace.status(old)

    self.log.log("Maximum tail length %d\n" % max_length)

    # Turn raw hit lists into histograms, keeping only hits with at least
    # self.adaptor adaptor bases.
    for i in xrange(len(names)):
        n_alignments = 0
        for feature in data[i]:
            feature.total_count = len(feature.hits)
            feature.tail_counts = [0] * max_length
            n_alignments += feature.total_count
            for tail_length, adaptor_bases in feature.hits:
                if adaptor_bases >= self.adaptor:
                    feature.tail_counts[tail_length] += 1
            # Hit lists are large; drop them once histogrammed.
            del feature.hits
        self.log.datum(names[i], 'Alignments to features', n_alignments)

    counts = []  # [feature][sample](total_count, [taillength])
    for item in data:
        assert len(item) == len(data[0])
    for row in itertools.izip(*data):
        this_counts = [(item.total_count, item.tail_counts) for item in row]
        counts.append(this_counts)

    n_features = len(counts)
    n_samples = len(data)

    sample_n = [[0] * n_samples for i in xrange(n_features)
                ]  # [feature][sample] Total count
    sample_n_tail = [[0] * n_samples for i in xrange(n_features)
                     ]  # [feature][sample] Polya count
    sample_prop = [
        [None] * n_samples for i in xrange(n_features)
    ]  # [feature][sample] Proportion of reads with tail (deprecated)
    sample_tail = [[None] * n_samples for i in xrange(n_features)
                   ]  # [feature][sample] Mean tail length in each sample
    sample_sd_tail = [
        [None] * n_samples for i in xrange(n_features)
    ]  # [feature][sample] Std dev tail length in each sample
    sample_total_tail = [[0] * n_samples for i in xrange(n_features)]

    sample_quantile_tail = collections.OrderedDict(
        (item, [[None] * n_samples for i in xrange(n_features)])
        for item in [25, 50, 75, 100])

    overall_n = [0] * n_features  # [feature] Overall count
    overall_prop = [
        None
    ] * n_features  # [feature] Overall proportion with tail
    overall_tail = [None] * n_features  # [feature] Overall mean tail length
    overall_n_tail = [0] * n_features  # [feature] Overall polya count

    for i, row in enumerate(counts):
        for j, (this_this_n, item) in enumerate(row):
            sample_n[i][j] = this_this_n
            # Reads of length >= self.tail count as "tailed".
            sample_n_tail[i][j] = sum(item[self.tail:])
            sample_total_tail[i][j] = sum(
                item[k] * k for k in xrange(self.tail, max_length))

            if sample_n[i][j] >= 1:
                sample_prop[i][j] = float(
                    sample_n_tail[i][j]) / sample_n[i][j]

            if sample_n_tail[i][j] >= 1:
                sample_tail[i][j] = float(
                    sample_total_tail[i][j]) / sample_n_tail[i][j]

                # Quantiles: walk the histogram until the cumulative count
                # covers the quantile's share of tailed reads.
                for quantile in sample_quantile_tail:
                    counter = sample_n_tail[i][j] * quantile / 100.0
                    for k in xrange(self.tail, max_length):
                        counter -= item[k]
                        if counter <= 0: break
                    sample_quantile_tail[quantile][i][j] = k

            # Sample standard deviation needs at least two tailed reads.
            if sample_n_tail[i][j] >= 2:
                sample_sd_tail[i][j] = math.sqrt(
                    float(
                        sum(item[k] * ((k - sample_tail[i][j])**2)
                            for k in xrange(self.tail, max_length))) /
                    (sample_n_tail[i][j] - 1))

        overall_n[i] = sum(sample_n[i])
        overall_n_tail[i] = sum(sample_n_tail[i])
        if overall_n[i] >= 1:
            overall_prop[i] = float(sum(sample_n_tail[i])) / overall_n[i]
        if overall_n_tail[i] >= 1:
            overall_tail[i] = float(sum(
                sample_total_tail[i])) / overall_n_tail[i]

    for i, name in enumerate(names):
        this_total = sum(item[i] for item in sample_total_tail)
        this_n = sum(item[i] for item in sample_n_tail)
        if this_n:
            self.log.datum(name, 'Average poly-A tail',
                           float(this_total) / this_n)

    for i, name in enumerate(names):
        this_total = sum(item[i] for item in sample_n_tail)
        this_n = sum(item[i] for item in sample_n)
        if this_n:
            self.log.datum(name, 'Average proportion of reads with tail',
                           float(this_total) / this_n)

    # Annotate features with overall statistics and a heat color; write GFF3.
    with open(work / 'features-with-data.gff', 'wb') as f:
        annotation.write_gff3_header(f)
        for i, item in enumerate(annotations):
            item.attr['reads'] = str(overall_n[i])
            item.attr['reads_with_tail'] = str(overall_n_tail[i])
            # NOTE(review): truthiness test — a value of exactly 0 would also
            # be written as 'NA', not just None.
            item.attr['mean_tail'] = '%.1f' % overall_tail[
                i] if overall_tail[i] else 'NA'
            item.attr['proportion_with_tail'] = '%.2f' % overall_prop[
                i] if overall_prop[i] else 'NA'
            if overall_tail[i] is None:
                item.attr['color'] = '#444444'
            else:
                # Map mean tail length onto a red->green->blue gradient.
                a = (overall_tail[i] - self.tail) / max(
                    1, max_length - self.tail)
                item.attr['color'] = '#%02x%02x%02x' % (int(
                    a * 255), int(
                        (1 - abs(a * 2 - 1)) * 255), 255 - int(a * 255))
            #item.attr['color'] = ...
            print >> f, item.as_gff()

    comments = ['#Counts'] + [
        '#sampleTags=' + ','.join(tags) for tags in sample_tags
    ] + [
        '"Tail_count" group is number of reads with tail',
        '"Tail" group is mean tail per sample',
        '"Proportion" group is proportion of reads with tail',
    ]

    # Optional annotation columns, emitted only if any feature carries them.
    have_biotype = any("Biotype" in item.attr for item in annotations)
    have_parent = any("Parent" in item.attr for item in annotations)
    have_relation = any("Relation" in item.attr for item in annotations)
    have_antisense = any("Antisense_parent" in item.attr
                         for item in annotations)

    def counts_iter():
        # One row per feature: counts, annotation columns, then per-sample
        # tail statistics keyed by (group, sample) column tuples.
        for i in xrange(n_features):
            row = collections.OrderedDict()
            row['Feature'] = annotations[i].get_id()
            for j in xrange(n_samples):
                row[('Count', names[j])] = '%d' % sample_n[i][j]

            row[('Annotation',
                 'Length')] = annotations[i].end - annotations[i].start
            row[('Annotation',
                 'gene')] = annotations[i].attr.get('Name', '')
            row[('Annotation',
                 'product')] = annotations[i].attr.get('Product', '')
            if have_biotype:
                row[('Annotation',
                     'biotype')] = annotations[i].attr.get('Biotype', '')
            if have_parent:
                row[('Annotation',
                     'parent')] = annotations[i].attr.get('Parent', '')
            if have_relation:
                row[('Annotation', 'relation')] = annotations[i].attr.get(
                    'Relation', '')
            if have_antisense:
                row[('Annotation',
                     'antisense_gene')] = annotations[i].attr.get(
                         'Antisense_name', '')
                row[('Annotation',
                     'antisense_product')] = annotations[i].attr.get(
                         'Antisense_product', '')
                row[('Annotation',
                     'antisense_biotype')] = annotations[i].attr.get(
                         'Antisense_biotype', '')
                row[('Annotation',
                     'antisense_parent')] = annotations[i].attr.get(
                         'Antisense_parent', '')
            row[('Annotation', 'chromosome')] = str(annotations[i].seqid)
            row[('Annotation', 'strand')] = str(annotations[i].strand)
            row[('Annotation', 'start')] = str(annotations[i].start + 1)
            row[('Annotation', 'end')] = str(annotations[i].end)

            row[('Annotation', 'reads')] = str(overall_n[i])
            row[('Annotation', 'reads-with-tail')] = str(overall_n_tail[i])
            row[('Annotation', 'mean-tail')] = str_na(overall_tail[i])
            row[('Annotation',
                 'proportion-with-tail')] = str_na(overall_prop[i])
            for j in xrange(n_samples):
                row[('Tail_count', names[j])] = '%d' % sample_n_tail[i][j]
            for j in xrange(n_samples):
                row[('Tail', names[j])] = str_na(sample_tail[i][j])
            for j in xrange(n_samples):
                row[('Tail_sd', names[j])] = str_na(sample_sd_tail[i][j])
            for quantile in sample_quantile_tail:
                for j in xrange(n_samples):
                    row[('Tail_quantile_%d' % quantile,
                         names[j])] = str_na(
                             sample_quantile_tail[quantile][i][j])
            for j in xrange(len(names)):
                row[('Proportion', names[j])] = str_na(sample_prop[i][j])
            yield row

    io.write_csv(work / 'counts.csv', counts_iter(), comments=comments)

    def write_csv_matrix(filename, matrix):
        # Emit one feature-by-sample matrix as a simple CSV.
        def emitter():
            for i in xrange(n_features):
                row = collections.OrderedDict()
                row["Feature"] = annotations[i].get_id()
                for j in xrange(n_samples):
                    row[names[j]] = str_na(matrix[i][j])
                yield row

        io.write_csv(filename, emitter())

    write_csv_matrix(work / 'read_count.csv', sample_n)
    write_csv_matrix(work / 'tail_count.csv', sample_n_tail)
    write_csv_matrix(work / 'tail.csv', sample_tail)
    write_csv_matrix(work / 'tail_sd.csv', sample_sd_tail)
    for quantile in sample_quantile_tail:
        write_csv_matrix(work / ('tail_quantile_%d.csv' % quantile),
                         sample_quantile_tail[quantile])

    #def raw_columns():
    #    for i in xrange(n_samples):
    #        row = collections.OrderedDict()
    #        row['Sample'] = names[i]
    #        for j in xrange(max_length):
    #            row['length-%d' % j] = str(i*max_length+j+1) #For R+, so 1 based
    #        yield row
    #io.write_csv(work/'raw-columns.csv', raw_columns())
    #
    ##Somewhat inefficient
    #def raw():
    #    for i in xrange(n_features):
    #        row = collections.OrderedDict()
    #        row['Feature'] = annotations[i].get_id()
    #        for j in xrange(n_samples):
    #            for k in xrange(max_length):
    #                row['%d %s' % (k,names[j])] = str( counts[i][j][1][k] )
    #        yield row
    #io.write_csv(work/'raw.csv', raw())

    def pooled():
        # Tail-length histogram per feature, summed over all samples.
        for i in xrange(n_features):
            row = collections.OrderedDict()
            row['Feature'] = annotations[i].get_id()
            for j in xrange(max_length):
                row[str(j)] = str(
                    sum(counts[i][k][1][j] for k in xrange(n_samples)))
            yield row

    io.write_csv(work / 'pooled.csv', pooled())
def run(self):
    """Aggregate per-sample tail-count pickles, with optional deduplication.

    Older variant of the aggregation routine: hits are (rel_start, rel_end,
    tail_length) tuples, and `self.saturation` caps how many alignments
    sharing an identical start/end position contribute full weight.  Writes
    counts.csv, raw-columns.csv, raw.csv, pooled.csv and
    features-with-data.gff into the workspace.
    """
    work = self.get_workspace()
    data = [ ]
    names = [ ]
    sample_tags = [ ]
    for item in self.pickles:
        f = io.open_possibly_compressed_file(item)
        name, tags, datum = pickle.load(f)
        f.close()
        data.append(datum)
        names.append(name)
        sample_tags.append(tags)

    annotations = data[0]

    all_lengths = [
        #tail_length
        item[2]
        for sample in data
        for feature in sample
        #for rel_start,rel_end,tail_length in feature.hits
        for item in feature.hits
        ]
    if all_lengths:
        max_length = max(all_lengths)+1
    else:
        max_length = 1
    del all_lengths

    for i, sample in enumerate(data):
        n_alignments = 0
        n_duplicates = 0
        n_good = 0
        for feature in sample:
            feature.tail_counts = [ 0.0 ] * max_length

            # Group hits by identical (rel_start, rel_end); duplicates above
            # the saturation level are down-weighted rather than dropped.
            buckets = collections.defaultdict(list)
            for item in feature.hits:
                rel_start,rel_end,tail_length = item[:3]
                buckets[ (rel_start,rel_end) ].append(tail_length)
            for item in buckets.values():
                n_alignments += len(item)
                n_good += 1
                if self.saturation < 1 or len(item) <= self.saturation:
                    weight = 1.0
                else:
                    # Scale so the bucket contributes `saturation` in total.
                    weight = float(self.saturation) / len(item)
                    n_duplicates += len(item)
                for item2 in item:
                    feature.tail_counts[item2] += weight

        self.log.datum(names[i], 'Alignments to features', n_alignments)
        if self.saturation >= 1:
            self.log.datum(names[i], 'Proportion of alignments with duplicate start and end position', float(n_duplicates)/max(1,n_alignments))
            self.log.datum(names[i], 'Alignments to features after deduplication', n_good)

    counts = [ ]  # [feature][sample][taillength]
    for item in data:
        assert len(item) == len(data[0])
    for row in itertools.izip(*data):
        this_counts = [ item.tail_counts for item in row ]
        counts.append(this_counts)

    sample_n = [ ]          # [feature][sample] Total count
    sample_n_tail = [ ]     # [feature][sample] Polya count
    sample_prop = [ ]       # [feature][sample] Proportion of reads with tail
    sample_tail = [ ]       # [feature][sample] Mean tail length in each sample
    sample_total_tail = [ ]

    overall_n = [ ]
    overall_prop = [ ]      # [feature] Overall proportion with tail
    overall_tail = [ ]      # [feature] Overall mean tail length
    overall_n_tail = [ ]    # [feature] Overall polya count
    overall_total_tail = [ ]

    for row in counts:
        this_n = [ ]
        this_n_tail = [ ]
        this_prop = [ ]
        this_tail = [ ]
        this_total_tail = [ ]
        for item in row:
            this_this_n = sum(item)
            this_n.append( this_this_n )

            # Only tails of length >= self.tail count as "tailed".
            this_this_n_tail = sum(item[self.tail:])
            this_n_tail.append( this_this_n_tail )

            this_this_total_tail = sum( item[i]*i for i in xrange(self.tail,max_length) )
            this_total_tail.append( this_this_total_tail )

            if this_this_n < 1:
                this_prop.append(None)
            else:
                this_prop.append(float(this_this_n_tail)/this_this_n)

            if this_this_n_tail < 1:
                this_tail.append(None)
            else:
                this_tail.append(this_this_total_tail/this_this_n_tail)

        sample_n.append(this_n)
        sample_n_tail.append(this_n_tail)
        sample_prop.append(this_prop)
        sample_tail.append(this_tail)
        sample_total_tail.append(this_total_tail)

        overall_n.append(sum(this_n))
        overall_n_tail.append(sum(this_n_tail))
        overall_total_tail.append(sum(this_total_tail))
        if sum(this_n) < 1:
            overall_prop.append(None)
        else:
            overall_prop.append(float(sum(this_n_tail))/sum(this_n))
        if sum(this_n_tail) < 1:
            overall_tail.append(None)
        else:
            overall_tail.append(float(sum(this_total_tail))/sum(this_n_tail))

    for i, name in enumerate(names):
        this_total = sum( item[i] for item in sample_total_tail )
        this_n = sum( item[i] for item in sample_n_tail )
        if this_n:
            self.log.datum(name, 'Average poly-A tail', float(this_total)/this_n)

    for i, name in enumerate(names):
        this_total = sum( item[i] for item in sample_n_tail )
        this_n = sum( item[i] for item in sample_n )
        if this_n:
            self.log.datum(name, 'Average proportion of reads with tail', float(this_total)/this_n)

    #max_length = max(max(len(item) for item in row) for row in counts)
    #
    #for row in counts:
    #    for item in row:
    #        while len(item) < max_length:
    #            item.append(0)

    # Annotate features with overall statistics and a heat color; write GFF3.
    with open(work/'features-with-data.gff','wb') as f:
        annotation.write_gff3_header(f)
        for i, item in enumerate(annotations):
            item.attr['reads'] = str(overall_n[i])
            item.attr['reads_with_tail'] = str(overall_n_tail[i])
            # NOTE(review): truthiness test — a value of exactly 0 would also
            # be written as 'NA', not just None.
            item.attr['mean_tail'] = '%.1f'%overall_tail[i] if overall_tail[i] else 'NA'
            item.attr['proportion_with_tail'] = '%.2f'%overall_prop[i] if overall_prop[i] else 'NA'
            if overall_tail[i] is None:
                item.attr['color'] = '#444444'
            else:
                # Map mean tail length onto a red->green->blue gradient.
                a = (overall_tail[i]-self.tail)/max(1,max_length-self.tail)
                item.attr['color'] = '#%02x%02x%02x' % (int(a*255),int((1-abs(a*2-1))*255),255-int(a*255))
            #item.attr['color'] = ...
            print >> f, item.as_gff()

    comments = [ '#Counts' ] + [
        '#sampleTags='+','.join(tags)
        for tags in sample_tags
        ] + [
        '"Tail_count" group is number of reads with tail',
        '"Tail" group is mean tail per sample',
        '"Proportion" group is proportion of reads with tail',
        ]

    def counts_iter():
        # One row per feature: counts, annotation columns, then per-sample
        # tail statistics keyed by (group, sample) column tuples.
        for i in xrange(len(counts)):
            row = collections.OrderedDict()
            row['Feature'] = annotations[i].get_id()
            for j in xrange(len(names)):
                row[('Count',names[j])] = '%d' % sample_n[i][j]

            row[('Annotation','Length')] = annotations[i].end - annotations[i].start
            row[('Annotation','gene')] = annotations[i].attr.get('Name','')
            row[('Annotation','product')] = annotations[i].attr.get('Product','')
            #row[('Annotation','Strand')] = str(annotations[i].strand)
            row[('Annotation','reads')] = str(overall_n[i])
            row[('Annotation','reads-with-tail')] = str(overall_n_tail[i])
            row[('Annotation','mean-tail')] = str(overall_tail[i]) if overall_tail[i] is not None else 'NA'
            row[('Annotation','proportion-with-tail')] = str(overall_prop[i]) if overall_prop[i] is not None else 'NA'
            for j in xrange(len(names)):
                row[('Tail_count',names[j])] = '%d' % sample_n_tail[i][j]
            for j in xrange(len(names)):
                row[('Tail',names[j])] = str(sample_tail[i][j]) if sample_tail[i][j] is not None else 'NA'
            for j in xrange(len(names)):
                row[('Proportion',names[j])] = str(sample_prop[i][j]) if sample_prop[i][j] is not None else 'NA'
            yield row
    io.write_csv(work/'counts.csv', counts_iter(), comments=comments)

    def raw_columns():
        # Column map for downstream tools: 1-based column indices per sample.
        for i in xrange(len(names)):
            row = collections.OrderedDict()
            row['Sample'] = names[i]
            for j in xrange(max_length):
                row['length-%d' % j] = str(i*max_length+j+1) #For R+, so 1 based
            yield row
    io.write_csv(work/'raw-columns.csv', raw_columns())

    #Somewhat inefficient
    def raw():
        # Full histogram, one column per (length, sample) pair.
        for i in xrange(len(counts)):
            row = collections.OrderedDict()
            row['Feature'] = annotations[i].get_id()
            for j in xrange(len(names)):
                for k in xrange(max_length):
                    row['%d %s' % (k,names[j])] = str( counts[i][j][k] )
            yield row
    io.write_csv(work/'raw.csv', raw())

    def pooled():
        # Tail-length histogram per feature, summed over all samples.
        for i in xrange(len(counts)):
            row = collections.OrderedDict()
            row['Feature'] = annotations[i].get_id()
            for j in xrange(max_length):
                row[str(j)] = str( sum( counts[i][k][j] for k in xrange(len(names)) ) )
            yield row
    io.write_csv(work/'pooled.csv', pooled())