def write_csv_matrix(filename, matrix):
    def emitter():
        for i in xrange(n_features):
            row = collections.OrderedDict()
            row["Feature"] = annotations[i].get_id()
            for j in xrange(n_samples):
                row[names[j]] = str_na(matrix[i][j])
            yield row
    io.write_csv(filename, emitter())
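All of these snippets call a str_na helper that the excerpts never define. A minimal sketch consistent with how it is used here, rendering missing statistics as 'NA' in the CSV output (an assumption, not the actual tail-tools code):

    def str_na(value):
        # Assumed behaviour: None becomes 'NA', anything else its string form.
        if value is None:
            return 'NA'
        return str(value)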
Example 2
        def write_csv_matrix(filename, matrix):
            def emitter():
                for i in xrange(n_features):
                    row = collections.OrderedDict()
                    row["Feature"] = annotations[i].get_id()
                    for j in xrange(n_samples):
                        row[names[j]] = str_na(matrix[i][j])
                    yield row

            io.write_csv(filename, emitter())
Example 3
def report_logs(self, name, logs, filter=(lambda sample, field: True), renaming={}):
    table = mine_logs(logs, filter)

    if name:
        filename = self.workspace / (self.file_prefix + name + '.csv')
        io.write_csv(filename, table)
        self.p(self.href(filename))

    if table:
        table = mine_logs(logs, filter, commas=True)

        self.write('<table>\n')
        self.write('<tr>\n')
        for key in table[0].keys():
            self.write('<th>'+key+'</th>')
        self.write('</tr>\n')
        for row in table:
            self.write('<tr>\n')
            for i,value in enumerate(row.values()):
                if i == 0:
                    value = renaming.get(value,value)
                self.write('<td>'+value+'</td>')
            self.write('</tr>\n')
        self.write('</table>\n')
Example 4
    def run(self):
        assert len(self.pickles) > 0, "No samples to count."
        
        work = self.get_workspace()
        
        data = [ ]
        names = [ ]
        sample_tags = [ ]
        
        old = grace.status("Loading pickles")
        
        max_length = 1
        for i, item in enumerate(self.pickles):
            grace.status("Loading "+os.path.basename(item))
            f = io.open_possibly_compressed_file(item)
            name, tags, datum = pickle.load(f)
            f.close()
            data.append(datum)
            names.append(name)
            sample_tags.append(tags)
            
            try:
                max_length = max(max_length, max( 
                    item[0] #tail_length
                    for feature in datum
                    for item in feature.hits
                    ) + 1)
            except ValueError:
                pass
            
            if i == 0:
                annotations = datum
        
        grace.status(old)
        
        self.log.log("Maximum tail length %d\n" % max_length)

        for i in xrange(len(names)):        
            n_alignments = 0
            for feature in data[i]:
                feature.total_count = len(feature.hits)
                feature.tail_counts = [ 0 ] * max_length
                
                n_alignments += feature.total_count
                
                for tail_length, adaptor_bases in feature.hits:
                    if adaptor_bases >= self.adaptor:
                        feature.tail_counts[tail_length] += 1
                
                del feature.hits

            self.log.datum(names[i], 'Alignments to features', n_alignments)
                
        
        counts = [ ]  # [feature][sample](total_count, [taillength])
        
        for item in data: 
            assert len(item) == len(data[0])
        for row in itertools.izip(*data):
            this_counts = [ (item.total_count, item.tail_counts) for item in row ]
            counts.append(this_counts)
        
        n_features = len(counts)
        n_samples = len(data)
        
        sample_n = [ [0]*n_samples for i in xrange(n_features) ]        # [feature][sample]  Total count
        sample_n_tail = [ [0]*n_samples for i in xrange(n_features) ]   # [feature][sample]  Polya count
        sample_prop = [ [None]*n_samples for i in xrange(n_features) ]    # [feature][sample]  Proportion of reads with tail (deprecated)
        sample_tail = [ [None]*n_samples for i in xrange(n_features) ]    # [feature][sample]  Mean tail length in each sample
        sample_sd_tail = [ [None]*n_samples for i in xrange(n_features) ] # [feature][sample]  Std dev tail length in each sample
        sample_total_tail = [ [0]*n_samples for i in xrange(n_features) ]
        
        sample_quantile_tail = collections.OrderedDict( 
            (item, [ [None]*n_samples for i in xrange(n_features) ]) 
            for item in [25,50,75,100]
            )
        
        overall_n = [ 0 ]*n_features       # [feature]          Overall count
        overall_prop = [ None ]*n_features   # [feature]          Overall proportion with tail
        overall_tail = [ None ]*n_features   # [feature]          Overall mean tail length
        overall_n_tail = [ 0 ]*n_features  # [feature]          Overall polya count
        for i, row in enumerate(counts):
            for j, (this_this_n, item) in enumerate(row):
                sample_n[i][j] = this_this_n
                sample_n_tail[i][j] = sum(item[self.tail:])
                sample_total_tail[i][j] = sum( item[k]*k for k in xrange(self.tail,max_length) )

                if sample_n[i][j] >= 1:
                    sample_prop[i][j] = float(sample_n_tail[i][j])/sample_n[i][j]
                
                if sample_n_tail[i][j] >= 1:
                    sample_tail[i][j] = float(sample_total_tail[i][j])/sample_n_tail[i][j]
                
                    # Walk the tail-length histogram until the running count
                    # passes each requested quantile of the tailed reads.
                    for quantile in sample_quantile_tail:
                        counter = sample_n_tail[i][j] * quantile / 100.0
                        for k in xrange(self.tail, max_length):
                            counter -= item[k]
                            if counter <= 0: break
                        sample_quantile_tail[quantile][i][j] = k
                
                if sample_n_tail[i][j] >= 2:
                    sample_sd_tail[i][j] = math.sqrt(
                        float(sum( item[k]*((k-sample_tail[i][j])**2) for k in xrange(self.tail,max_length) ))
                        / (sample_n_tail[i][j]-1)
                        )
                    
            overall_n[i] = sum(sample_n[i])
            overall_n_tail[i] = sum(sample_n_tail[i])
            if overall_n[i] >= 1:
                overall_prop[i] = float(sum(sample_n_tail[i]))/overall_n[i]
            if overall_n_tail[i] >= 1:
                overall_tail[i] = float(sum(sample_total_tail[i]))/overall_n_tail[i]
             
        for i, name in enumerate(names):
            this_total = sum( item[i] for item in sample_total_tail )
            this_n = sum( item[i] for item in sample_n_tail )
            if this_n:
                self.log.datum(name, 'Average poly-A tail', float(this_total)/this_n)
                
        for i, name in enumerate(names):
            this_total = sum( item[i] for item in sample_n_tail )
            this_n = sum( item[i] for item in sample_n )
            if this_n:
                self.log.datum(name, 'Average proportion of reads with tail', float(this_total)/this_n)
        
        with open(work/'features-with-data.gff','wb') as f:
            annotation.write_gff3_header(f)
            for i, item in enumerate(annotations):
                item.attr['reads'] = str(overall_n[i])
                item.attr['reads_with_tail'] = str(overall_n_tail[i])
                item.attr['mean_tail'] = '%.1f'%overall_tail[i] if overall_tail[i] else 'NA'
                item.attr['proportion_with_tail'] = '%.2f'%overall_prop[i] if overall_prop[i] else 'NA'
                
                if overall_tail[i] is None:
                    item.attr['color'] = '#444444'
                else:
                    # Colour by mean tail length on a blue -> green -> red gradient.
                    a = (overall_tail[i]-self.tail)/max(1,max_length-self.tail)
                    item.attr['color'] = '#%02x%02x%02x' % (int(a*255),int((1-abs(a*2-1))*255),255-int(a*255))
                #item.attr['color'] = ...                
                print >> f, item.as_gff()
        
        
        comments = [ '#Counts' ] + [
            '#sampleTags='+','.join(tags)
            for tags in sample_tags
            ] + [
            '"Tail_count" group is number of reads with tail',
            '"Tail" group is mean tail per sample',
            '"Proportion" group is proportion of reads with tail',
            ]
            
        have_biotype = any("Biotype" in item.attr for item in annotations)
        have_parent = any("Parent" in item.attr for item in annotations)
        have_relation = any("Relation" in item.attr for item in annotations)
        have_antisense = any("Antisense_parent" in item.attr for item in annotations)

        def counts_iter():
            for i in xrange(n_features):
                row = collections.OrderedDict()
                row['Feature'] = annotations[i].get_id()
                for j in xrange(n_samples):
                    row[('Count',names[j])] = '%d' % sample_n[i][j]

                row[('Annotation','Length')] = annotations[i].end - annotations[i].start
                row[('Annotation','gene')] = annotations[i].attr.get('Name','')
                row[('Annotation','product')] = annotations[i].attr.get('Product','')
                if have_biotype:
                    row[('Annotation','biotype')] = annotations[i].attr.get('Biotype','')
                if have_parent:
                    row[('Annotation','parent')] = annotations[i].attr.get('Parent','')
                if have_relation:
                    row[('Annotation','relation')] = annotations[i].attr.get('Relation','')
                
                if have_antisense:
                    row[('Annotation','antisense_gene')] = annotations[i].attr.get('Antisense_name','')
                    row[('Annotation','antisense_product')] = annotations[i].attr.get('Antisense_product','')
                    row[('Annotation','antisense_biotype')] = annotations[i].attr.get('Antisense_biotype','')
                    row[('Annotation','antisense_parent')] = annotations[i].attr.get('Antisense_parent','')
                
                row[('Annotation','chromosome')] = str(annotations[i].seqid)
                row[('Annotation','strand')] = str(annotations[i].strand)
                row[('Annotation','start')] = str(annotations[i].start+1)
                row[('Annotation','end')] = str(annotations[i].end)
                
                row[('Annotation','reads')] = str(overall_n[i])
                row[('Annotation','reads-with-tail')] = str(overall_n_tail[i])
                row[('Annotation','mean-tail')] = str_na(overall_tail[i])
                row[('Annotation','proportion-with-tail')] = str_na(overall_prop[i])
                for j in xrange(n_samples):
                    row[('Tail_count',names[j])] = '%d' % sample_n_tail[i][j]
                for j in xrange(n_samples):
                    row[('Tail',names[j])] = str_na(sample_tail[i][j])
                for j in xrange(n_samples):
                    row[('Tail_sd',names[j])] = str_na(sample_sd_tail[i][j])
                
                for quantile in sample_quantile_tail:
                    for j in xrange(n_samples):
                        row[('Tail_quantile_%d'%quantile,names[j])] = str_na(sample_quantile_tail[quantile][i][j])                    
                
                for j in xrange(len(names)):
                    row[('Proportion',names[j])] = str_na(sample_prop[i][j])
                yield row
        io.write_csv(work/'counts.csv', counts_iter(), comments=comments)
        
        
        def write_csv_matrix(filename, matrix):
            def emitter():
                for i in xrange(n_features):
                    row = collections.OrderedDict()
                    row["Feature"] = annotations[i].get_id()
                    for j in xrange(n_samples):
                        row[names[j]] = str_na(matrix[i][j])
                    yield row
            io.write_csv(filename, emitter())
            
        write_csv_matrix(work/'read_count.csv', sample_n)
        write_csv_matrix(work/'tail_count.csv', sample_n_tail)
        write_csv_matrix(work/'tail.csv', sample_tail)
        write_csv_matrix(work/'tail_sd.csv', sample_sd_tail)
        for quantile in sample_quantile_tail:
            write_csv_matrix(work/('tail_quantile_%d.csv'%quantile), sample_quantile_tail[quantile])


        #def raw_columns():
        #    for i in xrange(n_samples):
        #        row = collections.OrderedDict()
        #        row['Sample'] = names[i]
        #        for j in xrange(max_length):
        #            row['length-%d' % j] = str(i*max_length+j+1) #For R+, so 1 based
        #        yield row
        #io.write_csv(work/'raw-columns.csv', raw_columns())
        #
        ##Somewhat inefficient        
        #def raw():
        #    for i in xrange(n_features):
        #        row = collections.OrderedDict()
        #        row['Feature'] = annotations[i].get_id()
        #        for j in xrange(n_samples):
        #            for k in xrange(max_length):
        #                row['%d %s' % (k,names[j])] = str( counts[i][j][1][k] )
        #        yield row
        #io.write_csv(work/'raw.csv', raw())
        
        def pooled():
            for i in xrange(n_features):
                row = collections.OrderedDict()
                row['Feature'] = annotations[i].get_id()
                for j in xrange(max_length):
                    row[str(j)] = str( sum( counts[i][k][1][j] for k in xrange(n_samples) ) )
                yield row
        io.write_csv(work/'pooled.csv', pooled())
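The quantile calculation in this example walks each feature's tail-length histogram until the running count passes the requested fraction of tailed reads. A self-contained sketch of the same computation, with hypothetical names (tail_counts, min_tail) standing in for the per-feature data:

    def tail_quantile(tail_counts, min_tail, quantile):
        # tail_counts[k] is the number of reads with tail length k;
        # only lengths >= min_tail count as having a tail.
        counter = sum(tail_counts[min_tail:]) * quantile / 100.0
        k = min_tail
        for k in range(min_tail, len(tail_counts)):
            counter -= tail_counts[k]
            if counter <= 0:
                break
        return k

    # e.g. tail_quantile([0, 0, 5, 3, 2], 2, 50) == 2   (median tail length)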
Example 5
def report_logs(self, name, logs, filter=lambda sample, field: True):
    filename = self.workspace / (self.file_prefix + name + '.csv')
    io.write_csv(filename, mine_logs(logs, filter))
    self.p(self.href(filename))
Example 6
def report_logs(self, name, logs, filter=lambda sample, field: True):
    filename = self.workspace / (self.file_prefix + name + '.csv')
    io.write_csv(filename, mine_logs(logs, filter))
    self.p(self.href(filename))
Example 7
    def run(self):
        names = [ sample.output_dir for sample in self.samples ]
            #os.path.splitext(os.path.split(item)[1])[0]
            #for item in self.reads
            #]
        
        reference = reference_directory.Reference(self.reference, must_exist=True)
        
        workspace = io.Workspace(self.output_dir, must_exist=False)
        samplespace = io.Workspace(workspace/'samples', must_exist=False)
        plotspace = io.Workspace(workspace/'plots', must_exist=False)
        expressionspace = io.Workspace(workspace/'expression', must_exist=False)
        testspace = io.Workspace(workspace/'test', must_exist=False)
        testspace_dedup = io.Workspace(workspace/'test-dedup', must_exist=False)
                
        file_prefix = self.file_prefix
        if file_prefix and not file_prefix.endswith('-'):
            file_prefix += '-'


        #dirs = [
        #    workspace/item
        #    for item in names
        #]

        samples = [ ]
        for sample in self.samples:
            samples.append(sample(
                samplespace / sample.output_dir,
                reference = self.reference,
                ))
        
        dirs = [ item.output_dir for item in samples ]
        polya_dirs = [ item + '-polyA' for item in dirs ]        
        interleaved = [ item2 for item in zip(dirs,polya_dirs) for item2 in item ]
        
        clipper_logs = [ join(item.output_dir, 'clipped_reads_log.txt') for item in samples ]
        filter_logs = [ join(item.output_dir, 'filter_log.txt') for item in samples ]
        filter_polya_logs = [ join(item.output_dir + '-polyA', 'filter_log.txt') for item in samples ]                
        #filter_logs = [ item.get_filter_action().log_filename() for item in samples ]
        #filter_polya_logs = [ item.get_polya_filter_action().log_filename() for item in samples ]

        analyse_template = tail_lengths.Analyse_tail_counts(
            working_dirs = dirs,
            saturation = 0,
            extension = self.extension,
            annotations = reference/'reference.gff',
            types = 'gene',
            )
        

        with nesoni.Stage() as stage:        
            for item in samples:
                item.process_make(stage)

        
        
        nesoni.Norm_from_samples(
            workspace/'norm',
            working_dirs = dirs
            ).make()

        def writer():
            for row in io.read_table(workspace/'norm.csv'):
                row['Name'] = row['Name']+'-polyA'
                yield row
        io.write_csv(workspace/'norm-polyA.csv', writer(), comments=['Normalization'])


        with nesoni.Stage() as stage:
            if self.include_plots:        
                for plot_name, directories, norm_filename in [
                      ('all',   dirs,       workspace/'norm.csv'),
                      ('polyA', polya_dirs, workspace/'norm-polyA.csv'),
                      ]:
                    nesoni.IGV_plots(
                        plotspace/plot_name,
                        working_dirs = directories,
                        label_prefix = plot_name+' ',
                        raw = True,
                        norm = True,
                        genome = reference.get_genome_filename(),
                        norm_file = norm_filename,
                        #delete_igv = False,
                        ).process_make(stage)

            analyse_gene_counts_0 = analyse_template(
                output_dir = expressionspace/'genewise',
                saturation = 0,
                extension = self.extension,
                title = 'Genewise expression - ' + self.title,
                file_prefix = file_prefix+'genewise-',
                )
            analyse_gene_counts_0.process_make(stage)
            
            analyse_gene_counts_1 = analyse_template(
                output_dir = expressionspace/'genewise-dedup',
                saturation = 1,
                title = 'Genewise expression with read deduplication - ' + self.title,
                file_prefix = file_prefix+'genewise-dedup-',
                )
            analyse_gene_counts_1.process_make(stage)
            
            stage.process(self._run_peaks, 
                workspace=workspace, expressionspace=expressionspace, reference=reference, 
                polya_dirs=polya_dirs, analyse_template=analyse_template, file_prefix=file_prefix,
                )
            
        with nesoni.Stage() as stage:
            for test in self.tests:
                test(
                    output_dir = testspace/test.output_dir,
                    analysis = self.output_dir
                    ).process_make(stage)

                test(
                    output_dir = testspace_dedup/test.output_dir,
                    analysis = self.output_dir,
                    dedup = True,
                    ).process_make(stage)
        
        #===============================================
        #                   Report        
        #===============================================

        r = reporting.Reporter(os.path.join(self.output_dir, 'report'), self.title, self.file_prefix)
                    
        r.heading('Alignment to reference')
        
        r.report_logs('alignment-statistics',
            #[ workspace/'stats.txt' ] +
            clipper_logs + filter_logs + #filter_polya_logs +
            [ expressionspace/('genewise','aggregate-tail-counts_log.txt') ],
            filter=lambda sample, field: (
                field not in [
                    
                    'fragments','fragments aligned to the reference','reads kept',
                    'average depth of coverage, ambiguous',
                    'average depth of coverage, unambiguous',
                    ]
            ),
        )


        if self.include_plots:        
            r.heading('IGV plots')
            
            r.p('These files show the depth of coverage. They can be viewed with the IGV genome browser.')
            
            genome_files = [ ]
            if self.include_genome:
                genome_files.append(reference.get_genome_filename())
                genome_dir = reference.get_genome_dir()
                base = os.path.split(genome_dir)[1]
                for filename in os.listdir(genome_dir):
                    genome_files.append((
                        os.path.join(genome_dir, filename),
                        os.path.join(base, filename)
                        ))
            
            r.p(r.tar('igv-plots',
                genome_files +
                glob.glob(plotspace/'*.tdf')
                ))
        

        if self.include_bams:
            r.heading('BAM files')
            
            r.p('These BAM files contain the alignments of reads to the reference sequences.')
            
            r.p('Reads with a poly(A) tail have an \'AN\' attribute giving the length of non-templated poly(A) sequence. '
                'Tail-tools only treats a read as having a tail if this length is at least 4.')
            
            bam_files = [ ]
            for name in names:
                bam_files.append( (samplespace/(name,'alignments_filtered_sorted.bam'),name+'.bam') )
                bam_files.append( (samplespace/(name,'alignments_filtered_sorted.bam.bai'),name+'.bam.bai') )
            r.p(r.tar('bam-files', bam_files))


        r.heading('Genewise expression')
        
        io.symbolic_link(source=expressionspace/('genewise','report'),link_name=r.workspace/'genewise')
        r.p('<a href="genewise/index.html">&rarr; Genewise expression</a>')

        io.symbolic_link(source=expressionspace/('genewise-dedup','report'),link_name=r.workspace/'genewise-dedup')
        r.p('<a href="genewise-dedup/index.html">&rarr; Genewise expression with read deduplication</a>')


        r.heading('Peakwise expression')

        web.Geneview_webapp(r.workspace/'view').run()        
        
        peak_filename = expressionspace/('peakwise','features-with-data.gff')
        n_peaks = len(list(annotation.read_annotations(peak_filename)))
        r.p('%d peaks called (%d poly(A) reads were required to call a peak).' % (n_peaks, self.peak_min_depth))
        
        r.p(r.get(peak_filename, name='peaks.gff') + ' - peaks called')        

        #if self.groups:
            #r.subheading('Peak shift between groups')
            #r.p(r.get(workspace/('peak-shift','grouped.csv')) + ' - genes with a potential peak shift')        
            #r.get(workspace/('peak-shift','grouped.json'))

        #r.subheading('Peak shift between samples')
        #r.p(r.get(workspace/('peak-shift','individual.csv')) + ' - genes with a potential peak shift')        
        #r.get(workspace/('peak-shift','individual.json'))

        
        io.symbolic_link(source=expressionspace/('peakwise','report'),link_name=r.workspace/'peakwise')
        r.p('<a href="peakwise/index.html">&rarr; Peakwise expression</a>')

        io.symbolic_link(source=expressionspace/('peakwise-dedup','report'),link_name=r.workspace/'peakwise-dedup')
        r.p('<a href="peakwise-dedup/index.html">&rarr; Peakwise expression with read deduplication</a>')
                
        if self.tests:
            r.heading('Differential tests')
            for test in self.tests:
                io.symbolic_link(source=testspace/test.output_dir,link_name=r.workspace/('test-'+test.output_dir))
                io.symbolic_link(source=testspace_dedup/test.output_dir,link_name=r.workspace/('test-dedup-'+test.output_dir))
                r.p('<a href="test-%s">&rarr; %s</a> '
                    ' &nbsp; <a href="test-dedup-%s" style="font-size: 66%%">[&rarr; Deduplicated version]</a>' % (test.output_dir, test.get_title(), test.output_dir))

        r.heading('Gene viewers')
        r.p('Having identified interesting genes from heatmaps and differential tests above, '
            'these viewers allow specific genes to be examined in detail.')
        
        if self.groups:
            r.p('<a href="view.html?json=%sgrouped.json">&rarr; Gene viewer, grouped samples</a>' % r.file_prefix)
        r.p('<a href="view.html?json=%sindividual.json">&rarr; Gene viewer, individual samples</a>' % r.file_prefix)
       

        r.write('<p/><hr>\n')
        
        r.p('Note: Use deduplicated versions with care. '
            'They may possibly provide more significant results, however they are less quantitative. '
            'Read deduplication involves throwing away a large amount of data, much of which will not be a technical artifact. '
            'Deduplicated versions might best be viewed as a check on data quality.')
        
        r.p('This set of genes was used in the analysis:')
        
        r.p(r.get(reference/'reference.gff') + ' - Reference annotations in GFF3 format')
        r.p(r.get(reference/'utr.gff') + ' - 3\' UTR regions')

        r.p('tail-tools version '+tail_tools.VERSION)
        r.p('nesoni version '+nesoni.VERSION)
        #r.p('SHRiMP version '+grace.get_shrimp_2_version())
        
        r.close()
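Throughout these examples, expressions such as workspace/'norm' and expressionspace/('genewise','report') rely on nesoni's io.Workspace overloading the division operator to build file paths. A minimal sketch of the idea, not the real class:

    import os

    class Workspace(object):
        # Sketch only: join a name (or tuple of names) onto a base directory.
        def __init__(self, base):
            self.base = base
        def __div__(self, name):                # Python 2 '/' operator
            if isinstance(name, tuple):
                return os.path.join(self.base, *name)
            return os.path.join(self.base, name)
        __truediv__ = __div__                   # Python 3 compatibility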
Example 8
    def run(self):
        work = self.get_workspace()

        data = []
        names = []
        sample_tags = []

        old = grace.status("Loading pickles")

        max_length = 1
        for i, item in enumerate(self.pickles):
            grace.status("Loading " + os.path.basename(item))
            f = io.open_possibly_compressed_file(item)
            name, tags, datum = pickle.load(f)
            f.close()
            data.append(datum)
            names.append(name)
            sample_tags.append(tags)

            try:
                max_length = max(
                    max_length,
                    max(item[0]  #tail_length
                        for feature in datum for item in feature.hits) + 1)
            except ValueError:
                pass

            if i == 0:
                annotations = datum

        grace.status(old)

        self.log.log("Maximum tail length %d\n" % max_length)

        for i in xrange(len(names)):
            n_alignments = 0
            for feature in data[i]:
                feature.total_count = len(feature.hits)
                feature.tail_counts = [0] * max_length

                n_alignments += feature.total_count

                for tail_length, adaptor_bases in feature.hits:
                    if adaptor_bases >= self.adaptor:
                        feature.tail_counts[tail_length] += 1

                del feature.hits

            self.log.datum(names[i], 'Alignments to features', n_alignments)

        counts = []  # [feature][sample](total_count, [taillength])

        for item in data:
            assert len(item) == len(data[0])
        for row in itertools.izip(*data):
            this_counts = [(item.total_count, item.tail_counts)
                           for item in row]
            counts.append(this_counts)

        n_features = len(counts)
        n_samples = len(data)

        sample_n = [[0] * n_samples for i in xrange(n_features)]          # [feature][sample]  Total count
        sample_n_tail = [[0] * n_samples for i in xrange(n_features)]     # [feature][sample]  Polya count
        sample_prop = [[None] * n_samples for i in xrange(n_features)]    # [feature][sample]  Proportion of reads with tail (deprecated)
        sample_tail = [[None] * n_samples for i in xrange(n_features)]    # [feature][sample]  Mean tail length in each sample
        sample_sd_tail = [[None] * n_samples for i in xrange(n_features)] # [feature][sample]  Std dev tail length in each sample
        sample_total_tail = [[0] * n_samples for i in xrange(n_features)]

        sample_quantile_tail = collections.OrderedDict(
            (item, [[None] * n_samples for i in xrange(n_features)])
            for item in [25, 50, 75, 100])

        overall_n = [0] * n_features        # [feature]  Overall count
        overall_prop = [None] * n_features  # [feature]  Overall proportion with tail
        overall_tail = [None] * n_features  # [feature]  Overall mean tail length
        overall_n_tail = [0] * n_features   # [feature]  Overall polya count
        for i, row in enumerate(counts):
            for j, (this_this_n, item) in enumerate(row):
                sample_n[i][j] = this_this_n
                sample_n_tail[i][j] = sum(item[self.tail:])
                sample_total_tail[i][j] = sum(
                    item[k] * k for k in xrange(self.tail, max_length))

                if sample_n[i][j] >= 1:
                    sample_prop[i][j] = float(
                        sample_n_tail[i][j]) / sample_n[i][j]

                if sample_n_tail[i][j] >= 1:
                    sample_tail[i][j] = float(
                        sample_total_tail[i][j]) / sample_n_tail[i][j]

                    for quantile in sample_quantile_tail:
                        counter = sample_n_tail[i][j] * quantile / 100.0
                        for k in xrange(self.tail, max_length):
                            counter -= item[k]
                            if counter <= 0: break
                        sample_quantile_tail[quantile][i][j] = k

                if sample_n_tail[i][j] >= 2:
                    sample_sd_tail[i][j] = math.sqrt(
                        float(
                            sum(item[k] * ((k - sample_tail[i][j])**2)
                                for k in xrange(self.tail, max_length))) /
                        (sample_n_tail[i][j] - 1))

            overall_n[i] = sum(sample_n[i])
            overall_n_tail[i] = sum(sample_n_tail[i])
            if overall_n[i] >= 1:
                overall_prop[i] = float(sum(sample_n_tail[i])) / overall_n[i]
            if overall_n_tail[i] >= 1:
                overall_tail[i] = float(sum(
                    sample_total_tail[i])) / overall_n_tail[i]

        for i, name in enumerate(names):
            this_total = sum(item[i] for item in sample_total_tail)
            this_n = sum(item[i] for item in sample_n_tail)
            if this_n:
                self.log.datum(name, 'Average poly-A tail',
                               float(this_total) / this_n)

        for i, name in enumerate(names):
            this_total = sum(item[i] for item in sample_n_tail)
            this_n = sum(item[i] for item in sample_n)
            if this_n:
                self.log.datum(name, 'Average proportion of reads with tail',
                               float(this_total) / this_n)

        with open(work / 'features-with-data.gff', 'wb') as f:
            annotation.write_gff3_header(f)
            for i, item in enumerate(annotations):
                item.attr['reads'] = str(overall_n[i])
                item.attr['reads_with_tail'] = str(overall_n_tail[i])
                item.attr['mean_tail'] = '%.1f' % overall_tail[i] if overall_tail[i] else 'NA'
                item.attr['proportion_with_tail'] = '%.2f' % overall_prop[i] if overall_prop[i] else 'NA'

                if overall_tail[i] is None:
                    item.attr['color'] = '#444444'
                else:
                    a = (overall_tail[i] - self.tail) / max(1, max_length - self.tail)
                    item.attr['color'] = '#%02x%02x%02x' % (
                        int(a * 255), int((1 - abs(a * 2 - 1)) * 255), 255 - int(a * 255))
                #item.attr['color'] = ...
                print >> f, item.as_gff()

        comments = ['#Counts'] + [
            '#sampleTags=' + ','.join(tags) for tags in sample_tags
        ] + [
            '"Tail_count" group is number of reads with tail',
            '"Tail" group is mean tail per sample',
            '"Proportion" group is proportion of reads with tail',
        ]

        have_biotype = any("Biotype" in item.attr for item in annotations)
        have_parent = any("Parent" in item.attr for item in annotations)
        have_relation = any("Relation" in item.attr for item in annotations)
        have_antisense = any("Antisense_parent" in item.attr
                             for item in annotations)

        def counts_iter():
            for i in xrange(n_features):
                row = collections.OrderedDict()
                row['Feature'] = annotations[i].get_id()
                for j in xrange(n_samples):
                    row[('Count', names[j])] = '%d' % sample_n[i][j]

                row[('Annotation',
                     'Length')] = annotations[i].end - annotations[i].start
                row[('Annotation',
                     'gene')] = annotations[i].attr.get('Name', '')
                row[('Annotation',
                     'product')] = annotations[i].attr.get('Product', '')
                if have_biotype:
                    row[('Annotation',
                         'biotype')] = annotations[i].attr.get('Biotype', '')
                if have_parent:
                    row[('Annotation',
                         'parent')] = annotations[i].attr.get('Parent', '')
                if have_relation:
                    row[('Annotation', 'relation')] = annotations[i].attr.get(
                        'Relation', '')

                if have_antisense:
                    row[('Annotation',
                         'antisense_gene')] = annotations[i].attr.get(
                             'Antisense_name', '')
                    row[('Annotation',
                         'antisense_product')] = annotations[i].attr.get(
                             'Antisense_product', '')
                    row[('Annotation',
                         'antisense_biotype')] = annotations[i].attr.get(
                             'Antisense_biotype', '')
                    row[('Annotation',
                         'antisense_parent')] = annotations[i].attr.get(
                             'Antisense_parent', '')

                row[('Annotation', 'chromosome')] = str(annotations[i].seqid)
                row[('Annotation', 'strand')] = str(annotations[i].strand)
                row[('Annotation', 'start')] = str(annotations[i].start + 1)
                row[('Annotation', 'end')] = str(annotations[i].end)

                row[('Annotation', 'reads')] = str(overall_n[i])
                row[('Annotation', 'reads-with-tail')] = str(overall_n_tail[i])
                row[('Annotation', 'mean-tail')] = str_na(overall_tail[i])
                row[('Annotation',
                     'proportion-with-tail')] = str_na(overall_prop[i])
                for j in xrange(n_samples):
                    row[('Tail_count', names[j])] = '%d' % sample_n_tail[i][j]
                for j in xrange(n_samples):
                    row[('Tail', names[j])] = str_na(sample_tail[i][j])
                for j in xrange(n_samples):
                    row[('Tail_sd', names[j])] = str_na(sample_sd_tail[i][j])

                for quantile in sample_quantile_tail:
                    for j in xrange(n_samples):
                        row[('Tail_quantile_%d' % quantile,
                             names[j])] = str_na(
                                 sample_quantile_tail[quantile][i][j])

                for j in xrange(len(names)):
                    row[('Proportion', names[j])] = str_na(sample_prop[i][j])
                yield row

        io.write_csv(work / 'counts.csv', counts_iter(), comments=comments)

        def write_csv_matrix(filename, matrix):
            def emitter():
                for i in xrange(n_features):
                    row = collections.OrderedDict()
                    row["Feature"] = annotations[i].get_id()
                    for j in xrange(n_samples):
                        row[names[j]] = str_na(matrix[i][j])
                    yield row

            io.write_csv(filename, emitter())

        write_csv_matrix(work / 'read_count.csv', sample_n)
        write_csv_matrix(work / 'tail_count.csv', sample_n_tail)
        write_csv_matrix(work / 'tail.csv', sample_tail)
        write_csv_matrix(work / 'tail_sd.csv', sample_sd_tail)
        for quantile in sample_quantile_tail:
            write_csv_matrix(work / ('tail_quantile_%d.csv' % quantile),
                             sample_quantile_tail[quantile])

        #def raw_columns():
        #    for i in xrange(n_samples):
        #        row = collections.OrderedDict()
        #        row['Sample'] = names[i]
        #        for j in xrange(max_length):
        #            row['length-%d' % j] = str(i*max_length+j+1) #For R+, so 1 based
        #        yield row
        #io.write_csv(work/'raw-columns.csv', raw_columns())
        #
        ##Somewhat inefficient
        #def raw():
        #    for i in xrange(n_features):
        #        row = collections.OrderedDict()
        #        row['Feature'] = annotations[i].get_id()
        #        for j in xrange(n_samples):
        #            for k in xrange(max_length):
        #                row['%d %s' % (k,names[j])] = str( counts[i][j][1][k] )
        #        yield row
        #io.write_csv(work/'raw.csv', raw())

        def pooled():
            for i in xrange(n_features):
                row = collections.OrderedDict()
                row['Feature'] = annotations[i].get_id()
                for j in xrange(max_length):
                    row[str(j)] = str(
                        sum(counts[i][k][1][j] for k in xrange(n_samples)))
                yield row

        io.write_csv(work / 'pooled.csv', pooled())
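Example 8's sample_sd_tail is the per-sample standard deviation of tail length, computed with Bessel's correction (dividing by n - 1 rather than n). Writing t = self.tail, L = max_length, c_k for the count of reads with tail length k, n for the number of tailed reads and m for their mean tail length, the loop evaluates:

    sd = sqrt( sum(c_k * (k - m)^2 for k in t..L-1) / (n - 1) )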
Example 9
    def run(self):
        work = self.get_workspace()
        
        data = [ ]
        names = [ ]
        sample_tags = [ ]
        
        for item in self.pickles:
            f = io.open_possibly_compressed_file(item)
            name, tags, datum = pickle.load(f)
            f.close()
            data.append(datum)
            names.append(name)
            sample_tags.append(tags)
        
        annotations = data[0]
        
        all_lengths = [ 
            #tail_length
            item[2]
            for sample in data
            for feature in sample
            #for rel_start,rel_end,tail_length in feature.hits
            for item in feature.hits
            ]
        if all_lengths: 
            max_length = max(all_lengths)+1
        else:
            max_length = 1
        del all_lengths
        
        for i, sample in enumerate(data):
            n_alignments = 0
            n_duplicates = 0
            n_good = 0
            for feature in sample:
                feature.tail_counts = [ 0.0 ] * max_length
                
                buckets = collections.defaultdict(list)
                for item in feature.hits:
                    rel_start,rel_end,tail_length = item[:3]
                    buckets[ (rel_start,rel_end) ].append(tail_length)
                for item in buckets.values():
                    n_alignments += len(item)
                    n_good += 1
                    if self.saturation < 1 or len(item) <= self.saturation:
                        weight = 1.0
                    else:
                        weight = float(self.saturation) / len(item)
                        n_duplicates += len(item)
                    for item2 in item:
                        feature.tail_counts[item2] += weight                

            self.log.datum(names[i], 'Alignments to features', n_alignments)
            if self.saturation >= 1:
                self.log.datum(names[i], 'Proportion of alignments with duplicate start and end position', float(n_duplicates)/max(1,n_alignments))
                self.log.datum(names[i], 'Alignments to features after deduplication', n_good)
                
        
        counts = [ ]  # [feature][sample][taillength]
        
        for item in data: 
            assert len(item) == len(data[0])
        for row in itertools.izip(*data):
            this_counts = [ item.tail_counts for item in row ]
            counts.append(this_counts)
        
        sample_n = [ ]        # [feature][sample]  Total count
        sample_n_tail = [ ]   # [feature][sample]  Polya count
        sample_prop = [ ]     # [feature][sample]  Proportion of reads with tail
        sample_tail = [ ]     # [feature][sample]  Mean tail length in each sample
        sample_total_tail = [ ]
        overall_n = [ ]
        overall_prop = [ ]    # [feature]          Overall proportion with tail
        overall_tail = [ ]    # [feature]          Overall mean tail length
        overall_n_tail = [ ]  # [feature]          Overall polya count
        overall_total_tail = [ ]
        for row in counts:
            this_n = [ ]
            this_n_tail = [ ]
            this_prop = [ ]
            this_tail = [ ]
            this_total_tail = [ ]
            for item in row:
                this_this_n = sum(item)
                this_n.append( this_this_n )

                this_this_n_tail = sum(item[self.tail:])
                this_n_tail.append( this_this_n_tail )

                this_this_total_tail = sum( item[i]*i for i in xrange(self.tail,max_length) )
                this_total_tail.append( this_this_total_tail )

                if this_this_n < 1:
                    this_prop.append(None)
                else:
                    this_prop.append(float(this_this_n_tail)/this_this_n)
                if this_this_n_tail < 1:
                    this_tail.append(None)
                else:
                    this_tail.append(this_this_total_tail/this_this_n_tail)

            sample_n.append(this_n)
            sample_n_tail.append(this_n_tail)
            sample_prop.append(this_prop)
            sample_tail.append(this_tail)
            sample_total_tail.append(this_total_tail)
            overall_n.append(sum(this_n))
            overall_n_tail.append(sum(this_n_tail))
            overall_total_tail.append(sum(this_total_tail))
            if sum(this_n) < 1:
                overall_prop.append(None)
            else:
                overall_prop.append(float(sum(this_n_tail))/sum(this_n))
            if sum(this_n_tail) < 1:
                overall_tail.append(None)
            else:
                overall_tail.append(float(sum(this_total_tail))/sum(this_n_tail))
             
        for i, name in enumerate(names):
            this_total = sum( item[i] for item in sample_total_tail )
            this_n = sum( item[i] for item in sample_n_tail )
            if this_n:
                self.log.datum(name, 'Average poly-A tail', float(this_total)/this_n)
                
        for i, name in enumerate(names):
            this_total = sum( item[i] for item in sample_n_tail )
            this_n = sum( item[i] for item in sample_n )
            if this_n:
                self.log.datum(name, 'Average proportion of reads with tail', float(this_total)/this_n)
            
        
        #max_length = max(max(len(item) for item in row) for row in counts)
        #
        #for row in counts:
        #    for item in row:
        #        while len(item) < max_length:
        #            item.append(0)
                
        
        with open(work/'features-with-data.gff','wb') as f:
            annotation.write_gff3_header(f)
            for i, item in enumerate(annotations):
                item.attr['reads'] = str(overall_n[i])
                item.attr['reads_with_tail'] = str(overall_n_tail[i])
                item.attr['mean_tail'] = '%.1f'%overall_tail[i] if overall_tail[i] else 'NA'
                item.attr['proportion_with_tail'] = '%.2f'%overall_prop[i] if overall_prop[i] else 'NA'
                
                if overall_tail[i] is None:
                    item.attr['color'] = '#444444'
                else:
                    a = (overall_tail[i]-self.tail)/max(1,max_length-self.tail)
                    item.attr['color'] = '#%02x%02x%02x' % (int(a*255),int((1-abs(a*2-1))*255),255-int(a*255))
                #item.attr['color'] = ...                
                print >> f, item.as_gff()
        
        
        comments = [ '#Counts' ] + [
            '#sampleTags='+','.join(tags)
            for tags in sample_tags
            ] + [
            '"Tail_count" group is number of reads with tail',
            '"Tail" group is mean tail per sample',
            '"Proportion" group is proportion of reads with tail',
            ]

        def counts_iter():
            for i in xrange(len(counts)):
                row = collections.OrderedDict()
                row['Feature'] = annotations[i].get_id()
                for j in xrange(len(names)):
                    row[('Count',names[j])] = '%d' % sample_n[i][j]

                row[('Annotation','Length')] = annotations[i].end - annotations[i].start
                row[('Annotation','gene')] = annotations[i].attr.get('Name','')
                row[('Annotation','product')] = annotations[i].attr.get('Product','')
                #row[('Annotation','Strand')] = str(annotations[i].strand)
                row[('Annotation','reads')] = str(overall_n[i])
                row[('Annotation','reads-with-tail')] = str(overall_n_tail[i])
                row[('Annotation','mean-tail')] = str(overall_tail[i]) if overall_tail[i] is not None else 'NA'
                row[('Annotation','proportion-with-tail')] = str(overall_prop[i]) if overall_prop[i] is not None else 'NA'
                for j in xrange(len(names)):
                    row[('Tail_count',names[j])] = '%d' % sample_n_tail[i][j]
                for j in xrange(len(names)):
                    row[('Tail',names[j])] = str(sample_tail[i][j]) if sample_tail[i][j] is not None else 'NA'
                for j in xrange(len(names)):
                    row[('Proportion',names[j])] = str(sample_prop[i][j]) if sample_prop[i][j] is not None else 'NA'
                yield row
        io.write_csv(work/'counts.csv', counts_iter(), comments=comments)

        def raw_columns():
            for i in xrange(len(names)):
                row = collections.OrderedDict()
                row['Sample'] = names[i]
                for j in xrange(max_length):
                    row['length-%d' % j] = str(i*max_length+j+1) #For R+, so 1 based
                yield row
        io.write_csv(work/'raw-columns.csv', raw_columns())

        #Somewhat inefficient        
        def raw():
            for i in xrange(len(counts)):
                row = collections.OrderedDict()
                row['Feature'] = annotations[i].get_id()
                for j in xrange(len(names)):
                    for k in xrange(max_length):
                        row['%d %s' % (k,names[j])] = str( counts[i][j][k] )
                yield row
        io.write_csv(work/'raw.csv', raw())
        
        def pooled():
            for i in xrange(len(counts)):
                row = collections.OrderedDict()
                row['Feature'] = annotations[i].get_id()
                for j in xrange(max_length):
                    row[str(j)] = str( sum( counts[i][k][j] for k in xrange(len(names)) ) )
                yield row
        io.write_csv(work/'pooled.csv', pooled())
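In each example, pooled() collapses the per-sample tail-length histograms into a single histogram per feature. A tiny worked illustration with made-up numbers:

    # Hypothetical data: one feature, two samples, max_length == 2.
    counts = [[[2, 1], [0, 3]]]   # [feature][sample][tail_length]
    pooled_row = dict(
        (str(j), sum(counts[0][k][j] for k in range(2)))  # sum over samples
        for j in range(2))
    assert pooled_row == {'0': 2, '1': 4}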