def _describe_peaks(self, r):
    workspace = io.Workspace(self.output_dir, must_exist=False)

    counts = io.read_grouped_table(workspace/("expression","peakwise","counts.csv"))["Count"]

    # Tally peaks, and the reads they contain, by their "Relation" annotation.
    peak_counts = collections.defaultdict(int)
    read_counts = collections.defaultdict(int)
    total = 0
    for item in annotation.read_annotations(workspace/("peaks","relation-child.gff")):
        relation = item.attr.get("Relation","None")
        peak_counts[relation] += 1
        read_counts[relation] += sum(int(c) for c in counts[item.get_id()].values())
        total += 1

    total_reads = sum(read_counts.values())

    r.write("<p>\n")
    r.write("%d peaks\n" % total)

    for name, desc in [
            ("3'UTR", "in a 3' UTR"),
            ("Exon", "otherwise in an exon"),
            ("Downstrand", "otherwise downstrand of a non-coding RNA"),
            ("Intron", "otherwise in an intron"),
            ("Antisense", "otherwise antisense to a gene"),
            ("None", "couldn't be related to annotated genes"),
            ]:
        r.write("<br/>%d peaks and %.1f%% of reads %s\n" % (peak_counts[name], read_counts[name]*100.0/total_reads, desc))
    r.write("</p>\n")
Example #3
    def run(self):
        assert self.filenames, 'No files given to merge.'

        tables = []
        for filename in self.filenames:
            tables.append(
                io.read_grouped_table(
                    filename,
                    [('Count', str), ('Annotation', str), ('Alignment', str)],
                    'Count',
                ))

        result = io.Grouped_table()
        result.comments = ['#Counts']
        for table in tables:
            for comment in table.comments:
                if comment != '#Counts':
                    result.comments.append(comment)

        result['Count'] = matrix_merge([table['Count'] for table in tables])
        result['Annotation'] = matrix_merge(
            [table['Annotation'] for table in tables])
        result['Alignment'] = matrix_merge(
            [table['Alignment'] for table in tables], merge_counts)
        result.write_csv(self.prefix + '.csv')
    def run(self):
        working_dirs = [ ] 
        peaks_file = self.peaks_file       
        for item in self.working_dirs:
            state_filename = os.path.join(item,'analyse-polya-batch.state')
            if not os.path.exists(state_filename):
                working_dirs.append(item)
            else:
                with open(state_filename,'rb') as f:
                    state = pickle.load(f)

                for sample in state.samples:
                    working_dirs.append(os.path.join(item,'samples',sample.output_dir))
                
                if not peaks_file:
                    peaks_file = os.path.join(self.pipeline_dir, "peaks", "relation-child.gff")

        
        sample_names = [ os.path.split(dirname)[1] for dirname in working_dirs ]
        workspaces = [ working_directory.Working(dirname, must_exist=True) for dirname in working_dirs ]
        
        workspace = self.get_workspace()
        
        with open(workspace/"index.html","wb") as f:
            web.emit(f, "igv.html", dict(
                SAMPLES = json.dumps(sample_names),
                HAVE_NORM = json.dumps(bool(self.norm_file)),
                TITLE = self.title,
            ))
        
        bams = [ item/"alignments_filtered_sorted.bam" for item in workspaces ]
        
        for i in xrange(len(sample_names)):
            io.symbolic_link(bams[i], workspace/(sample_names[i]+".bam"))
            io.symbolic_link(bams[i]+".bai", workspace/(sample_names[i]+".bam.bai"))
        
        io.symbolic_link(peaks_file, workspace/"peaks.gff")

                
        if self.norm_file:
            mults = io.read_grouped_table(self.norm_file)['All']
            norm_mult = [ float(mults[name]['Normalizing.multiplier']) for name in sample_names ]
        
        with nesoni.Stage() as stage:
            Bam_to_bigwig(workspace/"total", bam_files=bams, what="ambiguity,span,3p,polyaspan,polya3p",
                ).process_make(stage)
            
            for i in xrange(len(sample_names)):
                for scale_desc, scale in \
                        [("raw",1.0)] + \
                        ([("norm",norm_mult[i])] if self.norm_file else []):
                    Bam_to_bigwig(
                        workspace/(sample_names[i]+"-"+scale_desc), 
                        bam_files=[bams[i]], 
                        what='span,3p,polyaspan,polya3p', scale=scale
                        ).process_make(stage)
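A porting note in sketch form: these snippets are Python 2 (xrange, iteritems, indexing into dict.keys()). Under Python 3 the symlink loop above could be written with enumerate instead, with the same behaviour, assuming the same io helpers:

for i, name in enumerate(sample_names):
    io.symbolic_link(bams[i], workspace/(name+".bam"))
    io.symbolic_link(bams[i]+".bai", workspace/(name+".bam.bai"))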
def _extract_raw(self):
    work = io.Workspace(self.output_dir, must_exist=False)
    raw = io.Workspace(work/'raw', must_exist=False)

    # For each level of counting, produce a VST expression file plus
    # plain per-group CSV files extracted from the grouped table.
    for name, counts, norms in [
            ('genewise',
                work/('expression','genewise','counts.csv'),
                work/('expression','genewise','norm.csv'),
                ),
            ('primarypeakwise',
                work/('expression','primarypeakwise','counts.csv'),
                work/('expression','primarypeakwise','norm.csv'),
                ),
            ('peakwise',
                work/('expression','peakwise','counts.csv'),
                work/('expression','peakwise','norm.csv'),
                ),
            ('pairwise',
                work/('peak-shift','individual-pairs.csv'),
                work/('peak-shift','individual-pairs-norm.csv'),
                ),
            ]:
        nesoni.Vst(
            raw/(name+'-mlog2-RPM'),
            counts,
            norm_file = norms
            ).make()

        counts_table = io.read_grouped_table(counts)
        io.write_csv_2(raw/(name+'-info.csv'), counts_table['Annotation'])
        io.write_csv_2(raw/(name+'-count.csv'), counts_table['Count'])
        io.write_csv_2(raw/(name+'-tail.csv'), counts_table['Tail'])
        io.write_csv_2(raw/(name+'-tail-count.csv'), counts_table['Tail_count'])
        io.write_csv_2(raw/(name+'-proportion.csv'), counts_table['Proportion'])

        norm_table = io.read_grouped_table(norms)
        io.write_csv_2(raw/(name+'-norm.csv'), norm_table['All'])
Example #8
def load_norm_mult(self):
    #mults = { }
    #for record in io.read_table(self.norm_file):
    #    mults[record['Sample']] = float(record['Normalizing.multiplier'])
    mults = io.read_grouped_table(self.norm_file)['All']
    self.norm_mult = [ float(mults[name]['Normalizing.multiplier']) for name in self.sample_names ]
Example #9
    def run(self):
        #assert not self.utr_only or self.utrs, '--utrs-only yes but no --utrs given'
        
        # Reference genome
        
        #chromosome_lengths = reference_directory.Reference(self.reference, must_exist=True).get_lengths()
        chromosomes = collections.OrderedDict(io.read_sequences(self.reference))

        def get_interpeak_seq(peaks):
            start = min(item.transcription_stop for item in peaks)
            end = max(item.transcription_stop for item in peaks)
            if end-start > self.max_seq: return ''
            if peaks[0].strand >= 0:
                return chromosomes[peaks[0].seqid][start:end]
            else:
                return bio.reverse_complement(chromosomes[peaks[0].seqid][start:end])

        def get_prepeak_seq(gene,peaks):
            if gene.strand >= 0:
                start = gene.utr_pos
                end = min(item.transcription_stop for item in peaks)
                if end-start > self.max_seq: return ''
                return chromosomes[gene.seqid][start:end]
            else:
                start = max(item.transcription_stop for item in peaks)
                end = gene.utr_pos
                if end-start > self.max_seq: return ''
                return bio.reverse_complement(chromosomes[gene.seqid][start:end])
        
        # Normalization files
        
        if self.norm_file:
            norm_file = self.norm_file
        else:
            nesoni.Norm_from_counts(self.prefix+'-norm', self.counts).run()
            norm_file = self.prefix+'-norm.csv'

        norms = io.read_grouped_table(norm_file, [('All',str)])['All']
        pair_norm_names = [ ]
        pair_norms = [ ]
        for i in xrange(len(norms)):
            pair_norm_names.append(norms.keys()[i]+'-peak1')
            pair_norms.append(norms.values()[i])
        for i in xrange(len(norms)):
            pair_norm_names.append(norms.keys()[i]+'-peak2')
            pair_norms.append(norms.values()[i])
        io.write_grouped_csv(
            self.prefix+'-pairs-norm.csv',
            [('All',io.named_list_type(pair_norm_names)(pair_norms))],
            comments=['#Normalization'],
            )


        # Read data
        
        annotations = list(annotation.read_annotations(self.parents))
        if self.utrs:
            utrs = list(annotation.read_annotations(self.utrs))
        else:
            utrs = [ ]
        children = list(annotation.read_annotations(self.children))
        
        count_table = io.read_grouped_table(self.counts, [
            ('Count',int),
            ('Tail_count',int),
            ('Tail',_float_or_none),
            ('Proportion',_float_or_none),
            ('Annotation',str)
            ])
        counts = count_table['Count']
        tail_counts = count_table['Tail_count']
        proportions = count_table['Proportion']
        tails = count_table['Tail']
        
        samples = counts.value_type().keys()
        sample_tags = { }
        for line in count_table.comments:
            if line.startswith('#sampleTags='):
                parts = line[len('#sampleTags='):].split(',')
                assert parts[0] not in sample_tags
                sample_tags[parts[0]] = parts
        
        for item in children:
            item.weight = sum( counts[item.get_id()][name] * float(norms[name]['Normalizing.multiplier']) for name in samples )
        
        parents = [ ]
        id_to_parent = { }
        for item in annotations:
            if item.type != self.parent_type: continue
            assert item.get_id() not in id_to_parent, 'Duplicate id in parent file: '+item.get_id()
            parents.append(item)
            id_to_parent[item.get_id()] = item
            item.children = [ ]
            #item.cds = [ ]
            
            # Default utr
            if item.strand >= 0:
               item.utr_pos = item.end
            else:
               item.utr_pos = item.start
            
            if 'three_prime_UTR_start' in item.attr:
               if item.strand >= 0:
                  item.utr_pos = int(item.attr['three_prime_UTR_start'])-1
               else:
                  item.utr_pos = int(item.attr['three_prime_UTR_start'])
            
            
        
        for item in utrs:
            assert item.attr['Parent'] in id_to_parent, 'Unknown gene '+item.attr['Parent']    
            id_to_parent[item.attr['Parent']].utr_pos = (item.start if item.strand >= 0 else item.end)


        for item in children:
            item.transcription_stop = item.end if item.strand >= 0 else item.start #End of transcription, 0-based, ie between-positions based
            
            if 'Parent' in item.attr:
                for item_parent in item.attr['Parent'].split(','):
                    parent = id_to_parent[item_parent]
                    parent.children.append(item)
                    

        for item in parents:
            item.children.sort(key=_annotation_sorter)
            
            relevant = list(item.children)
            if self.utr_only:
                #if item.strand <= 0:
                #    relative_utr_start = item.end - int(item.attr['three_prime_UTR_start'])
                #else:
                #    relative_utr_start = int(item.attr['three_prime_UTR_start'])-1 - item.start
                #
                #def relative_start(peak):
                #    return item.end-peak.end if item.strand < 0 else peak.start-item.start
                #relevant = [ peak for peak in relevant if relative_start(peak) >= relative_utr_start ]
                relevant = [ 
                    peak for peak in relevant 
                    if (peak.end >= item.utr_pos if item.strand >= 0 else peak.start <= item.utr_pos)
                    ]
                    
            if self.top:
                relevant.sort(key=lambda peak:peak.weight, reverse=True)
                relevant = relevant[:self.top]
            relevant.sort(key=_annotation_sorter)
            item.relevant_children = relevant
        
        
        
        # JSON output
        
        j_data = { }
        j_genes = j_data['genes'] = { }
        
        j_genes['__comment__'] = 'start is 0-based'
        j_genes['name'] = [ ]
        j_genes['chromosome'] = [ ]
        j_genes['strand'] = [ ]
        j_genes['start'] = [ ]
        j_genes['utr'] = [ ]
        j_genes['end'] = [ ]
        j_genes['gene'] = [ ]
        j_genes['product'] = [ ]
        j_genes['peaks'] = [ ]
        j_genes['relevant_peaks'] = [ ]
        #j_genes['cds'] = [ ]
        #j_genes['cds_start'] = [ ]
        #j_genes['cds_end'] = [ ]
        for item in parents:
            j_genes['name'].append( item.get_id() )
            j_genes['chromosome'].append( item.seqid )
            j_genes['strand'].append( item.strand )
            j_genes['start'].append( item.start )
            j_genes['utr'].append( item.utr_pos )
            j_genes['end'].append( item.end )
            j_genes['gene'].append( item.attr.get('Name',item.attr.get('gene','')) )
            j_genes['product'].append( item.attr.get('Product',item.attr.get('product','')) )
            j_genes['peaks'].append( [ item2.get_id() for item2 in item.children ] )
            j_genes['relevant_peaks'].append( [ item2.get_id() for item2 in item.relevant_children ] )
            #j_genes['cds'].append( item.cds )
            #j_genes['cds_start'].append( item.cds_start )
            #j_genes['cds_end'].append( item.cds_end )
        
        j_peaks = j_data['peaks'] = { }
        j_peaks['__comment__'] = 'start is 0-based'
        j_peaks['name'] = [ ]
        j_peaks['chromosome'] = [ ]
        j_peaks['strand'] = [ ]
        j_peaks['start'] = [ ]
        j_peaks['end'] = [ ]
        j_peaks['parents'] = [ ]
        j_peaks['counts'] = [ ]
        j_peaks['tail_lengths'] = [ ]
        j_peaks['proportion_tailed'] = [ ]
        for item in children:
            j_peaks['name'].append( item.get_id() )
            j_peaks['chromosome'].append( item.seqid )
            j_peaks['strand'].append( item.strand )
            j_peaks['start'].append( item.start )
            j_peaks['end'].append( item.end )
            j_peaks['parents'].append( item.attr['Parent'].split(',') if 'Parent' in item.attr else [ ])
            j_peaks['counts'].append( counts[item.get_id()].values() )
            j_peaks['tail_lengths'].append( count_table['Tail'][item.get_id()].values() )
            j_peaks['proportion_tailed'].append( count_table['Proportion'][item.get_id()].values() )
        
        j_samples = j_data['samples'] = { }
        j_samples['name'] = [ ]
        j_samples['tags'] = [ ]
        j_samples['normalizing_multiplier'] = [ ]
        for name in samples:
            j_samples['name'].append(name)
            j_samples['tags'].append(sample_tags[name])
            j_samples['normalizing_multiplier'].append(float(norms[name]['Normalizing.multiplier']))
        
        j_chromosomes = j_data['chromosomes'] = { }
        j_chromosomes['name'] = [ ]
        j_chromosomes['length'] = [ ]
        for name, seq in chromosomes.iteritems():
            j_chromosomes['name'].append(name)
            j_chromosomes['length'].append(len(seq))        
        
        with open(self.prefix + '.json','wb') as f:
            json.dump(j_data, f)
        
        
        # Output paired peak file
        
        output_comments = [ '#Counts' ]
        output_samples = [ ]
        for item in samples:
            output_samples.append(item+'-peak1')
            output_comments.append('#sampleTags=' + ','.join([item+'-peak1','peak1']+sample_tags.get(item,[])))
        for item in samples:
            output_samples.append(item+'-peak2')
            output_comments.append('#sampleTags=' + ','.join([item+'-peak2','peak2']+sample_tags.get(item,[])))
        
        output_names = [ ]
        output_counts = [ ]
        output_tail_counts = [ ]
        output_proportions = [ ]
        output_tails = [ ]
        output_annotation_fields = [ 'gene', 'product', 'mean_tail_1', 'mean_tail_2', 'chromosome', 'strand', 
                                     'transcription_stops' ] #, 'interpeak_seq', ]
        output_annotations = [ ]
            
        for item in parents:
            peaks = item.relevant_children
            for i in xrange(len(peaks)-1):
                for j in xrange(i+1, len(peaks)):
                    id_i = peaks[i].get_id()
                    id_j = peaks[j].get_id()
                    id_pair = item.get_id() + '-'+id_i+'-'+id_j
                    output_names.append(id_pair)
                    
                    row = [ ]
                    row.extend(counts[id_i].values())
                    row.extend(counts[id_j].values())
                    output_counts.append(filter(_text,row))
                    
                    row = [ ]
                    row.extend(tail_counts[id_i].values())
                    row.extend(tail_counts[id_j].values())
                    output_tail_counts.append(filter(_text,row))

                    row = [ ]
                    row.extend(proportions[id_i].values())
                    row.extend(proportions[id_j].values())
                    output_proportions.append(filter(_text,row))

                    row = [ ]
                    row.extend(tails[id_i].values())
                    row.extend(tails[id_j].values())
                    output_tails.append(filter(_text,row))
                    
                    output_annotations.append([
                        item.attr.get('Name',item.attr.get('gene','')),
                        item.attr.get('Product',item.attr.get('product','')),
                        count_table['Annotation'][id_i]['mean-tail'],
                        count_table['Annotation'][id_j]['mean-tail'],
                        
                        item.seqid,
                        str(item.strand),
                        '%d, %d' % (peaks[i].transcription_stop,peaks[j].transcription_stop),
                        #get_interpeak_seq([peaks[i],peaks[j]]),
                        ])
        
        #output_count_table = io.named_matrix_type(output_names,output_samples)(output_counts)
        io.write_grouped_csv(
            self.prefix + '-pairs.csv',
            [ 
                ('Count',io.named_matrix_type(output_names,output_samples)(output_counts)),
                ('Tail_count',io.named_matrix_type(output_names,output_samples)(output_tail_counts)),
                ('Proportion',io.named_matrix_type(output_names,output_samples)(output_proportions)),
                ('Tail',io.named_matrix_type(output_names,output_samples)(output_tails)),
                ('Annotation',io.named_matrix_type(output_names,output_annotation_fields)(output_annotations)),
                ],
            comments=output_comments,
            )
                        
#        # Chi Sq tests
#        
#        #for id in relation:
#        #    peaks = relation[id]
#        #    if len(peaks) < 2: continue     
#        
#        mats = [ ]   
#        genes = [ ]
#        products = [ ]
#        mean_tails = [ ]
#        prop_tails = [ ]
#        
#        peak_names = [ ]
#        chromosome_names = [ ]
#        strands = [ ]
#        transcription_stops = [ ]
#        interpeak_seqs = [ ]
#        prepeak_seqs = [ ]
#        
#        for parent in parents:
#            id = parent.get_id()
#            peaks = parent.relevant_children
#            if len(peaks) < 2: continue
#            
#            matrix = [ ]
#            for item in peaks:
#                matrix.append(counts[item.get_id()].values())
#            
#            mats.append(
#                runr.R_literal(id) + ' = ' + 
#                runr.R_literal(matrix)
#                )
#            
#            genes.append(parent.attr.get('Name',parent.attr.get('gene','')))
#            products.append(parent.attr.get('Product',parent.attr.get('product','')))
#            
#            def format_mean(s):
#                if s == 'NA': return 'NA'
#                return '%.1f' % float(s)
#            mean_tails.append(', '.join( format_mean(count_table['Annotation'][item.get_id()]['mean-tail']) for item in peaks ))
#            
#            def format_prop(s):
#                if s == 'NA': return 'NA'
#                return '%.2f' % float(s)
#            prop_tails.append(', '.join( format_prop(count_table['Annotation'][item.get_id()]['proportion-with-tail']) for item in peaks ))
#            
#            peak_names.append(', '.join(item.get_id() for item in peaks))
#            chromosome_names.append(parent.seqid)
#            strands.append(parent.strand)
#            transcription_stops.append(', '.join(str(item.transcription_stop) for item in peaks))
#            interpeak_seqs.append(get_interpeak_seq(peaks))
#            prepeak_seqs.append(get_prepeak_seq(parent,peaks))
#            
#            #if len(mats) >= 10: break
#        
#        text = 'cat("Loading data into R+\n")\n'
#        text += 'data <- list(\n' + ',\n'.join(mats) + ')\n'        
#        text += CHISQ
#        
#        runr.run_script(text,
#            OUTPUT_FILENAME=self.prefix+'.csv',
#            GENES = genes,
#            PRODUCTS = products,
#            MEAN_TAILS = mean_tails,
#            PROP_TAILS = prop_tails,
#            PEAK_NAMES = peak_names,
#            CHROMOSOME_NAMES = chromosome_names,
#            STRANDS = strands,
#            TRANSCRIPTION_STOPS = transcription_stops,
#            INTERPEAK_SEQS = interpeak_seqs,
#            PREPEAK_SEQS = prepeak_seqs,
#            )
#
Example #12
 def peak_counts(self):
     return io.read_grouped_table(
         join(self.dirname, 'expression', 'peakwise', 'counts.csv'),
         [('Count', int), ('Tail_count', int), ('Tail', float_or_none),
          ('Proportion', float_or_none)],
     )
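float_or_none (and the _float_or_none used in other examples here) is a converter defined elsewhere in this codebase and not shown in the listing. A minimal version consistent with the 'NA' handling visible in the other examples would be:

def float_or_none(text):
    # Grouped tables store missing values as 'NA'; map those to None.
    if text == 'NA':
        return None
    return float(text)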
    def run(self):
        data = io.read_grouped_table(
            self.counts,
            [("Count", str), ("Annotation", str), ("Tail_count", str), ("Tail", str), ("Proportion", str)],
            "Count",
        )

        features = data["Count"].keys()
        samples = data["Count"].value_type().keys()

        tags = {}
        for sample in samples:
            tags[sample] = [sample]
        for line in data.comments:
            if line.startswith("#sampleTags="):
                parts = line[len("#sampleTags=") :].split(",")
                tags[parts[0]] = parts

        group_names = []
        groups = []
        group_tags = []

        for item in self.groups:
            select = selection.term_specification(item)
            name = selection.term_name(item)
            group = [item for item in samples if selection.matches(select, tags[item])]
            assert group, "Empty group: " + name

            this_group_tags = [name]
            for tag in tags[group[0]]:
                if tag == name:
                    continue
                for item in group[1:]:
                    for item2 in tags[item]:
                        if tag not in item2:
                            break
                    else:
                        this_group_tags.append(tag)

            group_names.append(name)
            groups.append(group)
            group_tags.append(this_group_tags)

        result = io.Grouped_table()
        result.comments = ["#Counts"]
        for item in group_tags:
            result.comments.append("#sampleTags=" + ",".join(item))

        count = []
        tail_count = []
        tail = []
        proportion = []
        for feature in features:
            this_count = []
            this_tail_count = []
            this_tail = []
            this_proportion = []
            for group in groups:
                this_this_count = []
                this_this_tail_count = []
                this_this_tail = []
                this_this_proportion = []
                for sample in group:
                    this_this_count.append(int(data["Count"][feature][sample]))
                    this_this_tail_count.append(int(data["Tail_count"][feature][sample]))
                    item = data["Tail"][feature][sample]
                    if item != "NA":
                        this_this_tail.append(float(item))
                    item = data["Proportion"][feature][sample]
                    if item != "NA":
                        this_this_proportion.append(float(item))

                this_count.append(str(sum(this_this_count)))
                this_tail_count.append(str(sum(this_this_tail_count)))
                this_tail.append(str(sum(this_this_tail) / len(this_this_tail)) if this_this_tail else "NA")
                this_proportion.append(
                    str(sum(this_this_proportion) / len(this_this_proportion)) if this_this_proportion else "NA"
                )

            count.append(this_count)
            tail_count.append(this_tail_count)
            tail.append(this_tail)
            proportion.append(this_proportion)

        matrix = io.named_matrix_type(features, group_names)
        result["Count"] = matrix(count)
        result["Annotation"] = data["Annotation"]
        result["Tail_count"] = matrix(tail_count)
        result["Tail"] = matrix(tail)
        result["Proportion"] = matrix(proportion)
        result.write_csv(self.prefix + ".csv")
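The '#sampleTags=' comments parsed in this example put the sample name first, followed by its tags. A self-contained illustration of that parse (sample names and tags invented for the example):

comments = [
    '#Counts',
    '#sampleTags=wt_rep1,wt,rep1',
    '#sampleTags=wt_rep2,wt,rep2',
    ]

tags = {}
for line in comments:
    if line.startswith('#sampleTags='):
        parts = line[len('#sampleTags='):].split(',')
        tags[parts[0]] = parts          # first field is the sample name

print(tags['wt_rep1'])                  # ['wt_rep1', 'wt', 'rep1']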
    def run(self):
        #assert not self.utr_only or self.utrs, '--utrs-only yes but no --utrs given'

        # Reference genome

        #chromosome_lengths = reference_directory.Reference(self.reference, must_exist=True).get_lengths()
        chromosomes = collections.OrderedDict(io.read_sequences(
            self.reference))

        def get_interpeak_seq(peaks):
            start = min(item.transcription_stop for item in peaks)
            end = max(item.transcription_stop for item in peaks)
            if end - start > self.max_seq: return ''
            if peaks[0].strand >= 0:
                return chromosomes[peaks[0].seqid][start:end]
            else:
                return bio.reverse_complement(
                    chromosomes[peaks[0].seqid][start:end])

        def get_prepeak_seq(gene, peaks):
            if gene.strand >= 0:
                start = gene.utr_pos
                end = min(item.transcription_stop for item in peaks)
                if end - start > self.max_seq: return ''
                return chromosomes[gene.seqid][start:end]
            else:
                start = max(item.transcription_stop for item in peaks)
                end = gene.utr_pos
                if end - start > self.max_seq: return ''
                return bio.reverse_complement(
                    chromosomes[gene.seqid][start:end])

        # Normalization files

        if self.norm_file:
            norm_file = self.norm_file
        else:
            nesoni.Norm_from_counts(self.prefix + '-norm', self.counts).run()
            norm_file = self.prefix + '-norm.csv'

        norms = io.read_grouped_table(norm_file, [('All', str)])['All']
        pair_norm_names = []
        pair_norms = []
        for i in xrange(len(norms)):
            pair_norm_names.append(norms.keys()[i] + '-peak1')
            pair_norms.append(norms.values()[i])
        for i in xrange(len(norms)):
            pair_norm_names.append(norms.keys()[i] + '-peak2')
            pair_norms.append(norms.values()[i])
        io.write_grouped_csv(
            self.prefix + '-pairs-norm.csv',
            [('All', io.named_list_type(pair_norm_names)(pair_norms))],
            comments=['#Normalization'],
        )

        # Read data

        annotations = list(annotation.read_annotations(self.parents))
        if self.utrs:
            utrs = list(annotation.read_annotations(self.utrs))
        else:
            utrs = []
        children = list(annotation.read_annotations(self.children))

        count_table = io.read_grouped_table(self.counts,
                                            [('Count', int),
                                             ('Tail_count', int),
                                             ('Tail', _float_or_none),
                                             ('Proportion', _float_or_none),
                                             ('Annotation', str)])
        counts = count_table['Count']
        tail_counts = count_table['Tail_count']
        proportions = count_table['Proportion']
        tails = count_table['Tail']

        samples = counts.value_type().keys()
        sample_tags = {}
        for line in count_table.comments:
            if line.startswith('#sampleTags='):
                parts = line[len('#sampleTags='):].split(',')
                assert parts[0] not in sample_tags
                sample_tags[parts[0]] = parts

        for item in children:
            item.weight = sum(counts[item.get_id()][name] *
                              float(norms[name]['Normalizing.multiplier'])
                              for name in samples)

        parents = []
        id_to_parent = {}
        for item in annotations:
            if item.type != self.parent_type: continue
            assert item.get_id() not in id_to_parent, 'Duplicate id in parent file: ' + item.get_id()
            parents.append(item)
            id_to_parent[item.get_id()] = item
            item.children = []
            #item.cds = [ ]

            # Default utr
            if item.strand >= 0:
                item.utr_pos = item.end
            else:
                item.utr_pos = item.start

            if 'three_prime_UTR_start' in item.attr:
                if item.strand >= 0:
                    item.utr_pos = int(item.attr['three_prime_UTR_start']) - 1
                else:
                    item.utr_pos = int(item.attr['three_prime_UTR_start'])

        for item in utrs:
            assert item.attr['Parent'] in id_to_parent, 'Unknown gene ' + item.attr['Parent']
            id_to_parent[item.attr['Parent']].utr_pos = (
                item.start if item.strand >= 0 else item.end)

        for item in children:
            item.transcription_stop = item.end if item.strand >= 0 else item.start  #End of transcription, 0-based, ie between-positions based

            if 'Parent' in item.attr and item.attr.get(
                    "Relation") != "Antisense":
                for item_parent in item.attr['Parent'].split(','):
                    parent = id_to_parent[item_parent]
                    parent.children.append(item)

        for item in parents:
            item.children.sort(key=_annotation_sorter)

            relevant = list(item.children)
            if self.utr_only:
                #if item.strand <= 0:
                #    relative_utr_start = item.end - int(item.attr['three_prime_UTR_start'])
                #else:
                #    relative_utr_start = int(item.attr['three_prime_UTR_start'])-1 - item.start
                #
                #def relative_start(peak):
                #    return item.end-peak.end if item.strand < 0 else peak.start-item.start
                #relevant = [ peak for peak in relevant if relative_start(peak) >= relative_utr_start ]

                #relevant = [
                #    peak for peak in relevant
                #    if (peak.end >= item.utr_pos if item.strand >= 0 else peak.start <= item.utr_pos)
                #    ]

                relevant = [
                    peak for peak in relevant
                    if peak.attr.get("Relation") == "3'UTR"
                ]

            if self.top:
                relevant.sort(key=lambda peak: peak.weight, reverse=True)
                relevant = relevant[:self.top]
            relevant.sort(key=_annotation_sorter)
            item.relevant_children = relevant

        # JSON output

        j_data = {}
        j_genes = j_data['genes'] = {}

        j_genes['__comment__'] = 'start is 0-based'
        j_genes['name'] = []
        j_genes['chromosome'] = []
        j_genes['strand'] = []
        j_genes['start'] = []
        j_genes['utr'] = []
        j_genes['end'] = []
        j_genes['gene'] = []
        j_genes['product'] = []
        j_genes['peaks'] = []
        j_genes['relevant_peaks'] = []
        #j_genes['cds'] = [ ]
        #j_genes['cds_start'] = [ ]
        #j_genes['cds_end'] = [ ]
        for item in parents:
            j_genes['name'].append(item.get_id())
            j_genes['chromosome'].append(item.seqid)
            j_genes['strand'].append(item.strand)
            j_genes['start'].append(item.start)
            j_genes['utr'].append(item.utr_pos)
            j_genes['end'].append(item.end)
            j_genes['gene'].append(
                item.attr.get('Name', item.attr.get('gene', '')))
            j_genes['product'].append(
                item.attr.get('Product', item.attr.get('product', '')))
            j_genes['peaks'].append(
                [item2.get_id() for item2 in item.children])
            j_genes['relevant_peaks'].append(
                [item2.get_id() for item2 in item.relevant_children])
            #j_genes['cds'].append( item.cds )
            #j_genes['cds_start'].append( item.cds_start )
            #j_genes['cds_end'].append( item.cds_end )

        j_peaks = j_data['peaks'] = {}
        j_peaks['__comment__'] = 'start is 0-based'
        j_peaks['name'] = []
        j_peaks['chromosome'] = []
        j_peaks['strand'] = []
        j_peaks['start'] = []
        j_peaks['end'] = []
        j_peaks['parents'] = []
        j_peaks['counts'] = []
        j_peaks['tail_lengths'] = []
        j_peaks['proportion_tailed'] = []
        for item in children:
            j_peaks['name'].append(item.get_id())
            j_peaks['chromosome'].append(item.seqid)
            j_peaks['strand'].append(item.strand)
            j_peaks['start'].append(item.start)
            j_peaks['end'].append(item.end)
            j_peaks['parents'].append(item.attr['Parent'].split(',')
                                      if 'Parent' in item.attr else [])
            j_peaks['counts'].append(counts[item.get_id()].values())
            j_peaks['tail_lengths'].append(
                count_table['Tail'][item.get_id()].values())
            j_peaks['proportion_tailed'].append(
                count_table['Proportion'][item.get_id()].values())

        j_samples = j_data['samples'] = {}
        j_samples['name'] = []
        j_samples['tags'] = []
        j_samples['normalizing_multiplier'] = []
        for name in samples:
            j_samples['name'].append(name)
            j_samples['tags'].append(sample_tags[name])
            j_samples['normalizing_multiplier'].append(
                float(norms[name]['Normalizing.multiplier']))

        j_chromosomes = j_data['chromosomes'] = {}
        j_chromosomes['name'] = []
        j_chromosomes['length'] = []
        for name, seq in chromosomes.iteritems():
            j_chromosomes['name'].append(name)
            j_chromosomes['length'].append(len(seq))

        with open(self.prefix + '.json', 'wb') as f:
            json.dump(j_data, f)

        # Output paired peak file

        output_comments = ['#Counts']
        output_samples = []
        for item in samples:
            output_samples.append(item + '-peak1')
            output_comments.append('#sampleTags=' +
                                   ','.join([item + '-peak1', 'peak1'] +
                                            sample_tags.get(item, [])))
        for item in samples:
            output_samples.append(item + '-peak2')
            output_comments.append('#sampleTags=' +
                                   ','.join([item + '-peak2', 'peak2'] +
                                            sample_tags.get(item, [])))

        output_names = []
        output_counts = []
        output_tail_counts = []
        output_proportions = []
        output_tails = []
        output_annotation_fields = [
            'gene', 'product', 'biotype', 'mean_tail_1', 'mean_tail_2',
            'chromosome', 'strand', 'transcription_stops'
        ]  #, 'interpeak_seq', ]
        output_annotations = []

        for item in parents:
            peaks = item.relevant_children
            for i in xrange(len(peaks) - 1):
                for j in xrange(i + 1, len(peaks)):
                    id_i = peaks[i].get_id()
                    id_j = peaks[j].get_id()
                    id_pair = item.get_id() + '-' + id_i + '-' + id_j
                    output_names.append(id_pair)

                    row = []
                    row.extend(counts[id_i].values())
                    row.extend(counts[id_j].values())
                    output_counts.append(filter(_text, row))

                    row = []
                    row.extend(tail_counts[id_i].values())
                    row.extend(tail_counts[id_j].values())
                    output_tail_counts.append(filter(_text, row))

                    row = []
                    row.extend(proportions[id_i].values())
                    row.extend(proportions[id_j].values())
                    output_proportions.append(filter(_text, row))

                    row = []
                    row.extend(tails[id_i].values())
                    row.extend(tails[id_j].values())
                    output_tails.append(filter(_text, row))

                    output_annotations.append([
                        item.attr.get('Name', item.attr.get('gene', '')),
                        item.attr.get('Product', item.attr.get('product', '')),
                        item.attr.get('Biotype', ''),
                        count_table['Annotation'][id_i]['mean-tail'],
                        count_table['Annotation'][id_j]['mean-tail'],
                        item.seqid,
                        str(item.strand),
                        '%d, %d' % (peaks[i].transcription_stop,
                                    peaks[j].transcription_stop),
                        #get_interpeak_seq([peaks[i],peaks[j]]),
                    ])

        #output_count_table = io.named_matrix_type(output_names,output_samples)(output_counts)
        io.write_grouped_csv(
            self.prefix + '-pairs.csv',
            [
                ('Count', io.named_matrix_type(output_names,
                                               output_samples)(output_counts)),
                ('Tail_count',
                 io.named_matrix_type(output_names,
                                      output_samples)(output_tail_counts)),
                ('Proportion',
                 io.named_matrix_type(output_names,
                                      output_samples)(output_proportions)),
                ('Tail', io.named_matrix_type(output_names,
                                              output_samples)(output_tails)),
                ('Annotation',
                 io.named_matrix_type(
                     output_names,
                     output_annotation_fields)(output_annotations)),
            ],
            comments=output_comments,
        )