Example #1
0
    def _write_table(self, samples, items):
        """Write one grouped CSV table, ``<prefix>.csv``, with a row per item.

        Rows are named ``CHROM:POS`` from each item's record.  The groups
        written are 'Location', 'Genotype', optionally 'Quality' (when
        ``self.qualities`` is set) and 'Count' (when ``self.counts`` is set),
        and finally 'Annotation'.

        ``samples`` supplies the column names of the per-sample groups.
        ``items`` is traversed several times, so it must be re-iterable.
        """
        row_names = []
        for entry in items:
            row_names.append(
                '%s:%d' % (entry.record.CHROM, entry.record.POS))
        per_sample = io.named_list_type(samples)

        # Location: chromosome and position of every item.
        location_row = io.named_list_type(['CHROM', 'POS'])
        location_table = io.named_list_type(row_names, location_row)
        location_rows = []
        for entry in items:
            location_rows.append(
                location_row([entry.record.CHROM, entry.record.POS]))
        groups = [('Location', location_table(location_rows))]

        # Genotype: one described genotype per sample.
        genotype_table = io.named_list_type(row_names, per_sample)
        genotype_rows = []
        for entry in items:
            described = [
                describe_genotype(call, entry.variants)
                for call in entry.genotypes
            ]
            genotype_rows.append(per_sample(described))
        groups.append(('Genotype', genotype_table(genotype_rows)))

        # Quality: the item's qualities passed through unchanged.
        if self.qualities:
            quality_table = io.named_list_type(row_names, per_sample)
            quality_rows = []
            for entry in items:
                quality_rows.append(per_sample(entry.qualities))
            groups.append(('Quality', quality_table(quality_rows)))

        # Count: one described count per sample.
        if self.counts:
            count_table = io.named_list_type(row_names, per_sample)
            count_rows = []
            for entry in items:
                described = [
                    describe_counts(tally, entry.variants)
                    for tally in entry.counts
                ]
                count_rows.append(per_sample(described))
            groups.append(('Count', count_table(count_rows)))

        # Annotation: first field of each snpeff entry whose second field
        # is accepted by self.snpeff_show, joined into a single cell.
        annotation_row = io.named_list_type(['snpeff'])
        annotation_table = io.named_list_type(row_names, annotation_row)
        annotation_rows = []
        for entry in items:
            shown = [
                effect[0] for effect in entry.snpeff
                if selection.matches(self.snpeff_show, effect[1])
            ]
            annotation_rows.append(annotation_row([' /// '.join(shown)]))
        groups.append(('Annotation', annotation_table(annotation_rows)))

        io.write_grouped_csv(self.prefix + '.csv', groups)
Example #2
0
    def _write_table(self, samples, items):
        """Emit ``<prefix>.csv``: a grouped table with one row per item.

        Each row is labelled ``CHROM:POS``.  Column groups, in order:
        'Location', 'Genotype', 'Quality' (only if ``self.qualities``),
        'Count' (only if ``self.counts``) and 'Annotation'.

        ``items`` is iterated once per group, so it must be a sequence
        rather than a one-shot iterator.
        """
        names = [ '%s:%d' % (it.record.CHROM, it.record.POS) for it in items ]
        sample_list = io.named_list_type(samples)

        def sample_table(cells_for):
            # Build a names x samples table; cells_for(item) supplies the
            # per-sample cells of one row.
            table_type = io.named_list_type(names, sample_list)
            return table_type([ sample_list(cells_for(it)) for it in items ])

        locations_list = io.named_list_type(['CHROM','POS'])
        location_table = io.named_list_type(names, locations_list)
        groups = [
            ('Location', location_table([
                locations_list([ it.record.CHROM, it.record.POS ])
                for it in items
                ])),
            ]

        groups.append((
            'Genotype',
            sample_table(lambda it: [
                describe_genotype(call, it.variants) for call in it.genotypes
                ]),
            ))

        if self.qualities:
            groups.append(('Quality', sample_table(lambda it: it.qualities)))

        if self.counts:
            groups.append((
                'Count',
                sample_table(lambda it: [
                    describe_counts(tally, it.variants) for tally in it.counts
                    ]),
                ))

        # Single 'snpeff' column: first field of every snpeff entry whose
        # second field passes the self.snpeff_show selection.
        annotation_list = io.named_list_type(['snpeff'])
        annotation_table = io.named_list_type(names, annotation_list)
        annotation_rows = [ ]
        for it in items:
            kept = [
                effect[0] for effect in it.snpeff
                if selection.matches(self.snpeff_show, effect[1])
                ]
            annotation_rows.append(annotation_list([ ' /// '.join(kept) ]))
        groups.append(('Annotation', annotation_table(annotation_rows)))

        io.write_grouped_csv(self.prefix + '.csv', groups)
Example #3
0
    def run(self):
        """Write the peak-pair tables and the JSON summary for this analysis.

        Reads the reference sequences, the normalization table, the parent,
        UTR and child annotation files and the grouped count table, attaches
        each child peak to its parent feature(s), then writes:

        * ``<prefix>-pairs-norm.csv`` -- the normalization rows repeated for
          the ``-peak1`` and ``-peak2`` pseudo-samples,
        * ``<prefix>.json`` -- genes, peaks, samples and chromosome lengths,
        * ``<prefix>-pairs.csv`` -- one row per pair of relevant peaks that
          share a parent feature.

        When ``self.norm_file`` is empty, ``nesoni.Norm_from_counts`` is run
        first and its ``<prefix>-norm.csv`` output is used.

        Raises:
            AssertionError: on a repeated ``#sampleTags=`` sample, a duplicate
                parent id, or a UTR whose 'Parent' is not a known parent.
            KeyError: if a child lists a 'Parent' id that is not among the
                parents of type ``self.parent_type``.
        """
        #assert not self.utr_only or self.utrs, '--utrs-only yes but no --utrs given'

        # Reference genome: ordered mapping of sequence name -> sequence.

        #chromosome_lengths = reference_directory.Reference(self.reference, must_exist=True).get_lengths()
        chromosomes = collections.OrderedDict(io.read_sequences(self.reference))

        # NOTE: within this method the two sequence helpers below are only
        # referenced from commented-out code; they are kept so that code can
        # be re-enabled.
        def get_interpeak_seq(peaks):
            # Sequence between the outermost transcription stops, or '' if
            # that span is longer than self.max_seq.
            start = min(item.transcription_stop for item in peaks)
            end = max(item.transcription_stop for item in peaks)
            if end-start > self.max_seq: return ''
            if peaks[0].strand >= 0:
                return chromosomes[peaks[0].seqid][start:end]
            else:
                return bio.reverse_complement(chromosomes[peaks[0].seqid][start:end])

        def get_prepeak_seq(gene,peaks):
            # Sequence between the gene's utr_pos and the nearest peak
            # transcription stop, or '' if longer than self.max_seq.
            if gene.strand >= 0:
                start = gene.utr_pos
                end = min(item.transcription_stop for item in peaks)
                if end-start > self.max_seq: return ''
                return chromosomes[gene.seqid][start:end]
            else:
                start = max(item.transcription_stop for item in peaks)
                end = gene.utr_pos
                if end-start > self.max_seq: return ''
                return bio.reverse_complement(chromosomes[gene.seqid][start:end])

        def pair_row(table, id_a, id_b):
            # Values of the first peak followed by those of the second.
            row = [ ]
            row.extend(table[id_a].values())
            row.extend(table[id_b].values())
            # NOTE(review): filter() keeps the original values for which
            # _text(value) is truthy; if the intent was to convert every
            # value to text, map() would be needed -- confirm against _text.
            return filter(_text,row)

        # Normalization files

        if self.norm_file:
            norm_file = self.norm_file
        else:
            nesoni.Norm_from_counts(self.prefix+'-norm', self.counts).run()
            norm_file = self.prefix+'-norm.csv'

        norms = io.read_grouped_table(norm_file, [('All',str)])['All']

        # Every sample appears twice in the pairs table (once per peak of the
        # pair), so its normalization row is duplicated under both suffixes.
        norm_names = norms.keys()
        norm_values = norms.values()
        pair_norm_names = [ ]
        pair_norms = [ ]
        for suffix in ('-peak1', '-peak2'):
            for name, value in zip(norm_names, norm_values):
                pair_norm_names.append(name+suffix)
                pair_norms.append(value)
        io.write_grouped_csv(
            self.prefix+'-pairs-norm.csv',
            [('All',io.named_list_type(pair_norm_names)(pair_norms))],
            comments=['#Normalization'],
            )


        # Read data

        annotations = list(annotation.read_annotations(self.parents))
        if self.utrs:
            utrs = list(annotation.read_annotations(self.utrs))
        else:
            utrs = [ ]
        children = list(annotation.read_annotations(self.children))

        count_table = io.read_grouped_table(self.counts, [
            ('Count',int),
            ('Tail_count',int),
            ('Tail',_float_or_none),
            ('Proportion',_float_or_none),
            ('Annotation',str)
            ])
        counts = count_table['Count']
        tail_counts = count_table['Tail_count']
        proportions = count_table['Proportion']
        tails = count_table['Tail']

        samples = counts.value_type().keys()

        # sample name -> full list of fields from its '#sampleTags=' comment
        # (the sample name itself is the first field).
        sample_tags = { }
        for line in count_table.comments:
            if line.startswith('#sampleTags='):
                parts = line[len('#sampleTags='):].split(',')
                assert parts[0] not in sample_tags
                sample_tags[parts[0]] = parts

        # Peak weight: sum of counts scaled by each sample's multiplier.
        # Used below to pick the top peaks of a parent.
        for item in children:
            item.weight = sum( counts[item.get_id()][name] * float(norms[name]['Normalizing.multiplier']) for name in samples )

        parents = [ ]
        id_to_parent = { }
        for item in annotations:
            if item.type != self.parent_type: continue
            assert item.get_id() not in id_to_parent, 'Duplicate id in parent file: '+item.get_id()
            parents.append(item)
            id_to_parent[item.get_id()] = item
            item.children = [ ]
            #item.cds = [ ]

            # Default utr: the feature's end on the forward strand, its
            # start otherwise.
            if item.strand >= 0:
                item.utr_pos = item.end
            else:
                item.utr_pos = item.start

            # An explicit attribute overrides the default.
            # NOTE(review): the -1 presumably converts a 1-based start to the
            # 0-based coordinates used elsewhere -- confirm.
            if 'three_prime_UTR_start' in item.attr:
                if item.strand >= 0:
                    item.utr_pos = int(item.attr['three_prime_UTR_start'])-1
                else:
                    item.utr_pos = int(item.attr['three_prime_UTR_start'])

        # UTR annotations, when given, take precedence over both of the above.
        for item in utrs:
            assert item.attr['Parent'] in id_to_parent, 'Unknown gene '+item.attr['Parent']
            id_to_parent[item.attr['Parent']].utr_pos = (item.start if item.strand >= 0 else item.end)


        for item in children:
            item.transcription_stop = item.end if item.strand >= 0 else item.start #End of transcription, 0-based, ie between-positions based

            if 'Parent' in item.attr:
                for item_parent in item.attr['Parent'].split(','):
                    parent = id_to_parent[item_parent]
                    parent.children.append(item)


        for item in parents:
            item.children.sort(key=_annotation_sorter)

            relevant = list(item.children)
            if self.utr_only:
                #if item.strand <= 0:
                #    relative_utr_start = item.end - int(item.attr['three_prime_UTR_start'])
                #else:
                #    relative_utr_start = int(item.attr['three_prime_UTR_start'])-1 - item.start
                #
                #def relative_start(peak):
                #    return item.end-peak.end if item.strand < 0 else peak.start-item.start
                #relevant = [ peak for peak in relevant if relative_start(peak) >= relative_utr_start ]
                relevant = [
                    peak for peak in relevant
                    if (peak.end >= item.utr_pos if item.strand >= 0 else peak.start <= item.utr_pos)
                    ]

            if self.top:
                # Keep the self.top heaviest peaks, then restore the
                # annotation order.
                relevant.sort(key=lambda peak:peak.weight, reverse=True)
                relevant = relevant[:self.top]
            relevant.sort(key=_annotation_sorter)
            item.relevant_children = relevant



        # JSON output

        j_data = { }
        j_genes = j_data['genes'] = { }

        j_genes['__comment__'] = 'start is 0-based'
        j_genes['name'] = [ ]
        j_genes['chromosome'] = [ ]
        j_genes['strand'] = [ ]
        j_genes['start'] = [ ]
        j_genes['utr'] = [ ]
        j_genes['end'] = [ ]
        j_genes['gene'] = [ ]
        j_genes['product'] = [ ]
        j_genes['peaks'] = [ ]
        j_genes['relevant_peaks'] = [ ]
        #j_genes['cds'] = [ ]
        #j_genes['cds_start'] = [ ]
        #j_genes['cds_end'] = [ ]
        for item in parents:
            j_genes['name'].append( item.get_id() )
            j_genes['chromosome'].append( item.seqid )
            j_genes['strand'].append( item.strand )
            j_genes['start'].append( item.start )
            j_genes['utr'].append( item.utr_pos )
            j_genes['end'].append( item.end )
            j_genes['gene'].append( item.attr.get('Name',item.attr.get('gene','')) )
            j_genes['product'].append( item.attr.get('Product',item.attr.get('product','')) )
            j_genes['peaks'].append( [ item2.get_id() for item2 in item.children ] )
            j_genes['relevant_peaks'].append( [ item2.get_id() for item2 in item.relevant_children ] )
            #j_genes['cds'].append( item.cds )
            #j_genes['cds_start'].append( item.cds_start )
            #j_genes['cds_end'].append( item.cds_end )

        j_peaks = j_data['peaks'] = { }
        j_peaks['__comment__'] = 'start is 0-based'
        j_peaks['name'] = [ ]
        j_peaks['chromosome'] = [ ]
        j_peaks['strand'] = [ ]
        j_peaks['start'] = [ ]
        j_peaks['end'] = [ ]
        j_peaks['parents'] = [ ]
        j_peaks['counts'] = [ ]
        j_peaks['tail_lengths'] = [ ]
        j_peaks['proportion_tailed'] = [ ]
        for item in children:
            j_peaks['name'].append( item.get_id() )
            j_peaks['chromosome'].append( item.seqid )
            j_peaks['strand'].append( item.strand )
            j_peaks['start'].append( item.start )
            j_peaks['end'].append( item.end )
            j_peaks['parents'].append( item.attr['Parent'].split(',') if 'Parent' in item.attr else [ ])
            j_peaks['counts'].append( counts[item.get_id()].values() )
            j_peaks['tail_lengths'].append( tails[item.get_id()].values() )
            j_peaks['proportion_tailed'].append( proportions[item.get_id()].values() )

        j_samples = j_data['samples'] = { }
        j_samples['name'] = [ ]
        j_samples['tags'] = [ ]
        j_samples['normalizing_multiplier'] = [ ]
        for name in samples:
            j_samples['name'].append(name)
            # A sample without a '#sampleTags=' comment gets an empty tag
            # list, matching the .get(..., []) used for the pairs file below.
            j_samples['tags'].append(sample_tags.get(name, [ ]))
            j_samples['normalizing_multiplier'].append(float(norms[name]['Normalizing.multiplier']))

        j_chromosomes = j_data['chromosomes'] = { }
        j_chromosomes['name'] = [ ]
        j_chromosomes['length'] = [ ]
        for name, seq in chromosomes.iteritems():
            j_chromosomes['name'].append(name)
            j_chromosomes['length'].append(len(seq))

        with open(self.prefix + '.json','wb') as f:
            json.dump(j_data, f)


        # Output paired peak file

        # Each original sample becomes two columns, '<sample>-peak1' and
        # '<sample>-peak2', tagged with the peak role plus the original tags.
        output_comments = [ '#Counts' ]
        output_samples = [ ]
        for item in samples:
            output_samples.append(item+'-peak1')
            output_comments.append('#sampleTags=' + ','.join([item+'-peak1','peak1']+sample_tags.get(item,[])))
        for item in samples:
            output_samples.append(item+'-peak2')
            output_comments.append('#sampleTags=' + ','.join([item+'-peak2','peak2']+sample_tags.get(item,[])))

        output_names = [ ]
        output_counts = [ ]
        output_tail_counts = [ ]
        output_proportions = [ ]
        output_tails = [ ]
        output_annotation_fields = [ 'gene', 'product', 'mean_tail_1', 'mean_tail_2', 'chromosome', 'strand',
                                     'transcription_stops' ] #, 'interpeak_seq', ]
        output_annotations = [ ]

        for item in parents:
            peaks = item.relevant_children
            # Every unordered pair (i < j) of this parent's relevant peaks.
            for i in xrange(len(peaks)-1):
                for j in xrange(i+1, len(peaks)):
                    id_i = peaks[i].get_id()
                    id_j = peaks[j].get_id()
                    id_pair = item.get_id() + '-'+id_i+'-'+id_j
                    output_names.append(id_pair)

                    output_counts.append(pair_row(counts, id_i, id_j))
                    output_tail_counts.append(pair_row(tail_counts, id_i, id_j))
                    output_proportions.append(pair_row(proportions, id_i, id_j))
                    output_tails.append(pair_row(tails, id_i, id_j))

                    output_annotations.append([
                        item.attr.get('Name',item.attr.get('gene','')),
                        item.attr.get('Product',item.attr.get('product','')),
                        count_table['Annotation'][id_i]['mean-tail'],
                        count_table['Annotation'][id_j]['mean-tail'],

                        item.seqid,
                        str(item.strand),
                        '%d, %d' % (peaks[i].transcription_stop,peaks[j].transcription_stop),
                        #get_interpeak_seq([peaks[i],peaks[j]]),
                        ])

        #output_count_table = io.named_matrix_type(output_names,output_samples)(output_counts)
        io.write_grouped_csv(
            self.prefix + '-pairs.csv',
            [
                ('Count',io.named_matrix_type(output_names,output_samples)(output_counts)),
                ('Tail_count',io.named_matrix_type(output_names,output_samples)(output_tail_counts)),
                ('Proportion',io.named_matrix_type(output_names,output_samples)(output_proportions)),
                ('Tail',io.named_matrix_type(output_names,output_samples)(output_tails)),
                ('Annotation',io.named_matrix_type(output_names,output_annotation_fields)(output_annotations)),
                ],
            comments=output_comments,
            )
                        
#        # Chi Sq tests
#        
#        #for id in relation:
#        #    peaks = relation[id]
#        #    if len(peaks) < 2: continue     
#        
#        mats = [ ]   
#        genes = [ ]
#        products = [ ]
#        mean_tails = [ ]
#        prop_tails = [ ]
#        
#        peak_names = [ ]
#        chromosome_names = [ ]
#        strands = [ ]
#        transcription_stops = [ ]
#        interpeak_seqs = [ ]
#        prepeak_seqs = [ ]
#        
#        for parent in parents:
#            id = parent.get_id()
#            peaks = parent.relevant_children
#            if len(peaks) < 2: continue
#            
#            matrix = [ ]
#            for item in peaks:
#                matrix.append(counts[item.get_id()].values())
#            
#            mats.append(
#                runr.R_literal(id) + ' = ' + 
#                runr.R_literal(matrix)
#                )
#            
#            genes.append(parent.attr.get('Name',parent.attr.get('gene','')))
#            products.append(parent.attr.get('Product',parent.attr.get('product','')))
#            
#            def format_mean(s):
#                if s == 'NA': return 'NA'
#                return '%.1f' % float(s)
#            mean_tails.append(', '.join( format_mean(count_table['Annotation'][item.get_id()]['mean-tail']) for item in peaks ))
#            
#            def format_prop(s):
#                if s == 'NA': return 'NA'
#                return '%.2f' % float(s)
#            prop_tails.append(', '.join( format_prop(count_table['Annotation'][item.get_id()]['proportion-with-tail']) for item in peaks ))
#            
#            peak_names.append(', '.join(item.get_id() for item in peaks))
#            chromosome_names.append(parent.seqid)
#            strands.append(parent.strand)
#            transcription_stops.append(', '.join(str(item.transcription_stop) for item in peaks))
#            interpeak_seqs.append(get_interpeak_seq(peaks))
#            prepeak_seqs.append(get_prepeak_seq(parent,peaks))
#            
#            #if len(mats) >= 10: break
#        
#        text = 'cat("Loading data into R+\n")\n'
#        text += 'data <- list(\n' + ',\n'.join(mats) + ')\n'        
#        text += CHISQ
#        
#        runr.run_script(text,
#            OUTPUT_FILENAME=self.prefix+'.csv',
#            GENES = genes,
#            PRODUCTS = products,
#            MEAN_TAILS = mean_tails,
#            PROP_TAILS = prop_tails,
#            PEAK_NAMES = peak_names,
#            CHROMOSOME_NAMES = chromosome_names,
#            STRANDS = strands,
#            TRANSCRIPTION_STOPS = transcription_stops,
#            INTERPEAK_SEQS = interpeak_seqs,
#            PREPEAK_SEQS = prepeak_seqs,
#            )
#        
            
        
        
        
        
        
        
        
        
Example #4
0
def count_run(min_score, min_size, max_size, filter_mode, equalize, types,
              locii, qualifiers, use_strand, merge_filename, limit,
              output_prefix, filenames, log):

    if filter_mode == 'poly':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
        use_only_monogamous = False
        expect_multiple_alignments = True
    elif filter_mode == 'mono':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
        use_only_monogamous = True
        expect_multiple_alignments = True
    else:
        assert filter_mode == 'existing', 'Unrecognized filtering mode'
        use_bam_filename = 'alignments_filtered.bam'
        use_only_top = False
        use_only_monogamous = False
        expect_multiple_alignments = False

    types = types.lower().split(',')

    qualifiers = qualifiers.split(',')

    if locii:
        locii = locii.lower().split(',')
    else:
        locii = None

    assert use_strand is not None, 'You must now explicitly specify --strand'
    assert use_strand in ('pool', 'forward', 'reverse',
                          'both'), "Can't understand --strand specification."

    from Bio import Seq, SeqIO

    annotation_filenames = []
    bam_filenames = []
    for arg in filenames:
        if annotation.is_annotation_file(arg):
            annotation_filenames.append(arg)
        else:
            bam_filenames.append(arg)

    n_samples = len(bam_filenames)
    titles = bam_filenames[:]
    tags = []
    for i in xrange(len(bam_filenames)):
        if os.path.isdir(bam_filenames[i]):
            working = working_directory.Working(bam_filenames[i])
            titles[i] = working.name
            tags.append(working.get_tags())
            if not annotation_filenames:
                reference_filename = working.get_reference(
                ).annotations_filename()
                if reference_filename is not None:
                    annotation_filenames.append(reference_filename)

            bam_filenames[i] = os.path.join(bam_filenames[i], use_bam_filename)

    assert bam_filenames, 'No reference alignments given'

    merge = {}
    merge_qualifiers = {}
    if merge_filename is not None:
        #First line gives qualifiers
        #remaining lines give <qualifier> <qualifier...> <gene> <transcript> <transcript...>

        f = open(merge_filename, 'rU')
        qualifiers = f.readline().rstrip('\n').split('\t')
        for line in f:
            parts = line.rstrip('\n').split('\t')
            if not parts: continue
            for name in parts[len(qualifiers) + 1:]:
                assert name not in merge, 'Duplicate feature name in merge file'
                merge[name] = parts[len(qualifiers)]
                merge_qualifiers[name] = parts[:len(qualifiers)]
        f.close()

    genes = {}  # reference name -> gene index

    feature_names = {}  # feature_name -> number of occurrences

    features = []

    n_features = 0

    chromosome_length = {}
    for filename in bam_filenames:
        headers = sam.bam_headers(filename)
        for line in headers.split('\n'):
            if not line: continue
            parts = line.split('\t')
            if parts[0] != '@SQ': continue

            name = None
            length = None
            for part in parts[1:]:
                if part.startswith('SN:'): name = part[3:]
                if part.startswith('LN:'): length = int(part[3:])
            assert name is not None and length is not None

            if name in chromosome_length:
                assert chromosome_length[name] == length
            else:
                chromosome_length[name] = length

    for name in chromosome_length:
        genes[name] = span_index.Span_index()

    if annotation_filenames:
        assert not merge, 'Merging not supported with annotation files'

        for filename in annotation_filenames:
            for feature in annotation.read_annotations(filename):
                if feature.type.lower() not in types: continue

                if (locii is not None and
                    ('locus_tag' not in feature.attr
                     or feature.attr['locus_tag'].lower() not in locii)):
                    continue

                f = Feature(n_samples)
                f.name = feature.get_id()
                if feature.type.lower() != 'cds' and len(types) > 1:
                    f.name = feature.type + ':' + f.name

                feature_names[f.name] = feature_names.get(f.name, 0) + 1
                if feature_names[f.name] > 1:
                    f.name += '/%d' % feature_names[f.name]

                f.qualifiers = [
                    feature.attr.get(item, '') for item in qualifiers
                ]

                f.length = feature.end - feature.start

                assert feature.seqid in genes, 'Annotation for sequence that is not in BAM files'
                genes[feature.seqid].insert(
                    Span_entry(feature.start, feature.end, feature.strand or 1,
                               f))
                features.append(f)

    else:
        # Sequences as features
        log.log(
            'No annotation files given or found, using sequences as features\n'
        )

        name_feature = {}  # (merged)name -> feature

        for name in chromosome_length:
            merged_name = merge.get(name, name)

            if merged_name not in name_feature:
                f = Feature(n_samples)
                f.name = merged_name
                f.length = length
                f.qualifiers = merge_qualifiers.get(name,
                                                    ('', ) * len(qualifiers))
                n_features += 1
                name_feature[merged_name] = f
                features.append(f)
            else:
                f = name_feature[merged_name]
                f.length = max(f.length, length)  #...

            genes[name].insert(Span_entry(0, chromosome_length[name], 1, f))

    log.log('%d features\n\n' % len(features))

    for name in genes:
        genes[name].prepare()

    n_fragments = [0] * n_samples
    n_fragments_aligned = [0] * n_samples
    n_low_score = [0] * n_samples
    n_something = [0] * n_samples
    n_multiple = [0] * n_samples
    n_span = [0] * n_samples

    for i in xrange(n_samples):
        for read_name, fragment_alignments, unmapped in sam.bam_iter_fragments(
                bam_filenames[i],
                'Counting sample %d of %d' % (i + 1, n_samples)):
            n_fragments[i] += 1

            if not fragment_alignments:
                continue

            n_fragments_aligned[i] += 1

            feature_hits = []  # [ [ (feature, strand) ] ]

            # Use only top scoring alignments
            fragment_scores = [
                sum(al.get_AS() for al in item) for item in fragment_alignments
            ]

            best_score = max(fragment_scores)

            if min_score is not None and best_score < min_score:
                n_low_score[i] += 1
                continue

            if use_only_top:
                cutoff = max(best_score, min_score)
            else:
                cutoff = min_score
            fragment_alignments = [
                item
                for item, score in zip(fragment_alignments, fragment_scores)
                if score >= cutoff
            ]

            for alignments in fragment_alignments:
                strand = -1 if alignments[0].flag & sam.FLAG_REVERSE else 1

                start = min(item.pos - 1 for item in alignments)
                end = max(item.pos + item.length - 1 for item in alignments)
                length = end - start
                if min_size is not None and length < min_size: continue
                if max_size is not None and length > max_size: continue

                rname = alignments[0].rname
                strand = -1 if alignments[0].flag & sam.FLAG_REVERSE else 1
                assert alignments[
                    0].rname in genes, 'Alignment refers to sequence not present in GENBANK file'

                this_feature_hits = []
                for item in genes[rname].get(start, end):
                    rel_strand = strand * item.strand
                    key = (item.feature, rel_strand)
                    if key in this_feature_hits: continue
                    this_feature_hits.append(key)
                    if not use_only_monogamous or len(
                            fragment_alignments) == 1:
                        item.feature.count[rel_strand][i] += 1

                if this_feature_hits:
                    feature_hits.append(this_feature_hits)

                if len(this_feature_hits) > 1:
                    for a in this_feature_hits:
                        for b in this_feature_hits:
                            if a[0] is b[0]: continue
                            a[0].common[(a[1], b[1])][b[0]] += 1

            if len(feature_hits) > 0:
                n_something[i] += 1
            #else:
            #    print fragment_alignments
            #    print genes[fragment_alignments[0][0].rname].indexes
            #    print

            if len(feature_hits) > 1:
                n_multiple[i] += 1
                for j in xrange(len(feature_hits)):
                    for k in xrange(len(feature_hits)):
                        if j == k: continue
                        for a in feature_hits[j]:
                            for b in feature_hits[k]:
                                if a[0] is b[0]: continue
                                a[0].ambiguous[(a[1], b[1])][b[0]] += 1

            if any(len(item) > 1 for item in feature_hits): n_span[i] += 1

            if limit is not None and n_fragments[i] >= limit: break

        grace.status('')

        #log.log('%s\n' % titles[i])
        #log.log('%20s fragments\n' % grace.pretty_number(n_fragments[i]))
        #log.log('%20s fragments aligned to the reference\n' % grace.pretty_number(n_fragments_aligned[i]))
        #if n_low_score[i]:
        #    log.log('%20s had too low an alignment score, discarded\n' % grace.pretty_number(n_low_score[i]))
        #log.log('%20s aligned to an annotated gene\n' % grace.pretty_number(n_something[i]))
        #if expect_multiple_alignments or n_multiple[i]:
        #    log.log('%20s aligned to multiple genes\n' % grace.pretty_number(n_multiple[i]))
        #log.log('%20s had an alignment that spanned multiple genes\n' % grace.pretty_number(n_span[i]))
        #log.log('\n')

        log.datum(titles[i], 'fragments', n_fragments[i])
        log.datum(titles[i], 'fragments aligned to the reference',
                  n_fragments_aligned[i])
        if n_low_score[i]:
            log.datum(titles[i], 'had too low an alignment score, discarded',
                      n_low_score[i])
        log.datum(titles[i], 'aligned to an annotated gene', n_something[i])
        if expect_multiple_alignments or n_multiple[i]:
            log.datum(titles[i], 'aligned to multiple genes', n_multiple[i])
        log.datum(titles[i], 'had an alignment that spanned multiple genes',
                  n_span[i])
        log.log('\n')

    strandedness = []
    for feature in features:
        n_forward = sum(feature.count[1])
        n_reverse = sum(feature.count[-1])
        if n_forward + n_reverse < 5: continue
        strandedness.append(
            (n_forward - n_reverse) * 100.0 / (n_forward + n_reverse))
    strandedness = sum(strandedness) / max(1, len(strandedness))
    log.log(
        'Strand specificity score: %.0f\n'
        '  (~ -100 reverse strand, ~ 0 non-specific, ~ 100 forward strand\n'
        '   Average over all features with at least 5 hits.)\n' % strandedness)

    if use_strand == 'pool':
        getters = [
            lambda f:
            (feature.name, add_lists(feature.count[1], feature.count[-1]),
             add_defdicts(feature.common[(1, 1)], feature.common[
                 (1, -1)], feature.common[(-1, 1)], feature.common[(-1, -1)]),
             add_defdicts(feature.ambiguous[(1, 1)], feature.ambiguous[
                 (1, -1)], feature.ambiguous[(-1, 1)], feature.ambiguous[
                     (-1, -1)]))
        ]
    elif use_strand == 'forward':
        getters = [
            lambda f: (feature.name, feature.count[1], feature.common[
                (1, 1)], feature.ambiguous[(1, 1)])
        ]
    elif use_strand == 'reverse':
        getters = [
            lambda f: (feature.name, feature.count[-1], feature.common[
                (-1, -1)], feature.ambiguous[(-1, -1)])
        ]
    elif use_strand == 'both':
        getters = [
            lambda f: (feature.name, feature.count[1], feature.common[
                (1, 1)], feature.ambiguous[(1, 1)]), lambda f:
            (feature.name + 'r', feature.count[-1], feature.common[
                (-1, -1)], feature.ambiguous[(-1, -1)])
        ]

    total_hits = [0] * n_samples
    for feature in features:
        for getter in getters:
            total_hits = add_lists(total_hits, getter(feature)[1])

    if equalize:
        min_hits = min(total_hits)
        p = [float(min_hits) / item for item in total_hits]
        total_hits = [min_hits] * n_samples

    comments = ['#Counts'] + ['#sampleTags=' + ','.join(item) for item in tags]

    names = []

    count_type = io.named_list_type(titles)
    counts = []

    #rpkm_type = io.named_list_type(titles)
    #rpkms = [ ]

    annotation_type = io.named_list_type(['Length'] + qualifiers)
    annotations = []

    alignment_type = io.named_list_type(
        ['On same fragment'] +
        ['Ambiguous alignment'] if expect_multiple_alignments else [])
    alignments = []

    for feature in features:
        for getter in getters:
            feature_name, count, common, ambiguous = getter(feature)

            if equalize:
                count = [subsample(count[i], p[i]) for i in xrange(n_samples)]

            #rpkm = [ count[i] * 1.0e9 / feature.length / total_hits[i] for i in xrange(n_samples) ]

            #common_str = ' '.join(
            #    '%dx%s' % (item[1],item[0])
            #    for item in sorted(common.items(), key=lambda item:item[1], reverse=True)
            #    )
            #ambiguous_str = ' '.join(
            #    '%dx%s' % (item[1],item[0])
            #    for item in sorted(ambiguous.items(), key=lambda item:item[1], reverse=True)
            #    )
            common_str = count_encode(common)
            ambiguous_str = count_encode(ambiguous)

            names.append(feature_name)
            counts.append(count_type(count))
            #rpkms.append(rpkm_type(rpkm))
            annotations.append(
                annotation_type([str(feature.length)] +
                                list(feature.qualifiers)))
            alignments.append(
                alignment_type(
                    [common_str] +
                    [ambiguous_str] if expect_multiple_alignments else []))

    groups = [
        ('Count', io.named_list_type(names, count_type)(counts)),
        #('RPKM', io.named_list_type(names,rpkm_type)(rpkms)),
        ('Annotation', io.named_list_type(names,
                                          annotation_type)(annotations)),
        ('Alignment', io.named_list_type(names, alignment_type)(alignments)),
    ]

    io.write_grouped_csv(output_prefix + '.csv',
                         groups,
                         rowname_name='Feature',
                         comments=comments)
    def run(self):
        """Write paired-peak tables and a JSON summary for peaks sharing a parent.

        Inputs read from attributes of ``self``:

        - ``self.reference``: sequences, read with ``io.read_sequences``.
        - ``self.parents`` / ``self.utrs`` / ``self.children``: annotation
          files (``self.utrs`` is optional).
        - ``self.counts``: grouped count table with 'Count', 'Tail_count',
          'Tail', 'Proportion' and 'Annotation' groups.
        - ``self.norm_file``: normalization table; when falsy,
          ``nesoni.Norm_from_counts`` is run to create ``<prefix>-norm.csv``.
        - ``self.parent_type``, ``self.utr_only``, ``self.top``,
          ``self.max_seq``, ``self.prefix``: selection and output options.

        Files written:

        - ``<prefix>-pairs-norm.csv``: every normalization row twice, with
          '-peak1' and then '-peak2' appended to the sample name.
        - ``<prefix>.json``: genes, peaks, samples and chromosome lengths.
        - ``<prefix>-pairs.csv``: one row per pair of relevant peaks within a
          parent, with the first peak's per-sample values followed by the
          second peak's.
        """
        #assert not self.utr_only or self.utrs, '--utrs-only yes but no --utrs given'

        # Reference genome

        #chromosome_lengths = reference_directory.Reference(self.reference, must_exist=True).get_lengths()
        chromosomes = collections.OrderedDict(io.read_sequences(
            self.reference))

        # NOTE(review): the two helpers below are not called anywhere in this
        # method as shown (the only call to get_interpeak_seq is commented
        # out further down).

        # Sequence between the smallest and largest transcription stop of the
        # given peaks, reverse-complemented for reverse-strand peaks. Returns
        # '' when the span exceeds self.max_seq.
        def get_interpeak_seq(peaks):
            start = min(item.transcription_stop for item in peaks)
            end = max(item.transcription_stop for item in peaks)
            if end - start > self.max_seq: return ''
            if peaks[0].strand >= 0:
                return chromosomes[peaks[0].seqid][start:end]
            else:
                return bio.reverse_complement(
                    chromosomes[peaks[0].seqid][start:end])

        # Sequence between the gene's utr_pos and the nearest peak
        # transcription stop, reverse-complemented for reverse-strand genes.
        # Returns '' when the span exceeds self.max_seq.
        def get_prepeak_seq(gene, peaks):
            if gene.strand >= 0:
                start = gene.utr_pos
                end = min(item.transcription_stop for item in peaks)
                if end - start > self.max_seq: return ''
                return chromosomes[gene.seqid][start:end]
            else:
                start = max(item.transcription_stop for item in peaks)
                end = gene.utr_pos
                if end - start > self.max_seq: return ''
                return bio.reverse_complement(
                    chromosomes[gene.seqid][start:end])

        # Normalization files

        if self.norm_file:
            norm_file = self.norm_file
        else:
            nesoni.Norm_from_counts(self.prefix + '-norm', self.counts).run()
            norm_file = self.prefix + '-norm.csv'

        norms = io.read_grouped_table(norm_file, [('All', str)])['All']
        # The paired table has two columns per sample (<sample>-peak1 and
        # <sample>-peak2), so each sample's normalization row is emitted twice.
        # NOTE(review): norms.keys()[i] / norms.values()[i] require keys() and
        # values() to return indexable sequences (Python 2 style) -- confirm.
        pair_norm_names = []
        pair_norms = []
        for i in xrange(len(norms)):
            pair_norm_names.append(norms.keys()[i] + '-peak1')
            pair_norms.append(norms.values()[i])
        for i in xrange(len(norms)):
            pair_norm_names.append(norms.keys()[i] + '-peak2')
            pair_norms.append(norms.values()[i])
        io.write_grouped_csv(
            self.prefix + '-pairs-norm.csv',
            [('All', io.named_list_type(pair_norm_names)(pair_norms))],
            comments=['#Normalization'],
        )

        # Read data

        annotations = list(annotation.read_annotations(self.parents))
        if self.utrs:
            utrs = list(annotation.read_annotations(self.utrs))
        else:
            utrs = []
        children = list(annotation.read_annotations(self.children))

        count_table = io.read_grouped_table(self.counts,
                                            [('Count', int),
                                             ('Tail_count', int),
                                             ('Tail', _float_or_none),
                                             ('Proportion', _float_or_none),
                                             ('Annotation', str)])
        counts = count_table['Count']
        tail_counts = count_table['Tail_count']
        proportions = count_table['Proportion']
        tails = count_table['Tail']

        samples = counts.value_type().keys()
        # Each '#sampleTags=' comment line is a comma separated list whose
        # first element is the sample name; the whole list (name included) is
        # stored under that name.
        sample_tags = {}
        for line in count_table.comments:
            if line.startswith('#sampleTags='):
                parts = line[len('#sampleTags='):].split(',')
                assert parts[0] not in sample_tags
                sample_tags[parts[0]] = parts

        # Peak weight: sum over samples of count times the sample's
        # 'Normalizing.multiplier'. Used below to pick the top peaks.
        for item in children:
            item.weight = sum(counts[item.get_id()][name] *
                              float(norms[name]['Normalizing.multiplier'])
                              for name in samples)

        # Collect parent features of the requested type, indexed by id, and
        # give each a utr_pos.
        parents = []
        id_to_parent = {}
        for item in annotations:
            if item.type != self.parent_type: continue
            assert item.get_id(
            ) not in id_to_parent, 'Duplicate id in parent file: ' + item.get_id(
            )
            parents.append(item)
            id_to_parent[item.get_id()] = item
            item.children = []
            #item.cds = [ ]

            # Default utr
            if item.strand >= 0:
                item.utr_pos = item.end
            else:
                item.utr_pos = item.start

            # A 'three_prime_UTR_start' attribute overrides the default; on
            # the forward strand 1 is subtracted from it.
            if 'three_prime_UTR_start' in item.attr:
                if item.strand >= 0:
                    item.utr_pos = int(item.attr['three_prime_UTR_start']) - 1
                else:
                    item.utr_pos = int(item.attr['three_prime_UTR_start'])

        # Explicit UTR annotations override utr_pos of their parent.
        for item in utrs:
            assert item.attr[
                'Parent'] in id_to_parent, 'Unknown gene ' + item.attr['Parent']
            id_to_parent[item.attr['Parent']].utr_pos = (
                item.start if item.strand >= 0 else item.end)

        # Attach each peak to all of its (comma separated) parents, except
        # peaks whose 'Relation' attribute is "Antisense".
        for item in children:
            item.transcription_stop = item.end if item.strand >= 0 else item.start  #End of transcription, 0-based, ie between-positions based

            if 'Parent' in item.attr and item.attr.get(
                    "Relation") != "Antisense":
                for item_parent in item.attr['Parent'].split(','):
                    parent = id_to_parent[item_parent]
                    parent.children.append(item)

        # Choose each parent's "relevant" peaks: optionally only 3'UTR peaks,
        # optionally only the self.top heaviest, then put back in
        # _annotation_sorter order.
        for item in parents:
            item.children.sort(key=_annotation_sorter)

            relevant = list(item.children)
            if self.utr_only:
                #if item.strand <= 0:
                #    relative_utr_start = item.end - int(item.attr['three_prime_UTR_start'])
                #else:
                #    relative_utr_start = int(item.attr['three_prime_UTR_start'])-1 - item.start
                #
                #def relative_start(peak):
                #    return item.end-peak.end if item.strand < 0 else peak.start-item.start
                #relevant = [ peak for peak in relevant if relative_start(peak) >= relative_utr_start ]

                #relevant = [
                #    peak for peak in relevant
                #    if (peak.end >= item.utr_pos if item.strand >= 0 else peak.start <= item.utr_pos)
                #    ]

                relevant = [
                    peak for peak in relevant
                    if peak.attr.get("Relation") == "3'UTR"
                ]

            if self.top:
                relevant.sort(key=lambda peak: peak.weight, reverse=True)
                relevant = relevant[:self.top]
            relevant.sort(key=_annotation_sorter)
            item.relevant_children = relevant

        # JSON output

        # Column-oriented layout: each key maps to a list with one entry per
        # gene / peak / sample / chromosome.
        j_data = {}
        j_genes = j_data['genes'] = {}

        j_genes['__comment__'] = 'start is 0-based'
        j_genes['name'] = []
        j_genes['chromosome'] = []
        j_genes['strand'] = []
        j_genes['start'] = []
        j_genes['utr'] = []
        j_genes['end'] = []
        j_genes['gene'] = []
        j_genes['product'] = []
        j_genes['peaks'] = []
        j_genes['relevant_peaks'] = []
        #j_genes['cds'] = [ ]
        #j_genes['cds_start'] = [ ]
        #j_genes['cds_end'] = [ ]
        for item in parents:
            j_genes['name'].append(item.get_id())
            j_genes['chromosome'].append(item.seqid)
            j_genes['strand'].append(item.strand)
            j_genes['start'].append(item.start)
            j_genes['utr'].append(item.utr_pos)
            j_genes['end'].append(item.end)
            j_genes['gene'].append(
                item.attr.get('Name', item.attr.get('gene', '')))
            j_genes['product'].append(
                item.attr.get('Product', item.attr.get('product', '')))
            j_genes['peaks'].append(
                [item2.get_id() for item2 in item.children])
            j_genes['relevant_peaks'].append(
                [item2.get_id() for item2 in item.relevant_children])
            #j_genes['cds'].append( item.cds )
            #j_genes['cds_start'].append( item.cds_start )
            #j_genes['cds_end'].append( item.cds_end )

        j_peaks = j_data['peaks'] = {}
        j_peaks['__comment__'] = 'start is 0-based'
        j_peaks['name'] = []
        j_peaks['chromosome'] = []
        j_peaks['strand'] = []
        j_peaks['start'] = []
        j_peaks['end'] = []
        j_peaks['parents'] = []
        j_peaks['counts'] = []
        j_peaks['tail_lengths'] = []
        j_peaks['proportion_tailed'] = []
        for item in children:
            j_peaks['name'].append(item.get_id())
            j_peaks['chromosome'].append(item.seqid)
            j_peaks['strand'].append(item.strand)
            j_peaks['start'].append(item.start)
            j_peaks['end'].append(item.end)
            j_peaks['parents'].append(item.attr['Parent'].split(',')
                                      if 'Parent' in item.attr else [])
            j_peaks['counts'].append(counts[item.get_id()].values())
            j_peaks['tail_lengths'].append(
                count_table['Tail'][item.get_id()].values())
            j_peaks['proportion_tailed'].append(
                count_table['Proportion'][item.get_id()].values())

        j_samples = j_data['samples'] = {}
        j_samples['name'] = []
        j_samples['tags'] = []
        j_samples['normalizing_multiplier'] = []
        # NOTE(review): sample_tags[name] raises KeyError for a sample without
        # a '#sampleTags=' line, whereas the paired-peak output below uses
        # sample_tags.get(item, []) -- confirm which behavior is intended.
        for name in samples:
            j_samples['name'].append(name)
            j_samples['tags'].append(sample_tags[name])
            j_samples['normalizing_multiplier'].append(
                float(norms[name]['Normalizing.multiplier']))

        j_chromosomes = j_data['chromosomes'] = {}
        j_chromosomes['name'] = []
        j_chromosomes['length'] = []
        for name, seq in chromosomes.iteritems():
            j_chromosomes['name'].append(name)
            j_chromosomes['length'].append(len(seq))

        with open(self.prefix + '.json', 'wb') as f:
            json.dump(j_data, f)

        # Output paired peak file

        # Columns are all samples as '<sample>-peak1' followed by all samples
        # as '<sample>-peak2'; each gets a '#sampleTags=' comment carrying the
        # new name, 'peak1'/'peak2', and the original sample's tags.
        output_comments = ['#Counts']
        output_samples = []
        for item in samples:
            output_samples.append(item + '-peak1')
            output_comments.append('#sampleTags=' +
                                   ','.join([item + '-peak1', 'peak1'] +
                                            sample_tags.get(item, [])))
        for item in samples:
            output_samples.append(item + '-peak2')
            output_comments.append('#sampleTags=' +
                                   ','.join([item + '-peak2', 'peak2'] +
                                            sample_tags.get(item, [])))

        output_names = []
        output_counts = []
        output_tail_counts = []
        output_proportions = []
        output_tails = []
        output_annotation_fields = [
            'gene', 'product', 'biotype', 'mean_tail_1', 'mean_tail_2',
            'chromosome', 'strand', 'transcription_stops'
        ]  #, 'interpeak_seq', ]
        output_annotations = []

        # One output row per unordered pair (i < j) of relevant peaks of each
        # parent, named '<parent id>-<peak i id>-<peak j id>'.
        # NOTE(review): filter(_text, row) keeps only the elements for which
        # _text(element) is truthy; if the intent was to convert every value
        # with _text, map() would be expected -- confirm against _text.
        for item in parents:
            peaks = item.relevant_children
            for i in xrange(len(peaks) - 1):
                for j in xrange(i + 1, len(peaks)):
                    id_i = peaks[i].get_id()
                    id_j = peaks[j].get_id()
                    id_pair = item.get_id() + '-' + id_i + '-' + id_j
                    output_names.append(id_pair)

                    row = []
                    row.extend(counts[id_i].values())
                    row.extend(counts[id_j].values())
                    output_counts.append(filter(_text, row))

                    row = []
                    row.extend(tail_counts[id_i].values())
                    row.extend(tail_counts[id_j].values())
                    output_tail_counts.append(filter(_text, row))

                    row = []
                    row.extend(proportions[id_i].values())
                    row.extend(proportions[id_j].values())
                    output_proportions.append(filter(_text, row))

                    row = []
                    row.extend(tails[id_i].values())
                    row.extend(tails[id_j].values())
                    output_tails.append(filter(_text, row))

                    output_annotations.append([
                        item.attr.get('Name', item.attr.get('gene', '')),
                        item.attr.get('Product', item.attr.get('product', '')),
                        item.attr.get('Biotype', ''),
                        count_table['Annotation'][id_i]['mean-tail'],
                        count_table['Annotation'][id_j]['mean-tail'],
                        item.seqid,
                        str(item.strand),
                        '%d, %d' % (peaks[i].transcription_stop,
                                    peaks[j].transcription_stop),
                        #get_interpeak_seq([peaks[i],peaks[j]]),
                    ])

        #output_count_table = io.named_matrix_type(output_names,output_samples)(output_counts)
        io.write_grouped_csv(
            self.prefix + '-pairs.csv',
            [
                ('Count', io.named_matrix_type(output_names,
                                               output_samples)(output_counts)),
                ('Tail_count',
                 io.named_matrix_type(output_names,
                                      output_samples)(output_tail_counts)),
                ('Proportion',
                 io.named_matrix_type(output_names,
                                      output_samples)(output_proportions)),
                ('Tail', io.named_matrix_type(output_names,
                                              output_samples)(output_tails)),
                ('Annotation',
                 io.named_matrix_type(
                     output_names,
                     output_annotation_fields)(output_annotations)),
            ],
            comments=output_comments,
        )
Example #6
0
def _count_filter_settings(filter_mode):
    """Map a filter mode to (bam filename, use_only_top, use_only_monogamous, expect_multiple_alignments)."""
    if filter_mode == 'poly':
        return 'alignments.bam', True, False, True
    if filter_mode == 'mono':
        return 'alignments.bam', True, True, True
    assert filter_mode == 'existing', 'Unrecognized filtering mode'
    return 'alignments_filtered.bam', False, False, False


def _count_read_merge_file(merge_filename):
    """Parse a merge file, returning (qualifiers, merge, merge_qualifiers).

    The first line gives the qualifier names. Each remaining line is tab
    separated: <qualifier> <qualifier...> <gene> <transcript> <transcript...>.
    ``merge`` maps each transcript name to its gene, ``merge_qualifiers`` maps
    each transcript name to that line's qualifier values.
    """
    merge = {}
    merge_qualifiers = {}
    # 'with' guarantees the file is closed even if the duplicate-name
    # assertion fires part way through.
    with open(merge_filename, 'rU') as merge_file:
        qualifiers = merge_file.readline().rstrip('\n').split('\t')
        n_qualifiers = len(qualifiers)
        for line in merge_file:
            parts = line.rstrip('\n').split('\t')
            # A blank line yields no names past the gene column, so it
            # contributes nothing.
            for name in parts[n_qualifiers + 1:]:
                assert name not in merge, 'Duplicate feature name in merge file'
                merge[name] = parts[n_qualifiers]
                merge_qualifiers[name] = parts[:n_qualifiers]
    return qualifiers, merge, merge_qualifiers


def _count_read_chromosome_lengths(bam_filenames):
    """Collect {sequence name: length} from the @SQ header lines of all BAM files."""
    chromosome_length = {}
    for filename in bam_filenames:
        for line in sam.bam_headers(filename).split('\n'):
            if not line: continue
            parts = line.split('\t')
            if parts[0] != '@SQ': continue

            name = None
            length = None
            for part in parts[1:]:
                if part.startswith('SN:'): name = part[3:]
                if part.startswith('LN:'): length = int(part[3:])
            assert name is not None and length is not None

            # Every BAM file must agree on the length of a shared sequence.
            if name in chromosome_length:
                assert chromosome_length[name] == length
            else:
                chromosome_length[name] = length
    return chromosome_length


def _count_strand_getters(use_strand):
    """Return the functions that extract (name, count, common, ambiguous) from a feature.

    One getter means one output row per feature; 'both' yields two rows per
    feature, the second (reverse strand) with 'r' appended to the name.
    """
    def pooled(f):
        return (f.name,
                add_lists(f.count[1], f.count[-1]),
                add_defdicts(f.common[(1,1)],
                             f.common[(1,-1)],
                             f.common[(-1,1)],
                             f.common[(-1,-1)]),
                add_defdicts(f.ambiguous[(1,1)],
                             f.ambiguous[(1,-1)],
                             f.ambiguous[(-1,1)],
                             f.ambiguous[(-1,-1)]))

    def forward(f):
        return (f.name, f.count[1], f.common[(1,1)], f.ambiguous[(1,1)])

    def reverse(f):
        return (f.name, f.count[-1], f.common[(-1,-1)], f.ambiguous[(-1,-1)])

    def reverse_suffixed(f):
        return (f.name + 'r', f.count[-1], f.common[(-1,-1)], f.ambiguous[(-1,-1)])

    return {
        'pool': [pooled],
        'forward': [forward],
        'reverse': [reverse],
        'both': [forward, reverse_suffixed],
    }[use_strand]


def count_run(
    min_score, min_size, max_size, filter_mode, equalize, types, locii,
    qualifiers, use_strand, merge_filename, limit, output_prefix, filenames, log):
    """Count fragment alignments per feature and write ``output_prefix + '.csv'``.

    Parameters:
      min_score -- minimum summed alignment score (AS) a fragment's best
          alignment must reach, or None for no minimum.
      min_size, max_size -- bounds on the reference span of a fragment
          alignment, or None for no bound.
      filter_mode -- 'poly' (top scoring alignments), 'mono' (top scoring,
          and only fragments with a single remaining alignment are counted)
          or 'existing' (use alignments_filtered.bam as is).
      equalize -- if true, subsample counts so every sample has the total of
          the smallest sample.
      types -- comma separated feature types to count (case-insensitive).
      locii -- comma separated locus_tag values to restrict to, or falsy.
      qualifiers -- comma separated attribute names to output; replaced by
          the merge file's header line when merge_filename is given.
      use_strand -- 'pool', 'forward', 'reverse' or 'both'.
      merge_filename -- merge file path or None (only usable when sequences
          themselves are the features).
      limit -- stop reading a sample once this many fragments were seen, or
          None.
      output_prefix -- prefix of the grouped CSV file written.
      filenames -- annotation files and BAM files / working directories.
      log -- object providing log() and datum().

    Raises AssertionError on invalid options or inconsistent inputs.
    """
    (use_bam_filename, use_only_top, use_only_monogamous,
     expect_multiple_alignments) = _count_filter_settings(filter_mode)

    types = types.lower().split(',')
    qualifiers = qualifiers.split(',')
    locii = locii.lower().split(',') if locii else None

    assert use_strand is not None, 'You must now explicitly specify --strand'
    assert use_strand in ('pool','forward','reverse','both'), "Can't understand --strand specification."

    # Sort the inputs into annotation files and alignment sources.
    annotation_filenames = []
    bam_filenames = []
    for arg in filenames:
        if annotation.is_annotation_file(arg):
            annotation_filenames.append(arg)
        else:
            bam_filenames.append(arg)

    n_samples = len(bam_filenames)
    titles = bam_filenames[:]
    tags = []
    for i in range(n_samples):
        if os.path.isdir(bam_filenames[i]):
            # A working directory supplies its own name, tags and, if no
            # annotation file was given explicitly, its reference annotation.
            working = working_directory.Working(bam_filenames[i])
            titles[i] = working.name
            tags.append(working.get_tags())
            if not annotation_filenames:
                reference_filename = working.get_reference().annotations_filename()
                if reference_filename is not None:
                    annotation_filenames.append(reference_filename)

            bam_filenames[i] = os.path.join(bam_filenames[i], use_bam_filename)

    assert bam_filenames, 'No reference alignments given'

    merge = {}
    merge_qualifiers = {}
    if merge_filename is not None:
        qualifiers, merge, merge_qualifiers = _count_read_merge_file(merge_filename)

    chromosome_length = _count_read_chromosome_lengths(bam_filenames)

    genes = {}  # reference name -> span index of features
    for name in chromosome_length:
        genes[name] = span_index.Span_index()

    feature_names = {}  # feature name -> number of occurrences
    features = []

    if annotation_filenames:
        assert not merge, 'Merging not supported with annotation files'

        for filename in annotation_filenames:
            for feature in annotation.read_annotations(filename):
                if feature.type.lower() not in types: continue

                if (locii is not None and
                    ('locus_tag' not in feature.attr or
                     feature.attr['locus_tag'].lower() not in locii)):
                    continue

                f = Feature(n_samples)
                f.name = feature.get_id()
                if feature.type.lower() != 'cds' and len(types) > 1:
                    f.name = feature.type + ':' + f.name

                # Disambiguate repeated names with a /<n> suffix.
                feature_names[f.name] = feature_names.get(f.name, 0) + 1
                if feature_names[f.name] > 1:
                    f.name += '/%d' % feature_names[f.name]

                f.qualifiers = [feature.attr.get(item, '') for item in qualifiers]
                f.length = feature.end - feature.start

                assert feature.seqid in genes, 'Annotation for sequence that is not in BAM files'
                genes[feature.seqid].insert(Span_entry(feature.start, feature.end, feature.strand or 1, f))
                features.append(f)

    else:
        # Sequences as features
        log.log('No annotation files given or found, using sequences as features\n')

        name_feature = {}  # (merged) name -> feature

        for name in chromosome_length:
            merged_name = merge.get(name, name)
            # Use this sequence's own length (previously a stale value left
            # over from header parsing was used for every sequence).
            sequence_length = chromosome_length[name]

            if merged_name not in name_feature:
                f = Feature(n_samples)
                f.name = merged_name
                f.length = sequence_length
                f.qualifiers = merge_qualifiers.get(name, ('',)*len(qualifiers))
                name_feature[merged_name] = f
                features.append(f)
            else:
                # A merged feature reports the longest of its sequences.
                f = name_feature[merged_name]
                f.length = max(f.length, sequence_length)

            genes[name].insert(Span_entry(0, sequence_length, 1, f))

    log.log('%d features\n\n' % len(features))

    for name in genes:
        genes[name].prepare()

    n_fragments = [0] * n_samples
    n_fragments_aligned = [0] * n_samples
    n_low_score = [0] * n_samples
    n_something = [0] * n_samples
    n_multiple = [0] * n_samples
    n_span = [0] * n_samples

    for i in range(n_samples):
        for read_name, fragment_alignments, unmapped in sam.bam_iter_fragments(
                bam_filenames[i], 'Counting sample %d of %d' % (i+1, n_samples)):
            n_fragments[i] += 1

            if not fragment_alignments:
                continue

            n_fragments_aligned[i] += 1

            feature_hits = []  # [ [ (feature, strand) ] ], one list per alignment

            fragment_scores = [sum(al.get_AS() for al in item)
                               for item in fragment_alignments]
            best_score = max(fragment_scores)

            if min_score is not None and best_score < min_score:
                n_low_score[i] += 1
                continue

            # best_score is known to be >= min_score here, so keeping only
            # top scoring alignments means cutting off at best_score.
            cutoff = best_score if use_only_top else min_score
            if cutoff is not None:
                fragment_alignments = [
                    item
                    for item, score in zip(fragment_alignments, fragment_scores)
                    if score >= cutoff
                ]

            for alignments in fragment_alignments:
                start = min(item.pos-1 for item in alignments)
                end = max(item.pos+item.length-1 for item in alignments)
                span_length = end - start
                if min_size is not None and span_length < min_size: continue
                if max_size is not None and span_length > max_size: continue

                rname = alignments[0].rname
                strand = -1 if alignments[0].flag & sam.FLAG_REVERSE else 1
                assert rname in genes, 'Alignment refers to sequence not present in GENBANK file'

                this_feature_hits = []
                for item in genes[rname].get(start, end):
                    rel_strand = strand * item.strand
                    key = (item.feature, rel_strand)
                    if key in this_feature_hits: continue
                    this_feature_hits.append(key)
                    if not use_only_monogamous or len(fragment_alignments) == 1:
                        item.feature.count[rel_strand][i] += 1

                if this_feature_hits:
                    feature_hits.append(this_feature_hits)

                # Features hit by the same alignment are recorded as
                # "on the same fragment" for each other.
                if len(this_feature_hits) > 1:
                    for a in this_feature_hits:
                        for b in this_feature_hits:
                            if a[0] is b[0]: continue
                            a[0].common[(a[1], b[1])][b[0]] += 1

            if len(feature_hits) > 0:
                n_something[i] += 1

            # Features hit by different alignments of the same fragment are
            # recorded as ambiguous with each other.
            if len(feature_hits) > 1:
                n_multiple[i] += 1
                for j in range(len(feature_hits)):
                    for k in range(len(feature_hits)):
                        if j == k: continue
                        for a in feature_hits[j]:
                            for b in feature_hits[k]:
                                if a[0] is b[0]: continue
                                a[0].ambiguous[(a[1], b[1])][b[0]] += 1

            if any(len(item) > 1 for item in feature_hits): n_span[i] += 1

            if limit is not None and n_fragments[i] >= limit: break

        grace.status('')

        log.datum(titles[i], 'fragments', n_fragments[i])
        log.datum(titles[i], 'fragments aligned to the reference', n_fragments_aligned[i])
        if n_low_score[i]:
            log.datum(titles[i], 'had too low an alignment score, discarded', n_low_score[i])
        log.datum(titles[i], 'aligned to an annotated gene', n_something[i])
        if expect_multiple_alignments or n_multiple[i]:
            log.datum(titles[i], 'aligned to multiple genes', n_multiple[i])
        log.datum(titles[i],'had an alignment that spanned multiple genes', n_span[i])
        log.log('\n')

    # Strand specificity: mean over well-covered features of the signed
    # percentage difference between forward and reverse hits.
    strandedness = []
    for feature in features:
        n_forward = sum(feature.count[1])
        n_reverse = sum(feature.count[-1])
        if n_forward+n_reverse < 5: continue
        strandedness.append((n_forward-n_reverse)*100.0 / (n_forward+n_reverse))
    strandedness = sum(strandedness) / max(1, len(strandedness))
    log.log('Strand specificity score: %.0f\n'
            '  (~ -100 reverse strand, ~ 0 non-specific, ~ 100 forward strand\n'
            '   Average over all features with at least 5 hits.)\n'
            % strandedness)

    getters = _count_strand_getters(use_strand)

    if equalize:
        total_hits = [0] * n_samples
        for feature in features:
            for getter in getters:
                total_hits = add_lists(total_hits, getter(feature)[1])

        min_hits = min(total_hits)
        # A sample with no hits has nothing to subsample; avoid dividing by
        # zero for it.
        p = [float(min_hits)/item if item else 0.0 for item in total_hits]

    comments = ['#Counts'] + ['#sampleTags=' + ','.join(item) for item in tags]

    names = []

    count_type = io.named_list_type(titles)
    counts = []

    annotation_type = io.named_list_type(['Length'] + qualifiers)
    annotations = []

    # 'On same fragment' is always present; 'Ambiguous alignment' only when
    # multiple alignments per fragment are expected. (The previous
    # "a + b if c else []" form dropped both columns when c was false.)
    alignment_columns = ['On same fragment']
    if expect_multiple_alignments:
        alignment_columns.append('Ambiguous alignment')
    alignment_type = io.named_list_type(alignment_columns)
    alignments = []

    for feature in features:
        for getter in getters:
            feature_name, count, common, ambiguous = getter(feature)

            if equalize:
                count = [subsample(count[i], p[i]) for i in range(n_samples)]

            alignment_row = [count_encode(common)]
            if expect_multiple_alignments:
                alignment_row.append(count_encode(ambiguous))

            names.append(feature_name)
            counts.append(count_type(count))
            annotations.append(annotation_type([str(feature.length)] + list(feature.qualifiers)))
            alignments.append(alignment_type(alignment_row))

    groups = [
        ('Count', io.named_list_type(names, count_type)(counts)),
        ('Annotation', io.named_list_type(names, annotation_type)(annotations)),
        ('Alignment', io.named_list_type(names, alignment_type)(alignments)),
    ]

    io.write_grouped_csv(output_prefix + '.csv', groups, rowname_name='Feature', comments=comments)