def _write_table(self, samples, items):
    names = [ '%s:%d' % (item.record.CHROM, item.record.POS) for item in items ]
    sample_list = io.named_list_type(samples)
    groups = [ ]

    locations_list = io.named_list_type(['CHROM', 'POS'])
    locations = io.named_list_type(names, locations_list)([
        locations_list([item.record.CHROM, item.record.POS])
        for item in items
        ])
    groups.append(('Location', locations))

    genotypes = io.named_list_type(names, sample_list)([
        sample_list([
            describe_genotype(item2, item.variants)
            for item2 in item.genotypes
            ])
        for item in items
        ])
    groups.append(('Genotype', genotypes))

    if self.qualities:
        qualities = io.named_list_type(names, sample_list)(
            [ sample_list(item.qualities) for item in items ])
        groups.append(('Quality', qualities))

    if self.counts:
        counts = io.named_list_type(names, sample_list)([
            sample_list([
                describe_counts(item2, item.variants)
                for item2 in item.counts
                ])
            for item in items
            ])
        groups.append(('Count', counts))

    annotation_list = io.named_list_type(['snpeff'])
    annotations = io.named_list_type(names, annotation_list)([
        annotation_list([
            ' /// '.join(
                item2[0]
                for item2 in item.snpeff
                if selection.matches(self.snpeff_show, item2[1]))
            ])
        for item in items
        ])
    groups.append(('Annotation', annotations))

    io.write_grouped_csv(self.prefix + '.csv', groups)
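# --- Illustrative sketch (editor's addition, not part of the original code). ---
# _write_table above assembles a list of (group_name, table) pairs, where each
# table maps a "CHROM:POS" row name to a row of per-sample values, then hands
# the list to io.write_grouped_csv. The stand-in below is hypothetical: it
# assumes io.named_list_type behaves like an ordered, name-addressable list
# type, and exists only to make the shape of that structure concrete.

def _demo_grouped_structure():
    samples = ['sampleA', 'sampleB']
    names = ['chrI:1201', 'chrI:5377']
    # One row per variant, one column per sample, grouped under 'Genotype':
    genotype_rows = {
        'chrI:1201': dict(zip(samples, ['A/T', 'T/T'])),
        'chrI:5377': dict(zip(samples, ['G/G', 'G/C'])),
        }
    groups = [('Genotype', genotype_rows)]
    # Flatten the grouped structure the way a grouped CSV would be laid out.
    return [(group, name, table[name])
            for group, table in groups
            for name in names]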
def run(self):
    #assert not self.utr_only or self.utrs, '--utrs-only yes but no --utrs given'

    # Reference genome
    #chromosome_lengths = reference_directory.Reference(self.reference, must_exist=True).get_lengths()
    chromosomes = collections.OrderedDict(io.read_sequences(self.reference))

    def get_interpeak_seq(peaks):
        start = min(item.transcription_stop for item in peaks)
        end = max(item.transcription_stop for item in peaks)
        if end-start > self.max_seq: return ''
        if peaks[0].strand >= 0:
            return chromosomes[peaks[0].seqid][start:end]
        else:
            return bio.reverse_complement(chromosomes[peaks[0].seqid][start:end])

    def get_prepeak_seq(gene, peaks):
        if gene.strand >= 0:
            start = gene.utr_pos
            end = min(item.transcription_stop for item in peaks)
            if end-start > self.max_seq: return ''
            return chromosomes[gene.seqid][start:end]
        else:
            start = max(item.transcription_stop for item in peaks)
            end = gene.utr_pos
            if end-start > self.max_seq: return ''
            return bio.reverse_complement(chromosomes[gene.seqid][start:end])

    # Normalization files
    if self.norm_file:
        norm_file = self.norm_file
    else:
        nesoni.Norm_from_counts(self.prefix+'-norm', self.counts).run()
        norm_file = self.prefix+'-norm.csv'

    norms = io.read_grouped_table(norm_file, [('All',str)])['All']

    pair_norm_names = [ ]
    pair_norms = [ ]
    for i in xrange(len(norms)):
        pair_norm_names.append(norms.keys()[i]+'-peak1')
        pair_norms.append(norms.values()[i])
    for i in xrange(len(norms)):
        pair_norm_names.append(norms.keys()[i]+'-peak2')
        pair_norms.append(norms.values()[i])
    io.write_grouped_csv(
        self.prefix+'-pairs-norm.csv',
        [('All', io.named_list_type(pair_norm_names)(pair_norms))],
        comments=['#Normalization'],
        )

    # Read data
    annotations = list(annotation.read_annotations(self.parents))
    if self.utrs:
        utrs = list(annotation.read_annotations(self.utrs))
    else:
        utrs = [ ]
    children = list(annotation.read_annotations(self.children))

    count_table = io.read_grouped_table(self.counts, [
        ('Count', int),
        ('Tail_count', int),
        ('Tail', _float_or_none),
        ('Proportion', _float_or_none),
        ('Annotation', str),
        ])
    counts = count_table['Count']
    tail_counts = count_table['Tail_count']
    proportions = count_table['Proportion']
    tails = count_table['Tail']

    samples = counts.value_type().keys()
    sample_tags = { }
    for line in count_table.comments:
        if line.startswith('#sampleTags='):
            parts = line[len('#sampleTags='):].split(',')
            assert parts[0] not in sample_tags
            sample_tags[parts[0]] = parts

    for item in children:
        item.weight = sum(
            counts[item.get_id()][name] * float(norms[name]['Normalizing.multiplier'])
            for name in samples
            )

    parents = [ ]
    id_to_parent = { }
    for item in annotations:
        if item.type != self.parent_type: continue
        assert item.get_id() not in id_to_parent, 'Duplicate id in parent file: '+item.get_id()
        parents.append(item)
        id_to_parent[item.get_id()] = item
        item.children = [ ]
        #item.cds = [ ]

        # Default utr
        if item.strand >= 0:
            item.utr_pos = item.end
        else:
            item.utr_pos = item.start

        if 'three_prime_UTR_start' in item.attr:
            if item.strand >= 0:
                item.utr_pos = int(item.attr['three_prime_UTR_start'])-1
            else:
                item.utr_pos = int(item.attr['three_prime_UTR_start'])

    for item in utrs:
        assert item.attr['Parent'] in id_to_parent, 'Unknown gene '+item.attr['Parent']
        id_to_parent[item.attr['Parent']].utr_pos = (item.start if item.strand >= 0 else item.end)

    for item in children:
        item.transcription_stop = item.end if item.strand >= 0 else item.start  # End of transcription, 0-based, ie between-positions based
        if 'Parent' in item.attr:
            for item_parent in item.attr['Parent'].split(','):
                parent = id_to_parent[item_parent]
                parent.children.append(item)

    for item in parents:
        item.children.sort(key=_annotation_sorter)

        relevant = list(item.children)
        if self.utr_only:
            #if item.strand <= 0:
            #    relative_utr_start = item.end - int(item.attr['three_prime_UTR_start'])
            #else:
            #    relative_utr_start = int(item.attr['three_prime_UTR_start'])-1 - item.start
            #
            #def relative_start(peak):
            #    return item.end-peak.end if item.strand < 0 else peak.start-item.start
            #relevant = [ peak for peak in relevant if relative_start(peak) >= relative_utr_start ]
            relevant = [
                peak for peak in relevant
                if (peak.end >= item.utr_pos if item.strand >= 0 else peak.start <= item.utr_pos)
                ]

        if self.top:
            relevant.sort(key=lambda peak: peak.weight, reverse=True)
            relevant = relevant[:self.top]
        relevant.sort(key=_annotation_sorter)
        item.relevant_children = relevant

    # JSON output
    j_data = { }
    j_genes = j_data['genes'] = { }

    j_genes['__comment__'] = 'start is 0-based'
    j_genes['name'] = [ ]
    j_genes['chromosome'] = [ ]
    j_genes['strand'] = [ ]
    j_genes['start'] = [ ]
    j_genes['utr'] = [ ]
    j_genes['end'] = [ ]
    j_genes['gene'] = [ ]
    j_genes['product'] = [ ]
    j_genes['peaks'] = [ ]
    j_genes['relevant_peaks'] = [ ]
    #j_genes['cds'] = [ ]
    #j_genes['cds_start'] = [ ]
    #j_genes['cds_end'] = [ ]
    for item in parents:
        j_genes['name'].append( item.get_id() )
        j_genes['chromosome'].append( item.seqid )
        j_genes['strand'].append( item.strand )
        j_genes['start'].append( item.start )
        j_genes['utr'].append( item.utr_pos )
        j_genes['end'].append( item.end )
        j_genes['gene'].append( item.attr.get('Name',item.attr.get('gene','')) )
        j_genes['product'].append( item.attr.get('Product',item.attr.get('product','')) )
        j_genes['peaks'].append( [ item2.get_id() for item2 in item.children ] )
        j_genes['relevant_peaks'].append( [ item2.get_id() for item2 in item.relevant_children ] )
        #j_genes['cds'].append( item.cds )
        #j_genes['cds_start'].append( item.cds_start )
        #j_genes['cds_end'].append( item.cds_end )

    j_peaks = j_data['peaks'] = { }
    j_peaks['__comment__'] = 'start is 0-based'
    j_peaks['name'] = [ ]
    j_peaks['chromosome'] = [ ]
    j_peaks['strand'] = [ ]
    j_peaks['start'] = [ ]
    j_peaks['end'] = [ ]
    j_peaks['parents'] = [ ]
    j_peaks['counts'] = [ ]
    j_peaks['tail_lengths'] = [ ]
    j_peaks['proportion_tailed'] = [ ]
    for item in children:
        j_peaks['name'].append( item.get_id() )
        j_peaks['chromosome'].append( item.seqid )
        j_peaks['strand'].append( item.strand )
        j_peaks['start'].append( item.start )
        j_peaks['end'].append( item.end )
        j_peaks['parents'].append( item.attr['Parent'].split(',') if 'Parent' in item.attr else [ ] )
        j_peaks['counts'].append( counts[item.get_id()].values() )
        j_peaks['tail_lengths'].append( count_table['Tail'][item.get_id()].values() )
        j_peaks['proportion_tailed'].append( count_table['Proportion'][item.get_id()].values() )

    j_samples = j_data['samples'] = { }
    j_samples['name'] = [ ]
    j_samples['tags'] = [ ]
    j_samples['normalizing_multiplier'] = [ ]
    for name in samples:
        j_samples['name'].append(name)
        j_samples['tags'].append(sample_tags[name])
        j_samples['normalizing_multiplier'].append(float(norms[name]['Normalizing.multiplier']))

    j_chromosomes = j_data['chromosomes'] = { }
    j_chromosomes['name'] = [ ]
    j_chromosomes['length'] = [ ]
    for name, seq in chromosomes.iteritems():
        j_chromosomes['name'].append(name)
        j_chromosomes['length'].append(len(seq))

    with open(self.prefix + '.json','wb') as f:
        json.dump(j_data, f)

    # Output paired peak file
    output_comments = [ '#Counts' ]
    output_samples = [ ]
    for item in samples:
        output_samples.append(item+'-peak1')
        output_comments.append('#sampleTags=' + ','.join([item+'-peak1','peak1']+sample_tags.get(item,[])))
    for item in samples:
        output_samples.append(item+'-peak2')
        output_comments.append('#sampleTags=' + ','.join([item+'-peak2','peak2']+sample_tags.get(item,[])))

    output_names = [ ]
    output_counts = [ ]
    output_tail_counts = [ ]
    output_proportions = [ ]
    output_tails = [ ]
    output_annotation_fields = [
        'gene', 'product', 'mean_tail_1', 'mean_tail_2',
        'chromosome', 'strand', 'transcription_stops',
        ] #, 'interpeak_seq', ]
    output_annotations = [ ]

    for item in parents:
        peaks = item.relevant_children
        for i in xrange(len(peaks)-1):
            for j in xrange(i+1, len(peaks)):
                id_i = peaks[i].get_id()
                id_j = peaks[j].get_id()
                id_pair = item.get_id() + '-'+id_i+'-'+id_j
                output_names.append(id_pair)

                row = [ ]
                row.extend(counts[id_i].values())
                row.extend(counts[id_j].values())
                output_counts.append(filter(_text, row))

                row = [ ]
                row.extend(tail_counts[id_i].values())
                row.extend(tail_counts[id_j].values())
                output_tail_counts.append(filter(_text, row))

                row = [ ]
                row.extend(proportions[id_i].values())
                row.extend(proportions[id_j].values())
                output_proportions.append(filter(_text, row))

                row = [ ]
                row.extend(tails[id_i].values())
                row.extend(tails[id_j].values())
                output_tails.append(filter(_text, row))

                output_annotations.append([
                    item.attr.get('Name',item.attr.get('gene','')),
                    item.attr.get('Product',item.attr.get('product','')),
                    count_table['Annotation'][id_i]['mean-tail'],
                    count_table['Annotation'][id_j]['mean-tail'],
                    item.seqid,
                    str(item.strand),
                    '%d, %d' % (peaks[i].transcription_stop, peaks[j].transcription_stop),
                    #get_interpeak_seq([peaks[i],peaks[j]]),
                    ])

    #output_count_table = io.named_matrix_type(output_names,output_samples)(output_counts)
    io.write_grouped_csv(
        self.prefix + '-pairs.csv',
        [
            ('Count', io.named_matrix_type(output_names,output_samples)(output_counts)),
            ('Tail_count', io.named_matrix_type(output_names,output_samples)(output_tail_counts)),
            ('Proportion', io.named_matrix_type(output_names,output_samples)(output_proportions)),
            ('Tail', io.named_matrix_type(output_names,output_samples)(output_tails)),
            ('Annotation', io.named_matrix_type(output_names,output_annotation_fields)(output_annotations)),
            ],
        comments=output_comments,
        )

    ## Chi Sq tests
    #
    ##for id in relation:
    ##    peaks = relation[id]
    ##    if len(peaks) < 2: continue
    #
    #mats = [ ]
    #genes = [ ]
    #products = [ ]
    #mean_tails = [ ]
    #prop_tails = [ ]
    #
    #peak_names = [ ]
    #chromosome_names = [ ]
    #strands = [ ]
    #transcription_stops = [ ]
    #interpeak_seqs = [ ]
    #prepeak_seqs = [ ]
    #
    #for parent in parents:
    #    id = parent.get_id()
    #    peaks = parent.relevant_children
    #    if len(peaks) < 2: continue
    #
    #    matrix = [ ]
    #    for item in peaks:
    #        matrix.append(counts[item.get_id()].values())
    #
    #    mats.append(
    #        runr.R_literal(id) + ' = ' +
    #        runr.R_literal(matrix)
    #        )
    #
    #    genes.append(parent.attr.get('Name',parent.attr.get('gene','')))
    #    products.append(parent.attr.get('Product',parent.attr.get('product','')))
    #
    #    def format_mean(s):
    #        if s == 'NA': return 'NA'
    #        return '%.1f' % float(s)
    #    mean_tails.append(', '.join( format_mean(count_table['Annotation'][item.get_id()]['mean-tail']) for item in peaks ))
    #
    #    def format_prop(s):
    #        if s == 'NA': return 'NA'
    #        return '%.2f' % float(s)
    #    prop_tails.append(', '.join( format_prop(count_table['Annotation'][item.get_id()]['proportion-with-tail']) for item in peaks ))
    #
    #    peak_names.append(', '.join(item.get_id() for item in peaks))
    #    chromosome_names.append(parent.seqid)
    #    strands.append(parent.strand)
    #    transcription_stops.append(', '.join(str(item.transcription_stop) for item in peaks))
    #    interpeak_seqs.append(get_interpeak_seq(peaks))
    #    prepeak_seqs.append(get_prepeak_seq(parent,peaks))
    #
    #    #if len(mats) >= 10: break
    #
    #text = 'cat("Loading data into R+\n")\n'
    #text += 'data <- list(\n' + ',\n'.join(mats) + ')\n'
    #text += CHISQ
    #
    #runr.run_script(text,
    #    OUTPUT_FILENAME=self.prefix+'.csv',
    #    GENES = genes,
    #    PRODUCTS = products,
    #    MEAN_TAILS = mean_tails,
    #    PROP_TAILS = prop_tails,
    #    PEAK_NAMES = peak_names,
    #    CHROMOSOME_NAMES = chromosome_names,
    #    STRANDS = strands,
    #    TRANSCRIPTION_STOPS = transcription_stops,
    #    INTERPEAK_SEQS = interpeak_seqs,
    #    PREPEAK_SEQS = prepeak_seqs,
    #    )
    #
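# --- Illustrative sketch (editor's addition, not part of the original code). ---
# transcription_stop above is a 0-based, between-positions coordinate, so a
# plain Python slice extracts the inter-peak sequence directly; on the reverse
# strand the slice is reverse-complemented, as in get_interpeak_seq. The local
# complement table stands in for bio.reverse_complement.

def _demo_interpeak_seq(chromosome, stops, strand, max_seq=10000):
    start, end = min(stops), max(stops)
    if end - start > max_seq:
        return ''
    seq = chromosome[start:end]
    if strand >= 0:
        return seq
    # Reverse strand: report the reverse complement, as the real code does.
    complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    return ''.join(complement[base] for base in reversed(seq))

assert _demo_interpeak_seq('AAAACGTAAA', [4, 7], strand=1) == 'CGT'
assert _demo_interpeak_seq('AAAACGTAAA', [4, 7], strand=-1) == 'ACG'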
def count_run(min_score, min_size, max_size, filter_mode, equalize, types, locii,
              qualifiers, use_strand, merge_filename, limit, output_prefix,
              filenames, log):
    if filter_mode == 'poly':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
        use_only_monogamous = False
        expect_multiple_alignments = True
    elif filter_mode == 'mono':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
        use_only_monogamous = True
        expect_multiple_alignments = True
    else:
        assert filter_mode == 'existing', 'Unrecognized filtering mode'
        use_bam_filename = 'alignments_filtered.bam'
        use_only_top = False
        use_only_monogamous = False
        expect_multiple_alignments = False

    types = types.lower().split(',')

    qualifiers = qualifiers.split(',')

    if locii:
        locii = locii.lower().split(',')
    else:
        locii = None

    assert use_strand is not None, 'You must now explicitly specify --strand'
    assert use_strand in ('pool', 'forward', 'reverse', 'both'), "Can't understand --strand specification."

    from Bio import Seq, SeqIO

    annotation_filenames = [ ]
    bam_filenames = [ ]
    for arg in filenames:
        if annotation.is_annotation_file(arg):
            annotation_filenames.append(arg)
        else:
            bam_filenames.append(arg)

    n_samples = len(bam_filenames)
    titles = bam_filenames[:]
    tags = [ ]
    for i in xrange(len(bam_filenames)):
        if os.path.isdir(bam_filenames[i]):
            working = working_directory.Working(bam_filenames[i])
            titles[i] = working.name
            tags.append(working.get_tags())
            if not annotation_filenames:
                reference_filename = working.get_reference().annotations_filename()
                if reference_filename is not None:
                    annotation_filenames.append(reference_filename)

            bam_filenames[i] = os.path.join(bam_filenames[i], use_bam_filename)

    assert bam_filenames, 'No reference alignments given'

    merge = { }
    merge_qualifiers = { }
    if merge_filename is not None:
        # First line gives qualifiers,
        # remaining lines give <qualifier> <qualifier...> <gene> <transcript> <transcript...>
        f = open(merge_filename, 'rU')
        qualifiers = f.readline().rstrip('\n').split('\t')
        for line in f:
            parts = line.rstrip('\n').split('\t')
            if not parts: continue
            for name in parts[len(qualifiers)+1:]:
                assert name not in merge, 'Duplicate feature name in merge file'
                merge[name] = parts[len(qualifiers)]
                merge_qualifiers[name] = parts[:len(qualifiers)]
        f.close()

    genes = { }           # reference name -> gene index
    feature_names = { }   # feature_name -> number of occurrences
    features = [ ]
    n_features = 0

    chromosome_length = { }
    for filename in bam_filenames:
        headers = sam.bam_headers(filename)
        for line in headers.split('\n'):
            if not line: continue
            parts = line.split('\t')
            if parts[0] != '@SQ': continue

            name = None
            length = None
            for part in parts[1:]:
                if part.startswith('SN:'): name = part[3:]
                if part.startswith('LN:'): length = int(part[3:])
            assert name is not None and length is not None

            if name in chromosome_length:
                assert chromosome_length[name] == length
            else:
                chromosome_length[name] = length

    for name in chromosome_length:
        genes[name] = span_index.Span_index()

    if annotation_filenames:
        assert not merge, 'Merging not supported with annotation files'

        for filename in annotation_filenames:
            for feature in annotation.read_annotations(filename):
                if feature.type.lower() not in types: continue

                if (locii is not None and
                        ('locus_tag' not in feature.attr or
                         feature.attr['locus_tag'].lower() not in locii)):
                    continue

                f = Feature(n_samples)
                f.name = feature.get_id()
                if feature.type.lower() != 'cds' and len(types) > 1:
                    f.name = feature.type + ':' + f.name
                # Disambiguate repeated feature names with a /2, /3, ... suffix
                feature_names[f.name] = feature_names.get(f.name, 0) + 1
                if feature_names[f.name] > 1:
                    f.name += '/%d' % feature_names[f.name]

                f.qualifiers = [ feature.attr.get(item, '') for item in qualifiers ]
                f.length = feature.end - feature.start
                assert feature.seqid in genes, 'Annotation for sequence that is not in BAM files'
                genes[feature.seqid].insert(Span_entry(feature.start, feature.end, feature.strand or 1, f))
                features.append(f)

    else:
        # Sequences as features
        log.log('No annotation files given or found, using sequences as features\n')

        name_feature = { }  # (merged)name -> feature
        for name in chromosome_length:
            merged_name = merge.get(name, name)
            if merged_name not in name_feature:
                f = Feature(n_samples)
                f.name = merged_name
                # Bug fix: the original used "length", a stale variable left
                # over from BAM header parsing, rather than this sequence's
                # own length.
                f.length = chromosome_length[name]
                f.qualifiers = merge_qualifiers.get(name, ('',) * len(qualifiers))
                n_features += 1
                name_feature[merged_name] = f
                features.append(f)
            else:
                f = name_feature[merged_name]
                f.length = max(f.length, chromosome_length[name])
            genes[name].insert(Span_entry(0, chromosome_length[name], 1, f))

    log.log('%d features\n\n' % len(features))

    for name in genes:
        genes[name].prepare()

    n_fragments = [ 0 ] * n_samples
    n_fragments_aligned = [ 0 ] * n_samples
    n_low_score = [ 0 ] * n_samples
    n_something = [ 0 ] * n_samples
    n_multiple = [ 0 ] * n_samples
    n_span = [ 0 ] * n_samples

    for i in xrange(n_samples):
        for read_name, fragment_alignments, unmapped in sam.bam_iter_fragments(
                bam_filenames[i], 'Counting sample %d of %d' % (i+1, n_samples)):
            n_fragments[i] += 1

            if not fragment_alignments:
                continue

            n_fragments_aligned[i] += 1

            feature_hits = [ ]  # [ [ (feature, strand) ] ]

            # Use only top scoring alignments
            fragment_scores = [ sum(al.get_AS() for al in item) for item in fragment_alignments ]

            best_score = max(fragment_scores)

            if min_score is not None and best_score < min_score:
                n_low_score[i] += 1
                continue

            if use_only_top:
                cutoff = max(best_score, min_score)
            else:
                cutoff = min_score
            fragment_alignments = [
                item
                for item, score in zip(fragment_alignments, fragment_scores)
                if score >= cutoff
                ]

            for alignments in fragment_alignments:
                strand = -1 if alignments[0].flag & sam.FLAG_REVERSE else 1

                start = min(item.pos-1 for item in alignments)
                end = max(item.pos+item.length-1 for item in alignments)
                length = end-start
                if min_size is not None and length < min_size: continue
                if max_size is not None and length > max_size: continue

                rname = alignments[0].rname
                assert rname in genes, 'Alignment refers to sequence not present in GENBANK file'

                this_feature_hits = [ ]
                for item in genes[rname].get(start, end):
                    rel_strand = strand * item.strand
                    key = (item.feature, rel_strand)
                    if key in this_feature_hits: continue
                    this_feature_hits.append(key)
                    if not use_only_monogamous or len(fragment_alignments) == 1:
                        item.feature.count[rel_strand][i] += 1

                if this_feature_hits:
                    feature_hits.append(this_feature_hits)

                if len(this_feature_hits) > 1:
                    for a in this_feature_hits:
                        for b in this_feature_hits:
                            if a[0] is b[0]: continue
                            a[0].common[(a[1],b[1])][b[0]] += 1

            if len(feature_hits) > 0:
                n_something[i] += 1
            #else:
            #    print fragment_alignments
            #    print genes[fragment_alignments[0][0].rname].indexes
            #    print

            if len(feature_hits) > 1:
                n_multiple[i] += 1
                for j in xrange(len(feature_hits)):
                    for k in xrange(len(feature_hits)):
                        if j == k: continue
                        for a in feature_hits[j]:
                            for b in feature_hits[k]:
                                if a[0] is b[0]: continue
                                a[0].ambiguous[(a[1],b[1])][b[0]] += 1

            if any(len(item) > 1 for item in feature_hits):
                n_span[i] += 1

            if limit is not None and n_fragments[i] >= limit: break

        grace.status('')

        #log.log('%s\n' % titles[i])
        #log.log('%20s fragments\n' % grace.pretty_number(n_fragments[i]))
        #log.log('%20s fragments aligned to the reference\n' % grace.pretty_number(n_fragments_aligned[i]))
        #if n_low_score[i]:
        #    log.log('%20s had too low an alignment score, discarded\n' % grace.pretty_number(n_low_score[i]))
        #log.log('%20s aligned to an annotated gene\n' % grace.pretty_number(n_something[i]))
        #if expect_multiple_alignments or n_multiple[i]:
        #    log.log('%20s aligned to multiple genes\n' % grace.pretty_number(n_multiple[i]))
        #log.log('%20s had an alignment that spanned multiple genes\n' % grace.pretty_number(n_span[i]))
        #log.log('\n')

        log.datum(titles[i], 'fragments', n_fragments[i])
        log.datum(titles[i], 'fragments aligned to the reference', n_fragments_aligned[i])
        if n_low_score[i]:
            log.datum(titles[i], 'had too low an alignment score, discarded', n_low_score[i])
        log.datum(titles[i], 'aligned to an annotated gene', n_something[i])
        if expect_multiple_alignments or n_multiple[i]:
            log.datum(titles[i], 'aligned to multiple genes', n_multiple[i])
        log.datum(titles[i], 'had an alignment that spanned multiple genes', n_span[i])
        log.log('\n')

    strandedness = [ ]
    for feature in features:
        n_forward = sum(feature.count[1])
        n_reverse = sum(feature.count[-1])
        if n_forward+n_reverse < 5: continue
        strandedness.append((n_forward-n_reverse)*100.0 / (n_forward+n_reverse))
    strandedness = sum(strandedness) / max(1, len(strandedness))
    log.log('Strand specificity score: %.0f\n'
            ' (~ -100 reverse strand, ~ 0 non-specific, ~ 100 forward strand\n'
            ' Average over all features with at least 5 hits.)\n' % strandedness)

    # Each getter maps a feature to (name, counts, common, ambiguous).
    # (The original lambdas closed over the loop variable "feature" instead of
    # using their parameter "f"; that happened to work only because they were
    # always called as getter(feature).)
    if use_strand == 'pool':
        getters = [
            lambda f: (f.name,
                       add_lists(f.count[1], f.count[-1]),
                       add_defdicts(f.common[(1,1)], f.common[(1,-1)],
                                    f.common[(-1,1)], f.common[(-1,-1)]),
                       add_defdicts(f.ambiguous[(1,1)], f.ambiguous[(1,-1)],
                                    f.ambiguous[(-1,1)], f.ambiguous[(-1,-1)]))
            ]
    elif use_strand == 'forward':
        getters = [ lambda f: (f.name, f.count[1], f.common[(1,1)], f.ambiguous[(1,1)]) ]
    elif use_strand == 'reverse':
        getters = [ lambda f: (f.name, f.count[-1], f.common[(-1,-1)], f.ambiguous[(-1,-1)]) ]
    elif use_strand == 'both':
        getters = [
            lambda f: (f.name, f.count[1], f.common[(1,1)], f.ambiguous[(1,1)]),
            lambda f: (f.name + 'r', f.count[-1], f.common[(-1,-1)], f.ambiguous[(-1,-1)]),
            ]

    total_hits = [0] * n_samples
    for feature in features:
        for getter in getters:
            total_hits = add_lists(total_hits, getter(feature)[1])

    if equalize:
        min_hits = min(total_hits)
        p = [ float(min_hits)/item for item in total_hits ]
        total_hits = [ min_hits ] * n_samples

    comments = [ '#Counts' ] + [ '#sampleTags='+','.join(item) for item in tags ]

    names = [ ]
    count_type = io.named_list_type(titles)
    counts = [ ]

    #rpkm_type = io.named_list_type(titles)
    #rpkms = [ ]

    annotation_type = io.named_list_type([ 'Length' ] + qualifiers)
    annotations = [ ]

    # Note: due to operator precedence, the original unparenthesized
    # expression already grouped as written below: with filter_mode
    # 'existing' the Alignment group has no columns at all.
    alignment_type = io.named_list_type(
        [ 'On same fragment', 'Ambiguous alignment' ] if expect_multiple_alignments else [ ])
    alignments = [ ]

    for feature in features:
        for getter in getters:
            feature_name, count, common, ambiguous = getter(feature)

            if equalize:
                count = [ subsample(count[i], p[i]) for i in xrange(n_samples) ]

            #rpkm = [ count[i] * 1.0e9 / feature.length / total_hits[i] for i in xrange(n_samples) ]

            #common_str = ' '.join(
            #    '%dx%s' % (item[1],item[0])
            #    for item in sorted(common.items(), key=lambda item:item[1], reverse=True)
            #    )
            #ambiguous_str = ' '.join(
            #    '%dx%s' % (item[1],item[0])
            #    for item in sorted(ambiguous.items(), key=lambda item:item[1], reverse=True)
            #    )
            common_str = count_encode(common)
            ambiguous_str = count_encode(ambiguous)

            names.append(feature_name)
            counts.append(count_type(count))
            #rpkms.append(rpkm_type(rpkm))
            annotations.append(annotation_type([ str(feature.length) ] + list(feature.qualifiers)))
            alignments.append(alignment_type(
                [ common_str, ambiguous_str ] if expect_multiple_alignments else [ ]))

    groups = [
        ('Count', io.named_list_type(names, count_type)(counts)),
        #('RPKM', io.named_list_type(names,rpkm_type)(rpkms)),
        ('Annotation', io.named_list_type(names, annotation_type)(annotations)),
        ('Alignment', io.named_list_type(names, alignment_type)(alignments)),
        ]
    io.write_grouped_csv(output_prefix + '.csv', groups, rowname_name='Feature', comments=comments)
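# --- Illustrative sketch (editor's addition, not part of the original code). ---
# The strand specificity score reported by count_run is, for each feature with
# at least 5 hits, 100*(forward-reverse)/(forward+reverse), averaged over such
# features: +100 means purely forward-strand, -100 purely reverse, ~0
# unstranded. A self-contained check of that arithmetic:

def _demo_strandedness(per_feature_counts):
    # per_feature_counts: list of (n_forward, n_reverse) pairs, one per feature.
    scores = [
        (fwd - rev) * 100.0 / (fwd + rev)
        for fwd, rev in per_feature_counts
        if fwd + rev >= 5
        ]
    return sum(scores) / max(1, len(scores))

assert _demo_strandedness([(10, 0), (8, 0)]) == 100.0   # fully forward-stranded
assert _demo_strandedness([(5, 5), (6, 6)]) == 0.0      # unstranded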
def run(self):
    #assert not self.utr_only or self.utrs, '--utrs-only yes but no --utrs given'

    # Reference genome
    #chromosome_lengths = reference_directory.Reference(self.reference, must_exist=True).get_lengths()
    chromosomes = collections.OrderedDict(io.read_sequences(self.reference))

    def get_interpeak_seq(peaks):
        start = min(item.transcription_stop for item in peaks)
        end = max(item.transcription_stop for item in peaks)
        if end-start > self.max_seq: return ''
        if peaks[0].strand >= 0:
            return chromosomes[peaks[0].seqid][start:end]
        else:
            return bio.reverse_complement(chromosomes[peaks[0].seqid][start:end])

    def get_prepeak_seq(gene, peaks):
        if gene.strand >= 0:
            start = gene.utr_pos
            end = min(item.transcription_stop for item in peaks)
            if end-start > self.max_seq: return ''
            return chromosomes[gene.seqid][start:end]
        else:
            start = max(item.transcription_stop for item in peaks)
            end = gene.utr_pos
            if end-start > self.max_seq: return ''
            return bio.reverse_complement(chromosomes[gene.seqid][start:end])

    # Normalization files
    if self.norm_file:
        norm_file = self.norm_file
    else:
        nesoni.Norm_from_counts(self.prefix + '-norm', self.counts).run()
        norm_file = self.prefix + '-norm.csv'

    norms = io.read_grouped_table(norm_file, [('All', str)])['All']

    pair_norm_names = [ ]
    pair_norms = [ ]
    for i in xrange(len(norms)):
        pair_norm_names.append(norms.keys()[i] + '-peak1')
        pair_norms.append(norms.values()[i])
    for i in xrange(len(norms)):
        pair_norm_names.append(norms.keys()[i] + '-peak2')
        pair_norms.append(norms.values()[i])
    io.write_grouped_csv(
        self.prefix + '-pairs-norm.csv',
        [('All', io.named_list_type(pair_norm_names)(pair_norms))],
        comments=['#Normalization'],
        )

    # Read data
    annotations = list(annotation.read_annotations(self.parents))
    if self.utrs:
        utrs = list(annotation.read_annotations(self.utrs))
    else:
        utrs = [ ]
    children = list(annotation.read_annotations(self.children))

    count_table = io.read_grouped_table(self.counts, [
        ('Count', int),
        ('Tail_count', int),
        ('Tail', _float_or_none),
        ('Proportion', _float_or_none),
        ('Annotation', str),
        ])
    counts = count_table['Count']
    tail_counts = count_table['Tail_count']
    proportions = count_table['Proportion']
    tails = count_table['Tail']

    samples = counts.value_type().keys()
    sample_tags = { }
    for line in count_table.comments:
        if line.startswith('#sampleTags='):
            parts = line[len('#sampleTags='):].split(',')
            assert parts[0] not in sample_tags
            sample_tags[parts[0]] = parts

    for item in children:
        item.weight = sum(
            counts[item.get_id()][name] * float(norms[name]['Normalizing.multiplier'])
            for name in samples
            )

    parents = [ ]
    id_to_parent = { }
    for item in annotations:
        if item.type != self.parent_type: continue
        assert item.get_id() not in id_to_parent, 'Duplicate id in parent file: ' + item.get_id()
        parents.append(item)
        id_to_parent[item.get_id()] = item
        item.children = [ ]
        #item.cds = [ ]

        # Default utr
        if item.strand >= 0:
            item.utr_pos = item.end
        else:
            item.utr_pos = item.start

        if 'three_prime_UTR_start' in item.attr:
            if item.strand >= 0:
                item.utr_pos = int(item.attr['three_prime_UTR_start']) - 1
            else:
                item.utr_pos = int(item.attr['three_prime_UTR_start'])

    for item in utrs:
        assert item.attr['Parent'] in id_to_parent, 'Unknown gene ' + item.attr['Parent']
        id_to_parent[item.attr['Parent']].utr_pos = (item.start if item.strand >= 0 else item.end)

    for item in children:
        item.transcription_stop = item.end if item.strand >= 0 else item.start  # End of transcription, 0-based, ie between-positions based
        if 'Parent' in item.attr and item.attr.get("Relation") != "Antisense":
            for item_parent in item.attr['Parent'].split(','):
                parent = id_to_parent[item_parent]
                parent.children.append(item)

    for item in parents:
        item.children.sort(key=_annotation_sorter)

        relevant = list(item.children)
        if self.utr_only:
            #if item.strand <= 0:
            #    relative_utr_start = item.end - int(item.attr['three_prime_UTR_start'])
            #else:
            #    relative_utr_start = int(item.attr['three_prime_UTR_start'])-1 - item.start
            #
            #def relative_start(peak):
            #    return item.end-peak.end if item.strand < 0 else peak.start-item.start
            #relevant = [ peak for peak in relevant if relative_start(peak) >= relative_utr_start ]

            #relevant = [
            #    peak for peak in relevant
            #    if (peak.end >= item.utr_pos if item.strand >= 0 else peak.start <= item.utr_pos)
            #    ]

            relevant = [ peak for peak in relevant if peak.attr.get("Relation") == "3'UTR" ]

        if self.top:
            relevant.sort(key=lambda peak: peak.weight, reverse=True)
            relevant = relevant[:self.top]
        relevant.sort(key=_annotation_sorter)
        item.relevant_children = relevant

    # JSON output
    j_data = { }
    j_genes = j_data['genes'] = { }

    j_genes['__comment__'] = 'start is 0-based'
    j_genes['name'] = [ ]
    j_genes['chromosome'] = [ ]
    j_genes['strand'] = [ ]
    j_genes['start'] = [ ]
    j_genes['utr'] = [ ]
    j_genes['end'] = [ ]
    j_genes['gene'] = [ ]
    j_genes['product'] = [ ]
    j_genes['peaks'] = [ ]
    j_genes['relevant_peaks'] = [ ]
    #j_genes['cds'] = [ ]
    #j_genes['cds_start'] = [ ]
    #j_genes['cds_end'] = [ ]
    for item in parents:
        j_genes['name'].append(item.get_id())
        j_genes['chromosome'].append(item.seqid)
        j_genes['strand'].append(item.strand)
        j_genes['start'].append(item.start)
        j_genes['utr'].append(item.utr_pos)
        j_genes['end'].append(item.end)
        j_genes['gene'].append(item.attr.get('Name', item.attr.get('gene', '')))
        j_genes['product'].append(item.attr.get('Product', item.attr.get('product', '')))
        j_genes['peaks'].append([ item2.get_id() for item2 in item.children ])
        j_genes['relevant_peaks'].append([ item2.get_id() for item2 in item.relevant_children ])
        #j_genes['cds'].append( item.cds )
        #j_genes['cds_start'].append( item.cds_start )
        #j_genes['cds_end'].append( item.cds_end )

    j_peaks = j_data['peaks'] = { }
    j_peaks['__comment__'] = 'start is 0-based'
    j_peaks['name'] = [ ]
    j_peaks['chromosome'] = [ ]
    j_peaks['strand'] = [ ]
    j_peaks['start'] = [ ]
    j_peaks['end'] = [ ]
    j_peaks['parents'] = [ ]
    j_peaks['counts'] = [ ]
    j_peaks['tail_lengths'] = [ ]
    j_peaks['proportion_tailed'] = [ ]
    for item in children:
        j_peaks['name'].append(item.get_id())
        j_peaks['chromosome'].append(item.seqid)
        j_peaks['strand'].append(item.strand)
        j_peaks['start'].append(item.start)
        j_peaks['end'].append(item.end)
        j_peaks['parents'].append(item.attr['Parent'].split(',') if 'Parent' in item.attr else [ ])
        j_peaks['counts'].append(counts[item.get_id()].values())
        j_peaks['tail_lengths'].append(count_table['Tail'][item.get_id()].values())
        j_peaks['proportion_tailed'].append(count_table['Proportion'][item.get_id()].values())

    j_samples = j_data['samples'] = { }
    j_samples['name'] = [ ]
    j_samples['tags'] = [ ]
    j_samples['normalizing_multiplier'] = [ ]
    for name in samples:
        j_samples['name'].append(name)
        j_samples['tags'].append(sample_tags[name])
        j_samples['normalizing_multiplier'].append(float(norms[name]['Normalizing.multiplier']))

    j_chromosomes = j_data['chromosomes'] = { }
    j_chromosomes['name'] = [ ]
    j_chromosomes['length'] = [ ]
    for name, seq in chromosomes.iteritems():
        j_chromosomes['name'].append(name)
        j_chromosomes['length'].append(len(seq))

    with open(self.prefix + '.json', 'wb') as f:
        json.dump(j_data, f)

    # Output paired peak file
    output_comments = ['#Counts']
    output_samples = [ ]
    for item in samples:
        output_samples.append(item + '-peak1')
        output_comments.append('#sampleTags=' + ','.join([item + '-peak1', 'peak1'] + sample_tags.get(item, [])))
    for item in samples:
        output_samples.append(item + '-peak2')
        output_comments.append('#sampleTags=' + ','.join([item + '-peak2', 'peak2'] + sample_tags.get(item, [])))

    output_names = [ ]
    output_counts = [ ]
    output_tail_counts = [ ]
    output_proportions = [ ]
    output_tails = [ ]
    output_annotation_fields = [
        'gene', 'product', 'biotype', 'mean_tail_1', 'mean_tail_2',
        'chromosome', 'strand', 'transcription_stops',
        ] #, 'interpeak_seq', ]
    output_annotations = [ ]

    for item in parents:
        peaks = item.relevant_children
        for i in xrange(len(peaks) - 1):
            for j in xrange(i + 1, len(peaks)):
                id_i = peaks[i].get_id()
                id_j = peaks[j].get_id()
                id_pair = item.get_id() + '-' + id_i + '-' + id_j
                output_names.append(id_pair)

                row = [ ]
                row.extend(counts[id_i].values())
                row.extend(counts[id_j].values())
                output_counts.append(filter(_text, row))

                row = [ ]
                row.extend(tail_counts[id_i].values())
                row.extend(tail_counts[id_j].values())
                output_tail_counts.append(filter(_text, row))

                row = [ ]
                row.extend(proportions[id_i].values())
                row.extend(proportions[id_j].values())
                output_proportions.append(filter(_text, row))

                row = [ ]
                row.extend(tails[id_i].values())
                row.extend(tails[id_j].values())
                output_tails.append(filter(_text, row))

                output_annotations.append([
                    item.attr.get('Name', item.attr.get('gene', '')),
                    item.attr.get('Product', item.attr.get('product', '')),
                    item.attr.get('Biotype', ''),
                    count_table['Annotation'][id_i]['mean-tail'],
                    count_table['Annotation'][id_j]['mean-tail'],
                    item.seqid,
                    str(item.strand),
                    '%d, %d' % (peaks[i].transcription_stop, peaks[j].transcription_stop),
                    #get_interpeak_seq([peaks[i],peaks[j]]),
                    ])

    #output_count_table = io.named_matrix_type(output_names,output_samples)(output_counts)
    io.write_grouped_csv(
        self.prefix + '-pairs.csv',
        [
            ('Count', io.named_matrix_type(output_names, output_samples)(output_counts)),
            ('Tail_count', io.named_matrix_type(output_names, output_samples)(output_tail_counts)),
            ('Proportion', io.named_matrix_type(output_names, output_samples)(output_proportions)),
            ('Tail', io.named_matrix_type(output_names, output_samples)(output_tails)),
            ('Annotation', io.named_matrix_type(output_names, output_annotation_fields)(output_annotations)),
            ],
        comments=output_comments,
        )
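# --- Illustrative sketch (editor's addition, not part of the original code). ---
# The "-pairs.csv" output above contains one row per unordered pair of a
# gene's relevant peaks, named "<gene>-<peak_i>-<peak_j>" with i < j in
# positional order, and each sample's columns duplicated as -peak1/-peak2.
# The pair enumeration reduces to:

def _demo_peak_pairs(gene_id, peak_ids):
    pairs = [ ]
    for i in xrange(len(peak_ids) - 1):
        for j in xrange(i + 1, len(peak_ids)):
            pairs.append('%s-%s-%s' % (gene_id, peak_ids[i], peak_ids[j]))
    return pairs

assert _demo_peak_pairs('geneA', ['p1', 'p2', 'p3']) == [
    'geneA-p1-p2', 'geneA-p1-p3', 'geneA-p2-p3']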