def _load_bam(self, filename):
    """Scan a BAM file and collect trimmed fragment spans per reference/strand.

    filename may be a BAM file, or a working directory containing one
    (in which case the BAM chosen depends on self.filter).

    Returns a dict mapping (rname, strand) -> list of (start, end) spans,
    where strand is 0 when not strand specific, otherwise +1/-1, and each
    span has been shrunk by self.trim bases at both ends.  Fragments too
    short to survive trimming are dropped.  When self.deduplicate is set,
    identical spans are collapsed to one.
    """
    spans = { }

    # Select which BAM to read and whether to keep only top-scoring
    # alignments per fragment.  'poly' and 'mono' behave identically in
    # this method (the mono-specific filtering flag was unused here).
    if self.filter == 'poly':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
    elif self.filter == 'mono':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
    else:
        assert self.filter == 'existing', 'Unrecognized filtering mode'
        use_bam_filename = 'alignments_filtered.bam'
        use_only_top = False

    if os.path.isdir(filename):
        filename = os.path.join(filename, use_bam_filename)

    for read_name, fragment_alignments, unmapped in \
            sam.bam_iter_fragments(filename, 'Scanning'):
        if not fragment_alignments:
            continue

        if use_only_top:
            # Keep only fragments whose summed alignment score (AS tag)
            # ties the best for this read.
            fragment_scores = [ sum( al.get_AS() for al in item )
                                for item in fragment_alignments ]
            best_score = max(fragment_scores)
            fragment_alignments = [
                item
                for item, score in zip(fragment_alignments, fragment_scores)
                if score >= best_score
                ]

        for alignments in fragment_alignments:
            if self.strand_specific:
                strand = -1 if alignments[0].flag&sam.FLAG_REVERSE else 1
            else:
                strand = 0

            # Fragment extent: 0-based, over all reads of the fragment.
            start = min(item.pos-1 for item in alignments)
            end = max(item.pos+item.length-1 for item in alignments)

            # Too short to survive trimming from both ends.
            if end-start <= self.trim*2:
                continue

            rname = alignments[0].rname
            if (rname,strand) not in spans:
                spans[(rname,strand)] = [ ]
            spans[(rname,strand)].append((start+self.trim, end-self.trim))

    if self.deduplicate:
        for key in spans:
            spans[key] = list(set(spans[key]))

    return spans
def _load_bam(self, filename):
    """Scan a BAM file and collect fragment spans per reference/strand.

    filename may be a BAM file, or a working directory containing one
    (in which case the BAM chosen depends on self.filter).

    Depending on self.what, each fragment is reduced to its 5' or 3'
    endpoint (a 1-base span) before self.lap is added to the end.

    Returns a dict mapping (rname, strand) -> list of (start, end+self.lap)
    spans, where strand is 0 when not strand specific, otherwise +1/-1.
    When self.deduplicate is set, identical spans are collapsed to one.
    """
    spans = { }

    # Select which BAM to read and whether to keep only top-scoring
    # alignments per fragment.  'poly' and 'mono' behave identically in
    # this method (the mono-specific filtering flag was unused here).
    if self.filter == 'poly':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
    elif self.filter == 'mono':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
    else:
        assert self.filter == 'existing', 'Unrecognized filtering mode'
        use_bam_filename = 'alignments_filtered.bam'
        use_only_top = False

    if os.path.isdir(filename):
        filename = os.path.join(filename, use_bam_filename)

    for read_name, fragment_alignments, unmapped in \
            sam.bam_iter_fragments(filename, 'Scanning'):
        if not fragment_alignments:
            continue

        if use_only_top:
            # Keep only fragments whose summed alignment score (AS tag)
            # ties the best for this read.
            fragment_scores = [ sum( al.get_AS() for al in item )
                                for item in fragment_alignments ]
            best_score = max(fragment_scores)
            fragment_alignments = [
                item
                for item, score in zip(fragment_alignments, fragment_scores)
                if score >= best_score
                ]

        for alignments in fragment_alignments:
            if self.strand_specific:
                strand = -1 if alignments[0].flag&sam.FLAG_REVERSE else 1
            else:
                strand = 0

            # Fragment extent: 0-based, over all reads of the fragment.
            start = min(item.pos-1 for item in alignments)
            end = max(item.pos+item.length-1 for item in alignments)

            # Optionally collapse the fragment to a single endpoint.
            # Note: when not strand specific (strand == 0), the forward
            # interpretation is used.
            if self.what == '5prime':
                if strand >= 0:
                    end = start+1
                else:
                    start = end-1
            elif self.what == '3prime':
                if strand >= 0:
                    start = end-1
                else:
                    end = start+1

            # Skip spans that would be empty even after extension by lap
            # (lap may be negative).
            if end+self.lap-start <= 0:
                continue

            rname = alignments[0].rname
            if (rname,strand) not in spans:
                spans[(rname,strand)] = [ ]
            spans[(rname,strand)].append((start, end+self.lap))

    if self.deduplicate:
        for key in spans:
            spans[key] = list(set(spans[key]))

    return spans
def count_run(min_score, min_size, max_size, filter_mode, equalize, types,
              locii, qualifiers, use_strand, merge_filename, limit,
              output_prefix, filenames, log):
    """Count fragment alignments per annotated feature and write a count table.

    filenames mixes annotation files and BAM files/working directories;
    they are separated automatically.  Features come from annotation files
    when available, otherwise whole reference sequences are used as
    features (optionally merged via merge_filename).  Writes
    <output_prefix>.txt containing per-sample counts, RPKM values, feature
    length, qualifiers, and co-occurrence/ambiguity summaries.

    filter_mode: 'poly' (count all top alignments), 'mono' (count only
    uniquely-aligned fragments), or 'existing' (use pre-filtered BAM).
    use_strand: 'pool', 'forward', 'reverse' or 'both'.
    limit, min_score, min_size, max_size may be None (no limit).
    """
    if filter_mode == 'poly':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
        use_only_monogamous = False
        expect_multiple_alignments = True
    elif filter_mode == 'mono':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
        use_only_monogamous = True
        expect_multiple_alignments = True
    else:
        assert filter_mode == 'existing', 'Unrecognized filtering mode'
        use_bam_filename = 'alignments_filtered.bam'
        use_only_top = False
        use_only_monogamous = False
        expect_multiple_alignments = False

    types = types.lower().split(',')
    qualifiers = qualifiers.split(',')
    if locii:
        locii = locii.lower().split(',')
    else:
        locii = None

    assert use_strand is not None, 'You must now explicitly specify --strand'
    assert use_strand in ('pool','forward','reverse','both'), "Can't understand --strand specification."

    # Separate annotation files from BAM files / working directories.
    annotation_filenames = [ ]
    bam_filenames = [ ]
    for arg in filenames:
        if annotation.is_annotation_file(arg):
            annotation_filenames.append(arg)
        else:
            bam_filenames.append(arg)

    n_samples = len(bam_filenames)
    titles = bam_filenames[:]
    for i in xrange(len(bam_filenames)):
        if os.path.isdir(bam_filenames[i]):
            titles[i] = os.path.basename(bam_filenames[i])
            if not annotation_filenames:
                # Fall back to the reference annotation recorded in the
                # working directory, if any.
                working = working_directory.Working(bam_filenames[i])
                reference_filename = working.get_reference().annotations_filename()
                if reference_filename is not None:
                    annotation_filenames.append(reference_filename)
            bam_filenames[i] = os.path.join(bam_filenames[i], use_bam_filename)

    assert bam_filenames, 'No reference alignments given'

    merge = { }            # sequence name -> merged feature name
    merge_qualifiers = { } # sequence name -> tuple of qualifier values
    if merge_filename is not None:
        # First line gives qualifiers,
        # remaining lines give <qualifier> <qualifier...> <gene> <transcript> <transcript...>
        f = open(merge_filename,'rU')
        qualifiers = f.readline().rstrip('\n').split('\t')
        for line in f:
            parts = line.rstrip('\n').split('\t')
            if not parts: continue
            for name in parts[len(qualifiers)+1:]:
                assert name not in merge, 'Duplicate feature name in merge file'
                merge[name] = parts[len(qualifiers)]
                merge_qualifiers[name] = parts[:len(qualifiers)]
        f.close()

    genes = { }          # reference name -> gene index
    feature_names = { }  # feature_name -> number of occurrences
    features = [ ]

    # Collect reference sequence lengths from BAM @SQ header lines,
    # checking that all BAMs agree.
    chromosome_length = { }
    for filename in bam_filenames:
        headers = sam.bam_headers(filename)
        for line in headers.split('\n'):
            if not line: continue
            parts = line.split('\t')
            if parts[0] != '@SQ': continue
            name = None
            length = None
            for part in parts[1:]:
                if part.startswith('SN:'): name = part[3:]
                if part.startswith('LN:'): length = int(part[3:])
            assert name is not None and length is not None
            if name in chromosome_length:
                assert chromosome_length[name] == length
            else:
                chromosome_length[name] = length

    for name in chromosome_length:
        genes[name] = span_index.Span_index()

    if annotation_filenames:
        assert not merge, 'Merging not supported with annotation files'
        for filename in annotation_filenames:
            for feature in annotation.read_annotations(filename):
                if feature.type.lower() not in types: continue

                if (locii is not None and
                        ('locus_tag' not in feature.attr or
                         feature.attr['locus_tag'].lower() not in locii)):
                    continue

                f = Feature(n_samples)
                f.name = feature.get_id()
                if feature.type.lower() != 'cds' and len(types) > 1:
                    f.name = feature.type + ':' + f.name
                # Disambiguate repeated names with a /N suffix.
                feature_names[f.name] = feature_names.get(f.name,0)+1
                if feature_names[f.name] > 1:
                    f.name += '/%d' % feature_names[f.name]

                f.qualifiers = [ feature.attr.get(item,'') for item in qualifiers ]
                f.length = feature.end - feature.start
                assert feature.seqid in genes, 'Annotation for sequence that is not in BAM files'
                genes[feature.seqid].insert(Span_entry(feature.start, feature.end, feature.strand or 1, f))
                features.append(f)
    else:
        # Sequences as features
        log.log('No annotation files given or found, using sequences as features\n')

        name_feature = { }  # (merged)name -> feature
        for name in chromosome_length:
            merged_name = merge.get(name, name)
            if merged_name not in name_feature:
                f = Feature(n_samples)
                f.name = merged_name
                # BUGFIX: previously used the stale `length` variable left
                # over from @SQ header parsing (the last header line seen),
                # not this sequence's length.
                f.length = chromosome_length[name]
                f.qualifiers = merge_qualifiers.get(name, ('',)*len(qualifiers))
                name_feature[merged_name] = f
                features.append(f)
            else:
                f = name_feature[merged_name]
                # Merged feature spans several sequences: use the longest.
                f.length = max(f.length, chromosome_length[name])
            genes[name].insert(Span_entry(0, chromosome_length[name], 1, f))

    log.log('%d features\n\n' % len(features))

    for name in genes:
        genes[name].prepare()

    n_fragments = [ 0 ] * n_samples
    n_fragments_aligned = [ 0 ] * n_samples
    n_low_score = [ 0 ] * n_samples
    n_something = [ 0 ] * n_samples
    n_multiple = [ 0 ] * n_samples
    n_span = [ 0 ] * n_samples

    for i in xrange(n_samples):
        for read_name, fragment_alignments, unmapped in sam.bam_iter_fragments(
                bam_filenames[i],
                'Counting sample %d of %d' % (i+1, n_samples)):
            n_fragments[i] += 1

            if not fragment_alignments: continue
            n_fragments_aligned[i] += 1

            feature_hits = [ ]  # [ [ (feature, strand) ] ]

            # Summed alignment score (AS tag) per candidate placement.
            fragment_scores = [ sum( al.get_AS() for al in item )
                                for item in fragment_alignments ]
            best_score = max(fragment_scores)

            if min_score is not None and best_score < min_score:
                n_low_score[i] += 1
                continue

            # best_score >= min_score here, so this keeps only the
            # top-scoring placements when use_only_top.  (Python 2:
            # max(x, None) == x, and score >= None is always true.)
            if use_only_top:
                cutoff = max(best_score, min_score)
            else:
                cutoff = min_score
            fragment_alignments = [
                item
                for item, score in zip(fragment_alignments, fragment_scores)
                if score >= cutoff
                ]

            for alignments in fragment_alignments:
                strand = -1 if alignments[0].flag&sam.FLAG_REVERSE else 1

                start = min(item.pos-1 for item in alignments)
                end = max(item.pos+item.length-1 for item in alignments)
                length = end-start
                if min_size is not None and length < min_size: continue
                if max_size is not None and length > max_size: continue

                rname = alignments[0].rname
                assert rname in genes, 'Alignment refers to sequence not present in GENBANK file'

                # Features this placement overlaps, relative strand
                # (+1 same, -1 opposite), de-duplicated.
                this_feature_hits = [ ]
                for item in genes[rname].get(start, end):
                    rel_strand = strand * item.strand
                    key = (item.feature, rel_strand)
                    if key in this_feature_hits: continue
                    this_feature_hits.append( key )
                    # 'mono' mode counts only uniquely-placed fragments.
                    if not use_only_monogamous or len(fragment_alignments) == 1:
                        item.feature.count[rel_strand][i] += 1

                if this_feature_hits:
                    feature_hits.append( this_feature_hits )

                # One placement spanning several features: record
                # on-same-fragment co-occurrence.
                if len(this_feature_hits) > 1:
                    for a in this_feature_hits:
                        for b in this_feature_hits:
                            if a[0] is b[0]: continue
                            a[0].common[(a[1],b[1])][b[0]] += 1

            if len(feature_hits) > 0:
                n_something[i] += 1

            # Several alternative placements: record ambiguity between
            # features hit by different placements.
            if len(feature_hits) > 1:
                n_multiple[i] += 1
                for j in xrange(len(feature_hits)):
                    for k in xrange(len(feature_hits)):
                        if j == k: continue
                        for a in feature_hits[j]:
                            for b in feature_hits[k]:
                                if a[0] is b[0]: continue
                                a[0].ambiguous[(a[1],b[1])][b[0]] += 1

            if any(len(item) > 1 for item in feature_hits):
                n_span[i] += 1

            if limit is not None and n_fragments[i] >= limit: break

        grace.status('')

        log.datum(titles[i], 'fragments', n_fragments[i])
        log.datum(titles[i], 'fragments aligned to the reference', n_fragments_aligned[i])
        if n_low_score[i]:
            log.datum(titles[i], 'had too low an alignment score, discarded', n_low_score[i])
        log.datum(titles[i], 'aligned to an annotated gene', n_something[i])
        if expect_multiple_alignments or n_multiple[i]:
            log.datum(titles[i], 'aligned to multiple genes', n_multiple[i])
        log.datum(titles[i], 'had an alignment that spanned multiple genes', n_span[i])
        log.log('\n')

    # Strand specificity, averaged over features with at least 5 hits.
    strandedness = [ ]
    for feature in features:
        n_forward = sum(feature.count[1])
        n_reverse = sum(feature.count[-1])
        if n_forward+n_reverse < 5: continue
        strandedness.append( (n_forward-n_reverse)*100.0 / (n_forward+n_reverse) )
    if strandedness:  # Guard against division by zero when no feature qualifies.
        strandedness = sum(strandedness) / len(strandedness)
        log.log('Strand specificity: %.0f%%\n'
                ' (~ -100%% reverse strand, ~ 0%% non-specific, ~ 100%% forward strand\n'
                ' Average over all features with at least 5 hits.)\n'
                % strandedness)

    # Each getter maps a feature to (name, counts, common, ambiguous) for
    # one output row.  NOTE: these previously closed over the loop
    # variable `feature` instead of using the parameter, which only
    # worked by coincidence of the call sites below.
    if use_strand == 'pool':
        getters = [ lambda f: (f.name,
                               add_lists(f.count[1], f.count[-1]),
                               add_defdicts(f.common[(1,1)], f.common[(1,-1)],
                                            f.common[(-1,1)], f.common[(-1,-1)]),
                               add_defdicts(f.ambiguous[(1,1)], f.ambiguous[(1,-1)],
                                            f.ambiguous[(-1,1)], f.ambiguous[(-1,-1)])) ]
    elif use_strand == 'forward':
        getters = [ lambda f: (f.name, f.count[1], f.common[(1,1)], f.ambiguous[(1,1)]) ]
    elif use_strand == 'reverse':
        getters = [ lambda f: (f.name, f.count[-1], f.common[(-1,-1)], f.ambiguous[(-1,-1)]) ]
    elif use_strand == 'both':
        getters = [ lambda f: (f.name, f.count[1], f.common[(1,1)], f.ambiguous[(1,1)]),
                    lambda f: (f.name + 'r', f.count[-1], f.common[(-1,-1)], f.ambiguous[(-1,-1)]) ]

    total_hits = [0] * n_samples
    for feature in features:
        for getter in getters:
            total_hits = add_lists(total_hits, getter(feature)[1])

    if equalize:
        # Subsample every sample down to the smallest library.
        min_hits = min(total_hits)
        p = [ float(min_hits)/item for item in total_hits ]
        total_hits = [ min_hits ] * n_samples

    f = open(output_prefix + '.txt', 'wb')
    print >> f, tab_encode(
        [ 'Feature' ] +
        titles +
        [ 'RPKM ' + item for item in titles ] +
        [ 'Length' ] +
        qualifiers +
        [ 'On same fragment' ] +
        ([ 'Ambiguous alignment' ] if expect_multiple_alignments else [ ])
        )

    for feature in features:
        for getter in getters:
            feature_name, count, common, ambiguous = getter(feature)
            if equalize:
                count = [ subsample(count[i], p[i]) for i in xrange(n_samples) ]
            rpkm = [ count[i] * 1.0e9 / feature.length / total_hits[i]
                     for i in xrange(n_samples) ]
            common_str = ' '.join(
                '%dx%s' % (item[1],item[0])
                for item in sorted(common.items(), key=lambda item: item[1], reverse=True)
                )
            ambiguous_str = ' '.join(
                '%dx%s' % (item[1],item[0])
                for item in sorted(ambiguous.items(), key=lambda item: item[1], reverse=True)
                )
            print >> f, tab_encode(
                [ feature_name ] +
                [ str(item) for item in count ] +
                [ '%.2f' % item for item in rpkm ] +
                [ str(feature.length) ] +
                list(feature.qualifiers) +
                [ common_str ] +
                ([ ambiguous_str ] if expect_multiple_alignments else [ ])
                )
    f.close()
def count_run(min_score, min_size, max_size, filter_mode, equalize, types,
              locii, qualifiers, use_strand, merge_filename, limit,
              output_prefix, filenames, log):
    """Count fragment alignments per annotated feature and write a count table.

    filenames mixes annotation files and BAM files/working directories;
    they are separated automatically.  Features come from annotation files
    when available, otherwise whole reference sequences are used as
    features (optionally merged via merge_filename).  Writes
    <output_prefix>.txt containing per-sample counts, RPKM values, feature
    length, qualifiers, and co-occurrence/ambiguity summaries.

    filter_mode: 'poly' (count all top alignments), 'mono' (count only
    uniquely-aligned fragments), or 'existing' (use pre-filtered BAM).
    use_strand: 'pool', 'forward', 'reverse' or 'both'.
    limit, min_score, min_size, max_size may be None (no limit).
    """
    if filter_mode == 'poly':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
        use_only_monogamous = False
        expect_multiple_alignments = True
    elif filter_mode == 'mono':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
        use_only_monogamous = True
        expect_multiple_alignments = True
    else:
        assert filter_mode == 'existing', 'Unrecognized filtering mode'
        use_bam_filename = 'alignments_filtered.bam'
        use_only_top = False
        use_only_monogamous = False
        expect_multiple_alignments = False

    types = types.lower().split(',')
    qualifiers = qualifiers.split(',')
    if locii:
        locii = locii.lower().split(',')
    else:
        locii = None

    assert use_strand is not None, 'You must now explicitly specify --strand'
    assert use_strand in ('pool','forward','reverse','both'), "Can't understand --strand specification."

    # Separate annotation files from BAM files / working directories.
    annotation_filenames = [ ]
    bam_filenames = [ ]
    for arg in filenames:
        if annotation.is_annotation_file(arg):
            annotation_filenames.append(arg)
        else:
            bam_filenames.append(arg)

    n_samples = len(bam_filenames)
    titles = bam_filenames[:]
    for i in xrange(len(bam_filenames)):
        if os.path.isdir(bam_filenames[i]):
            titles[i] = os.path.basename(bam_filenames[i])
            if not annotation_filenames:
                # Fall back to the reference annotation recorded in the
                # working directory, if any.
                working = working_directory.Working(bam_filenames[i])
                reference_filename = working.get_reference().annotations_filename()
                if reference_filename is not None:
                    annotation_filenames.append(reference_filename)
            bam_filenames[i] = os.path.join(bam_filenames[i], use_bam_filename)

    assert bam_filenames, 'No reference alignments given'

    merge = { }            # sequence name -> merged feature name
    merge_qualifiers = { } # sequence name -> tuple of qualifier values
    if merge_filename is not None:
        # First line gives qualifiers,
        # remaining lines give <qualifier> <qualifier...> <gene> <transcript> <transcript...>
        f = open(merge_filename,'rU')
        qualifiers = f.readline().rstrip('\n').split('\t')
        for line in f:
            parts = line.rstrip('\n').split('\t')
            if not parts: continue
            for name in parts[len(qualifiers)+1:]:
                assert name not in merge, 'Duplicate feature name in merge file'
                merge[name] = parts[len(qualifiers)]
                merge_qualifiers[name] = parts[:len(qualifiers)]
        f.close()

    genes = { }          # reference name -> gene index
    feature_names = { }  # feature_name -> number of occurrences
    features = [ ]

    # Collect reference sequence lengths from BAM @SQ header lines,
    # checking that all BAMs agree.
    chromosome_length = { }
    for filename in bam_filenames:
        headers = sam.bam_headers(filename)
        for line in headers.split('\n'):
            if not line: continue
            parts = line.split('\t')
            if parts[0] != '@SQ': continue
            name = None
            length = None
            for part in parts[1:]:
                if part.startswith('SN:'): name = part[3:]
                if part.startswith('LN:'): length = int(part[3:])
            assert name is not None and length is not None
            if name in chromosome_length:
                assert chromosome_length[name] == length
            else:
                chromosome_length[name] = length

    for name in chromosome_length:
        genes[name] = span_index.Span_index()

    if annotation_filenames:
        assert not merge, 'Merging not supported with annotation files'
        for filename in annotation_filenames:
            for feature in annotation.read_annotations(filename):
                if feature.type.lower() not in types: continue

                if (locii is not None and
                        ('locus_tag' not in feature.attr or
                         feature.attr['locus_tag'].lower() not in locii)):
                    continue

                f = Feature(n_samples)
                f.name = feature.get_id()
                if feature.type.lower() != 'cds' and len(types) > 1:
                    f.name = feature.type + ':' + f.name
                # Disambiguate repeated names with a /N suffix.
                feature_names[f.name] = feature_names.get(f.name,0)+1
                if feature_names[f.name] > 1:
                    f.name += '/%d' % feature_names[f.name]

                f.qualifiers = [ feature.attr.get(item,'') for item in qualifiers ]
                f.length = feature.end - feature.start
                assert feature.seqid in genes, 'Annotation for sequence that is not in BAM files'
                genes[feature.seqid].insert(Span_entry(feature.start, feature.end, feature.strand or 1, f))
                features.append(f)
    else:
        # Sequences as features
        log.log('No annotation files given or found, using sequences as features\n')

        name_feature = { }  # (merged)name -> feature
        for name in chromosome_length:
            merged_name = merge.get(name, name)
            if merged_name not in name_feature:
                f = Feature(n_samples)
                f.name = merged_name
                # BUGFIX: previously used the stale `length` variable left
                # over from @SQ header parsing (the last header line seen),
                # not this sequence's length.
                f.length = chromosome_length[name]
                f.qualifiers = merge_qualifiers.get(name, ('',)*len(qualifiers))
                name_feature[merged_name] = f
                features.append(f)
            else:
                f = name_feature[merged_name]
                # Merged feature spans several sequences: use the longest.
                f.length = max(f.length, chromosome_length[name])
            genes[name].insert(Span_entry(0, chromosome_length[name], 1, f))

    log.log('%d features\n\n' % len(features))

    for name in genes:
        genes[name].prepare()

    n_fragments = [ 0 ] * n_samples
    n_fragments_aligned = [ 0 ] * n_samples
    n_low_score = [ 0 ] * n_samples
    n_something = [ 0 ] * n_samples
    n_multiple = [ 0 ] * n_samples
    n_span = [ 0 ] * n_samples

    for i in xrange(n_samples):
        for read_name, fragment_alignments, unmapped in sam.bam_iter_fragments(
                bam_filenames[i],
                'Counting sample %d of %d' % (i+1, n_samples)):
            n_fragments[i] += 1

            if not fragment_alignments: continue
            n_fragments_aligned[i] += 1

            feature_hits = [ ]  # [ [ (feature, strand) ] ]

            # Summed alignment score (AS tag) per candidate placement.
            fragment_scores = [ sum( al.get_AS() for al in item )
                                for item in fragment_alignments ]
            best_score = max(fragment_scores)

            if min_score is not None and best_score < min_score:
                n_low_score[i] += 1
                continue

            # best_score >= min_score here, so this keeps only the
            # top-scoring placements when use_only_top.  (Python 2:
            # max(x, None) == x, and score >= None is always true.)
            if use_only_top:
                cutoff = max(best_score, min_score)
            else:
                cutoff = min_score
            fragment_alignments = [
                item
                for item, score in zip(fragment_alignments, fragment_scores)
                if score >= cutoff
                ]

            for alignments in fragment_alignments:
                strand = -1 if alignments[0].flag&sam.FLAG_REVERSE else 1

                start = min(item.pos-1 for item in alignments)
                end = max(item.pos+item.length-1 for item in alignments)
                length = end-start
                if min_size is not None and length < min_size: continue
                if max_size is not None and length > max_size: continue

                rname = alignments[0].rname
                assert rname in genes, 'Alignment refers to sequence not present in GENBANK file'

                # Features this placement overlaps, relative strand
                # (+1 same, -1 opposite), de-duplicated.
                this_feature_hits = [ ]
                for item in genes[rname].get(start, end):
                    rel_strand = strand * item.strand
                    key = (item.feature, rel_strand)
                    if key in this_feature_hits: continue
                    this_feature_hits.append( key )
                    # 'mono' mode counts only uniquely-placed fragments.
                    if not use_only_monogamous or len(fragment_alignments) == 1:
                        item.feature.count[rel_strand][i] += 1

                if this_feature_hits:
                    feature_hits.append( this_feature_hits )

                # One placement spanning several features: record
                # on-same-fragment co-occurrence.
                if len(this_feature_hits) > 1:
                    for a in this_feature_hits:
                        for b in this_feature_hits:
                            if a[0] is b[0]: continue
                            a[0].common[(a[1],b[1])][b[0]] += 1

            if len(feature_hits) > 0:
                n_something[i] += 1

            # Several alternative placements: record ambiguity between
            # features hit by different placements.
            if len(feature_hits) > 1:
                n_multiple[i] += 1
                for j in xrange(len(feature_hits)):
                    for k in xrange(len(feature_hits)):
                        if j == k: continue
                        for a in feature_hits[j]:
                            for b in feature_hits[k]:
                                if a[0] is b[0]: continue
                                a[0].ambiguous[(a[1],b[1])][b[0]] += 1

            if any(len(item) > 1 for item in feature_hits):
                n_span[i] += 1

            if limit is not None and n_fragments[i] >= limit: break

        grace.status('')

        log.datum(titles[i], 'fragments', n_fragments[i])
        log.datum(titles[i], 'fragments aligned to the reference', n_fragments_aligned[i])
        if n_low_score[i]:
            log.datum(titles[i], 'had too low an alignment score, discarded', n_low_score[i])
        log.datum(titles[i], 'aligned to an annotated gene', n_something[i])
        if expect_multiple_alignments or n_multiple[i]:
            log.datum(titles[i], 'aligned to multiple genes', n_multiple[i])
        log.datum(titles[i], 'had an alignment that spanned multiple genes', n_span[i])
        log.log('\n')

    # Strand specificity, averaged over features with at least 5 hits.
    strandedness = [ ]
    for feature in features:
        n_forward = sum(feature.count[1])
        n_reverse = sum(feature.count[-1])
        if n_forward+n_reverse < 5: continue
        strandedness.append( (n_forward-n_reverse)*100.0 / (n_forward+n_reverse) )
    if strandedness:  # Guard against division by zero when no feature qualifies.
        strandedness = sum(strandedness) / len(strandedness)
        log.log('Strand specificity: %.0f%%\n'
                ' (~ -100%% reverse strand, ~ 0%% non-specific, ~ 100%% forward strand\n'
                ' Average over all features with at least 5 hits.)\n'
                % strandedness)

    # Each getter maps a feature to (name, counts, common, ambiguous) for
    # one output row.  NOTE: these previously closed over the loop
    # variable `feature` instead of using the parameter, which only
    # worked by coincidence of the call sites below.
    if use_strand == 'pool':
        getters = [ lambda f: (f.name,
                               add_lists(f.count[1], f.count[-1]),
                               add_defdicts(f.common[(1,1)], f.common[(1,-1)],
                                            f.common[(-1,1)], f.common[(-1,-1)]),
                               add_defdicts(f.ambiguous[(1,1)], f.ambiguous[(1,-1)],
                                            f.ambiguous[(-1,1)], f.ambiguous[(-1,-1)])) ]
    elif use_strand == 'forward':
        getters = [ lambda f: (f.name, f.count[1], f.common[(1,1)], f.ambiguous[(1,1)]) ]
    elif use_strand == 'reverse':
        getters = [ lambda f: (f.name, f.count[-1], f.common[(-1,-1)], f.ambiguous[(-1,-1)]) ]
    elif use_strand == 'both':
        getters = [ lambda f: (f.name, f.count[1], f.common[(1,1)], f.ambiguous[(1,1)]),
                    lambda f: (f.name + 'r', f.count[-1], f.common[(-1,-1)], f.ambiguous[(-1,-1)]) ]

    total_hits = [0] * n_samples
    for feature in features:
        for getter in getters:
            total_hits = add_lists(total_hits, getter(feature)[1])

    if equalize:
        # Subsample every sample down to the smallest library.
        min_hits = min(total_hits)
        p = [ float(min_hits)/item for item in total_hits ]
        total_hits = [ min_hits ] * n_samples

    f = open(output_prefix + '.txt', 'wb')
    print >> f, tab_encode(
        [ 'Feature' ] +
        titles +
        [ 'RPKM ' + item for item in titles ] +
        [ 'Length' ] +
        qualifiers +
        [ 'On same fragment' ] +
        ([ 'Ambiguous alignment' ] if expect_multiple_alignments else [ ])
        )

    for feature in features:
        for getter in getters:
            feature_name, count, common, ambiguous = getter(feature)
            if equalize:
                count = [ subsample(count[i], p[i]) for i in xrange(n_samples) ]
            rpkm = [ count[i] * 1.0e9 / feature.length / total_hits[i]
                     for i in xrange(n_samples) ]
            common_str = ' '.join(
                '%dx%s' % (item[1],item[0])
                for item in sorted(common.items(), key=lambda item: item[1], reverse=True)
                )
            ambiguous_str = ' '.join(
                '%dx%s' % (item[1],item[0])
                for item in sorted(ambiguous.items(), key=lambda item: item[1], reverse=True)
                )
            print >> f, tab_encode(
                [ feature_name ] +
                [ str(item) for item in count ] +
                [ '%.2f' % item for item in rpkm ] +
                [ str(feature.length) ] +
                list(feature.qualifiers) +
                [ common_str ] +
                ([ ambiguous_str ] if expect_multiple_alignments else [ ])
                )
    f.close()
def run(self):
    """Assign aligned fragments to the nearest annotated feature and pickle the result.

    Each annotation of the requested types is extended by self.extension
    bases past its 3' end, indexed by sequence, and given a .hits list.
    Every alignment in the working directory's alignments_filtered.bam is
    then assigned to the same-strand overlapping feature whose tail
    position is nearest (ties broken by feature id for determinism).

    Writes (workspace name, workspace tags, annotations) to
    <prefix>.pickle.gz; each annotation carries its accumulated hits.
    """
    assert self.extension is not None, '--extension must be specified'

    workspace = working_directory.Working(self.working_dir, must_exist=True)

    if self.annotations is None:
        reference = workspace.get_reference()
        annotations_filename = reference.annotations_filename()
    else:
        annotations_filename = self.annotations

    types = [ item.lower() for item in self.types.split(',') ]

    annotations = [
        item
        for item in annotation.read_annotations(annotations_filename)
        if item.type.lower() in types
        ]

    self.log.log('%d annotations\n' % len(annotations))
    assert annotations, 'No annotations of specified types in file'

    # Extend each feature past its 3' end, remember the original tail
    # position, and index features by sequence id.
    index = { }
    for item in annotations:
        if item.strand >= 0:
            item.tail_pos = item.end
            item.end += self.extension
        else:
            item.tail_pos = item.start
            item.start -= self.extension
        if item.seqid not in index:
            index[item.seqid] = span_index.Span_index()
        index[item.seqid].insert(item)
        item.hits = []  # [ (rel_start, rel_end, tail_length, adaptor_bases) ]

    for item in index.itervalues():
        item.prepare()

    for read_name, fragment_alignments, unmapped in sam.bam_iter_fragments(
            workspace/'alignments_filtered.bam'):
        for fragment in fragment_alignments:
            # Fragment extent over all its reads, 0-based.
            start = min(item.pos-1 for item in fragment)
            end = max(item.pos+item.length-1 for item in fragment)
            strand = -1 if fragment[0].flag&sam.FLAG_REVERSE else 1

            # The 3' endpoint of the fragment on its own strand.
            if strand >= 0:
                tail_pos = end
            else:
                tail_pos = start

            # Poly(A) tail length (AN) and adaptor bases (AD) recorded by
            # the aligner as optional tags on the first read.
            tail_length = 0
            adaptor_bases = 0
            for item in fragment[0].extra:
                if item.startswith('AN:i:'):
                    tail_length = int(item[5:])
                elif item.startswith('AD:i:'):
                    adaptor_bases = int(item[5:])

            if fragment[0].rname in index:
                hits = [
                    gene
                    for gene in index[fragment[0].rname].get(start,end)
                    if gene.strand == strand
                    ]
                if hits:
                    # Nearest by tail_pos,
                    # failing that, by id to ensure a deterministic choice.
                    gene = min(hits,
                               key=lambda gene: (abs(tail_pos - gene.tail_pos), gene.get_id()))
                    # Positions relative to the gene, in gene orientation.
                    if strand > 0:
                        rel_start = start - gene.start
                        rel_end = end - gene.start
                    else:
                        rel_start = gene.end - end
                        rel_end = gene.end - start
                    gene.hits.append( (rel_start,rel_end,tail_length,adaptor_bases) )

    f = io.open_possibly_compressed_writer(self.prefix + '.pickle.gz')
    pickle.dump((workspace.name, workspace.get_tags(), annotations), f, pickle.HIGHEST_PROTOCOL)
    f.close()