def setup(self):
    grace.status('Load depths')
    self.sample_names = [ os.path.split(dirname)[1] for dirname in self.working_dirs ]
    self.workspaces = [
        working_directory.Working(dirname, must_exist=True)
        for dirname in self.working_dirs
        ]
    self.depths = [ item.get_depths() for item in self.workspaces ]
    #self.depths = list(legion.imap(lambda item: item.get_object('depths.pickle.gz'), self.workspaces, local=True))
    self.any_pairs = any(item.param['any_pairs'] for item in self.workspaces)
    grace.status('')

    lengths = self.workspaces[0].get_reference().get_lengths()
    self.chromosome_names = [ name for name, length in lengths ]
    self.lengths = dict(lengths)

    self.processes = [ ]
def run(self):
    workspace = working_directory.Working(self.output_dir)
    workspace.setup_reference(self.reference)
    workspace.update_param(snp_cost=self.snp_cost)

    #assert os.path.exists(self.reference), 'Reference file does not exist'
    #reference_filename = workspace._object_filename('reference.fa')
    #if os.path.exists(reference_filename):
    #    os.unlink(reference_filename)
    #os.symlink(os.path.relpath(self.reference, self.output_dir), reference_filename)

    bam_filename = io.abspath(self.output_dir, 'alignments.bam')
    bam_prefix = io.abspath(self.output_dir, 'alignments')

    if sam.is_bam(self.input):
        sort_input_filename = self.input
        temp_filename = None
    else:
        # Input is SAM: convert to a temporary BAM via "samtools view" before sorting
        temp_filename = io.abspath(self.output_dir, 'temp.bam')
        sort_input_filename = temp_filename

        writer = io.Pipe_writer(temp_filename, ['samtools', 'view', '-S', '-b', '-'])
        f = open(self.input, 'rb')
        while True:
            data = f.read(1 << 20)
            if not data:
                break
            writer.write(data)
        writer.close()
        f.close()

    grace.status('Sort')

    #io.execute([
    #    'samtools', 'sort', '-n', sort_input_filename, bam_prefix
    #])
    sam.sort_bam(sort_input_filename, bam_prefix, by_name=True)

    if temp_filename is not None:
        os.unlink(temp_filename)

    grace.status('')
def run(self):
    title1 = self.title1
    title2 = self.title2

    working1 = working_directory.Working(self.working_dir1)
    working2 = working_directory.Working(self.working_dir2)

    cutoff = self.cutoff

    sequence_names = [ name
                       for name, length in working1.get_reference().get_lengths() ]

    if title1 is None:
        title1 = working1.name
    if title2 is None:
        title2 = working2.name

    n = 1
    while significance([('A', n)], [('T', n)], 1.0) > cutoff:
        n += 1

    f = open(self.prefix + '.txt', 'wb')
    print >> f, '%g\tsignificance cutoff' % cutoff
    print >> f, '%d\tdepth required to call substitution (greater if there are errors in the reads)' % n
    print >> f, 'Sequence\tPosition in reference\tChange type\tReference\t%s\t%s\tp-value (no correction for multiple testing)\t%s\t%s' % (
        title1, title2, title1, title2)

    for sequence_name in sequence_names:
        filename1 = working1 / (grace.filesystem_friendly_name(sequence_name) + '-evidence.txt')
        filename2 = working2 / (grace.filesystem_friendly_name(sequence_name) + '-evidence.txt')

        for (pos1, ins1, sub1, ref1, conins1, consub1), \
            (pos2, ins2, sub2, ref2, conins2, consub2) in \
                itertools.izip(read_file(filename1), read_file(filename2)):
            assert pos1 == pos2 and ref1 == ref2
            if pos1 % 1000 == 0:
                grace.status('Testing %s %d' % (sequence_name, pos1))

            dec_ins1 = io.decode_evidence(ins1)
            dec_ins2 = io.decode_evidence(ins2)
            if dec_ins1 and dec_ins2:
                sig = significance(dec_ins1, dec_ins2, cutoff)
                if sig is not None and sig <= cutoff:
                    print >> f, '%s\t%d\t%s\t\t%s\t%s\t%g\t%s\t%s' % (
                        sequence_name, pos1, 'insertion-before',
                        ins1, ins2, sig, conins1, conins2)
                    f.flush()

            dec_sub1 = io.decode_evidence(sub1)
            dec_sub2 = io.decode_evidence(sub2)
            if dec_sub1 and dec_sub2:
                sig = significance(dec_sub1, dec_sub2, cutoff)
                if sig is not None and sig <= cutoff:
                    if dec_sub1[0][0] == '-' or dec_sub2[0][0] == '-':
                        what = 'deletion'
                    elif dec_sub1[0][0] != dec_sub2[0][0]:
                        what = 'substitution'
                    else:
                        what = 'different mix'
                    print >> f, '%s\t%d\t%s\t%s\t%s\t%s\t%g\t%s\t%s' % (
                        sequence_name, pos1, what, ref1,
                        sub1, sub2, sig, consub1, consub2)
                    f.flush()

    f.close()

    grace.status('')
    return 0
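# A minimal, self-contained sketch of how the depth cutoff `n` above is chosen:
# keep increasing n until a perfectly split pile of calls (n of one base in the
# first sample, n of another in the second) would pass the significance cutoff.
# scipy's Fisher exact test is used here purely as a stand-in for nesoni's
# significance() on decoded evidence such as [('A', n)] -- an assumption, not
# necessarily the exact test the tool uses.
from scipy.stats import fisher_exact

def min_depth_for_call(cutoff):
    n = 1
    while True:
        _, p = fisher_exact([[n, 0], [0, n]])   # sample 1: n x 'A', sample 2: n x 'T'
        if p <= cutoff:
            return n
        n += 1

# e.g. min_depth_for_call(0.05) -> 4 with a two-sided Fisher exact test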
def nway_main(gbk_filename, use_indels, use_reference, give_evidence, give_consequences,
              require_all, require_bisect, full_output, format,
              working_dirs, split_a, split_b, f=sys.stdout):
    assert working_dirs, 'Need at least one working directory.'

    workspaces = [ working_directory.Working(dirname, must_exist=True) for dirname in working_dirs ]
    reference = workspaces[0].get_reference()
    #if not annotation_filename:
    #    annotation_filename = reference.annotations_filename() #May still be None

    if use_reference:
        names = ['reference']
        evidence_start = 1
    else:
        names = [ ]
        evidence_start = 0

    names.extend(norm_name(item) for item in working_dirs)

    references = io.read_sequences(reference.reference_fasta_filename())

    annotations = { }
    if gbk_filename:
        from Bio import SeqIO
        for record in SeqIO.parse(io.open_possibly_compressed_file(gbk_filename), 'genbank'):
            sequence = record.seq.tostring()
            features = [ item for item in record.features if item.type != 'source' ]
            features.sort(key=lambda item: item.location.nofuzzy_start)
            annotations[sequence] = features

    iterator = reader(working_dirs, references, use_reference, annotations)

    if not use_indels:
        iterator = itertools.ifilter(has_no_indels, iterator)

    if require_all or require_bisect or format == 'counts':
        iterator = itertools.ifilter(fully_unambiguous, iterator)

    if require_bisect:
        iterator = itertools.ifilter(is_binary_partition, iterator)

    if not require_bisect:
        if full_output:
            iterator = itertools.ifilter(not_boring_insertion, iterator)
        else:
            iterator = itertools.ifilter(is_interesting, iterator)

    if split_a or split_b:
        assert len(names) == len(set(names)), 'Two samples with the same name'
        try:
            split_a = [ names.index(norm_name(item)) for item in split_a ]
            split_b = [ names.index(norm_name(item)) for item in split_b ]
        except ValueError:
            raise grace.Error('Sample to be split is not amongst samples given')
        iterator = itertools.ifilter(is_split(split_a, split_b), iterator)

    #if limit:
    #    iterator = itertools.islice(iterator, limit)

    if format == 'table':
        line = 'Reference\tPosition\tChange type'
        line += '\t' + '\t'.join(names)
        if give_evidence:
            line += '\t' + '\t'.join(names[evidence_start:])
        if give_consequences:
            line += '\t' + '\t'.join(names[evidence_start:])
        if annotations:
            line += '\tAnnotations'
        print >> f, line

        for calls in iterator:
            line = '%s\t%d\t%s\t%s' % (
                calls.ref_name,
                calls.ref_pos + 1,
                change_type(calls),
                '\t'.join(item.consensus for item in calls.calls))
            if give_evidence:
                line += '\t' + '\t'.join(item.evidence for item in calls.calls[evidence_start:])
            if give_consequences:
                line += '\t' + '\t'.join(item.consequences for item in calls.calls[evidence_start:])
            if annotations:
                line += '\t' + describe_features(calls.features)
            print >> f, line

    elif format == 'compact':
        for line in transpose_strings(names):
            print >> f, line
        print >> f

        for calls in iterator:
            if calls.is_insertion:
                footer = '%12d.5 %s' % (calls.ref_pos, calls.ref_name)
            else:
                footer = '%12d   %s' % (calls.ref_pos + 1, calls.ref_name)

            t = transpose_strings([ item.consensus for item in calls.calls ], '-', 1)
            top = t[0] + ' ' + footer
            if give_consequences:
                consequences = [ ]
                for call in calls.calls:
                    if call.consequences:
                        for item in call.consequences.split(', '):
                            item = ' '.join(item.split()[:3])
                            if item not in consequences:
                                consequences.append(item)
                if consequences:
                    top += ' ' + ' / '.join(sorted(consequences))
            top += ' ' + describe_features(calls.features)
            print >> f, top
            for line in t[1:]:
                print >> f, line

    elif format == 'nexus':
        buckets = [ [ ] for name in names ]
        for calls in iterator:
            for i, char in enumerate(partition_string(calls)):
                buckets[i].append(char)

        print >> f, '#NEXUS'
        print >> f, 'begin taxa;'
        print >> f, 'dimensions ntax=%d;' % len(names)
        print >> f, 'taxlabels'
        for name in names:
            print >> f, name
        print >> f, ';'
        print >> f, 'end;'

        print >> f, 'begin characters;'
        print >> f, 'dimensions nchar=%d;' % len(buckets[0])
        print >> f, 'format datatype=STANDARD symbols="ACGT-0123456789" missing=N;'
        print >> f, 'matrix'
        for name, bucket in itertools.izip(names, buckets):
            print >> f, name, ''.join(bucket)
        print >> f, ';'
        print >> f, 'end;'

    elif format == 'counts':
        for line in transpose_strings(names):
            print >> f, line
        print >> f

        counts = { }
        for calls in iterator:
            count_str = partition_string(calls)
            if count_str not in counts:
                counts[count_str] = 1
            else:
                counts[count_str] += 1

        for count_str in sorted(counts, key=lambda x: (counts[x], x), reverse=True):
            print >> f, '%s %d' % (transpose_strings(count_str)[0], counts[count_str])

    else:
        raise grace.Error('Unknown output format: ' + format)
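# A minimal sketch of the NEXUS output produced by the 'nexus' branch above:
# given per-sample call strings of equal length, emit a taxa block followed by
# a characters block. write_nexus() and its arguments are illustrative names,
# not part of nesoni.
def write_nexus(f, samples):
    # samples: list of (name, call_string) pairs, all strings the same length
    nchar = len(samples[0][1])
    f.write('#NEXUS\n')
    f.write('begin taxa;\ndimensions ntax=%d;\ntaxlabels\n' % len(samples))
    for name, _ in samples:
        f.write(name + '\n')
    f.write(';\nend;\n')
    f.write('begin characters;\ndimensions nchar=%d;\n' % nchar)
    f.write('format datatype=STANDARD symbols="ACGT-0123456789" missing=N;\n')
    f.write('matrix\n')
    for name, calls in samples:
        f.write('%s %s\n' % (name, calls))
    f.write(';\nend;\n')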
def run(self):
    #mincov, args = grace.get_option_value(args, '--mincov', int, 1)
    #maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16)
    #minsize, args = grace.get_option_value(args, '--minsize', int, 200)
    #what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core')
    #is_core = (what == 'core')
    #
    #grace.expect_no_further_options(args)
    #
    #if len(args) < 2:
    #    print >> sys.stderr, HELP
    #    raise grace.Help_shown()
    #
    #output_dir, working_dirs = args[0], args[1:]
    #
    ##assert not path.exists(path.join(output_dir, 'reference.fa')), \
    #assert not path.exists(path.join(output_dir, 'parameters')), \
    #    'Output directory not given'
    #
    #if not path.exists(output_dir):
    #    os.mkdir(output_dir)

    assert self.what in ('core', 'unique'), 'Expected --what to be either "core" or "unique".'
    is_core = (self.what == 'core')

    workspace = self.get_workspace()

    for name, seq in io.read_sequences(
            working_directory.Working(self.working_dirs[0]).get_reference().reference_fasta_filename()):
        self.log.log(name + '\n')
        friendly_name = grace.filesystem_friendly_name(name)

        good = [ True ] * len(seq)

        for working_dir in self.working_dirs:
            if is_core:
                suffix = '-depth.userplot'
            else:
                suffix = '-ambiguous-depth.userplot'
            data = trivia.read_unstranded_userplot(
                os.path.join(working_dir, friendly_name + suffix))
            assert len(seq) == len(data)
            for i in xrange(len(seq)):
                if good[i]:
                    if is_core:
                        good[i] = data[i] >= self.mincov
                    else:
                        good[i] = data[i] < self.mincov

        #Close holes
        start = -self.maxdiff - 1
        n_holes = 0
        for i in xrange(len(seq)):
            if good[i]:
                if 0 < i - start <= self.maxdiff:
                    for j in xrange(start, i):
                        good[j] = True
                    n_holes += 1
                start = i + 1
        self.log.log('Closed ' + grace.pretty_number(n_holes) + ' holes\n')

        f = open(workspace / ('%s-%s.fa' % (friendly_name, self.what)), 'wb')
        io.write_fasta(
            f, name,
            ''.join([ (seq[i] if good[i] else 'N') for i in xrange(len(seq)) ]))
        f.close()

        f = open(workspace / ('%s-%s_masked.fa' % (friendly_name, self.what)), 'wb')
        io.write_fasta(
            f, name,
            ''.join([ (seq[i] if good[i] else seq[i].lower()) for i in xrange(len(seq)) ]))
        f.close()

        f_good = open(workspace / ('%s-%s_parts.fa' % (friendly_name, self.what)), 'wb')
        f_nongood = open(workspace / ('%s-non%s_parts.fa' % (friendly_name, self.what)), 'wb')
        start = 0
        n_good = [ 0 ]
        n_good_bases = [ 0 ]
        def emit(i):
            if i - start < self.minsize:
                return
            if good[start]:
                n_good[0] += 1
                n_good_bases[0] += i - start
            io.write_fasta(
                f_good if good[start] else f_nongood,
                '%s:%d..%d' % (name, start + 1, i),
                seq[start:i])
        for i in xrange(1, len(seq)):
            if good[i] != good[start]:
                emit(i)
                start = i
        emit(len(seq))
        f_nongood.close()
        f_good.close()

        self.log.log(
            grace.pretty_number(sum(good)) + ' bases are ' + self.what +
            ', of ' + grace.pretty_number(len(seq)) + ' in reference sequence\n')
        self.log.log(
            grace.pretty_number(n_good[0]) + ' parts at least ' +
            grace.pretty_number(self.minsize) + ' bases long with ' +
            grace.pretty_number(n_good_bases[0]) + ' total bases\n')
        self.log.log('\n')
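# A self-contained sketch of the hole-closing step above: any run of "bad"
# positions no longer than maxdiff that is flanked by good positions is
# flipped to good. close_holes() is an illustrative name, not nesoni API.
def close_holes(good, maxdiff):
    good = list(good)
    start = -maxdiff - 1            # start of the current run of bad positions
    for i in xrange(len(good)):
        if good[i]:
            if 0 < i - start <= maxdiff:
                for j in xrange(start, i):
                    good[j] = True
            start = i + 1
    return good

# close_holes([True, False, False, True], maxdiff=2)
# -> [True, True, True, True]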
def run(self):
    bams = [ ]
    reference = None
    reference2 = None

    extra = [ ]

    for sample in self.samples:
        if sam.is_bam(sample):
            bams.append(sample)
        elif os.path.isdir(sample):
            working = working_directory.Working(sample, True)
            bams.append(working.get_filtered_sorted_bam())
            extra.append('##sampleTags=' + ','.join(working.get_tags()))
            if reference2 is None:
                reference2 = working.get_reference().reference_fasta_filename()
        elif io.is_sequence_file(sample):
            assert reference is None, 'Only one reference FASTA file allowed.'
            reference = sample

    if reference is None:
        reference = reference2
    if reference is None:
        raise grace.Error('No reference FASTA file given.')

    with nesoni.Stage() as stage:
        tempspace = stage.enter(workspace.tempspace())

        if self.depth_limit:
            with nesoni.Stage() as stage2:
                for i in xrange(len(bams)):
                    sam.Bam_depth_limit(
                        tempspace / ('%d' % i), bams[i],
                        depth=self.depth_limit).process_make(stage2)
                    bams[i] = tempspace / ('%d.bam' % i)

        # FreeBayes claims to handle multiple bams, but it doesn't actually work
        if len(bams) > 1:
            sam.Bam_merge(tempspace / 'merged', bams=bams, index=False).run()
            bams = [ tempspace / 'merged.bam' ]

        command = [
            'freebayes',
            '-f', reference,
            '--ploidy', str(self.ploidy),
            '--pvar', str(self.pvar),
            ] + self.freebayes_options + bams

        self.log.log('Running: ' + ' '.join(command) + '\n')

        f_out = stage.enter(open(self.prefix + '.vcf', 'wb'))
        f_in = stage.enter(io.pipe_from(command))
        done_extra = False
        for line in f_in:
            if not done_extra and not line.startswith('##'):
                for extra_line in extra:
                    f_out.write(extra_line + '\n')
                done_extra = True
            f_out.write(line)

    index_vcf(self.prefix + '.vcf')
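# A hedged sketch of the header-injection pattern used above: copy a VCF
# stream to a file, inserting extra "##..." lines just before the first
# non-## line (the #CHROM header).  subprocess is used here instead of
# nesoni's io.pipe_from, and inject_vcf_headers is an illustrative name.
# Written for Python 2, matching the surrounding code.
import subprocess

def inject_vcf_headers(command, out_filename, extra_lines):
    proc = subprocess.Popen(command, stdout=subprocess.PIPE)
    done_extra = False
    with open(out_filename, 'wb') as f_out:
        for line in proc.stdout:
            if not done_extra and not line.startswith('##'):
                for extra in extra_lines:
                    f_out.write(extra + '\n')
                done_extra = True
            f_out.write(line)
    assert proc.wait() == 0, 'Command failed: %r' % command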
def count_run(min_score, min_size, max_size, filter_mode, equalize, types, locii,
              qualifiers, use_strand, merge_filename, limit, output_prefix, filenames, log):
    if filter_mode == 'poly':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
        use_only_monogamous = False
        expect_multiple_alignments = True
    elif filter_mode == 'mono':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
        use_only_monogamous = True
        expect_multiple_alignments = True
    else:
        assert filter_mode == 'existing', 'Unrecognized filtering mode'
        use_bam_filename = 'alignments_filtered.bam'
        use_only_top = False
        use_only_monogamous = False
        expect_multiple_alignments = False

    types = types.lower().split(',')

    qualifiers = qualifiers.split(',')

    if locii:
        locii = locii.lower().split(',')
    else:
        locii = None

    assert use_strand is not None, 'You must now explicitly specify --strand'
    assert use_strand in ('pool', 'forward', 'reverse', 'both'), \
        "Can't understand --strand specification."

    from Bio import Seq, SeqIO

    annotation_filenames = [ ]
    bam_filenames = [ ]
    for arg in filenames:
        if annotation.is_annotation_file(arg):
            annotation_filenames.append(arg)
        else:
            bam_filenames.append(arg)

    n_samples = len(bam_filenames)
    titles = bam_filenames[:]
    for i in xrange(len(bam_filenames)):
        if os.path.isdir(bam_filenames[i]):
            titles[i] = os.path.basename(bam_filenames[i])
            if not annotation_filenames:
                working = working_directory.Working(bam_filenames[i])
                reference_filename = working.get_reference().annotations_filename()
                if reference_filename is not None:
                    annotation_filenames.append(reference_filename)
            bam_filenames[i] = os.path.join(bam_filenames[i], use_bam_filename)

    assert bam_filenames, 'No reference alignments given'

    merge = { }
    merge_qualifiers = { }
    if merge_filename is not None:
        #First line gives qualifiers
        #remaining lines give <qualifier> <qualifier...> <gene> <transcript> <transcript...>
        f = open(merge_filename, 'rU')
        qualifiers = f.readline().rstrip('\n').split('\t')
        for line in f:
            parts = line.rstrip('\n').split('\t')
            if not parts:
                continue
            for name in parts[len(qualifiers) + 1:]:
                assert name not in merge, 'Duplicate feature name in merge file'
                merge[name] = parts[len(qualifiers)]
                merge_qualifiers[name] = parts[:len(qualifiers)]
        f.close()

    genes = { }           # reference name -> gene index
    feature_names = { }   # feature_name -> number of occurrences
    features = [ ]
    n_features = 0

    chromosome_length = { }
    for filename in bam_filenames:
        headers = sam.bam_headers(filename)
        for line in headers.split('\n'):
            if not line:
                continue
            parts = line.split('\t')
            if parts[0] != '@SQ':
                continue
            name = None
            length = None
            for part in parts[1:]:
                if part.startswith('SN:'):
                    name = part[3:]
                if part.startswith('LN:'):
                    length = int(part[3:])
            assert name is not None and length is not None
            if name in chromosome_length:
                assert chromosome_length[name] == length
            else:
                chromosome_length[name] = length

    for name in chromosome_length:
        genes[name] = span_index.Span_index()

    if annotation_filenames:
        assert not merge, 'Merging not supported with annotation files'

        for filename in annotation_filenames:
            for feature in annotation.read_annotations(filename):
                if feature.type.lower() not in types:
                    continue

                if (locii is not None and
                        ('locus_tag' not in feature.attr or
                         feature.attr['locus_tag'].lower() not in locii)):
                    continue

                f = Feature(n_samples)
                f.name = feature.get_id()
                if feature.type.lower() != 'cds' and len(types) > 1:
                    f.name = feature.type + ':' + f.name

                feature_names[f.name] = feature_names.get(f.name, 0) + 1
                if feature_names[f.name] > 1:
                    f.name += '/%d' % feature_names[f.name]

                f.qualifiers = [ feature.attr.get(item, '') for item in qualifiers ]
                f.length = feature.end - feature.start

                assert feature.seqid in genes, 'Annotation for sequence that is not in BAM files'
                genes[feature.seqid].insert(
                    Span_entry(feature.start, feature.end, feature.strand or 1, f))
                features.append(f)

    else:
        # Sequences as features
        log.log('No annotation files given or found, using sequences as features\n')

        name_feature = { }   # (merged)name -> feature

        for name in chromosome_length:
            merged_name = merge.get(name, name)

            if merged_name not in name_feature:
                f = Feature(n_samples)
                f.name = merged_name
                f.length = chromosome_length[name]   # feature length is the sequence length
                f.qualifiers = merge_qualifiers.get(name, ('',) * len(qualifiers))
                n_features += 1
                name_feature[merged_name] = f
                features.append(f)
            else:
                f = name_feature[merged_name]
                f.length = max(f.length, chromosome_length[name])   #...

            genes[name].insert(Span_entry(0, chromosome_length[name], 1, f))

    log.log('%d features\n\n' % len(features))

    for name in genes:
        genes[name].prepare()

    n_fragments = [ 0 ] * n_samples
    n_fragments_aligned = [ 0 ] * n_samples
    n_low_score = [ 0 ] * n_samples
    n_something = [ 0 ] * n_samples
    n_multiple = [ 0 ] * n_samples
    n_span = [ 0 ] * n_samples

    for i in xrange(n_samples):
        for read_name, fragment_alignments, unmapped in sam.bam_iter_fragments(
                bam_filenames[i], 'Counting sample %d of %d' % (i + 1, n_samples)):
            n_fragments[i] += 1

            if not fragment_alignments:
                continue

            n_fragments_aligned[i] += 1

            feature_hits = [ ]   # [ [ (feature, strand) ] ]

            # Use only top scoring alignments
            fragment_scores = [ sum(al.get_AS() for al in item) for item in fragment_alignments ]

            best_score = max(fragment_scores)

            if min_score is not None and best_score < min_score:
                n_low_score[i] += 1
                continue

            if use_only_top:
                cutoff = max(best_score, min_score)
            else:
                cutoff = min_score
            fragment_alignments = [
                item
                for item, score in zip(fragment_alignments, fragment_scores)
                if score >= cutoff ]

            for alignments in fragment_alignments:
                strand = -1 if alignments[0].flag & sam.FLAG_REVERSE else 1

                start = min(item.pos - 1 for item in alignments)
                end = max(item.pos + item.length - 1 for item in alignments)
                length = end - start
                if min_size is not None and length < min_size:
                    continue
                if max_size is not None and length > max_size:
                    continue

                rname = alignments[0].rname
                strand = -1 if alignments[0].flag & sam.FLAG_REVERSE else 1
                assert alignments[0].rname in genes, \
                    'Alignment refers to sequence not present in GENBANK file'

                this_feature_hits = [ ]
                for item in genes[rname].get(start, end):
                    rel_strand = strand * item.strand
                    key = (item.feature, rel_strand)
                    if key in this_feature_hits:
                        continue
                    this_feature_hits.append(key)
                    if not use_only_monogamous or len(fragment_alignments) == 1:
                        item.feature.count[rel_strand][i] += 1

                if this_feature_hits:
                    feature_hits.append(this_feature_hits)

                if len(this_feature_hits) > 1:
                    for a in this_feature_hits:
                        for b in this_feature_hits:
                            if a[0] is b[0]:
                                continue
                            a[0].common[(a[1], b[1])][b[0]] += 1

            if len(feature_hits) > 0:
                n_something[i] += 1
            #else:
            #    print fragment_alignments
            #    print genes[fragment_alignments[0][0].rname].indexes
            #    print

            if len(feature_hits) > 1:
                n_multiple[i] += 1
                for j in xrange(len(feature_hits)):
                    for k in xrange(len(feature_hits)):
                        if j == k:
                            continue
                        for a in feature_hits[j]:
                            for b in feature_hits[k]:
                                if a[0] is b[0]:
                                    continue
                                a[0].ambiguous[(a[1], b[1])][b[0]] += 1

            if any(len(item) > 1 for item in feature_hits):
                n_span[i] += 1

            if limit is not None and n_fragments[i] >= limit:
                break

        grace.status('')

        #log.log('%s\n' % titles[i])
        #log.log('%20s fragments\n' % grace.pretty_number(n_fragments[i]))
        #log.log('%20s fragments aligned to the reference\n' % grace.pretty_number(n_fragments_aligned[i]))
        #if n_low_score[i]:
        #    log.log('%20s had too low an alignment score, discarded\n' % grace.pretty_number(n_low_score[i]))
        #log.log('%20s aligned to an annotated gene\n' % grace.pretty_number(n_something[i]))
        #if expect_multiple_alignments or n_multiple[i]:
        #    log.log('%20s aligned to multiple genes\n' % grace.pretty_number(n_multiple[i]))
        #log.log('%20s had an alignment that spanned multiple genes\n' % grace.pretty_number(n_span[i]))
        #log.log('\n')

        log.datum(titles[i], 'fragments', n_fragments[i])
        log.datum(titles[i], 'fragments aligned to the reference', n_fragments_aligned[i])
        if n_low_score[i]:
            log.datum(titles[i], 'had too low an alignment score, discarded', n_low_score[i])
        log.datum(titles[i], 'aligned to an annotated gene', n_something[i])
        if expect_multiple_alignments or n_multiple[i]:
            log.datum(titles[i], 'aligned to multiple genes', n_multiple[i])
        log.datum(titles[i], 'had an alignment that spanned multiple genes', n_span[i])
        log.log('\n')

    strandedness = [ ]
    for feature in features:
        n_forward = sum(feature.count[1])
        n_reverse = sum(feature.count[-1])
        if n_forward + n_reverse < 5:
            continue
        strandedness.append((n_forward - n_reverse) * 100.0 / (n_forward + n_reverse))
    strandedness = sum(strandedness) / len(strandedness)
    log.log('Strand specificity: %.0f%%\n'
            ' (~ -100%% reverse strand, ~ 0%% non-specific, ~ 100%% forward strand\n'
            ' Average over all features with at least 5 hits.)\n'
            % strandedness)

    # Each getter maps a Feature to (name, per-sample counts,
    # "on same fragment" counts, "ambiguous alignment" counts)
    # for the chosen strand handling.
    if use_strand == 'pool':
        getters = [
            lambda f: (f.name,
                       add_lists(f.count[1], f.count[-1]),
                       add_defdicts(f.common[(1, 1)], f.common[(1, -1)],
                                    f.common[(-1, 1)], f.common[(-1, -1)]),
                       add_defdicts(f.ambiguous[(1, 1)], f.ambiguous[(1, -1)],
                                    f.ambiguous[(-1, 1)], f.ambiguous[(-1, -1)]))
            ]
    elif use_strand == 'forward':
        getters = [
            lambda f: (f.name, f.count[1], f.common[(1, 1)], f.ambiguous[(1, 1)])
            ]
    elif use_strand == 'reverse':
        getters = [
            lambda f: (f.name, f.count[-1], f.common[(-1, -1)], f.ambiguous[(-1, -1)])
            ]
    elif use_strand == 'both':
        getters = [
            lambda f: (f.name, f.count[1], f.common[(1, 1)], f.ambiguous[(1, 1)]),
            lambda f: (f.name + 'r', f.count[-1], f.common[(-1, -1)], f.ambiguous[(-1, -1)])
            ]

    total_hits = [ 0 ] * n_samples
    for feature in features:
        for getter in getters:
            total_hits = add_lists(total_hits, getter(feature)[1])

    if equalize:
        min_hits = min(total_hits)
        p = [ float(min_hits) / item for item in total_hits ]
        total_hits = [ min_hits ] * n_samples

    f = open(output_prefix + '.txt', 'wb')
    #log.attach(open(output_prefix + '_log.txt', 'wb'))

    print >> f, tab_encode(
        [ 'Feature' ] +
        titles +
        [ 'RPKM ' + item for item in titles ] +
        [ 'Length' ] +
        qualifiers +
        [ 'On same fragment' ] +
        ([ 'Ambiguous alignment' ] if expect_multiple_alignments else [ ]))

    for feature in features:
        for getter in getters:
            feature_name, count, common, ambiguous = getter(feature)
            if equalize:
                count = [ subsample(count[i], p[i]) for i in xrange(n_samples) ]

            rpkm = [ count[i] * 1.0e9 / feature.length / total_hits[i] for i in xrange(n_samples) ]

            common_str = ' '.join(
                '%dx%s' % (item[1], item[0])
                for item in sorted(common.items(), key=lambda item: item[1], reverse=True))
            ambiguous_str = ' '.join(
                '%dx%s' % (item[1], item[0])
                for item in sorted(ambiguous.items(), key=lambda item: item[1], reverse=True))

            print >> f, tab_encode(
                [ feature_name ] +
                [ str(item) for item in count ] +
                [ '%.2f' % item for item in rpkm ] +
                [ str(feature.length) ] +
                list(feature.qualifiers) +
                [ common_str ] +
                ([ ambiguous_str ] if expect_multiple_alignments else [ ]))

    f.close()
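# The RPKM figure written above follows the usual definition: fragments counted
# for the feature, scaled per kilobase of feature length and per million aligned
# fragments in that sample.  A one-line restatement (illustrative only):
def rpkm(count, feature_length, total_hits):
    return count * 1.0e9 / feature_length / total_hits

# e.g. rpkm(150, 1500, 2000000) == 50.0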
def run(self):
    assert self.reads, 'No read files given.'
    colorspace = [ io.is_colorspace(item) for item in self.reads ]
    assert len(set(colorspace)) == 1, 'Mixture of colorspace and basespace reads is not currently supported.'
    colorspace = colorspace[0]

    #polya_dir = self.get_polya_dir()

    working = working_directory.Working(self.output_dir, must_exist=False)
    working.set_reference(self.reference)
    reference = working.get_reference()

    #polya_working = working_directory.Working(polya_dir, must_exist=False)
    #polya_working.set_reference(self.reference)

    clipped_prefix = working/'clipped_reads'
    clipped_filename = clipped_prefix + ('.csfastq.gz' if colorspace else '.fastq.gz')

    raw_filename = working/'alignments_raw.sam.gz'
    extended_filename = working/'alignments_extended.sam.gz'

    #polya_filename = working/'alignments_filtered_polyA.sam.gz'

    if colorspace:
        self.clip_runs_colorspace(
            filenames=self.reads,
            prefix=clipped_prefix,
            sample=working.name,
            ).make()
    else:
        self.clip_runs_basespace(
            filenames=self.reads,
            prefix=clipped_prefix,
            sample=working.name,
            ).make()

    cores = min(nesoni.coordinator().get_cores(), 8)

    if colorspace:
        nesoni.Execute(
            command=reference.shrimp_command(cs=colorspace, parameters=[ clipped_filename ]) +
                [ '--qv-offset', '33' ],
            execution_options=[ '-N', str(cores) ],
            output=raw_filename,
            cores=cores,
            prefix=working/'run_alignment',
            ).make()
    else:
        nesoni.Execute(
            command=[
                'bowtie2',
                '--rg-id', '1',
                '--rg', 'SM:' + working.name,
                '--sensitive-local',
                '-k', '10',   #Up to 10 alignments per read
                '-x', reference.get_bowtie_index_prefix(),
                '-U', clipped_filename,
                ],
            execution_options=[ '--threads', str(cores) ],
            output=raw_filename,
            cores=cores,
            prefix=working/'run_alignment',
            ).make()

    if colorspace:
        extend_sam.Extend_sam_colorspace(
            input=raw_filename,
            output=extended_filename,
            reads=self.reads,
            reference_filenames=[ reference.reference_fasta_filename() ],
            ).make()
    else:
        extend_sam.Extend_sam_basespace(
            input=raw_filename,
            output=extended_filename,
            clips=[ clipped_prefix + '.clips.gz' ],
            reference_filenames=[ reference.reference_fasta_filename() ],
            prop_a=self.extension_prop_a,
            ).make()

    nesoni.Import(
        input=extended_filename,
        output_dir=self.output_dir,
        reference=[ self.reference ],
        ).make()

    self.get_filter_action().make()

    #Tail_only(
    #    input=working/'alignments_filtered.bam',
    #    output=polya_filename,
    #    ).make()

    #nesoni.Import(
    #    input=polya_filename,
    #    output_dir=polya_dir,
    #    reference=[ self.reference ],
    #    ).make()

    # This shouldn't actually filter out any alignments.
    # We do it to produce depth of coverage plots
    # and position-sorted BAM files.
    #self.get_polya_filter_action().make()

    nesoni.Tag(self.output_dir, tags=self.tags).make()
    #nesoni.Tag(polya_dir, tags=self.tags).make()

    if self.delete_files:
        # Delete unneeded files
        os.unlink(clipped_prefix + '.state')
        os.unlink(clipped_filename)
        os.unlink(working/'alignments.bam')
        os.unlink(working/'alignments_filtered.bam')
        os.unlink(working/'run_alignment.state')
        os.unlink(raw_filename)
        os.unlink(extended_filename)
def run(self):
    assert self.extension is not None, '--extension must be specified'

    # Also allow simply the analyse-polya-batch directory
    working_dirs = [ ]
    for item in self.working_dirs:
        state_filename = os.path.join(item, 'analyse-polya-batch.state')
        if not os.path.exists(state_filename):
            working_dirs.append(item)
        else:
            with open(state_filename, 'rb') as f:
                state = pickle.load(f)
            for sample in state.samples:
                working_dirs.append(os.path.join(item, 'samples', sample.output_dir))

    work = self.get_workspace()

    if self.reuse:
        pickle_workspace = workspace.Workspace(os.path.join(self.reuse, 'pickles'))
    else:
        pickle_workspace = workspace.Workspace(work / 'pickles')
    plot_workspace = workspace.Workspace(work / 'plots')

    pickle_filenames = [ ]

    file_prefix = self.file_prefix
    if file_prefix and not file_prefix.endswith('-'):
        file_prefix += '-'

    with nesoni.Stage() as stage:
        for dir in working_dirs:
            working = working_directory.Working(dir, must_exist=True)
            pickle_filenames.append(pickle_workspace / working.name + '.pickle.gz')
            if self.reuse:
                continue
            Tail_count(
                pickle_workspace / working.name,
                working_dir=dir,
                annotations=self.annotations,
                types=self.types,
                parts=self.parts,
                extension=self.extension,
                ).process_make(stage)

    assert len(set(pickle_filenames)) == len(pickle_filenames), "Duplicate sample name."

    with nesoni.Stage() as stage:
        Aggregate_tail_counts(
            output_dir=self.output_dir,
            pickles=pickle_filenames,
            tail=self.tail,
            adaptor=self.adaptor,
            ).process_make(stage)

    nesoni.Norm_from_counts(
        prefix=work / 'norm',
        counts_filename=work / 'counts.csv',
        ).make()

    similarity = nesoni.Similarity(
        prefix=plot_workspace / 'similarity',
        counts=work / 'counts.csv',
        )

    plot_pooleds = [
        Plot_pooled(
            prefix=plot_workspace / 'pooled-heatmap',
            aggregate=self.output_dir,
            #min_tails = min_tails,
            min_tails=1,
            top=100,
            )
        #for min_tails in (20,50,100,200,500,1000,2000)
        ]

    #plot_comparisons = [
    #    Plot_comparison(
    #        prefix = plot_workspace/('comparison-min-tails-%d-min-span-%.1f' % (min_tails,min_span)),
    #        aggregate = self.output_dir,
    #        min_tails = min_tails,
    #        min_span = min_span,
    #        )
    #    for min_tails in [50,100,200,500]
    #    for min_span in [2,4,8,10,15,20,25,30]
    #    ]

    heatmaps = [
        nesoni.Heatmap(
            prefix=plot_workspace / ('heatmap-min-fold-%.1f' % fold),
            counts=work / 'counts.csv',
            norm_file=work / 'norm.csv',
            min_span=math.log(fold) / math.log(2.0),
            )
        for fold in [ 1.5, 2.0, 4.0, 6.0, 8.0, 10.0, 20.0, 30.0, 40.0 ]
        ]

    with nesoni.Stage() as stage:
        similarity.process_make(stage)
        for action in plot_pooleds + heatmaps:   #+ plot_comparisons:
            action.process_make(stage)

    r = reporting.Reporter(
        work / 'report',
        self.title,
        file_prefix,
        style=web.style(),
        )

    similarity.report(r)

    r.heading('Poly(A) tail length distribution')

    r.p('This plot shows the distribution of lengths of poly(A) tail sequence in top expressed features. '
        'Its main purpose is to assess data quality. '
        'If the plot has many bright spots there may be many identical reads, possibly due to non-random digestion.')

    r.p('Only reads with a poly(A) sequence of four or more bases are used.')

    for heatmap in plot_pooleds:
        r.report_heatmap(heatmap)

    r.heading('Heatmaps')

    r.p('Genes were selected based '
        'on there being at least some fold change difference between '
        'some pair of samples.')

    for heatmap in heatmaps:
        r.report_heatmap(heatmap)

    #r.heading('Average poly(A) tail length and its relation to expression levels')
    #
    #r.p(
    #    'Only reads with a poly(A) sequence of four or more bases was included in the averages.'
    #    )
    #
    #r.p(
    #    'Genes were selected based on there being at least a certain number of reads with poly(A) sequence in <i>each</i> sample (min-tails), '
    #    'and on there being at least some amount of difference in average tail length between samples (min-span).'
    #    )
    #
    #for heatmap in plot_comparisons:
    #    r.report_heatmap(heatmap)

    r.close()
def run(self):
    assert self.extension is not None, '--extension must be specified'

    #workspace = self.get_workspace()
    workspace = working_directory.Working(self.working_dir, must_exist=True)
    if self.annotations is None:
        reference = workspace.get_reference()
        annotations_filename = reference.annotations_filename()
    else:
        annotations_filename = self.annotations

    types = [ item.lower() for item in self.types.split(',') ]

    parts = self.parts or self.types
    parts = [ item.lower() for item in parts.split(',') ]

    all_annotations = list(annotation.read_annotations(annotations_filename))
    annotation.link_up_annotations(all_annotations)
    for item in all_annotations:
        item.primary = None

    annotations = [
        item
        for item in all_annotations
        if item.type.lower() in types ]

    part_annotations = [ ]
    seen = set()
    queue = [ (item, item) for item in annotations ]
    while queue:
        primary, item = queue.pop()
        if item.type.lower() in parts:
            assert item.primary is None, "Feature with multiple parents"
            item.primary = primary
            key = (id(primary), item.start, item.end, item.seqid, item.strand)
            # Ignore duplicate exons (many isoforms will have the same exons)
            if key not in seen:
                seen.add(key)
                part_annotations.append(item)
        queue.extend((primary, item2) for item2 in item.children)

    del seen
    del all_annotations

    self.log.log('%d annotations\n' % len(annotations))
    self.log.log('%d part annotations\n' % len(part_annotations))

    #assert annotations, 'No annotations of specified types in file'

    for item in part_annotations:
        this_extension = self.extension
        if "max_extension" in item.attr:
            this_extension = min(this_extension, int(item.attr["max_extension"]))

        if item.strand >= 0:
            item.tail_pos = item.end
            item.end += this_extension
        else:
            item.tail_pos = item.start
            item.start -= this_extension

    for item in annotations:
        item.hits = [ ]   # [ (tail_length, adaptor_bases) ]

    index = span_index.index_annotations(part_annotations)

    for alignment in sam.Bam_reader(workspace / 'alignments_filtered_sorted.bam'):
        if alignment.is_unmapped or alignment.is_secondary or alignment.is_supplementary:
            continue

        start = alignment.reference_start
        end = alignment.reference_end
        alignment_length = end - start
        strand = -1 if alignment.flag & sam.FLAG_REVERSE else 1
        fragment_feature = annotation.Annotation(
            seqid=alignment.reference_name,
            start=start,
            end=end,
            strand=strand)

        if strand >= 0:
            tail_pos = end
        else:
            tail_pos = start

        tail_length = 0
        adaptor_bases = 0
        for item in alignment.extra:
            if item.startswith('AN:i:'):
                tail_length = int(item[5:])
            elif item.startswith('AD:i:'):
                adaptor_bases = int(item[5:])

        hits = index.get(fragment_feature, same_strand=True)
        if hits:
            gene = min(hits,
                key=lambda gene: (abs(tail_pos - gene.tail_pos), gene.primary.get_id()))
            # Nearest by tail_pos
            # failing that, by id to ensure a deterministic choice
            gene.primary.hits.append((tail_length, adaptor_bases))

    for item in annotations:
        del item.parents
        del item.children
        del item.primary

    f = io.open_possibly_compressed_writer(self.prefix + '.pickle.gz')
    pickle.dump((workspace.name, workspace.get_tags(), annotations), f, pickle.HIGHEST_PROTOCOL)
    f.close()
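# A small sketch of the tie-breaking rule used above when an alignment overlaps
# several candidate features: pick the feature whose tail position is nearest
# the alignment's tail position, falling back to the primary feature's id so
# ties resolve deterministically.  nearest_feature and its inputs are
# illustrative names, not part of tail-tools.
def nearest_feature(hits, tail_pos):
    # hits: features with a .tail_pos and a .primary (which has .get_id())
    return min(hits, key=lambda gene: (abs(tail_pos - gene.tail_pos),
                                       gene.primary.get_id()))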
def run(self):
    working_dirs = [ ]
    peaks_file = self.peaks_file
    for item in self.working_dirs:
        state_filename = os.path.join(item, 'analyse-polya-batch.state')
        if not os.path.exists(state_filename):
            working_dirs.append(item)
        else:
            with open(state_filename, 'rb') as f:
                state = pickle.load(f)
            for sample in state.samples:
                working_dirs.append(os.path.join(item, 'samples', sample.output_dir))

    if not peaks_file:
        peaks_file = os.path.join(self.pipeline_dir, "peaks", "relation-child.gff")

    sample_names = [ os.path.split(dirname)[1] for dirname in working_dirs ]
    workspaces = [
        working_directory.Working(dirname, must_exist=True)
        for dirname in working_dirs
        ]

    workspace = self.get_workspace()

    with open(workspace / "index.html", "wb") as f:
        web.emit(f, "igv.html", dict(
            SAMPLES=json.dumps(sample_names),
            HAVE_NORM=json.dumps(bool(self.norm_file)),
            TITLE=self.title,
            ))

    bams = [ item / "alignments_filtered_sorted.bam" for item in workspaces ]

    for i in xrange(len(sample_names)):
        io.symbolic_link(bams[i], workspace / (sample_names[i] + ".bam"))
        io.symbolic_link(bams[i] + ".bai", workspace / (sample_names[i] + ".bam.bai"))

    io.symbolic_link(peaks_file, workspace / "peaks.gff")

    if self.norm_file:
        mults = io.read_grouped_table(self.norm_file)['All']
        norm_mult = [
            float(mults[name]['Normalizing.multiplier'])
            for name in sample_names
            ]

    with nesoni.Stage() as stage:
        Bam_to_bigwig(
            workspace / "total",
            bam_files=bams,
            what="ambiguity,span,3p,polyaspan,polya3p",
            ).process_make(stage)

        for i in xrange(len(sample_names)):
            for scale_desc, scale in \
                    [("raw", 1.0)] + \
                    ([("norm", norm_mult[i])] if self.norm_file else []):
                Bam_to_bigwig(
                    workspace / (sample_names[i] + "-" + scale_desc),
                    bam_files=[ bams[i] ],
                    what='span,3p,polyaspan,polya3p',
                    scale=scale,
                    ).process_make(stage)