def build_snpeff(self):
    jar = io.find_jar('snpEff.jar')

    with open(self/'snpeff.config','wb') as f:
        print >> f, 'data_dir = snpeff'
        print >> f, 'genomes : ' + self.name
        print >> f, self.name + '.genome : ' + self.name

    snpwork = io.Workspace(self/'snpeff',must_exist=False)
    snpwork_genome = io.Workspace(snpwork/self.name,must_exist=False)
    snpwork_genomes = io.Workspace(snpwork/'genomes',must_exist=False)

    annotations = self.annotations_filename()
    assert annotations
    with open(snpwork_genome/'genes.gff','wb') as f:
        for record in annotation.read_annotations(annotations):
            if record.end <= record.start: continue
            if not record.attr:
                record.attr['attributes'] = 'none'
            print >> f, record.as_gff()

    with open(snpwork_genomes/(self.name+'.fa'),'wb') as f:
        for name, seq in io.read_sequences(self.reference_fasta_filename()):
            io.write_fasta(f, name, seq)

    io.execute('java -jar JAR build NAME -gff3 -c CONFIG',
        JAR=jar, NAME=self.name, CONFIG=self/'snpeff.config')
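# Hedged sketch, not part of the build itself: for a hypothetical reference named
# "example_genome", the snpeff.config written above contains just the three lines
# below, and the Workspace calls create the snpeff/example_genome/genes.gff and
# snpeff/genomes/example_genome.fa layout that the "java -jar snpEff.jar build"
# command expects.
#
#   data_dir = snpeff
#   genomes : example_genome
#   example_genome.genome : example_genome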
def _describe_peaks(self, r):
    workspace = io.Workspace(self.output_dir, must_exist=False)

    counts = io.read_grouped_table(workspace/("expression","peakwise","counts.csv"))["Count"]

    peak_counts = collections.defaultdict(int)
    read_counts = collections.defaultdict(int)
    total = 0
    for item in annotation.read_annotations(workspace/("peaks","relation-child.gff")):
        peak_counts[item.attr.get("Relation","None")] += 1
        read_counts[item.attr.get("Relation","None")] += sum(int(c) for c in counts[item.get_id()].values())
        total += 1

    total_reads = sum(read_counts.values())

    r.write("<p>\n")
    r.write("%d peaks\n" % total)
    for name, desc in [
            ("3'UTR", "in a 3' UTR"),
            ("Exon", "otherwise in an exon"),
            ("Downstrand", "otherwise downstrand of a non-coding RNA"),
            ("Intron", "otherwise in an intron"),
            ("Antisense", "otherwise antisense to a gene"),
            ("None", "couldn't be related to annotated genes"),
            ]:
        r.write("<br/>%d peaks and %.1f%% of reads %s\n" % (peak_counts[name], read_counts[name]*100.0/total_reads, desc))
    r.write("</p>\n")
def _run_peaks(self, workspace, expressionspace, reference, dirs, analyse_template, file_prefix):
    shiftspace = io.Workspace(workspace/'peak-shift')

    peaks.Call_peaks(
        workspace/'peaks',
        annotations = reference/'reference.gff',
        extension = self.extension,
        min_depth = self.peak_min_depth,
        polya = self.peak_polya,
        min_tail = self.peak_min_tail,
        peak_length = self.peak_length,
        samples = dirs,
        ).make()

    analyse_template(
        expressionspace/'peakwise',
        annotations=workspace/('peaks','relation-child.gff'),
        extension=0,
        types='peak',
        parts='peak',
        title='Peakwise expression - ' + self.title,
        file_prefix=file_prefix+'peakwise-',
        ).make()

    alternative_tails.Compare_peaks(
        shiftspace/'individual',
        norm_file=expressionspace/('peakwise','norm.csv'),
        utrs=reference/'utr.gff',
        utr_only=True,
        top=2,
        reference=reference/'reference.fa',
        parents=workspace/('peaks','relation-parent.gff'),
        children=workspace/('peaks','relation-child.gff'),
        counts=expressionspace/('peakwise','counts.csv'),
        ).make()

    if self.groups:
        tail_lengths.Collapse_counts(
            shiftspace/'grouped-counts',
            counts=expressionspace/('peakwise','counts.csv'),
            groups=self.groups,
            ).make()

        alternative_tails.Compare_peaks(
            shiftspace/'grouped',
            utrs=reference/'utr.gff',
            utr_only=True,
            top=2,
            reference=reference/'reference.fa',
            parents=workspace/('peaks','relation-parent.gff'),
            children=workspace/('peaks','relation-child.gff'),
            counts=shiftspace/'grouped-counts.csv',
            ).make()
def __init__(self, directory, title, file_prefix=''):
    self.workspace = io.Workspace(directory, must_exist=False)
    self.file_prefix = file_prefix
    if self.file_prefix: self.file_prefix += '-'

    self.f = self.workspace.open('index.html', 'wb')

    print >> self.f, '<html><head>'
    print >> self.f, '<title>%s</title>' % title
    print >> self.f, '<style>%s</style>' % STYLE
    print >> self.f, '</head><body>'
    print >> self.f, '<h1>%s</h1>' % title
    self.p(datetime.date.today().strftime('%e %B %Y'))
def run(self):
    assert self.pipeline is not None, "Pipeline output directory required."
    path = os.path.abspath(self.pipeline)

    workspace = io.Workspace(self.output_dir, must_exist=False)

    with open(workspace/"app.R","wb") as f:
        print >> f, "library(tailtools)"
        print >> f, "shiny_tailtools_report(%s, species=%s, title=%s)" % (
            repr(path),
            "NULL" if not self.species else repr(self.species),
            repr(self.title))

    with open(workspace/"index.html","wb") as f:
        web.emit(f, "sorry-no-shiny.html", {})
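# Hedged sketch, not part of the tool: the app.R written above ends up containing
# roughly the following (values are illustrative; repr() of a Python string
# happens to be valid R string syntax for simple paths and titles):
#
#   library(tailtools)
#   shiny_tailtools_report('/path/to/pipeline-output', species=NULL, title='My experiment')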
def _extract_raw(self):
    work = io.Workspace(self.output_dir, must_exist=False)
    raw = io.Workspace(work/'raw', must_exist=False)

    for name, counts, norms in [
            ('genewise',
                work/('expression','genewise','counts.csv'),
                work/('expression','genewise','norm.csv'),
                ),
            ('primarypeakwise',
                work/('expression','primarypeakwise','counts.csv'),
                work/('expression','primarypeakwise','norm.csv'),
                ),
            ('peakwise',
                work/('expression','peakwise','counts.csv'),
                work/('expression','peakwise','norm.csv'),
                ),
            ('pairwise',
                work/('peak-shift','individual-pairs.csv'),
                work/('peak-shift','individual-pairs-norm.csv'),
                ),
            ]:
        nesoni.Vst(
            raw/(name+'-mlog2-RPM'),
            counts,
            norm_file = norms,
            ).make()

        counts_table = io.read_grouped_table(counts)
        io.write_csv_2(raw/(name+'-info.csv'), counts_table['Annotation'])
        io.write_csv_2(raw/(name+'-count.csv'), counts_table['Count'])
        io.write_csv_2(raw/(name+'-tail.csv'), counts_table['Tail'])
        io.write_csv_2(raw/(name+'-tail-count.csv'), counts_table['Tail_count'])
        io.write_csv_2(raw/(name+'-proportion.csv'), counts_table['Proportion'])

        norm_table = io.read_grouped_table(norms)
        io.write_csv_2(raw/(name+'-norm.csv'), norm_table['All'])
def run(self):
    working = io.Workspace(self.output_dir, must_exist=False)

    for filename in self.files:
        reader = io.Table_reader(filename)

        name = os.path.splitext(os.path.split(filename)[1])[0]

        rname = None
        files = None
        for record in reader:
            if record['Chromosome'] != rname:
                if files:
                    for item in files:
                        item.close()
                rname = record['Chromosome']
                grace.status('Convert '+name+' '+rname)
                files = [
                    open(working / (
                        name + '-' +
                        grace.filesystem_friendly_name(rname) + '-' +
                        grace.filesystem_friendly_name(item) + '.userplot'
                        ), 'wb')
                    for item in reader.headings[4:]
                    ]
                pos = 0

            assert int(record['Start']) == pos and int(record['End']) == pos + 1
            for val, f in zip(record.values()[4:], files):
                print >> f, val
            pos += 1

        if files:
            for item in files:
                item.close()

    grace.status('')
def _create_json(self):
    workspace = io.Workspace(self.output_dir, must_exist=False)

    samples = [ ]
    groups = [ ]
    for sample in self.samples:
        this_groups = [ ]
        for item in self.groups:
            if selection.matches(
                    selection.term_specification(item),
                    sample.tags + [ sample.output_dir ]):
                this_groups.append(selection.term_name(item))
        group = ','.join(this_groups) if this_groups else 'ungrouped'
        if group not in groups:
            groups.append(group)

        item = {
            'name' : sample.output_dir,
            'bam' : os.path.abspath(
                workspace/('samples',sample.output_dir,'alignments_filtered_sorted.bam')),
            'group' : group,
            'tags' : sample.tags,
            }
        samples.append(item)

    obj = collections.OrderedDict()
    obj['reference'] = os.path.abspath( self.reference )
    obj['extension'] = self.extension
    obj['genes'] = os.path.abspath( workspace/('peaks','relation-parent.gff') )
    obj['peaks'] = os.path.abspath( workspace/('peaks','relation-child.gff') )
    obj['groups'] = groups
    obj['samples'] = samples

    with open(workspace/"plotter-config.json","wb") as f:
        json.dump(obj, f, indent=4)
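# Illustrative sketch only, not written by the pipeline: the plotter-config.json
# produced above has roughly this shape (the paths, group names, sample names and
# tags here are hypothetical examples).
#
# {
#     "reference": "/absolute/path/to/reference-dir",
#     "extension": 400,
#     "genes": "/absolute/path/to/output/peaks/relation-parent.gff",
#     "peaks": "/absolute/path/to/output/peaks/relation-child.gff",
#     "groups": ["wildtype", "mutant"],
#     "samples": [
#         {
#             "name": "wt-rep1",
#             "bam": "/absolute/path/to/output/samples/wt-rep1/alignments_filtered_sorted.bam",
#             "group": "wildtype",
#             "tags": ["wildtype", "rep1"]
#         }
#     ]
# }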
import sys, os

sys.path.insert(0, os.path.join(os.path.split(__file__)[0], '../'))

import unittest

import nesoni
from nesoni import io

data = io.Workspace('data', must_exist=True)
output = io.Workspace('output', must_exist=False)


class Test_clip(unittest.TestCase):
    def test_single(self):
        nesoni.Clip(
            output / 'clip',
            reads=[data / 'reads_1.txt.gz'],
        ).run()

    def test_paired(self):
        nesoni.Clip(
            output / 'clip',
            pairs=[[data / 'reads_1.txt.gz', data / 'reads_2.txt.gz']],
        ).run()


if __name__ == '__main__':
    unittest.main()
def pastiche(args):
    if len(args) < 4:
        print USAGE
        return 1

    mask_only, args = grace.get_option_value(args, '--mask', grace.as_bool, False)
    min_leftover, args = grace.get_option_value(args, '--min-leftover', int, 20)

    output_dir, args = args[0], args[1:]
    #, ref_filename, contig_filenames = args[0], args[1], args[2:]

    ref_filenames = [ ]
    contig_filenames = [ ]
    grace.execute(args,
        {'contigs': lambda args: contig_filenames.extend(args)},
        lambda args: ref_filenames.extend(args))

    assert ref_filenames, 'No reference sequences given'
    assert contig_filenames, 'No contig sequences given'

    contigs = dict([
        (name.split()[0], seq)
        for filename in contig_filenames
        for name, seq in io.read_sequences(filename)
        ])
    dir_contigs = { }
    for name in contigs:
        dir_contigs[name + '+'] = contigs[name]
        dir_contigs[name + '-'] = bio.reverse_complement(contigs[name])

    dir_contigs_used = { }
    for name in dir_contigs:
        dir_contigs_used[name] = [ False ] * len(dir_contigs[name])

    workspace = io.Workspace(output_dir)
    temp_prefix = workspace._object_filename('temp-pastiche')

    out_f = workspace.open('pastiche.fa', 'wb')

    for ref_filename in ref_filenames:
        for ref_name, ref_seq in io.read_sequences(ref_filename):
            ref_name = ref_name.split()[0]

            grace.status(ref_name)

            f = open(temp_prefix + '.fa', 'wb')
            io.write_fasta(f, 'ref', ref_seq)
            f.close()

            scores = [ -1 ] * (len(ref_seq) * 2)
            strings = [ 'N', '' ] * (len(ref_seq))
            contexts = [ None for i in xrange(len(ref_seq) * 2) ]

            #MAXSCORE = len(ref_seq)+1
            #for i in xrange(len(ref_seq)):
            #    if ref_seq[i].upper() != 'N':
            #        strings[i*2] = ref_seq[i]
            #        scores[i*2] = MAXSCORE
            #for i in xrange(len(ref_seq)-1):
            #    if ref_seq[i].upper() != 'N' and ref_seq[i+1].upper() != 'N':
            #        scores[i*2+1] = MAXSCORE

            if mask_only:
                for i in xrange(len(ref_seq)):
                    strings[i * 2] = ref_seq[i].lower()

            def put(position, dir_contig_name, start, end, score):
                if scores[position] < score:
                    scores[position] = score
                    strings[position] = dir_contigs[dir_contig_name][start:end]
                    contexts[position] = (dir_contig_name, start, end, score)

            for contig_filename in contig_filenames:
                execute([
                    'nucmer',
                    '--prefix', temp_prefix,
                    #'--maxmatch', #Very slow
                    '--nosimplify',
                    '--minmatch', '9',
                    '--mincluster', '50',
                    #'--maxgap', '1000',
                    #'--breaklen', '1000', # Increasing this reduces Ns, but is slow
                    #'--diagfactor', '1.0',
                    temp_prefix + '.fa',
                    contig_filename,
                    ])

                for contig_name, contig_seq in io.read_sequences(contig_filename):
                    contig_name = contig_name.split()[0]
                    grace.status(ref_name + ' vs ' + contig_name)

                    p = run(['show-aligns', temp_prefix + '.delta', 'ref', contig_name],
                            stderr=subprocess.PIPE)

                    alignments = [ ]

                    while True:
                        line = p.stdout.readline()
                        if not line:
                            break
                        if not line.startswith('-- BEGIN'):
                            continue

                        parts = line.split()

                        ref_start = int(parts[5])
                        ref_end = int(parts[7])
                        query_start = int(parts[10])
                        query_end = int(parts[12])

                        #assert ref_start < ref_end
                        #ref_start -= 1 #Zero based coordinates

                        al_ref = [ ]
                        al_query = [ ]
                        while True:
                            block = [ ]
                            end = False
                            while True:
                                line = p.stdout.readline()
                                if line.startswith('-- END'):
                                    end = True
                                    break
                                if line == '\n':
                                    if block:
                                        break
                                    else:
                                        continue
                                block.append(line)
                            if end:
                                break

                            al_ref.append(block[0].split()[1])
                            al_query.append(block[1].split()[1])

                        al_ref = ''.join(al_ref)
                        al_query = ''.join(al_query)

                        if ref_start > ref_end:
                            al_ref = bio.reverse_complement(al_ref)
                            al_query = bio.reverse_complement(al_query)
                            ref_start, ref_end = ref_end, ref_start
                            query_start, query_end = query_end, query_start

                        if query_start > query_end:
                            dir_contig_name = contig_name + '-'
                            query_start = len(contig_seq) + 1 - query_start
                            query_end = len(contig_seq) + 1 - query_end
                        else:
                            dir_contig_name = contig_name + '+'

                        ref_start -= 1 #Zero based coordinates
                        query_start -= 1

                        #print al_ref
                        #print al_query

                        #Pretty dumb scoring scheme
                        al_score = 0
                        for i in xrange(len(al_ref)):
                            if al_ref[i] == al_query[i]:
                                al_score += 1
                            #else:
                            #    al_score -= 1

                        #Pastiche alignment over reference
                        ref_pos = ref_start
                        query_pos = query_start
                        al_pos = 0
                        while al_pos < len(al_ref):
                            assert al_ref[al_pos] != '.'

                            if al_query[al_pos] == '.':
                                put(ref_pos * 2, dir_contig_name, query_pos, query_pos, al_score)
                            else:
                                assert al_query[al_pos].lower() == dir_contigs[dir_contig_name][query_pos].lower()
                                put(ref_pos * 2, dir_contig_name, query_pos, query_pos + 1, al_score)
                                query_pos += 1
                            al_pos += 1

                            al_pos_end = al_pos
                            query_pos_end = query_pos
                            while al_pos_end < len(al_ref) and al_ref[al_pos_end] == '.':
                                al_pos_end += 1
                                query_pos_end += 1
                            #put(ref_pos*2+1, al_query[al_pos:al_pos_end], al_score)
                            assert al_query[al_pos:al_pos_end].lower() == dir_contigs[dir_contig_name][query_pos:query_pos_end].lower()
                            put(ref_pos * 2 + 1, dir_contig_name, query_pos, query_pos_end, al_score)
                            al_pos = al_pos_end
                            query_pos = query_pos_end

                            ref_pos += 1

                    p.wait()

            grace.status(ref_name)

            result = ''.join(strings)
            io.write_fasta(out_f, ref_name, result)

            for context in contexts:
                if context is None:
                    continue
                name, start, end, score = context
                for i in xrange(start, end):
                    dir_contigs_used[name][i] = True

            #Interpolation
            #result = [ ]
            #i = 0
            #while i < len(ref_seq):
            #    if strings[i*2].upper() != 'N':
            #        result.append(strings[i*2])
            #        result.append(strings[i*2+1])
            #        i += 1
            #        continue
            #
            #    j = i
            #    while strings[j*2].upper() == 'N':
            #        j += 1
            #
            #    grace.status('')
            #    print >> sys.stderr, 'interpolating', i+1, '..', j
            #
            #    window = 20 #!!!!!!!!!!!
            #    left_contexts = collections.defaultdict(lambda:0)
            #    for i1 in xrange(max(0,i-window),i):
            #        for context_name, context_start, context_end, context_score in contexts[i1*2]:
            #            key = (context_name, context_end + i - i1)
            #            left_contexts[key] = max(left_contexts[key],context_score)
            #
            #    right_contexts = collections.defaultdict(lambda:0)
            #    for j1 in xrange(j,min(j+window,len(ref_seq))):
            #        for context_name, context_start, context_end, context_score in contexts[j1*2]:
            #            key = (context_name, context_start + j - j1)
            #            right_contexts[key] = max(left_contexts[key],context_score)
            #
            #    #print >> sys.stderr, left_contexts
            #    #print >> sys.stderr, right_contexts
            #
            #    options = [ ]
            #
            #    for (left_name, left_pos), left_score in left_contexts.items():
            #        for (right_name, right_pos), right_score in right_contexts.items():
            #            if left_name != right_name: continue
            #            if right_pos < left_pos: continue
            #
            #            if right_pos-left_pos > (j-i) * 4.0 + 10: continue #!!!!!!!!!!!!!!!!!!!!!!1
            #            if right_pos-left_pos < (j-i) * 0.25 - 10: continue
            #
            #            score = float(min(right_pos-left_pos,j-i))/max(right_pos-left_pos,j-i)
            #            score *= left_score + right_score
            #            #print >> sys.stderr, left_name, right_pos-left_pos, j-i, score
            #            options.append( (score, left_name, left_pos, right_pos) )
            #
            #    if options:
            #        best = max(options, key=lambda option: option[0])
            #        print >> sys.stderr, '->', best
            #        result.append( dir_contigs[best[1]][best[2]:best[3]].lower() )
            #    else:
            #        print >> sys.stderr, '-> no good interpolation'
            #        result.append( ref_seq[i:j] )
            #
            #    i = j
            #
            #result = ''.join(result)
            #io.write_fasta(sys.stdout, ref_name, result)

            #print >> sys.stderr, len(result), result.count('N')
            #for pos, size in N_runs:
            #    out_size = len(''.join( strings[pos*2:pos*2+2] ))
            #    print >> sys.stderr, pos, size, '->', out_size

    out_f.close()

    grace.status('')

    #for name, seq in io.read_sequences(ref_filename):
    #    result = pastiche(seq, contigs_filename)
    #    io.write_fasta(sys.stdout, name, result)

    leftover_f = workspace.open('leftovers.fa', 'wb')

    for name in sorted(contigs):
        used = [ (a or b) for a, b in
                 zip(dir_contigs_used[name + '+'], dir_contigs_used[name + '-'][::-1]) ]

        i = 0
        while i < len(used):
            j = i
            while j < len(used) and not used[j]:
                j += 1
            if j - i > min_leftover:
                if i == 0 and j == len(used):
                    out_name = name
                else:
                    out_name = name + ':%d..%d' % (i + 1, j)
                io.write_fasta(leftover_f, out_name, contigs[name][i:j])

            i = j + 1

    leftover_f.close()

    for suffix in ['.fa', '.delta']:
        os.unlink(temp_prefix + suffix)
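# Hedged toy sketch, not called by pastiche() above: it restates the core of the
# put() scheme. Each reference position owns two slots (an even slot for the base
# itself, an odd slot for any insertion after it); each slot keeps whichever
# aligned contig fragment arrived with the highest alignment score. The helper
# name _best_per_slot is hypothetical.
def _best_per_slot(placements, n_slots):
    # placements: list of (slot, fragment, score) tuples.
    scores = [ -1 ] * n_slots
    strings = [ 'N' if slot % 2 == 0 else '' for slot in range(n_slots) ]
    for slot, fragment, score in placements:
        if scores[slot] < score:
            scores[slot] = score
            strings[slot] = fragment
    return ''.join(strings)

# Two placements compete for slot 2; the higher-scoring 'C' wins, and untouched
# base slots keep their default 'N'.
assert _best_per_slot([ (0, 'A', 5), (2, 'G', 5), (2, 'C', 9) ], 6) == 'ACN'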
def fill_scaffolds(args):
    max_filler_length, args = grace.get_option_value(args, '--max-filler', int, 4000)

    if len(args) < 2:
        print USAGE
        return 1

    (output_dir, graph_dir), args = args[:2], args[2:]

    scaffolds = [ ]

    def scaffold(args):
        circular, args = grace.get_option_value(args, '--circular', grace.as_bool, False)

        scaffold = [ ]
        for item in args:
            scaffold.append( ('contig', int(item)) )
            scaffold.append( ('gap', None) )

        if not circular: scaffold = scaffold[:-1]

        name = 'custom_scaffold_%d' % (len(scaffolds)+1)
        scaffolds.append( (name, scaffold) )

    grace.execute(args, [scaffold])

    custom_scaffolds = (len(scaffolds) != 0)

    sequences = dict(
        (a.split()[0], b.upper())
        for a,b in io.read_sequences(os.path.join(graph_dir, '454AllContigs.fna')))

    sequence_names = sorted(sequences)
    sequence_ids = dict(zip(sequence_names, xrange(1,len(sequence_names)+1)))

    contexts = { }
    context_names = { }
    context_depths = { }
    for i in xrange(1,len(sequence_names)+1):
        seq = sequences[sequence_names[i-1]]
        contexts[ i ] = seq
        context_names[ i ] = sequence_names[i-1]+'-fwd'
        contexts[ -i ] = bio.reverse_complement(seq)
        context_names[ -i ] = sequence_names[i-1]+'-rev'

    links = collections.defaultdict(list)

    for line in open(os.path.join(graph_dir, '454ContigGraph.txt'), 'rU'):
        parts = line.rstrip('\n').split('\t')

        if parts[0].isdigit():
            seq = sequence_ids[parts[1]]
            context_depths[ seq] = float(parts[3])
            context_depths[-seq] = float(parts[3])

        if parts[0] == 'C':
            name1 = 'contig%05d' % int(parts[1])
            dir1 = {"3'" : 1, "5'" : -1 }[parts[2]]
            name2 = 'contig%05d' % int(parts[3])
            dir2 = {"5'" : 1, "3'" : -1 }[parts[4]]
            depth = int(parts[5])
            #print name1, dir1, name2, dir2, depth

            links[ sequence_ids[name1] * dir1 ].append( (depth, sequence_ids[name2] * dir2) )
            links[ sequence_ids[name2] * -dir2 ].append( (depth, sequence_ids[name1] * -dir1) )

        if parts[0] == 'S' and not custom_scaffolds:
            name = 'scaffold%05d' % int(parts[2])
            components = parts[3].split(';')
            scaffold = [ ]
            for component in components:
                a,b = component.split(':')
                if a == 'gap':
                    scaffold.append( ('gap', int(b)) )
                else:
                    strand = { '+': +1, '-': -1 }[ b ]
                    scaffold.append( ('contig', sequence_ids['contig%05d' % int(a)] * strand) )
            scaffolds.append( (name, scaffold) )


    #paths = { }
    #
    #todo = [ ]
    #for i in contexts:
    #    for depth_left, neg_left in links[-i]:
    #        left = -neg_left
    #        for depth_right, right in links[i]:
    #            todo.append( ( max(-depth_left,-depth_right,-context_depths[i]), left, right, (i,)) )
    #
    #heapq.heapify(todo)
    #while todo:
    #    score, source, dest, path = heapq.heappop(todo)
    #    if (source,dest) in paths: continue
    #
    #    paths[(source,dest)] = path
    #
    #    if len(contexts[dest]) > max_filler_length: continue
    #
    #    for depth, next in links[dest]:
    #        heapq.heappush(todo,
    #            ( max(score,-depth,-context_depths[dest]), source, next, path+(dest,))
    #        )

    path_source_dest = collections.defaultdict(dict) # source -> dest -> next
    path_dest_source = collections.defaultdict(dict) # dest -> source -> next

    # Use links, in order of depth of coverage, to construct paths between contigs
    # Thus: paths have maximum minimum depth
    #       subsections of paths also have this property
    todo = [ ]
    for i in contexts:
        for depth_link, right in links[i]:
            todo.append( (depth_link, i, right) )
    todo.sort(reverse=True)

    for score, left, right in todo:
        if right in path_source_dest[left]: continue

        sources = [ (left,right) ]
        if len(contexts[left]) <= max_filler_length:
            sources += path_dest_source[left].items()

        destinations = [ right ]
        if len(contexts[right]) <= max_filler_length:
            destinations += path_source_dest[right].keys()

        for source, next in sources:
            for dest in destinations:
                if dest in path_source_dest[source]: continue
                path_source_dest[source][dest] = next
                path_dest_source[dest][source] = next

    workspace = io.Workspace(output_dir)
    scaffold_f = workspace.open('scaffolds.fa', 'wb')

    #comments = [ ]
    features = [ ]

    used = set()

    previous_total = 0

    for i, (name, scaffold) in enumerate(scaffolds):
        result = '' # Inefficient. Meh.
        n_filled = 0
        n_failed = 0
        for j, item in enumerate(scaffold):
            if item[0] == 'contig':
                result += contexts[item[1]]
                used.add(abs(item[1]))
            else:
                left = scaffold[j-1]
                right = scaffold[ (j+1) % len(scaffold) ] #If gap at end, assume circular
                assert left[0] == 'contig'
                assert right[0] == 'contig'

                gap_start = len(result)

                can_fill = right[1] in path_source_dest[left[1]]
                if can_fill:
                    n = 0
                    k = path_source_dest[left[1]][right[1]]
                    while k != right[1]:
                        n += len(contexts[k])
                        result += contexts[k].lower()
                        used.add(abs(k))

                        k = path_source_dest[k][right[1]]

                    n_filled += 1

                    if item[1] is not None and max(n,item[1]) > min(n,item[1])*4:
                        print >> sys.stderr, 'Warning: gap size changed from %d to %d in scaffold %d' % (item[1],n,i+1)
                else:
                    n_failed += 1

                    #print >> sys.stderr, 'Warning: No path to fill a gap in scaffold %d' % (i+1)
                    result += 'n' * (9 if item[1] is None else item[1])

                gap_end = len(result)

                #features.append( '%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s' % (
                #    'all-scaffolds',
                #    'fill-scaffolds',
                #    'gap',
                #    previous_total + gap_start+1,
                #    previous_total + max(gap_end, gap_start+1), #Allow for zeroed out gaps. Hmm.
                #    '.', #score
                #    '+', #strand
                #    '.', #frame
                #    '' #properties
                #    ))
                features.append( '%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s' % (
                    name,
                    'fill-scaffolds',
                    'gap',
                    gap_start+1,
                    max(gap_end, gap_start+1), #Allow for zeroed out gaps. Hmm.
                    '.', #score
                    '+', #strand
                    '.', #frame
                    '' #properties
                    ))

        io.write_fasta(scaffold_f, name, result)
        previous_total += len(result)
        #comments.append('##sequence-region %s %d %d' % (name, 1, len(result)))
        print >> sys.stderr, 'Scaffold%05d: %d gaps filled, %d could not be filled' % (i+1, n_filled, n_failed)

    scaffold_f.close()

    gff_f = workspace.open('scaffolds.gff', 'wb')
    #print >> gff_f, '##gff-version 3'
    #for comment in comments:
    #    print >> gff_f, comment
    for feature in features:
        print >> gff_f, feature
    gff_f.close()

    leftovers_f = workspace.open('leftovers.fa', 'wb')
    for name in sequence_names:
        if sequence_ids[name] not in used:
            io.write_fasta(leftovers_f, name, sequences[name])
    leftovers_f.close()

    ends = { }
    for i, (name, scaffold) in enumerate(scaffolds):
        if scaffold[-1][0] == 'gap': continue
        ends[ '%s start' % name ] = scaffold[-1][1]
        ends[ '%s end ' % name ] = -scaffold[0][1]

    for end1 in sorted(ends):
        options = [ end2 for end2 in ends if -ends[end2] in path_source_dest[ends[end1]] ]
        if len(options) == 1:
            print >> sys.stderr, 'Note: from', end1, 'only', options[0], 'is reachable'
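# Hedged toy sketch, standalone and not called by fill_scaffolds() above: the
# path tables are built by visiting links in decreasing depth order, so the
# first route recorded between two contig ends is the one whose weakest link is
# as deep as possible (a maximum-minimum-depth path); short "filler" contigs
# also extend already-recorded routes. The helper name _build_paths and the toy
# integer node ids are hypothetical.
import collections

def _build_paths(links, max_filler, node_len):
    # links: list of (depth, left, right); node_len: node id -> sequence length.
    path_source_dest = collections.defaultdict(dict) # source -> dest -> next hop
    path_dest_source = collections.defaultdict(dict) # dest -> source -> next hop
    for depth, left, right in sorted(links, reverse=True):
        if right in path_source_dest[left]: continue
        sources = [ (left, right) ]
        if node_len[left] <= max_filler:
            sources += list(path_dest_source[left].items())
        destinations = [ right ]
        if node_len[right] <= max_filler:
            destinations += list(path_source_dest[right].keys())
        for source, next_hop in sources:
            for dest in destinations:
                if dest in path_source_dest[source]: continue
                path_source_dest[source][dest] = next_hop
                path_dest_source[dest][source] = next_hop
    return path_source_dest

# A route 1 -> 3 is recorded via the short middle contig 2.
assert _build_paths(
    [ (10, 1, 2), (8, 2, 3) ], max_filler=100,
    node_len={ 1: 500, 2: 50, 3: 500 })[1][3] == 2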
def run(self):
    #===============================================
    #                Sanity checks
    #===============================================

    assert len(set([ item.output_dir for item in self.samples ])) == len(self.samples), "Duplicate sample name."

    all_inputs = [ ]
    for sample in self.samples:
        all_inputs.extend(sample.reads)
    assert len(set(all_inputs)) == len(all_inputs), "Duplicate read filename."

    assert len(set([ item.output_dir for item in self.tests ])) == len(self.tests), "Duplicate test name."

    for test in self.tests:
        assert not test.analysis, "analysis parameter for tests should not be set, will be filled in automatically"

    #===============================================
    #                Run pipeline
    #===============================================

    names = [ sample.output_dir for sample in self.samples ]

    reference = reference_directory.Reference(self.reference, must_exist=True)

    workspace = io.Workspace(self.output_dir, must_exist=False)

    samplespace = io.Workspace(workspace/'samples', must_exist=False)
    expressionspace = io.Workspace(workspace/'expression', must_exist=False)
    testspace = io.Workspace(workspace/'test', must_exist=False)

    self._create_json()

    file_prefix = self.file_prefix
    if file_prefix and not file_prefix.endswith('-'):
        file_prefix += '-'

    samples = [ ]
    for sample in self.samples:
        samples.append(sample(
            samplespace / sample.output_dir,
            reference = self.reference,
            ))

    dirs = [ item.output_dir for item in samples ]

    clipper_logs = [ join(item.output_dir, 'clipped_reads_log.txt') for item in samples ]
    filter_logs = [ join(item.output_dir, 'filter_log.txt') for item in samples ]
    filter_polya_logs = [ join(item.output_dir + '-polyA', 'filter_log.txt') for item in samples ]

    analyse_template = tail_lengths.Analyse_tail_counts(
        working_dirs = dirs,
        extension = self.extension,
        annotations = reference/'reference.gff',
        types = self.types,
        parts = self.parts,
        )

    with nesoni.Stage() as stage:
        for item in samples:
            item.process_make(stage)

    job_gene_counts = analyse_template(
        output_dir = expressionspace/'genewise',
        extension = self.extension,
        title = 'Genewise expression - ' + self.title,
        file_prefix = file_prefix+'genewise-',
        ).make

    job_peaks = _call(self._run_peaks,
        workspace=workspace,
        expressionspace=expressionspace,
        reference=reference,
        dirs = dirs,
        analyse_template = analyse_template,
        file_prefix=file_prefix,
        )

    job_norm = nesoni.Norm_from_samples(
        workspace/'norm',
        working_dirs = dirs,
        ).make

    job_bigwig = bigwig.Polya_bigwigs(
        workspace/'bigwigs',
        working_dirs = dirs,
        norm_file = workspace/"norm.csv",
        peaks_file = workspace/("peaks", "relation-child.gff"),
        title = "IGV tracks - "+self.title,
        ).make

    job_norm_bigwig = _call(_serial, job_norm, job_bigwig)

    job_utrs = tail_tools.Call_utrs(
        workspace/('peaks','primary-peak'),
        self.reference,
        self.output_dir,
        extension=self.extension,
        ).make

    job_primpeak_counts = analyse_template(
        expressionspace/'primarypeakwise',
        annotations=workspace/('peaks','primary-peak-peaks.gff'),
        extension=0,
        types='peak',
        parts='peak',
        title='Primary-peakwise expression - ' + self.title,
        file_prefix=file_prefix+'primarypeakwise-',
        ).make

    job_primpeak = _call(_serial, job_utrs, job_primpeak_counts)

    job_peak_primpeak_bigwig = _call(_serial,
        job_peaks,
        _call(_parallel, job_norm_bigwig, job_primpeak))

    job_count = _call(_parallel, job_gene_counts, job_peak_primpeak_bigwig)

    test_jobs = [ ]
    for test in self.tests:
        test_jobs.append(test(
            output_dir = testspace/test.output_dir,
            analysis = self.output_dir,
            ).make)

    job_test = _call(_parallel, *test_jobs)

    job_raw = self._extract_raw

    job_all = _call(_serial, job_count, _call(_parallel, job_raw, job_test))

    job_all()

    #===============================================
    #                   Report
    #===============================================

    r = reporting.Reporter(workspace/'report', self.title, self.file_prefix, style=web.style())

    io.symbolic_link(source=workspace/'bigwigs', link_name=r.workspace/'bigwigs')
    r.write('<div style="font-size: 150%; margin-top: 1em; margin-bottom: 1em;"><a href="bigwigs/index.html">→ Load tracks into IGV</a></div>')

    tail_tools.Shiny(workspace/('report','shiny'), self.output_dir, title=self.title, species=self.species).run()
    r.write('<div style="font-size: 150%; margin-top: 1em; margin-bottom: 1em;"><a href="shiny/" target="_blank">→ Interactive report (shiny)</a></div>')

    r.heading('Alignment to reference')

    r.report_logs('alignment-statistics',
        #[ workspace/'stats.txt' ] +
        clipper_logs + filter_logs + #filter_polya_logs +
        [ expressionspace/('genewise','aggregate-tail-counts_log.txt') ],
        filter=lambda sample, field: (
            field not in [
                'fragments','fragments aligned to the reference','reads kept',
                'average depth of coverage, ambiguous',
                'average depth of coverage, unambiguous',
                ]
            ),
        )

    r.heading('Genewise expression')

    r.p("This is based on all reads within each gene (possibly from multiple peaks, or decay products).")

    io.symbolic_link(source=expressionspace/('genewise','report'), link_name=r.workspace/'genewise')
    r.p('<a href="genewise/index.html">→ Genewise expression</a>')

    r.heading('Peakwise expression')

    r.p("This shows results from all called peaks.")

    peak_filename = expressionspace/('peakwise','features-with-data.gff')
    r.p(r.get(peak_filename, name='peaks.gff') + ' - peaks called')

    self._describe_peaks(r)

    io.symbolic_link(source=expressionspace/('peakwise','report'), link_name=r.workspace/'peakwise')
    r.p('<a href="peakwise/index.html">→ Peakwise expression</a>')

    r.subheading('Primary-peakwise expression')

    r.p("This is based on the most prominent peak in the 3'UTR for each gene. "
        "(Peak can be up to %d bases downstrand of the annotated 3'UTR end, "
        "but not inside another gene on the same strand.)" % self.extension)

    io.symbolic_link(source=expressionspace/('primarypeakwise','report'), link_name=r.workspace/'primarypeakwise')
    r.p('<a href="primarypeakwise/index.html">→ Primary-peakwise expression</a>')

    r.p(r.get(workspace/('peaks','primary-peak-peaks.gff')) + ' - primary peaks for each gene.')
    r.p(r.get(workspace/('peaks','primary-peak-utrs.gff')) + ' - 3\' UTR regions, based on primary peak call.')
    r.p(r.get(workspace/('peaks','primary-peak-genes.gff')) + ' - full extent of gene, based on primary peak call.')

    if self.tests:
        r.heading('Differential tests')
        for test in self.tests:
            io.symbolic_link(source=testspace/test.output_dir, link_name=r.workspace/('test-'+test.output_dir))
            r.p('<a href="test-%s">→ %s</a> ' % (test.output_dir, test.get_title()))

    web.Geneview_webapp(r.workspace/'view').run()

    r.heading('Gene viewers')
    r.p('Having identified interesting genes from heatmaps and differential tests above, '
        'these viewers allow specific genes to be examined in detail.')

    if self.groups:
        r.get(workspace/('peak-shift','grouped.json'))
        r.p('<a href="view.html?json=%sgrouped.json">→ Gene viewer, grouped samples</a>' % r.file_prefix)

    r.get(workspace/('peak-shift','individual.json'))
    r.p('<a href="view.html?json=%sindividual.json">→ Gene viewer, individual samples</a>' % r.file_prefix)

    r.heading('Raw data')

    r.p(r.tar('csv-files', glob.glob(workspace/('raw','*.csv'))))

    r.write('<ul>\n')
    r.write('<li> -info.csv = gene name and product, etc\n')
    r.write('<li> -count.csv = read count\n')
    r.write('<li> -mlog2-RPM.csv = moderated log2 Reads Per Million\n')
    r.write('<li> -tail.csv = average poly(A) tail length\n')
    r.write('<li> -tail-count.csv = poly(A) read count\n')
    r.write('<li> -proportion.csv = proportion of reads with poly(A)\n')
    r.write('<li> -norm.csv = read count normalization used for log2 transformation, heatmaps, differential tests, etc etc\n')
    r.write('</ul>\n')

    r.p('This set of genes was used in the analysis:')

    r.p(r.get(reference/'reference.gff') + ' - Reference annotations in GFF3 format')
    r.p(r.get(reference/'utr.gff') + ' - 3\' UTR regions')

    r.p('<b>%d further bases 3\' extension was allowed</b> beyond the GFF files above (but not extending into the next gene on the same strand).' % self.extension)

    r.write('<p/><hr>\n')
    r.subheading('About normalization and log transformation')

    r.p('Counts are converted to '
        'log2 Reads Per Million using Anscombe\'s variance stabilizing transformation '
        'for the negative binomial distribution, implemented in '
        'R package "varistran".')

    r.write('<p/><hr>\n')

    r.p('Reference directory '+self.reference)
    r.p('Tail Tools version '+tail_tools.VERSION)
    r.p('Nesoni version '+nesoni.VERSION)

    r.close()