def get_reference(self):
    if 'reference' in self.param:
        path = self.relative_path_as_path(self.param['reference'])
    else:
        path = self.working_dir
    return reference_directory.Reference(path, must_exist=True)
def run(self):
    workspace = self.get_workspace()
    reference = reference_directory.Reference(self.reference, must_exist=True)

    reader_f = io.open_possibly_compressed_file(self.vcf)
    reader = vcf.Reader(reader_f)

    # Collect the variants for each chromosome, sorted by position.
    variants = collections.defaultdict(list)
    for record in reader:
        variants[record.CHROM].append(record)
    reader_f.close()

    for chrom in variants:
        variants[chrom].sort(key=lambda item: item.POS)

    # One output FASTA per sample, created empty then appended to
    # chromosome by chromosome.
    filenames = [ workspace/(item+'.fa') for item in reader.samples ]
    for filename in filenames:
        with open(filename, 'wb'):
            pass

    for name, seq in io.read_sequences(reference.reference_fasta_filename()):
        for i, sample in enumerate(reader.samples):
            revised = [ ]
            pos = 0
            for variant in variants[name]:
                gt = variant.samples[i].data.GT
                if gt is None:
                    continue
                assert gt.isdigit(), 'Unsupported genotype (can only use haploid genotypes): ' + gt
                gt_number = int(gt)
                if gt_number == 0:
                    var_seq = variant.REF
                else:
                    var_seq = str(variant.ALT[gt_number-1])
                    assert re.match('[ACGTN]*$', var_seq), 'Unsupported variant type: ' + var_seq
                new_pos = variant.POS - 1    # VCF positions are 1-based
                assert new_pos >= pos, 'Variants overlap.'
                revised.append(seq[pos:new_pos])
                pos = new_pos
                revised.append(var_seq)
                assert seq[pos:pos+len(variant.REF)].upper() == variant.REF, \
                    'REF column in VCF does not match reference sequence'
                pos += len(variant.REF)
            revised.append(seq[pos:])

            with open(filenames[i], 'ab') as f:
                io.write_fasta(f, name, ''.join(revised))

        del variants[name]

    assert not variants, 'Chromosome names in VCF not in reference: ' + ' '.join(variants)
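# The coordinate bookkeeping above is the core of this method: sorted
# variants are applied left to right, copying unchanged reference sequence
# between them. A minimal standalone sketch of the same splicing logic
# (hypothetical helper, not part of this module):
def _apply_variants_sketch(seq, variants):
    # variants: list of (pos, ref, alt) tuples, 0-based, sorted by pos.
    revised = [ ]
    pos = 0
    for var_pos, ref, alt in variants:
        assert var_pos >= pos, 'Variants overlap.'
        assert seq[var_pos:var_pos+len(ref)].upper() == ref
        revised.append(seq[pos:var_pos])   # unchanged sequence before the variant
        revised.append(alt)                # substituted sequence
        pos = var_pos + len(ref)           # skip the replaced reference bases
    revised.append(seq[pos:])              # trailing unchanged sequence
    return ''.join(revised)

# e.g. _apply_variants_sketch('AACGT', [(1,'ACG','T')]) == 'ATT'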
def run(self):
    reference = reference_directory.Reference(self.reference, must_exist=True)
    jar = io.find_jar('snpEff.jar')
    with open(self.prefix + '.vcf', 'wb') as f:
        io.execute('java -jar JAR eff GENOME VCF -c CONFIG',
            JAR=jar,
            GENOME=reference.name,
            VCF=self.vcf,
            CONFIG=reference/'snpeff.config',
            stdout=f)
    index_vcf(self.prefix + '.vcf')
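# As the call above suggests, io.execute appears to substitute the
# uppercase placeholder words in the command template with the matching
# keyword arguments. A rough standard-library equivalent, as a sketch only
# (io.execute also handles logging and error reporting):
import subprocess

def _run_snpeff_sketch(jar, genome, vcf_filename, config, out_filename):
    with open(out_filename, 'wb') as f:
        subprocess.check_call(
            ['java', '-jar', jar, 'eff', genome, vcf_filename, '-c', config],
            stdout=f)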
def get_context(self):
    assert self.reference is not None, 'reference not given.'

    context = Context()
    context.action = self
    context.space = self.get_workspace()
    context.name = context.space.name
    context.sample_space = workspace.Workspace(context.space/'samples', must_exist=False)
    context.reference = reference_directory.Reference(self.reference, must_exist=True)

    for sample in self.samples:
        assert sample.reference is None, 'reference should not be specified within sample.'
        assert sample.output_dir, 'sample given without name.'
        assert os.path.sep not in sample.output_dir, 'sample name contains ' + os.path.sep

    context.samples = [
        sample(
            output_dir = context.sample_space / sample.output_dir,
            reference = self.reference,
            )
        for sample in self.samples
        ]
    context.sample_dirs = [ item.output_dir for item in context.samples ]

    if not self.variants:
        context.variants = None
    else:
        context.variants = self.variants(
            context.space / 'variants',
            self.reference,
            samples = context.sample_dirs,
            analysis = self.variants.analysis or self.template,
                # Use the analysis (aligner) from the template unless explicitly given
            )

    if not self.expression:
        context.expression = None
    else:
        context.expression = self.expression(
            context.space / 'expression',
            samples = context.sample_dirs,
            )

    return context
def run(self):
    base = os.path.split(self.prefix)[1]

    annotations = [ ]
    sequences = [ ]
    for filename in self.filenames:
        any = False
        if os.path.isdir(filename):
            reference = reference_directory.Reference(filename, must_exist=True)
            sequences.append(reference.reference_fasta_filename())
            if reference.annotations_filename():
                annotations.append(reference.annotations_filename())
            any = True
        if io.is_sequence_file(filename):
            sequences.append(filename)
            any = True
        if annotation.is_annotation_file(filename):
            annotations.append(filename)
            any = True
        assert any, 'File is neither a recognized sequence nor annotation file: ' + filename

    cytoband_filename = os.path.join(self.prefix, base+'_cytoband.txt')
    property_filename = os.path.join(self.prefix, 'property.txt')
    gff_filename = os.path.join(self.prefix, base+'.gff')
    output_filenames = [ cytoband_filename, property_filename, gff_filename ]

    if not os.path.exists(self.prefix):
        os.mkdir(self.prefix)

    f = open(property_filename, 'wb')
    print >> f, 'ordered=true'
    print >> f, 'id=%s' % base
    print >> f, 'name=%s' % (self.name or base)
    print >> f, 'cytobandFile=%s_cytoband.txt' % base
    print >> f, 'geneFile=%s.gff' % base
    print >> f, 'sequenceLocation=%s' % base
    f.close()

    trivia.As_gff(
        output=gff_filename,
        filenames=annotations,
        select=self.select,
        ).run()

    # Write each sequence to its own file, recording its length in the
    # cytoband file.
    f_cyt = open(cytoband_filename, 'wb')
    for filename in sequences:
        for name, seq in io.read_sequences(filename):
            assert '/' not in name
            f = open(os.path.join(self.prefix, name + '.txt'), 'wb')
            f.write(seq)
            f.close()
            print >> f_cyt, '%s\t0\t%d' % (name, len(seq))
    f_cyt.close()

    genome_filename = self.prefix + '.genome'
    if os.path.exists(genome_filename):
        os.unlink(genome_filename)
    io.execute(
        [ 'zip', '-j', io.abspath(genome_filename) ] +
        [ io.abspath(item) for item in output_filenames ]
        )
    for filename in output_filenames:
        if os.path.exists(filename):
            os.unlink(filename)
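# For reference, the property.txt written above ends up containing lines
# like the following (illustrative values, for a genome with id 'mygenome'):
#
#   ordered=true
#   id=mygenome
#   name=My Genome
#   cytobandFile=mygenome_cytoband.txt
#   geneFile=mygenome.gff
#   sequenceLocation=mygenome
#
# Together with the per-chromosome .txt sequence files and the cytoband and
# GFF files, this is zipped into the single .genome file that IGV loads.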
def __init__(self, dirname):
    self.dirname = dirname
    self.ref = reference_directory.Reference(dirname, must_exist=True)
def run(self):
    #===============================================
    #                Sanity checks
    #===============================================

    assert len(set([ item.output_dir for item in self.samples ])) == len(self.samples), "Duplicate sample name."

    all_inputs = [ ]
    for sample in self.samples:
        all_inputs.extend(sample.reads)
    assert len(set(all_inputs)) == len(all_inputs), "Duplicate read filename."

    assert len(set([ item.output_dir for item in self.tests ])) == len(self.tests), "Duplicate test name."

    for test in self.tests:
        assert not test.analysis, "analysis parameter for tests should not be set, it will be filled in automatically"

    #===============================================
    #                Run pipeline
    #===============================================

    names = [ sample.output_dir for sample in self.samples ]

    reference = reference_directory.Reference(self.reference, must_exist=True)

    workspace = io.Workspace(self.output_dir, must_exist=False)
    samplespace = io.Workspace(workspace/'samples', must_exist=False)
    expressionspace = io.Workspace(workspace/'expression', must_exist=False)
    testspace = io.Workspace(workspace/'test', must_exist=False)

    self._create_json()

    file_prefix = self.file_prefix
    if file_prefix and not file_prefix.endswith('-'):
        file_prefix += '-'

    samples = [ ]
    for sample in self.samples:
        samples.append(sample(
            samplespace / sample.output_dir,
            reference = self.reference,
            ))

    dirs = [ item.output_dir for item in samples ]
    clipper_logs = [ join(item.output_dir, 'clipped_reads_log.txt') for item in samples ]
    filter_logs = [ join(item.output_dir, 'filter_log.txt') for item in samples ]
    filter_polya_logs = [ join(item.output_dir + '-polyA', 'filter_log.txt') for item in samples ]

    analyse_template = tail_lengths.Analyse_tail_counts(
        working_dirs = dirs,
        extension = self.extension,
        annotations = reference/'reference.gff',
        types = self.types,
        parts = self.parts,
        )

    with nesoni.Stage() as stage:
        for item in samples:
            item.process_make(stage)

    job_gene_counts = analyse_template(
        output_dir = expressionspace/'genewise',
        extension = self.extension,
        title = 'Genewise expression - ' + self.title,
        file_prefix = file_prefix+'genewise-',
        ).make

    job_peaks = _call(self._run_peaks,
        workspace = workspace,
        expressionspace = expressionspace,
        reference = reference,
        dirs = dirs,
        analyse_template = analyse_template,
        file_prefix = file_prefix,
        )

    job_norm = nesoni.Norm_from_samples(
        workspace/'norm',
        working_dirs = dirs,
        ).make

    job_bigwig = bigwig.Polya_bigwigs(
        workspace/'bigwigs',
        working_dirs = dirs,
        norm_file = workspace/"norm.csv",
        peaks_file = workspace/("peaks", "relation-child.gff"),
        title = "IGV tracks - "+self.title,
        ).make

    job_norm_bigwig = _call(_serial, job_norm, job_bigwig)

    job_utrs = tail_tools.Call_utrs(
        workspace/('peaks','primary-peak'),
        self.reference,
        self.output_dir,
        extension = self.extension,
        ).make

    job_primpeak_counts = analyse_template(
        expressionspace/'primarypeakwise',
        annotations = workspace/('peaks','primary-peak-peaks.gff'),
        extension = 0,
        types = 'peak',
        parts = 'peak',
        title = 'Primary-peakwise expression - ' + self.title,
        file_prefix = file_prefix+'primarypeakwise-',
        ).make

    job_primpeak = _call(_serial, job_utrs, job_primpeak_counts)

    job_peak_primpeak_bigwig = _call(_serial,
        job_peaks,
        _call(_parallel, job_norm_bigwig, job_primpeak))

    job_count = _call(_parallel, job_gene_counts, job_peak_primpeak_bigwig)

    test_jobs = [ ]
    for test in self.tests:
        test_jobs.append(test(
            output_dir = testspace/test.output_dir,
            analysis = self.output_dir,
            ).make)
    job_test = _call(_parallel, *test_jobs)

    job_raw = self._extract_raw

    job_all = _call(_serial, job_count, _call(_parallel, job_raw, job_test))

    job_all()

    #===============================================
    #                   Report
    #===============================================

    r = reporting.Reporter(workspace/'report', self.title, self.file_prefix, style=web.style())

    io.symbolic_link(source=workspace/'bigwigs', link_name=r.workspace/'bigwigs')
    r.write('<div style="font-size: 150%; margin-top: 1em; margin-bottom: 1em;">'
        '<a href="bigwigs/index.html">→ Load tracks into IGV</a></div>')

    tail_tools.Shiny(workspace/('report','shiny'), self.output_dir, title=self.title, species=self.species).run()
    r.write('<div style="font-size: 150%; margin-top: 1em; margin-bottom: 1em;">'
        '<a href="shiny/" target="_blank">→ Interactive report (shiny)</a></div>')

    r.heading('Alignment to reference')

    r.report_logs('alignment-statistics',
        #[ workspace/'stats.txt' ] +
        clipper_logs + filter_logs +
        #filter_polya_logs +
        [ expressionspace/('genewise','aggregate-tail-counts_log.txt') ],
        filter=lambda sample, field: (
            field not in [
                'fragments','fragments aligned to the reference','reads kept',
                'average depth of coverage, ambiguous',
                'average depth of coverage, unambiguous',
                ]
            ),
        )

    r.heading('Genewise expression')
    r.p("This is based on all reads within each gene (possibly from multiple peaks, or decay products).")
    io.symbolic_link(source=expressionspace/('genewise','report'), link_name=r.workspace/'genewise')
    r.p('<a href="genewise/index.html">→ Genewise expression</a>')

    r.heading('Peakwise expression')
    r.p("This shows results from all called peaks.")
    peak_filename = expressionspace/('peakwise','features-with-data.gff')
    r.p(r.get(peak_filename, name='peaks.gff') + ' - peaks called')
    self._describe_peaks(r)
    io.symbolic_link(source=expressionspace/('peakwise','report'), link_name=r.workspace/'peakwise')
    r.p('<a href="peakwise/index.html">→ Peakwise expression</a>')

    r.subheading('Primary-peakwise expression')
    r.p("This is based on the most prominent peak in the 3'UTR for each gene. "
        "(Peak can be up to %d bases downstrand of the annotated 3'UTR end, "
        "but not inside another gene on the same strand.)" % self.extension)
    io.symbolic_link(source=expressionspace/('primarypeakwise','report'), link_name=r.workspace/'primarypeakwise')
    r.p('<a href="primarypeakwise/index.html">→ Primary-peakwise expression</a>')
    r.p(r.get(workspace/('peaks','primary-peak-peaks.gff')) + ' - primary peaks for each gene.')
    r.p(r.get(workspace/('peaks','primary-peak-utrs.gff')) + ' - 3\' UTR regions, based on primary peak call.')
    r.p(r.get(workspace/('peaks','primary-peak-genes.gff')) + ' - full extent of gene, based on primary peak call.')

    if self.tests:
        r.heading('Differential tests')
        for test in self.tests:
            io.symbolic_link(source=testspace/test.output_dir, link_name=r.workspace/('test-'+test.output_dir))
            r.p('<a href="test-%s">→ %s</a>' % (test.output_dir, test.get_title()))

    web.Geneview_webapp(r.workspace/'view').run()

    r.heading('Gene viewers')
    r.p('Having identified interesting genes from heatmaps and differential tests above, '
        'these viewers allow specific genes to be examined in detail.')

    if self.groups:
        r.get(workspace/('peak-shift','grouped.json'))
        r.p('<a href="view.html?json=%sgrouped.json">→ Gene viewer, grouped samples</a>' % r.file_prefix)
    r.get(workspace/('peak-shift','individual.json'))
    r.p('<a href="view.html?json=%sindividual.json">→ Gene viewer, individual samples</a>' % r.file_prefix)

    r.heading('Raw data')

    r.p(r.tar('csv-files', glob.glob(workspace/('raw','*.csv'))))

    r.write('<ul>\n')
    r.write('<li> -info.csv = gene name and product, etc\n')
    r.write('<li> -count.csv = read count\n')
    r.write('<li> -mlog2-RPM.csv = moderated log2 Reads Per Million\n')
    r.write('<li> -tail.csv = average poly(A) tail length\n')
    r.write('<li> -tail-count.csv = poly(A) read count\n')
    r.write('<li> -proportion.csv = proportion of reads with poly(A)\n')
    r.write('<li> -norm.csv = read count normalization used for log2 transformation, heatmaps, differential tests, etc.\n')
    r.write('</ul>\n')

    r.p('This set of genes was used in the analysis:')
    r.p(r.get(reference/'reference.gff') + ' - Reference annotations in GFF3 format')
    r.p(r.get(reference/'utr.gff') + ' - 3\' UTR regions')
    r.p('<b>%d further bases of 3\' extension were allowed</b> beyond the GFF files above '
        '(but not extending into the next gene on the same strand).' % self.extension)

    r.write('<p/><hr>\n')
    r.subheading('About normalization and log transformation')
    r.p('Counts are converted to '
        'log2 Reads Per Million using Anscombe\'s variance stabilizing transformation '
        'for the negative binomial distribution, implemented in '
        'R package "varistran".')

    r.write('<p/><hr>\n')
    r.p('Reference directory '+self.reference)
    r.p('Tail Tools version '+tail_tools.VERSION)
    r.p('Nesoni version '+nesoni.VERSION)

    r.close()
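# The _call/_serial/_parallel helpers used above compose the pipeline's
# jobs into a dependency tree before job_all() runs it. A minimal sketch of
# plausible definitions (assumed for illustration, not copied from the
# actual module): _call defers a function call, _serial runs deferred jobs
# one after another, and _parallel runs them concurrently as nesoni
# processes.
def _sketch_call(func, *args, **kwargs):
    return lambda: func(*args, **kwargs)

def _sketch_serial(*jobs):
    for job in jobs:
        job()

def _sketch_parallel(*jobs):
    with nesoni.Stage() as stage:
        for job in jobs:
            stage.process(job)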