Example 1
    def get(self, filename, name=None, title=None, prefix=None, image=False):
        if name is None:
            name = os.path.split(filename)[1]
        if prefix is None:
            prefix = self.file_prefix
        dest = self.workspace / (prefix + name)

        #Copy(
        #    dest = dest,
        #    source = filename,
        #    ).make()
        io.symbolic_link(source=filename, link_name=dest)

        return self.href(dest, title, image)
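
A minimal usage sketch for this helper (hedged: the report directory and GFF path below are hypothetical, but Reporter(dir, title), p() and get() follow the pattern of the later examples). get() symlinks the given file into the report workspace as file_prefix plus the chosen name, and returns an HTML link to it:

    # Hypothetical usage only; paths are illustrative.
    r = reporting.Reporter('output/report', 'Demo report')
    r.p(r.get('output/expression/peaks.gff', name='peaks.gff') + ' - peaks called')
    r.close()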
Example 3
    def run(self):
        working_dirs = [ ] 
        peaks_file = self.peaks_file       
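        # Directories produced by analyse-polya-batch are expanded into their
        # per-sample working directories; anything else is used as-is.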
        for item in self.working_dirs:
            state_filename = os.path.join(item,'analyse-polya-batch.state')
            if not os.path.exists(state_filename):
                working_dirs.append(item)
            else:
                with open(state_filename,'rb') as f:
                    state = pickle.load(f)

                for sample in state.samples:
                    working_dirs.append(os.path.join(item,'samples',sample.output_dir))
                
                if not peaks_file:
                    peaks_file = os.path.join(self.pipeline_dir, "peaks", "relation-child.gff")

        
        sample_names = [ os.path.split(dirname)[1] for dirname in working_dirs ]
        workspaces = [ working_directory.Working(dirname, must_exist=True) for dirname in working_dirs ]
        
        workspace = self.get_workspace()
        
        with open(workspace/"index.html","wb") as f:
            web.emit(f, "igv.html", dict(
                SAMPLES = json.dumps(sample_names),
                HAVE_NORM = json.dumps(bool(self.norm_file)),
                TITLE = self.title,
            ))
        
        bams = [ item/"alignments_filtered_sorted.bam" for item in workspaces ]
        
        for i in xrange(len(sample_names)):
            io.symbolic_link(bams[i], workspace/(sample_names[i]+".bam"))
            io.symbolic_link(bams[i]+".bai", workspace/(sample_names[i]+".bam.bai"))
        
        io.symbolic_link(peaks_file, workspace/"peaks.gff")

                
        if self.norm_file:
            mults = io.read_grouped_table(self.norm_file)['All']
            norm_mult = [ float(mults[name]['Normalizing.multiplier']) for name in sample_names ]
        
        with nesoni.Stage() as stage:
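            # One combined bigwig set over all samples, then per-sample tracks:
            # raw always, plus normalized tracks when a norm file was given.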
            Bam_to_bigwig(workspace/"total", bam_files=bams, what="ambiguity,span,3p,polyaspan,polya3p",
                ).process_make(stage)
            
            for i in xrange(len(sample_names)):
                for scale_desc, scale in \
                        [("raw",1.0)] + \
                        ([("norm",norm_mult[i])] if self.norm_file else []):
                    Bam_to_bigwig(
                        workspace/(sample_names[i]+"-"+scale_desc), 
                        bam_files=[bams[i]], 
                        what='span,3p,polyaspan,polya3p', scale=scale
                        ).process_make(stage)
Example 4
    def run(self):
        #===============================================
        #                Sanity checks
        #===============================================
        
        assert len(set([ item.output_dir for item in self.samples ])) == len(self.samples), "Duplicate sample name."
        
        all_inputs = [ ]
        for sample in self.samples:
            all_inputs.extend(sample.reads)
        assert len(set(all_inputs)) == len(all_inputs), "Duplicate read filename."
        
        assert len(set([ item.output_dir for item in self.tests ])) == len(self.tests), "Duplicate test name."
        
        for test in self.tests:
            assert not test.analysis, "analysis parameter for tests should not be set, will be filled in automatically"
        
        #===============================================
        #                Run pipeline
        #===============================================
        
        names = [ sample.output_dir for sample in self.samples ]
        
        reference = reference_directory.Reference(self.reference, must_exist=True)
        
        workspace = io.Workspace(self.output_dir, must_exist=False)
        samplespace = io.Workspace(workspace/'samples', must_exist=False)
        expressionspace = io.Workspace(workspace/'expression', must_exist=False)
        testspace = io.Workspace(workspace/'test', must_exist=False)
        
        self._create_json()
                
        file_prefix = self.file_prefix
        if file_prefix and not file_prefix.endswith('-'):
            file_prefix += '-'


        samples = [ ]
        for sample in self.samples:
            samples.append(sample(
                samplespace / sample.output_dir,
                reference = self.reference,
                ))
        
        dirs = [ item.output_dir for item in samples ]
        
        clipper_logs = [ join(item.output_dir, 'clipped_reads_log.txt') for item in samples ]
        filter_logs = [ join(item.output_dir, 'filter_log.txt') for item in samples ]
        filter_polya_logs = [ join(item.output_dir + '-polyA', 'filter_log.txt') for item in samples ]

        analyse_template = tail_lengths.Analyse_tail_counts(
            working_dirs = dirs,
            extension = self.extension,
            annotations = reference/'reference.gff',
            types = self.types,
            parts = self.parts
            )
        
        with nesoni.Stage() as stage:        
            for item in samples:
                item.process_make(stage)
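
        # Everything below is deferred: _call(f, ...) wraps a call as a job (a
        # thunk), and _serial/_parallel compose jobs to run in sequence or
        # concurrently. Nothing executes until job_all() is invoked below.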

        job_gene_counts = analyse_template(
            output_dir = expressionspace/'genewise',
            extension = self.extension,
            title = 'Genewise expression - ' + self.title,
            file_prefix = file_prefix+'genewise-',
            ).make
        
        job_peaks = _call(self._run_peaks, 
            workspace=workspace, 
            expressionspace=expressionspace, 
            reference=reference, 
            dirs = dirs,
            analyse_template = analyse_template,
            file_prefix=file_prefix,
            )
        
        job_norm = nesoni.Norm_from_samples(
            workspace/'norm',
            working_dirs = dirs
            ).make
            
        job_bigwig = bigwig.Polya_bigwigs(
            workspace/'bigwigs', 
            working_dirs = dirs, 
            norm_file = workspace/"norm.csv",
            peaks_file = workspace/("peaks", "relation-child.gff"),
            title = "IGV tracks - "+self.title
            ).make
        
        job_norm_bigwig = _call(_serial, job_norm, job_bigwig)

        job_utrs = tail_tools.Call_utrs(
            workspace/('peaks','primary-peak'),
            self.reference,
            self.output_dir,
            extension=self.extension
            ).make
            
        job_primpeak_counts = analyse_template(
            expressionspace/'primarypeakwise',
            annotations=workspace/('peaks','primary-peak-peaks.gff'), 
            extension=0,
            types='peak',
            parts='peak',
            title='Primary-peakwise expression - ' + self.title,
            file_prefix=file_prefix+'primarypeakwise-',
            ).make
        
        job_primpeak = _call(_serial, job_utrs, job_primpeak_counts)
        
        job_peak_primpeak_bigwig = _call(_serial, 
            job_peaks, 
            _call(_parallel, job_norm_bigwig, job_primpeak))
        
        job_count = _call(_parallel, job_gene_counts, job_peak_primpeak_bigwig)
            
        test_jobs = [ ]
        for test in self.tests:
            test_jobs.append(test(
                output_dir = testspace/test.output_dir,
                analysis = self.output_dir,
                ).make)

        job_test = _call(_parallel, *test_jobs)

        job_raw = self._extract_raw

        job_all = _call(_serial, job_count, _call(_parallel, job_raw, job_test))        
        
        job_all()



        #===============================================
        #                   Report        
        #===============================================

        r = reporting.Reporter(workspace/'report', self.title, self.file_prefix, style=web.style())
        
        io.symbolic_link(source=workspace/'bigwigs', link_name=r.workspace/'bigwigs')
        r.write('<div style="font-size: 150%; margin-top: 1em; margin-bottom: 1em;"><a href="bigwigs/index.html">&rarr; Load tracks into IGV</a></div>')

        tail_tools.Shiny(workspace/('report','shiny'), self.output_dir, title=self.title, species=self.species).run()
        r.write('<div style="font-size: 150%; margin-top: 1em; margin-bottom: 1em;"><a href="shiny/" target="_blank">&rarr; Interactive report (shiny)</a></div>')
        
        r.heading('Alignment to reference')
        
        r.report_logs('alignment-statistics',
            #[ workspace/'stats.txt' ] +
            clipper_logs + filter_logs + #filter_polya_logs +
            [ expressionspace/('genewise','aggregate-tail-counts_log.txt') ],
            filter=lambda sample, field: (
                field not in [
                    'fragments','fragments aligned to the reference','reads kept',
                    'average depth of coverage, ambiguous',
                    'average depth of coverage, unambiguous',
                    ]
            ),
        )
        

        r.heading('Genewise expression')
        
        r.p("This is based on all reads within each gene (possibly from multiple peaks, or decay products).")
        
        io.symbolic_link(source=expressionspace/('genewise','report'),link_name=r.workspace/'genewise')
        r.p('<a href="genewise/index.html">&rarr; Genewise expression</a>')


        r.heading('Peakwise expression')
        
        r.p("This shows results from all called peaks.")
        
        peak_filename = expressionspace/('peakwise','features-with-data.gff')
        r.p(r.get(peak_filename, name='peaks.gff') + ' - peaks called')        

        self._describe_peaks(r)
        
        io.symbolic_link(source=expressionspace/('peakwise','report'),link_name=r.workspace/'peakwise')
        r.p('<a href="peakwise/index.html">&rarr; Peakwise expression</a>')


        r.subheading('Primary-peakwise expression')
        
        r.p("This is based on the most prominent peak in the 3'UTR for each gene. (Peak can be up to %d bases downstrand of the annotated 3'UTR end, but not inside another gene on the same strand.)" % self.extension)
        
        io.symbolic_link(source=expressionspace/('primarypeakwise','report'),link_name=r.workspace/'primarypeakwise')
        r.p('<a href="primarypeakwise/index.html">&rarr; Primary-peakwise expression</a>')

        r.p(r.get(workspace/('peaks','primary-peak-peaks.gff')) + ' - primary peaks for each gene.')
        r.p(r.get(workspace/('peaks','primary-peak-utrs.gff')) + ' - 3\' UTR regions, based on primary peak call.')
        r.p(r.get(workspace/('peaks','primary-peak-genes.gff')) + ' - full extent of gene, based on primary peak call.')


        if self.tests:
            r.heading('Differential tests')
            for test in self.tests:
                io.symbolic_link(source=testspace/test.output_dir,link_name=r.workspace/('test-'+test.output_dir))
                r.p('<a href="test-%s">&rarr; %s</a> '
                    % (test.output_dir, test.get_title()))


        web.Geneview_webapp(r.workspace/'view').run()        
                
        r.heading('Gene viewers')
        r.p('Having identified interesting genes from heatmaps and differential tests above, '
            'these viewers allow specific genes to be examined in detail.')
        
        if self.groups:
            r.get(workspace/('peak-shift','grouped.json'))
            r.p('<a href="view.html?json=%sgrouped.json">&rarr; Gene viewer, grouped samples</a>' % r.file_prefix)
        r.get(workspace/('peak-shift','individual.json'))
        r.p('<a href="view.html?json=%sindividual.json">&rarr; Gene viewer, individual samples</a>' % r.file_prefix)
        
        
        r.heading('Raw data')
        
        r.p(r.tar('csv-files',glob.glob(workspace/('raw','*.csv'))))
        
        r.write('<ul>\n')
        r.write('<li> -info.csv = gene name and product, etc\n')
        r.write('<li> -count.csv = read count\n')
        r.write('<li> -mlog2-RPM.csv = moderated log2 Reads Per Million\n')
        r.write('<li> -tail.csv = average poly(A) tail length\n')
        r.write('<li> -tail-count.csv = poly(A) read count\n')
        r.write('<li> -proportion.csv = proportion of reads with poly(A)\n')
        r.write('<li> -norm.csv = read count normalization used for log2 transformation, heatmaps, differential tests, etc etc\n')
        r.write('</ul>\n')

        r.p('This set of genes was used in the analysis:')
        
        r.p(r.get(reference/'reference.gff') + ' - Reference annotations in GFF3 format')
        r.p(r.get(reference/'utr.gff') + ' - 3\' UTR regions')

        r.p('<b>%d further bases 3\' extension was allowed</b> beyond the GFF files above (but not extending into the next gene on the same strand).' % self.extension)

        r.write('<p/><hr>\n')
        r.subheading('About normalization and log transformation')

        r.p('Counts are converted to '
            'log2 Reads Per Million using Anscombe\'s variance stabilizing transformation '
            'for the negative binomial distribution, implemented in '
            'R package "varistran".')
                
        r.write('<p/><hr>\n')

        r.p('Reference directory '+self.reference)
        r.p('Tail Tools version '+tail_tools.VERSION)
        r.p('Nesoni version '+nesoni.VERSION)
        
        r.close()
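
The _call, _serial and _parallel helpers used above are not shown on this page. A plausible minimal sketch of the shape their usage implies (an assumption, not the project's actual definitions):

    # Assumed shapes only, inferred from how they are called above.
    def _call(func, *args, **kwargs):
        # Defer a call: return a zero-argument job that runs func later.
        return lambda: func(*args, **kwargs)

    def _serial(*jobs):
        # Run deferred jobs one after another.
        for job in jobs:
            job()

    def _parallel(*jobs):
        # The real helper presumably runs jobs concurrently; running them in
        # sequence is the simplest faithful stand-in.
        for job in jobs:
            job()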
Example 5
    def run(self):
        context = self.get_context()
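
        # Per-sample processing runs first; the variant and expression
        # analyses follow in a second stage once all samples have finished.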

        with nesoni.Stage() as stage:
            for sample in context.samples:
                sample.process_make(stage)

        with nesoni.Stage() as stage:
            if context.variants:
                context.variants.process_make(stage)

            if context.expression:
                context.expression.process_make(stage)

        if self.igv_plots:
            plot_space = workspace.Workspace(context.space / 'plot', False)
            self.igv_plots(
                prefix=plot_space / ('plot'),
                genome=context.reference.get_genome_filename(),
                norm_file=context.space /
                ('expression', 'norm.csv') if context.expression else None,
                working_dirs=context.sample_dirs,
            ).make()

        # =================================================================================
        # =================================================================================
        # =================================================================================

        reporter = reporting.Reporter(context.space / 'report',
                                      self.report_title, context.name)

        reporter.report_logs(
            'alignment-statistics',
            [
                sample.get_context().clip.log_filename()
                for sample in context.samples if sample.clip
            ] + [
                sample.get_context().filter.log_filename() if not sample.count
                else sample.get_context().count.log_filename()
                for sample in context.samples if sample.filter or sample.count
            ],
            filter=lambda sample, field: field != 'fragments',
        )

        if self.expression:
            io.symbolic_link(source=context.space / ('expression', 'report'),
                             link_name=context.space /
                             ('report', 'expression'))
            reporter.heading(
                '<a href="expression/index.html">&gt; Expression analysis</a>')

        if self.variants:
            io.symbolic_link(source=context.space / ('variants', 'report'),
                             link_name=context.space / ('report', 'variants'))
            reporter.heading(
                '<a href="variants/index.html">&gt; Variants analysis</a>')

        if self.igv_plots:
            reporter.heading('IGV plots')
            reporter.p(
                'These files show the depth of coverage. They can be viewed with the IGV genome browser.'
            )

            genome_files = []
            if self.include_genome:
                genome_filename = context.reference.get_genome_filename()
                genome_dir = context.reference.get_genome_dir()
                genome_files.append(genome_filename)
                if genome_dir:
                    base = os.path.split(genome_dir)[1]
                    for filename in os.listdir(genome_dir):
                        genome_files.append(
                            (os.path.join(genome_dir, filename),
                             os.path.join(base, filename)))

            reporter.p(
                reporter.tar('igv-plots',
                             genome_files + glob.glob(plot_space / '*.tdf')))

        if self.include_bams:
            reporter.heading('BAM files')

            reporter.p(
                'These BAM files contain the alignments of reads to the reference sequences.'
                ' They can also be viewed using IGV.')

            bam_files = []
            for sample in self.samples:
                name = sample.output_dir
                bam_files.append(
                    (context.space /
                     ('samples', name, 'alignments_filtered_sorted.bam'),
                     name + '.bam'))
                bam_files.append(
                    (context.space /
                     ('samples', name, 'alignments_filtered_sorted.bam.bai'),
                     name + '.bam.bai'))
            reporter.p(reporter.tar('bam-files', bam_files))

        reporter.write('<p/><hr/>\n')
        reporter.p('nesoni version ' + nesoni.VERSION)
        reporter.close()
Example 6
    def run(self):
        assert self.reference is not None, 'No reference directory given.'
        space = self.get_workspace()
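
        # Overall flow: freebayes calls raw variants, vcf_filter keeps the
        # high-quality calls, snpEff optionally annotates them, and the final
        # VCF is linked as variants.vcf.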

        if self.analysis:
            nesoni.Power_variant_call(
                space / 'power',
                template__analysis=self.analysis,
                template__freebayes=self.freebayes,
                template__vcf_filter=self.vcf_filter,
                legacy=False,
            ).make()

        self.freebayes(
            space / 'variants-raw',
            samples=self.samples,
        ).make()

        self.vcf_filter(
            space / 'variants-filtered',
            space / 'variants-raw.vcf',
        ).make()
        filename = space / 'variants-filtered.vcf'

        if self.snpeff:
            self.snpeff(space / 'variants-filtered-annotated', self.reference,
                        space / 'variants-filtered.vcf').make()
            filename = space / 'variants-filtered-annotated.vcf'

        io.symbolic_link(source=filename, link_name=space / 'variants.vcf')
        if os.path.exists(filename + '.idx'):
            io.symbolic_link(source=filename + '.idx',
                             link_name=space / 'variants.vcf.idx')

        nesoni.Vcf_patch(space / 'patched', self.reference,
                         space / 'variants.vcf').make()

        nesoni.Vcf_nway(
            space / 'net',
            space / 'variants.vcf',
            require='all',
            as_='splitstree',
        ).make()

        reporter = reporting.Reporter(space / 'report', 'Variants analysis')

        reporter.report_logs(None, [space / 'variants-filtered_log.txt'],
                             renaming={
                                 'input': 'Found by freebayes',
                                 'kept': 'Kept after quality filtering'
                             })

        reporter.p(reporter.get(filename))
        if os.path.exists(filename + '.idx'):
            reporter.p(
                reporter.get(filename + '.idx') +
                ' (needed to view VCF file in IGV)')

        reporter.p(reporter.get(space / 'net.svg', title='Phylogenetic net'))

        if self.analysis:
            reporter.p(
                reporter.get(space / 'power_log.txt', title='Power report') +
                '<br/>(Test of the ability of the pipeline to call various variants at various depths of coverage and in the presence of errors, using synthetic reads.)'
            )

        reporter.close()
Example 7
    def run(self):
        names = [ sample.output_dir for sample in self.samples ]
            #os.path.splitext(os.path.split(item)[1])[0]
            #for item in self.reads
            #]
        
        reference = reference_directory.Reference(self.reference, must_exist=True)
        
        workspace = io.Workspace(self.output_dir, must_exist=False)
        samplespace = io.Workspace(workspace/'samples', must_exist=False)
        plotspace = io.Workspace(workspace/'plots', must_exist=False)
        expressionspace = io.Workspace(workspace/'expression', must_exist=False)
        testspace = io.Workspace(workspace/'test', must_exist=False)
        testspace_dedup = io.Workspace(workspace/'test-dedup', must_exist=False)
                
        file_prefix = self.file_prefix
        if file_prefix and not file_prefix.endswith('-'):
            file_prefix += '-'


        #dirs = [
        #    workspace/item
        #    for item in names
        #]

        samples = [ ]
        for sample in self.samples:
            samples.append(sample(
                samplespace / sample.output_dir,
                reference = self.reference,
                ))
        
        dirs = [ item.output_dir for item in samples ]
        polya_dirs = [ item + '-polyA' for item in dirs ]        
        interleaved = [ item2 for item in zip(dirs,polya_dirs) for item2 in item ]
        
        clipper_logs = [ join(item.output_dir, 'clipped_reads_log.txt') for item in samples ]
        filter_logs = [ join(item.output_dir, 'filter_log.txt') for item in samples ]
        filter_polya_logs = [ join(item.output_dir + '-polyA', 'filter_log.txt') for item in samples ]                
        #filter_logs = [ item.get_filter_action().log_filename() for item in samples ]
        #filter_polya_logs = [ item.get_polya_filter_action().log_filename() for item in samples ]

        analyse_template = tail_lengths.Analyse_tail_counts(
            working_dirs = dirs,
            saturation = 0,
            extension = self.extension,
            annotations = reference/'reference.gff',
            types = 'gene',
            )
        

        with nesoni.Stage() as stage:        
            for item in samples:
                item.process_make(stage)

        
        
        nesoni.Norm_from_samples(
            workspace/'norm',
            working_dirs = dirs
            ).make()
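
        # Reuse the per-sample normalization for the matching -polyA
        # directories by rewriting the Name column of norm.csv.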

        def writer():
            for row in io.read_table(workspace/'norm.csv'):
                row['Name'] = row['Name']+'-polyA'
                yield row
        io.write_csv(workspace/'norm-polyA.csv', writer(), comments=['Normalization'])


        with nesoni.Stage() as stage:
            if self.include_plots:        
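                # Two IGV plot sets: one over all reads, one over the
                # poly(A)-only directories, each with its normalization file.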
                for plot_name, directories, norm_filename in [
                      ('all',   dirs,       workspace/'norm.csv'),
                      ('polyA', polya_dirs, workspace/'norm-polyA.csv'),
                      ]:
                    nesoni.IGV_plots(
                        plotspace/plot_name,
                        working_dirs = directories,
                        label_prefix = plot_name+' ',
                        raw = True,
                        norm = True,
                        genome = reference.get_genome_filename(),
                        norm_file = norm_filename,
                        #delete_igv = False,
                        ).process_make(stage)

            analyse_gene_counts_0 = analyse_template(
                output_dir = expressionspace/'genewise',
                saturation = 0,
                extension = self.extension,
                title = 'Genewise expression - ' + self.title,
                file_prefix = file_prefix+'genewise-',
                )
            analyse_gene_counts_0.process_make(stage)
            
            analyse_gene_counts_1 = analyse_template(
                output_dir = expressionspace/'genewise-dedup',
                saturation = 1,
                title = 'Genewise expression with read deduplication - ' + self.title,
                file_prefix = file_prefix+'genewise-dedup-',
                )
            analyse_gene_counts_1.process_make(stage)
            
            stage.process(self._run_peaks, 
                workspace=workspace, expressionspace=expressionspace, reference=reference, 
                polya_dirs=polya_dirs, analyse_template=analyse_template, file_prefix=file_prefix,
                )
            
        with nesoni.Stage() as stage:
            for test in self.tests:
                test(
                    output_dir = testspace/test.output_dir,
                    analysis = self.output_dir
                    ).process_make(stage)

                test(
                    output_dir = testspace_dedup/test.output_dir,
                    analysis = self.output_dir,
                    dedup = True,
                    ).process_make(stage)
        
        #===============================================
        #                   Report        
        #===============================================

        r = reporting.Reporter(os.path.join(self.output_dir, 'report'), self.title, self.file_prefix)
                    
        r.heading('Alignment to reference')
        
        r.report_logs('alignment-statistics',
            #[ workspace/'stats.txt' ] +
            clipper_logs + filter_logs + #filter_polya_logs +
            [ expressionspace/('genewise','aggregate-tail-counts_log.txt') ],
            filter=lambda sample, field: (
                field not in [
                    'fragments','fragments aligned to the reference','reads kept',
                    'average depth of coverage, ambiguous',
                    'average depth of coverage, unambiguous',
                    ]
            ),
        )


        if self.include_plots:        
            r.heading('IGV plots')
            
            r.p('These files show the depth of coverage. They can be viewed with the IGV genome browser.')
            
            genome_files = [ ]
            if self.include_genome:
                genome_files.append(reference.get_genome_filename())
                genome_dir = reference.get_genome_dir()
                base = os.path.split(genome_dir)[1]
                for filename in os.listdir(genome_dir):
                    genome_files.append((
                        os.path.join(genome_dir, filename),
                        os.path.join(base, filename)
                        ))
            
            r.p(r.tar('igv-plots',
                genome_files +
                glob.glob(plotspace/'*.tdf')
                ))
        

        if self.include_bams:
            r.heading('BAM files')
            
            r.p('These BAM files contain the alignments of reads to the reference sequences.')
            
            r.p('Reads with a poly(A) tail have an \'AN\' attribute giving the length of non-templated poly(A) sequence. '
                'Tail-tools only treats a read as having a tail if this length is at least 4.')
            
            bam_files = [ ]
            for name in names:
                bam_files.append( (samplespace/(name,'alignments_filtered_sorted.bam'),name+'.bam') )
                bam_files.append( (samplespace/(name,'alignments_filtered_sorted.bam.bai'),name+'.bam.bai') )
            r.p(r.tar('bam-files', bam_files))


        r.heading('Genewise expression')
        
        io.symbolic_link(source=expressionspace/('genewise','report'),link_name=r.workspace/'genewise')
        r.p('<a href="genewise/index.html">&rarr; Genewise expression</a>')

        io.symbolic_link(source=expressionspace/('genewise-dedup','report'),link_name=r.workspace/'genewise-dedup')
        r.p('<a href="genewise-dedup/index.html">&rarr; Genewise expression with read deduplication</a>')


        r.heading('Peakwise expression')

        web.Geneview_webapp(r.workspace/'view').run()        
        
        peak_filename = expressionspace/('peakwise','features-with-data.gff')
        n_peaks = len(list(annotation.read_annotations(peak_filename)))
        r.p('%d peaks called (%d poly(A) reads were required to call a peak).' % (n_peaks, self.peak_min_depth))
        
        r.p(r.get(peak_filename, name='peaks.gff') + ' - peaks called')        

        #if self.groups:
            #r.subheading('Peak shift between groups')
            #r.p(r.get(workspace/('peak-shift','grouped.csv')) + ' - genes with a potential peak shift')        
            #r.get(workspace/('peak-shift','grouped.json'))

        #r.subheading('Peak shift between samples')
        #r.p(r.get(workspace/('peak-shift','individual.csv')) + ' - genes with a potential peak shift')        
        #r.get(workspace/('peak-shift','individual.json'))

        
        io.symbolic_link(source=expressionspace/('peakwise','report'),link_name=r.workspace/'peakwise')
        r.p('<a href="peakwise/index.html">&rarr; Peakwise expression</a>')

        io.symbolic_link(source=expressionspace/('peakwise-dedup','report'),link_name=r.workspace/'peakwise-dedup')
        r.p('<a href="peakwise-dedup/index.html">&rarr; Peakwise expression with read deduplication</a>')
                
        if self.tests:
            r.heading('Differential tests')
            for test in self.tests:
                io.symbolic_link(source=testspace/test.output_dir,link_name=r.workspace/('test-'+test.output_dir))
                io.symbolic_link(source=testspace_dedup/test.output_dir,link_name=r.workspace/('test-dedup-'+test.output_dir))
                r.p('<a href="test-%s">&rarr; %s</a> '
                    ' &nbsp; <a href="test-dedup-%s" style="font-size: 66%%">[&rarr; Deduplicated version]</a>' % (test.output_dir, test.get_title(), test.output_dir))

        r.heading('Gene viewers')
        r.p('Having identified interesting genes from heatmaps and differential tests above, '
            'these viewers allow specific genes to be examined in detail.')
        
        if self.groups:
            r.p('<a href="view.html?json=%sgrouped.json">&rarr; Gene viewer, grouped samples</a>' % r.file_prefix)
        r.p('<a href="view.html?json=%sindividual.json">&rarr; Gene viewer, individual samples</a>' % r.file_prefix)
       

        r.write('<p/><hr>\n')
        
        r.p('Note: Use deduplicated versions with care. '
            'They may possibly provide more significant results, however they are less quantitative. '
            'Read deduplication involves throwing away a large amount of data, much of which will not be a technical artifact. '
            'Deduplicated versions might best be viewed as a check on data quality.')
        
        r.p('This set of genes was used in the analysis:')
        
        r.p(r.get(reference/'reference.gff') + ' - Reference annotations in GFF3 format')
        r.p(r.get(reference/'utr.gff') + ' - 3\' UTR regions')

        r.p('tail-tools version '+tail_tools.VERSION)
        r.p('nesoni version '+nesoni.VERSION)
        #r.p('SHRiMP version '+grace.get_shrimp_2_version())
        
        r.close()
Example 9
    def run(self):
        context = self.get_context()

        with nesoni.Stage() as stage:
            for sample in context.samples:
                sample.process_make(stage)
            
        with nesoni.Stage() as stage:
            if context.variants:
                context.variants.process_make(stage)
    
            if context.expression:
                context.expression.process_make(stage)

        if self.igv_plots:
            plot_space = workspace.Workspace(context.space/'plot',False)
            self.igv_plots(
                prefix = plot_space / ('plot'),
                genome = context.reference.get_genome_filename(),
                norm_file = context.space/('expression','norm.csv') if context.expression else None,
                working_dirs = context.sample_dirs,
                ).make()

        # =================================================================================
        # =================================================================================
        # =================================================================================

        reporter = reporting.Reporter(context.space / 'report', self.report_title, context.name)
        
        reporter.report_logs('alignment-statistics',
            [ sample.get_context().clip.log_filename() 
                for sample in context.samples
                if sample.clip
                ] +
            ([ sample.get_context().filter.log_filename() 
                for sample in context.samples 
                if sample.filter 
                ] if not context.expression else [ ]) +
            ([ context.space/('expression','counts_log.txt') ] if context.expression else [ ]),
            filter=lambda sample,field: field != 'fragments',
            )
        
        if self.expression:
            io.symbolic_link(source=context.space/('expression','report'),link_name=context.space/('report','expression'))
            reporter.heading('<a href="expression/index.html">&gt; Expression analysis</a>')
        
        if self.variants:
            io.symbolic_link(source=context.space/('variants','report'),link_name=context.space/('report','variants'))
            reporter.heading('<a href="variants/index.html">&gt; Variants analysis</a>')
                
        if self.igv_plots:
            reporter.heading('IGV plots')            
            reporter.p('These files show the depth of coverage. They can be viewed with the IGV genome browser.')

            genome_files = [ ]
            if self.include_genome:
                genome_filename = context.reference.get_genome_filename()
                genome_dir = context.reference.get_genome_dir()
                genome_files.append(genome_filename)
                if genome_dir:
                    base = os.path.split(genome_dir)[1]
                    for filename in os.listdir(genome_dir):
                        genome_files.append((
                            os.path.join(genome_dir, filename),
                            os.path.join(base, filename)
                            ))
            
            reporter.p(reporter.tar('igv-plots',
                genome_files +
                glob.glob(plot_space/'*.tdf')
                ))

        if self.include_bams:
            reporter.heading('BAM files')
            
            reporter.p('These BAM files contain the alignments of reads to the reference sequences.'
                       ' They can also be viewed using IGV.')
            
            bam_files = [ ]
            for sample in self.samples:
                name = sample.output_dir
                bam_files.append( (context.space/('samples',name,'alignments_filtered_sorted.bam'),name+'.bam') )
                bam_files.append( (context.space/('samples',name,'alignments_filtered_sorted.bam.bai'),name+'.bam.bai') )
            reporter.p(reporter.tar('bam-files', bam_files))
        
        reporter.write('<p/><hr/>\n')
        reporter.p('nesoni version '+nesoni.VERSION)
        reporter.close()
Example 10
    def run(self):
        assert self.reference is not None, 'No reference directory given.'
        space = self.get_workspace()
        
        if self.analysis:
            nesoni.Power_variant_call(
                space/'power',
                template__analysis   = self.analysis,
                template__freebayes  = self.freebayes,
                template__vcf_filter = self.vcf_filter,
                legacy = False,
                ).make()
        
        self.freebayes(
            space / 'variants-raw',
            samples=self.samples,
            ).make()
        
        self.vcf_filter(
            space / 'variants-filtered',
            space / 'variants-raw.vcf',
            ).make()        
        filename = space/'variants-filtered.vcf'
        
        if self.snpeff:
            self.snpeff(
                space / 'variants-filtered-annotated',
                self.reference,
                space / 'variants-filtered.vcf'
                ).make()
            filename = space / 'variants-filtered-annotated.vcf'
        
        io.symbolic_link(source=filename, link_name=space / 'variants.vcf')
        if os.path.exists(filename+'.idx'):
            io.symbolic_link(source=filename+'.idx', link_name=space / 'variants.vcf.idx')
        
        nesoni.Vcf_patch(
            space / 'patched',
            self.reference,
            space / 'variants.vcf'
            ).make()
        
        nesoni.Vcf_nway(
            space / 'net',
            space / 'variants.vcf',
            require='all',
            as_='splitstree',
            ).make()

        reporter = reporting.Reporter(space / 'report', 'Variants analysis')
                
        reporter.report_logs(None, [ space / 'variants-filtered_log.txt' ], 
            renaming = {'input':'Found by freebayes', 'kept':'Kept after quality filtering'})
        
        reporter.p(reporter.get(filename))
        if os.path.exists(filename+'.idx'):
            reporter.p(reporter.get(filename + '.idx') + ' (needed to view VCF file in IGV)')
        
        reporter.p(reporter.get(space / 'net.svg', title='Phylogenetic net'))
        
        if self.analysis:
            reporter.p(reporter.get(space / 'power_log.txt', title='Power report') +
                       '<br/>(Test of the ability of the pipeline to call various variants at various depths of coverage and in the presence of errors, using synthetic reads.)'
                       )
        
        reporter.close()
Example 11
    def run(self):
        working_dirs = []
        peaks_file = self.peaks_file
        for item in self.working_dirs:
            state_filename = os.path.join(item, 'analyse-polya-batch.state')
            if not os.path.exists(state_filename):
                working_dirs.append(item)
            else:
                with open(state_filename, 'rb') as f:
                    state = pickle.load(f)

                for sample in state.samples:
                    working_dirs.append(
                        os.path.join(item, 'samples', sample.output_dir))

                if not peaks_file:
                    peaks_file = os.path.join(self.pipeline_dir, "peaks",
                                              "relation-child.gff")

        sample_names = [os.path.split(dirname)[1] for dirname in working_dirs]
        workspaces = [
            working_directory.Working(dirname, must_exist=True)
            for dirname in working_dirs
        ]

        workspace = self.get_workspace()

        with open(workspace / "index.html", "wb") as f:
            web.emit(
                f, "igv.html",
                dict(
                    SAMPLES=json.dumps(sample_names),
                    HAVE_NORM=json.dumps(bool(self.norm_file)),
                    TITLE=self.title,
                ))

        bams = [item / "alignments_filtered_sorted.bam" for item in workspaces]

        for i in xrange(len(sample_names)):
            io.symbolic_link(bams[i], workspace / (sample_names[i] + ".bam"))
            io.symbolic_link(bams[i] + ".bai",
                             workspace / (sample_names[i] + ".bam.bai"))

        io.symbolic_link(peaks_file, workspace / "peaks.gff")

        if self.norm_file:
            mults = io.read_grouped_table(self.norm_file)['All']
            norm_mult = [
                float(mults[name]['Normalizing.multiplier'])
                for name in sample_names
            ]

        with nesoni.Stage() as stage:
            Bam_to_bigwig(
                workspace / "total",
                bam_files=bams,
                what="ambiguity,span,3p,polyaspan,polya3p",
            ).process_make(stage)

            for i in xrange(len(sample_names)):
                for scale_desc, scale in \
                        [("raw",1.0)] + \
                        ([("norm",norm_mult[i])] if self.norm_file else []):
                    Bam_to_bigwig(workspace /
                                  (sample_names[i] + "-" + scale_desc),
                                  bam_files=[bams[i]],
                                  what='span,3p,polyaspan,polya3p',
                                  scale=scale).process_make(stage)