Exemple #1
0
    def compatible(self, options):
        """Report scaffolds whose statistics are compatible with a bin.

        Checks that the reference and scaffold statistics files exist,
        makes sure the output directory is present, loads the scaffold
        statistics, derives genome statistics from them, runs the reference
        homology check and writes the report to 'compatible.tsv' inside
        options.output_dir.
        """

        check_file_exists(options.reference_file)
        check_file_exists(options.scaffold_stats_file)
        make_sure_path_exists(options.output_dir)

        # load scaffold statistics, then derive the genome statistics
        self.logger.info('Reading scaffold statistics.')
        stats_per_scaffold = ScaffoldStats()
        stats_per_scaffold.read(options.scaffold_stats_file)
        stats_per_genome = GenomeStats().run(stats_per_scaffold)

        # putative homologs to the reference genomes
        min_gene_perc = float(options.perc_genes)
        homologs = Reference(1, None).homology_check(options.reference_file,
                                                     options.min_genes,
                                                     min_gene_perc)

        # scaffolds compatible with bins
        compat_finder = Outliers()
        report_path = os.path.join(options.output_dir, 'compatible.tsv')
        compat_finder.compatible(homologs,
                                 stats_per_scaffold,
                                 stats_per_genome,
                                 options.gc_perc,
                                 options.td_perc,
                                 options.cov_corr,
                                 options.cov_perc,
                                 options.report_type,
                                 report_path)

        self.logger.info('Results written to: ' + report_path)
Exemple #2
0
    def modify(self, options):
        """Apply the requested modification to a genome and write the result.

        Exactly one kind of modification is honoured: explicit add/remove
        lists, an outlier file, or a compatible file. A warning is logged
        and the process exits through sys.exit() when no modification is
        requested or when mutually exclusive options are combined.
        """
        banner = ('',
                  '*******************************************************************************',
                  ' [RefineM - modify] Modifying scaffolds in genome.',
                  '*******************************************************************************')
        for banner_line in banner:
            self.logger.info(banner_line)

        genome_out_dir = os.path.dirname(options.output_genome)
        make_sure_path_exists(genome_out_dir)

        # the two families of options are mutually exclusive
        manual_edit = options.add or options.remove
        file_edit = options.outlier_file or options.compatible_file

        if not manual_edit and not file_edit:
            self.logger.warning('  [Warning] No modification to bin requested.\n')
            sys.exit()

        if manual_edit and file_edit:
            self.logger.warning("  [Warning] The 'outlier_file' and 'compatible_file' options cannot be specified with 'add' or 'remove'.\n")
            sys.exit()

        if options.outlier_file and options.compatible_file:
            self.logger.warning("  [Warning] The 'outlier_file' and 'compatible_file' options cannot be specified at the same time.\n")
            sys.exit()

        failed_to_add, failed_to_remove = [], []
        if manual_edit:
            failed_to_add, failed_to_remove = genome_tk.modify(
                options.genome_file,
                options.scaffold_file,
                options.add,
                options.remove,
                options.output_genome)
        elif options.outlier_file:
            bin_editor = Outliers()
            bin_editor.remove_outliers(options.genome_file,
                                       options.outlier_file,
                                       options.output_genome)
        elif options.compatible_file:
            bin_editor = Outliers()
            if options.unique_only:
                add_scaffolds = bin_editor.add_compatible_unique
            else:
                add_scaffolds = bin_editor.add_compatible_closest
            add_scaffolds(options.scaffold_file,
                          options.genome_file,
                          options.compatible_file,
                          options.output_genome)

        # report any sequences that could not be added or removed
        failure_reports = (
            (failed_to_add, '  [Warning] Failed to add the following sequence(s):'),
            (failed_to_remove, '  [Warning] Failed to remove the following sequence(s):'),
        )
        for failed_ids, heading in failure_reports:
            if failed_ids:
                self.logger.warning(heading)
                for seq_id in failed_ids:
                    self.logger.warning('    %s' % seq_id)

        self.logger.info('')
        self.logger.info('  Modified genome written to: ' + options.output_genome)

        self.time_keeper.print_time_stamp()
Exemple #3
0
    def filter_bins(self, options):
        """Write a filtered copy of every genome in a directory.

        Genome files are gathered from options.genome_nt_dir using
        options.genome_ext. If the nucleotide check fails, a warning is
        logged and the process exits through sys.exit(). Otherwise each
        genome is passed to Outliers.remove_outliers together with
        options.filter_file, and the output is named
        '<name>.filtered.<ext>' inside options.output_dir.
        """

        make_sure_path_exists(options.output_dir)

        genome_paths = self._genome_files(options.genome_nt_dir,
                                          options.genome_ext)
        all_nucleotide = self._check_nuclotide_seqs(genome_paths)
        if not all_nucleotide:
            self.logger.warning('All files must contain nucleotide sequences.')
            sys.exit()

        scaffold_filter = Outliers()
        for genome_path in genome_paths:
            filtered_name = (remove_extension(genome_path)
                             + '.filtered.'
                             + options.genome_ext)
            filtered_path = os.path.join(options.output_dir, filtered_name)
            scaffold_filter.remove_outliers(genome_path,
                                            options.filter_file,
                                            filtered_path,
                                            options.modified_only)

        self.logger.info('Modified genome written to: ' + options.output_dir)
Exemple #4
0
    def outliers(self, options):
        """Identify outlier scaffolds and, unless disabled, plot them.

        Loads the scaffold statistics, derives genome statistics, writes
        the outlier report to 'outliers.tsv' inside options.output_dir and,
        when options.no_plots is not set, writes plots to a 'plots'
        subdirectory of the output directory.
        """

        check_file_exists(options.scaffold_stats_file)
        make_sure_path_exists(options.output_dir)

        self.logger.info('Reading scaffold statistics.')
        stats_per_scaffold = ScaffoldStats()
        stats_per_scaffold.read(options.scaffold_stats_file)
        stats_per_genome = GenomeStats().run(stats_per_scaffold)

        # write the outlier report
        detector = Outliers()
        report_path = os.path.join(options.output_dir, 'outliers.tsv')
        detector.identify(stats_per_scaffold,
                          stats_per_genome,
                          options.gc_perc,
                          options.td_perc,
                          options.cov_corr,
                          options.cov_perc,
                          options.report_type,
                          report_path)
        self.logger.info('Outlier information written to: ' + report_path)

        # plotting is optional
        if options.no_plots:
            return

        plot_dir = os.path.join(options.output_dir, 'plots')
        make_sure_path_exists(plot_dir)

        detector.plot(stats_per_scaffold,
                      stats_per_genome,
                      detector.gc_dist,
                      detector.td_dist,
                      options,
                      options.highlight_file,
                      options.links_file,
                      options.individual_plots,
                      plot_dir)

        self.logger.info('Outlier plots written to: ' + plot_dir)
Exemple #5
0
    def compatible(self, options):
        """Report scaffolds whose statistics are compatible with a bin.

        Logs a command banner, checks the input files, loads the scaffold
        statistics, derives genome statistics, runs the reference homology
        check and writes the report to 'compatible.tsv' inside
        options.output_dir. A time stamp is printed at the end.
        """
        banner = ('',
                  '*******************************************************************************',
                  '[RefineM - compatible] Identify scaffolds with compatible genomic statistics.',
                  '*******************************************************************************')
        for banner_line in banner:
            self.logger.info(banner_line)

        check_file_exists(options.reference_file)
        check_file_exists(options.scaffold_stats_file)
        make_sure_path_exists(options.output_dir)

        # load scaffold statistics, then derive the genome statistics
        self.logger.info('')
        self.logger.info('  Reading scaffold statistics.')
        stats_per_scaffold = ScaffoldStats()
        stats_per_scaffold.read(options.scaffold_stats_file)
        stats_per_genome = GenomeStats().run(stats_per_scaffold)

        # putative homologs to the reference genomes
        min_gene_perc = float(options.perc_genes)
        homologs = Reference(1, None).homology_check(options.reference_file,
                                                     options.min_genes,
                                                     min_gene_perc)

        # scaffolds compatible with bins
        compat_finder = Outliers()
        report_path = os.path.join(options.output_dir, 'compatible.tsv')
        compat_finder.compatible(homologs,
                                 stats_per_scaffold,
                                 stats_per_genome,
                                 options.gc_perc,
                                 options.td_perc,
                                 options.cov_corr,
                                 options.cov_perc,
                                 options.report_type,
                                 report_path)

        self.logger.info('')
        self.logger.info('  Results written to: ' + report_path)

        self.time_keeper.print_time_stamp()
Exemple #6
0
    def modify_bin(self, options):
        """Modify bin command.

        Applies exactly one kind of modification to a genome and writes the
        result to options.output_genome:

        * options.add / options.remove: handled by genome_tk.modify, which
          returns the sequences it failed to add and failed to remove.
        * options.outlier_file: scaffolds are removed via
          Outliers.remove_outliers.
        * options.compatible_file: scaffolds are added via
          Outliers.add_compatible_unique (options.unique_only),
          Outliers.add_compatible_closest (options.closest_only) or
          Outliers.add_compatible otherwise.

        A warning is logged and the process exits through sys.exit() when
        no modification is requested, when the file-based options are mixed
        with add/remove, or when both file-based options are given.
        """

        make_sure_path_exists(os.path.dirname(options.output_genome))

        if not (options.add or options.remove or options.outlier_file
                or options.compatible_file):
            self.logger.warning('No modification to bin requested.\n')
            sys.exit()

        if (options.add or options.remove) and (options.outlier_file
                                                or options.compatible_file):
            self.logger.warning(
                "The 'outlier_file' and 'compatible_file' options cannot be specified with 'add' or 'remove'.\n"
            )
            sys.exit()

        if options.outlier_file and options.compatible_file:
            self.logger.warning(
                "The 'outlier_file' and 'compatible_file' options cannot be specified at the same time.\n"
            )
            sys.exit()

        # only the add/remove path reports failures; the file-based paths
        # leave both lists empty
        failed_to_add = []
        failed_to_remove = []
        if options.add or options.remove:
            failed_to_add, failed_to_remove = genome_tk.modify(
                options.genome_file, options.scaffold_file, options.add,
                options.remove, options.output_genome)
        elif options.outlier_file:
            outliers = Outliers()
            outliers.remove_outliers(options.genome_file, options.outlier_file,
                                     options.output_genome, False)
        elif options.compatible_file:
            outliers = Outliers()
            if options.unique_only:
                outliers.add_compatible_unique(options.scaffold_file,
                                               options.genome_file,
                                               options.compatible_file,
                                               options.min_len,
                                               options.output_genome)
            elif options.closest_only:
                outliers.add_compatible_closest(options.scaffold_file,
                                                options.genome_file,
                                                options.compatible_file,
                                                options.min_len,
                                                options.output_genome)
            else:
                outliers.add_compatible(options.scaffold_file,
                                        options.genome_file,
                                        options.compatible_file,
                                        options.min_len, options.output_genome)

        # print(...) with a single argument emits the same text under
        # Python 2 and is also valid Python 3, unlike the print statement
        if failed_to_add:
            self.logger.warning('Failed to add the following sequence(s):')
            for seq_id in failed_to_add:
                print('    %s' % seq_id)

        if failed_to_remove:
            self.logger.warning('Failed to remove the following sequence(s):')
            for seq_id in failed_to_remove:
                print('    %s' % seq_id)

        self.logger.info('Modified genome written to: ' +
                         options.output_genome)
Exemple #7
0
    def outliers(self, options):
        """Outlier command.

        Loads the scaffold statistics, derives genome statistics, writes the
        outlier report to 'outliers.tsv' inside options.output_dir, then
        renders plots for every genome into a 'plots' subdirectory and
        builds an HTML index over them.

        Optional inputs:

        * options.highlight_file: tab-separated; column 1 is a scaffold id,
          optional column 2 is a comma-separated list of numbers that are
          each divided by 255.0 (presumably an RGB colour in 0-255 --
          confirm against the plot classes). Without column 2 the scaffold
          is assigned [1.0, 0, 0].
        * options.links_file: tab-separated; columns 1 and 3 are kept as
          strings, every other column is parsed with ast.literal_eval, which
          raises on text that is not a valid Python literal.
        """
        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info(' [RefineM - outliers] Identifying scaffolds with divergent characteristics.')
        self.logger.info('*******************************************************************************')

        check_file_exists(options.scaffold_stats_file)
        make_sure_path_exists(options.output_dir)

        self.logger.info('')
        self.logger.info('  Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(options.scaffold_stats_file)

        genome_stats = GenomeStats()
        genome_stats = genome_stats.run(scaffold_stats)

        # identify outliers
        outliers = Outliers()
        outlier_file = os.path.join(options.output_dir, 'outliers.tsv')
        outliers.identify(scaffold_stats, genome_stats,
                          options.gc_perc, options.td_perc,
                          options.cov_corr, options.cov_perc,
                          options.report_type, outlier_file)
        self.logger.info('  Outlier information written to: ' + outlier_file)

        # create outlier plots
        self.logger.info('')

        # scaffolds to highlight, keyed by scaffold id
        highlight_scaffolds_ids = {}
        if options.highlight_file:
            # the context manager guarantees the handle is closed; iterating
            # a bare open() call left it open until garbage collection
            with open(options.highlight_file) as highlight_file:
                for line in highlight_file:
                    line_split = line.strip().split('\t')
                    if len(line_split) > 1:
                        highlight_scaffolds_ids[line_split[0]] = [float(x.strip()) / 255.0 for x in line_split[1].split(',')]
                    else:
                        highlight_scaffolds_ids[line_split[0]] = [1.0, 0, 0]

        # one list per line of the links file; columns 0 and 2 stay as raw
        # strings, all other columns are parsed as Python literals
        link_scaffold_ids = []
        if options.links_file:
            with open(options.links_file) as links_file:
                for line in links_file:
                    link_scaffold_ids.append([ast.literal_eval(item) if i not in (0, 2) else item
                                              for i, item in enumerate(line.strip().split('\t'))])

        # create plots
        genomes_processed = 0
        total_genomes = len(genome_stats)
        plot_dir = os.path.join(options.output_dir, 'plots')
        make_sure_path_exists(plot_dir)
        genome_plots = defaultdict(list)
        for genome_id, gs in genome_stats.iteritems():
            genomes_processed += 1

            # '\r' keeps the progress report on a single console line
            sys.stdout.write('  Plotting scaffold distribution for %d of %d (%.1f%%) genomes.\r' %
                             (genomes_processed,
                              total_genomes,
                              genomes_processed * 100.0 / total_genomes))
            sys.stdout.flush()

            # restrict the statistics to the scaffolds of this genome
            genome_scaffold_stats = {}
            for scaffold_id in scaffold_stats.scaffolds_in_genome[genome_id]:
                genome_scaffold_stats[scaffold_id] = scaffold_stats.stats[scaffold_id]

            if options.individual_plots:
                # NOTE: the individual GC, coverage-percentage and
                # coverage-correlation plots are disabled in this version;
                # only the TD plot is produced.

                # TD plot
                td_plots = TdPlots(options)
                td_plots.plot(genome_scaffold_stats, highlight_scaffolds_ids, link_scaffold_ids, gs.mean_signature, outliers.td_dist, [options.td_perc])

                output_plot = os.path.join(plot_dir, genome_id + '.td_plots.' + options.image_type)
                td_plots.save_plot(output_plot, dpi=options.dpi)
                td_plots.save_html(os.path.join(plot_dir, genome_id + '.td_plots.html'))

            # NOTE: the combined, distribution and GC vs. coverage plots are
            # disabled in this version; only the tetranucleotide PCA plot is
            # produced and listed in the HTML index.

            # tetranucleotide signature PCA plot
            tetra = TetraPcaPlot(options)
            tetra.plot(genome_scaffold_stats, highlight_scaffolds_ids, link_scaffold_ids)

            output_plot = os.path.join(plot_dir, genome_id + '.tetra_pca.' + options.image_type)
            tetra.save_plot(output_plot, dpi=options.dpi)
            tetra.save_html(os.path.join(plot_dir, genome_id + '.tetra_pca.html'))

            genome_plots[genome_id].append(('Tetra PCA', genome_id + '.tetra_pca.html'))

        # terminate the '\r'-overwritten progress line
        sys.stdout.write('\n')

        outliers.create_html_index(plot_dir, genome_plots)

        self.logger.info('  Outlier plots written to: ' + plot_dir)

        self.time_keeper.print_time_stamp()