def compatible(self, options):
    """Compatible command.

    Reads the scaffold statistics file, calculates genome statistics from
    it, identifies putative homologs to the reference genomes, and writes
    the scaffolds compatible with bins to 'compatible.tsv' inside
    options.output_dir.

    Parameters
    ----------
    options : object
        Parsed command options. Attributes read here: reference_file,
        scaffold_stats_file, output_dir, min_genes, perc_genes, gc_perc,
        td_perc, cov_corr, cov_perc and report_type.
    """
    # validate inputs and prepare the output location before any real work
    check_file_exists(options.reference_file)
    check_file_exists(options.scaffold_stats_file)
    make_sure_path_exists(options.output_dir)

    # load per-scaffold statistics, then derive the per-genome statistics
    self.logger.info('Reading scaffold statistics.')
    per_scaffold = ScaffoldStats()
    per_scaffold.read(options.scaffold_stats_file)
    per_genome = GenomeStats().run(per_scaffold)

    # putative homologs to the reference genomes
    homologs = Reference(1, None).homology_check(
        options.reference_file,
        options.min_genes,
        float(options.perc_genes))

    # scaffolds compatible with bins
    detector = Outliers()
    report_path = os.path.join(options.output_dir, 'compatible.tsv')
    detector.compatible(
        homologs,
        per_scaffold,
        per_genome,
        options.gc_perc,
        options.td_perc,
        options.cov_corr,
        options.cov_perc,
        options.report_type,
        report_path)

    self.logger.info('Results written to: ' + report_path)
def modify(self, options):
    """Modify command.

    Writes a modified copy of a genome to options.output_genome using
    exactly one of three mutually exclusive modes:

    * explicit edits given by options.add and/or options.remove,
    * removal of the scaffolds listed in options.outlier_file,
    * addition of the scaffolds listed in options.compatible_file.

    The process exits via sys.exit() when no mode is requested or when
    incompatible modes are combined.

    Parameters
    ----------
    options : object
        Parsed command options. Attributes read here: output_genome, add,
        remove, outlier_file, compatible_file, genome_file, scaffold_file
        and unique_only.
    """
    banner = '*******************************************************************************'
    self.logger.info('')
    self.logger.info(banner)
    self.logger.info(' [RefineM - modify] Modifying scaffolds in genome.')
    self.logger.info(banner)

    make_sure_path_exists(os.path.dirname(options.output_genome))

    # the two families of modification requests
    explicit_edit = options.add or options.remove
    file_edit = options.outlier_file or options.compatible_file

    if not explicit_edit and not file_edit:
        self.logger.warning(' [Warning] No modification to bin requested.\n')
        sys.exit()

    if explicit_edit and file_edit:
        self.logger.warning(" [Warning] The 'outlier_file' and 'compatible_file' options cannot be specified with 'add' or 'remove'.\n")
        sys.exit()

    if options.outlier_file and options.compatible_file:
        self.logger.warning(" [Warning] The 'outlier_file' and 'compatible_file' options cannot be specified at the same time.\n")
        sys.exit()

    # only the explicit add/remove mode reports sequences it could not handle
    not_added = []
    not_removed = []
    if explicit_edit:
        not_added, not_removed = genome_tk.modify(
            options.genome_file,
            options.scaffold_file,
            options.add,
            options.remove,
            options.output_genome)
    elif options.outlier_file:
        refiner = Outliers()
        refiner.remove_outliers(
            options.genome_file,
            options.outlier_file,
            options.output_genome)
    elif options.compatible_file:
        refiner = Outliers()
        add_scaffolds = (refiner.add_compatible_unique
                         if options.unique_only
                         else refiner.add_compatible_closest)
        add_scaffolds(
            options.scaffold_file,
            options.genome_file,
            options.compatible_file,
            options.output_genome)

    # report failures: additions first, then removals
    failure_reports = (
        (' [Warning] Failed to add the following sequence(s):', not_added),
        (' [Warning] Failed to remove the following sequence(s):', not_removed),
    )
    for header, seq_ids in failure_reports:
        if not seq_ids:
            continue
        self.logger.warning(header)
        for seq_id in seq_ids:
            self.logger.warning(' %s' % seq_id)

    self.logger.info('')
    self.logger.info(' Modified genome written to: ' + options.output_genome)
    self.time_keeper.print_time_stamp()
def filter_bins(self, options):
    """Filter bins command.

    Runs outlier removal on every genome file found for
    options.genome_nt_dir / options.genome_ext and writes each result to
    options.output_dir as '<name>.filtered.<genome_ext>'. The process exits
    via sys.exit() if the nucleotide check on the genome files fails.

    Parameters
    ----------
    options : object
        Parsed command options. Attributes read here: output_dir,
        genome_nt_dir, genome_ext, filter_file and modified_only.
    """
    make_sure_path_exists(options.output_dir)

    bin_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
    all_nucleotide = self._check_nuclotide_seqs(bin_files)
    if not all_nucleotide:
        self.logger.warning('All files must contain nucleotide sequences.')
        sys.exit()

    filterer = Outliers()
    suffix = '.filtered.' + options.genome_ext
    for bin_file in bin_files:
        # output name is the input name with its extension swapped for the suffix
        filtered_name = remove_extension(bin_file) + suffix
        filtered_path = os.path.join(options.output_dir, filtered_name)
        filterer.remove_outliers(
            bin_file,
            options.filter_file,
            filtered_path,
            options.modified_only)

    self.logger.info('Modified genome written to: ' + options.output_dir)
def outliers(self, options):
    """Outlier command.

    Reads the scaffold statistics file, calculates genome statistics from
    it, writes outlier information to 'outliers.tsv' inside
    options.output_dir and, unless options.no_plots is set, writes outlier
    plots to a 'plots' sub-directory.

    Parameters
    ----------
    options : object
        Parsed command options. Attributes read here: scaffold_stats_file,
        output_dir, gc_perc, td_perc, cov_corr, cov_perc, report_type,
        no_plots, highlight_file, links_file and individual_plots. The
        options object itself is also handed to the plotting call.
    """
    check_file_exists(options.scaffold_stats_file)
    make_sure_path_exists(options.output_dir)

    self.logger.info('Reading scaffold statistics.')
    per_scaffold = ScaffoldStats()
    per_scaffold.read(options.scaffold_stats_file)
    per_genome = GenomeStats().run(per_scaffold)

    # identify outliers
    detector = Outliers()
    report_path = os.path.join(options.output_dir, 'outliers.tsv')
    detector.identify(
        per_scaffold,
        per_genome,
        options.gc_perc,
        options.td_perc,
        options.cov_corr,
        options.cov_perc,
        options.report_type,
        report_path)
    self.logger.info('Outlier information written to: ' + report_path)

    # plotting is optional; nothing more to do when it is disabled
    if options.no_plots:
        return

    plot_dir = os.path.join(options.output_dir, 'plots')
    make_sure_path_exists(plot_dir)
    detector.plot(
        per_scaffold,
        per_genome,
        detector.gc_dist,
        detector.td_dist,
        options,
        options.highlight_file,
        options.links_file,
        options.individual_plots,
        plot_dir)
    self.logger.info('Outlier plots written to: ' + plot_dir)
def compatible(self, options):
    """Compatible command.

    Logs a command banner, reads the scaffold statistics file, calculates
    genome statistics from it, identifies putative homologs to the
    reference genomes, writes the scaffolds compatible with bins to
    'compatible.tsv' inside options.output_dir, and prints a time stamp.

    Parameters
    ----------
    options : object
        Parsed command options. Attributes read here: reference_file,
        scaffold_stats_file, output_dir, min_genes, perc_genes, gc_perc,
        td_perc, cov_corr, cov_perc and report_type.
    """
    banner = '*******************************************************************************'
    self.logger.info('')
    self.logger.info(banner)
    self.logger.info('[RefineM - compatible] Identify scaffolds with compatible genomic statistics.')
    self.logger.info(banner)

    # validate inputs and prepare the output location before any real work
    check_file_exists(options.reference_file)
    check_file_exists(options.scaffold_stats_file)
    make_sure_path_exists(options.output_dir)

    # load per-scaffold statistics, then derive the per-genome statistics
    self.logger.info('')
    self.logger.info(' Reading scaffold statistics.')
    per_scaffold = ScaffoldStats()
    per_scaffold.read(options.scaffold_stats_file)
    per_genome = GenomeStats().run(per_scaffold)

    # putative homologs to the reference genomes
    homologs = Reference(1, None).homology_check(
        options.reference_file,
        options.min_genes,
        float(options.perc_genes))

    # scaffolds compatible with bins
    detector = Outliers()
    report_path = os.path.join(options.output_dir, 'compatible.tsv')
    detector.compatible(
        homologs,
        per_scaffold,
        per_genome,
        options.gc_perc,
        options.td_perc,
        options.cov_corr,
        options.cov_perc,
        options.report_type,
        report_path)

    self.logger.info('')
    self.logger.info(' Results written to: ' + report_path)
    self.time_keeper.print_time_stamp()
def modify_bin(self, options):
    """Modify bin command.

    Writes a modified copy of a genome to options.output_genome using
    exactly one of three mutually exclusive modes:

    * explicit edits given by options.add and/or options.remove,
    * removal of the scaffolds listed in options.outlier_file,
    * addition of the scaffolds listed in options.compatible_file, using
      the 'unique', 'closest' or default strategy depending on
      options.unique_only / options.closest_only.

    The process exits via sys.exit() when no mode is requested or when
    incompatible modes are combined.

    Sequences that could not be added or removed are reported through the
    logger (previously they were written with bare print statements, which
    bypassed the logger used for the accompanying header line).

    Parameters
    ----------
    options : object
        Parsed command options. Attributes read here: output_genome, add,
        remove, outlier_file, compatible_file, genome_file, scaffold_file,
        unique_only, closest_only and min_len.
    """
    make_sure_path_exists(os.path.dirname(options.output_genome))

    if not (options.add or options.remove or options.outlier_file or options.compatible_file):
        self.logger.warning('No modification to bin requested.\n')
        sys.exit()

    if (options.add or options.remove) and (options.outlier_file or options.compatible_file):
        self.logger.warning(
            "The 'outlier_file' and 'compatible_file' options cannot be specified with 'add' or 'remove'.\n"
        )
        sys.exit()

    if options.outlier_file and options.compatible_file:
        self.logger.warning(
            "The 'outlier_file' and 'compatible_file' options cannot be specified at the same time.\n"
        )
        sys.exit()

    # only the explicit add/remove mode reports sequences it could not handle
    failed_to_add = []
    failed_to_remove = []
    if options.add or options.remove:
        failed_to_add, failed_to_remove = genome_tk.modify(
            options.genome_file,
            options.scaffold_file,
            options.add,
            options.remove,
            options.output_genome)
    elif options.outlier_file:
        outliers = Outliers()
        # NOTE(review): the trailing False is presumably the 'modified_only'
        # flag that filter_bins passes in the same position -- confirm.
        outliers.remove_outliers(options.genome_file,
                                 options.outlier_file,
                                 options.output_genome,
                                 False)
    elif options.compatible_file:
        outliers = Outliers()
        if options.unique_only:
            outliers.add_compatible_unique(options.scaffold_file,
                                           options.genome_file,
                                           options.compatible_file,
                                           options.min_len,
                                           options.output_genome)
        elif options.closest_only:
            outliers.add_compatible_closest(options.scaffold_file,
                                            options.genome_file,
                                            options.compatible_file,
                                            options.min_len,
                                            options.output_genome)
        else:
            outliers.add_compatible(options.scaffold_file,
                                    options.genome_file,
                                    options.compatible_file,
                                    options.min_len,
                                    options.output_genome)

    # Report failures through the logger so the IDs end up in the same
    # stream as their header line.
    if failed_to_add:
        self.logger.warning('Failed to add the following sequence(s):')
        for seq_id in failed_to_add:
            self.logger.warning(' %s' % seq_id)

    if failed_to_remove:
        self.logger.warning('Failed to remove the following sequence(s):')
        for seq_id in failed_to_remove:
            self.logger.warning(' %s' % seq_id)

    self.logger.info('Modified genome written to: ' + options.output_genome)
def outliers(self, options):
    """Outlier command.

    Logs a command banner, reads the scaffold statistics file, calculates
    genome statistics from it, writes outlier information to 'outliers.tsv'
    inside options.output_dir, and then writes per-genome plots plus an
    HTML index to a 'plots' sub-directory.

    Input files
    -----------
    options.highlight_file (optional)
        Tab-separated. Column 1 is a scaffold ID. An optional column 2 is a
        comma-separated list of numbers, each of which is divided by 255.0
        (presumably an RGB colour in the 0-255 range -- confirm). Scaffolds
        without a second column get [1.0, 0, 0]. Blank lines are ignored.
    options.links_file (optional)
        Tab-separated. Columns 1 and 3 are kept as strings; every other
        column is parsed with ast.literal_eval. Blank lines are ignored.

    Parameters
    ----------
    options : object
        Parsed command options. Attributes read here: scaffold_stats_file,
        output_dir, gc_perc, td_perc, cov_corr, cov_perc, report_type,
        highlight_file, links_file, individual_plots, image_type and dpi.
        The options object itself is also handed to the plot constructors.
    """
    banner = '*******************************************************************************'
    self.logger.info('')
    self.logger.info(banner)
    self.logger.info(' [RefineM - outliers] Identifying scaffolds with divergent characteristics.')
    self.logger.info(banner)

    check_file_exists(options.scaffold_stats_file)
    make_sure_path_exists(options.output_dir)

    self.logger.info('')
    self.logger.info(' Reading scaffold statistics.')
    scaffold_stats = ScaffoldStats()
    scaffold_stats.read(options.scaffold_stats_file)

    genome_stats = GenomeStats()
    genome_stats = genome_stats.run(scaffold_stats)

    # identify outliers
    outliers = Outliers()
    outlier_file = os.path.join(options.output_dir, 'outliers.tsv')
    outliers.identify(scaffold_stats, genome_stats,
                      options.gc_perc, options.td_perc,
                      options.cov_corr, options.cov_perc,
                      options.report_type, outlier_file)
    self.logger.info(' Outlier information written to: ' + outlier_file)

    # create outlier plots
    self.logger.info('')

    # scaffold ID -> colour used to highlight it
    highlight_scaffolds_ids = {}
    if options.highlight_file:
        # use a context manager so the handle is closed deterministically
        with open(options.highlight_file) as highlight_file:
            for line in highlight_file:
                line = line.strip()
                if not line:
                    # a blank line would otherwise register '' as a scaffold ID
                    continue

                line_split = line.split('\t')
                if len(line_split) > 1:
                    highlight_scaffolds_ids[line_split[0]] = [float(x.strip()) / 255.0
                                                              for x in line_split[1].split(',')]
                else:
                    highlight_scaffolds_ids[line_split[0]] = [1.0, 0, 0]

    # one list per line of the links file
    link_scaffold_ids = []
    if options.links_file:
        with open(options.links_file) as links_file:
            for line in links_file:
                line = line.strip()
                if not line:
                    continue

                # columns 0 and 2 stay as strings; the rest are parsed as
                # Python literals (literal_eval never executes code)
                link_scaffold_ids.append([ast.literal_eval(item) if i not in (0, 2) else item
                                          for i, item in enumerate(line.split('\t'))])

    # create plots
    genomes_processed = 0
    total_genomes = len(genome_stats)

    plot_dir = os.path.join(options.output_dir, 'plots')
    make_sure_path_exists(plot_dir)
    genome_plots = defaultdict(list)
    for genome_id, gs in genome_stats.iteritems():
        genomes_processed += 1

        # '\r' keeps the progress report on a single console line
        sys.stdout.write(' Plotting scaffold distribution for %d of %d (%.1f%%) genomes.\r' % (
            genomes_processed,
            total_genomes,
            genomes_processed * 100.0 / total_genomes))
        sys.stdout.flush()

        # restrict the statistics to the scaffolds of the current genome
        genome_scaffold_stats = {}
        for scaffold_id in scaffold_stats.scaffolds_in_genome[genome_id]:
            genome_scaffold_stats[scaffold_id] = scaffold_stats.stats[scaffold_id]

        if options.individual_plots:
            # NOTE: the individual GC, coverage-deviation and coverage-
            # correlation plots are currently disabled; only the TD plot
            # is produced here.

            # TD plot
            td_plots = TdPlots(options)
            td_plots.plot(genome_scaffold_stats,
                          highlight_scaffolds_ids, link_scaffold_ids,
                          gs.mean_signature, outliers.td_dist, [options.td_perc])

            output_plot = os.path.join(plot_dir, genome_id + '.td_plots.' + options.image_type)
            td_plots.save_plot(output_plot, dpi=options.dpi)
            td_plots.save_html(os.path.join(plot_dir, genome_id + '.td_plots.html'))

        # NOTE: the combined, distribution and GC vs. coverage plots are
        # currently disabled; the tetranucleotide PCA plot is the only
        # entry added to the HTML index.

        # tetranucleotide signature PCA plot
        tetra = TetraPcaPlot(options)
        tetra.plot(genome_scaffold_stats, highlight_scaffolds_ids, link_scaffold_ids)

        output_plot = os.path.join(plot_dir, genome_id + '.tetra_pca.' + options.image_type)
        tetra.save_plot(output_plot, dpi=options.dpi)
        tetra.save_html(os.path.join(plot_dir, genome_id + '.tetra_pca.html'))

        genome_plots[genome_id].append(('Tetra PCA', genome_id + '.tetra_pca.html'))

    # terminate the '\r'-based progress line
    sys.stdout.write('\n')

    outliers.create_html_index(plot_dir, genome_plots)

    self.logger.info(' Outlier plots written to: ' + plot_dir)

    self.time_keeper.print_time_stamp()