def call_genes(self, options):
    """Call genes command"""

    make_sure_path_exists(options.output_dir)

    genome_files = self._input_files(options.input_genomes, options.file_ext)

    prodigal = Prodigal(options.cpus, not options.silent)
    summary_stats = prodigal.run(genome_files,
                                 options.output_dir,
                                 called_genes=False,
                                 translation_table=options.force_table,
                                 meta=False,
                                 closed_ends=True)

    # write gene calling summary
    fout = open(os.path.join(options.output_dir, 'call_genes.summary.tsv'), 'w')
    fout.write('Genome Id\tSelected translation table\tTable 4 coding density\tTable 11 coding density\n')
    for genome_id, stats in summary_stats.items():
        fout.write('%s\t%d\t%.2f%%\t%.2f%%\n' % (genome_id,
                                                 stats.best_translation_table,
                                                 stats.coding_density_4,
                                                 stats.coding_density_11))
    fout.close()

    self.logger.info('Identified genes written to: %s' % options.output_dir)
def gene_profile(self, options):
    """Gene profile command"""

    self.logger.info('')
    self.logger.info('*******************************************************************************')
    self.logger.info(' [RefineM - gene_profile] Generating taxonomic profiles from genes.')
    self.logger.info('*******************************************************************************')

    make_sure_path_exists(options.output_dir)
    check_file_exists(options.scaffold_stats_file)
    check_file_exists(options.taxonomy_file)
    check_file_exists(options.db_file)

    gene_files = self._genome_files(options.genome_prot_dir, options.protein_ext)
    if not self._check_protein_seqs(gene_files):
        self.logger.warning('[Warning] All files must contain amino acid sequences.')
        sys.exit()

    # build gene profile
    gene_profile = GeneProfile(options.cpus, options.output_dir)
    gene_profile.run(gene_files,
                     options.scaffold_stats_file,
                     options.db_file,
                     options.taxonomy_file,
                     options.per_to_classify,
                     options.evalue,
                     options.per_identity)

    self.logger.info('')
    self.logger.info(' Results written to: %s' % options.output_dir)

    self.time_keeper.print_time_stamp()
def reference(self, options):
    """Reference command"""

    self.logger.info('')
    self.logger.info('*******************************************************************************')
    self.logger.info('[RefineM - reference] Identifying scaffolds similar to specific genome(s).')
    self.logger.info('*******************************************************************************')

    check_file_exists(options.scaffold_prot_file)
    check_file_exists(options.scaffold_stats_file)
    make_sure_path_exists(options.output_dir)

    ref_gene_files = self._genome_files(options.ref_genome_prot_dir, options.protein_ext)
    if not self._check_protein_seqs(ref_gene_files):
        self.logger.warning('[Warning] All files must contain amino acid sequences.')
        sys.exit()

    reference = Reference(options.cpus, options.output_dir)
    reference_out = reference.run(options.scaffold_prot_file,
                                  options.scaffold_stats_file,
                                  ref_gene_files,
                                  options.db_file,
                                  options.evalue,
                                  options.per_identity)

    self.logger.info('')
    self.logger.info(' Results written to: ' + reference_out)

    self.time_keeper.print_time_stamp()
def cluster(self, options):
    """Cluster command"""

    self.logger.info('')
    self.logger.info('*******************************************************************************')
    self.logger.info(' [RefineM - cluster] Partitioning bin into clusters.')
    self.logger.info('*******************************************************************************')

    check_file_exists(options.scaffold_stats_file)
    check_file_exists(options.genome_file)
    make_sure_path_exists(options.output_dir)

    self.logger.info('')
    self.logger.info(' Reading scaffold statistics.')
    scaffold_stats = ScaffoldStats()
    scaffold_stats.read(options.scaffold_stats_file)

    cluster = Cluster(options.cpus)
    cluster.run(scaffold_stats,
                options.num_clusters,
                options.num_components,
                options.K,
                options.no_coverage,
                options.no_pca,
                options.iterations,
                options.genome_file,
                options.output_dir)

    self.logger.info('')
    self.logger.info(' Partitioned sequences written to: ' + options.output_dir)

    self.time_keeper.print_time_stamp()
def call_genes(self, options):
    """Call genes command"""

    self.logger.info('')
    self.logger.info('*******************************************************************************')
    self.logger.info(' [CompareM - call_genes] Identifying genes within genomes.')
    self.logger.info('*******************************************************************************')

    make_sure_path_exists(options.output_dir)

    genome_files = self._genome_files(options.genome_dir, options.genome_ext)
    if not genome_files:
        self.logger.warning(' [Warning] No genome files found. Check the --genome_ext flag used to identify genomes.')
        sys.exit()

    prodigal = Prodigal(options.cpus)
    summary_stats = prodigal.run(genome_files, False, options.force_table, False, options.output_dir)

    # write gene calling summary
    fout = open(os.path.join(options.output_dir, 'call_genes.summary.tsv'), 'w')
    fout.write('Genome Id\tSelected translation table\tTable 4 coding density\tTable 11 coding density\n')
    for genome_id, stats in summary_stats.items():
        fout.write('%s\t%d\t%.2f%%\t%.2f%%\n' % (genome_id,
                                                 stats.best_translation_table,
                                                 stats.coding_density_4,
                                                 stats.coding_density_11))
    fout.close()

    self.logger.info('')
    self.logger.info(' Identified genes written to: %s' % options.output_dir)

    self.time_keeper.print_time_stamp()
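# Hedged illustration (not part of the original source): the summary table
# above reports coding density under translation tables 4 and 11, which is
# the signal used to pick the best table. A minimal sketch of a coding-density
# calculation, assuming genes are given as (start, end) coordinate pairs on a
# genome of known length and that overlaps are ignored:
def coding_density(genes, genome_length):
    """Percent of genome covered by called genes (overlaps counted twice)."""
    coding_bases = sum(abs(end - start) + 1 for start, end in genes)
    return 100.0 * coding_bases / genome_length


# e.g., coding_density([(1, 300), (401, 700)], 1000) -> 60.0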
def call_genes(self, options):
    """Call genes command"""

    self.logger.info('')
    self.logger.info('*******************************************************************************')
    self.logger.info(' [RefineM - call_genes] Identifying genes within genomes.')
    self.logger.info('*******************************************************************************')

    check_dir_exists(options.genome_nt_dir)
    make_sure_path_exists(options.output_dir)

    genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
    if not self._check_nuclotide_seqs(genome_files):
        self.logger.warning('[Warning] All files must contain nucleotide sequences.')
        sys.exit()

    # call genes in genomes
    prodigal = Prodigal(options.cpus)
    prodigal.run(genome_files, options.output_dir)
    self.logger.info(' Genes in genomes written to: %s' % options.output_dir)

    # call genes in unbinned scaffolds
    if options.unbinned_file:
        unbinned_output_dir = os.path.join(options.output_dir, 'unbinned')
        prodigal.run([options.unbinned_file], unbinned_output_dir, meta=True)
        self.logger.info(' Genes in unbinned scaffolds written to: %s' % unbinned_output_dir)

    self.time_keeper.print_time_stamp()
def lsu_tree(self, options):
    """Infer 23S tree spanning GTDB genomes."""

    check_dependencies(['esl-sfetch', 'cmsearch', 'cmalign', 'esl-alimask', 'FastTreeMP', 'blastn'])

    check_file_exists(options.gtdb_metadata_file)
    check_file_exists(options.gtdb_lsu_file)
    make_sure_path_exists(options.output_dir)

    rna_workflow = RNA_Workflow(options.cpus)
    rna_workflow.run('lsu',
                     options.gtdb_metadata_file,
                     options.gtdb_lsu_file,
                     options.min_lsu_length,
                     options.min_scaffold_length,
                     options.min_quality,
                     options.max_contigs,
                     options.min_N50,
                     not options.disable_tax_filter,
                     # options.reps_only,
                     # options.user_genomes,
                     options.genome_list,
                     options.output_dir)

    self.logger.info('Results written to: %s' % options.output_dir)
def select_type_genomes(self, options):
    """Select representative genomes for named species."""

    check_file_exists(options.qc_file)
    check_file_exists(options.gtdb_metadata_file)
    check_file_exists(options.genome_path_file)
    check_file_exists(options.prev_rep_file)
    check_file_exists(options.ncbi_refseq_assembly_file)
    check_file_exists(options.ncbi_genbank_assembly_file)
    check_file_exists(options.gtdb_domain_report)
    check_file_exists(options.species_exception_file)
    check_file_exists(options.gtdb_type_genome_file)
    make_sure_path_exists(options.output_dir)

    try:
        p = SelectTypeGenomes(options.ani_cache_file, options.cpus, options.output_dir)
        p.run(options.qc_file,
              options.gtdb_metadata_file,
              options.ltp_blast_file,
              options.genome_path_file,
              options.prev_rep_file,
              options.ncbi_refseq_assembly_file,
              options.ncbi_genbank_assembly_file,
              options.gtdb_domain_report,
              options.species_exception_file,
              options.gtdb_type_genome_file)
    except GenomeTreeTkError as e:
        print(e.message)
        raise SystemExit

    self.logger.info('GTDB type genomes written to: %s' % options.output_dir)
def cluster_named_types(self, options):
    """Cluster genomes to selected GTDB type genomes."""

    check_file_exists(options.qc_file)
    check_file_exists(options.gtdb_metadata_file)
    check_file_exists(options.genome_path_file)
    check_file_exists(options.named_type_genome_file)
    check_file_exists(options.type_genome_ani_file)
    check_file_exists(options.species_exception_file)
    make_sure_path_exists(options.output_dir)

    try:
        p = ClusterNamedTypes(options.ani_sp,
                              options.af_sp,
                              options.ani_cache_file,
                              options.cpus,
                              options.output_dir)
        p.run(options.qc_file,
              options.gtdb_metadata_file,
              options.genome_path_file,
              options.named_type_genome_file,
              options.type_genome_ani_file,
              options.mash_sketch_file,
              options.species_exception_file)
    except GenomeTreeTkError as e:
        print(e.message)
        raise SystemExit

    self.logger.info('Clustering results written to: %s' % options.output_dir)
def gene(self, options):
    """Calculate gene properties of genome."""

    self.logger.info('Calculating gene properties of genome.')

    check_file_exists(options.genome_file)
    check_file_exists(options.gff_file)
    make_sure_path_exists(options.output_dir)

    meta_genes = MetadataGenes()
    metadata_values, metadata_desc = meta_genes.generate(options.genome_file, options.gff_file)

    # write statistics to file
    output_file = os.path.join(options.output_dir, 'metadata.genome_gene.tsv')
    fout = open(output_file, 'w')
    for field in sorted(metadata_values.keys()):
        fout.write('%s\t%s\n' % (field, str(metadata_values[field])))
    fout.close()

    # write description to file
    output_file = os.path.join(options.output_dir, 'metadata.genome_gene.desc.tsv')
    fout = open(output_file, 'w')
    for field in sorted(metadata_desc.keys()):
        fout.write('%s\t%s\t%s\n' % (field,
                                     metadata_desc[field],
                                     type(metadata_values[field]).__name__.upper()))
    fout.close()
def ani(self, options):
    """ANI command"""

    make_sure_path_exists(options.output_dir)

    genome_files = self._genome_files(options.genome_dir, options.file_ext)

    self.logger.info('Average nucleotide identity information written to: %s' % options.output_dir)
def scaffold_stats(self, options):
    """Scaffold statistics command"""

    self.logger.info('')
    self.logger.info('*******************************************************************************')
    self.logger.info(' [RefineM - scaffold_stats] Calculating statistics for scaffolds.')
    self.logger.info('*******************************************************************************')

    check_file_exists(options.scaffold_file)
    if not self._check_nuclotide_seqs([options.scaffold_file]):
        self.logger.warning('[Warning] Scaffold file must contain nucleotide sequences.')
        sys.exit()

    genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
    if not self._check_nuclotide_seqs(genome_files):
        self.logger.warning('[Warning] All files must contain nucleotide sequences.')
        sys.exit()

    make_sure_path_exists(options.output_dir)

    # get coverage information
    if not options.coverage_file:
        if not options.bam_files:
            self.logger.warning('\n [Warning] One or more BAM files must be specified in order to calculate coverage profiles.')
            coverage_file = None
        else:
            coverage = Coverage(options.cpus)
            coverage_file = os.path.join(options.output_dir, 'coverage.tsv')
            coverage.run(options.bam_files,
                         coverage_file,
                         options.cov_all_reads,
                         options.cov_min_align,
                         options.cov_max_edit_dist)
            self.logger.info('')
            self.logger.info(' Coverage profiles written to: %s' % coverage_file)
    else:
        coverage_file = options.coverage_file

    # get tetranucleotide signatures
    if not options.tetra_file:
        self.logger.info('')
        tetra = Tetranucleotide(options.cpus)
        tetra_file = os.path.join(options.output_dir, 'tetra.tsv')
        signatures = tetra.run(options.scaffold_file)
        tetra.write(signatures, tetra_file)
        self.logger.info(' Tetranucleotide signatures written to: %s' % tetra_file)
    else:
        tetra_file = options.tetra_file

    # write out scaffold statistics
    stats_output = os.path.join(options.output_dir, 'scaffold_stats.tsv')
    stats = ScaffoldStats(options.cpus)
    stats.run(options.scaffold_file, genome_files, tetra_file, coverage_file, stats_output)
    self.logger.info(' Scaffold statistics written to: %s' % stats_output)

    self.time_keeper.print_time_stamp()
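# Hedged illustration (not part of the original source): the Tetranucleotide
# step above reduces each scaffold to a normalized 4-mer frequency vector.
# A minimal, self-contained sketch of that signature calculation follows;
# the real class also handles reverse complements and runs in parallel.
from itertools import product


def tetra_signature(seq):
    """Return normalized tetranucleotide frequencies for a DNA sequence."""
    kmers = [''.join(p) for p in product('ACGT', repeat=4)]
    counts = dict.fromkeys(kmers, 0)
    seq = seq.upper()
    for i in range(len(seq) - 3):
        kmer = seq[i:i + 4]
        if kmer in counts:  # skip k-mers containing ambiguous bases
            counts[kmer] += 1
    total = sum(counts.values()) or 1
    return {k: v / total for k, v in counts.items()}


# e.g., tetra_signature('ACGTACGTACGT')['ACGT'] -> 0.333... (3 of 9 windows)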
def modify(self, options):
    """Modify command"""

    self.logger.info('')
    self.logger.info('*******************************************************************************')
    self.logger.info(' [RefineM - modify] Modifying scaffolds in genome.')
    self.logger.info('*******************************************************************************')

    make_sure_path_exists(os.path.dirname(options.output_genome))

    if not (options.add or options.remove or options.outlier_file or options.compatible_file):
        self.logger.warning(' [Warning] No modification to bin requested.\n')
        sys.exit()

    if (options.add or options.remove) and (options.outlier_file or options.compatible_file):
        self.logger.warning(" [Warning] The 'outlier_file' and 'compatible_file' options cannot be specified with 'add' or 'remove'.\n")
        sys.exit()

    if options.outlier_file and options.compatible_file:
        self.logger.warning(" [Warning] The 'outlier_file' and 'compatible_file' options cannot be specified at the same time.\n")
        sys.exit()

    failed_to_add = []
    failed_to_remove = []
    if options.add or options.remove:
        failed_to_add, failed_to_remove = genome_tk.modify(options.genome_file,
                                                           options.scaffold_file,
                                                           options.add,
                                                           options.remove,
                                                           options.output_genome)
    elif options.outlier_file:
        outliers = Outliers()
        outliers.remove_outliers(options.genome_file, options.outlier_file, options.output_genome)
    elif options.compatible_file:
        outliers = Outliers()
        if options.unique_only:
            outliers.add_compatible_unique(options.scaffold_file,
                                           options.genome_file,
                                           options.compatible_file,
                                           options.output_genome)
        else:
            outliers.add_compatible_closest(options.scaffold_file,
                                            options.genome_file,
                                            options.compatible_file,
                                            options.output_genome)

    if failed_to_add:
        self.logger.warning(' [Warning] Failed to add the following sequence(s):')
        for seq_id in failed_to_add:
            self.logger.warning('  %s' % seq_id)

    if failed_to_remove:
        self.logger.warning(' [Warning] Failed to remove the following sequence(s):')
        for seq_id in failed_to_remove:
            self.logger.warning('  %s' % seq_id)

    self.logger.info('')
    self.logger.info(' Modified genome written to: ' + options.output_genome)

    self.time_keeper.print_time_stamp()
def rd_ranks(self, options):
    """Calculate number of taxa for specified rd thresholds."""

    check_file_exists(options.input_tree)
    make_sure_path_exists(options.output_dir)

    r = RdRanks()
    r.run(options.input_tree, options.thresholds, options.output_dir)

    self.logger.info('Done.')
def run(self, input_tree, msa_file, outgroup_file, perc_taxa_to_keep, num_replicates, model, output_dir):
    """Jackknife taxa.

    Parameters
    ----------
    input_tree : str
        Tree inferred with all data.
    msa_file : str
        File containing multiple sequence alignment for all taxa.
    outgroup_file : str
        File indicating labels of outgroup taxa.
    perc_taxa_to_keep : float
        Percentage of taxa to keep in each replicate.
    num_replicates : int
        Number of replicates to perform.
    model : str
        Desired model of evolution.
    output_dir : str
        Output directory for replicate trees.
    """

    assert model in ['wag', 'jtt']

    self.perc_taxa_to_keep = perc_taxa_to_keep
    self.model = model

    self.replicate_dir = os.path.join(output_dir, 'replicates')
    make_sure_path_exists(self.replicate_dir)

    # read outgroup taxa
    self.outgroup_ids = set()
    if outgroup_file:
        for line in open(outgroup_file):
            self.outgroup_ids.add(line.strip())

    # read full multiple sequence alignment
    self.msa = seq_io.read(msa_file)

    # calculate replicates
    # ***self.logger.info('Calculating jackknife taxa replicates:')
    # ***parallel = Parallel(self.cpus)
    # ***parallel.run(self._producer, None, range(num_replicates), self._progress)

    # calculate support
    rep_tree_files = []
    for rep_index in range(num_replicates):
        rep_tree_files.append(os.path.join(self.replicate_dir, 'jk_taxa.tree.' + str(rep_index) + '.tre'))

    tree_support = TreeSupport()
    output_tree = os.path.join(output_dir, remove_extension(input_tree) + '.jk_taxa.tree')
    tree_support.subset_taxa(input_tree, rep_tree_files, output_tree)

    return output_tree
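# Hedged illustration (not part of the original source): the replicate
# generation step (_producer) is commented out above, so here is a minimal,
# self-contained sketch of one jackknife-taxa replicate: keep a random subset
# of the non-outgroup taxa while always retaining the outgroup. The MSA is
# assumed to be a dict of taxon id -> aligned sequence, and
# perc_taxa_to_keep a fraction in [0, 1].
import random


def jackknife_taxa_replicate(msa, outgroup_ids, perc_taxa_to_keep):
    """Subsample taxa from an MSA, always retaining outgroup taxa."""
    ingroup = [taxon for taxon in msa if taxon not in outgroup_ids]
    num_keep = int(len(ingroup) * perc_taxa_to_keep)
    keep = set(random.sample(ingroup, num_keep)) | set(outgroup_ids)
    return {taxon: seq for taxon, seq in msa.items() if taxon in keep}


# e.g., jackknife_taxa_replicate({'A': 'MK-', 'B': 'ML-', 'OUT': 'MM-'},
#                                {'OUT'}, perc_taxa_to_keep=0.5)
# keeps 'OUT' plus one of 'A' or 'B'.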
def bl_dist(self, options):
    """Calculate distribution of branch lengths at each taxonomic rank."""

    check_file_exists(options.input_tree)
    make_sure_path_exists(options.output_dir)

    b = BranchLengthDistribution()
    b.run(options.input_tree,
          options.trusted_taxa_file,
          options.min_children,
          options.taxonomy_file,
          options.output_dir)

    self.logger.info('Done.')
def ani(self, options):
    """ANI command"""

    self.logger.info('')
    self.logger.info('*******************************************************************************')
    self.logger.info(' [CompareM - ani] Calculating the ANI between genome pairs.')
    self.logger.info('*******************************************************************************')
    self.logger.info('')

    make_sure_path_exists(options.output_dir)

    genome_files = self._genome_files(options.genome_dir, options.genome_ext)

    self.logger.info('')
    self.logger.info(' Average nucleotide identity information written to: %s' % options.output_dir)

    self.time_keeper.print_time_stamp()
def derep_tree(self, options):
    """Dereplicate tree."""

    check_file_exists(options.input_tree)
    check_file_exists(options.gtdb_metadata)
    check_file_exists(options.msa_file)
    make_sure_path_exists(options.output_dir)

    derep_tree = DereplicateTree()
    derep_tree.run(options.input_tree,
                   options.lineage_of_interest,
                   options.outgroup,
                   options.gtdb_metadata,
                   options.taxa_to_retain,
                   options.msa_file,
                   options.keep_unclassified,
                   options.output_dir)
def jk_taxa(self, options):
    """Jackknife taxa."""

    check_file_exists(options.input_tree)
    check_file_exists(options.msa_file)
    make_sure_path_exists(options.output_dir)

    jackknife_taxa = JackknifeTaxa(options.cpus)
    output_tree = jackknife_taxa.run(options.input_tree,
                                     options.msa_file,
                                     options.outgroup_ids,
                                     options.perc_taxa,
                                     options.num_replicates,
                                     options.model,
                                     options.output_dir)

    self.logger.info('Jackknifed taxa tree written to: %s' % output_tree)
def classify(self, options):
    """Classify genomes based on AAI values."""

    check_file_exists(options.sorted_hit_table)
    make_sure_path_exists(options.output_dir)

    classify = Classify(options.cpus)
    results_file = classify.run(options.query_gene_file,
                                options.target_gene_file,
                                options.sorted_hit_table,
                                options.evalue,
                                options.per_identity,
                                options.per_aln_len,
                                options.num_top_targets,
                                options.taxonomy_file,
                                options.keep_rbhs,
                                options.output_dir)

    self.logger.info('Classification results written to: %s' % results_file)
def rna_tree(self, options):
    """Infer 16S + 23S tree spanning GTDB genomes."""

    check_dependencies(['FastTreeMP'])

    check_file_exists(options.ssu_msa)
    check_file_exists(options.ssu_tree)
    check_file_exists(options.lsu_msa)
    check_file_exists(options.lsu_tree)
    make_sure_path_exists(options.output_dir)

    rna_workflow = RNA_Workflow(options.cpus)
    rna_workflow.combine(options.ssu_msa,
                         options.ssu_tree,
                         options.lsu_msa,
                         options.lsu_tree,
                         options.output_dir)

    self.logger.info('Results written to: %s' % options.output_dir)
def rna_dump(self, options):
    """Dump all 5S, 16S, and 23S sequences to files."""

    check_file_exists(options.genomic_file)
    make_sure_path_exists(options.output_dir)

    rna_workflow = RNA_Workflow(1)
    rna_workflow.dump(options.genomic_file,
                      options.gtdb_taxonomy,
                      options.min_5S_len,
                      options.min_16S_ar_len,
                      options.min_16S_bac_len,
                      options.min_23S_len,
                      options.min_contig_len,
                      options.include_user,
                      options.genome_list,
                      options.output_dir)

    self.logger.info('Results written to: %s' % options.output_dir)
def aai(self, options):
    """AAI command"""

    check_file_exists(options.sorted_hit_table)
    make_sure_path_exists(options.output_dir)

    aai_calculator = AAICalculator(options.cpus)
    aai_output_file, rbh_output_file = aai_calculator.run(options.query_gene_file,
                                                          None,
                                                          options.sorted_hit_table,
                                                          options.evalue,
                                                          options.per_identity,
                                                          options.per_aln_len,
                                                          options.keep_rbhs,
                                                          options.output_dir)

    if rbh_output_file:
        self.logger.info('Identified reciprocal best hits written to: %s' % rbh_output_file)
    self.logger.info('AAI between genomes written to: %s' % aai_output_file)
def cluster_user(self, options):
    """Cluster User genomes to GTDB species clusters."""

    check_file_exists(options.gtdb_metadata_file)
    check_file_exists(options.genome_path_file)
    check_file_exists(options.final_cluster_file)
    make_sure_path_exists(options.output_dir)

    try:
        p = ClusterUser(options.ani_cache_file, options.cpus, options.output_dir)
        p.run(options.gtdb_metadata_file,
              options.genome_path_file,
              options.final_cluster_file)
    except GenomeTreeTkError as e:
        print(e.message)
        raise SystemExit

    self.logger.info('Clustering results written to: %s' % options.output_dir)
def jk_markers(self, options):
    """Jackknife marker genes."""

    check_file_exists(options.input_tree)
    if options.msa_file != 'NONE':
        check_file_exists(options.msa_file)
    make_sure_path_exists(options.output_dir)

    jackknife_markers = JackknifeMarkers(options.cpus)
    output_tree = jackknife_markers.run(options.input_tree,
                                        options.msa_file,
                                        options.marker_info_file,
                                        options.mask_file,
                                        options.perc_markers,
                                        options.num_replicates,
                                        options.model,
                                        options.jk_dir,
                                        options.output_dir)

    self.logger.info('Jackknifed marker tree written to: %s' % output_tree)
def bootstrap(self, options):
    """Bootstrap multiple sequence alignment."""

    check_file_exists(options.input_tree)
    if options.msa_file != 'NONE':
        check_file_exists(options.msa_file)
    make_sure_path_exists(options.output_dir)

    bootstrap = Bootstrap(options.cpus)
    output_tree = bootstrap.run(options.input_tree,
                                options.msa_file,
                                options.num_replicates,
                                options.model,
                                options.gamma,
                                options.base_type,
                                options.fraction,
                                options.boot_dir,
                                options.output_dir)

    self.logger.info('Bootstrapped tree written to: %s' % output_tree)
def assign(self, options):
    """Assign genomes to canonical genomes comprising GTDB reference tree."""

    check_file_exists(options.canonical_taxonomy_file)
    check_file_exists(options.full_taxonomy_file)
    check_file_exists(options.metadata_file)
    check_file_exists(options.genome_path_file)
    make_sure_path_exists(options.output_dir)

    try:
        assign = AssignGenomes(options.cpus, options.output_dir)
        assign.run(options.canonical_taxonomy_file,
                   options.full_taxonomy_file,
                   options.metadata_file,
                   options.genome_path_file,
                   options.user_genomes)
    except GenomeTreeTkError as e:
        print(e.message)
        raise SystemExit
def similarity(self, options):
    """Perform sequence similarity search between genes"""

    make_sure_path_exists(options.output_dir)

    query_gene_files = self._input_files(options.query_proteins, options.file_ext)
    target_gene_files = self._input_files(options.target_proteins, options.file_ext)

    ss = SimilaritySearch(options.cpus)
    ss.run(query_gene_files,
           target_gene_files,
           options.evalue,
           options.per_identity,
           options.per_aln_len,
           True,
           options.tmp_dir,
           options.blastp,
           options.sensitive,
           options.keep_headers,
           options.output_dir)

    self.logger.info('Sequence similarity results written to: %s' % options.output_dir)
def compatible(self, options):
    """Compatible command"""

    self.logger.info('')
    self.logger.info('*******************************************************************************')
    self.logger.info('[RefineM - compatible] Identify scaffolds with compatible genomic statistics.')
    self.logger.info('*******************************************************************************')

    check_file_exists(options.reference_file)
    check_file_exists(options.scaffold_stats_file)
    make_sure_path_exists(options.output_dir)

    # read scaffold statistics and calculate genome stats
    self.logger.info('')
    self.logger.info(' Reading scaffold statistics.')
    scaffold_stats = ScaffoldStats()
    scaffold_stats.read(options.scaffold_stats_file)

    genome_stats = GenomeStats()
    genome_stats = genome_stats.run(scaffold_stats)

    # identify putative homologs to reference genomes
    reference = Reference(1, None)
    putative_homologs = reference.homology_check(options.reference_file,
                                                 options.min_genes,
                                                 float(options.perc_genes))

    # identify scaffolds compatible with bins
    outliers = Outliers()
    output_file = os.path.join(options.output_dir, 'compatible.tsv')
    outliers.compatible(putative_homologs,
                        scaffold_stats,
                        genome_stats,
                        options.gc_perc,
                        options.td_perc,
                        options.cov_corr,
                        options.cov_perc,
                        options.report_type,
                        output_file)

    self.logger.info('')
    self.logger.info(' Results written to: ' + output_file)

    self.time_keeper.print_time_stamp()
def cluster_de_novo(self, options):
    """Infer de novo species clusters and type genomes for remaining genomes."""

    check_file_exists(options.qc_file)
    check_file_exists(options.gtdb_metadata_file)
    check_file_exists(options.gtdb_user_genomes_file)
    check_file_exists(options.genome_path_file)
    check_file_exists(options.type_genome_cluster_file)
    check_file_exists(options.type_genome_synonym_file)
    check_file_exists(options.ncbi_refseq_assembly_file)
    check_file_exists(options.ncbi_genbank_assembly_file)
    check_file_exists(options.ani_af_nontype_vs_type)
    check_file_exists(options.species_exception_file)
    make_sure_path_exists(options.output_dir)

    try:
        p = ClusterDeNovo(options.ani_sp,
                          options.af_sp,
                          options.ani_cache_file,
                          options.cpus,
                          options.output_dir)
        p.run(options.qc_file,
              options.gtdb_metadata_file,
              options.gtdb_user_genomes_file,
              options.genome_path_file,
              options.type_genome_cluster_file,
              options.type_genome_synonym_file,
              options.ncbi_refseq_assembly_file,
              options.ncbi_genbank_assembly_file,
              options.ani_af_nontype_vs_type,
              options.species_exception_file,
              options.rnd_type_genome)
    except GenomeTreeTkError as e:
        print(e.message)
        raise SystemExit

    self.logger.info('Clustering results written to: %s' % options.output_dir)
def run(self, input_tree, msa_file, num_replicates, model, base_type, frac, boot_dir, output_dir):
    """Bootstrap multiple sequence alignment.

    Parameters
    ----------
    input_tree : str
        Tree inferred with all data.
    msa_file : str
        File containing multiple sequence alignment for all taxa.
    num_replicates : int
        Number of replicates to perform.
    model : str
        Desired model of evolution.
    base_type : str
        Indicates if bases are nucleotides or amino acids.
    frac : float
        Fraction of alignment to subsample.
    boot_dir : str
        Directory containing pre-computed bootstrap replicate trees, or None.
    output_dir : str
        Directory for bootstrap trees.
    """

    assert model in ['wag', 'lg', 'jtt']
    assert base_type in ['nt', 'prot']

    self.model = model
    self.base_type = base_type
    self.frac = frac

    rep_tree_files = []
    if not boot_dir:
        self.replicate_dir = os.path.join(output_dir, 'replicates')
        make_sure_path_exists(self.replicate_dir)

        # read full multiple sequence alignment
        self.msa = seq_io.read(msa_file)

        # calculate replicates
        self.logger.info('Calculating bootstrap replicates:')
        parallel = Parallel(self.cpus)
        parallel.run(self._producer, None, range(num_replicates), self._progress)

        for rep_index in range(num_replicates):
            rep_tree_files.append(os.path.join(self.replicate_dir,
                                               'bootstrap_tree.r_' + str(rep_index) + '.tree'))
    else:
        for f in os.listdir(boot_dir):
            if f.endswith('.tree') or f.endswith('.tre'):
                rep_tree_files.append(os.path.join(boot_dir, f))
        self.logger.info('Read %d bootstrap replicates.' % len(rep_tree_files))

    # calculate support values
    self.logger.info('Calculating bootstrap support values.')
    output_tree = os.path.join(output_dir, remove_extension(input_tree) + '.bootstrap.tree')
    bootstrap_support(input_tree, rep_tree_files, output_tree)

    return output_tree
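# Hedged illustration (not part of the original source): the replicate
# generator (_producer) is not shown above, so here is a minimal,
# self-contained sketch of building one bootstrap replicate by resampling
# alignment columns with replacement, subsampling a fraction `frac` of the
# original length as in run(). The MSA is assumed to be a dict of
# taxon id -> aligned sequence.
import random


def bootstrap_alignment(msa, frac=1.0):
    """Resample alignment columns with replacement."""
    aln_len = len(next(iter(msa.values())))
    num_cols = int(aln_len * frac)
    cols = [random.randrange(aln_len) for _ in range(num_cols)]
    return {taxon: ''.join(seq[c] for c in cols) for taxon, seq in msa.items()}


# e.g., bootstrap_alignment({'A': 'MKL', 'B': 'MQL'}, frac=1.0) returns a
# 3-column alignment drawn (with replacement) from the original columns.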
def _calculate_fastani_distance(self, list_leaf, genomes):
    """Calculate the FastANI distance between all user genomes and the
    reference to classify them at the species level.

    Parameters
    ----------
    list_leaf : list
        Leaves including one or many user genomes and one reference genome.
    genomes : dict
        Dictionary of user genomes; d[genome_id] -> FASTA file.

    Returns
    -------
    dictionary
        dict_results[user_g] = {"ref_genome": ref_genome, "mash_dist": mash_dist}
    """

    try:
        self.tmp_output_dir = tempfile.mkdtemp()
        query_list_file = open(os.path.join(self.tmp_output_dir, 'query_list.txt'), 'w')
        ref_list_file = open(os.path.join(self.tmp_output_dir, 'ref_list.txt'), 'w')
        make_sure_path_exists(self.tmp_output_dir)

        for leaf in list_leaf:
            if not leaf.startswith('GB_') and not leaf.startswith('RS_') and not leaf.startswith('UBA'):
                query_list_file.write('{0}\n'.format(genomes.get(leaf)))
            else:
                shortleaf = leaf
                if leaf.startswith('GB_') or leaf.startswith('RS_'):
                    shortleaf = leaf[3:]
                ref_list_file.write('{0}{1}{2}\n'.format(Config.FASTANI_GENOMES,
                                                         shortleaf,
                                                         Config.FASTANI_GENOMES_EXT))

        query_list_file.close()
        ref_list_file.close()

        if not os.path.isfile(os.path.join(self.tmp_output_dir, 'query_list.txt')) or \
                not os.path.isfile(os.path.join(self.tmp_output_dir, 'ref_list.txt')):
            raise ValueError('FastANI query or reference list could not be written.')

        cmd = 'fastANI --ql {0} --rl {1} -o {2} > /dev/null 2>{3}'.format(
            os.path.join(self.tmp_output_dir, 'query_list.txt'),
            os.path.join(self.tmp_output_dir, 'ref_list.txt'),
            os.path.join(self.tmp_output_dir, 'results.tab'),
            os.path.join(self.tmp_output_dir, 'error.log'))
        os.system(cmd)

        if not os.path.isfile(os.path.join(self.tmp_output_dir, 'results.tab')):
            errstr = 'FastANI has stopped:\n'
            if os.path.isfile(os.path.join(self.tmp_output_dir, 'error.log')):
                with open(os.path.join(self.tmp_output_dir, 'error.log')) as debug:
                    for line in debug:
                        finalline = line
                    errstr += finalline
            raise ValueError(errstr)

        dict_parser_distance = self._parse_fastani_results(
            os.path.join(self.tmp_output_dir, 'results.tab'), list_leaf)
        shutil.rmtree(self.tmp_output_dir)
        return dict_parser_distance

    except ValueError:
        if os.path.exists(self.tmp_output_dir):
            shutil.rmtree(self.tmp_output_dir)
        raise
    except Exception:
        if os.path.exists(self.tmp_output_dir):
            shutil.rmtree(self.tmp_output_dir)
        raise
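# Hedged illustration (not part of the original source): _parse_fastani_results
# is not shown here. FastANI writes one row per query/reference pair with five
# columns: query path, reference path, ANI, count of bidirectional fragment
# mappings, and total query fragments. A minimal stand-alone parser, assuming
# that documented tab-delimited format:
def parse_fastani_results(results_file):
    """Return d[query][reference] -> ANI from a FastANI results table."""
    results = {}
    with open(results_file) as f:
        for line in f:
            query, reference, ani, _mapped, _total = line.rstrip('\n').split('\t')
            results.setdefault(query, {})[reference] = float(ani)
    return results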
def main(args=None):
    # initialize the options parser
    parser = argparse.ArgumentParser(add_help=False)
    subparsers = parser.add_subparsers(help="--", dest='subparser_name')

    # create table and plot useful for identifying taxonomic outliers
    outliers_parser = subparsers.add_parser('outliers',
                                            formatter_class=CustomHelpFormatter,
                                            description='Create information for identifying taxonomic outliers')
    outliers_parser.add_argument('input_tree', help="decorated tree for inferring RED outliers")
    outliers_parser.add_argument('taxonomy_file', help='taxonomy file for inferring RED outliers', default=None)
    outliers_parser.add_argument('output_dir', help="desired output directory for generated files")
    outliers_parser.add_argument('--viral', action="store_true",
                                 help='indicates a viral input tree and taxonomy')
    outliers_parser.add_argument('--fixed_root', action="store_true",
                                 help='use single fixed root to infer outliers')
    outliers_parser.add_argument('-t', '--trusted_taxa_file',
                                 help="file indicating trusted taxonomic groups to use for inferring distribution (default: all taxa)",
                                 default=None)
    outliers_parser.add_argument('-m', '--min_children',
                                 help='minimum required child taxa to consider taxa when inferring distribution',
                                 type=int, default=2)
    outliers_parser.add_argument('-s', '--min_support',
                                 help="minimum support value to consider taxa when inferring distribution (default: 0)",
                                 type=float, default=0.0)
    outliers_parser.add_argument('--fmeasure_table',
                                 help="table indicating F-measure score for each taxa")
    outliers_parser.add_argument('--min_fmeasure',
                                 help="minimum F-measure to consider taxa when inferring distribution",
                                 type=float, default=0.95)
    outliers_parser.add_argument('--fmeasure_mono',
                                 help="minimum F-measure to consider taxa monophyletic",
                                 type=float, default=0.95)
    outliers_parser.add_argument('--highlight_polyphyly', action="store_true",
                                 help='highlight taxa with an F-measure less than --fmeasure_mono')
    outliers_parser.add_argument('--mblet', action="store_true",
                                 help="calculate 'mean branch length to extant taxa' instead of 'relative evolutionary distances'")
    outliers_parser.add_argument('-p', '--plot_taxa_file',
                                 help="file indicating taxonomic groups to plot (default: all taxa)",
                                 default=None)
    outliers_parser.add_argument('--plot_domain', action="store_true",
                                 help='show domain rank in plot')
    outliers_parser.add_argument('--plot_dist_taxa_only', action="store_true",
                                 help='only plot taxa used to infer distribution')
    outliers_parser.add_argument('--highlight_taxa_file',
                                 help='file indicating taxa to highlight')
    outliers_parser.add_argument('--dpi', help='DPI of plots', type=int, default=96)
    outliers_parser.add_argument('--verbose_table', action="store_true",
                                 help='add additional columns to output table')
    outliers_parser.add_argument('--skip_mpld3', action="store_true",
                                 help='skip plots requiring mpld3')

    # scale a rooted tree based on RED
    scale_tree_parser = subparsers.add_parser('scale_tree',
                                              formatter_class=CustomHelpFormatter,
                                              description='Scale a rooted tree based on RED')
    scale_tree_parser.add_argument('input_tree', help="rooted tree to scale")
    scale_tree_parser.add_argument('output_tree', help="tree scaled by RED")

    # compare RED values of taxa calculated over different trees
    compare_red_parser = subparsers.add_parser('compare_red',
                                               formatter_class=CustomHelpFormatter,
                                               description='Compare RED values of taxa calculated over different trees')
    compare_red_parser.add_argument('red_table1', help="RED table calculated by 'outliers' command")
    compare_red_parser.add_argument('red_table2', help="RED table calculated by 'outliers' command")
    compare_red_parser.add_argument('red_dict2', help="median RED dictionary calculated by 'outliers' command")
    compare_red_parser.add_argument('output_table', help='output table')
    compare_red_parser.add_argument('--viral', action="store_true",
                                    help='indicates a viral input tree and taxonomy')

    # plot distribution of groups in each taxonomic rank
    dist_plot_parser = subparsers.add_parser('dist_plot',
                                             formatter_class=CustomHelpFormatter,
                                             description='Plot distribution of taxa in each taxonomic rank')
    dist_plot_parser.add_argument('input_tree',
                                  help="decorated tree for establishing relative divergence distributions")
    dist_plot_parser.add_argument('output_prefix', help="output prefix for generated files")
    dist_plot_parser.add_argument('-p', '--plot_taxa_file',
                                  help="file indicating taxonomic groups to plot (default: all taxa)",
                                  default=None)
    dist_plot_parser.add_argument('-t', '--trusted_taxa_file',
                                  help="file indicating trusted taxonomic groups to use for inferring distribution (default: all taxa)",
                                  default=None)
    dist_plot_parser.add_argument('-m', '--min_children',
                                  help='minimum required child taxa to consider taxa when inferring distribution (default: 0)',
                                  type=int, default=0)
    dist_plot_parser.add_argument('-s', '--min_support',
                                  help="minimum support value to consider taxa when inferring distribution (default: 0)",
                                  type=float, default=0.0)

    # decorate nodes with inferred taxonomic ranks
    # TODO: maybe this should just take a 'distributions file' produced by 'dist_plot'
    mark_tree_parser = subparsers.add_parser('mark_tree',
                                             formatter_class=CustomHelpFormatter,
                                             description='Mark nodes with distribution information and predicted taxonomic ranks.')
    mark_tree_parser.add_argument('input_tree', help="input tree to mark")
    mark_tree_parser.add_argument('output_tree', help="output tree with assigned taxonomic ranks")
    mark_tree_parser.add_argument('-t', '--thresholds',
                                  help="relative divergence thresholds for taxonomic ranks",
                                  type=json.loads,
                                  default='{"d": 0.33, "p": 0.56, "c": 0.65, "o": 0.78, "f": 0.92, "g": 0.99}')
    mark_tree_parser.add_argument('-s', '--min_support',
                                  help="only mark nodes above the specified support value (default=0)",
                                  type=float, default=0)
    mark_tree_parser.add_argument('-n', '--only_named_clades', action='store_true',
                                  help="only mark nodes with an existing label")
    mark_tree_parser.add_argument('-l', '--min_length',
                                  help="only mark nodes with a parent branch above the specified length (default=0)",
                                  type=float, default=0.0)
    mark_tree_parser.add_argument('--no_percentile', action="store_true",
                                  help="do not mark with percentile information")
    mark_tree_parser.add_argument('--no_relative_divergence', action="store_true",
                                  help="do not mark with relative divergence information")
    mark_tree_parser.add_argument('--no_prediction', action="store_true",
                                  help="do not mark with predicted rank information")

    # rogue test
    rogue_test_parser = subparsers.add_parser('rogue_test',
                                              formatter_class=CustomHelpFormatter,
                                              description='Index indicating the incongruence of genomes over a set of trees.')
    rogue_test_parser.add_argument('input_tree_dir',
                                   help="directory containing trees to assess incongruence over")
    rogue_test_parser.add_argument('taxonomy_file', help='file indicating taxonomy of extant taxa')
    rogue_test_parser.add_argument('output_dir', help="desired output directory for generated files")
    rogue_test_parser.add_argument('--outgroup_taxon',
                                   help='taxon to use as outgroup (e.g., d__Archaea); implies tree should be rooted')
    rogue_test_parser.add_argument('--decorate', action='store_true',
                                   help='indicates trees should be decorated')

    # decorate tree
    decorate_parser = subparsers.add_parser('decorate',
                                            formatter_class=CustomHelpFormatter,
                                            description='Place internal taxonomic labels on tree.')
    decorate_parser.add_argument('input_tree', help='tree to decorate')
    decorate_parser.add_argument('taxonomy_file', help='file indicating taxonomy of extant taxa')
    decorate_parser.add_argument('output_tree', help='decorated tree')
    decorate_parser.add_argument('--viral', action="store_true",
                                 help='indicates a viral input tree and taxonomy')
    decorate_parser.add_argument('--skip_rd_refine', action='store_true',
                                 help="skip refinement of taxonomy based on relative divergence information")
    decorate_parser.add_argument('-t', '--trusted_taxa_file',
                                 help="file indicating trusted taxonomic groups to use for inferring distribution (default: all taxa)",
                                 default=None)
    decorate_parser.add_argument('-m', '--min_children',
                                 help='minimum required child taxa to consider taxa when inferring distribution',
                                 type=int, default=2)
    decorate_parser.add_argument('-s', '--min_support',
                                 help="minimum support value to consider taxa when inferring distribution (default: 0)",
                                 type=float, default=0.0)

    # pull taxonomy strings from tree
    pull_parser = subparsers.add_parser('pull',
                                        formatter_class=CustomHelpFormatter,
                                        description='Pull taxonomy information from tree.')
    pull_parser.add_argument('input_tree', help="input tree to extract taxonomy from")
    pull_parser.add_argument('output_file', help="file to contain taxonomy strings for each extant taxon")
    pull_parser.add_argument('--no_rank_fill', action="store_true",
                             help="do not automatically fill in missing ranks")

    # validate consistency of taxonomy
    validate_parser = subparsers.add_parser('validate',
                                            formatter_class=CustomHelpFormatter,
                                            description='Validate consistency of taxonomy.')
    validate_parser.add_argument('taxonomy_file', help="file with taxonomy for extant taxa")
    validate_parser.add_argument('--no_prefix', action="store_true",
                                 help="do not check taxon prefixes")
    validate_parser.add_argument('--no_all_ranks', action="store_true",
                                 help="do not check for the presence of all ranks")
    validate_parser.add_argument('--no_hierarhcy', action="store_true",
                                 help="do not check for inconsistencies in the taxonomic hierarchy")
    validate_parser.add_argument('--no_species', action="store_true",
                                 help="do not check for hierarchical inconsistencies with named species")

    # summary statistics of taxonomic groups
    taxon_stats_parser = subparsers.add_parser('taxon_stats',
                                               formatter_class=CustomHelpFormatter,
                                               description='Summary statistics of taxonomic groups.')
    taxon_stats_parser.add_argument('taxonomy_file', help="file with taxonomy for extant taxa")
    taxon_stats_parser.add_argument('output_file', help="output file with summary statistics")

    # plot relative divergence of groups across a set of trees
    robustness_plot_parser = subparsers.add_parser('robustness_plot',
                                                   formatter_class=CustomHelpFormatter,
                                                   description='Plot relative divergence of groups across a set of trees')
    robustness_plot_parser.add_argument('rank', help="taxonomic rank of named groups to plot",
                                        type=int, choices=[1, 2, 3, 4, 5, 6])
    robustness_plot_parser.add_argument('input_tree_dir',
                                        help="directory containing trees to infer relative divergence across")
    robustness_plot_parser.add_argument('full_tree_file',
                                        help="unmodified tree to include in plot; must be decorated with taxonomy")
    robustness_plot_parser.add_argument('derep_tree_file',
                                        help="dereplicated tree to include in plot")
    robustness_plot_parser.add_argument('taxonomy_file',
                                        help="file indicating taxonomy string for each genome")
    robustness_plot_parser.add_argument('output_prefix',
                                        help="output prefix for generated files")
    robustness_plot_parser.add_argument('-m', '--min_children',
                                        help='minimum named child taxa to consider taxa',
                                        type=int, default=2)
    robustness_plot_parser.add_argument('-t', '--title', help='title of plot', default=None)

    # calculate number of taxa for specified rd thresholds
    rd_ranks_parser = subparsers.add_parser('rd_ranks',
                                            formatter_class=CustomHelpFormatter,
                                            description='Calculate number of taxa for specified rd thresholds.')
    rd_ranks_parser.add_argument('input_tree', help="input tree to calculate ranks over")
    rd_ranks_parser.add_argument('output_dir', help="desired output directory for generated files")
    rd_ranks_parser.add_argument('-t', '--thresholds',
                                 help="relative divergence thresholds for taxonomic ranks",
                                 type=json.loads,
                                 default='{"p": 0.35, "c": 0.52, "o": 0.67, "f": 0.79, "g": 0.94, "s": 0.996}')

    # calculate distribution of branch lengths at each taxonomic rank
    bl_dist_parser = subparsers.add_parser('bl_dist',
                                           formatter_class=CustomHelpFormatter,
                                           description='Calculate distribution of branch lengths at each taxonomic rank.')
    bl_dist_parser.add_argument('input_tree', help="input tree to calculate branch length distributions")
    bl_dist_parser.add_argument('output_dir', help="desired output directory for generated files")
    bl_dist_parser.add_argument('-t', '--trusted_taxa_file',
                                help="file indicating trusted taxonomic groups to use for inferring distribution (default: all taxa)",
                                default=None)
    bl_dist_parser.add_argument('-m', '--min_children',
                                help='minimum required child taxa to consider taxa when inferring distribution',
                                type=int, default=2)
    bl_dist_parser.add_argument('--taxonomy_file',
                                help='read taxonomy from this file instead of directly from tree',
                                default=None)

    # determine branch length for best congruency with existing taxonomy
    bl_optimal_parser = subparsers.add_parser('bl_optimal',
                                              formatter_class=CustomHelpFormatter,
                                              description='Determine branch length for best congruency with existing taxonomy.')
    bl_optimal_parser.add_argument('input_tree', help="input tree to calculate branch length distributions")
    bl_optimal_parser.add_argument('rank', help="rank of labels", type=int, choices=[1, 2, 3, 4, 5, 6])
    bl_optimal_parser.add_argument('output_table', help="desired name of output table")
    bl_optimal_parser.add_argument('--min_dist', help='minimum mean branch length value to evaluate',
                                   type=float, default=0.5)
    bl_optimal_parser.add_argument('--max_dist', help='maximum mean branch length value to evaluate',
                                   type=float, default=1.2)
    bl_optimal_parser.add_argument('--step_size', help='step size of mean branch length values',
                                   type=float, default=0.025)

    # decorate tree using a mean branch length criterion
    bl_decorate_parser = subparsers.add_parser('bl_decorate',
                                               formatter_class=CustomHelpFormatter,
                                               description='Decorate tree using a mean branch length criterion.')
    bl_decorate_parser.add_argument('input_tree', help="input tree to decorate")
    bl_decorate_parser.add_argument('taxonomy_file', help="file with taxonomic information for each taxon")
    bl_decorate_parser.add_argument('threshold', help="mean branch length threshold", type=float)
    bl_decorate_parser.add_argument('rank', help="rank of labels", type=int, choices=[1, 2, 3, 4, 5, 6])
    bl_decorate_parser.add_argument('output_tree', help="decorated tree")
    bl_decorate_parser.add_argument('--retain_named_lineages', action="store_true",
                                    help='retain existing named lineages at the specified rank')
    bl_decorate_parser.add_argument('--keep_labels', action="store_true",
                                    help='keep all existing internal labels')
    bl_decorate_parser.add_argument('--prune', action="store_true",
                                    help='prune tree to preserve only the shallowest and deepest taxa in each child lineage from newly decorated nodes')

    # produce table with number of lineages for increasing mean branch lengths
    bl_table_parser = subparsers.add_parser('bl_table',
                                            formatter_class=CustomHelpFormatter,
                                            description='Produce table with number of lineages for increasing mean branch lengths.')
    bl_table_parser.add_argument('input_tree', help="input tree to calculate branch length distributions")
    bl_table_parser.add_argument('taxon_category', help="file indicating category for each taxon in the tree")
    bl_table_parser.add_argument('output_table', help="desired name of output table")
    bl_table_parser.add_argument('--step_size', help="step size for mean branch length criterion",
                                 type=float, default=0.01)

    # calculate taxonomic resolution at each rank
    rank_res_parser = subparsers.add_parser('rank_res',
                                            formatter_class=CustomHelpFormatter,
                                            description='Calculate taxonomic resolution at each rank.')
    rank_res_parser.add_argument('input_tree', help="decorated tree")
    rank_res_parser.add_argument('taxonomy_file', help="file with taxonomy for extant taxa")
    rank_res_parser.add_argument('output_file', help="output file with resolution of taxa at each rank")
    rank_res_parser.add_argument('--taxa_file',
                                 help="output file indicating taxa within each resolution category",
                                 default=None)

    # get and check options
    if len(sys.argv) == 1 or sys.argv[1] in {'-h', '--help'}:
        print_help()
        sys.exit(0)
    else:
        args = parser.parse_args()

    if hasattr(args, 'output_dir'):
        make_sure_path_exists(args.output_dir)
        logger_setup(os.path.join(args.output_dir, 'phylorank.log'), False)
    elif hasattr(args, 'output_prefix'):
        output_dir, output_prefix = os.path.split(args.output_prefix)
        if output_dir:
            make_sure_path_exists(output_dir)
            logger_setup(os.path.join(output_dir, 'phylorank.log'), False)
        else:
            logger_setup('phylorank.log', False)

    # do what we came here to do
    try:
        parser = OptionsParser()
        if False:
            # import pstats
            # p = pstats.Stats('prof')
            # p.sort_stats('cumulative').print_stats(10)
            # p.sort_stats('time').print_stats(10)
            import cProfile
            cProfile.run('parser.parse_options(args)', 'prof')
        elif False:
            import pdb
            pdb.run('parser.parse_options(args)')
        else:
            parser.parse_options(args)
    except SystemExit:
        print("\n Controlled exit resulting from an unrecoverable error or warning.")
    except:
        print("\nUnexpected error:", sys.exc_info()[0])
        raise
def logger_setup(log_dir, log_file, program_name, version, silent):
    """Set up loggers.

    Two loggers are set up which both print to stdout and to a log file
    when log_dir is not None. The first logger is named 'timestamp' and
    prepends a timestamp to each call, while the other is named
    'no_timestamp' and does not prepend any information. The attribute
    'is_silent' is also added to each logger to indicate if the silent
    flag is thrown.

    Parameters
    ----------
    log_dir : str
        Output directory for log file.
    log_file : str
        Desired name of log file.
    program_name : str
        Name of program.
    version : str
        Program version number.
    silent : boolean
        Flag indicating if output to stdout should be suppressed.
    """

    # setup general properties of loggers
    timestamp_logger = logging.getLogger('timestamp')
    timestamp_logger.setLevel(logging.DEBUG)
    log_format = logging.Formatter(fmt="[%(asctime)s] %(levelname)s: %(message)s",
                                   datefmt="%Y-%m-%d %H:%M:%S")

    no_timestamp_logger = logging.getLogger('no_timestamp')
    no_timestamp_logger.setLevel(logging.DEBUG)

    # setup logging to console
    timestamp_stream_logger = logging.StreamHandler(sys.stdout)
    timestamp_stream_logger.setFormatter(log_format)
    timestamp_logger.addHandler(timestamp_stream_logger)

    no_timestamp_stream_logger = logging.StreamHandler(sys.stdout)
    no_timestamp_stream_logger.setFormatter(None)
    no_timestamp_logger.addHandler(no_timestamp_stream_logger)

    timestamp_logger.is_silent = False
    no_timestamp_stream_logger.is_silent = False
    if silent:
        timestamp_logger.is_silent = True
        timestamp_stream_logger.setLevel(logging.ERROR)
        no_timestamp_stream_logger.is_silent = True

    if log_dir:
        make_sure_path_exists(log_dir)

        timestamp_file_logger = logging.FileHandler(os.path.join(log_dir, log_file), 'a')
        timestamp_file_logger.setFormatter(log_format)
        timestamp_logger.addHandler(timestamp_file_logger)

        no_timestamp_file_logger = logging.FileHandler(os.path.join(log_dir, log_file), 'a')
        no_timestamp_file_logger.setFormatter(None)
        no_timestamp_logger.addHandler(no_timestamp_file_logger)

    timestamp_logger.info('%s v%s' % (program_name, version))
    timestamp_logger.info(ntpath.basename(sys.argv[0]) + ' ' + ' '.join(sys.argv[1:]))
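# Hedged usage sketch (not part of the original source; the program name,
# version, and file names below are hypothetical). After logger_setup() runs,
# both named loggers write to stdout and, because log_dir is given, append to
# the same shared log file.
def _demo_logger_setup():
    logger_setup(log_dir='logs',
                 log_file='run.log',
                 program_name='mytool',
                 version='1.0.0',
                 silent=False)
    logging.getLogger('timestamp').info('processing started')        # timestamped line
    logging.getLogger('no_timestamp').info('raw message, no prefix')  # bare line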
def download_strains(self, options):
    """Download strain information from BacDive."""

    make_sure_path_exists(options.output_dir)

    p = BacDive(options.output_dir, options.username, options.pwd)
    p.download_strains()
def run(self, genome_files, db_file, taxonomy_file, evalue, per_identity, window_size, step_size):
    """Create taxonomic profiles for a set of genomes.

    Parameters
    ----------
    genome_files : list of str
        Fasta files of genomes to process.
    db_file : str
        Database of reference genes.
    taxonomy_file : str
        File containing GreenGenes taxonomy strings for reference genomes.
    evalue : float
        E-value threshold used by blast.
    per_identity : float
        Percent identity threshold used by blast.
    window_size : int
        Size of each fragment.
    step_size : int
        Number of bases to move after each window.
    """

    # parse taxonomy file
    self.logger.info(' Reading taxonomic assignment of reference genomes.')
    taxonomy = Taxonomy().read(taxonomy_file)

    # fragment each genome into fixed-size windows
    self.logger.info('')
    self.logger.info(' Fragmenting sequences in each bin:')
    diamond_output_dir = os.path.join(self.output_dir, 'diamond')
    make_sure_path_exists(diamond_output_dir)

    fragment_file = os.path.join(diamond_output_dir, 'fragments.fna')
    fragment_out = open(fragment_file, 'w')
    contig_id_to_genome_id = {}
    for genome_file in genome_files:
        genome_id = remove_extension(genome_file)
        self.profiles[genome_id] = Profile(genome_id, taxonomy)
        self._fragment_genomes(genome_file,
                               window_size,
                               step_size,
                               self.profiles[genome_id],
                               fragment_out)

        for seq_id, _seq in seq_io.read_seq(genome_file):
            contig_id_to_genome_id[seq_id] = genome_id

    # run diamond
    self.logger.info('')
    self.logger.info(' Running diamond blastx with %d processes (be patient!)' % self.cpus)

    diamond = Diamond(self.cpus)
    diamond_daa_out = os.path.join(diamond_output_dir, 'diamond_hits')
    diamond.blastx(fragment_file, db_file, evalue, per_identity, 1, diamond_daa_out)

    diamond_table_out = os.path.join(diamond_output_dir, 'diamond_hits.tsv')
    diamond.view(diamond_daa_out + '.daa', diamond_table_out)

    self.logger.info('')
    self.logger.info(' Creating taxonomic profile for each genome.')
    self._taxonomic_profiles(diamond_table_out, taxonomy, contig_id_to_genome_id)

    self.logger.info('')
    self.logger.info(' Writing taxonomic profile for each genome.')

    report_dir = os.path.join(self.output_dir, 'bin_reports')
    make_sure_path_exists(report_dir)

    for genome_id, profile in self.profiles.items():
        seq_summary_out = os.path.join(report_dir, genome_id + '.sequences.tsv')
        profile.write_seq_summary(seq_summary_out)

        genome_profile_out = os.path.join(report_dir, genome_id + '.profile.tsv')
        profile.write_genome_profile(genome_profile_out)

    genome_summary_out = os.path.join(self.output_dir, 'genome_summary.tsv')
    self._write_genome_summary(genome_summary_out)

    # create Krona plot
    krona_profiles = defaultdict(lambda: defaultdict(int))
    for genome_id, profile in self.profiles.items():
        seq_assignments = profile.classify_seqs(taxonomy)

        for seq_id, classification in seq_assignments.items():
            taxa = []
            for r in range(0, len(profile.rank_labels)):
                taxa.append(classification[r][0])

            krona_profiles[genome_id][';'.join(taxa)] += profile.seq_len[seq_id]

    krona = Krona()
    krona_output_file = os.path.join(self.output_dir, 'taxonomic_profiles.krona.html')
    krona.create(krona_profiles, krona_output_file)
def run(self, scaffold_gene_file, stat_file, ref_genome_gene_files, db_file, evalue, per_identity, per_aln_len):
    """Create taxonomic profiles for a set of genomes.

    Parameters
    ----------
    scaffold_gene_file : str
        Fasta file of genes on scaffolds in amino acid space.
    stat_file : str
        File with statistics for individual scaffolds.
    ref_genome_gene_files : list of str
        Fasta files of called genes on reference genomes of interest.
    db_file : str
        Database of competing reference genes.
    evalue : float
        E-value threshold of valid hits.
    per_identity : float
        Percent identity threshold of valid hits [0, 100].
    per_aln_len : float
        Percent query coverage of valid hits [0, 100].
    """

    # read statistics file
    self.logger.info('Reading scaffold statistics.')
    scaffold_stats = ScaffoldStats()
    scaffold_stats.read(stat_file)

    # perform homology searches
    self.logger.info('Creating diamond database for reference genomes.')
    ref_gene_file = os.path.join(self.output_dir, 'ref_genes.faa')
    concatenate_gene_files(ref_genome_gene_files, ref_gene_file)

    diamond = Diamond(self.cpus)
    ref_diamond_db = os.path.join(self.output_dir, 'ref_genes')
    diamond.make_database(ref_gene_file, ref_diamond_db)

    self.logger.info('Identifying homologs within reference genomes of interest (be patient!).')
    self.diamond_dir = os.path.join(self.output_dir, 'diamond')
    make_sure_path_exists(self.diamond_dir)
    hits_ref_genomes = os.path.join(self.diamond_dir, 'ref_hits.tsv')
    diamond.blastp(scaffold_gene_file, ref_diamond_db, evalue, per_identity, per_aln_len, 1, hits_ref_genomes)

    self.logger.info('Identifying homologs within competing reference genomes (be patient!).')
    hits_comp_ref_genomes = os.path.join(self.diamond_dir, 'competing_ref_hits.tsv')
    diamond.blastp(scaffold_gene_file, db_file, evalue, per_identity, per_aln_len, 1, hits_comp_ref_genomes)

    # get list of genes with a top hit to the reference genomes of interest
    hits_to_ref = self._top_hits_to_reference(hits_ref_genomes, hits_comp_ref_genomes)

    # get number of genes on each scaffold
    num_genes_on_scaffold = defaultdict(int)
    for seq_id, _seq in seq_io.read_seq(scaffold_gene_file):
        scaffold_id = seq_id[0:seq_id.rfind('_')]
        num_genes_on_scaffold[scaffold_id] += 1

    # get hits to each scaffold
    hits_to_scaffold = defaultdict(list)
    for query_id, hit in hits_to_ref.items():
        gene_id = query_id[0:query_id.rfind('~')]
        scaffold_id = gene_id[0:gene_id.rfind('_')]
        hits_to_scaffold[scaffold_id].append(hit)

    # report summary stats for each scaffold
    reference_out = os.path.join(self.output_dir, 'references.tsv')
    fout = open(reference_out, 'w')
    fout.write('Scaffold ID\tSubject genome IDs\tSubject scaffold IDs')
    fout.write('\tGenome ID\tLength (bp)\tGC\tMean coverage')
    fout.write('\t# genes\t# hits\t% genes\tAvg. align. length (bp)\tAvg. % identity\tAvg. e-value\tAvg. bitscore\n')
    for scaffold_id, hits in hits_to_scaffold.items():
        aln_len = []
        perc_iden = []
        evalue = []
        bitscore = []
        subject_scaffold_ids = defaultdict(int)
        subject_bin_ids = defaultdict(int)
        for hit in hits:
            aln_len.append(hit.aln_length)
            perc_iden.append(hit.perc_identity)
            evalue.append(hit.evalue)
            bitscore.append(hit.bitscore)

            subject_bin_id, subject_gene_id = hit.subject_id.split('~')
            subject_scaffold_id = subject_gene_id[0:subject_gene_id.rfind('_')]
            subject_scaffold_ids[subject_scaffold_id] += 1
            subject_bin_ids[subject_bin_id] += 1

        sorted_subject_bin_ids = sorted(subject_bin_ids.items(),
                                        key=operator.itemgetter(1),
                                        reverse=True)
        subject_bin_id_str = []
        for bin_id, num_hits in sorted_subject_bin_ids:
            subject_bin_id_str.append(bin_id + ':' + str(num_hits))
        subject_bin_id_str = ','.join(subject_bin_id_str)

        sorted_subject_scaffold_ids = sorted(subject_scaffold_ids.items(),
                                             key=operator.itemgetter(1),
                                             reverse=True)
        subject_scaffold_id_str = []
        for subject_id, num_hits in sorted_subject_scaffold_ids:
            subject_scaffold_id_str.append(subject_id + ':' + str(num_hits))
        subject_scaffold_id_str = ','.join(subject_scaffold_id_str)

        fout.write('%s\t%s\t%s\t%s\t%.2f\t%d\t%d\t%.2f\t%d\t%.2f\t%.2g\t%.2f\n' % (
            scaffold_id,
            subject_bin_id_str,
            subject_scaffold_id_str,
            scaffold_stats.print_stats(scaffold_id),
            mean(scaffold_stats.coverage(scaffold_id)),
            num_genes_on_scaffold[scaffold_id],
            len(hits),
            len(hits) * 100.0 / num_genes_on_scaffold[scaffold_id],
            mean(aln_len),
            mean(perc_iden),
            mean(evalue),
            mean(bitscore)))
    fout.close()

    return reference_out
def full_lpsn_wf(self, options):
    """Full workflow to parse LPSN."""

    make_sure_path_exists(options.output_dir)

    p = LPSN(options.output_dir)
    p.full_lpsn_wf()
def parse_html(self, options):
    """Parse all html files."""

    make_sure_path_exists(options.output_dir)

    p = LPSN(options.output_dir)
    p.parse_html(options.input_dir)
def features(self, options):
    """Build feature abundance matrices from BAM files."""
    make_sure_path_exists(options.output_dir)

    reads_abundance = os.path.join(options.output_dir, DefaultValues.FEATURES_ABUNDANCE_FILES[0])
    reads_normalised = os.path.join(options.output_dir, DefaultValues.FEATURES_ABUNDANCE_FILES[1])
    reads_relative = os.path.join(options.output_dir, DefaultValues.FEATURES_ABUNDANCE_FILES[2])
    base_abundance = os.path.join(options.output_dir, DefaultValues.FEATURES_ABUNDANCE_FILES[3])
    base_normalised = os.path.join(options.output_dir, DefaultValues.FEATURES_ABUNDANCE_FILES[4])
    base_relative = os.path.join(options.output_dir, DefaultValues.FEATURES_ABUNDANCE_FILES[5])

    features_size = {}
    counts = {}
    counts_base = {}

    self.logger.info('Get features and initialise matrix')
    with open(options.faidx) as f:
        for line in f:
            if not line.startswith('#'):
                line_list = line.rstrip().split('\t')
                feature = line_list[0]
                features_size[feature] = line_list[1]
                counts[feature] = 0
                counts_base[feature] = 0

    counts_all = []
    counts_all_normalised = []
    counts_all_relative = []
    counts_base_all = []
    counts_base_all_normalised = []
    counts_base_all_relative = []
    header = ["Features", "Features_size"]

    self.logger.info('Browse alignment file(s)')
    samtoolsexec = findEx('samtools')
    with open(options.bam_list, 'r') as b:
        for bam in b:
            # skip comments and blank lines
            if not bam.strip() or bam.startswith('#'):
                continue

            i = 0
            alignementfile, librarysize = bam.rstrip().split('\t')
            if librarysize == '' or librarysize == '0':
                librarysize = 1
            samplename = remove_extension(os.path.basename(alignementfile), options.extension)
            header.append(samplename)
            self.logger.info('\t' + samplename)

            # pass the thread count and minimum mapping quality to samtools
            # as separate argv entries
            cmd = [samtoolsexec, 'view',
                   '-@', str(options.threads),
                   '-q', str(options.mapQ),
                   alignementfile]
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE).stdout
            for line in p:
                line = line.decode(sys.getdefaultencoding()).rstrip()
                if i > 0 and i % 1000000 == 0:
                    self.logger.info("Alignment record %s processed" % i)
                i += 1

                line_list = line.split('\t')
                feature = line_list[2]
                cigar = line_list[5]

                # sum the matched bases reported in the CIGAR string
                base_mapped = 0
                for base_match in re.findall(r'(\d+)M', cigar):
                    base_mapped += int(base_match)

                # the read sequence is the 10th SAM field
                read_len = len(line_list[9])
                if read_len == 0:
                    self.logger.info(line_list)
                    continue
                if base_mapped / read_len < float(options.id_cutoff):
                    continue

                counts[feature] += 1
                if options.discard_gene_length_normalisation:
                    counts_base[feature] += base_mapped
                else:
                    counts_base[feature] += base_mapped / int(features_size[feature])

            # raw reads count
            counts_all.append(counts.copy())

            # normalised reads count
            count_tmp = {k: (v / int(librarysize)) * options.feature_normalisation
                         for k, v in counts.items()}
            counts_all_normalised.append(count_tmp.copy())

            # relative reads count; guard against a sample with no retained
            # alignments (assumption: report zeros rather than failing)
            total = sum(counts.values()) or 1
            count_tmp = {k: v / total for k, v in counts.items()}
            counts_all_relative.append(count_tmp.copy())

            # raw bases count
            counts_base_all.append(counts_base.copy())

            # normalised bases count
            count_tmp = {k: (v / int(librarysize)) * options.feature_normalisation
                         for k, v in counts_base.items()}
            counts_base_all_normalised.append(count_tmp.copy())

            # relative bases count
            total = sum(counts_base.values()) or 1
            count_tmp = {k: v / total for k, v in counts_base.items()}
            counts_base_all_relative.append(count_tmp.copy())

            # reset counters for the next sample
            for fn in counts:
                counts[fn] = 0
                counts_base[fn] = 0

    self.logger.info('Print matrices')

    self.logger.info('Print raw reads abundance matrix in %s' % reads_abundance)
    output_handle = open(reads_abundance, "w")
    output_handle.write('\t'.join(header) + '\n')
    for fn in counts.keys():
        if sum([c[fn] for c in counts_all]) == 0 and options.removed:
            continue
        output_handle.write('\t'.join([fn] + [features_size[fn]]
                                      + [str(c[fn]) for c in counts_all]) + '\n')
    output_handle.close()

    self.logger.info('Print normalised reads abundance matrix in %s' % reads_normalised)
    output_handle = open(reads_normalised, "w")
    output_handle.write('\t'.join(header) + '\n')
    for fn in counts.keys():
        if sum([c[fn] for c in counts_all_normalised]) == 0 and options.removed:
            continue
        output_handle.write('\t'.join([fn] + [features_size[fn]]
                                      + [str(c[fn]) for c in counts_all_normalised]) + '\n')
    output_handle.close()

    self.logger.info('Print relative reads abundance matrix in %s' % reads_relative)
    output_handle = open(reads_relative, "w")
    output_handle.write('\t'.join(header) + '\n')
    for fn in counts.keys():
        if sum([c[fn] for c in counts_all_relative]) == 0 and options.removed:
            continue
        output_handle.write('\t'.join([fn] + [features_size[fn]]
                                      + [str(c[fn]) for c in counts_all_relative]) + '\n')
    output_handle.close()

    self.logger.info('Print raw base abundance matrix in %s' % base_abundance)
    output_handle = open(base_abundance, "w")
    output_handle.write('\t'.join(header) + '\n')
    for fn in counts_base.keys():
        if sum([c[fn] for c in counts_base_all]) == 0 and options.removed:
            continue
        output_handle.write('\t'.join([fn] + [features_size[fn]]
                                      + [str(c[fn]) for c in counts_base_all]) + '\n')
    output_handle.close()

    self.logger.info('Print normalised base abundance matrix in %s' % base_normalised)
    output_handle = open(base_normalised, "w")
    output_handle.write('\t'.join(header) + '\n')
    for fn in counts_base.keys():
        if sum([c[fn] for c in counts_base_all_normalised]) == 0 and options.removed:
            continue
        output_handle.write('\t'.join([fn] + [features_size[fn]]
                                      + [str(c[fn]) for c in counts_base_all_normalised]) + '\n')
    output_handle.close()

    self.logger.info('Print relative base abundance matrix in %s' % base_relative)
    output_handle = open(base_relative, "w")
    output_handle.write('\t'.join(header) + '\n')
    for fn in counts_base.keys():
        if sum([c[fn] for c in counts_base_all_relative]) == 0 and options.removed:
            continue
        output_handle.write('\t'.join([fn] + [features_size[fn]]
                                      + [str(c[fn]) for c in counts_base_all_relative]) + '\n')
    output_handle.close()

    self.logger.info('Matrices printed')
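# Illustrative only: a minimal sketch (not part of the tool) showing how the
# matrices written by features() can be read back. The tab-separated layout
# and header follow the code above; the helper name is hypothetical.
import csv

def read_abundance_matrix(path):
    """Return (header, rows) for a matrix produced by features()."""
    with open(path) as f:
        reader = csv.reader(f, delimiter='\t')
        header = next(reader)  # ['Features', 'Features_size', sample1, ...]
        rows = {r[0]: r[1:] for r in reader}
    return header, rows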
def modify_bin(self, options):
    """Modify bin command"""
    make_sure_path_exists(os.path.dirname(options.output_genome))

    if not (options.add or options.remove or options.outlier_file or options.compatible_file):
        self.logger.warning('No modification to bin requested.\n')
        sys.exit()

    if (options.add or options.remove) and (options.outlier_file or options.compatible_file):
        self.logger.warning("The 'outlier_file' and 'compatible_file' options cannot be specified with 'add' or 'remove'.\n")
        sys.exit()

    if options.outlier_file and options.compatible_file:
        self.logger.warning("The 'outlier_file' and 'compatible_file' options cannot be specified at the same time.\n")
        sys.exit()

    failed_to_add = []
    failed_to_remove = []
    if options.add or options.remove:
        failed_to_add, failed_to_remove = genome_tk.modify(options.genome_file,
                                                           options.scaffold_file,
                                                           options.add,
                                                           options.remove,
                                                           options.output_genome)
    elif options.outlier_file:
        outliers = Outliers()
        outliers.remove_outliers(options.genome_file, options.outlier_file,
                                 options.output_genome, False)
    elif options.compatible_file:
        outliers = Outliers()
        if options.unique_only:
            outliers.add_compatible_unique(options.scaffold_file, options.genome_file,
                                           options.compatible_file, options.min_len,
                                           options.output_genome)
        elif options.closest_only:
            outliers.add_compatible_closest(options.scaffold_file, options.genome_file,
                                            options.compatible_file, options.min_len,
                                            options.output_genome)
        else:
            outliers.add_compatible(options.scaffold_file, options.genome_file,
                                    options.compatible_file, options.min_len,
                                    options.output_genome)

    if failed_to_add:
        self.logger.warning('Failed to add the following sequence(s):')
        for seq_id in failed_to_add:
            print(' %s' % seq_id)

    if failed_to_remove:
        self.logger.warning('Failed to remove the following sequence(s):')
        for seq_id in failed_to_remove:
            print(' %s' % seq_id)

    self.logger.info('Modified genome written to: ' + options.output_genome)
def run(self, query_gene_file, target_gene_file, sorted_hit_table,
        evalue_threshold, per_iden_threshold, per_aln_len_threshold,
        num_top_targets, taxonomy_file, keep_rbhs, output_dir):
    """Classify genomes based on AAI to reference genomes.

    Parameters
    ----------
    query_gene_file : str
        File with all query genes in FASTA format.
    target_gene_file : str
        File with all target genes in FASTA format.
    sorted_hit_table : str
        Sorted table indicating genes with sequence similarity.
    evalue_threshold : float
        Evalue threshold used to define a homologous gene.
    per_iden_threshold : float
        Percent identity threshold used to define a homologous gene.
    per_aln_len_threshold : float
        Alignment length threshold used to define a homologous gene.
    num_top_targets : int
        Number of top scoring target genomes to report per query genome.
    taxonomy_file : str
        File indicating taxonomic identification of all target genomes.
    keep_rbhs : boolean
        Flag indicating if RBHs should be written to file.
    output_dir : str
        Directory to store AAI results.
    """

    # read taxonomic identification of each genome
    taxonomy = {}
    if taxonomy_file:
        for line in open(taxonomy_file):
            genome_id, taxa_str = line.rstrip().split('\t')
            taxonomy[genome_id] = taxa_str

    # calculate AAI between query and target genomes
    aai_output_dir = os.path.join(output_dir, 'aai')
    make_sure_path_exists(aai_output_dir)
    aai_calculator = AAICalculator(self.cpus)
    aai_output_file, rbh_output_file = aai_calculator.run(query_gene_file,
                                                          target_gene_file,
                                                          sorted_hit_table,
                                                          evalue_threshold,
                                                          per_iden_threshold,
                                                          per_aln_len_threshold,
                                                          keep_rbhs,
                                                          aai_output_dir)

    # determine matches to each query genome
    aai_results_file = os.path.join(aai_output_dir, 'aai_summary.tsv')
    with open(aai_results_file) as f:
        f.readline()

        hits = defaultdict(list)
        for line in f:
            line_split = line.rstrip().split('\t')
            query_id = line_split[0]
            target_id = line_split[2]
            aai = float(line_split[5])
            of = float(line_split[7])
            hits[query_id].append([target_id, aai, of])

    # report top matches
    results_file = os.path.join(output_dir, 'classify.tsv')
    fout = open(results_file, 'w')
    fout.write('Query Id\tTarget Id\tAAI\tOF\tScore')
    if taxonomy:
        fout.write('\tTarget Taxonomy')
    fout.write('\n')

    for query_id, cur_hits in hits.items():
        # sort by AAI, best match first
        cur_hits.sort(key=lambda x: x[1], reverse=True)
        for i in range(0, min(num_top_targets, len(cur_hits))):
            data = [query_id] + cur_hits[i]
            fout.write('%s\t%s\t%.2f\t%.2f' % tuple(data))

            aai = data[2]
            of = data[3]
            fout.write('\t%.2f' % (aai + of))

            target_id = cur_hits[i][0]
            if target_id in taxonomy:
                fout.write('\t%s' % taxonomy[target_id])
            fout.write('\n')
    fout.close()

    return results_file
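# Illustrative only: the score reported by run() above is simply AAI plus the
# orthologous fraction (OF), and targets are ranked by AAI. A toy check of the
# ranking logic, with made-up values:
hits = {'genomeA': [['ref1', 78.5, 0.62], ['ref2', 91.2, 0.71]]}
for query_id, cur_hits in hits.items():
    cur_hits.sort(key=lambda x: x[1], reverse=True)  # sort by AAI, best first
    best = cur_hits[0]
    print('%s -> %s (score %.2f)' % (query_id, best[0], best[1] + best[2]))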
def run(self, taxonomy_file, type_strains_file, genome_prot_dir, extension,
        max_taxa, rank, per_identity, per_aln_len, genomes_to_process,
        keep_all_genes, no_reformat_gene_ids, output_dir):
    """Create dereplicated set of genes.

    Taxonomy file should have the following format:
        <genome_id>\t<taxonomy_str>

    where taxonomy_str is in GreenGenes format:
        d__Bacteria;p__Proteobacteria;...;s__Escherichia coli

    Type strain file should have the following format:
        <genome_id>\t<genome name>

    Parameters
    ----------
    taxonomy_file : str
        File indicating taxonomy string for all genomes of interest.
    type_strains_file : str
        File indicating type strains.
    genome_prot_dir : str
        Directory containing amino acid genes for each genome.
    extension : str
        Extension of files with called genes.
    max_taxa : int
        Maximum taxa to retain in a named group.
    rank : int
        Taxonomic rank to perform dereplication (0 = domain, ..., 6 = species).
    per_identity : float
        Percent identity for subsampling similar genes.
    per_aln_len : float
        Percent alignment length for subsampling similar genes.
    genomes_to_process : str
        File with list of genomes to retain instead of performing taxon subsampling.
    keep_all_genes : boolean
        Flag indicating that no gene subsampling should be performed.
    no_reformat_gene_ids : boolean
        Flag indicating if gene ids should be reformatted to include scaffold
        names given by the GFF file.
    output_dir : str
        Desired output directory for storing results.
    """

    make_sure_path_exists(output_dir)

    self.logger.info('Dereplicating at the rank of %s.' % self.rank_labels[rank])

    # get taxonomy string for each genome
    taxonomy = {}
    if taxonomy_file:
        self.logger.info('Reading taxonomy file.')
        taxonomy = Taxonomy().read(taxonomy_file)
        self.logger.info('There are %d genomes with taxonomy strings.' % len(taxonomy))

    # get type strains; genomes which should never be dereplicated
    type_strains = set()
    if type_strains_file:
        self.logger.info('Reading type strain file.')
        type_strains = self.read_type_strain(type_strains_file)
        self.logger.info('There are %d type strains.' % len(type_strains))

    # get specific list of genomes to process
    genomes_to_retain = set()
    if genomes_to_process:
        self.logger.info('Reading genomes to retain.')
        for line in open(genomes_to_process):
            line_split = line.split()
            genomes_to_retain.add(line_split[0])
        self.logger.info('Retaining %d genomes.' % len(genomes_to_retain))

    # make sure extension filter starts with a '.'
    if not extension.startswith('.'):
        extension = '.' + extension

    # identify unique genes in each named group
    fout = open(os.path.join(output_dir, 'genomes_without_called_genes.tsv'), 'w')
    rank_genomes = defaultdict(list)
    genome_files = os.listdir(genome_prot_dir)
    underclassified_genomes = 0
    genomes_with_missing_data = 0
    for genome_file in genome_files:
        genome_id = remove_extension(genome_file, extension)

        if not genome_file.endswith(extension):
            continue

        if genomes_to_process and genome_id not in genomes_to_retain:
            continue

        genome_file = os.path.join(genome_prot_dir, genome_file)
        if not os.path.exists(genome_file):
            genomes_with_missing_data += 1
            fout.write(genome_id + '\t' + ';'.join(taxonomy.get(genome_id, self.rank_prefixes)) + '\n')
            continue

        t = taxonomy.get(genome_id, self.rank_prefixes)
        taxa = t[rank]
        if taxa[3:] == '':
            underclassified_genomes += 1
            rank_genomes[self.underclassified].append(genome_id)
        else:
            rank_genomes[taxa].append(genome_id)

        validate_seq_ids(genome_file)
    fout.close()

    total_genomes_to_process = sum([len(genome_list) for genome_list in rank_genomes.values()])
    if total_genomes_to_process == 0:
        self.logger.error('No genomes found in directory: %s. Check the --extension flag used to identify genomes.' % genome_prot_dir)
        sys.exit(-1)

    self.logger.info('Under-classified genomes automatically placed into the database: %d' % underclassified_genomes)
    self.logger.info('Genomes with missing sequence data: %d' % genomes_with_missing_data)
    self.logger.info('Total named groups: %d' % len(rank_genomes))
    self.logger.info('Total genomes to process: %d' % total_genomes_to_process)

    # process each named group
    gene_file = os.path.join(output_dir, 'custom_db.faa')
    gene_out = open(gene_file, 'w')

    taxonomy_out = open(os.path.join(output_dir, 'custom_taxonomy.tsv'), 'w')

    tmp_dir = tempfile.mkdtemp()
    total_genes_removed = 0
    total_genes_kept = 0
    total_genomes_kept = 0
    processed_genomes = 0
    for taxa, genome_list in rank_genomes.items():
        processed_genomes += len(genome_list)

        print('-------------------------------------------------------------------------------')
        self.logger.info('Processing %s | Finished %d of %d (%.2f%%) genomes.'
                         % (taxa, processed_genomes, total_genomes_to_process,
                            processed_genomes * 100.0 / total_genomes_to_process))

        # create directory with selected genomes
        taxon_dir = os.path.join(tmp_dir, 'taxon')
        os.mkdir(taxon_dir)

        reduced_genome_list = genome_list
        if not genomes_to_process and taxa != self.underclassified:
            # perform taxon subsampling
            reduced_genome_list = self.select_taxa(genome_list, taxonomy, type_strains, max_taxa)
        total_genomes_kept += len(reduced_genome_list)

        gene_dir = os.path.join(taxon_dir, 'genes')
        os.mkdir(gene_dir)
        for genome_id in reduced_genome_list:
            taxonomy_out.write(genome_id + '\t' + ';'.join(taxonomy.get(genome_id, self.rank_prefixes)) + '\n')

            genome_gene_file = os.path.join(genome_prot_dir, genome_id + extension)
            gff_file = os.path.join(genome_prot_dir, genome_id + '.gff')
            output_gene_file = os.path.join(gene_dir, genome_id + '.faa')
            if not no_reformat_gene_ids:
                self.reformat_gene_id_to_scaffold_id(genome_gene_file, gff_file, taxonomy, output_gene_file)
            else:
                os.system('cp %s %s' % (genome_gene_file, output_gene_file))

        # filter genes based on amino acid identity
        genes_to_remove = []
        amended_gene_dir = os.path.join(taxon_dir, 'amended_genes')
        if keep_all_genes or taxa == self.underclassified:
            # modify gene identifiers to include genome ids
            self.amend_gene_identifies(gene_dir, amended_gene_dir)
        else:
            # filter genes on AAI
            genes_to_remove = self.filter_aai(taxon_dir, gene_dir, amended_gene_dir,
                                              per_identity, per_aln_len, self.cpus)

        self.logger.info('Writing unique genes from genomes in %s.' % taxa)
        genes_kept = self.write_gene_file(gene_out, amended_gene_dir, reduced_genome_list,
                                          taxonomy, genes_to_remove)

        self.logger.info('Retained %d of %d genomes.' % (len(reduced_genome_list), len(genome_list)))
        self.logger.info('Genes to keep: %d' % genes_kept)
        self.logger.info('Genes removed: %d' % len(genes_to_remove))

        total_genes_kept += genes_kept
        total_genes_removed += len(genes_to_remove)

        shutil.rmtree(taxon_dir)

    taxonomy_out.close()
    gene_out.close()

    self.logger.info('Retained %d of %d (%.1f%%) genomes' % (total_genomes_kept,
                                                             total_genomes_to_process,
                                                             total_genomes_kept * 100.0 / total_genomes_to_process))
    self.logger.info('Total genes kept: %d' % total_genes_kept)
    self.logger.info('Total genes removed: %d (%.1f%%)' % (total_genes_removed,
                                                           total_genes_removed * 100.0 / (total_genes_kept + total_genes_removed)))

    self.logger.info('Creating BLAST database.')
    os.system('makeblastdb -dbtype prot -in %s' % gene_file)

    shutil.rmtree(tmp_dir)
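# Illustrative only: once run() has written custom_db.faa and indexed it with
# makeblastdb, the database can be searched with the standard BLAST+ tools.
# The query file name, E-value, and output path below are placeholders.
import subprocess

def search_custom_db(query_faa, db_faa, out_file):
    """Search a dereplicated gene database built by run() with blastp."""
    subprocess.check_call(['blastp', '-query', query_faa, '-db', db_faa,
                           '-outfmt', '6', '-evalue', '1e-5', '-out', out_file])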
def deleteGenomes(self, batchfile=None, db_genome_ids=None, reason=None):
    '''
    Delete genomes. Returns True for success or False for failure.

    Parameters:
    :param batchfile: Text file listing a range of ids to delete.
    :param db_genome_ids: A list of ids that can be given directly on the command line.
    :param reason: Reason recorded for deleting the genomes.
    '''
    self._loggerSetup()
    try:
        if db_genome_ids is False:
            raise GenomeDatabaseError(
                "Unable to delete genomes. Unable to retrieve genome ids.")

        # restrict deletion to genomes owned by user
        has_permission, username, genomes_owners = self._hasPermissionToEditGenomes(db_genome_ids)

        if has_permission is None:
            raise GenomeDatabaseError(
                "Unable to delete genomes. Unable to retrieve permissions for genomes.")

        if has_permission is False:
            raise GenomeDatabaseError(
                "Unable to delete genomes. Insufficient permissions.")

        if db_genome_ids:
            if not confirm("Are you sure you want to delete %i genomes (this action cannot be undone)" % len(db_genome_ids)):
                raise GenomeDatabaseError("User aborted database action.")

            self.cur.execute("DELETE FROM aligned_markers WHERE genome_id IN %s", (tuple(db_genome_ids),))
            self.cur.execute("DELETE FROM genome_list_contents WHERE genome_id IN %s", (tuple(db_genome_ids),))

            # deletion of metadata
            self.cur.execute("DELETE FROM metadata_genes WHERE id IN %s", (tuple(db_genome_ids),))
            self.cur.execute("DELETE FROM metadata_ncbi WHERE id IN %s", (tuple(db_genome_ids),))
            self.cur.execute("DELETE FROM metadata_nucleotide WHERE id IN %s", (tuple(db_genome_ids),))
            self.cur.execute("DELETE FROM metadata_taxonomy WHERE id IN %s", (tuple(db_genome_ids),))
            self.cur.execute("DELETE FROM metadata_rna WHERE id IN %s", (tuple(db_genome_ids),))
            self.cur.execute("DELETE FROM metadata_sequence WHERE id IN %s", (tuple(db_genome_ids),))
            self.cur.execute("DELETE FROM genomes WHERE id IN %s", (tuple(db_genome_ids),))
            self.cur.execute("UPDATE metadata_taxonomy SET gtdb_genome_representative = NULL "
                             "WHERE gtdb_genome_representative IN %s", (tuple(genomes_owners.keys()),))

            for genome, info in genomes_owners.items():
                if str(username) != str(info.get("owner")):
                    logging.info(
                        '''Genome {0} has been deleted by {1} for the following reason '{2}'
                           WARNING: {1} is not the owner of this {0} (real owner {3})
                           {0} needs to be moved manually to the deprecated folder'''.format(
                            genome, username, reason, info.get("owner")))
                else:
                    if info.get("prefix") == "U":
                        target = os.path.dirname(os.path.join(self.deprecatedUserDir, info.get("relative_path")))
                    elif info.get("prefix") == "GB":
                        target = os.path.join(self.deprecatedGBKDir, info.get("relative_path"))
                    elif info.get("prefix") == "RS":
                        target = os.path.join(self.deprecatedRSQDir, info.get("relative_path"))
                    make_sure_path_exists(target)
                    os.rename(os.path.dirname(Tools.fastaPathGenerator(info.get("relative_path"), info.get("prefix"))), target)
                    logging.info("Genome {0} has been deleted by {1} for the following reason '{2}'".format(genome, username, reason))
    except GenomeDatabaseError as e:
        raise e

    return True
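# Note on the SQL above (illustrative): with psycopg2, a Python tuple passed
# as a query parameter is adapted to a parenthesised value list, so
# "... WHERE id IN %s" with the parameter (tuple(db_genome_ids),) expands to,
# e.g., "... WHERE id IN (1, 2, 3)". The extra trailing comma is what makes
# the tuple itself a single parameter:
#
#     cur.execute("DELETE FROM genomes WHERE id IN %s", (tuple([1, 2, 3]),))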
def run(self, genome_files, output_dir, called_genes=False, translation_table=None,
        meta=False, closed_ends=False):
    """Call genes with Prodigal.

    Call genes with Prodigal and store the results in the specified output
    directory. For convenience, the called_genes flag can be used to indicate
    genes have previously been called and simply need to be copied to the
    specified output directory.

    Parameters
    ----------
    genome_files : list of str
        Nucleotide fasta files to call genes on.
    output_dir : str
        Directory to store called genes.
    called_genes : boolean
        Flag indicating if genes are already called.
    translation_table : int
        Specifies desired translation table; use None to automatically
        select between tables 4 and 11.
    meta : boolean
        Flag indicating if Prodigal should call genes with the metagenomics procedure.
    closed_ends : boolean
        If True, do not allow genes to run off edges (throws -c flag).

    Returns
    -------
    d[genome_id] -> namedtuple(best_translation_table coding_density_4 coding_density_11)
        Summary statistics of called genes for each genome.
    """

    self.called_genes = called_genes
    self.translation_table = translation_table
    self.meta = meta
    self.closed_ends = closed_ends
    self.output_dir = output_dir

    make_sure_path_exists(self.output_dir)

    progress_func = None
    if self.verbose:
        file_type = 'genomes'
        self.progress_str = '  Finished processing %d of %d (%.2f%%) genomes.'
        if meta:
            file_type = 'scaffolds'
            if len(genome_files):
                file_type = ntpath.basename(genome_files[0])
            self.progress_str = '  Finished processing %d of %d (%.2f%%) files.'

        self.logger.info('Identifying genes within %s: ' % file_type)
        progress_func = self._progress

    parallel = Parallel(self.cpus)
    summary_stats = parallel.run(self._producer, self._consumer, genome_files, progress_func)

    return summary_stats
def pull_html(self, options):
    """Pull all genus.html files."""
    make_sure_path_exists(options.output_dir)
    p = LPSN(options.output_dir)
    p.pull_html()
def parse_options(self, options):
    """Parse user options and call the correct pipeline(s)."""
    try:
        if options.file == "stdout":
            options.file = ''
    except AttributeError:
        # not all subcommands define a 'file' option
        pass

    if options.subparser_name == 'call_genes':
        self.call_genes(options)
    elif options.subparser_name == 'similarity':
        self.similarity(options)
    elif options.subparser_name == 'aai':
        self.aai(options)
    elif options.subparser_name == 'classify':
        self.classify(options)
    elif options.subparser_name == 'aai_wf':
        root_dir = options.output_dir
        make_sure_path_exists(root_dir)

        if options.proteins:
            if options.file_ext == 'fna':
                self.logger.warning("Changing file extension from 'fna' to 'faa' since 'proteins' flag was given.")
                options.file_ext = 'faa'
            options.query_proteins = options.input_files
            options.target_proteins = options.input_files
        else:
            options.input_genomes = options.input_files
            options.output_dir = os.path.join(root_dir, 'genes')
            self.call_genes(options)

            options.query_proteins = os.path.join(root_dir, 'genes')
            options.target_proteins = os.path.join(root_dir, 'genes')
            options.file_ext = 'faa'

        options.output_dir = os.path.join(root_dir, 'similarity')
        self.similarity(options)

        options.query_gene_file = os.path.join(options.output_dir, 'query_genes.faa')
        options.sorted_hit_table = os.path.join(options.output_dir, 'hits_sorted.tsv')
        options.output_dir = os.path.join(root_dir, 'aai')
        self.aai(options)
    elif options.subparser_name == 'classify_wf':
        root_dir = options.output_dir
        make_sure_path_exists(root_dir)

        if options.query_files == options.target_files:
            self.logger.error("The 'query_files' and 'target_files' arguments must be different.")
            sys.exit()

        if options.proteins:
            if options.file_ext == 'fna':
                self.logger.warning("Changing file extension from 'fna' to 'faa' since 'proteins' flag was given.")
                options.file_ext = 'faa'
            options.query_proteins = options.query_files
            options.target_proteins = options.target_files
        else:
            options.input_genomes = options.query_files
            options.output_dir = os.path.join(root_dir, 'query_genes')
            self.call_genes(options)

            options.input_genomes = options.target_files
            options.output_dir = os.path.join(root_dir, 'target_genes')
            self.call_genes(options)

            options.query_proteins = os.path.join(root_dir, 'query_genes')
            options.target_proteins = os.path.join(root_dir, 'target_genes')
            options.file_ext = 'faa'

        options.output_dir = os.path.join(root_dir, 'similarity')
        self.similarity(options)

        options.query_gene_file = os.path.join(options.output_dir, 'query_genes.faa')
        options.target_gene_file = os.path.join(options.output_dir, 'target_genes.faa')
        options.sorted_hit_table = os.path.join(options.output_dir, 'hits_sorted.tsv')
        options.output_dir = os.path.join(root_dir, 'classify')
        self.classify(options)
    elif options.subparser_name == 'aa_usage':
        self.aa_usage(options)
    elif options.subparser_name == 'codon_usage':
        self.codon_usage(options)
    elif options.subparser_name == 'kmer_usage':
        self.kmer_usage(options)
    elif options.subparser_name == 'stop_usage':
        self.stop_usage(options)
    elif options.subparser_name == 'lgt_di':
        self.lgt_di(options)
    elif options.subparser_name == 'lgt_codon':
        self.lgt_codon(options)
    elif options.subparser_name == 'diss':
        self.diss(options)
    elif options.subparser_name == 'hclust':
        self.hclust(options)
    elif options.subparser_name == 'pcoa_plot':
        self.pcoa_plot(options)
    elif options.subparser_name == 'heatmap':
        self.heatmap(options)
    else:
        self.logger.error('[Error] Unknown CompareM command: "' + options.subparser_name + '"\n')
        sys.exit()

    return 0
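# Design note (illustrative, not part of CompareM): a long if/elif dispatcher
# like parse_options() can equivalently be written as a lookup table, which
# keeps the mapping from subcommand names to handlers in one place. The
# 'dispatch' helper below is hypothetical; only a few handlers are shown.
def dispatch(self, options):
    handlers = {'call_genes': self.call_genes,
                'similarity': self.similarity,
                'aai': self.aai,
                'classify': self.classify}
    handler = handlers.get(options.subparser_name)
    if handler is None:
        self.logger.error('Unknown CompareM command: "%s"' % options.subparser_name)
        sys.exit()
    handler(options)
    return 0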
def __tigrfam_worker(self, queue_in, queue_out):
    """Process each data item in parallel."""
    tigrfam_version = 'tigrfam_15.0'
    tigrfam_extension = f'_{tigrfam_version}.tsv'
    tigrfam_tophit_extension = f'_{tigrfam_version}_tophit.tsv'
    symlink_tigrfam_extension = '_tigrfam.tsv'
    symlink_tigrfam_tophit_extension = '_tigrfam_tophit.tsv'

    while True:
        gene_file = queue_in.get(block=True, timeout=None)
        if gene_file is None:
            break

        assembly_dir, filename = os.path.split(gene_file)
        make_sure_path_exists(os.path.join(assembly_dir, tigrfam_version))
        output_hit_file = os.path.join(assembly_dir, tigrfam_version,
                                       filename.replace(self.protein_file_ext, tigrfam_extension))
        hmmsearch_out = os.path.join(assembly_dir, tigrfam_version,
                                     filename.replace(self.protein_file_ext, f'_{tigrfam_version}.out'))
        cmd = 'hmmsearch -o %s --tblout %s --noali --notextw --cut_nc --cpu 1 %s %s' % (
            hmmsearch_out, output_hit_file, self.tigrfam_hmms, gene_file)
        os.system(cmd)

        # calculate checksum
        checksum = sha256(output_hit_file)
        fout = open(output_hit_file + '.sha256', 'w')
        fout.write(checksum)
        fout.close()

        # determine top hits
        tigrfam_tophit_file = os.path.join(assembly_dir, tigrfam_version,
                                           filename.replace(self.protein_file_ext, tigrfam_tophit_extension))
        self._tigr_top_hit(output_hit_file, tigrfam_tophit_file)

        # create symlinks in the prodigal folder
        new_hit_link = os.path.join(assembly_dir,
                                    filename.replace(self.protein_file_ext, symlink_tigrfam_extension))
        new_tophit_link = os.path.join(assembly_dir,
                                       filename.replace(self.protein_file_ext, symlink_tigrfam_tophit_extension))
        os.symlink(output_hit_file, new_hit_link)
        os.symlink(tigrfam_tophit_file, new_tophit_link)

        # allow results to be processed or written to file
        queue_out.put(gene_file)
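# Illustrative only: __tigrfam_worker() follows the standard multiprocessing
# queue/sentinel pattern, breaking out of its loop when it reads None. A
# minimal standalone sketch of how such a worker is typically driven; the
# helper name and worker count are placeholders.
import multiprocessing as mp

def run_workers(worker, gene_files, num_workers=4):
    queue_in, queue_out = mp.Queue(), mp.Queue()
    for gf in gene_files:
        queue_in.put(gf)
    for _ in range(num_workers):
        queue_in.put(None)  # one sentinel per worker so each loop terminates
    procs = [mp.Process(target=worker, args=(queue_in, queue_out)) for _ in range(num_workers)]
    for p in procs:
        p.start()
    results = [queue_out.get() for _ in gene_files]  # one result per input file
    for p in procs:
        p.join()
    return results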
def _calculate_fastani_distance(self, user_genome, genome_reps):
    """Calculate the FastANI distance between a user genome and the
    reference genomes to classify it at the species level.

    Parameters
    ----------
    user_genome : User genome.
    genome_reps : List of representative genomes.
    """
    try:
        self.tmp_output_dir = tempfile.mkdtemp()
        make_sure_path_exists(self.tmp_output_dir)

        # write the two input files for fastANI: the query file and the
        # reference file
        query_list_file = open(os.path.join(self.tmp_output_dir, 'query_list.txt'), 'w')

        # rebuild the path for each unprocessed genome
        genome_dirs_query = ("SELECT g.id, g.fasta_file_location, gs.external_id_prefix "
                             "FROM genomes g "
                             "LEFT JOIN genome_sources gs ON gs.id = g.genome_source_id "
                             "WHERE g.id IN %s")
        self.cur.execute(genome_dirs_query, (tuple([user_genome]),))
        raw_results = self.cur.fetchall()
        genome_dir_user = {a: fastaPathGenerator(b, c) for a, b, c in raw_results}
        for _k, v in genome_dir_user.items():
            query_list_file.write('{}\n'.format(v))
        query_list_file.close()

        # rebuild the path for each potential representative
        self.cur.execute(genome_dirs_query, (tuple(list(zip(*genome_reps))[0]),))
        raw_results = self.cur.fetchall()
        genome_dirs = {a: fastaPathGenerator(b, c) for a, b, c in raw_results}
        ref_list_file = open(os.path.join(self.tmp_output_dir, 'ref_list.txt'), 'w')
        for _k, v in genome_dirs.items():
            ref_list_file.write('{}\n'.format(v))
        ref_list_file.close()

        # run fastANI
        if (not os.path.isfile(os.path.join(self.tmp_output_dir, 'query_list.txt'))
                or not os.path.isfile(os.path.join(self.tmp_output_dir, 'ref_list.txt'))):
            raise ValueError('fastANI input files could not be written.')

        cmd = 'fastANI --ql {0} --rl {1} -o {2} > /dev/null 2>{3}'.format(
            os.path.join(self.tmp_output_dir, 'query_list.txt'),
            os.path.join(self.tmp_output_dir, 'ref_list.txt'),
            os.path.join(self.tmp_output_dir, 'results.tab'),
            os.path.join(self.tmp_output_dir, 'error.log'))
        os.system(cmd)

        if not os.path.isfile(os.path.join(self.tmp_output_dir, 'results.tab')):
            errstr = 'FastANI has stopped:\n'
            if os.path.isfile(os.path.join(self.tmp_output_dir, 'error.log')):
                with open(os.path.join(self.tmp_output_dir, 'error.log')) as debug:
                    for line in debug:
                        finalline = line
                    errstr += finalline
            raise ValueError(errstr)

        dict_parser_distance = self._parse_fastani_results(
            os.path.join(self.tmp_output_dir, 'results.tab'), genome_dirs, user_genome)
        if len(dict_parser_distance) == 0:
            # clean up the temporary directory before returning
            shutil.rmtree(self.tmp_output_dir)
            return None

        # pick the reference with the highest ANI to the user genome
        sorted_dict = sorted(dict_parser_distance.get(user_genome).items(),
                             key=lambda item: item[1]['ani'], reverse=True)
        fastani_matching_reference = sorted_dict[0][0]
        shutil.rmtree(self.tmp_output_dir)
        return fastani_matching_reference
    except ValueError as error:
        if os.path.exists(self.tmp_output_dir):
            shutil.rmtree(self.tmp_output_dir)
        raise error
    except Exception as error:
        if os.path.exists(self.tmp_output_dir):
            shutil.rmtree(self.tmp_output_dir)
        raise error
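# Illustrative only: fastANI writes one tab-separated row per query/reference
# pair: query path, reference path, ANI, bidirectional fragment mappings, and
# total query fragments. A minimal standalone parser for such a results.tab
# (the helper name is hypothetical; the tool's own _parse_fastani_results may
# differ):
def parse_fastani_results_tab(path):
    hits = {}
    with open(path) as f:
        for line in f:
            query, ref, ani, mapped, total = line.rstrip().split('\t')
            hits.setdefault(query, {})[ref] = {'ani': float(ani)}
    return hits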
def run(self, input_tree, msa_file, marker_info_file, mask_file,
        perc_markers_to_keep, num_replicates, model, output_dir):
    """Jackknife marker genes.

    Marker file should have the format:
        <marker id>\t<marker name>\t<marker desc>\t<length>\n

    Parameters
    ----------
    input_tree : str
        Tree inferred with all data.
    msa_file : str
        File containing multiple sequence alignment for all taxa.
    marker_info_file : str
        File indicating database id, HMM name, description and length of
        each marker in the alignment.
    mask_file : str
        File indicating masking of multiple sequence alignment.
    perc_markers_to_keep : float [0, 1]
        Percentage of marker genes to keep in each replicate.
    num_replicates : int
        Number of replicates to perform.
    model : str
        Desired model of evolution.
    output_dir : str
        Output directory for jackknife trees.
    """

    assert model in ['wag', 'jtt']

    self.model = model
    self.perc_markers_to_keep = perc_markers_to_keep

    self.replicate_dir = os.path.join(output_dir, 'replicates')
    make_sure_path_exists(self.replicate_dir)

    # determine length of each marker gene in alignment
    marker_lengths = []
    total_len = 0
    with open(marker_info_file) as f:
        f.readline()
        for line in f:
            line_split = line.split('\t')
            ml = int(line_split[3])
            marker_lengths.append(ml)
            total_len += ml
    self.logger.info('Concatenated length of markers: %d' % total_len)

    # read mask and determine the masked length of each marker
    mask = open(mask_file).readline().strip()
    start = 0
    self.marker_lengths = []
    total_mask_len = 0
    for ml in marker_lengths:
        end = start + ml
        zeros = mask[start:end].count('0')
        start = end

        self.marker_lengths.append(ml - zeros)
        total_mask_len += ml - zeros
    self.logger.info('Concatenated length of filtered MSA: %d' % total_mask_len)

    # read full multiple sequence alignment
    self.msa = seq_io.read(msa_file)
    if len(list(self.msa.values())[0]) != total_mask_len:
        self.logger.error('Length of MSA does not match length of mask.')
        sys.exit()

    # calculate replicates
    self.logger.info('Calculating jackknife marker replicates:')
    parallel = Parallel(self.cpus)
    parallel.run(self._producer, None, range(num_replicates), self._progress)

    # calculate support
    self.logger.info('Calculating support for %d replicates.' % num_replicates)
    rep_tree_files = []
    for rep_index in range(num_replicates):
        rep_tree_files.append(os.path.join(self.replicate_dir,
                                           'jk_markers.tree.' + str(rep_index) + '.tre'))

    output_tree = os.path.join(output_dir, remove_extension(input_tree) + '.jk_markers.tree')
    bootstrap_support(input_tree, rep_tree_files, output_tree)

    return output_tree
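# Illustrative only: the mask read above is a string of '0'/'1' characters,
# one per alignment column, where columns flagged '0' are excluded; this is
# why the code subtracts the zero count from each marker length. A toy
# demonstration of applying such a mask to a single sequence:
mask = '1101'
seq = 'ACGT'
filtered = ''.join(ch for ch, m in zip(seq, mask) if m == '1')
assert filtered == 'ACT'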
def __init__(self, output_dir):
    self.outdir = output_dir
    make_sure_path_exists(self.outdir)
    self.outfile = os.path.join(self.outdir, 'existing_names.tsv')