def run_search_and_parse_results(self):
    """Align the protein against the database based on only sequence.

    Runs a DIAMOND blastp search of `self.target_fasta_path` against
    `pdb_95.dmnd` in `self.database_dir`, computes a length-corrected percent
    identity ("proper_pident") for every hit, keeps the hits that are at or above
    `self.percent_identical_cutoff` (one per PDB code), and finally retains up to
    `self.max_number_templates` of the best ones.

    The accepted templates are stored in `self.list_of_template_code_and_chain_ids`
    as (code, chain) tuples and appended to `self.out["templates"]`.

    Raises
    ======
    ConfigError
        If `self.percent_identical_cutoff` or `self.max_number_templates` is falsy.
    self.EndModeller
        If the search returns no hits, or no hit survives the identity filter.
    """

    if not self.percent_identical_cutoff or not self.max_number_templates:
        raise ConfigError(
            "run_search_and_parse_results :: You initiated this class without providing values for percent_identical_cutoff "
            "and max_number_templates, which is required for this function."
        )

    # Change to MODELLER working directory. The `finally` guarantees we go back to the
    # user directory even if the search fails, so a failure here does not leave the
    # process sitting in the wrong working directory.
    os.chdir(self.directory)
    try:
        driver = diamond.Diamond(
            query_fasta=self.target_fasta_path,
            target_fasta=J(self.database_dir, 'pdb_95.dmnd'),
            run=terminal.Run(verbose=False),
            progress=terminal.Progress(verbose=False),
        )
        driver.blastp()
    finally:
        # Change back to user directory
        os.chdir(self.start_dir)

    search_df = driver.view_as_dataframe(J(self.directory, driver.tabular_output_path))

    matches_found = search_df.shape[0]

    if not matches_found:
        self.run.warning(
            "No proteins with homologous sequence were found for {}. No structure will be modelled"
            .format(self.corresponding_gene_call))
        raise self.EndModeller

    # We need the gene length for proper_pident
    target_fasta = u.SequenceSource(self.target_fasta_path, lazy_init=False)
    while next(target_fasta):
        gene_length = len(target_fasta.seq)

    # add some useful columns
    search_df["proper_pident"] = search_df["pident"] * search_df["length"] / gene_length
    # the last character of `sseqid` is taken as the chain, everything before it as the code
    search_df["code"] = search_df["sseqid"].str[:-1]
    search_df["chain"] = search_df["sseqid"].str[-1]

    # remember the best hit before filtering so it can be reported if nothing survives
    max_pident_found = search_df["proper_pident"].max()
    id_of_max_pident = tuple(
        search_df.loc[search_df["proper_pident"].idxmax(), ["code", "chain"]].values)

    # filter results by self.percent_identical_cutoff.
    search_df = search_df[search_df["proper_pident"] >= self.percent_identical_cutoff]

    search_df = search_df.sort_values("proper_pident", ascending=False)

    # If more than 1 template in 1 PDB id, just choose 1
    search_df = search_df.drop_duplicates('code', keep='first')

    matches_after_filter = len(search_df)
    if not matches_after_filter:
        self.run.warning("Gene {} did not have a search result with proper percent identicalness above or equal "
                         "to {}%. The best match was chain {} of https://www.rcsb.org/structure/{}, which had a "
                         "proper percent identicalness of {:.2f}%. No structure will be modelled."
                         .format(self.corresponding_gene_call,
                                 self.percent_identical_cutoff,
                                 id_of_max_pident[1],
                                 id_of_max_pident[0],
                                 max_pident_found))
        raise self.EndModeller

    # get up to self.max_number_templates of those with the highest proper_pident scores.
    search_df = search_df.iloc[:min(len(search_df), self.max_number_templates)]

    # Get their chain and 4-letter ids
    self.list_of_template_code_and_chain_ids = list(zip(search_df["code"], search_df["chain"]))

    self.run.info("Max number of templates allowed", self.max_number_templates)
    self.run.info("Number of candidate templates", matches_found)
    self.run.info("After >{}% identical filter".format(self.percent_identical_cutoff), matches_after_filter)
    self.run.info("Number accepted as templates", len(self.list_of_template_code_and_chain_ids))

    # update user on which templates are used, and write the templates to self.out
    templates = zip(self.list_of_template_code_and_chain_ids, search_df["proper_pident"])
    for template_number, ((pdb_id, chain_id), ppi) in enumerate(templates, start=1):
        self.out["templates"]["pdb_id"].append(pdb_id)
        self.out["templates"]["chain_id"].append(chain_id)
        self.out["templates"]["ppi"].append(ppi)

        self.run.info(
            "Template {}".format(template_number),
            "Protein ID: {}, Chain {} ({:.1f}% identical)".format(pdb_id, chain_id, ppi))
from anvio.errors import ConfigError
from anvio.tables.views import TablesForViews
from anvio.tables.codonfrequencies import TableForCodonFrequencies
from anvio.tables.variability import TableForVariability
from anvio.tables.miscdata import TableForLayerAdditionalData

__author__ = "Developers of anvi'o (see AUTHORS.txt)"
__copyright__ = "Copyleft 2015-2018, the Meren Lab (http://merenlab.org/)"
__credits__ = []
__license__ = "GPL 3.0"
__version__ = anvio.__version__
__maintainer__ = "A. Murat Eren"
__email__ = "*****@*****.**"

# Progress/Run objects created with verbose=False
null_progress = terminal.Progress(verbose=False)
null_run = terminal.Run(verbose=False)

# short alias for terminal.pretty_print
pp = terminal.pretty_print


class BAMProfiler(dbops.ContigsSuperclass):
    """Creates an über class for BAM file operations"""

    # NOTE(review): this initializer only sets a handful of attributes and looks like a
    # truncated excerpt of a longer one -- confirm against the full class.
    def __init__(self, args, r=terminal.Run(width=35), p=terminal.Progress()):
        self.args = args
        self.progress = p
        self.run = r

        # `A` returns the value of an option in `args`, or None when `args` has no such attribute
        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.input_file_path = A('input_file')
        self.contigs_db_path = A('contigs_db')
        self.serialized_profile_path = A('serialized_profile')
def __init__(self, args, r=terminal.Run(width=35), p=terminal.Progress()):
    """Read profiling options from `args`, validate them, and initialize the contigs database.

    Parameters
    ==========
    args : namespace-like object
        Options are looked up in `args.__dict__`; an option that is absent resolves to None
        (or to the fallback shown next to it below).
    r : terminal.Run
        Run object used for messages.
    p : terminal.Progress
        Progress object.

    Raises
    ======
    ConfigError
        On contradictory clustering flags, on a `max_coverage_depth` that is not below
        `auxiliarydataops.COVERAGE_MAX_VALUE`, or when no contigs database is given.

    Notes
    =====
    If `list_contigs` is set, `self.list_contigs()` is called and the process exits via
    `sys.exit()` before the contigs database is initialized.
    """

    self.args = args
    self.progress = p
    self.run = r

    def A(x):
        # value of option `x`, or None if `args` does not carry it
        return args.__dict__.get(x)

    self.input_file_path = A('input_file')
    self.contigs_db_path = A('contigs_db')
    self.serialized_profile_path = A('serialized_profile')
    self.output_directory = A('output_dir')
    self.list_contigs_and_exit = A('list_contigs')
    self.min_contig_length = A('min_contig_length') or 0
    self.max_contig_length = A('max_contig_length') or sys.maxsize
    self.min_mean_coverage = A('min_mean_coverage')
    self.min_coverage_for_variability = A('min_coverage_for_variability')
    self.contigs_shall_be_clustered = A('cluster_contigs')
    self.skip_hierarchical_clustering = A('skip_hierarchical_clustering')
    self.sample_id = A('sample_name')
    self.report_variability_full = A('report_variability_full')
    self.overwrite_output_destinations = A('overwrite_output_destinations')
    self.skip_SNV_profiling = A('skip_SNV_profiling')
    self.profile_SCVs = A('profile_SCVs')
    self.ignore_orphans = A('ignore_orphans')
    self.max_coverage_depth = A('max_coverage_depth') or 8000
    self.gen_serialized_profile = A('gen_serialized_profile')
    self.distance = A('distance') or constants.distance_metric_default
    self.linkage = A('linkage') or constants.linkage_method_default
    self.num_threads = int(A('num_threads') or 1)
    # explicit `is not None` checks so that a user-supplied 0 is kept as 0
    self.queue_size = int(A('queue_size') if A('queue_size') is not None else 0)
    self.write_buffer_size = int(A('write_buffer_size') if A('write_buffer_size') is not None else 500)
    self.total_length_of_all_contigs = 0
    self.total_coverage_values_for_all_contigs = 0
    self.description_file_path = A('description')

    # make sure early on that both the distance and linkage is OK.
    clustering.is_distance_and_linkage_compatible(self.distance, self.linkage)

    # whether the profile database is a blank (without any BAM files or reads):
    self.blank = A('blank_profile')

    if not self.blank and self.contigs_shall_be_clustered and self.skip_hierarchical_clustering:
        raise ConfigError("You are confused, and confusing anvi'o, too. You can't as hierarchical clustering "
                          "to be performed with one flag, and try to skip it with another one :(")

    if self.blank and self.contigs_shall_be_clustered and self.skip_hierarchical_clustering:
        raise ConfigError("So you want to generate a blank profile, and you both want hierarchical clustering "
                          "of your contigs to be performed, and skipped. No.")

    if self.blank and self.contigs_shall_be_clustered:
        raise ConfigError("When the blank profile is asked to be generated, there is no need to ask for the "
                          "hierarchical clustering of contigs. It is going to be done by default. If it is "
                          "not changing anything, why is anvi'o upset with you? Because. Let's don't use flags "
                          "we don't need.")

    if self.max_coverage_depth >= auxiliarydataops.COVERAGE_MAX_VALUE:
        raise ConfigError("The value %s for the maximum coverage depth is not going to work :/ While the maximum "
                          "depth of coverage for anvi'o to care about is a soft cut-off (hence you have some level "
                          "of freedom through the parameter `--max-coverage-depth`), there are database limitations "
                          "anvi'o must consider and can not change. The maximum value allowed in the database for "
                          "coverage information is 65536. Hence, you should set your depth of coverage to something "
                          "that is less than this value. In addition, it is also recommended to leave a little gap "
                          "and don't go beyond 90%% of this hard limit (that's why anvi'o will keep telling you, "
                          "\"%s is nice, but %s is the best I can do\" when you try to exceed that)."
                          % (pp(self.max_coverage_depth),
                             pp(self.max_coverage_depth),
                             pp(auxiliarydataops.COVERAGE_MAX_VALUE)))

    # a blank profile gets clustered unless the user explicitly asked to skip it
    if self.blank and not self.skip_hierarchical_clustering:
        self.contigs_shall_be_clustered = True

    if A('contigs_of_interest'):
        filesnpaths.is_file_exists(args.contigs_of_interest)
        # one contig name per line; blank lines and lines starting with '#' are ignored.
        # the `with` block makes sure the file handle is closed once the names are read.
        with open(args.contigs_of_interest) as contigs_of_interest_file:
            self.contig_names_of_interest = set(c.strip() for c in contigs_of_interest_file
                                                if c.strip() and not c.startswith('#'))
    else:
        self.contig_names_of_interest = None

    if self.list_contigs_and_exit:
        self.list_contigs()
        sys.exit()

    if not self.contigs_db_path:
        raise ConfigError("No contigs database, no profilin'. Bye.")

    # Initialize contigs db
    dbops.ContigsSuperclass.__init__(self, self.args, r=self.run, p=self.progress)
    self.init_contig_sequences()
    self.contig_names_in_contigs_db = set(self.contigs_basic_info.keys())

    self.bam = None
    self.contigs = []

    self.database_paths = {'CONTIGS.db': os.path.abspath(self.contigs_db_path)}

    self.profile_db_path = None

    self.clustering_configs = constants.clustering_configs['blank' if self.blank else 'single']

    # following variable will be populated during the profiling, and its content will eventually
    # be stored in t.variable_nts_table_name
    self.variable_nts_table_entries = []

    # if genes are not called, yet the user is asking for codon frequencies to be profiled, we give
    # a warning and force-turn that flag off.
    if (not self.a_meta['genes_are_called']) and self.profile_SCVs:
        self.run.warning("You asked the codon frequencies to be profiled, but genes were not called "
                         "for your contigs database. Anvi'o is assigning `False` to the profile-codon-frequncies "
                         "flag, overruling your request like a boss.")
        self.profile_SCVs = False

    # following variable will be populated while the variable positions table is computed
    self.codons_in_genes_to_profile_SCVs = set([])

    # we don't know what we are about
    self.description = None

    # additional layer data will be filled later
    self.layer_additional_keys = []
    self.layer_additional_data = {}
def __init__(self, args=None, run=terminal.Run(), progress=terminal.Progress()):
    """Set up the pangenomics workflow: rules, general parameters, directories,
    default config values, and the parameters each rule accepts.

    `args` is handed to `self.init_workflow_super_class` with the workflow name
    'pangenomics'; `PhylogenomicsWorkflow.__init__` is called afterwards to
    initialize the base class.
    """

    self.init_workflow_super_class(args, workflow_name='pangenomics')

    # TODO: Once we update all other workflows then this will be initiated in WorkflowSuperClass
    self.target_files = []

    self.pan_project_name = None
    self.valid_sequence_sources_for_phylogeny = ['gene_clusters', 'hmm']
    self.sequence_source_for_phylogeny = None
    self.tree_name = None
    self.phylogeny_imported_flag = None

    # initialize the base class
    PhylogenomicsWorkflow.__init__(self)

    pangenomics_rules = ['anvi_gen_genomes_storage',
                         'anvi_pan_genome',
                         'anvi_get_sequences_for_gene_clusters',
                         'import_phylogenetic_tree_to_pangenome']
    self.rules.extend(pangenomics_rules)

    pangenomics_general_params = ["project_name",
                                  "fasta_txt",
                                  "internal_genomes",
                                  "external_genomes",
                                  "sequence_source_for_phylogeny"]
    self.general_params.extend(pangenomics_general_params)

    self.dirs_dict.update(FASTA_DIR="01_FASTA",
                          CONTIGS_DIR="02_CONTIGS",
                          PAN_DIR="03_PAN")

    self.default_config.update(fasta_txt="fasta.txt",
                               anvi_pan_genome={"threads": 7},
                               import_phylogenetic_tree_to_pangenome={'tree_name': 'phylogeny'})

    # parameters accepted per rule, registered below in this order
    acceptable_params_per_rule = (
        ('anvi_pan_genome',
         ["--project-name", "--genome-names", "--skip-alignments",
          "--align-with", "--exclude-partial-gene-calls", "--use-ncbi-blast",
          "--minbit", "--mcl-inflation", "--min-occurrence",
          "--min-percent-identity", "--sensitive", "--description",
          "--overwrite-output-destinations", "--skip-hierarchical-clustering",
          "--enforce-hierarchical-clustering", "--distance", "--linkage"]),
        ('anvi_gen_genomes_storage',
         ["--gene-caller"]),
        ('anvi_get_sequences_for_gene_clusters',
         ["--gene-cluster-id", "--gene-cluster-ids-file",
          "--collection-name", "--bin-id",
          "--min-num-genomes-gene-cluster-occurs",
          "--max-num-genomes-gene-cluster-occurs",
          "--min-num-genes-from-each-genome",
          "--max-num-genes-from-each-genome",
          "--max-num-gene-clusters-missing-from-genome",
          "--min-functional-homogeneity-index",
          "--max-functional-homogeneity-index",
          "--min-geometric-homogeneity-index",
          "--max-geometric-homogeneity-index",
          "--add-into-items-additional-data-table",
          "--concatenate-gene-clusters", "--separator", "--align-with"]),
        ('import_phylogenetic_tree_to_pangenome',
         ['--just-do-it', 'tree_name']),
    )

    for rule_name, acceptable_params in acceptable_params_per_rule:
        self.rule_acceptable_params_dict[rule_name] = acceptable_params
def __init__(self, args):
    """Read profiling options from `args` and initialize the contigs database.

    Parameters
    ==========
    args : namespace-like object
        Options are looked up in `args.__dict__`; an absent option resolves to None.
        `num_threads`, `queue_size` and `write_buffer_size` are passed through `int()`.

    Raises
    ======
    ConfigError
        If no contigs database path is provided.

    Notes
    =====
    If `list_contigs` is set, `self.list_contigs()` is called and the process exits via
    `sys.exit()` before the contigs database is initialized.
    """

    self.args = args

    def A(x):
        # value of option `x`, or None if `args` does not carry it
        return args.__dict__.get(x)

    self.input_file_path = A('input_file')
    self.contigs_db_path = A('contigs_db')
    self.serialized_profile_path = A('serialized_profile')
    self.output_directory = A('output_dir')
    self.list_contigs_and_exit = A('list_contigs')
    self.min_contig_length = A('min_contig_length')
    self.min_mean_coverage = A('min_mean_coverage')
    self.min_coverage_for_variability = A('min_coverage_for_variability')
    self.contigs_shall_be_clustered = A('cluster_contigs')
    self.sample_id = A('sample_name')
    self.report_variability_full = A('report_variability_full')
    self.overwrite_output_destinations = A('overwrite_output_destinations')
    self.skip_SNV_profiling = A('skip_SNV_profiling')
    self.profile_AA_frequencies = A('profile_AA_frequencies')
    self.gen_serialized_profile = A('gen_serialized_profile')
    self.distance = A('distance') or constants.distance_metric_default
    self.linkage = A('linkage') or constants.linkage_method_default
    self.num_threads = int(A('num_threads'))
    self.queue_size = int(A('queue_size'))
    self.write_buffer_size = int(A('write_buffer_size'))
    self.total_length_of_all_contigs = 0
    self.total_coverage_values_for_all_contigs = 0
    self.description_file_path = A('description')

    # make sure early on that both the distance and linkage is OK.
    clustering.is_distance_and_linkage_compatible(self.distance, self.linkage)

    # whether the profile database is a blank (without any BAM files or reads):
    self.blank = A('blank_profile')

    # blank profiles are always clustered
    if self.blank:
        self.contigs_shall_be_clustered = True

    # looked up through `A` like every other option, so an `args` object that lacks
    # `contigs_of_interest` is treated as "not provided" instead of raising AttributeError
    contigs_of_interest_path = A('contigs_of_interest')
    if contigs_of_interest_path:
        filesnpaths.is_file_exists(contigs_of_interest_path)
        # one contig name per line; blank lines and lines starting with '#' are ignored.
        # the `with` block makes sure the file handle is closed once the names are read.
        with open(contigs_of_interest_path) as contigs_of_interest_file:
            self.contig_names_of_interest = set(c.strip() for c in contigs_of_interest_file
                                                if c.strip() and not c.startswith('#'))
    else:
        self.contig_names_of_interest = None

    self.progress = terminal.Progress()
    self.run = terminal.Run(width=35)

    if self.list_contigs_and_exit:
        self.list_contigs()
        sys.exit()

    if not self.contigs_db_path:
        raise ConfigError("No contigs database, no profilin'. Bye.")

    # Initialize contigs db
    dbops.ContigsSuperclass.__init__(self, self.args, r=self.run, p=self.progress)
    self.init_contig_sequences()
    self.contig_names_in_contigs_db = set(self.contigs_basic_info.keys())

    self.bam = None
    self.contigs = []

    self.database_paths = {'CONTIGS.db': self.contigs_db_path}

    self.profile_db_path = None

    self.clustering_configs = constants.clustering_configs['blank' if self.blank else 'single']

    # following variable will be populated during the profiling, and its content will eventually
    # be stored in t.variable_nts_table_name
    self.variable_nts_table_entries = []

    # following variable will be populated while the variable positions table is computed
    self.codons_in_genes_to_profile_AA_frequencies = set([])

    # we don't know what we are about
    self.description = None
def __init__(self, args=None):
    """Set profiling defaults, override them from `args` when given, and initialize
    the contigs database.

    Parameters
    ==========
    args : namespace-like object, optional
        When provided, its attributes replace the defaults set at the top of this
        function. When omitted, only the defaults are used.

    Raises
    ======
    ConfigError
        If `args.contigs_of_interest` points to a file that does not exist.

    Notes
    =====
    If `list_contigs` is set, `self.list_contigs()` is called and the process exits via
    `sys.exit()` before the contigs database is initialized.
    """

    self.args = None
    self.input_file_path = None
    self.contigs_db_path = None
    self.serialized_profile_path = None
    self.output_directory = None
    self.list_contigs_and_exit = None
    self.min_contig_length = 10000
    self.min_mean_coverage = 0
    self.min_coverage_for_variability = 10  # if a nucleotide position is covered less than this, don't bother
    self.contig_names_of_interest = None
    self.contigs_shall_be_clustered = False
    self.report_variability_full = False  # don't apply any noise filtering, and simply report ALL base frequencies
    self.overwrite_output_destinations = False
    self.skip_SNV_profiling = False

    if args:
        self.args = args
        self.input_file_path = args.input_file
        self.contigs_db_path = args.contigs_db
        self.serialized_profile_path = args.serialized_profile
        self.output_directory = args.output_dir
        self.list_contigs_and_exit = args.list_contigs
        self.min_contig_length = args.min_contig_length
        self.min_mean_coverage = args.min_mean_coverage
        self.min_coverage_for_variability = args.min_coverage_for_variability
        self.contigs_shall_be_clustered = args.cluster_contigs
        self.number_of_threads = 4
        self.no_trehading = True
        self.sample_id = args.sample_name
        self.report_variability_full = args.report_variability_full
        self.overwrite_output_destinations = args.overwrite_output_destinations
        self.skip_SNV_profiling = args.skip_SNV_profiling

        if args.contigs_of_interest:
            if not os.path.exists(args.contigs_of_interest):
                # call form of `raise`: valid in both Python 2 and Python 3, unlike
                # the `raise ConfigError, "..."` statement form
                raise ConfigError("Contigs file (%s) is missing..." % (args.contigs_of_interest))

            # one contig name per line; blank lines and lines starting with '#' are ignored.
            # the `with` block makes sure the file handle is closed once the names are read.
            with open(args.contigs_of_interest) as contigs_of_interest_file:
                self.contig_names_of_interest = set([c.strip() for c in contigs_of_interest_file
                                                     if c.strip() and not c.startswith('#')])

    self.progress = terminal.Progress()
    self.run = terminal.Run(width=35)

    if self.list_contigs_and_exit:
        self.list_contigs()
        sys.exit()

    # Initialize contigs db
    dbops.ContigsSuperclass.__init__(self, self.args, r=self.run, p=self.progress)
    self.init_contig_sequences()
    self.contig_names_in_contigs_db = set(self.contigs_basic_info.keys())

    self.bam = None
    self.contigs = {}

    self.database_paths = {'CONTIGS.db': self.contigs_db_path}

    self.profile_db_path = None

    self.clustering_configs = constants.clustering_configs['single']

    self.atomic_contig_split_data = contigops.AtomicContigSplitData(self.progress)

    # following variable will be populated during the profiling, and its content will eventually
    # be stored in t.variable_nts_table_name
    self.variable_nts_table_entries = []
def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
    """Set up the metagenomics workflow: rules, general parameters, the parameters
    each rule accepts or forbids, output directories, and default config values.

    `ContigsDBWorkflow.__init__` is called to initialize the base class before any
    of its containers (`rules`, `general_params`, ...) are extended.
    """

    self.args = args
    self.run = run
    self.progress = progress

    # know thyself.
    self.name = 'metagenomics'
    self.samples_information = {}
    self.kraken_annotation_dict = {}

    # initialize the base class
    ContigsDBWorkflow.__init__(self)

    metagenomics_rules = [
        'iu_gen_configs', 'iu_filter_quality_minoche', 'gen_qc_report', 'gzip_fastqs',
        'fq2fa', 'merge_fastas_for_co_assembly', 'megahit',
        'anvi_gen_contigs_database', 'anvi_export_gene_calls', 'centrifuge',
        'anvi_import_taxonomy', 'anvi_run_hmms', 'anvi_run_ncbi_cogs',
        'bowtie_build', 'bowtie', 'samtools_view', 'anvi_init_bam', 'idba_ud',
        'anvi_profile', 'annotate_contigs_database', 'anvi_merge', 'import_percent_of_reads_mapped',
        'krakenhll', 'krakenhll_mpa_report', 'import_kraken_hll_taxonomy',
    ]
    self.rules.extend(metagenomics_rules)

    metagenomics_general_params = ["samples_txt", "references_mode", "all_against_all", "kraken_txt"]
    self.general_params.extend(metagenomics_general_params)

    # defining the accessible params per rule
    acceptable = {
        'iu_gen_configs': ["--r1-prefix", "--r2-prefix"],
        'iu_filter_quality_minoche': [
            'run', '--visualize-quality-curves', '--ignore-deflines',
            '--limit-num-pairs', '--print-qual-scores', '--store-read-fate',
        ],
        'gzip_fastqs': ["run"],
        'megahit': [
            "run", "--min-contig-len", "--min-count", "--k-min", "--k-max",
            "--k-step", "--k-list", "--no-mercy", "--no-bubble", "--merge-level",
            "--prune-level", "--prune-depth", "--low-local-ratio", "--max-tip-len",
            "--no-local", "--kmin-1pass", "--presets", "--memory", "--mem-flag",
            "--use-gpu", "--gpu-mem", "--keep-tmp-files", "--tmp-dir",
            "--continue", "--verbose",
        ],
        'idba_ud': [
            "run", "--mink", "--maxk", "--step", "--inner_mink", "--inner_step",
            "--prefix", "--min_count", "--min_support", "--seed_kmer",
            "--min_contig", "--similar", "--max_mismatch", "--min_pairs",
            "--no_bubble", "--no_local", "--no_coverage", "--no_correct",
            "--pre_correction",
        ],
        'bowtie': ["additional_params"],
        'samtools_view': ["additional_params"],
        'anvi_profile': [
            "--overwrite-output-destinations", "--sample-name",
            "--report-variability-full", "--skip-SNV-profiling", "--profile-SCVs",
            "--description", "--skip-hierarchical-clustering", "--distance",
            "--linkage", "--min-contig-length", "--min-mean-coverage",
            "--min-coverage-for-variability", "--cluster-contigs",
            "--contigs-of-interest", "--queue-size", "--write-buffer-size",
        ],
        'annotate_contigs_database': [],
        'anvi_merge': [
            "--sample-name", "--description", "--skip-hierarchical-clustering",
            "--enforce-hierarchical-clustering", "--distance", "--linkage",
            "--skip-concoct-binning", "--overwrite-output-destinations",
        ],
        'import_percent_of_reads_mapped': ["run"],
        'krakenhll': ["additional_params", "run", "--db", "--gzip-compressed"],
        'krakenhll_mpa_report': ["additional_params"],
        'import_kraken_hll_taxonomy': ["--min-abundance"],
    }
    self.rule_acceptable_params_dict.update(acceptable)

    forbidden = {'krakenhll': ['--fastq-input', '--paired', '--output']}
    self.forbidden_params.update(forbidden)

    output_dirs = {
        "QC_DIR": "01_QC",
        "FASTA_DIR": "02_FASTA",
        "CONTIGS_DIR": "03_CONTIGS",
        "MAPPING_DIR": "04_MAPPING",
        "PROFILE_DIR": "05_ANVIO_PROFILE",
        "MERGE_DIR": "06_MERGED",
        "TAXONOMY_DIR": "07_TAXONOMY",
    }
    self.dirs_dict.update(output_dirs)

    defaults = {
        'samples_txt': "samples.txt",
        'megahit': {
            "--min-contig-len": min_contig_length_for_assembly,
            "--memory": 0.4,
            "threads": 11,
        },
        'idba_ud': {
            "--min_contig": min_contig_length_for_assembly,
            "threads": 11,
        },
        'iu_filter_quality_minoche': {
            "run": True,
            "--ignore-deflines": True,
            "threads": 2,
        },
        "gzip_fastqs": {"run": True},
        "bowtie_build": {"threads": 10},
        "bowtie": {"additional_params": "--no-unal", "threads": 10},
        "samtools_view": {"additional_params": "-F 4", "threads": 4},
        "anvi_init_bam": {"threads": 4},
        "anvi_profile": {
            "threads": 5,
            "--sample-name": "{sample}",
            "--overwrite-output-destinations": True,
        },
        "anvi_merge": {
            "--sample-name": "{group}",
            "--overwrite-output-destinations": True,
        },
        "import_percent_of_reads_mapped": {"run": True},
        "krakenhll": {
            "threads": 12,
            "--gzip-compressed": True,
            "additional_params": "--preload",
        },
    }
    self.default_config.update(defaults)
def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
    """Read InteracDome options from `args`, load the InteracDome table and the Pfam HMM
    profile, and initialize the contigs database.

    Parameters
    ==========
    args : namespace-like object
        Options are looked up in `args.__dict__`. This function also sets
        `args.hmmer_program` and `args.pfam_data_dir` on the object it is given before
        handing it to `Pfam.__init__`.
    run : terminal.Run
        Run object used for messages.
    progress : terminal.Progress
        Progress object.

    Notes
    =====
    Numeric cutoffs fall back to their defaults only when the option is missing or None,
    so an explicit 0 is honored rather than being replaced by the default.
    """

    self.run = run
    self.progress = progress

    self.run.warning("Anvi'o will use 'InteracDome' by Kobren and Singh (DOI: 10.1093/nar/gky1224) to attribute binding frequencies. "
                     "If you publish your findings, please do not forget to properly credit their work.",
                     lc='green', header="CITATION")

    def A(x, default=None):
        # value of option `x`; `default` when it is absent from `args` or set to None
        value = args.__dict__.get(x)
        return default if value is None else value

    self.interacdome_data_dir = A('interacdome_data_dir') or constants.default_interacdome_data_path
    self.information_content_cutoff = A('information_content_cutoff', 4)
    self.min_binding_frequency = A('min_binding_frequency', 0)
    self.min_hit_fraction = A('min_hit_fraction', 0.8)
    self.interacdome_dataset = A('interacdome_dataset') or 'representable'
    self.output_prefix = A('output_file_prefix')
    self.just_do_it = A('just_do_it')

    self.run.warning("", header='INITIALIZATION', lc='green')
    self.run.info("Interacdome dataset used", self.interacdome_dataset)
    self.run.info("Minimum hit fraction", self.min_hit_fraction)

    self.hmm_filepath = os.path.join(self.interacdome_data_dir, 'Pfam-A.hmm')

    # Init the InteracDome table
    self.interacdome_table = InteracDomeTableData(kind=self.interacdome_dataset,
                                                  interacdome_data_dir=self.interacdome_data_dir)
    self.interacdome_table.load()

    # Init the Pfam baseclass
    args.hmmer_program = 'hmmsearch'  # Force use of hmmsearch
    args.pfam_data_dir = self.interacdome_data_dir
    Pfam.__init__(self, args, run=self.run, progress=self.progress)

    # Init contigs database (a separate namespace, so the `args` parameter is not rebound)
    contigs_args = argparse.Namespace(contigs_db=self.contigs_db_path)
    self.contigs_db = dbops.ContigsSuperclass(contigs_args)

    self.potentially_remove_previous_interacdome_data()

    # Init the HMM profile
    self.hmms = pfam.HMMProfile(self.hmm_filepath)

    # This dictionary is populated and cast as a dataframe. It contains all of the per-residue
    # binding frequency information for each hit
    self.bind_freq = {}

    # This dictionary (eventual dataframe) is just like self.bind_freq, except has averaged
    # binding frequencies for residue-ligand combos that have multiple contributing hits. It
    # also drops all contributing match state information
    self.avg_bind_freq = {}

    # This is a modified version of self.avg_bind_freq that is compatible with the
    # amino_acid_additional_data table structure, i.e.
    # tables.amino_acid_additional_data_table_structure
    self.amino_acid_additional_data = {}
def __init__(self, args, hmm_sources, run=terminal.Run(), progress=terminal.Progress()):
    """Collect HMM hits from every described genome into this single object.

    After initializing both base classes and loading the genome descriptions, each
    genome's contigs database is read through its own `SequencesForHMMHits` instance.
    Its hits, splits, sequences and gene entries are then copied into `self` under
    keys prefixed with that genome's `contigs_db_hash`.

    Raises
    ======
    ConfigError
        If no HMM source is common to all genomes, or if a genome description has a
        `collection_id` but lacks `bin_id` or `profile_db_path`.
    """

    self.args = args
    self.run = run
    self.progress = progress
    self.hmm_sources = hmm_sources

    self.splits_dict = {}

    # initialize the super
    SequencesForHMMHits.__init__(self, None, sources=hmm_sources, run=self.run, progress=self.progress)

    # process genome descriptions
    GenomeDescriptions.__init__(self, args, run=self.run, progress=self.progress)
    self.load_genomes_descriptions(skip_functions=True, init=False)

    hmm_sources_in_all_genomes = self.get_HMM_sources_common_to_all_genomes()

    if len(hmm_sources_in_all_genomes) == 0:
        raise ConfigError("There are no HMM sources among your external genomes that occur in every genome :/")

    # genomes that come with a profile database are the internal ones
    internal_genome_hashes = {genome['genome_hash'] for genome in self.genomes.values()
                              if 'profile_db_path' in genome}
    num_internal_genomes = len(internal_genome_hashes)
    collection_names = {genome['collection_id'] for genome in self.genomes.values()
                        if 'collection_id' in genome}

    if num_internal_genomes:
        self.run.warning("SequencesForHMMHitsWrapperForMultipleContigs class is speaking (yes, the class is "
                         "quite aware of its very long name thankyouverymuch). Of the total %d genome descriptions "
                         "it was given, %d seem to represent internal genomes with bins in collection(s) '%s'. Anvi'o "
                         "will make sure HMM hits to be used for downstream analyses are only those that match to contigs "
                         "that were included in those selections." % (len(self.genomes),
                                                                      num_internal_genomes,
                                                                      ', '.join(collection_names)),
                         lc="green")

    # very hacky code follows. here we generate a self SequencesForHMMHits object,
    # and we will fill everything in it with slightly modified information so multiple
    # contigs databases could be processed by this talented class seamlessly.
    next_split_entry_id = 0
    for genome_name, genome in self.genomes.items():
        contigs_db_path = genome['contigs_db_path']
        contigs_db_hash = genome['contigs_db_hash']

        # `genome_hash` is an important variable and allows us to track origins of HMM hits
        # for bins and individual contigs databases seamlessly. if you want to understand
        # truly what the hell does this mean, look at `get_genome_hash_for_external_genome`
        # and `get_genome_hash_for_internal_genome` functions in `genomedescriptions.py`.
        if 'collection_id' not in genome:
            # no collection: every hit in the contigs database is relevant
            hits_in_genome = SequencesForHMMHits(contigs_db_path, sources=hmm_sources)
            genome_hash = contigs_db_hash
        else:
            # the genome description refers to a collection, so we need to focus only on
            # hmm hits that are relevant to splits in this collection
            if ('bin_id' not in genome) or ('profile_db_path' not in genome):
                raise ConfigError("There is something VERY weird going on. Your genome descriptions object contains "
                                  "a collection name, yet it doesn't know anything about a bin name or profile database "
                                  "path. While this is very interesting because it should never happen, anvi'o will say "
                                  "goodbye and abruptly quit in confusion :(")

            # setup an args object, and recover the split names of interest
            bin_args = argparse.Namespace(profile_db=genome['profile_db_path'],
                                          contigs_db=genome['contigs_db_path'],
                                          bin_id=genome['bin_id'],
                                          collection_name=genome['collection_id'])
            split_names_of_interest = ccollections.GetSplitNamesInBins(bin_args).get_split_names_only()

            text_to_hash = '_'.join([''.join(split_names_of_interest), contigs_db_hash])
            genome_hash = hashlib.sha224(text_to_hash.encode('utf-8')).hexdigest()[0:12]

            # hmm hits now will match to the collection
            hits_in_genome = SequencesForHMMHits(contigs_db_path,
                                                 sources=hmm_sources,
                                                 split_names_of_interest=split_names_of_interest)

        for hmm_hit_id in hits_in_genome.hmm_hits:
            hmm_hit = hits_in_genome.hmm_hits[hmm_hit_id]
            hmm_hit['gene_callers_id'] = '%s_%d' % (contigs_db_hash, hmm_hit['gene_callers_id'])
            hmm_hit['genome_hash'] = genome_hash
            self.hmm_hits['%s_%d' % (contigs_db_hash, hmm_hit_id)] = hmm_hit

        # hits info is only filled while it is still empty
        if not self.hmm_hits_info:
            for hmm_source in hmm_sources_in_all_genomes:
                self.hmm_hits_info[hmm_source] = hits_in_genome.hmm_hits_info[hmm_source]

        for split_hit in hits_in_genome.hmm_hits_splits.values():
            split_hit['split'] = '%s_%s' % (contigs_db_hash, split_hit['split'])
            split_hit['hmm_hit_entry_id'] = '%s_%d' % (contigs_db_hash, split_hit['hmm_hit_entry_id'])
            self.hmm_hits_splits[next_split_entry_id] = split_hit
            next_split_entry_id += 1

        for contig_name in hits_in_genome.contig_sequences:
            prefixed_contig_name = '%s_%s' % (contigs_db_hash, contig_name)
            self.contig_sequences[prefixed_contig_name] = hits_in_genome.contig_sequences[contig_name]

        for aa_sequence_id in hits_in_genome.aa_sequences:
            prefixed_aa_sequence_id = '%s_%s' % (contigs_db_hash, aa_sequence_id)
            self.aa_sequences[prefixed_aa_sequence_id] = hits_in_genome.aa_sequences[aa_sequence_id]

        for gene_callers_id in hits_in_genome.genes_in_contigs:
            gene_entry = hits_in_genome.genes_in_contigs[gene_callers_id]
            gene_entry['contig'] = '%s_%s' % (contigs_db_hash, gene_entry['contig'])
            self.genes_in_contigs['%s_%d' % (contigs_db_hash, gene_callers_id)] = gene_entry

        prefixed_split_names = []
        for split_name in hits_in_genome.splits_in_contigs:
            prefixed_split_names.append('%s_%s' % (contigs_db_hash, split_name))
        self.splits_dict[genome_name] = prefixed_split_names
def __init__(self, program_name='fastANI', args=None, run=terminal.Run(), progress=terminal.Progress()):
    """Initialize through `FastANIDriver.__init__` with 'fastANI' as the default program name.

    Parameters
    ==========
    program_name : str
        Forwarded to `FastANIDriver.__init__`.
    args : dict-like, optional
        Forwarded to `FastANIDriver.__init__`. When omitted, a new empty dict is used.
    run : terminal.Run
        Forwarded to `FastANIDriver.__init__`.
    progress : terminal.Progress
        Forwarded to `FastANIDriver.__init__`.
    """

    # a `{}` default in the signature would be one dict object shared by every call that
    # omits `args`; creating it here gives each call its own
    if args is None:
        args = {}

    FastANIDriver.__init__(self, program_name, args, run, progress)
def __init__(self, args=None, run=terminal.Run(), progress=terminal.Progress()):
    """Configure the `trnaseq` workflow.

    Registers the rules of the workflow, its general parameters, the parameters
    each rule accepts in the config file, the default config values, and the
    names of the output directories.
    """
    self.init_workflow_super_class(args, workflow_name='trnaseq')

    workflow_rules = [
        'make_iu_input',
        'iu_gen_configs',
        'iu_merge_pairs',
        'gen_qc_report',
        'anvi_reformat_fasta',
        'anvi_trnaseq',
        'anvi_convert_trnaseq_database',
        'anvi_run_trna_taxonomy',
    ]
    self.rules.extend(workflow_rules)

    # "General" section of the workflow config file.
    self.general_params.extend(['samples_txt'])

    # Parameters for each rule that are accessible in the config file.
    accessible_params = {
        'iu_merge_pairs': [
            'run',
            '--gzip-output',
            '--marker-gene-stringent',
            '--max-num-mismatches',
            '--report-r1-prefix',
            '--report-r2-prefix',
        ],
        'anvi_reformat_fasta': [
            'run',
            '--gzip-output',
            '--simplify-names',
        ],
        'anvi_trnaseq': [
            'run',
            '--treatment',
            '--overwrite-output-destinations',
            '--description',
            '--write-checkpoints',
            '--load-checkpoint',
            '--feature-param-file',
            '--min-length-long-fiveprime',
            '--min-trna-fragment-size',
            '--agglomeration-max-mismatch-freq',
            '--fiveprimemost-deletion-start',
            '--threeprimemost-deletion-start',
            '--fiveprimemost-deletion-stop',
            '--threeprimemost-deletion-stop',
            '--max-distinct-deletions',
            '--skip-fasta-check',
            '--write-buffer-size',
            '--alignment-target-chunk-size',
            '--fragment-mapping-query-chunk-length',
            '--alignment-progress-interval',
            '--agglomeration-progress-interval',
        ],
        'anvi_convert_trnaseq_database': [
            'run',
            '--project-name',
            '--max-reported-trna-seeds',
            '--overwrite-output-destinations',
            '--description',
            '--feature-threshold',
            '--preferred-treatment',
            '--nonspecific-output',
            '--min-variation',
            '--min-third-fourth-nt',
            '--distance',
            '--linkage',
        ],
        'anvi_run_trna_taxonomy': [
            'run',
            '--trna-taxonomy-data-dir',
            '--min-percent-identity',
            '--max-num-target-sequences',
            '--num-parallel-processes',
            '--write-buffer-size',
        ],
    }
    self.rule_acceptable_params_dict.update(accessible_params)

    def anvio_default(param):
        """Return the default value stored for `param` in `anvio.D`."""
        return anvio.D[param][1]['default']

    # Default values for accessible parameters: every default is written to the config
    # file so the user can see all of them in one place.
    #
    # The workflow superclass already adds a threads argument of "" to each rule; spelling
    # it out here makes it explicit that the default is 1 and that no quotes are needed.
    # The superclass also appends mandatory arguments at the end of each rule's list, but
    # they are listed here so they show up in the order of each script's help display.
    iu_merge_pairs_defaults = {
        'run': True,
        '--gzip-output': False,
        '--marker-gene-stringent': True,
        '--max-num-mismatches': 0,
        '--report-r1-prefix': False,
        '--report-r2-prefix': False,
        'threads': 1,
    }

    reformat_fasta_defaults = {
        'run': True,
        '--gzip-output': False,  # not an argument of anvi-script-reformat-fasta
        '--simplify-names': True,  # not the default in anvi-script-reformat-fasta
        'threads': 1,
    }

    trnaseq_defaults = {
        'run': True,
        '--treatment': "",  # if provided in the config file, the treatment is assumed to be for all samples
        '--overwrite-output-destinations': anvio_default('overwrite-output-destinations'),
        '--description': "",
        '--write-checkpoints': anvio_default('write-checkpoints'),
        '--load-checkpoint': "",
        '--feature-param-file': "",
        '--min-length-long-fiveprime': anvio_default('min-length-long-fiveprime'),
        '--min-trna-fragment-size': anvio_default('min-trna-fragment-size'),
        '--agglomeration-max-mismatch-freq': anvio_default('agglomeration-max-mismatch-freq'),
        '--fiveprimemost-deletion-start': anvio_default('fiveprimemost-deletion-start'),
        '--threeprimemost-deletion-start': anvio_default('threeprimemost-deletion-start'),
        '--fiveprimemost-deletion-stop': anvio_default('fiveprimemost-deletion-stop'),
        '--threeprimemost-deletion-stop': anvio_default('threeprimemost-deletion-stop'),
        '--max-distinct-deletions': anvio_default('max-distinct-deletions'),
        '--skip-fasta-check': True,  # not the default in anvi-trnaseq
        '--write-buffer-size': 100000,  # the default set in anvi-trnaseq (not the anvi'o-wide default)
        '--alignment-target-chunk-size': anvio_default('alignment-target-chunk-size'),
        '--fragment-mapping-query-chunk-length': anvio_default('fragment-mapping-query-chunk-length'),
        '--alignment-progress-interval': anvio_default('alignment-progress-interval'),
        '--agglomeration-progress-interval': anvio_default('agglomeration-progress-interval'),
        'threads': 1,
    }

    convert_database_defaults = {
        'run': True,
        '--project-name': "",
        '--max-reported-trna-seeds': anvio_default('max-reported-trna-seeds'),
        '--overwrite-output-destinations': anvio_default('overwrite-output-destinations'),
        '--description': "",
        '--feature-threshold': anvio_default('feature-threshold'),
        '--preferred-treatment': "",
        '--nonspecific-output': anvio_default('nonspecific-output'),
        '--min-variation': anvio_default('min-variation'),
        '--min-third-fourth-nt': anvio_default('min-third-fourth-nt'),
        '--distance': anvio_default('distance'),
        '--linkage': anvio_default('linkage'),
        'threads': 1,
    }

    trna_taxonomy_defaults = {
        'run': True,
        '--trna-taxonomy-data-dir': "",
        '--min-percent-identity': anvio_default('min-percent-identity'),
        '--max-num-target-sequences': anvio_default('max-num-target-sequences'),
        '--num-parallel-processes': anvio_default('num-parallel-processes'),
        '--write-buffer-size': anvio_default('write-buffer-size'),
        'threads': 1,
    }

    self.default_config.update({
        'samples_txt': 'samples.txt',
        'iu_merge_pairs': iu_merge_pairs_defaults,
        'anvi_reformat_fasta': reformat_fasta_defaults,
        'anvi_trnaseq': trnaseq_defaults,
        'anvi_convert_trnaseq_database': convert_database_defaults,
        'anvi_run_trna_taxonomy': trna_taxonomy_defaults,
        'make_iu_input': {'threads': 1},
        'iu_gen_configs': {'threads': 1},
        'gen_qc_report': {'threads': 1},
        'output_dirs': {},  # This ensures that output_dirs comes before max_threads in the file
        'max_threads': 1,
    })

    output_directories = {
        'QC_DIR': '01_QC',
        'IDENT_DIR': '02_IDENT',
        'CONVERT_DIR': '03_CONVERT',
    }
    self.dirs_dict.update(output_directories)
from anvio.drivers.blast import BLAST from anvio.dbops import ContigsDatabase from anvio.taxonomyops import AccessionIdToTaxonomy from anvio.taxonomyops import TaxonomyEstimatorSingle from anvio.taxonomyops import PopulateContigsDatabaseWithTaxonomy __author__ = "Developers of anvi'o (see AUTHORS.txt)" __copyright__ = "Copyleft 2015-2018, the Meren Lab (http://merenlab.org/)" __license__ = "GPL 3.0" __version__ = anvio.__version__ __maintainer__ = "A. Murat Eren" __email__ = "*****@*****.**" run_quiet = terminal.Run(log_file_path=None, verbose=False) progress_quiet = terminal.Progress(verbose=False) pp = terminal.pretty_print HASH = lambda d: str( hashlib.sha224(''.join([ str(d[level]) for level in constants.levels_of_taxonomy ]).encode('utf-8')).hexdigest()[0:8]) class TRNATaxonomyContext(AccessionIdToTaxonomy): """The purpose of this base class is ot define file paths and constants for trna taxonomy ops.""" def __init__(self, trna_taxonomy_data_dir=None, scgs_taxonomy_remote_database_url=None, run=terminal.Run(), progress=terminal.Progress()):
def __init__(self, args=None, run=terminal.Run(), progress=terminal.Progress()):
    """Configure the `contigs` workflow.

    Sets the bookkeeping attributes to their empty state, then registers the rules,
    the general parameters, the output directories, the default config values, and
    the parameters each rule accepts in the config file.
    """
    self.init_workflow_super_class(args, workflow_name='contigs')

    self.group_names = []
    self.contigs_information = {}
    self.fasta_txt_file = None
    self.fasta_information = {}

    # external_genomes_file lives here for the sake of the pangenomics and
    # phylogenomics workflows
    self.external_genomes_file = ''

    # references_mode lives here for the sake of the metagenomics workflow (it is
    # only used when this workflow is inherited)
    self.references_mode = None

    self.import_external_functions_flags = []

    contigs_rules = [
        'gen_external_genome_file',
        'anvi_script_reformat_fasta',
        'anvi_gen_contigs_database',
        'export_gene_calls_for_centrifuge',
        'centrifuge',
        'anvi_import_taxonomy_for_genes',
        'anvi_run_scg_taxonomy',
        'anvi_run_trna_scan',
        'anvi_run_hmms',
        'anvi_run_ncbi_cogs',
        'annotate_contigs_database',
        'anvi_get_sequences_for_gene_calls',
        'emapper',
        'anvi_script_run_eggnog_mapper',
        'gunzip_fasta',
        'reformat_external_gene_calls_table',
        'reformat_external_functions',
        'import_external_functions',
        'anvi_run_pfams',
    ]
    self.rules.extend(contigs_rules)

    self.general_params.extend(["fasta_txt"])

    self.dirs_dict.update({"FASTA_DIR": "01_FASTA", "CONTIGS_DIR": "02_CONTIGS"})

    config_defaults = {
        "fasta_txt": "fasta.txt",
        "anvi_gen_contigs_database": {"--project-name": "{group}"},
        "centrifuge": {"threads": 2},
        "anvi_run_hmms": {"run": True, "threads": 5},
        "anvi_run_ncbi_cogs": {"run": True, "threads": 5},
        "anvi_run_scg_taxonomy": {"run": True, "threads": 6},
        'anvi_run_trna_scan': {"run": True, "threads": 6},
        "anvi_script_reformat_fasta": {
            "run": True,
            "--prefix": "{group}",
            "--simplify-names": True,
        },
        "emapper": {"--database": "bact", "--usemem": True, "--override": True},
        "anvi_script_run_eggnog_mapper": {"--use-version": "0.12.6"},
    }
    self.default_config.update(config_defaults)

    # the repeated entries in this list are kept exactly as they were
    gen_contigs_params = [
        '--description',
        '--skip-gene-calling',
        '--external-gene-calls',
        '--ignore-internal-stop-codons',
        '--skip-mindful-splitting',
        '--contigs-fasta',
        '--project-name',
        '--description',
        '--split-length',
        '--kmer-size',
        '--skip-mindful-splitting',
        '--skip-gene-calling',
        '--external-gene-calls',
        '--ignore-internal-stop-codons',
        '--skip-predict-frame',
        '--prodigal-translation-table',
    ]

    # parameters the user may set for each rule, assigned one rule at a time
    accepted_params_per_rule = {
        'anvi_run_ncbi_cogs': [
            'run', '--cog-data-dir', '--sensitive', '--temporary-dir-path', '--search-with',
        ],
        'anvi_run_scg_taxonomy': ['run', '--scgs-taxonomy-data-dir'],
        'anvi_run_trna_scan': ['run', '--trna-cutoff-score'],
        'anvi_run_hmms': [
            'run', '--installed-hmm-profile', '--hmm-profile-dir', '--also-scan-trnas',
        ],
        'anvi_run_pfams': ['run', '--pfam-data-dir'],
        'centrifuge': ['run', 'db'],
        'emapper': ['--database', '--usemem', '--override', 'path_to_emapper_dir'],
        'anvi_script_run_eggnog_mapper': [
            'run', '--cog-data-dir', '--drop-previous-annotations', '--use-version',
        ],
        'anvi_script_reformat_fasta': [
            'run', '--keep-ids', '--exclude-ids', '--min-len', "--prefix", "--simplify-names",
        ],
        'anvi_gen_contigs_database': gen_contigs_params,
    }
    for rule_name, accepted_params in accepted_params_per_rule.items():
        self.rule_acceptable_params_dict[rule_name] = accepted_params
def __init__(self, input_file_paths, taxonomy_table_structure, run=terminal.Run(), progress=terminal.Progress()):
    """Describe the two centrifuge output files and hand them to `Parser.__init__`.

    The expected files are a `report` and a `hits` table; for each of them the column
    names, the per-column converters and the indexing field are declared here.
    """
    self.run = run
    self.progress = progress

    self.min_hit_score = 250

    def leading_int(field):
        """Convert the part of `field` that precedes the first '|' to an int."""
        return int(field.split('|')[0])

    report_layout = {
        'col_names': ['t_species', 'taxon_id', 'f1', 'f2', 'f3', 'f4', 'f5'],
        'col_mapping': [str, int, str, str, str, str, str],
        'indexing_field': 1,
    }

    hits_layout = {
        'col_names': ['gene_callers_id', 'f1', 'taxon_id', 'score', 'f2', 'f3', 'f4', 'f5'],
        'col_mapping': [leading_int, str, int, int, str, str, str, str],
        'indexing_field': -1,
    }

    files_expected = {
        'report': 'centrifuge_report.tsv',
        'hits': 'centrifuge_hits.tsv',
    }
    files_structure = {
        'report': report_layout,
        'hits': hits_layout,
    }

    self.taxonomy_table_structure = taxonomy_table_structure

    Parser.__init__(self, 'centrifuge', input_file_paths, files_expected, files_structure)
def check_database(self):
    """Setup the database files

    Downloads the .pir file if it is missing
    Binarizes .pir file if .bin is missing
    Creates the .dmnd file if it is missing

    Sets `self.database_path` to the path of the binary (.bin) database.

    Raises ConfigError if `self.modeller_database` carries an extension other
    than .bin, .pir, or none at all.
    """
    extensionless, extension = os.path.splitext(self.modeller_database)
    if extension not in [".bin", ".pir", ""]:
        raise ConfigError(
            "MODELLER :: The only possible database extensions are .bin and .pir"
        )

    bin_db_path = J(self.database_dir, extensionless + ".bin")
    pir_db_path = J(self.database_dir, extensionless + ".pir")

    bin_exists = utils.filesnpaths.is_file_exists(bin_db_path, dont_raise=True)
    pir_exists = utils.filesnpaths.is_file_exists(pir_db_path, dont_raise=True)

    self.database_path = bin_db_path

    # nothing to do when both files are already in place
    if not (bin_exists and pir_exists):
        if not pir_exists:
            # Download .pir
            self.run.warning(
                "Anvi'o looked in {} for a database with the name {} and with an extension "
                "of either .bin or .pir, but didn't find anything matching that "
                "criteria. Anvi'o will try and download the best database it knows of from "
                "https://salilab.org/modeller/downloads/pdb_95.pir.gz and use that. "
                "You can checkout https://salilab.org/modeller/ for more info about the pdb_95 "
                "database".format(self.database_dir, self.modeller_database))

            db_download_path = os.path.join(self.database_dir, "pdb_95.pir.gz")
            utils.download_file(
                "https://salilab.org/modeller/downloads/pdb_95.pir.gz",
                db_download_path)
            utils.run_command(
                ['gzip', '-d', db_download_path],
                log_file_path=filesnpaths.get_temp_file_path())

        # Binarize .pir (make .bin)
        self.run.warning(
            "Your database is not in binary format. That means accessing its contents is slower "
            "than it could be. Anvi'o is going to make a binary format. Just FYI"
        )
        self.run_binarize_database(pir_db_path, bin_db_path)

    dmnd_db_path = J(self.database_dir, 'pdb_95.dmnd')

    if os.path.exists(dmnd_db_path):
        return

    self.run.warning(
        "Your diamond database does not exist. It will be created.")

    script_name = "pir_to_fasta.py"

    self.copy_script_to_directory(script_name)

    input_pir_path = J(self.database_dir, 'pdb_95.pir')
    fasta_path = J(self.database_dir, 'pdb_95.fa')
    dmnd_path = J(self.database_dir, 'pdb_95')

    command = [self.executable, script_name, input_pir_path, fasta_path]

    self.run_command(command, script_name=script_name, rename_log=False)

    # Rewrite the FASTA into a temporary file, dropping '-' and turning '.' into 'X'
    temp = u.FastaOutput(filesnpaths.get_temp_file_path())
    fasta = u.SequenceSource(fasta_path)

    while next(fasta):
        temp.write_id(fasta.id)
        temp.write_seq(fasta.seq.replace('-', '').replace('.', 'X'))

    # Close both handles BEFORE moving the temporary file over the FASTA. When source
    # and destination are on different filesystems, shutil.move copies the file, and
    # anything still sitting in an unflushed write buffer would not make it into the copy.
    fasta.close()
    temp.close()

    shutil.move(temp.output_file_path, fasta_path)

    driver = diamond.Diamond(
        query_fasta=fasta_path,
        run=terminal.Run(verbose=False),
        progress=terminal.Progress(verbose=False),
    )
    driver.makedb(output_file_path=dmnd_path)

    # the FASTA was only needed to build the diamond database
    os.remove(fasta_path)
def __init__(self, args, run=terminal.Run(), progress=terminal.Progress(), skip_sanity_check=False):
    """Parses arguments and run sanity_check

    Reads the user arguments from `args`, opens the pan database when one is given,
    initializes the genomes storage (restricted to the pan database genomes or to the
    genome names the user asked to focus on), and finally runs `self.sanity_check()`
    unless `skip_sanity_check` is True.

    If `list_annotation_sources` is set in `args`, the available annotation sources
    are reported and the process exits via `sys.exit()`.
    """

    self.args = args
    self.run = run
    self.progress = progress

    # Parse arguments
    # `A` returns the value stored in args under a name, or None when args lacks it
    A = lambda x: args.__dict__[x] if x in args.__dict__ else None
    self.annotation_source = A('annotation_source')
    # falls back to "2:3" when no window range is given
    self.window_range = A('ngram_window_range') or "2:3"
    self.is_in_unknowns_mode = A('analyze_unknown_functions')
    self.output_file = A('output_file')
    self.skip_init_functions = A('skip_init_functions')
    self.genome_names_to_focus = A('genome_names')
    self.ngram_source = A("ngram_source")

    self.annotation_source_dict = {}

    self.pan_db_path = A('pan_db')

    # note that `self.annotation_sources` is only defined when both an annotation
    # source and a pan database are given
    if self.annotation_source and self.pan_db_path:
        self.annotation_sources = [self.annotation_source, 'gene_clusters']

    if self.pan_db_path:
        self.pan_db = PanDatabase(self.pan_db_path)

        self.p_meta = self.pan_db.meta

        self.p_meta['creation_date'] = utils.get_time_to_date(
            self.p_meta['creation_date']
        ) if 'creation_date' in self.p_meta else 'unknown'
        # external and internal genome names are comma-separated strings in the meta
        # table; merge them into one sorted list, dropping empty items
        self.p_meta['genome_names'] = sorted([
            s.strip()
            for s in self.p_meta['external_genome_names'].split(',') +
            self.p_meta['internal_genome_names'].split(',') if s
        ])
        self.p_meta['num_genomes'] = len(self.p_meta['genome_names'])
        self.genome_names = self.p_meta['genome_names']
        self.gene_clusters_gene_alignments_available = self.p_meta[
            'gene_alignments_computed']
    else:
        self.pan_db = None

    self.genomes_storage_path = A('genomes_storage')

    # confirm genome-storage and pangenome hashes match if a pangenome is provided
    if self.pan_db:
        self.genomes_storage = genomestorage.GenomeStorage(
            self.genomes_storage_path,
            self.p_meta['genomes_storage_hash'],
            genome_names_to_focus=self.p_meta['genome_names'],
            skip_init_functions=self.skip_init_functions,
            run=self.run,
            progress=self.progress)
    else:
        self.genomes_storage = genomestorage.GenomeStorage(
            self.genomes_storage_path,
            skip_init_functions=self.skip_init_functions,
            run=self.run,
            progress=self.progress)

    # list-annotation-resources
    self.list_annotation_sources = A('list_annotation_sources')
    self.gene_function_source_set = self.genomes_storage.db.get_table_as_dataframe(
        'gene_function_calls').source.unique()
    if self.list_annotation_sources:
        self.run.info('Available functional annotation sources',
                      ', '.join(self.gene_function_source_set))
        # listing the sources is all the user asked for, so stop here
        sys.exit()

    # This houses the ngrams' data
    self.ngram_attributes_list = []

    # Focus on specific set of genomes
    # NOTE(review): the nesting of this section was reconstructed from a source whose
    # indentation was lost; confirm that everything down to the first genome count
    # belongs inside this `if`.
    if self.genome_names_to_focus:
        # the user may give either a file with one genome name per line, or a
        # comma-separated list of names
        if filesnpaths.is_file_exists(self.genome_names_to_focus,
                                      dont_raise=True):
            self.genome_names_to_focus = utils.get_column_data_from_TAB_delim_file(
                self.genome_names_to_focus,
                column_indices=[0],
                expected_number_of_fields=1)[0]
        else:
            self.genome_names_to_focus = [
                g.strip() for g in self.genome_names_to_focus.split(',')
            ]

        self.run.warning(
            "A subset of genome names is found, and anvi'o will focus only on to those."
        )

        # replace the storage initialized above with one limited to the focus genomes
        self.genomes_storage = genomestorage.GenomeStorage(
            self.genomes_storage_path,
            storage_hash=None,
            genome_names_to_focus=self.genome_names_to_focus)
        self.genomes = self.genomes_storage.get_genomes_dict()

        self.external_genome_names = [
            g for g in self.genomes if self.genomes[g]['external_genome']
        ]
        self.internal_genome_names = [
            g for g in self.genomes
            if not self.genomes[g]['external_genome']
        ]

        # reverse lookup: genome hash -> genome name
        self.hash_to_genome_name = {}
        for genome_name in self.genomes:
            self.hash_to_genome_name[self.genomes[genome_name]
                                     ['genome_hash']] = genome_name

        # number of genomes in genome-storage
        self.num_contigs_in_external_genomes_with_genes = len(self.genomes)

    # number of genomes in genome-storage
    # (this assignment overrides the one made a few lines above)
    if self.genome_names_to_focus:
        self.num_contigs_in_external_genomes_with_genes = len(
            self.genome_names_to_focus)
    else:
        self.num_contigs_in_external_genomes_with_genes = len(
            self.genomes_storage.get_all_genome_names())

    if not skip_sanity_check:
        self.sanity_check()

    # unless we are in debug mode, let's keep things quiet.
    if anvio.DEBUG:
        self.run_object = terminal.Run()
    else:
        self.run_object = terminal.Run(verbose=False)
def __init__(self, args, run=terminal.Run(), progress=terminal.Progress(), progress_title=None):
    """Read the MODELLER settings from `args` and prepare the state of a run.

    After the arguments are read and `self.sanity_check()` has passed, the result
    dictionary `self.out` is created, the target FASTA is copied into the working
    directory, and the current directory is remembered as `self.start_dir`.
    """
    self.args = args
    self.run = run
    self.progress = progress

    # executable to use when `args` does not name one
    fallback_executable = "mod9.21"

    def fetch(name, convert):
        """Return `convert(value)` for the entry `name` of args, or None if it is absent."""
        if name in self.args.__dict__:
            return convert(args.__dict__[name])
        return None

    def as_is(value):
        """Hand the value back untouched."""
        return value

    self.scoring_method = fetch('scoring_method', str)
    self.deviation = fetch('deviation', float)
    self.directory = fetch('directory', str)
    self.very_fast = fetch('very_fast', bool)
    self.executable = fetch('modeller_executable', as_is) or fallback_executable
    self.num_models = fetch('num_models', int)
    self.target_fasta_path = fetch('target_fasta_path', str)
    self.modeller_database = fetch('modeller_database', str) or "pdb_95"
    self.max_number_templates = fetch('max_number_templates', as_is)
    self.percent_identical_cutoff = fetch('percent_identical_cutoff', as_is)
    # the float-converted deviation stored above is replaced by the unconverted value
    self.deviation = fetch('deviation', as_is)

    # paths and results that are not known yet
    for pending_attribute in ('alignment_pap_path',
                              'alignment_pir_path',
                              'get_template_path',
                              'search_results_path',
                              'target_pir_path',
                              'template_family_matrix_path',
                              'template_info_path',
                              'template_pdbs',
                              'model_info'):
        setattr(self, pending_attribute, None)

    self.logs = {}
    self.scripts = {}

    self.sanity_check()

    # as reward, whoever called this class will receive self.out when they run self.process()
    template_summary = {"pdb_id": [], "chain_id": [], "ppi": []}
    model_summary = {
        "molpdf": [],
        "GA341_score": [],
        "DOPE_score": [],
        "picked_as_best": [],
    }
    self.out = {
        "templates": template_summary,
        "models": model_summary,
        "corresponding_gene_call": self.corresponding_gene_call,
        "structure_exists": False,
        "best_model_path": None,
        "best_score": None,
        "scoring_method": self.scoring_method,
        "percent_identical_cutoff": self.percent_identical_cutoff,
        "very_fast": self.very_fast,
        "deviation": self.deviation,
    }

    # All MODELLER databases are housed in self.database_dir
    anvio_root = os.path.dirname(anvio.__file__)
    self.database_dir = J(anvio_root, 'data/misc/MODELLER/db')

    # copy fasta into the working directory; if it is already there, the path is
    # left as it is
    try:
        shutil.copy2(self.target_fasta_path, self.directory)
    except shutil.SameFileError:
        pass
    else:
        self.target_fasta_path = J(self.directory, self.target_fasta_path)

    # store the original directory so we can cd back and forth between
    # self.directory and self.start_dir
    self.start_dir = os.getcwd()

    self.progress_title = progress_title or "Running MODELLER for gene id {}".format(
        self.corresponding_gene_call)
def __init__(self, args=None, run=terminal.Run(), progress=terminal.Progress()):
    """Configure the `metagenomics` workflow.

    Sets the workflow attributes to their empty state, initializes the contigs
    workflow this class builds upon, and then registers the rules, the general
    parameters, the accepted and forbidden parameters per rule, the output
    directories, and the default config values.
    """
    self.init_workflow_super_class(args, workflow_name='metagenomics')

    self.samples_information = {}
    self.kraken_annotation_dict = {}
    self.run_krakenuniq = None
    self.run_metaspades = None
    self.use_scaffold_from_metaspades = None
    self.use_scaffold_from_idba_ud = None
    self.remove_short_reads_based_on_references = None
    self.references_for_removal_txt = None
    self.references_for_removal = {}
    self.references_mode = None
    self.fasta_txt_file = None
    self.samples_txt_file = None
    self.sample_names = None
    self.group_sizes = None
    self.collections_txt = None
    self.collections = None

    # initialize the base class
    ContigsDBWorkflow.__init__(self)

    metagenomics_rules = [
        'iu_gen_configs',
        'iu_filter_quality_minoche',
        'gen_qc_report',
        'gzip_fastqs',
        'merge_fastqs_for_co_assembly',
        'megahit',
        'merge_fastas_for_co_assembly',
        'bowtie_build',
        'bowtie',
        'samtools_view',
        'anvi_init_bam',
        'idba_ud',
        'anvi_profile',
        'anvi_merge',
        'import_percent_of_reads_mapped',
        'anvi_cluster_contigs',
        'krakenuniq',
        'krakenuniq_mpa_report',
        'import_krakenuniq_taxonomy',
        'metaspades',
        'remove_short_reads_based_on_references',
        'anvi_summarize',
        'anvi_split',
    ]
    self.rules.extend(metagenomics_rules)

    self.general_params.extend([
        'samples_txt',
        "references_mode",
        "all_against_all",
        "kraken_txt",
        "collections_txt",
    ])

    # one extra parameter per binning driver, so users can tune the binning algorithms
    binning_driver_params = [
        self.get_param_name_for_binning_driver(d)
        for d in driver_modules['binning'].keys()
    ]

    # The parameters accessible for each rule. NOTE: --threads is a parameter of every
    # rule, which is why it is not listed explicitly below.
    accepted_params = {
        'iu_gen_configs': ["--r1-prefix", "--r2-prefix"],
        'iu_filter_quality_minoche': [
            'run', '--visualize-quality-curves', '--ignore-deflines',
            '--limit-num-pairs', '--print-qual-scores', '--store-read-fate',
        ],
        'gzip_fastqs': ["run"],
        'anvi_cluster_contigs': [
            "run", "--collection-name", "--driver", "--just-do-it",
        ] + binning_driver_params,
        'anvi_summarize': ["additional_params", "run"],
        'anvi_split': ["additional_params", "run"],
        'metaspades': ["run", "additional_params", "use_scaffolds"],
        'megahit': [
            "run", "--min-contig-len", "--min-count", "--k-min",
            "--k-max", "--k-step", "--k-list", "--no-mercy",
            "--no-bubble", "--merge-level", "--prune-level",
            "--prune-depth", "--low-local-ratio", "--max-tip-len",
            "--no-local", "--kmin-1pass", "--presets", "--memory",
            "--mem-flag", "--use-gpu", "--gpu-mem", "--keep-tmp-files",
            "--tmp-dir", "--continue", "--verbose",
        ],
        'idba_ud': [
            "run", "--mink", "--maxk", "--step", "--inner_mink",
            "--inner_step", "--prefix", "--min_count",
            "--min_support", "--seed_kmer", "--min_contig",
            "--similar", "--max_mismatch", "--min_pairs",
            "--no_bubble", "--no_local", "--no_coverage",
            "--no_correct", "--pre_correction", "use_scaffolds",
        ],
        'bowtie': ["additional_params"],
        'bowtie_build': ["additional_params"],
        'samtools_view': ["additional_params"],
        'anvi_profile': [
            "--overwrite-output-destinations", "--sample-name", "--report-variability-full",
            "--skip-SNV-profiling", "--profile-SCVs", "--description",
            "--skip-hierarchical-clustering", "--distance", "--linkage", "--min-contig-length",
            "--min-mean-coverage", "--min-coverage-for-variability", "--cluster-contigs",
            "--contigs-of-interest", "--queue-size", "--write-buffer-size-per-thread",
            "--max-contig-length",
        ],
        'merge_fastas_for_co_assembly': [],
        'merge_fastqs_for_co_assembly': [],
        'anvi_merge': [
            "--sample-name", "--description", "--skip-hierarchical-clustering",
            "--enforce-hierarchical-clustering", "--distance", "--linkage",
            "--overwrite-output-destinations",
        ],
        'import_percent_of_reads_mapped': ["run"],
        'krakenuniq': ["additional_params", "run", "--db", "--gzip-compressed"],
        'import_krakenuniq_taxonomy': ["--min-abundance"],
        'remove_short_reads_based_on_references': [
            "dont_remove_just_map",
            "references_for_removal_txt",
            "delimiter-for-iu-remove-ids-from-fastq",
        ],
    }
    self.rule_acceptable_params_dict.update(accepted_params)

    # parameters the user is not allowed to set for a rule
    self.forbidden_params.update({
        'krakenuniq': ['--fastq-input', '--paired', '--output'],
    })

    output_directories = {
        "QC_DIR": "01_QC",
        "FASTA_DIR": "02_FASTA",
        "CONTIGS_DIR": "03_CONTIGS",
        "MAPPING_DIR": "04_MAPPING",
        "PROFILE_DIR": "05_ANVIO_PROFILE",
        "MERGE_DIR": "06_MERGED",
        "TAXONOMY_DIR": "07_TAXONOMY",
        "SUMMARY_DIR": "08_SUMMARY",
        "SPLIT_PROFILES_DIR": "09_SPLIT_PROFILES",
    }
    self.dirs_dict.update(output_directories)

    config_defaults = {
        'samples_txt': "samples.txt",
        'metaspades': {"additional_params": "--only-assembler", "threads": 7},
        'megahit': {
            "--min-contig-len": min_contig_length_for_assembly,
            "--memory": 0.4,
            "threads": 7,
        },
        'idba_ud': {"--min_contig": min_contig_length_for_assembly, "threads": 7},
        'iu_filter_quality_minoche': {"run": True, "--ignore-deflines": True},
        "gzip_fastqs": {"run": True},
        "bowtie": {"additional_params": "--no-unal", "threads": 3},
        "samtools_view": {"additional_params": "-F 4"},
        "anvi_profile": {
            "threads": 3,
            "--sample-name": "{sample}",
            "--overwrite-output-destinations": True,
        },
        "anvi_merge": {
            "--sample-name": "{group}",
            "--overwrite-output-destinations": True,
        },
        "import_percent_of_reads_mapped": {"run": True},
        "krakenuniq": {
            "threads": 3,
            "--gzip-compressed": True,
            "additional_params": "",
        },
        "remove_short_reads_based_on_references": {
            "delimiter-for-iu-remove-ids-from-fastq": " ",
        },
        "anvi_cluster_contigs": {"--collection-name": "{driver}"},
    }
    self.default_config.update(config_defaults)
def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
    """Validate the arguments for the current mode and collect the SCG sources.

    `self.mode` is read here but not set here; it has to be defined before this
    method runs and must be either 'train' or 'predict'.

    In 'train' mode a genomes directory and an output path are required, and an
    input classifier must not be given. In 'predict' mode an output path must not
    be given, and the classifier is loaded either from `args.classifier` or from
    the default location in the anvi'o data directory.

    Raises ConfigError when the arguments do not fit the mode, when the classifier
    cannot be found, or when the SCG sources are not usable for classification.
    """
    self.run = run
    self.progress = progress

    # value of an argument, or None when args is empty or does not carry it
    A = lambda x: (args.__dict__[x] if x in args.__dict__ else None) if args else None

    if self.mode == 'train':
        if A('classifier'):
            raise ConfigError("You should not initialize the domain training class with a input classifier path (`args.classifier`).")

        # Validate the raw values BEFORE calling os.path.abspath on them: abspath
        # raises a TypeError for None, which used to hide the friendly error below.
        genomes_dir = A('genomes_dir')
        if not genomes_dir:
            raise ConfigError("You must provide a genomes directory. Please read the help menu if you are not sure\
                               how the contents of this directory should look like.")

        classifier_output_path = A('output')
        if not classifier_output_path:
            raise ConfigError("You must provide an output file path to store the classifier (`args.output`).")

        self.genomes_dir = os.path.abspath(genomes_dir)
        self.classifier_output_path = os.path.abspath(classifier_output_path)

        filesnpaths.is_output_file_writable(self.classifier_output_path)
        filesnpaths.is_file_exists(self.genomes_dir)

    elif self.mode == 'predict':
        if A('output'):
            raise ConfigError("You should not initialize the domain prediction class with an output classifier path (`args.output`).")

        default_classifier_path = 'misc/SCGDOMAINCLASSIFIER.rf'
        self.input_classifier_path = A('classifier') or os.path.join(os.path.dirname(anvio.data.__file__), default_classifier_path)

        if A('classifier'):
            # a user-provided classifier: let the file check raise its own error
            filesnpaths.is_file_exists(self.input_classifier_path)
        elif not filesnpaths.is_file_exists(self.input_classifier_path, dont_raise=True):
            # the default classifier is missing, which points to a broken installation
            raise ConfigError("Somehow, this anvi'o installation dose not seem to have a SCG domain classifier. This is one of\
                               those anvi'o things that should never happen. If you are an anvi'o user, please feel free to panic :(\
                               If you are an anvi'o developer, what you need to do is to follow the instructions in \
                               `anvi-script-gen-scg-domain-classifier` with a reasonable set of genomes and store the resulting\
                               classifier at the default anvi'o path of /blah/blah/anvio/data/%s." % (default_classifier_path))

        self.rf = RF(self.input_classifier_path, r=self.run, p=self.progress)
        self.rf.initialize_classifier()

    else:
        raise ConfigError("Someone initialized the SCG domain classifier class without an explicit mode :(")

    # only the HMM sources of kind 'singlecopy' take part in the classification
    self.SCG_sources = [d for d in hmm_data.sources if hmm_data.sources[d]['kind'] == 'singlecopy']
    self.SCG_domains = sorted(hmm_data.sources[source]['domain'] for source in self.SCG_sources)
    self.SCG_domain_to_source = {hmm_data.sources[source]['domain']: source for source in self.SCG_sources}

    if not self.SCG_sources:
        raise ConfigError("There is something wrong :( There is not even a single SCG source found. Usually\
                           anvi'o comes with multiple of them :/")

    if len(self.SCG_sources) == 1:
        raise ConfigError("There is only a single SCG source in your anvi'o installation. It is OK if you are\
                           being a hacker and playing with things, but there is no logic behind creating a\
                           classifier with a single class.")

    # two sources sharing one domain would show up as a repeated domain name
    if len(self.SCG_domains) != len(set(self.SCG_domains)):
        raise ConfigError("Something is wrong. For each domain, there must be a single sinlge-copy core gene\
                           source.")

    self.data, self.labels, self.features = [], [], []

    # features are the gene names of every domain, sorted within each domain
    for domain in self.SCG_domains:
        self.features.extend(sorted(hmm_data.sources[self.SCG_domain_to_source[domain]]['genes']))

    self.run.info('SCG domain classifier mode', self.mode)
    self.run.info("SCG domains found", ', '.join(self.SCG_domains))
    self.run.info("Num features", len(self.features))
def __init__(self, db_path, client_version, new_database=False, ignore_version=False, run=terminal.Run(), progress=terminal.Progress()):
    """Open the SQLite database at `db_path`, creating it first if `new_database` is set.

    For a new database any existing file at that path is removed, the database is
    created and stamped with `client_version`. For an existing database its version
    is read and compared to `client_version`; unless `ignore_version` is set, a
    mismatch raises a ConfigError.
    """
    self.db_path = db_path
    self.version = None

    self.run = run
    self.progress = progress

    if new_database:
        filesnpaths.is_output_file_writable(db_path)
        # start from scratch if something is already sitting at this path
        if os.path.exists(self.db_path):
            os.remove(self.db_path)
    else:
        filesnpaths.is_file_exists(db_path)

    self.check_if_db_writable()

    connection = sqlite3.connect(self.db_path)
    connection.text_factory = str
    self.conn = connection

    self.cursor = self.conn.cursor()

    if new_database:
        self.create_self()
        self.set_version(client_version)
        return

    self.version = self.get_version()

    versions_agree = str(self.version) == str(client_version)
    if versions_agree or ignore_version:
        return

    database_is_newer = int(self.version) > int(client_version)
    if database_is_newer:
        raise ConfigError("Bad news of the day: the database at %s was generated with an anvi'o version that is 'newer' than\
                           the one you are actively using right now. We know, you hate to hear this, but you need to upgrade\
                           your anvi'o :(" % self.db_path)

    raise ConfigError("The database at '%s' is outdated (its version is v%s, but your anvi'o installation only knows how to\
                       deal with v%s). You can migrate your database without losing any data using the program `anvi-migrate`."
                      % (self.db_path, self.version, client_version))
from anvio.drivers.mcl import MCL
from anvio.drivers import Aligners
from anvio.errors import ConfigError, FilesNPathsError
from anvio.genomestorage import GenomeStorage

__author__ = "A. Murat Eren"
__copyright__ = "Copyright 2017, The anvio Project"
__credits__ = []
__license__ = "GPL 3.0"
__version__ = anvio.__version__
__maintainer__ = "A. Murat Eren"
__email__ = "*****@*****.**"

# module-level reporting objects; they also serve as the default `run` and
# `progress` arguments of `Pangenome.__init__` below
run = terminal.Run()
progress = terminal.Progress()
pp = terminal.pretty_print
aligners = Aligners()


class Pangenome(GenomeStorage):
    """A pangenome built on top of `GenomeStorage`."""

    def __init__(self, args=None, run=run, progress=progress):
        """Initialize the parent `GenomeStorage`, then the genomes data storage.

        `args`, `run` and `progress` are passed to `GenomeStorage.__init__` and are
        also kept on the instance.
        """
        GenomeStorage.__init__(self, args, run, progress)
        self.init_genomes_data_storage()

        self.args = args
        self.run = run
        self.progress = progress

        # taken from the anvi'o-wide limit on items for hierarchical clustering
        self.max_num_PCs_for_hierarchical_clustering = constants.max_num_items_for_hierarchical_clustering
def populate_search_tables(self, contigs_db_path):
    """Run tRNAScan-SE on the contigs of a contigs database and store the hits in it.

    The contig sequences are exported to a FASTA file in `self.tmp_directory_path`
    and searched with `self.run_trnascan_on_FASTA`. The resulting entries are
    reshaped to look like HMM hits, entries with codons or amino acids unknown to
    `self.codons` / `self.amino_acids` are dropped (and reported), and the rest is
    stored through `TablesForHMMHits` as well as `TableForGeneFunctions`.

    Parameters
    ==========
    contigs_db_path : str
        Path to the contigs database to read sequences from and to write results to.
    """

    utils.is_contigs_db(contigs_db_path)
    filesnpaths.is_output_file_writable(contigs_db_path, ok_if_exists=True)

    contig_sequences_fasta_path = os.path.join(self.tmp_directory_path, 'contig_sequences.fa')

    utils.export_sequences_from_contigs_db(contigs_db_path, contig_sequences_fasta_path)

    search_results_dict = self.run_trnascan_on_FASTA(fasta_file_path=contig_sequences_fasta_path)

    # At this point we need to turn this search_results_dict into one that matches how it is used
    # in HMM operations. Here is an entry from tRNA results dict:
    #
    # {1: {'contig_name': 'Bfragilis_0100_000000000001',
    #      'trna_no': '1',
    #      'start': 135361,
    #      'stop': 135433,
    #      'amino_acid': 'Thr',
    #      'anticodon': 'CGT',
    #      'score': 67.6}}
    #
    # and here is one example from the rRNA HMMs results dict:
    #
    # {1: {'entry_id': 0,
    #      'gene_name': 'Bacterial_23S_rRNA',
    #      'gene_hmm_id': '-',
    #      'contig_name': 'Bfragilis_0100_000000000001',
    #      'start': 1110877,
    #      'stop': 1113757,
    #      'e_value': 0.0}}
    #
    # so we will have to make the former look like the latter. I have the feeling that the
    # score / e_value will cause issues later :(
    missing_amino_acids = Counter()
    missing_codons = Counter()
    entries_to_remove = set()
    for entry_id, entry in search_results_dict.items():
        aa, codon = entry['amino_acid'], utils.rev_comp(entry['anticodon'])

        if codon not in self.codons:
            missing_codons[codon] += 1
            entries_to_remove.add(entry_id)
            continue

        if aa not in self.amino_acids:
            missing_amino_acids[aa] += 1
            entries_to_remove.add(entry_id)
            continue

        entry['gene_name'] = '%s_%s' % (aa, codon)
        entry['e_value'] = entry['score']
        entry['gene_hmm_id'] = '-'

        if entry['stop'] > entry['start']:
            # so we are forward
            entry['start'] = entry['start'] - 1 # setting the pythonic start.
        else:
            # so this one is reverse
            entry['stop'] = entry['stop'] - 1

    # entries are removed only now, so the dict is not modified while it is being iterated
    for entry_id in entries_to_remove:
        search_results_dict.pop(entry_id)

    self.run.info("Num tRNA genes recovered", len(search_results_dict))

    if missing_codons:
        self.run.warning("While anvi'o was trying to parse the output from tRNAScan-SE, it "
                         "became clear that some of the codons the tool identified was not "
                         "known to anvi'o, so we conservatively discareded those entries. "
                         "Here is the list of codons that were discareded and their frequency "
                         "among your contigs: '%s'." % (', '.join('%s (%d)' % (codon, missing_codons[codon]) for codon in missing_codons)),
                         header="WEIRD CODONS ALERT")

    if missing_amino_acids:
        self.run.warning("While anvi'o was trying to parse the output from tRNAScan-SE, it "
                         "run into some amino acid names that were not known to anvi'o. "
                         "All those entries are now gone :/ But here is the list of amino "
                         "acids and their frequencies: '%s'." % (', '.join('%s (%d)' % (amino_acid, missing_amino_acids[amino_acid]) for amino_acid in missing_amino_acids)),
                         header="WEIRD AMINO ACIDS ALERT")

    search_results_dict = utils.get_pruned_HMM_hits_dict(search_results_dict)

    tables_for_hmm_hits = TablesForHMMHits(contigs_db_path, run=self.run, progress=self.progress)
    search_results_dict = tables_for_hmm_hits.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(self.kind_of_search,
                                                                                                              search_results_dict,
                                                                                                              skip_amino_acid_sequences=True)
    tables_for_hmm_hits.append(self.source_name,
                               self.reference,
                               self.kind_of_search,
                               self.domain,
                               self.all_genes_searched_against,
                               search_results_dict)

    # when the code comes all the way here, the entries in the search results dict already look like
    # this, so we have a gene callers id for the newly generate genes for tRNAs. we will use it
    # to populate a functions dict and submit it to the contigs database as well:
    #
    #     {'contig_name': 'Bfragilis_0100_000000000001',
    #      'trna_no': '1',
    #      'start': 135361,
    #      'stop': 135433,
    #      'amino_acid': 'Thr',
    #      'anticodon': 'CGT',
    #      'score': 67.6,
    #      'gene_name': 'Thr_ACG',
    #      'e_value': 67.6,
    #      'gene_hmm_id': '-',
    #      'gene_callers_id': 4502}
    #
    functions_dict = {}
    for entry_id, entry in search_results_dict.items():
        # the codon MUST be derived from the current entry: the variables of the parsing loop
        # above still hold the values of whichever entry was processed last, and reusing them
        # here would stamp the very same codon onto every tRNA gene.
        codon = utils.rev_comp(entry['anticodon'])

        function_text = 'tRNA gene for amino acid %s (codon: %s; anticodon:%s; score:%.1f; intron_start:%d; intron_end:%d)' \
                                % (entry['amino_acid'], codon, entry['anticodon'],
                                   entry['score'], entry['intron_start'], entry['intron_end'])

        functions_dict[entry_id] = {'gene_callers_id': entry['gene_callers_id'],
                                    'source': self.source_name,
                                    'accession': '%s_%s_%d' % (entry['amino_acid'], codon, entry['gene_callers_id']),
                                    'function': function_text,
                                    'e_value': 0.0}

    gene_function_calls_table = TableForGeneFunctions(contigs_db_path, run=terminal.Run(verbose=False), progress=terminal.Progress(verbose=False))
    gene_function_calls_table.create(functions_dict)

    if not anvio.DEBUG:
        self.clean_tmp_directory()
        self.run.info_single("Temp directory is now cleaned (if you would like to keep it the "
                             "next time use the flag `--debug`).", nl_before=1)
    else:
        self.run.info_single("Due to the `--debug` flag, anvi'o did not remove the temoporary files "
                             "directory (which is still at '%s')." % (self.tmp_directory_path), nl_before=1)