def __init__(self, db_path, client_version, new_database=False, ignore_version=False):
    self.db_path = db_path
    self.version = None

    if new_database:
        filesnpaths.is_output_file_writable(db_path)
    else:
        filesnpaths.is_file_exists(db_path)

    if new_database and os.path.exists(self.db_path):
        os.remove(self.db_path)

    self.conn = sqlite3.connect(self.db_path)
    self.conn.text_factory = str

    self.cursor = self.conn.cursor()

    if new_database:
        self.create_self()
        self.set_version(client_version)
    else:
        self.version = self.get_version()
        if str(self.version) != str(client_version) and not ignore_version:
            raise ConfigError("It seems the database '%s' was generated when your client was at version %s,\
                               however, your client now is at version %s. Which means this database file\
                               cannot be used with this client anymore, but THERE MAY be a script to UPGRADE \
                               your database! Just type `anvi-script-upgrade-` on your terminal, and press the TAB key\
                               twice to see all available scripts. If there is nothing there to upgrade your database from\
                               v%s to v%s, you may need to re-create it with the new anvi'o :/ Please feel free to\
                               get in touch with the anvi'o developers if you are not sure, and they will help you\
                               figure out what is the next best step for you."\
                               % (self.db_path, self.version, client_version, self.version, client_version))
def export_collection(self, collection_name, output_file_prefix=None):
    self.sanity_check(collection_name)

    if not output_file_prefix:
        output_file_prefix = 'collection-%s' % (collection_name.strip().replace(' ', '-'))

    info_file_path = output_file_prefix + '-info.txt'
    items_file_path = output_file_prefix + '.txt'

    self.run.info('Items file path', items_file_path)
    filesnpaths.is_output_file_writable(items_file_path)

    bins_info = self.get_bins_info_dict(collection_name)
    collection = self.get_collection_dict(collection_name)

    if len(bins_info):
        self.run.info('Info file path', info_file_path)
        info_file = open(info_file_path, 'w')
        for bin_name in bins_info:
            info_file.write('%s\t%s\t%s\n' % (bin_name, bins_info[bin_name]['source'], bins_info[bin_name]['html_color']))
        info_file.close()

    items_file = open(items_file_path, 'w')
    for bin_name in collection:
        for item_name in collection[bin_name]:
            items_file.write('%s\t%s\n' % (item_name, bin_name))
    items_file.close()
def __init__(self, args={}, p=progress, r=run):
    self.args = args
    self.run = r
    self.progress = p

    self.samples = None
    self.samples_information_dict = None
    self.data = None

    A = lambda x, t: t(args.__dict__[x]) if x in args.__dict__ else None
    null = lambda x: x
    self.input_file_path = A('input_file', null)
    self.samples_information_path = A('samples_information', null)
    self.max_num_unique_positions = A('max_num_unique_positions', int)
    self.output_file_path = A('output_file', null)

    filesnpaths.is_output_file_writable(self.output_file_path)

    if self.samples_information_path:
        filesnpaths.is_file_tab_delimited(self.samples_information_path)
        self.samples_information_dict = utils.get_TAB_delimited_file_as_dictionary(self.samples_information_path)
        num_attributes = len(list(self.samples_information_dict.values())[0])

        self.run.info('samples_information', '%d attributes read for %d samples' % (num_attributes, len(self.samples_information_dict)))

    if self.input_file_path:
        filesnpaths.is_file_tab_delimited(self.input_file_path)

        self.progress.new('Reading the input file')
        self.progress.update('...')
        self.data = utils.get_TAB_delimited_file_as_dictionary(self.input_file_path)
        self.progress.end()

        self.run.info('input_file', '%d entries read' % len(self.data))
def __init__(self, db_path, client_version, new_database=False, ignore_version=False):
    self.db_path = db_path
    self.version = None

    if new_database:
        filesnpaths.is_output_file_writable(db_path)
    else:
        filesnpaths.is_file_exists(db_path)

    if new_database and os.path.exists(self.db_path):
        os.remove(self.db_path)

    self.conn = sqlite3.connect(self.db_path)
    self.conn.text_factory = str

    self.cursor = self.conn.cursor()

    if new_database:
        self.create_self()
        self.set_version(client_version)
    else:
        self.version = self.get_version()
        if str(self.version) != str(client_version) and not ignore_version:
            raise ConfigError("It seems the database '%s' was generated when your client was at version %s,\
                               however, your client now is at version %s. Which means this database file\
                               cannot be used with this client anymore and needs to be upgraded to the\
                               version %s :/"\
                               % (self.db_path, self.version, client_version, client_version))
def sanity_check(self): """Executes rudimentary checks Parameters ========== N\A Returns ======= N\A """ self.check_programs() for name, variable in [('a log file', self.log_file_path), ('a FASTA file path', self.fasta_file_path)]: if not variable: raise ConfigError( "A proper instance of this driver must have %s variable set." % name) filesnpaths.is_output_file_writable(self.log_file_path) filesnpaths.is_file_exists(self.fasta_file_path) try: self.cutoff_score = float(self.cutoff_score) except: raise ConfigError("Cutoff score must be a float.") if self.cutoff_score < 20 or self.cutoff_score > 100: raise ConfigError("The cutoff score must be between 20 and 100.")
def __init__(self, db_path, client_version, new_database=False, ignore_version=False):
    self.db_path = db_path
    self.version = None

    if new_database:
        filesnpaths.is_output_file_writable(db_path)
    else:
        filesnpaths.is_file_exists(db_path)

    if new_database and os.path.exists(self.db_path):
        os.remove(self.db_path)

    self.check_if_db_writable()

    self.conn = sqlite3.connect(self.db_path)
    self.conn.text_factory = str

    self.cursor = self.conn.cursor()

    if new_database:
        self.create_self()
        self.set_version(client_version)
    else:
        self.version = self.get_version()

        if str(self.version) != str(client_version) and not ignore_version:
            if int(self.version) > int(client_version):
                raise ConfigError("Bad news of the day: the database at %s was generated with an anvi'o version that is 'newer' than\
                                   the one you are actively using right now. We know, you hate to hear this, but you need to upgrade\
                                   your anvi'o :(" % self.db_path)
            else:
                raise ConfigError("The database at '%s' is outdated (its version is v%s, but your anvi'o installation only knows how to\
                                   deal with v%s). You can migrate your database without losing any data using the program `anvi-migrate-db`."\
                                   % (self.db_path, self.version, client_version))
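# A hedged usage sketch for the constructor above, not taken from the source: it
# assumes the class is exposed as `db.DB` (as the export_sequences_table_in_db_into_FASTA_file
# snippets further down suggest), and the paths and version string '14' are made up.
import anvio.db as db  # assumption: module path

existing = db.DB('CONTIGS.db', '14')              # opens the file and verifies the version matches
fresh = db.DB('NEW.db', '14', new_database=True)  # removes any old file, runs create_self(), stamps the version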
def migrate(config_path):
    if config_path is None:
        raise ConfigError("No config path is given.")

    workflow_name, version = w.get_workflow_name_and_version_from_config(config_path, dont_raise=True)

    if not workflow_name:
        raise ConfigError('Your config must include a workflow_name. For example\
                           if this config file is used for the metagenomics workflow\
                           then add \'"workflow_name": "metagenomics"\' to your config.')

    if version != current_version:
        raise ConfigError("Version of this config file is not %s (hence, this script cannot really do anything)." % current_version)

    progress.new("Upgrading your config")
    progress.update("...")

    args = argparse.Namespace(workflow=workflow_name, config_file=config_path)

    workflow_module_dict = w.get_workflow_module_dict()
    workflow_object = workflow_module_dict[workflow_name](args)

    config = workflow_object.config
    default_config = workflow_object.default_config

    new_config = config.copy()
    new_config['config_version'] = '1'

    ## Deal with special cases
    special_params = ['fasta_txt', 'references_for_removal', 'references_mode']

    for param in default_config:
        if param in special_params:
            # if the param belongs to special params then we skip it
            continue
        elif type(default_config[param]) == dict:
            # otherwise update config rule parameters
            new_config[param] = default_config[param]
            new_config[param].update(config.get(param, ''))
        else:
            # if it's not a dict then it's a general parameter:
            # update the general parameter
            new_config[param] = config.get(param, default_config[param])

    filesnpaths.is_output_file_writable(config_path)
    open(config_path, 'w').write(json.dumps(new_config, indent=4))

    progress.end()
    run.info_single("The config file version is now %s. This upgrade brought back any default value that was\
                     previously removed from your config file. It will not change anything about the\
                     configuration of the resulting workflow and you can just carry on your work.\
                     " % (next_version), nl_after=1, nl_before=1, mc='green')
def check_storage_path_for_create_new(self):
    if not self.storage_path.endswith('GENOMES.db'):
        raise ConfigError("The genomes storage file must end with '-GENOMES.db'. Anvi'o developers do know how ridiculous\
                           this requirement sounds, but if you have seen the things they did, you would totally\
                           understand why this is necessary.")

    filesnpaths.is_output_file_writable(self.storage_path)
def sanity_check(self): """Basic sanity check for class inputs""" if self.output_file_prefix: filesnpaths.is_output_file_writable( self.output_file_prefix + '-additional-layers.txt', ok_if_exists=self.overwrite_output_destinations) try: if self.gen_figures: plot_dir = self.output_file_prefix + '-nucleotide-coverage-distribution-plots' os.makedirs(plot_dir, exist_ok=self.overwrite_output_destinations) except FileExistsError as e: raise FilesNPathsError( "%s already exists, if you would like to overwrite it, then use -W (see help menu)." % plot_dir) # checking alpha if not isinstance(self.alpha, float): raise ConfigError("alpha value must be a type float.") # alpha must be a min of 0 and smaller than 0.5 if self.alpha < 0 or self.alpha >= 0.5: raise ConfigError( "alpha must be a minimum of 0 and smaller than 0.5") if self.exclude_samples and self.include_samples: raise ConfigError( "You cannot use both --include-samples and --exclude-samples! Please choose one." )
def store_dict_as_TAB_delimited_file(d, output_path, headers=None, file_obj=None):
    if not file_obj:
        filesnpaths.is_output_file_writable(output_path)
        f = open(output_path, 'w')
    else:
        f = file_obj

    if not headers:
        headers = ['key'] + sorted(list(d.values())[0].keys())

    f.write('%s\n' % '\t'.join(headers))

    for k in sorted(d.keys()):
        line = [str(k)]
        for header in headers[1:]:
            try:
                val = d[k][header]
            except KeyError:
                raise ConfigError("Header ('%s') is not found in the dict :/" % (header))
            except TypeError:
                raise ConfigError("Your dictionary is not properly formatted to be exported\
                                   as a TAB-delimited file :/ You ask for '%s', but it is not\
                                   even a key in the dictionary" % (header))

            line.append(str(val) if val is not None else '')

        f.write('%s\n' % '\t'.join(line))

    # only close the file handle if we opened it ourselves
    if not file_obj:
        f.close()

    return output_path
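# A hedged usage sketch for store_dict_as_TAB_delimited_file; the dict and the
# output path are toy values made up for illustration.
rows = {'item_001': {'length': 1500, 'gc': 0.42},
        'item_002': {'length': 2300, 'gc': 0.55}}
store_dict_as_TAB_delimited_file(rows, 'items.txt', headers=['key', 'length', 'gc'])
# items.txt: a header line, then one TAB-delimited line per key, in sorted key order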
def unique_FASTA_file(input_file_path, output_fasta_path=None, names_file_path=None, store_frequencies_in_deflines=True):
    filesnpaths.is_file_exists(input_file_path)

    if not output_fasta_path:
        output_fasta_path = input_file_path + '.unique'

    if not names_file_path:
        names_file_path = output_fasta_path + '.names'

    if output_fasta_path == names_file_path:
        raise ConfigError("I can't unique this. Output FASTA file path can't be identical to\
                           the names file path...")

    if output_fasta_path == input_file_path or names_file_path == input_file_path:
        raise ConfigError("Anvi'o will not unique this. Output FASTA path and names file path should\
                           be different from the input file path...")

    filesnpaths.is_output_file_writable(output_fasta_path)
    filesnpaths.is_output_file_writable(names_file_path)

    input_fasta = u.SequenceSource(input_file_path, unique=True)
    output_fasta = u.FastaOutput(output_fasta_path)
    names_file = open(names_file_path, 'w')

    names_dict = {}
    while input_fasta.next():
        output_fasta.store(input_fasta, split=False, store_frequencies=store_frequencies_in_deflines)
        names_file.write('%s\t%s\n' % (input_fasta.id, ','.join(input_fasta.ids)))

        names_dict[input_fasta.id] = input_fasta.ids

    return output_fasta_path, names_file_path, names_dict
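# A hedged usage sketch for unique_FASTA_file; 'reads.fa' is a made-up input path.
fasta_path, names_path, names_dict = unique_FASTA_file('reads.fa')
# fasta_path  -> 'reads.fa.unique'        (one entry per unique sequence)
# names_path  -> 'reads.fa.unique.names'  (each representative id mapped to its member ids)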
def export_collection(self, collection_name, output_file_prefix=None, include_unbinned=False):
    self.sanity_check(collection_name)

    if not output_file_prefix:
        output_file_prefix = 'collection-%s' % (collection_name.strip().replace(' ', '-'))

    info_file_path = output_file_prefix + '-info.txt'
    items_file_path = output_file_prefix + '.txt'

    self.run.info('Report unbinned items if there are any', include_unbinned)
    self.run.info('Items file path', items_file_path)
    filesnpaths.is_output_file_writable(items_file_path)

    bins_info = self.get_bins_info_dict(collection_name)
    collection = self.get_collection_dict(collection_name)

    if len(bins_info):
        self.run.info('Bins info file path', info_file_path)
        info_file = open(info_file_path, 'w')

        if include_unbinned:
            bins_info['UNBINNED_ITEMS_BIN'] = {'html_color': '#000000', 'source': 'anvi-export-collections'}

        for bin_name in bins_info:
            info_file.write('%s\t%s\t%s\n' % (bin_name, bins_info[bin_name]['source'], bins_info[bin_name]['html_color']))
        info_file.close()

    binned_items = set([])

    items_file = open(items_file_path, 'w')
    for bin_name in collection:
        for item_name in collection[bin_name]:
            items_file.write('%s\t%s\n' % (item_name, bin_name))
            binned_items.add(item_name)

    if include_unbinned:
        all_items = utils.get_all_item_names_from_the_database(self.db_path)

        unbinned_items = all_items.difference(binned_items)

        for item_name in unbinned_items:
            items_file.write('%s\tUNBINNED_ITEMS_BIN\n' % (item_name))

        self.run.warning("As per your request, %d items that were not in any of the bins in the collection '%s' are stored\
                          in the output file under the bin name 'UNBINNED_ITEMS_BIN'." % (len(unbinned_items), collection_name))

    items_file.close()
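# A hedged usage sketch for export_collection above; `collections` stands in for an
# instance of whatever class defines the method (anvi'o ships one in its collections
# module, but that class is not shown here), and 'MY_BINS' is a made-up collection name.
collections.export_collection('MY_BINS', output_file_prefix='my-bins', include_unbinned=True)
# writes my-bins.txt (item -> bin) and my-bins-info.txt (bin, source, color), plus an
# 'UNBINNED_ITEMS_BIN' entry for items that are not in any bin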
def create_newick_file_from_matrix_file(observation_matrix_path, output_file_path, linkage=constants.linkage_method_default,
                                        distance=constants.distance_metric_default, norm='l1', progress=progress,
                                        transpose=False, items_order_file_path=None):
    is_distance_and_linkage_compatible(distance, linkage)
    filesnpaths.is_file_exists(observation_matrix_path)
    filesnpaths.is_file_tab_delimited(observation_matrix_path)

    filesnpaths.is_output_file_writable(output_file_path)
    if items_order_file_path:
        filesnpaths.is_output_file_writable(items_order_file_path)

    id_to_sample_dict, sample_to_id_dict, header, vectors = utils.get_vectors_from_TAB_delim_matrix(observation_matrix_path, transpose=transpose)

    vectors = np.array(vectors)

    newick = get_newick_from_matrix(vectors, distance, linkage, norm, id_to_sample_dict)

    if output_file_path:
        open(output_file_path, 'w').write(newick.strip() + '\n')

    if items_order_file_path:
        open(items_order_file_path, 'w').write('\n'.join(utils.get_names_order_from_newick_tree(newick)) + '\n')
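# A hedged usage sketch for create_newick_file_from_matrix_file; file names are illustrative.
create_newick_file_from_matrix_file('matrix.txt', 'tree.newick', items_order_file_path='items-order.txt')
# tree.newick holds the hierarchical clustering of the matrix rows as a newick tree;
# items-order.txt lists the leaf names in tree order, one per line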
def store_hmm_sequences_into_FASTA(self, hmm_sequences_dict_for_splits, output_file_path, wrap=120, concatenate_genes=False,
                                   partition_file_path=None, separator=None, genes_order=None, align_with=None, just_do_it=False):
    """Stores HMM sequences into a FASTA file."""
    filesnpaths.is_output_file_writable(output_file_path)
    filesnpaths.is_output_file_writable(partition_file_path) if partition_file_path else None

    if wrap and not isinstance(wrap, int):
        raise ConfigError('"wrap" has to be an integer instance')

    if genes_order and concatenate_genes:
        gene_frequencies = Counter(genes_order)
        non_unique_genes = [g for g in gene_frequencies if gene_frequencies[g] > 1]
        if len(non_unique_genes):
            if just_do_it:
                self.run.warning("Anvi'o found that some gene names occur multiple times (i.e., %s), but is letting this get away "
                                 "since the user invoked the grumpy flag." % (', '.join(non_unique_genes)), nl_before=1)
            else:
                raise ConfigError("The list of gene names you wish to concatenate contains those that occur more than once. "
                                  "Here is the list: '%s'. While anvi'o believes it is a silly idea to have the same gene "
                                  "names multiple times, it will not care about it and will let you get away with it if you "
                                  "really want that. In which case you can use the flag `--just-do-it`, and move on with your "
                                  "very unconventional and cool analysis." % (', '.join(non_unique_genes)))

    if concatenate_genes:
        self.__store_concatenated_hmm_sequences_into_FASTA(hmm_sequences_dict_for_splits, output_file_path, partition_file_path, wrap, separator, genes_order, align_with, just_do_it)
    else:
        self.__store_individual_hmm_sequences_into_FASTA(hmm_sequences_dict_for_splits, output_file_path, wrap, separator, genes_order, align_with)
def store_array_as_TAB_delimited_file(a, output_path, header, exclude_columns=[]):
    filesnpaths.is_output_file_writable(output_path)

    num_fields = len(a[0])

    if len(header) != num_fields:
        raise ConfigError("store array: header length (%d) differs from data (%d)..." % (len(header), num_fields))

    for col in exclude_columns:
        if not col in header:
            raise ConfigError("store array: column %s is not in the header array..." % col)

    exclude_indices = set([header.index(c) for c in exclude_columns])

    header = [header[i] for i in range(0, len(header)) if i not in exclude_indices]

    f = open(output_path, 'w')
    f.write('%s\n' % '\t'.join(header))

    for row in a:
        f.write('\t'.join([str(row[i]) for i in range(0, num_fields) if i not in exclude_indices]) + '\n')

    f.close()
    return output_path
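# A hedged usage sketch for store_array_as_TAB_delimited_file; the array and header
# are toy values made up for illustration.
data = [['c_01', 120, 0.51],
        ['c_02', 480, 0.44]]
store_array_as_TAB_delimited_file(data, 'stats.txt', ['contig', 'length', 'gc'], exclude_columns=['gc'])
# stats.txt keeps only the 'contig' and 'length' columns, one TAB-delimited row each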
def save_empty_config_in_json_format(self, filename='empty_config.json'):
    import json

    filesnpaths.is_output_file_writable(filename)

    empty_config = self.get_empty_config()

    open(filename, 'w').write(json.dumps(empty_config, indent=4))
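# A hedged usage sketch for save_empty_config_in_json_format; `workflow` stands in
# for an instance of whatever object defines the method (its class is not shown in
# this snippet).
workflow.save_empty_config_in_json_format('empty_config.json')
# empty_config.json now holds the default configuration, pretty-printed with indent=4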
def store_hmm_sequences_into_FASTA(self, hmm_sequences_dict_for_splits, output_file_path, wrap=120, concatenate_genes=False,
                                   separator=None, genes_order=None, align_with=None, just_do_it=False):
    """Stores HMM sequences into a FASTA file."""
    filesnpaths.is_output_file_writable(output_file_path)

    if wrap and not isinstance(wrap, int):
        raise ConfigError('"wrap" has to be an integer instance')

    if genes_order and concatenate_genes:
        gene_frequencies = Counter(genes_order)
        non_unique_genes = [g for g in gene_frequencies if gene_frequencies[g] > 1]
        if len(non_unique_genes):
            if just_do_it:
                self.run.warning("Anvi'o found that some gene names occur multiple times (i.e., %s), but is letting this get away\
                                  since the user invoked the grumpy flag." % (', '.join(non_unique_genes)), nl_before=1)
            else:
                raise ConfigError("The list of gene names you wish to concatenate contains those that occur more than once.\
                                   Here is the list: '%s'. While anvi'o believes it is a silly idea to have the same gene\
                                   names multiple times, it will not care about it and will let you get away with it if you\
                                   really want that. In which case you can use the flag `--just-do-it`, and move on with your\
                                   very unconventional and cool analysis." % (', '.join(non_unique_genes)))

    if concatenate_genes:
        self.__store_concatenated_hmm_sequences_into_FASTA(hmm_sequences_dict_for_splits, output_file_path, wrap, concatenate_genes, separator, genes_order, align_with, just_do_it)
    else:
        self.__store_individual_hmm_sequences_into_FASTA(hmm_sequences_dict_for_splits, output_file_path, wrap, concatenate_genes, separator, genes_order, align_with)
def check_params(self):
    # if the user did not set a specific output directory name, use the project name
    # for it:
    self.output_dir = self.output_dir if self.output_dir else self.project_name

    # deal with the output directory:
    try:
        filesnpaths.is_file_exists(self.output_dir)
    except FilesNPathsError:
        filesnpaths.gen_output_directory(self.output_dir, delete_if_exists=self.overwrite_output_destinations)

    filesnpaths.is_output_dir_writable(self.output_dir)
    self.output_dir = os.path.abspath(self.output_dir)

    if not self.log_file_path:
        self.log_file_path = self.get_output_file_path('log.txt')

    filesnpaths.is_output_file_writable(self.log_file_path)
    os.remove(self.log_file_path) if os.path.exists(self.log_file_path) else None

    if not isinstance(self.minbit, float):
        raise ConfigError("minbit value must be of type float :(")

    if self.minbit < 0 or self.minbit > 1:
        raise ConfigError("Well. minbit must be between 0 and 1. Yes. Very boring.")

    if not isinstance(self.min_percent_identity, float):
        raise ConfigError("Minimum percent identity value must be of type float :(")

    if self.min_percent_identity < 0 or self.min_percent_identity > 100:
        raise ConfigError("Minimum percent identity must be between 0%% and 100%%. Although your %.2f%% is "
                          "pretty cute, too." % self.min_percent_identity)

    if len([c for c in list(self.genomes.values()) if 'genome_hash' not in c]):
        raise ConfigError("self.genomes does not seem to be a properly formatted dictionary for "
                          "the anvi'o class Pangenome.")

    if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
        raise ConfigError("You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering "
                          "while also asking it to enforce it.")

    if self.description_file_path:
        filesnpaths.is_file_plain_text(self.description_file_path)
        self.description = open(os.path.abspath(self.description_file_path), 'r').read()

    self.pan_db_path = self.get_output_file_path(self.project_name + '-PAN.db')
def check_params(self):
    # check the project name:
    if not self.project_name:
        raise ConfigError("Please set a project name, and be prepared to see it around, as anvi'o will use\
                           that name to set the output directory and to name various output files such as the\
                           databases that will be generated at the end of the process. If you set your own output\
                           directory name, you can have multiple projects in it and all of those projects can use\
                           the same intermediate files whenever possible.")

    utils.is_this_name_OK_for_database('pan project name', self.project_name, stringent=False)

    # if the user did not set a specific output directory name, use the project name
    # for it:
    self.output_dir = self.output_dir if self.output_dir else self.project_name

    # deal with the output directory:
    try:
        filesnpaths.is_file_exists(self.output_dir)
    except FilesNPathsError:
        filesnpaths.gen_output_directory(self.output_dir, delete_if_exists=self.overwrite_output_destinations)

    filesnpaths.is_output_dir_writable(self.output_dir)
    self.output_dir = os.path.abspath(self.output_dir)

    if not self.log_file_path:
        self.log_file_path = self.get_output_file_path('log.txt')

    filesnpaths.is_output_file_writable(self.log_file_path)
    os.remove(self.log_file_path) if os.path.exists(self.log_file_path) else None

    if not isinstance(self.maxbit, float):
        raise ConfigError("maxbit value must be of type float :(")

    if self.maxbit < 0 or self.maxbit > 1:
        raise ConfigError("Well. maxbit must be between 0 and 1. Yes. Very boring.")

    if not isinstance(self.min_percent_identity, float):
        raise ConfigError("Minimum percent identity value must be of type float :(")

    if self.min_percent_identity < 0 or self.min_percent_identity > 100:
        raise ConfigError("Minimum percent identity must be between 0%% and 100%%. Although your %.2f%% is\
                           pretty cute, too." % self.min_percent_identity)

    if len([c for c in list(self.genomes.values()) if 'genome_hash' not in c]):
        raise ConfigError("self.genomes does not seem to be a properly formatted dictionary for\
                           the anvi'o class Pangenome.")

    if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
        raise ConfigError("You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering\
                           while also asking it to enforce it.")

    if self.description_file_path:
        filesnpaths.is_file_plain_text(self.description_file_path)
        self.description = open(os.path.abspath(self.description_file_path), 'r').read()

    self.pan_db_path = self.get_output_file_path(self.project_name + '-PAN.db')
def sanity_check(self):
    filesnpaths.is_file_tab_delimited(self.metadata_file_path)

    if os.path.exists(self.output_directory_path):
        filesnpaths.is_output_dir_writable(self.output_directory_path)
    else:
        filesnpaths.gen_output_directory(self.output_directory_path)

    filesnpaths.is_output_file_writable(self.output_fasta_descriptor)
def store_dict_as_FASTA_file(d, output_file_path, wrap_from=200):
    filesnpaths.is_output_file_writable(output_file_path)
    output = open(output_file_path, 'w')

    for key in d:
        output.write('>%s\n' % key)
        output.write('%s\n' % textwrap.fill(d[key], wrap_from, break_on_hyphens=False))

    output.close()
    return True
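# A hedged usage sketch for store_dict_as_FASTA_file; the sequences are toy values.
sequences = {'contig_001': 'ATGCATGCATGC', 'contig_002': 'GGGTTTAAACCC'}
store_dict_as_FASTA_file(sequences, 'sequences.fa')
# sequences.fa gets a '>key' defline per entry, with sequence lines wrapped at 200 characters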
def sanity_check(self): """Basic sanity check for class inputs""" if self.profile_db_path is None and self.gene_coverages_data_file_path is None: raise ConfigError( "You must provide either a profile.db or a gene coverage matrix data file" ) if self.profile_db_path and self.gene_coverages_data_file_path: raise ConfigError( "You provided both a profile database and a gene coverage matrix data file, you \ must provide only one or the other (hint: if you have a profile database, the use that" ) # checking output file filesnpaths.is_output_file_writable(self.output_file_prefix + '-additional-layers.txt', ok_if_exists=False) # checking alpha if not isinstance(self.alpha, float): raise ConfigError("alpha value must be a type float.") if self.alpha <= 0 or self.alpha > 1: raise ConfigError( "alpha value must be greater than zero and a max of 1, the value you supplied %s" % self.alpha) # Checking beta if not isinstance(self.beta, float): raise ConfigError("beta value must be a type float.") if self.beta <= 0: raise ConfigError( "beta value must be greater than zero, the value you supplied %s" % self.beta) # Checking gamma if not isinstance(self.gamma, float): raise ConfigError("Gamma value must be a type float.") if self.gamma <= 0: raise ConfigError( "Gamma value must be greater than zero, the value you supplied %s" % self.gamma) # Checking eta if self.eta <= 0 or self.eta > 1: raise ConfigError( "eta value must be greater than zero and a max of 1, the value you supplied %s" % self.eta) if self.collection_name: if not self.profile_db_path: raise ConfigError( "You specified a collection name %s, but you provided a gene coverage matrix data file \ collections are only available when working with a profile database." % self.collection_name)
def __init__(self, args, r=terminal.Run(), p=terminal.Progress()):
    self.args = args
    self.run = r
    self.progress = p

    A = lambda x: args.__dict__[x] if x in args.__dict__ else None
    self.output_file_path = A("output_file") or 'NETWORK.json'
    filesnpaths.is_output_file_writable(self.output_file_path)

    AnvioPrograms.__init__(self, args, r=self.run, p=self.progress)
def report(self, output_file_path):
    filesnpaths.is_output_file_writable(output_file_path)

    output_file = open(output_file_path, 'w')

    entry_id = 0
    for d in self.data:
        entry_id += 1
        output_file.write('>%.9d|sample_id:%s|reverse:%s|contig_name:%s\n' % (entry_id, d.sample_id, d.reverse, d.contig_name))
        output_file.write('%s\n' % (d.sequence))

    output_file.close()
    self.run.info('output_file', output_file_path)
def train(self, features, data, labels, n_trees=64):
    self.run.info('RF Train', "%d observations with %d features grouped into %d classes." % (len(data), len(features), len(set(labels))))

    filesnpaths.is_output_file_writable(self.classifier_object_path)

    self.progress.new('Training')
    self.progress.update('...')
    rf = sklearn.ensemble.RandomForestClassifier(n_estimators=n_trees)
    rf.fit(np.array(data), labels)
    self.progress.end()

    pickle.dump({'features': features, 'classes': rf.classes_, 'classifier': rf}, open(self.classifier_object_path, 'wb'))

    self.run.info('Classifier output', self.classifier_object_path)
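# A hedged usage sketch for train(); it assumes the method lives on a small wrapper
# class whose constructor records self.classifier_object_path (anvi'o ships such a
# class as anvio.learning.RF, but that is an assumption here). Toy data only.
from anvio.learning import RF  # assumption: module path

classifier = RF('my-classifier.obj')  # assumption: constructor takes the output path
classifier.train(features=['gc_content', 'mean_coverage'],
                 data=[[0.42, 12.1], [0.55, 88.0], [0.61, 79.4]],
                 labels=['low', 'high', 'high'])
# writes a pickled dict with keys 'features', 'classes', and the fitted 'classifier'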
def store_hmm_sequences_into_FASTA(self, hmm_sequences_dict_for_splits, output_file_path, wrap=120, concatenate_genes=False, separator=None, genes_order=None, align_with=None):
    """Stores HMM sequences into a FASTA file."""
    filesnpaths.is_output_file_writable(output_file_path)

    if wrap and not isinstance(wrap, int):
        raise ConfigError('"wrap" has to be an integer instance')

    if concatenate_genes:
        self.__store_concatenated_hmm_sequences_into_FASTA(hmm_sequences_dict_for_splits, output_file_path, wrap, concatenate_genes, separator, genes_order, align_with)
    else:
        self.__store_individual_hmm_sequences_into_FASTA(hmm_sequences_dict_for_splits, output_file_path, wrap, concatenate_genes, separator, genes_order, align_with)
def store_hmm_sequences_into_FASTA(self, hmm_sequences_dict_for_splits, output_file_path, wrap=120, concatenate_genes=False, separator=None, genes_order=None, align_with=None):
    """Stores HMM sequences into a FASTA file."""
    filesnpaths.is_output_file_writable(output_file_path)

    if not isinstance(wrap, int):
        raise ConfigError('"wrap" has to be an integer instance')

    if concatenate_genes:
        self.__store_concatenated_hmm_sequences_into_FASTA(hmm_sequences_dict_for_splits, output_file_path, wrap, concatenate_genes, separator, genes_order, align_with)
    else:
        self.__store_individual_hmm_sequences_into_FASTA(hmm_sequences_dict_for_splits, output_file_path, wrap, concatenate_genes, separator, genes_order, align_with)
def process(self): """Processes a given FASTA file with tRNAScan-SE and reports all hits Parameters ========== N\A Returns ======= results: `dict` A Dictionary of hits """ filesnpaths.is_output_file_writable(self.output_file_path, ok_if_exists=False) command = [ self.program_name, self.fasta_file_path, '--score', self.cutoff_score, '-G', '-o', self.output_file_path, '--thread', self.num_threads ] self.run.warning( "Anvi'o will use 'tRNAScan-SE' by Chan and Lowe (doi:10.1007/978-1-4939-9173-0_1) to identify tRNA\ sequences in your data. When you publish your findings, please do not forget to properly credit\ their work.", lc='green', header="CITATION") self.progress.new('Running tRNAScan-SE') self.progress.update('...') exit_code = utils.run_command(command, self.log_file_path) self.progress.end() if exit_code: raise ConfigError( "tRNAScan-SE finished with a non-zero exit code, which indicates that something went\ wrong with it :/ Please check the log file to see learn more :/" ) d = self.parse_output() # CLEANUP if not self.keep_output_file and not anvio.DEBUG: os.remove(self.output_file_path) if not self.keep_log_file and not anvio.DEBUG: os.remove(self.log_file_path) return d
def report(self, output_file_path):
    filesnpaths.is_output_file_writable(output_file_path)

    output_file = open(output_file_path, 'w')
    output_file.write('\t'.join(['entry_id', 'contig_name', 'pos_in_contig', 'pos_in_read', 'base', 'read_id']) + '\n')

    entry_id = 0
    for d in self.linkmers.data:
        entry_id += 1
        output_file.write('%.5d\t%s\t%d\t%d\t%s\t%s\n' % (entry_id, d.contig_name, d.pos_in_contig,\
                                                          d.pos_in_read, d.base, d.read_hash))
    output_file.close()

    self.run.info('output_file', output_file_path)
def transpose_tab_delimited_file(input_file_path, output_file_path):
    filesnpaths.is_file_exists(input_file_path)
    filesnpaths.is_file_tab_delimited(input_file_path)
    filesnpaths.is_output_file_writable(output_file_path)

    file_content = [line.strip('\n').split('\t') for line in open(input_file_path).readlines()]

    output_file = open(output_file_path, 'w')
    for entry in zip(*file_content):
        output_file.write('\t'.join(entry) + '\n')
    output_file.close()

    return output_file_path
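# A hedged usage sketch for transpose_tab_delimited_file; file names are illustrative.
transpose_tab_delimited_file('matrix.txt', 'matrix-transposed.txt')
# rows of matrix.txt become the columns of matrix-transposed.txt, and vice versa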
def store_clusters_as_TAB_delimited_text(self, output_file_path):
    filesnpaths.is_output_file_writable(output_file_path)

    self.progress.new('Storing clusters as TAB-delimited file')
    self.progress.update('creating the clusters dictionary ...')
    clusters_dict = {}
    for contig_name in self.clusters:
        clusters_dict[contig_name] = {'concoct_bin': self.clusters[contig_name]}

    self.progress.update('writing the file ...')
    utils.store_dict_as_TAB_delimited_file(clusters_dict, output_file_path, ['contig', 'concoct_bin'])
    self.progress.end()

    self.run.info('CONCOCT results in txt', output_file_path, display_only=True)
def export_sequences_table_in_db_into_FASTA_file(self, table=t.contig_sequences_table_name, output_file_path=None):
    if self.db_type != 'contigs':
        return None

    if output_file_path:
        filesnpaths.is_output_file_writable(output_file_path)
    else:
        output_file_path = os.path.join(filesnpaths.get_temp_directory_path(), 'aa_sequences.fa')

    database = db.DB(self.db_path, self.version)

    if table not in database.get_table_names():
        raise ConfigError('Trying to export sequences into a FASTA file, but the table\
                           "%s" does not seem to be in this database :/' % (table))

    if 'sequence' not in database.get_table_structure(table):
        raise ConfigError("You requested to store sequences in table '%s' into a FASTA\
                           file, however this table does not seem to be a table that\
                           stores sequence information :(" % table)

    sequences_table = database.get_table_as_dict(table)
    database.disconnect()

    if not len(sequences_table):
        raise ConfigError("There are no sequences to report in table '%s'." % (table))

    self.progress.new('Exporting %d sequences into a FASTA file' % len(sequences_table))
    self.progress.update('...')

    sequences_fasta = u.FastaOutput(output_file_path)

    for seq_id in sequences_table:
        sequences_fasta.write_id(seq_id)
        sequences_fasta.write_seq(sequences_table[seq_id]['sequence'], split=False)

    self.progress.end()
    self.run.info('Sequences', '%d sequences reported.' % (len(sequences_table)))
    self.run.info('FASTA', output_file_path)

    return output_file_path
def create_newick_file_from_matrix_file(observation_matrix_path, output_file_name, linkage=constants.linkage_method_default,
                                        distance=constants.distance_metric_default, norm='l1', progress=progress, transpose=False):
    is_distance_and_linkage_compatible(distance, linkage)
    filesnpaths.is_file_exists(observation_matrix_path)
    filesnpaths.is_file_tab_delimited(observation_matrix_path)
    filesnpaths.is_output_file_writable(output_file_name)

    id_to_sample_dict, sample_to_id_dict, header, vectors = utils.get_vectors_from_TAB_delim_matrix(observation_matrix_path, transpose=transpose)

    vectors = np.array(vectors)

    newick = get_newick_from_matrix(vectors, distance, linkage, norm, id_to_sample_dict)

    if output_file_name:
        open(output_file_name, 'w').write(newick.strip() + '\n')
def store_hmm_sequences_into_FASTA(self, hmm_sequences_dict_for_splits, output_file_path, wrap=120):
    filesnpaths.is_output_file_writable(output_file_path)

    if not isinstance(wrap, int):
        raise ConfigError('"wrap" has to be an integer instance')

    f = open(output_file_path, 'w')

    for gene_unique_id in hmm_sequences_dict_for_splits:
        header, sequence = self.get_FASTA_header_and_sequence_for_gene_unique_id(hmm_sequences_dict_for_splits, gene_unique_id)

        if wrap:
            sequence = textwrap.fill(sequence, wrap, break_on_hyphens=False)

        f.write('>%s\n' % header)
        f.write('%s\n' % sequence)

    f.close()
def store_hmm_sequences_into_FASTA(self, hmm_sequences_dict_for_splits, output_file_path, wrap=200):
    filesnpaths.is_output_file_writable(output_file_path)

    if not isinstance(wrap, int):
        raise ConfigError('"wrap" has to be an integer instance')

    f = open(output_file_path, 'w')

    for gene_unique_id in hmm_sequences_dict_for_splits:
        header, sequence = self.get_FASTA_header_and_sequence_for_gene_unique_id(hmm_sequences_dict_for_splits, gene_unique_id)

        if wrap:
            sequence = textwrap.fill(sequence, wrap, break_on_hyphens=False)

        f.write('>%s\n' % header)
        f.write('%s\n' % sequence)

    f.close()
def concatenate_files(dest_file, file_list):
    if not dest_file:
        raise ConfigError("Destination cannot be empty.")
    if not len(file_list):
        raise ConfigError("File list cannot be empty.")
    for f in file_list:
        filesnpaths.is_file_exists(f)
    filesnpaths.is_output_file_writable(dest_file)

    dest_file_obj = open(dest_file, 'w')
    for chunk_path in file_list:
        for line in open(chunk_path):
            dest_file_obj.write(line)

    dest_file_obj.close()
    return dest_file
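# A hedged usage sketch for concatenate_files; the chunk paths are made up.
concatenate_files('all_reads.fa', ['chunk_01.fa', 'chunk_02.fa', 'chunk_03.fa'])
# all_reads.fa now contains the three chunks, concatenated in list order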
def sanity_check(self):
    bad_bam_files = []
    error_message = None
    for bam_file_path in self.input_bam_files:
        try:
            filesnpaths.is_file_exists(bam_file_path)
            pysam.Samfile(bam_file_path, 'rb')
        except ValueError as e:
            bad_bam_files.append(bam_file_path)
            error_message = e

    if len(bad_bam_files):
        raise ConfigError('Samtools is not happy with some of your bam files. The following\
                           file(s) do not look like proper BAM files [here is the actual\
                           error: "%s"]: %s.' % (error_message, ','.join(bad_bam_files)))

    if not self.output_file_path:
        self.output_file_path = 'short_reads.fa'

    filesnpaths.is_output_file_writable(self.output_file_path)
def report(self, output_file_path):
    filesnpaths.is_output_file_writable(output_file_path)

    output_file = open(output_file_path, 'w')
    output_file.write('\t'.join(['entry_id', 'sample_id', 'request_id', 'contig_name', 'pos_in_contig',\
                                 'pos_in_read', 'base', 'read_unique_id', 'read_X', 'reverse',\
                                 'sequence']) + '\n')

    entry_id = 0
    for contig_name, positions, data in self.linkmers.data:
        for d in data:
            entry_id += 1
            output_file.write('%.9d\t%s\t%.3d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\n'
                              % (entry_id, d.sample_id, d.request_id, d.contig_name,\
                                 d.pos_in_contig, d.pos_in_read, d.base, d.read_unique_id,\
                                 d.read_X, d.reverse, d.sequence))
    output_file.close()

    self.run.info('output_file', output_file_path)
def sanity_check(self):
    bad_bam_files = []
    error_message = None
    for bam_file_path in self.input_bam_files:
        try:
            bam_file_object = BAMFileObject(bam_file_path).get()
            bam_file_object.close()
        except ConfigError as e:
            bad_bam_files.append(bam_file_path)
            error_message = e

    if len(bad_bam_files):
        raise ConfigError('Samtools is not happy with some of your bam files. The following\
                           file(s) do not look like proper BAM files [here is the actual\
                           error: "%s"]: %s.' % (error_message, ','.join(bad_bam_files)))

    if not self.output_file_path:
        self.output_file_path = 'short_reads.fa'

    filesnpaths.is_output_file_writable(self.output_file_path)
def check_params(self):
    # deal with the output directory:
    try:
        filesnpaths.is_file_exists(self.output_dir)
    except FilesNPathsError:
        filesnpaths.gen_output_directory(self.output_dir, delete_if_exists=self.overwrite_output_destinations)

    filesnpaths.is_output_dir_writable(self.output_dir)
    self.output_dir = os.path.abspath(self.output_dir)

    if not self.log_file_path:
        self.log_file_path = self.get_output_file_path('log.txt')

    filesnpaths.is_output_file_writable(self.log_file_path)
    os.remove(self.log_file_path) if os.path.exists(self.log_file_path) else None

    if not isinstance(self.maxbit, float):
        raise ConfigError("maxbit value must be of type float :(")

    if self.maxbit < 0 or self.maxbit > 1:
        raise ConfigError("Well. maxbit must be between 0 and 1. Yes. Very boring.")

    if not isinstance(self.min_percent_identity, float):
        raise ConfigError("Minimum percent identity value must be of type float :(")

    if self.min_percent_identity < 0 or self.min_percent_identity > 100:
        raise ConfigError("Minimum percent identity must be between 0%% and 100%%. Although your %.2f%% is\
                           pretty cute, too." % self.min_percent_identity)

    if len([c for c in self.genomes.values() if 'contigs_db_path' not in c]):
        raise ConfigError("self.genomes does not seem to be a properly formatted dictionary for\
                           the anvi'o class Pangenome.")

    for genome_name in self.genomes:
        if not os.path.exists(self.genomes[genome_name]['contigs_db_path']):
            raise ConfigError("The contigs database for genome %s is not where the input data suggested\
                               it would be.." % genome_name)

        if genome_name in self.internal_genome_names and not os.path.exists(self.genomes[genome_name]['profile_db_path']):
            raise ConfigError("The profile database for genome %s is not where the input data suggested\
                               it would be.." % genome_name)
def export_sequences_table_in_db_into_FASTA_file(self, table=t.contig_sequences_table_name, output_file_path=None):
    if self.db_type != 'contigs':
        return None

    if output_file_path:
        filesnpaths.is_output_file_writable(output_file_path)
    else:
        output_file_path = os.path.join(filesnpaths.get_temp_directory_path(), 'sequences.fa')

    database = db.DB(self.db_path, self.version)

    if table not in database.get_table_names():
        raise ConfigError('Trying to export sequences into a FASTA file, but the table\
                           "%s" does not seem to be in this database :/' % (table))

    if 'sequence' not in database.get_table_structure(table):
        raise ConfigError("You requested to store sequences in table '%s' into a FASTA\
                           file, however this table does not seem to be a table that\
                           stores sequence information :(" % table)

    sequences_table = database.get_table_as_dict(table)
    database.disconnect()

    if not len(sequences_table):
        raise ConfigError("There are no sequences to report in table '%s'." % (table))

    self.progress.new('Exporting %d sequences into a FASTA file' % len(sequences_table))
    self.progress.update('...')

    sequences_fasta = u.FastaOutput(output_file_path)

    for seq_id in sequences_table:
        sequences_fasta.write_id(seq_id)
        sequences_fasta.write_seq(sequences_table[seq_id]['sequence'], split=False)

    self.progress.end()
    self.run.info('Sequences', '%d sequences reported.' % (len(sequences_table)))
    self.run.info('FASTA', output_file_path)

    return output_file_path
def sanity_check(self): """Basic sanity check for class inputs""" if self.output_file_prefix: filesnpaths.is_output_file_writable(self.output_file_prefix + '-additional-layers.txt', ok_if_exists=self.overwrite_output_destinations) try: if self.gen_figures: plot_dir = self.output_file_prefix + '-nucleotide-coverage-distribution-plots' os.makedirs(plot_dir, exist_ok=self.overwrite_output_destinations) except FileExistsError as e: raise FilesNPathsError("%s already exists, if you would like to overwrite it, then use -W (see help menu)." % plot_dir) # checking alpha if not isinstance(self.alpha, float): raise ConfigError("alpha value must be a type float.") # alpha must be a min of 0 and smaller than 0.5 if self.alpha < 0 or self.alpha >= 0.5: raise ConfigError("alpha must be a minimum of 0 and smaller than 0.5") if self.exclude_samples and self.include_samples: raise ConfigError("You cannot use both --include-samples and --exclude-samples! Please choose one.")
def _run(self):
    self.check_sge_binaries()

    if not self.binary:
        raise ConfigError('A binary has to be declared.')

    if not self.command:
        raise ConfigError('SGE module cannot run without a command.')

    if not self.tmp_dir:
        raise ConfigError('SGE module needs a tmp dir.')

    filesnpaths.is_file_exists(self.input_file_path)
    filesnpaths.is_output_file_writable(self.merged_results_file_path)

    self.run.info('temp_directory', self.tmp_dir)

    parts = self.split_input_file()

    old_workdir = os.getcwd()
    os.chdir(os.path.dirname(self.tmp_dir))

    self.clusterize(parts)

    if self.wild_card_for_partial_results:
        self.merge_partial_results()

    os.chdir(old_workdir)
def sanity_check(self):
    bad_bam_files = []
    error_message = None
    for bam_file_path in self.input_bam_files:
        try:
            bam_file_object = BAMFileObject(bam_file_path).get()
            bam_file_object.close()
        except ConfigError as e:
            bad_bam_files.append(bam_file_path)
            error_message = e

    if len(bad_bam_files):
        raise ConfigError('Samtools is not happy with some of your bam files. The following\
                           file(s) do not look like proper BAM files [here is the actual\
                           error: "%s"]: %s.' % (error_message, ','.join(bad_bam_files)))

    if self.output_file_prefix and self.output_file_path:
        raise ConfigError("You must either use the parameter output file name, or output file prefix.")

    if self.output_file_prefix and not self.split_R1_and_R2:
        raise ConfigError("Output file prefix parameter is only relevant when you want to split R1 reads\
                           from R2 reads and so on.")

    if self.split_R1_and_R2 and not self.output_file_prefix:
        raise ConfigError("If you wish R1 and R2 reads to be reported in separate FASTA files,\
                           you need to provide an output file prefix so anvi'o can generate\
                           multiple output files that start with it (i.e., PREFIX_R1.fa, PREFIX_R2.fa,\
                           PREFIX_UNPAIRED.fa).")

    if self.split_R1_and_R2:
        filesnpaths.is_output_file_writable(self.output_file_prefix + '_R1.fa')
    elif self.output_file_path:
        filesnpaths.is_output_file_writable(self.output_file_path)
    else:
        filesnpaths.is_output_file_writable('short_reads.fa')

    self.sanity_checked = True