def get_newick_tree_data(observation_matrix_path, output_file_name=None, clustering_distance='euclidean',
                         clustering_method='complete', norm='l1', progress=progress):
    filesnpaths.is_file_exists(observation_matrix_path)
    filesnpaths.is_file_tab_delimited(observation_matrix_path)

    if output_file_name:
        output_file_name = os.path.abspath(output_file_name)
        output_directory = os.path.dirname(output_file_name)
        if not os.access(output_directory, os.W_OK):
            raise ConfigError("You do not have write permission for the output directory: '%s'" % output_directory)

    id_to_sample_dict, sample_to_id_dict, header, vectors = utils.get_vectors_from_TAB_delim_matrix(observation_matrix_path)

    vectors = np.array(vectors)

    # normalize vectors:
    vectors = get_normalized_vectors(vectors, norm=norm, progress=progress)

    tree = get_clustering_as_tree(vectors, clustering_distance, clustering_method, progress)
    newick = get_tree_object_in_newick(tree, id_to_sample_dict)

    if output_file_name:
        open(output_file_name, 'w').write(newick.strip() + '\n')

    return newick
def get_internal_and_external_genomes_files(self):
    internal_genomes_file = self.get_param_value_from_config('internal_genomes')
    external_genomes_file = self.get_param_value_from_config('external_genomes')
    fasta_txt_file = self.get_param_value_from_config('fasta_txt', repress_default=True)

    if fasta_txt_file and not external_genomes_file:
        raise ConfigError('You provided a fasta_txt, but didn\'t specify a path for an external-genomes file. \
                           If you wish to use external genomes, you must specify a name for the external-genomes \
                           file, using the "external_genomes" parameter in your config file. Just to clarify: \
                           the external genomes file doesn\'t have to exist, since we will create it for you, \
                           by using the information you supplied in the "fasta_txt" file, but you must specify \
                           a name for the external-genomes file. For example, you could use "external_genomes": "external-genomes.txt", \
                           but feel free to be creative.')

    if not internal_genomes_file and not external_genomes_file:
        raise ConfigError('You must provide either an external genomes file or an internal genomes file')

    # here we do a little trick to make sure the rule can expect either one or both
    d = {"internal_genomes_file": external_genomes_file,
         "external_genomes_file": internal_genomes_file}

    if internal_genomes_file:
        filesnpaths.is_file_exists(internal_genomes_file)
        d['internal_genomes_file'] = internal_genomes_file

    if external_genomes_file:
        if not filesnpaths.is_file_exists(external_genomes_file, dont_raise=True):
            run.warning('There is no file %s. No worries, one will be created for you.' % external_genomes_file)
        d['external_genomes_file'] = external_genomes_file

    return d
def get_newick_tree_data(observation_matrix_path, output_file_name=None, linkage=constants.linkage_method_default,
                         distance=constants.distance_metric_default, norm='l1', progress=progress, transpose=False):
    is_distance_and_linkage_compatible(distance, linkage)
    filesnpaths.is_file_exists(observation_matrix_path)
    filesnpaths.is_file_tab_delimited(observation_matrix_path)

    if output_file_name:
        output_file_name = os.path.abspath(output_file_name)
        output_directory = os.path.dirname(output_file_name)
        if not os.access(output_directory, os.W_OK):
            raise ConfigError("You do not have write permission for the output directory: '%s'" % output_directory)

    id_to_sample_dict, sample_to_id_dict, header, vectors = utils.get_vectors_from_TAB_delim_matrix(observation_matrix_path, transpose=transpose)

    vectors = np.array(vectors)

    # normalize vectors:
    vectors = get_normalized_vectors(vectors, norm=norm, progress=progress)

    tree = get_clustering_as_tree(vectors, linkage, distance, progress)
    newick = get_tree_object_in_newick(tree, id_to_sample_dict)

    if output_file_name:
        open(output_file_name, 'w').write(newick.strip() + '\n')

    return newick
def populate_collections_dict(self, db_path):
    filesnpaths.is_file_exists(db_path)
    self.db_path = db_path

    database = db.DB(db_path, utils.get_required_version_for_db(db_path))
    self.db_type = database.get_meta_value('db_type')
    collections_info_table = database.get_table_as_dict(t.collections_info_table_name)
    database.disconnect()

    # collections info must be read only if it is coming from the contigs database.
    if self.db_type == 'contigs':
        read_only = True
    elif self.db_type == 'profile':
        read_only = False
    elif self.db_type == 'pan':
        read_only = False
    else:
        raise ConfigError('Collections class does not know about this "%s" database type :/' % self.db_type)

    for collection_name in collections_info_table:
        self.collections_dict[collection_name] = collections_info_table[collection_name]
        self.collections_dict[collection_name]['read_only'] = read_only
        self.collections_dict[collection_name]['source_db_path'] = db_path
        self.collections_dict[collection_name]['source_db_version'] = utils.get_required_version_for_db(db_path)
def __init__(self, input_bam_path, run=run, progress=progress):
    self.run = run
    self.progress = progress

    self.input_bam_path = input_bam_path
    filesnpaths.is_file_exists(input_bam_path)
def load_collections(self): ''' Load the collections_txt file, run some sanity checks, and figure out params for anvi_import_collection''' collections = u.get_TAB_delimited_file_as_dictionary(self.collections_txt) bad_groups = [g for g in collections if g not in self.group_names] if bad_groups: raise ConfigError('Some of the names in your collection_txt \ file ("%s") don\'t match the names of the \ groups in your samples_txt/fasta_txt. \ Here are the names that don\'t match: %s. \ And here are the group names we expect to find: \ %s' % (self.collections_txt, ', '.join(bad_groups), ', '.join(self.group_names))) for group in collections: filesnpaths.is_file_exists(collections[group]['collection_file']) if not collections[group]['collection_name']: raise ConfigError('You must specify a name for each collection in your collections_txt') u.check_collection_name(collections[group]['collection_name']) if collections[group].get('bins_info'): filesnpaths.is_file_exists(collections[group]['bins_info']) collections[group]['bins_info'] = '--bins-info %s' % collections[group]['bins_info'] else: collections[group]['bins_info'] = '' if collections[group].get('contigs_mode'): collections[group]['contigs_mode'] = '--contigs-mode' else: collections[group]['contigs_mode'] = '' self.collections = collections
def fix_input_file(self, input_file_path):
    """This is sadly necessary because Kaiju output contains either three, or eight
       TAB-delimited columns ... not very parser friendly"""
    filesnpaths.is_file_exists(input_file_path)

    self.progress.new('Fixing the broken kaiju output')
    self.progress.update('...')

    corrected_temp_file_path = filesnpaths.get_temp_file_path()
    corrected_temp_file = open(corrected_temp_file_path, 'w')
    input_file = open(input_file_path, 'rU')

    num_correct_lines = 0
    for line in input_file.readlines():
        if len(line.split('\t')) == 8:
            corrected_temp_file.write(line)
            num_correct_lines += 1

    corrected_temp_file.close()

    self.progress.end()

    if not num_correct_lines:
        os.remove(corrected_temp_file_path)
        raise ConfigError("Something must have been wrong with your input file. Not a single line in it\
                           matched to what the kaiju parser expects (a proper file should have eight\
                           TAB-delimited columns).")

    return corrected_temp_file_path
def __init__(self, file_path, unique_hash, create_new=False, ignore_hash=False):
    self.file_path = file_path

    if create_new:
        if ignore_hash:
            raise HDF5Error("When creating a new database, you can't use the 'ignore_hash'\
                             parameter.")

        if not unique_hash:
            raise HDF5Error("When creating a new database, the 'unique_hash' cannot be None")

        self.fp = h5py.File(self.file_path, 'w')
        self.fp.attrs['hash'] = unique_hash
        self.fp.attrs['version'] = anvio.__hdf5__version__
    else:
        filesnpaths.is_file_exists(self.file_path)

        self.fp = h5py.File(self.file_path, 'r')

        if [h not in self.fp.attrs for h in ['hash', 'version']].count(True):
            raise HDF5Error("The database at '%s' is missing one or more essential headers that\
                             should appear in every anvi'o generated HDF5 file. Sorry!" % self.file_path)

        if self.fp.attrs['version'] != anvio.__hdf5__version__:
            raise HDF5Error("The database at '%s' is at version '%s', however your client is at\
                             version '%s'. Bad news." % (self.file_path, self.fp.attrs['version'], anvio.__hdf5__version__))

        if not ignore_hash and self.fp.attrs['hash'] != unique_hash:
            raise HDF5Error("The database at '%s' does not have the hash the client requested." % self.file_path)
def __init__(self, args = None): self.args = args self.input_file_path = None self.contigs_and_positions = {} self.progress = terminal.Progress() self.run = terminal.Run(width=35) if args: filesnpaths.is_file_exists(args.input_file) self.input_file_path = args.input_file if args.list_contigs: self.list_contigs() sys.exit() filesnpaths.is_file_exists(args.contigs_and_positions) filesnpaths.is_file_tab_delimited(args.contigs_and_positions, expected_number_of_fields = 2) f = open(args.contigs_and_positions) for line in f.readlines(): contig_name, positions = line.split('\t') try: positions = [int(pos) for pos in positions.split(',')] except ValueError: raise ConfigError, 'Positions for contig "%s" does not seem to be comma-separated integers...' % contig_name self.contigs_and_positions[contig_name] = set(positions) self.bam = None self.linkmers = None
def unique_FASTA_file(input_file_path, output_fasta_path=None, names_file_path=None, store_frequencies_in_deflines=True):
    filesnpaths.is_file_exists(input_file_path)

    if not output_fasta_path:
        output_fasta_path = input_file_path + '.unique'

    if not names_file_path:
        names_file_path = output_fasta_path + '.names'

    if output_fasta_path == names_file_path:
        raise ConfigError("I can't unique this. Output FASTA file path can't be identical to\
                           the names file path...")

    if output_fasta_path == input_file_path or names_file_path == input_file_path:
        raise ConfigError("Anvi'o will not unique this. Output FASTA path and names file path should\
                           be different from the input file path...")

    filesnpaths.is_output_file_writable(output_fasta_path)
    filesnpaths.is_output_file_writable(names_file_path)

    input_fasta = u.SequenceSource(input_file_path, unique=True)
    output_fasta = u.FastaOutput(output_fasta_path)
    names_file = open(names_file_path, 'w')

    names_dict = {}
    while input_fasta.next():
        output_fasta.store(input_fasta, split=False, store_frequencies=store_frequencies_in_deflines)
        names_file.write('%s\t%s\n' % (input_fasta.id, ','.join(input_fasta.ids)))

        names_dict[input_fasta.id] = input_fasta.ids

    return output_fasta_path, names_file_path, names_dict
def check_params(self): # deal with the output directory: try: filesnpaths.is_file_exists(self.output_dir) except FilesNPathsError: filesnpaths.gen_output_directory(self.output_dir, delete_if_exists = self.overwrite_output_destinations) filesnpaths.is_output_dir_writable(self.output_dir) self.output_dir = os.path.abspath(self.output_dir) if type(self.min_percent_identity) != float: raise ConfigError, "Minimum percent identity value must be of type float :(" if self.min_percent_identity < 20 or self.min_percent_identity > 100: raise ConfigError, "Minimum percent identity must be between 20%% and 100%%. Although your %.2f%% is\ pretty cute, too." % self.min_percent_identity if len([c for c in self.genomes.values() if 'contigs_db_path' not in c]): raise ConfigError, "self.genomes does not seem to be a properly formatted dictionary for\ the anvi'o class Pangenome." for genome_name in self.genomes: if not os.path.exists(self.genomes[genome_name]['contigs_db_path']): raise ConfigError, "The contigs database for genome %s is not where the input data suggested where\ it would be.." % genome_name if genome_name in self.internal_genome_names and not os.path.exists(self.genomes[genome_name]['profile_db_path']): raise ConfigError, "The profile database for genome %s is not where the input data suggested where\ it would be.." % genome_name
def __init__(self, db_path, client_version, new_database=False, ignore_version=False): self.db_path = db_path self.version = None if new_database: filesnpaths.is_output_file_writable(db_path) else: filesnpaths.is_file_exists(db_path) if new_database and os.path.exists(self.db_path): os.remove(self.db_path) self.check_if_db_writable() self.conn = sqlite3.connect(self.db_path) self.conn.text_factory = str self.cursor = self.conn.cursor() if new_database: self.create_self() self.set_version(client_version) else: self.version = self.get_version() if str(self.version) != str(client_version) and not ignore_version: if int(self.version) > int(client_version): raise ConfigError("Bad news of the day: the database at %s was generated with an anvi'o version that is 'newer' than\ the one you are actively using right now. We know, you hate to hear this, but you need to upgrade\ your anvi'o :(" % self.db_path) else: raise ConfigError("The database at '%s' is outdated (its version is v%s, but your anvi'o installation only knows how to\ deal with v%s). You can migrate your database without losing any data using the program `anvi-migrate-db`."\ % (self.db_path, self.version, client_version))
def __init__(self, db_path, client_version, new_database=False, ignore_version=False): self.db_path = db_path self.version = None if new_database: filesnpaths.is_output_file_writable(db_path) else: filesnpaths.is_file_exists(db_path) if new_database and os.path.exists(self.db_path): os.remove(self.db_path) self.conn = sqlite3.connect(self.db_path) self.conn.text_factory = str self.cursor = self.conn.cursor() if new_database: self.create_self() self.set_version(client_version) else: self.version = self.get_version() if str(self.version) != str(client_version) and not ignore_version: raise ConfigError, "It seems the database '%s' was generated when your client was at version %s,\ however, your client now is at version %s. Which means this database file\ cannot be used with this client anymore and needs to be upgraded to the\ version %s :/"\ % (self.db_path, self.version, client_version, client_version)
def get_vectors_from_TAB_delim_matrix(file_path, cols_to_return=None, rows_to_return=[], transpose=False): filesnpaths.is_file_exists(file_path) filesnpaths.is_file_tab_delimited(file_path) if transpose: transposed_file_path = filesnpaths.get_temp_file_path() transpose_tab_delimited_file(file_path, transposed_file_path) file_path = transposed_file_path rows_to_return = set(rows_to_return) vectors = [] id_to_sample_dict = {} sample_to_id_dict = {} input_matrix = open(file_path) columns = input_matrix.readline().strip().split("\t")[1:] fields_of_interest = [] if cols_to_return: fields_of_interest = [columns.index(col) for col in cols_to_return] else: fields_of_interest = [f for f in range(0, len(columns)) if IS_ESSENTIAL_FIELD(columns[f])] # update columns: columns = [columns[i] for i in fields_of_interest] if not len(columns): raise ConfigError, "Only a subset (%d) of fields were requested by the caller, but none of them was found\ in the matrix (%s) :/" % ( len(cols_to_return), file_path, ) id_counter = 0 for line in input_matrix.readlines(): row_name = line.strip().split("\t")[0] if rows_to_return and row_name not in rows_to_return: continue id_to_sample_dict[id_counter] = row_name fields = line.strip().split("\t")[1:] if fields_of_interest: vector = [float(fields[i]) for i in fields_of_interest] else: vector = [float(f) for f in fields] vectors.append(vector) id_counter += 1 input_matrix.close() if transpose: # remove clutter os.remove(file_path) sample_to_id_dict = dict([(v, k) for k, v in id_to_sample_dict.iteritems()]) return id_to_sample_dict, sample_to_id_dict, columns, vectors
def __init__(self, args): # we will fill this in and return it self.split_names_of_interest = set([]) self.bins = None A = lambda x: args.__dict__[x] if x in args.__dict__ else None self.bin_ids_file_path = A('bin_ids_file') self.bin_ids_list = A('bin_ids_list') self.bin_id = A('bin_id') self.collection_name = A('collection_name') self.contigs_db_path = A('contigs_db') self.profile_db_path = A('profile_db') self.debug = anvio.DEBUG if not self.profile_db_path: raise ConfigError("You didn't provide a profile database path. When you clearly should have :/\ This is GetSplitNamesInBins speaking. Has her eyes on you.") if self.bin_ids_file_path and self.bin_id: raise ConfigError('Either use a file to list all the bin ids (-B), or declare a single bin (-b)\ you would like to focus. Not both :/') if not self.collection_name: raise ConfigError('This will not work without a collection ID for your bins :/') if self.bin_ids_file_path: filesnpaths.is_file_exists(self.bin_ids_file_path) self.bins = set([b.strip() for b in open(self.bin_ids_file_path).readlines()]) elif self.bin_id: self.bins = set([self.bin_id]) self.collections = Collections() self.collections.populate_collections_dict(self.profile_db_path) if self.collection_name not in self.collections.collections_dict: raise ConfigError('The collection id "%s" does not seem to be in the profile database. These are the\ collections that are available through this profile database: "%s".'\ % (self.collection_name, ', '.join(self.collections.collections_dict))) self.collection_dict = self.collections.get_collection_dict(self.collection_name) bins_in_collection = list(self.collection_dict.keys()) if not self.bins: self.bins = bins_in_collection else: bins_that_do_not_exist_in_collection = [b for b in self.bins if b not in bins_in_collection] if len(bins_that_do_not_exist_in_collection): some_bins_that_exist_in_collection = bins_in_collection if len(bins_in_collection) < 30 else bins_in_collection[:30] raise ConfigError('Some of the bins you requested do not appear to have been described in the collection\ "%s". Here is a list of bins that are missing: "%s". Here is a list of some bins in\ your collection: "%s"' % (self.collection_name, ', '.join(bins_that_do_not_exist_in_collection), ', '.join(some_bins_that_exist_in_collection))) if not len(self.bins): raise ConfigError('There is no bin to work with :/')
def process(self, aa_sequences_file_path=None): if self.search_with not in self.available_search_methods: raise ConfigError("Let us start by making it clear that we probably like '%s' as much as you do, but it doesn't\ seem to be available on your system OR recognized by the COGs class since anvi'o couldn't\ find it among the available search methods. You probably need to try something else :/" \ % self.search_with) if self.search_with not in self.available_db_search_program_targets: raise ConfigError("Anvi'o understands that you want to use '%s' to search for COGs, however, there is no\ database formatted under the COGs data directory for that program :/ You may need to\ re-run the COGs setup, UNLESS, you set up your COG data directory somewhere else than what\ anvi'o attempts to use at the moment ('%s'). If that is the case, this may be the best\ time to point the right directory using the --cog-data-dir parameter, or the environmental\ variable 'ANVIO_COG_DATA_DIR'." % (self.search_with, self.COG_data_dir)) if not aa_sequences_file_path and not self.contigs_db_path: raise ConfigError("You either need to provide an anvi'o contigs database path, or a FASTA file for AA\ sequences") if aa_sequences_file_path and self.contigs_db_path: raise ConfigError("You can't provide both an AA sequences file and a contigs database. Choose one!") if self.contigs_db_path: utils.is_contigs_db(self.contigs_db_path) if not self.temp_dir_path: self.temp_dir_path = filesnpaths.get_temp_directory_path() self.remove_temp_dir_path = True else: filesnpaths.is_file_exists(self.temp_dir_path) filesnpaths.is_output_dir_writable(self.temp_dir_path) self.run.warning("Because you set the temporary directory path by hand, anvi'o will not remove its content\ when it is done. But she certainly hopes that you will clean those files later.") self.remove_temp_dir_path = False self.run.info('COG data directory', self.COG_data_dir) self.run.info('Searching with', self.search_with) self.run.info('Directory to store temporary files', self.temp_dir_path) self.run.info('Directory will be removed after the run', self.remove_temp_dir_path) if not aa_sequences_file_path: aa_sequences_file_path = dbops.export_aa_sequences_from_contigs_db(self.contigs_db_path, J(self.temp_dir_path, 'aa_sequences.fa')) # do the search search_results_tabular = self.search_methods_factory[self.search_with](aa_sequences_file_path) # convert the output to a hits dict self.hits = utils.get_BLAST_tabular_output_as_dict(search_results_tabular, target_id_parser_func=lambda x: x.split('|')[1]) # store hits into the contigs database self.store_hits_into_contigs_db() if self.remove_temp_dir_path: shutil.rmtree(self.temp_dir_path)
def get_GC_content_for_FASTA_entries(file_path):
    filesnpaths.is_file_exists(file_path)
    filesnpaths.is_file_fasta_formatted(file_path)

    GC_content_dict = {}

    fasta = u.SequenceSource(file_path)
    while fasta.next():
        GC_content_dict[fasta.id] = get_GC_content_for_sequence(fasta.seq)

    return GC_content_dict
def get_FASTA_file_as_dictionary(file_path):
    filesnpaths.is_file_exists(file_path)
    filesnpaths.is_file_fasta_formatted(file_path)

    d = {}

    fasta = u.SequenceSource(file_path)
    while fasta.next():
        d[fasta.id] = fasta.seq

    return d
def __init__(self, db_path, contigs_fasta=None, run=terminal.Run(), progress=terminal.Progress(), debug=False):
    self.run = run
    self.progress = progress

    self.db_path = db_path
    self.contigs_fasta = contigs_fasta
    self.debug = debug

    utils.is_contigs_db(self.db_path)

    if self.contigs_fasta:
        filesnpaths.is_file_exists(self.contigs_fasta)
        filesnpaths.is_file_fasta_formatted(self.contigs_fasta)
def transpose_tab_delimited_file(input_file_path, output_file_path):
    filesnpaths.is_file_exists(input_file_path)
    filesnpaths.is_file_tab_delimited(input_file_path)
    filesnpaths.is_output_file_writable(output_file_path)

    file_content = [line.strip('\n').split('\t') for line in open(input_file_path).readlines()]

    output_file = open(output_file_path, 'w')
    for entry in zip(*file_content):
        output_file.write('\t'.join(entry) + '\n')
    output_file.close()

    return output_file_path
def __init__(self, args): # we will fill this in and return it self.split_names_of_interest = set([]) A = lambda x: args.__dict__[x] if args.__dict__.has_key(x) else None self.bin_ids_file_path = A('bin_ids_file') self.bin_id = A('bin_id') self.collection_id = A('collection_id') self.annotation_db_path = A('annotation_db') self.profile_db_path = A('profile_db') self.debug = A('debug') if self.bin_ids_file_path and self.bin_id: raise ConfigError, 'Either use a file to list all the bin ids (-B), or declare a single bin (-b)\ you would like to focus. Not both :/' if (not self.bin_ids_file_path) and (not self.bin_id): raise ConfigError, "You must either use a file to list all the bin ids (-B) you would like to\ focus on, or declare a single bin id (-b) from your collection. You have\ not really given anvi'o anything to work with." if not self.collection_id: raise ConfigError, 'This will not work without a collection ID for your bins :/' if self.bin_ids_file_path: filesnpaths.is_file_exists(self.bin_ids_file_path) self.bins = set([b.strip() for b in open(self.bin_ids_file_path).readlines()]) if self.bin_id: self.bins = set([self.bin_id]) if not len(self.bins): raise ConfigError, 'There is no bin to work with :/' self.collections = Collections() self.collections.populate_sources_dict(self.profile_db_path, anvio.__profile__version__) if self.collection_id not in self.collections.sources_dict: raise ConfigError, 'The collection id "%s" does not seem to be in the profile database. These are the\ collections that are available through this profile database: %s.'\ % (self.collection_id, ', '.join(self.collections.sources_dict)) self.collection_dict = self.collections.get_collection_dict(self.collection_id) bins_in_collection = self.collection_dict.keys() bins_that_does_not_exist_in_collection = [b for b in self.bins if b not in bins_in_collection] if len(bins_that_does_not_exist_in_collection): raise ConfigError, 'Some of the bins you requested does not appear to have been described in the collection\ "%s". Here is a list of bins that are missing: %s'\ % (self.collection_id, ', '.join(bins_that_does_not_exist_in_collection))
def create_newick_file_from_matrix_file(observation_matrix_path, output_file_name, linkage=constants.linkage_method_default,
                                        distance=constants.distance_metric_default, norm='l1', progress=progress, transpose=False):
    is_distance_and_linkage_compatible(distance, linkage)
    filesnpaths.is_file_exists(observation_matrix_path)
    filesnpaths.is_file_tab_delimited(observation_matrix_path)
    filesnpaths.is_output_file_writable(output_file_name)

    id_to_sample_dict, sample_to_id_dict, header, vectors = utils.get_vectors_from_TAB_delim_matrix(observation_matrix_path, transpose=transpose)

    vectors = np.array(vectors)

    newick = get_newick_from_matrix(vectors, distance, linkage, norm, id_to_sample_dict)

    if output_file_name:
        open(output_file_name, 'w').write(newick.strip() + '\n')
def initialize_classifier(self):
    filesnpaths.is_file_exists(self.classifier_object_path)

    classifier_obj = cPickle.load(open(self.classifier_object_path))

    try:
        self.features = classifier_obj['features']
        self.classes = classifier_obj['classes']
        self.classifier = classifier_obj['classifier']
    except:
        raise ConfigError("RF class does not like the classifier object it was sent for processing :/ Are you sure you\
                           generated it the way you were supposed to?")

    self.classifier_initialized = True

    self.run.info('Classifier', "Initialized with %d features grouped into %d classes." % (len(self.features), len(self.classes)))
def concatenate_files(dest_file, file_list):
    if not dest_file:
        raise ConfigError("Destination cannot be empty.")
    if not len(file_list):
        raise ConfigError("File list cannot be empty.")
    for f in file_list:
        filesnpaths.is_file_exists(f)
    filesnpaths.is_output_file_writable(dest_file)

    dest_file_obj = open(dest_file, 'w')
    for chunk_path in file_list:
        for line in open(chunk_path):
            dest_file_obj.write(line)

    dest_file_obj.close()
    return dest_file
def populate_annotations_dict(self, annotations_file_path):
    filesnpaths.is_file_exists(annotations_file_path)

    num_entries_processed = 0
    self.progress.new('Parsing the annotations file')
    for line in open(annotations_file_path, 'rU').readlines():
        if line.startswith('#') or line == '\n':
            continue

        self.parser(line)
        num_entries_processed += 1

        if num_entries_processed % 100 == 0:
            self.progress.update('%d ...' % num_entries_processed)

    self.progress.end()
def merge_split_coverage_data(self): output_file_path = os.path.join(self.output_directory, 'AUXILIARY-DATA.db') merged_split_coverage_values = auxiliarydataops.AuxiliaryDataForSplitCoverages(output_file_path, self.contigs_db_hash, create_new=True) AUX = lambda x: os.path.join(os.path.dirname(x), 'AUXILIARY-DATA.db') if False in [filesnpaths.is_file_exists(AUX(p), dont_raise=True) for p in self.profile_dbs_info_dict]: self.run.warning("Some of your single profile databases to be merged are missing auxiliary data files associated with them. Did you\ download them from somewhere and forgot to download the AUXILIARY-DATA.db files? Well. That's fine. Anvi'o will\ continue merging your profiles without split coverages (which means you will not be able to inspect nucleotide\ level coverage values and some other bells and whistles). If you want, you can kill this process now with CTRL+C\ and redo it with all database files in proper places.") return None self.progress.new('Merging split coverage data') # fill coverages in from all samples for input_profile_db_path in self.profile_dbs_info_dict: self.progress.update(input_profile_db_path) sample_split_coverage_values = auxiliarydataops.AuxiliaryDataForSplitCoverages(AUX(input_profile_db_path), self.contigs_db_hash) for split_name in self.split_names: coverages_dict = sample_split_coverage_values.get(split_name) for sample_name in coverages_dict: merged_split_coverage_values.append(split_name, sample_name, coverages_dict[sample_name]) sample_split_coverage_values.close() merged_split_coverage_values.store() merged_split_coverage_values.close() self.progress.end()
def check_params(self): # if the user did not set a specific output directory name, use the project name # for it: self.output_dir = self.output_dir if self.output_dir else self.project_name # deal with the output directory: try: filesnpaths.is_file_exists(self.output_dir) except FilesNPathsError: filesnpaths.gen_output_directory(self.output_dir, delete_if_exists=self.overwrite_output_destinations) filesnpaths.is_output_dir_writable(self.output_dir) self.output_dir = os.path.abspath(self.output_dir) if not self.log_file_path: self.log_file_path = self.get_output_file_path('log.txt') filesnpaths.is_output_file_writable(self.log_file_path) os.remove(self.log_file_path) if os.path.exists(self.log_file_path) else None if not isinstance(self.minbit, float): raise ConfigError("minbit value must be of type float :(") if self.minbit < 0 or self.minbit > 1: raise ConfigError("Well. minbit must be between 0 and 1. Yes. Very boring.") if not isinstance(self.min_percent_identity, float): raise ConfigError("Minimum percent identity value must be of type float :(") if self.min_percent_identity < 0 or self.min_percent_identity > 100: raise ConfigError("Minimum percent identity must be between 0%% and 100%%. Although your %.2f%% is\ pretty cute, too." % self.min_percent_identity) if len([c for c in list(self.genomes.values()) if 'genome_hash' not in c]): raise ConfigError("self.genomes does not seem to be a properly formatted dictionary for\ the anvi'o class Pangenome.") if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering: raise ConfigError("You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering\ while also asking it to enforce it.") if self.description_file_path: filesnpaths.is_file_plain_text(self.description_file_path) self.description = open(os.path.abspath(self.description_file_path), 'rU').read() self.pan_db_path = self.get_output_file_path(self.project_name + '-PAN.db')
def is_executable_a_MODELLER_program(self): # temp_dir created because log file outputs to wherever fasta_to_pir.py is temp_dir = filesnpaths.get_temp_directory_path() self.copy_script_to_directory('fasta_to_pir.py', add_to_scripts_dict=False, directory=temp_dir) test_script = J(temp_dir, 'fasta_to_pir.py') test_input = os.path.abspath(J(os.path.dirname(anvio.__file__), '../tests/sandbox/mock_data_for_structure/proteins.fa')) test_output = J(temp_dir, 'test_out') command = [self.executable, test_script, test_input, test_output] # try and execute the command process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) output, error = process.communicate() if process.returncode: # modeller has failed error = error.decode('utf-8').strip() is_licence_key_error = True if error.find('Invalid license key') > -1 else False if is_licence_key_error: # its a valid modeller program with no license key license_target_file = error.split('\n')[-1] raise ConfigError("You're making progress and anvi'o is proud of you! You just need to validate your MODELLER\ with a license key (it's free). Please go to https://salilab.org/modeller/registration.html\ to register for a new license. After you receive an e-mail with your key, please open '%s'\ and replace the characters XXXXX with your own key. Save the file and try again. " % license_target_file) else: error = "\n" + "\n".join(error.split('\n')) print(terminal.c(error, color='red')) raise ConfigError("The executable you requested is called `%s`, but anvi'o doesn't agree with you that\ it is a working MODELLER program. That was determined by running the command `%s`, which raised the\ error seen above. If you want to specify a specific MODELLER program, you can specify it with\ `--modeller-executable`." % (self.executable, " ".join(command))) # no error was raised. now check if output file exists try: filesnpaths.is_file_exists(test_output) except FilesNPathsError: raise ConfigError("The executable you requested is called `%s`, but anvi'o doesn't agree with you that\ it is a working MODELLER program. That was determined by running the command `%s`, which did not\ output the file expected. If you want to specify a specific MODELLER program, you can specify it with\ `--modeller-executable`." % (self.executable, " ".join(command)))
def load_collections(self):
    ''' Load the collections_txt file, run some sanity checks, and figure out params for anvi_import_collection'''
    collections = u.get_TAB_delimited_file_as_dictionary(self.collections_txt)
    bad_groups = [g for g in collections if g not in self.group_names]
    if bad_groups:
        raise ConfigError('Some of the names in your collections_txt \
                           file ("%s") don\'t match the names of the \
                           groups in your samples_txt/fasta_txt. \
                           Here are the names that don\'t match: %s. \
                           And here are the group names we expect to find: \
                           %s' % (self.collections_txt, ', '.join(bad_groups), ', '.join(self.group_names)))

    for group in collections:
        default_collection = collections[group].get('default_collection')

        if default_collection:
            # User can specify either a default collection OR a collection from a file
            not_allowed_params = {'collection_name', 'collection_file', 'bins_info', 'contigs_mode'}
            if any([collections[group][key] for key in not_allowed_params if key in collections[group].keys()]):
                raise ConfigError('We encountered the following problem with your \
                                   collections_txt file ("%s"): you can choose \
                                   either using a default collection OR importing \
                                   a collection from a file. Yet, for "%s", you specify \
                                   a default collection AND also specify some of the following \
                                   parameters: %s.' % (self.collections_txt, group, ", ".join(not_allowed_params)))

            collections[group]['collection_name'] = 'DEFAULT'
            collections[group]['contigs_mode'] = ''

        else:
            if not filesnpaths.is_file_exists(collections[group]['collection_file'], dont_raise=True):
                raise ConfigError('We encountered the following problem with your \
                                   collections_txt file ("%s"): you did not specify \
                                   a valid collection file for "%s".' % (self.collections_txt, group))

            if not collections[group]['collection_name']:
                raise ConfigError('You must specify a name for each collection in your collections_txt')

            u.check_collection_name(collections[group]['collection_name'])

            if collections[group].get('bins_info'):
                filesnpaths.is_file_exists(collections[group]['bins_info'])
                collections[group]['bins_info'] = '--bins-info %s' % collections[group]['bins_info']
            else:
                collections[group]['bins_info'] = ''

            if collections[group].get('contigs_mode'):
                collections[group]['contigs_mode'] = '--contigs-mode'
            else:
                collections[group]['contigs_mode'] = ''

    self.collections = collections
def sanity_check(self):
    bad_bam_files = []
    # keep the last error around so it can be reported after the loop (the exception
    # variable is not accessible outside of the `except` block)
    error_message = None
    for bam_file_path in self.input_bam_files:
        try:
            filesnpaths.is_file_exists(bam_file_path)
            pysam.Samfile(bam_file_path, 'rb')
        except ValueError as e:
            bad_bam_files.append(bam_file_path)
            error_message = e

    if len(bad_bam_files):
        raise ConfigError('Samtools is not happy with some of your bam files. The following\
                           file(s) do not look like proper BAM files [here is the actual\
                           error: "%s"]: %s.' % (error_message, ','.join(bad_bam_files)))

    if not self.output_file_path:
        self.output_file_path = 'short_reads.fa'

    filesnpaths.is_output_file_writable(self.output_file_path)
def init_refereces_txt(self): if self.references_mode and not filesnpaths.is_file_exists(self.fasta_txt_file, dont_raise=True): raise ConfigError('In references mode you must supply a fasta_txt file.') if not self.references_mode: # if it is reference mode then the group names have been assigned in the contigs Snakefile # if it is not reference mode and no groups are supplied in the samples_txt then group names are sample names self.group_names = self.sample_names if self.fasta_txt_file and not self.references_mode: raise ConfigError("In order to use reference fasta files you must set " "\"'references_mode': true\" in your config file, yet " "you didn't, but at the same time you supplied the following " "fasta_txt: %s. So we don't know what to do with this " "fasta_txt" % self.fasta_txt_file) # Collecting information regarding groups. if "group" in self.samples_information.columns: # if groups were specified then members of a groups will be co-assembled. self.group_names = list(self.samples_information['group'].unique()) # creating a dictionary with groups as keys and number of samples in # the groups as values self.group_sizes = self.samples_information['group'].value_counts().to_dict() if self.references_mode: # sanity check to see that groups specified in samples.txt match # the names of fasta. mismatch = set(self.group_names) - set(self.contigs_information.keys()) if mismatch: raise ConfigError("Group names specified in the samples.txt " "file must match the names of fasta " "in the fasta.txt file. These are the " "mismatches: %s" % mismatch) groups_in_contigs_information_but_not_in_samples_txt = set(self.contigs_information.keys()) - set(self.group_names) if groups_in_contigs_information_but_not_in_samples_txt: run.warning('The following group names appear in your fasta_txt ' 'but do not appear in your samples_txt. Maybe this is ' 'ok with you, but we thought you should know. This means ' 'that the metagenomics workflow will simply ignore these ' 'groups.') else: if self.references_mode: # if the user didn't provide a group column in the samples.txt, # in references mode the default is 'all_against_all'. run.warning("No groups were provided in your samples_txt,\ hence 'all_against_all' mode has been automatically\ set to True.") self.set_config_param('all_against_all', True) else: # if no groups were specified then each sample would be assembled # separately run.warning("No groups were specified in your samples_txt. This is fine. " "But we thought you should know. Any assembly will be performed " "on individual samples (i.e. NO co-assembly).") self.samples_information['group'] = self.samples_information['sample'] self.group_names = list(self.sample_names) self.group_sizes = dict.fromkeys(self.group_names,1) if self.get_param_value_from_config('all_against_all'): # in all_against_all, the size of each group is as big as the number # of samples. self.group_sizes = dict.fromkeys(self.group_names,len(self.sample_names)) if not self.references_mode and not (self.get_param_value_from_config(['anvi_script_reformat_fasta','run']) == True): # in assembly mode (i.e. not in references mode) we always have # to run reformat_fasta. The only reason for this is that # the megahit output is temporary, and if we dont run # reformat_fasta we will delete the output of meghit at the # end of the workflow without saving a copy. raise ConfigError("You can't skip reformat_fasta in assembly mode " "please change your config.json file")
def __init__(self, args): self.args = args A = lambda x: args.__dict__[x] if x in args.__dict__ else None self.input_file_path = A('input_file') self.contigs_db_path = A('contigs_db') self.serialized_profile_path = A('serialized_profile') self.output_directory = A('output_dir') self.list_contigs_and_exit = A('list_contigs') self.min_contig_length = A('min_contig_length') self.min_mean_coverage = A('min_mean_coverage') self.min_coverage_for_variability = A('min_coverage_for_variability') self.contigs_shall_be_clustered = A('cluster_contigs') self.sample_id = A('sample_name') self.report_variability_full = A('report_variability_full') self.overwrite_output_destinations = A('overwrite_output_destinations') self.skip_SNV_profiling = A('skip_SNV_profiling') self.profile_AA_frequencies = A('profile_AA_frequencies') self.gen_serialized_profile = A('gen_serialized_profile') self.distance = A('distance') or constants.distance_metric_default self.linkage = A('linkage') or constants.linkage_method_default self.num_threads = int(A('num_threads')) self.queue_size = int(A('queue_size')) self.write_buffer_size = int(A('write_buffer_size')) self.total_length_of_all_contigs = 0 self.total_coverage_values_for_all_contigs = 0 self.description_file_path = A('description') # make sure early on that both the distance and linkage is OK. clustering.is_distance_and_linkage_compatible(self.distance, self.linkage) # whehther the profile database is a blank (without any BAM files or reads): self.blank = A('blank_profile') if self.blank: self.contigs_shall_be_clustered = True if args.contigs_of_interest: filesnpaths.is_file_exists(args.contigs_of_interest) self.contig_names_of_interest = set([c.strip() for c in open(args.contigs_of_interest).readlines()\ if c.strip() and not c.startswith('#')]) else: self.contig_names_of_interest = None self.progress = terminal.Progress() self.run = terminal.Run(width=35) if self.list_contigs_and_exit: self.list_contigs() sys.exit() if not self.contigs_db_path: raise ConfigError("No contigs database, no profilin'. Bye.") # Initialize contigs db dbops.ContigsSuperclass.__init__(self, self.args, r=self.run, p=self.progress) self.init_contig_sequences() self.contig_names_in_contigs_db = set(self.contigs_basic_info.keys()) self.bam = None self.contigs = [] self.database_paths = { 'CONTIGS.db': os.path.abspath(self.contigs_db_path) } self.profile_db_path = None self.clustering_configs = constants.clustering_configs[ 'blank' if self.blank else 'single'] # following variable will be populated during the profiling, and its content will eventually # be stored in t.variable_nts_table_name self.variable_nts_table_entries = [] # following variable will be populated while the variable positions table is computed self.codons_in_genes_to_profile_AA_frequencies = set([]) # we don't know what we are about self.description = None
def init(self): super().init() # loading the samples.txt file self.samples_txt_file = self.get_param_value_from_config( ['samples_txt']) filesnpaths.is_file_exists(self.samples_txt_file) try: # getting the samples information (names, [group], path to r1, path to r2) from samples.txt self.samples_information = pd.read_csv(self.samples_txt_file, sep='\t', index_col=False) except IndexError as e: raise ConfigError( "Looks like your samples_txt file, '%s', is not properly formatted. \ This is what we know: '%s'" % (self.samples_txt_file, e)) if 'sample' not in list(self.samples_information.columns): raise ConfigError( "Looks like your samples_txt file, '%s', is not properly formatted. \ We are not sure what's wrong, but we can't find a column with title 'sample'." % self.samples_txt_file) # get a list of the sample names self.sample_names = list(self.samples_information['sample']) self.run_metaspades = self.get_param_value_from_config( ['metaspades', 'run']) self.use_scaffold_from_metaspades = self.get_param_value_from_config( ['metaspades', 'use_scaffolds']) self.use_scaffold_from_idba_ud = self.get_param_value_from_config( ['idba_ud', 'use_scaffolds']) self.run_qc = self.get_param_value_from_config( ['iu_filter_quality_minoche', 'run']) == True self.run_summary = self.get_param_value_from_config( ['anvi_summarize', 'run']) == True self.run_split = self.get_param_value_from_config( ['anvi_split', 'run']) == True self.references_mode = self.get_param_value_from_config( 'references_mode') self.fasta_txt_file = self.get_param_value_from_config('fasta_txt') self.profile_databases = {} self.references_for_removal_txt = self.get_param_value_from_config(['remove_short_reads_based_on_references',\ 'references_for_removal_txt']) if self.references_for_removal_txt: self.load_references_for_removal() self.collections_txt = self.get_param_value_from_config( 'collections_txt') if self.collections_txt: self.load_collections() elif self.run_summary: raise ConfigError( 'If you want to run anvi-summarize you must provide a collections_txt file' ) elif self.run_split: raise ConfigError( 'If you want to run anvi-split you must provide a collections_txt file' ) self.init_samples_txt() self.init_kraken() self.init_refereces_txt() # Set the PROFILE databases paths variable: for group in self.group_names: # we need to use the single profile if the group is of size 1. self.profile_databases[group] = os.path.join(self.dirs_dict["MERGE_DIR"], group, "PROFILE.db") if self.group_sizes[group] > 1 else \ os.path.join(self.dirs_dict["PROFILE_DIR"], group, self.samples_information.loc[self.samples_information['group']==group,'sample'].values[0], "PROFILE.db")
def init(self):
    """This function is called from within the snakefile to initialize parameters."""
    super().init()

    self.run_iu_merge_pairs = self.get_param_value_from_config(['iu_merge_pairs', 'run'])
    self.gzip_iu_merge_pairs_output = self.get_param_value_from_config(['iu_merge_pairs', '--gzip-output'])
    self.run_anvi_reformat_fasta = self.get_param_value_from_config(['anvi_reformat_fasta', 'run'])
    self.gzip_anvi_reformat_fasta_output = self.get_param_value_from_config(['anvi_reformat_fasta', '--gzip-output'])
    self.run_anvi_trnaseq = self.get_param_value_from_config(['anvi_trnaseq', 'run'])
    self.run_anvi_convert_trnaseq_database = self.get_param_value_from_config(['anvi_convert_trnaseq_database', 'run'])
    self.run_anvi_run_trna_taxonomy = self.get_param_value_from_config(['anvi_run_trna_taxonomy', 'run'])

    # Load table of sample info from samples_txt (sample names, treatments, paths to r1 and r2,
    # r1 and r2 prefixes).
    self.samples_txt_file = self.get_param_value_from_config(['samples_txt'])
    filesnpaths.is_file_exists(self.samples_txt_file)
    try:
        # An error will subsequently be raised in `check_samples_txt` if there is no header.
        self.sample_info = pd.read_csv(self.samples_txt_file, sep='\t', index_col=False)
    except IndexError as e:
        raise ConfigError("The samples_txt file, '%s', does not appear to be properly formatted. "
                          "This is the error from trying to load it: '%s'" % (self.samples_txt_file, e))

    self.check_samples_txt()

    self.sample_names = self.sample_info['sample'].tolist()

    if 'treatment' in self.sample_info.columns:
        # The treatment is specified for each sample in samples_txt.
        self.treatments = self.sample_info['treatment'].tolist()
    else:
        # The treatment is the same for each sample and is set in the config file.
        self.treatments = [self.get_param_value_from_config(['anvi_trnaseq', 'treatment'])] * len(self.sample_names)

    if self.run_iu_merge_pairs:
        self.r1_paths = self.sample_info['r1'].tolist()
        self.r2_paths = self.sample_info['r2'].tolist()
        self.r1_prefixes = self.get_r1_prefixes()
        self.r2_prefixes = self.get_r2_prefixes()
        self.fasta_paths = None
    else:
        self.r1_paths = None
        self.r2_paths = None
        self.r1_prefixes = None
        self.r2_prefixes = None
        self.fasta_paths = self.sample_info['fasta'].tolist()

    self.target_files = self.get_target_files()
def init(self): super().init() # loading the samples.txt file self.samples_txt_file = self.get_param_value_from_config( ['samples_txt']) filesnpaths.is_file_exists(self.samples_txt_file) try: # getting the samples information (names, [group], path to r1, path to r2) from samples.txt self.samples_information = pd.read_csv(self.samples_txt_file, sep='\t', index_col=False) except IndexError as e: raise ConfigError( "Looks like your samples_txt file, '%s', is not properly formatted. \ This is what we know: '%s'" % (self.samples_txt_file, e)) if 'sample' not in list(self.samples_information.columns): raise ConfigError( "Looks like your samples_txt file, '%s', is not properly formatted. \ We are not sure what's wrong, but we can't find a column with title 'sample'." % self.samples_txt_file) # get a list of the sample names self.sample_names = list(self.samples_information['sample']) self.run_metaspades = self.get_param_value_from_config( ['metaspades', 'run']) self.use_scaffold_from_metaspades = self.get_param_value_from_config( ['metaspades', 'use_scaffolds']) self.run_qc = self.get_param_value_from_config( ['iu_filter_quality_minoche', 'run']) == True self.run_summary = self.get_param_value_from_config( ['anvi_summarize', 'run']) == True self.run_split = self.get_param_value_from_config( ['anvi_split', 'run']) == True self.references_mode = self.get_param_value_from_config( 'references_mode', repress_default=True) self.fasta_txt_file = self.get_param_value_from_config( 'fasta_txt', repress_default=True) self.references_for_removal_txt = self.get_param_value_from_config(['remove_short_reads_based_on_references',\ 'references_for_removal_txt'],\ repress_default=True) if self.references_for_removal_txt: self.load_references_for_removal() self.collections_txt = self.get_param_value_from_config( 'collections_txt') if self.collections_txt: self.load_collections() elif self.run_summary: raise ConfigError( 'If you want to run anvi-summarize you must provide a collections_txt file' ) elif self.run_split: raise ConfigError( 'If you want to run anvi-split you must provide a collections_txt file' ) self.init_samples_txt() self.init_kraken() self.init_refereces_txt() self.init_target_files()
def __init__(self, args, run=run, progress=progress): self.run = run self.progress = progress A = lambda x: args.__dict__[x] if x in args.__dict__ else None self.gene_coverages_data_file_path = A('data_file') self.gene_detections_data_file_path = A('gene_detection_data_file') self.profile_db_path = A('profile_db') self.output_file_prefix = A('output_file_prefix') self.alpha = A('alpha') self.beta = A('beta') # self.gamma = A('gamma') # FIXME: beta, gamma, eta, and zeta are not self.eta = A('eta') # used anywhere in the code anyore :) self.zeta = A('zeta') # self.additional_layers_to_append = A('additional_layers_to_append') self.samples_information_to_append = A('samples_information_to_append') self.collection_name = A('collection_name') self.bin_id = A('bin_id') self.bin_ids_file_path = A('bin_ids_file') self.exclude_samples = A('exclude_samples') self.include_samples = A('include_samples') self.profile_db = {} self.coverage_values_per_nt = {} self.gene_coverages = pd.DataFrame.empty self.gene_detections = pd.DataFrame.empty self.samples = {} self.positive_samples = [] self.number_of_positive_samples = None self.negative_samples = pd.DataFrame.empty self.number_of_negative_samples = None self.gene_class_information = pd.DataFrame.empty self.samples_information = pd.DataFrame.empty self.gene_presence_absence_in_samples = pd.DataFrame.empty self.gene_coverages_filtered = pd.DataFrame.empty self.additional_description = '' self.total_length = None if self.exclude_samples: # check that there is a file like this filesnpaths.is_file_exists(self.exclude_samples) self.samples_to_exclude = set([l.split('\t')[0].strip() for l in open(self.exclude_samples, 'rU').readlines()]) if not self.samples_to_exclude: raise ConfigError("You asked to exclude samples, but provided an empty list.") run.info('Excluding Samples', 'The following samples will be excluded: %s' % self.samples_to_exclude,) else: self.samples_to_exclude = set([]) if self.include_samples: # check that there is a file like this filesnpaths.is_file_exists(self.include_samples) self.samples_to_include = set([l.split('\t')[0].strip() for l in open(self.include_samples, 'rU').readlines()]) if not self.samples_to_include: raise ConfigError("You provided an empty list of samples to include.") run.info('Including Samples', 'The following samples will be included: %s' % self.samples_to_include,) else: self.samples_to_include = set([]) # run sanity check on all input arguments self.sanity_check() if self.profile_db_path is None: # TODO: this will probably be removed because we don't save the coverage information in nucleotide level. pass else: # load sample list and gene_coverage_dict from the merged profile db args.init_gene_coverages = True if self.collection_name: self.summary = summarizer.ProfileSummarizer(args) self.summary.init() self.init_samples(self.summary.p_meta['samples']) else: self.profile_db = ProfileSuperclass(args) self.init_samples(self.profile_db.p_meta['samples']) self.profile_db.init_split_coverage_values_per_nt_dict() self.profile_db.init_gene_level_coverage_stats_dicts() self.coverage_values_per_nt = get_coverage_values_per_nucleotide(self.profile_db.split_coverage_values_per_nt_dict, self.samples) # comply with the new design and get gene_coverages and gene_detection dicsts from # gene_level_coverage_stats_dict. 
            gene_coverages, gene_detection = self.get_gene_coverages_and_gene_detection_dicts()

            self.init_coverage_and_detection_dataframes(gene_coverages, gene_detection)

            # getting the total length of all contigs
            self.total_length = self.profile_db.p_meta['total_length']
def check_MODELLER(executable=None): """Test if MODELLER is going to work. Exists outside of the class MODELLER so it does not have to be checked everytime. Checks the executable exists, that a license exists, and can produce the expected output of a modeller executable. Returns ======= output : executable, str Returns the executable that you _should_ use, which is not necessarily what is input """ executable = executable if executable else up_to_date_modeller_exec scripts_folder = J(os.path.dirname(anvio.__file__), 'data/misc/MODELLER/scripts') if utils.filesnpaths.is_dir_empty(scripts_folder): raise ConfigError( "Anvi'o houses all its MODELLER scripts in %s, but your directory " "contains no scripts. Why you did dat?" % scripts_folder) try: utils.is_program_exists(executable) except ConfigError as e: *prefix, sub_version = up_to_date_modeller_exec.split('.') prefix, sub_version = ''.join(prefix), int(sub_version) for alternate_version in reversed( range(sub_version - 10, sub_version + 10)): alternate_program = prefix + '.' + str(alternate_version) if utils.is_program_exists(alternate_program, dont_raise=True): executable = alternate_program break else: raise ConfigError( "Anvi'o needs a MODELLER program to be installed on your system. You didn't specify one " "(which can be done with `--modeller-executable`), so anvi'o tried the most recent version " "it knows about: '%s'. If you are certain you have it on your system (for instance you can run it " "by typing '%s' in your terminal window), you may want to send a detailed bug report. If you " "don't have it on your system, check out these installation instructions on our website: " "http://merenlab.org/2016/06/18/installing-third-party-software/#modeller" % (executable, executable)) temp_dir = filesnpaths.get_temp_directory_path() shutil.copy2(J(scripts_folder, 'fasta_to_pir.py'), temp_dir) test_script = J(temp_dir, 'fasta_to_pir.py') test_input = J(os.path.dirname(anvio.__file__), 'tests/sandbox/mock_data_for_structure/proteins.fa') test_output = J(temp_dir, 'test_out') command = [executable, test_script, test_input, test_output] # try and execute the command process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) output, error = process.communicate() if process.returncode: # modeller has failed error = error.decode('utf-8').strip() is_licence_key_error = True if error.find( 'Invalid license key') > -1 else False if is_licence_key_error: # its a valid modeller program with no license key license_target_file = error.split('\n')[-1] raise ConfigError( "You're making progress and anvi'o is proud of you! You just need to validate your MODELLER " "with a license key (it's free). Please go to https://salilab.org/modeller/registration.html " "to register for a new license. After you receive an e-mail with your key, please open '%s' " "and replace the characters XXXXX with your own key. Save the file and try again. " % license_target_file) else: error = "\n" + "\n".join(error.split('\n')) print(terminal.c(error, color='red')) raise ConfigError( "The executable you requested is called `%s`, but anvi'o doesn't agree with you that " "it is a working MODELLER program. That was determined by running the command `%s`, which raised the " "error seen above. If you want to specify a specific MODELLER program, you can specify it with " "`--modeller-executable`." % (executable, " ".join(command))) # no error was raised. 
    # now check if output file exists
    try:
        filesnpaths.is_file_exists(test_output)
    except FilesNPathsError:
        raise ConfigError("The executable you requested is called `%s`, but anvi'o doesn't agree with you that "
                          "it is a working MODELLER program. That was determined by running the command `%s`, which did not "
                          "output the file expected. If you want to specify a specific MODELLER program, you can specify it with "
                          "`--modeller-executable`." % (executable, " ".join(command)))

    return executable
def __init__(self, args): # we will fill this in and return it self.split_names_of_interest = set([]) self.bins = None A = lambda x: args.__dict__[x] if x in args.__dict__ else None self.bin_ids_file_path = A('bin_ids_file') self.bin_ids_list = A('bin_ids_list') self.bin_id = A('bin_id') self.collection_name = A('collection_name') self.contigs_db_path = A('contigs_db') self.profile_db_path = A('profile_db') self.debug = A('debug') if not self.profile_db_path: raise ConfigError( "You didn't provide a profile database path. When you clearly should have :/\ This is GetSplitNamesInBins speaking. Has her eyes on you." ) if self.bin_ids_file_path and self.bin_id: raise ConfigError( 'Either use a file to list all the bin ids (-B), or declare a single bin (-b)\ you would like to focus. Not both :/') if not self.collection_name: raise ConfigError( 'This will not work without a collection ID for your bins :/') if self.bin_ids_file_path: filesnpaths.is_file_exists(self.bin_ids_file_path) self.bins = set( [b.strip() for b in open(self.bin_ids_file_path).readlines()]) elif self.bin_id: self.bins = set([self.bin_id]) self.collections = Collections() self.collections.populate_collections_dict(self.profile_db_path) if self.collection_name not in self.collections.collections_dict: raise ConfigError('The collection id "%s" does not seem to be in the profile database. These are the\ collections that are available through this profile database: "%s".'\ % (self.collection_name, ', '.join(self.collections.collections_dict))) self.collection_dict = self.collections.get_collection_dict( self.collection_name) bins_in_collection = list(self.collection_dict.keys()) if not self.bins: self.bins = bins_in_collection else: bins_that_does_not_exist_in_collection = [ b for b in self.bins if b not in bins_in_collection ] if len(bins_that_does_not_exist_in_collection): raise ConfigError('Some of the bins you requested does not appear to have been described in the collection\ "%s". Here is a list of bins that are missing: "%s"'\ % (self.collection_name, ', '.join(bins_that_does_not_exist_in_collection))) if not len(self.bins): raise ConfigError('There is no bin to work with :/')
def check_params(self): # check the project name: if not self.project_name: raise ConfigError( "Please set a project name, and be prepared to see it around as (1) anvi'o will use\ that name to set the output directory and to name various output files such as the\ databases that will be generated at the end of the process. If you set your own output\ directory name, you can have multiple projects in it and all of those projects can use\ the same intermediate files whenever possible." ) utils.is_this_name_OK_for_database('pan project name', self.project_name, stringent=False) # if the user did not set a specific output directory name, use the project name # for it: self.output_dir = self.output_dir if self.output_dir else self.project_name # deal with the output directory: try: filesnpaths.is_file_exists(self.output_dir) except FilesNPathsError: filesnpaths.gen_output_directory( self.output_dir, delete_if_exists=self.overwrite_output_destinations) filesnpaths.is_output_dir_writable(self.output_dir) self.output_dir = os.path.abspath(self.output_dir) if not self.log_file_path: self.log_file_path = self.get_output_file_path('log.txt') filesnpaths.is_output_file_writable(self.log_file_path) os.remove(self.log_file_path) if os.path.exists( self.log_file_path) else None if not isinstance(self.minbit, float): raise ConfigError("minbit value must be of type float :(") if self.minbit < 0 or self.minbit > 1: raise ConfigError( "Well. minbit must be between 0 and 1. Yes. Very boring.") if not isinstance(self.min_percent_identity, float): raise ConfigError( "Minimum percent identity value must be of type float :(") if self.min_percent_identity < 0 or self.min_percent_identity > 100: raise ConfigError( "Minimum percent identity must be between 0%% and 100%%. Although your %.2f%% is\ pretty cute, too." % self.min_percent_identity) if len( [c for c in list(self.genomes.values()) if 'genome_hash' not in c]): raise ConfigError( "self.genomes does not seem to be a properly formatted dictionary for\ the anvi'o class Pangenome.") if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering: raise ConfigError( "You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering\ while also asking it to enforce it.") if self.description_file_path: filesnpaths.is_file_plain_text(self.description_file_path) self.description = open( os.path.abspath(self.description_file_path), 'rU').read() if not self.skip_alignments: self.aligner = aligners.select(self.align_with) self.pan_db_path = self.get_output_file_path(self.project_name + '-PAN.db')
def check_samples_txt(self): if self.run_iu_merge_pairs: proper_header = ['sample', 'split', 'r1', 'r2'] else: proper_header = ['sample', 'split', 'fasta'] missing_columns = [] for column_title in proper_header: if column_title not in self.sample_info.columns: missing_columns.append(column_title) if missing_columns: raise ConfigError( "The samples_txt file, '%s', is not properly formatted, " "as the following columns are missing: '%s'." % (self.samples_txt_file, ', '.join(missing_columns))) for sample_name in self.sample_info['sample']: try: u.check_sample_id(sample_name) except ConfigError as e: raise ConfigError( "While processing the samples_txt file, '%s', " "Anvi'o ran into the following error: %s" % (self.samples_txt_file, e)) unknown_split_types = [] for split_type in self.sample_info['split']: if split_type not in TRNASeqWorkflow.known_split_types: unknown_split_types.append(split_type) if unknown_split_types: run.warning( "Some of the names of split types in the samples_txt file, '%s', " "are not what we were expecting (%s). " "That's okay, but Anvi'o decided it should warn you. " "Here are the names of split types that are not in our little list: %s. " % (self.samples_txt_file, ', '.join( TRNASeqWorkflow.known_split_types), ', '.join( sorted(set(unknown_split_types))))) if self.run_iu_merge_pairs: fastq_paths = self.sample_info['r1'].tolist( ) + self.sample_info['r2'].tolist() bad_fastq_paths = [ s for s in fastq_paths if not filesnpaths.is_file_exists(s, dont_raise=True) ] if bad_fastq_paths: raise ConfigError( "The following FASTQ files in the samples_txt file, '%s', cannot be found: %s." % (self.samples_txt_file, ', '.join(bad_fastq_paths))) bad_fastq_names = [ s for s in fastq_paths if (not s.endswith('.fq') and not s.endswith('.fq.gz') and not s.endswith('.fastq') and not s.endswith('.fastq.gz')) ] if bad_fastq_names: run.warning( "Some of the sequence files in the samples_txt file, '%s', " "do not end with '.fq', '.fq.gz', '.fastq' or '.fastq.gz'. " "That's okay, but Anvi'o decided it should warn you. " "Here are the first 5 such files that have unconventional file extensions: %s." % (self.samples_txt_file, ', '.join(bad_fastq_names[:5]))) else: fasta_paths = self.sample_info['fasta'].tolist() bad_fasta_paths = [ s for s in fasta_paths if not filesnpaths.is_file_exists(s, dont_raise=True) ] if bad_fasta_paths: raise ConfigError( "The following FASTA files in the samples_txt file, '%s', cannot be found: %s." % (self.samples_txt_file, ', '.join(bad_fasta_paths))) bad_fasta_names = [ s for s in fasta_paths if (not s.endswith('.fa') and not s.endswith('.fa.gz') and not s.endswith('.fasta') and not s.endswith('.fasta.gz')) ] if bad_fasta_names: run.warning( "Some of the FASTA files in the samples_txt file, '%s', " "do not end with '.fa', '.fa.gz', '.fasta' or '.fasta.gz'. " "That's okay, but Anvi'o decided it should warn you. " "Here are the first 5 such files that have unconventional file extensions: %s." % (self.samples_txt_file, ', '.join(bad_fasta_names[:5])))
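# check_samples_txt() expects a tab-delimited file whose header depends on whether
# paired-end merging is requested. A hedged sketch of a minimal samples_txt and the
# column check, assuming pandas is available (the method's use of `.columns` and
# `.tolist()` suggests self.sample_info is a DataFrame); the split type value below is
# only a placeholder:
import io
import pandas as pd

samples_txt = io.StringIO(
    "sample\tsplit\tr1\tr2\n"
    "S01\tuntreated\tS01_R1.fastq.gz\tS01_R2.fastq.gz\n"
)
sample_info = pd.read_csv(samples_txt, sep='\t')

proper_header = ['sample', 'split', 'r1', 'r2']
missing_columns = [c for c in proper_header if c not in sample_info.columns]
assert not missing_columns  # mirrors the first check in check_samples_txt()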
def __init__(self, file_path, unique_hash, create_new=False, open_in_append_mode=False, ignore_hash=False, run=run, progress=progress, quiet=False): self.run = run self.progress = progress self.file_path = file_path if open_in_append_mode and not create_new: raise HDF5Error( "The 'open_in_append_mode' flag can only be used along with the flag 'create_new'." ) if create_new: if ignore_hash: raise HDF5Error( "When creating (or appending to) a database, you can't use the 'ignore_hash'\ flag.") if not unique_hash: raise HDF5Error( "When creating (or appending to) a database, the 'unique_hash' cannot be None." ) self.fp = h5py.File(self.file_path, 'a' if open_in_append_mode else 'w') self.fp.attrs['hash'] = unique_hash self.fp.attrs['version'] = self.version else: filesnpaths.is_file_exists(self.file_path) self.fp = h5py.File(self.file_path, 'r') G = lambda x: self.fp.attrs[x].decode('utf-8') if isinstance( self.fp.attrs[x], bytes) else self.fp.attrs[x] fp_version = G('version') fp_hash = G('hash') if fp_version != self.version: raise HDF5Error( "The data file for %s ('%s') is at version '%s', however, your client is at\ version '%s'. This is bad news, because your version of anvi'o can't work with\ this file. You can regenerate the data file using the current version of anvi'o,\ or look around to see whether there is an upgrade script is available (a good start\ would be to type 'anvi-script-upgrade-' and then click TAB key twice). Otherwise you\ may want to consider sending an e-mail to the anvi'o developers to find out what's up.\ We heard that they love them some e-mails." % (self.db_type, self.file_path, self.fp.attrs['version'], self.version)) if not ignore_hash and fp_hash != unique_hash: raise HDF5Error( "The database at '%s' does not seem to be compatible with the client :/\ (i.e., the hash values do not match)." % self.file_path) self.unique_hash = fp_hash
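# The HDF5 wrapper above stamps a hash and a version into the file's attributes on
# creation and verifies both when reopening. A compact h5py sketch of that
# stamp-and-verify pattern (constants and function names are illustrative):
import h5py

CLIENT_VERSION = '1'

def create_store(path, unique_hash):
    fp = h5py.File(path, 'w')
    fp.attrs['hash'] = unique_hash
    fp.attrs['version'] = CLIENT_VERSION
    return fp

def open_store(path, unique_hash):
    fp = h5py.File(path, 'r')
    G = lambda x: fp.attrs[x].decode('utf-8') if isinstance(fp.attrs[x], bytes) else fp.attrs[x]
    if G('version') != CLIENT_VERSION:
        raise RuntimeError("file is at version '%s', client expects '%s'" % (G('version'), CLIENT_VERSION))
    if G('hash') != unique_hash:
        raise RuntimeError("hash mismatch: this file does not belong to the current project")
    return fp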
run = terminal.Run() progress = terminal.Progress() parser = argparse.ArgumentParser( description='A simple script to generate info from search tables') parser.add_argument(*anvio.A('profile-db'), **anvio.K('profile-db', {'required': False})) parser.add_argument(*anvio.A('contigs-db'), **anvio.K('contigs-db', {'required': False})) parser.add_argument(*anvio.A('output-file'), **anvio.K('output-file', {'default': "COLLECTIONS.txt"})) args = parser.parse_args() filesnpaths.is_output_file_writable(args.output_file) contigs = set([]) contig_lengths = {} db = dbops.ContigsDatabase(args.contigs_db, quiet=False) contigs_info_table = db.db.get_table_as_dict(t.contigs_info_table_name) contig_lengths = dict([(c, contigs_info_table[c]['length']) for c in contigs_info_table]) db.disconnect() db = dbops.ProfileDatabase(args.profile_db, quiet=False) collections_splits_table = db.db.get_table_as_dict( t.collections_splits_table_name) collections_info_table = db.db.get_table_as_dict(t.collections_info_table_name) db.disconnect()
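# The script above pulls the collections tables out of the profile database as plain
# dictionaries. A hedged sketch of an obvious next step, grouping split names by bin for
# one collection; the row keys ('collection_name', 'bin_name', 'split') are assumptions
# about the table layout and may need adjusting:
from collections import defaultdict

def splits_per_bin(collections_splits_table, collection_name):
    bins = defaultdict(set)
    for entry in collections_splits_table.values():
        if entry['collection_name'] == collection_name:
            bins[entry['bin_name']].add(entry['split'])
    return bins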
def __init__(self, args, run=terminal.Run(), progress=terminal.Progress(), skip_sanity_check=False): """Parses arguments and run sanity_check""" self.args = args self.run = run self.progress = progress # Parse arguments A = lambda x: args.__dict__[x] if x in args.__dict__ else None self.annotation_source = A('annotation_source') self.window_range = A('ngram_window_range') or "2:3" self.is_in_unknowns_mode = A('analyze_unknown_functions') self.output_file = A('output_file') self.skip_init_functions = A('skip_init_functions') self.genome_names_to_focus = A('genome_names') self.ngram_source = A("ngram_source") self.annotation_source_dict = {} self.pan_db_path = A('pan_db') if self.annotation_source and self.pan_db_path: self.annotation_sources = [self.annotation_source, 'gene_clusters'] if self.pan_db_path: self.pan_db = PanDatabase(self.pan_db_path) self.p_meta = self.pan_db.meta self.p_meta['creation_date'] = utils.get_time_to_date( self.p_meta['creation_date'] ) if 'creation_date' in self.p_meta else 'unknown' self.p_meta['genome_names'] = sorted([ s.strip() for s in self.p_meta['external_genome_names'].split(',') + self.p_meta['internal_genome_names'].split(',') if s ]) self.p_meta['num_genomes'] = len(self.p_meta['genome_names']) self.genome_names = self.p_meta['genome_names'] self.gene_clusters_gene_alignments_available = self.p_meta[ 'gene_alignments_computed'] else: self.pan_db = None self.genomes_storage_path = A('genomes_storage') # confirm genome-storage and pangenome hashes match of pangenome is provided if self.pan_db: self.genomes_storage = genomestorage.GenomeStorage( self.genomes_storage_path, self.p_meta['genomes_storage_hash'], genome_names_to_focus=self.p_meta['genome_names'], skip_init_functions=self.skip_init_functions, run=self.run, progress=self.progress) else: self.genomes_storage = genomestorage.GenomeStorage( self.genomes_storage_path, skip_init_functions=self.skip_init_functions, run=self.run, progress=self.progress) # list-annotation-resources self.list_annotation_sources = A('list_annotation_sources') self.gene_function_source_set = self.genomes_storage.db.get_table_as_dataframe( 'gene_function_calls').source.unique() if self.list_annotation_sources: self.run.info('Available functional annotation sources', ', '.join(self.gene_function_source_set)) sys.exit() # This houses the ngrams' data self.ngram_attributes_list = [] # Focus on specfic set of genomes if self.genome_names_to_focus: if filesnpaths.is_file_exists(self.genome_names_to_focus, dont_raise=True): self.genome_names_to_focus = utils.get_column_data_from_TAB_delim_file( self.genome_names_to_focus, column_indices=[0], expected_number_of_fields=1)[0] else: self.genome_names_to_focus = [ g.strip() for g in self.genome_names_to_focus.split(',') ] self.run.warning( "A subset of genome names is found, and anvi'o will focus only on to those." 
) self.genomes_storage = genomestorage.GenomeStorage( self.genomes_storage_path, storage_hash=None, genome_names_to_focus=self.genome_names_to_focus) self.genomes = self.genomes_storage.get_genomes_dict() self.external_genome_names = [ g for g in self.genomes if self.genomes[g]['external_genome'] ] self.internal_genome_names = [ g for g in self.genomes if not self.genomes[g]['external_genome'] ] self.hash_to_genome_name = {} for genome_name in self.genomes: self.hash_to_genome_name[self.genomes[genome_name] ['genome_hash']] = genome_name # number of genomes in genome-storage if self.genome_names_to_focus: self.num_contigs_in_external_genomes_with_genes = len( self.genome_names_to_focus) else: self.num_contigs_in_external_genomes_with_genes = len( self.genomes_storage.get_all_genome_names()) if not skip_sanity_check: self.sanity_check() # unless we are in debug mode, let's keep things quiet. if anvio.DEBUG: self.run_object = terminal.Run() else: self.run_object = terminal.Run(verbose=False)
def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()): self.run = run self.progress = progress A = lambda x: (args.__dict__[x] if x in args.__dict__ else None) if args else None if self.mode == 'train': self.genomes_dir = os.path.abspath(A('genomes_dir')) self.classifier_output_path = os.path.abspath(A('output')) if A('classifier'): raise ConfigError("You should not initialize the domain training class with a input classifier path (`args.classifier`).") if not self.genomes_dir: raise ConfigError("You must provide a genomes directory. Please read the help menu if you are not sure\ how the contents of this directory should look like.") filesnpaths.is_output_file_writable(self.classifier_output_path) filesnpaths.is_file_exists(self.genomes_dir) elif self.mode == 'predict': if A('output'): raise ConfigError("You should not initialize the domain prediction class with an output classifier path (`args.output`).") default_classifier_path = 'misc/SCGDOMAINCLASSIFIER.rf' self.input_classifier_path = A('classifier') or os.path.join(os.path.dirname(anvio.data.__file__), default_classifier_path) if A('classifier'): filesnpaths.is_file_exists(self.input_classifier_path) else: if not filesnpaths.is_file_exists(self.input_classifier_path, dont_raise=True): raise ConfigError("Somehow, this anvi'o installation dose not seem to have a SCG domain classifier. This is one of\ those anvi'o things that should never happen. If you are an anvi'o user, please feel free to panic :(\ If you are an anvi'o developer, what you need to do is to follow the instructions in \ `anvi-script-gen-scg-domain-classifier` with a reasonable set of genomes and store the resulting\ classifier at the default anvi'o path of /blah/blah/anvio/data/%s." % (default_classifier_path)) self.rf = RF(self.input_classifier_path, r=self.run, p=self.progress) self.rf.initialize_classifier() else: raise ConfigError("Someone initialized the SCG domain classifier class without an explicit mode :(") self.SCG_sources = [d for d in hmm_data.sources if hmm_data.sources[d]['kind'] == 'singlecopy'] self.SCG_domains = sorted([hmm_data.sources[source]['domain'] for source in self.SCG_sources]) self.SCG_domain_to_source = dict([(hmm_data.sources[source]['domain'], source) for source in self.SCG_sources]) if not len(self.SCG_sources): raise ConfigError("There is something wrong :( There is not even a single SCG source found. Usually\ anvi'o comes with multiple of them :/") if len(self.SCG_sources) == 1: raise ConfigError("There is only a single SCG source in your anvi'o installation. It is OK if you are\ being a hacker and playing with things, but there is no logic behind creating a\ classifier with a single class.") if len(self.SCG_domains) != len(set(self.SCG_domains)): raise ConfigError("Something is wrong. For each domain, there must be a single sinlge-copy core gene\ source.") self.data, self.labels, self.features = [], [], [] for domain in self.SCG_domains: self.features.extend(sorted(hmm_data.sources[self.SCG_domain_to_source[domain]]['genes'])) self.run.info('SCG domain classifier mode', self.mode) self.run.info("SCG domains found", ', '.join(self.SCG_domains)) self.run.info("Num features", len(self.features))
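# In 'predict' mode the class above falls back to a classifier shipped inside the
# anvio.data package when the user does not provide one. A stdlib sketch of that
# "user override, else bundled resource" lookup (the helper name is illustrative, and
# anvio.data is assumed to be importable as in the snippet above):
import os

def resolve_classifier_path(user_path=None):
    if user_path:
        if not os.path.exists(user_path):
            raise FileNotFoundError(user_path)
        return user_path

    import anvio.data
    default_path = os.path.join(os.path.dirname(anvio.data.__file__), 'misc/SCGDOMAINCLASSIFIER.rf')
    if not os.path.exists(default_path):
        raise FileNotFoundError("no bundled SCG domain classifier at %s" % default_path)
    return default_path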
def __init__(self, args, run=run, progress=progress): self.run = run self.progress = progress A = lambda x: args.__dict__[x] if x in args.__dict__ else None self.output_file_prefix = A('output_file_prefix') self.alpha = A('alpha') self.collection_name = A('collection_name') self.bin_id = A('bin_id') self.bin_ids_file_path = A('bin_ids_file') self.exclude_samples = A('exclude_samples') self.include_samples = A('include_samples') self.outliers_threshold = A('outliers_threshold') self.zeros_are_outliers = A('zeros_are_outliers') self.gen_figures = A('gen_figures') self.overwrite_output_destinations = A('overwrite_output_destinations') self.split_coverage_values_per_nt_dict = None self.gene_level_coverage_stats_dict = None self.gene_level_coverage_stats_dict_of_dataframes = None self.profile_db = {} self.coverage_values_per_nt = None self.gene_coverages = {} self.gene_detections = None self.samples = None self.positive_samples = [] self.number_of_positive_samples = None self.negative_samples = {} self.number_of_negative_samples = None self.gene_class_df = {} self.samples_detection_information = {} self.gene_presence_absence_in_samples_initiated = False self.gene_presence_absence_in_samples = None self.additional_description = '' self.total_length = None self.samples_coverage_stats_dicts_was_initiated = False self.samples_coverage_stats_dicts = {} self.non_outlier_indices = {} self.gene_coverage_consistency_dict = {} self.gene_coverage_consistency_dict_initiated = False self.samples_to_exclude = set([]) self.samples_to_include = set([]) self.write_output_to_files = None if self.exclude_samples: # check that there is a file like this filesnpaths.is_file_exists(self.exclude_samples) self.samples_to_exclude = set([l.split('\t')[0].strip() for l in open(self.exclude_samples, 'rU').readlines()]) if not self.samples_to_exclude: raise ConfigError("You asked to exclude samples, but provided an empty list.") run.info('Excluding Samples', 'The following samples will be excluded: %s' % self.samples_to_exclude,) if self.include_samples: # check that there is a file like this filesnpaths.is_file_exists(self.include_samples) self.samples_to_include = set([l.split('\t')[0].strip() for l in open(self.include_samples, 'rU').readlines()]) if not self.samples_to_include: raise ConfigError("You provided an empty list of samples to include.") run.info('Including Samples', 'The following samples will be included: %s' % self.samples_to_include,) # run sanity check on all input arguments self.sanity_check()
def sanity_check(self): A = lambda x, t: t(args.__dict__[x] ) if x in self.args.__dict__ else None null = lambda x: x # the directory files will be dumped into (can exist but must be empty) if filesnpaths.is_file_exists(self.directory, dont_raise=True): filesnpaths.is_output_dir_writable(self.directory) if not filesnpaths.is_dir_empty(self.directory): raise ModellerError( "You cannot give MODELLER a non-empty directory to work in." ) else: filesnpaths.gen_output_directory(self.directory) # All MODELLER scripts are housed in self.script_folder self.scripts_folder = J(os.path.dirname(anvio.__file__), 'data/misc/MODELLER/scripts') if utils.filesnpaths.is_dir_empty(self.scripts_folder): raise ConfigError( "Anvi'o houses all its MODELLER scripts in {}, but your directory \ contains no scripts. Why you do dat?") # check that MODELLER exists if self.args.__dict__[ 'modeller_executable'] if 'modeller_executable' in self.args.__dict__ else None: self.run.info_single( "As per your request, anvi'o will use `%s` to run MODELLER." % self.executable, nl_before=1) utils.is_program_exists(self.executable) else: try: utils.is_program_exists(self.executable) except ConfigError as e: raise ConfigError( "Anvi'o needs a MODELLER program to be installed on your system. You didn't specify one\ (which can be done with `--modeller-executable`), so anvi'o tried the most recent version\ it knows about: '%s'. If you are certain you have it on your system (for instance you can run it\ by typing '%s' in your terminal window), you may want to send a detailed bug report. If you\ don't have it on your system, check out these installation instructions on our website:\ http://merenlab.org/2016/06/18/installing-third-party-software/#modeller" % (self.executable, self.executable)) self.run.info_single( "Anvi'o found the default executable for MODELLER, `%s`, and will\ use it." % self.executable, nl_before=1) self.is_executable_a_MODELLER_program() # does target_fasta_path point to a fasta file? utils.filesnpaths.is_file_fasta_formatted(self.target_fasta_path) # make sure target_fasta is valid target_fasta = u.SequenceSource(self.target_fasta_path, lazy_init=False) if target_fasta.total_seq != 1: raise ConfigError( "MODELLER::The input FASTA file must have exactly one sequence.\ You provided one with {}.".format( target_fasta.total_seq)) # (not sanity check but we get self.corresponding_gene_call since target_fasta is opened) while next(target_fasta): self.corresponding_gene_call = target_fasta.id target_fasta.close() # parameter consistencies if self.deviation < 0.5 or self.deviation > 20: self.run.warning( "You realize that deviation is given in angstroms, right? You chose {}" .format(self.deviation)) if self.very_fast and self.num_models > 1: self.run.warning( "Since you chose --very-fast, there will be little difference, if at all, between models. You \ can potentially save a lot of time by setting --num-models to 1." ) if self.percent_identical_cutoff <= 20: self.run.warning( "Two completely unrelated sequences of same length can expect to have around 10% proper \ percent identicalness... Having this parameter below 20% is probably a bad idea." )
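# sanity_check() above insists on an empty working directory and creates one if it is
# missing. A small stdlib-only sketch of that policy (the function name is made up):
import os

def prepare_empty_workdir(directory):
    """Create `directory` if absent; refuse to reuse a non-empty one."""
    if os.path.exists(directory):
        if not os.path.isdir(directory):
            raise NotADirectoryError(directory)
        if os.listdir(directory):
            raise RuntimeError("refusing to work in a non-empty directory: %s" % directory)
    else:
        os.makedirs(directory)
    return directory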
def load_collections(self): ''' Load the collections_txt file, run some sanity checks, and figure out params for anvi_import_collection''' collections = u.get_TAB_delimited_file_as_dictionary( self.collections_txt) bad_groups = [g for g in collections if g not in self.group_names] if bad_groups: raise ConfigError('Some of the names in your collection_txt \ file ("%s") don\'t match the names of the \ groups in your samples_txt/fasta_txt. \ Here are the names that don\'t match: %s. \ And here are the group names we expect to find: \ %s' % (self.collections_txt, ', '.join(bad_groups), ', '.join(self.group_names))) for group in collections: default_collection = collections[group].get('default_collection') if default_collection: # User can specify either a default collection OR collection from file not_allowed_params = { 'collection_name', 'collection_file', 'bins_info', 'contigs_mode' } if any([ collections[group][key] for key in not_allowed_params if key in collections[group].keys() ]): raise ConfigError( 'We encountered the following problem with your \ collections_txt file ("%s"): you can choose \ either using a default collection OR importing \ a collection from a file. Yet, for "%s", you specificy \ a default collection AND also specify some of the following \ parameters: %s.' % (self.collections_txt, group, ", ".join(not_allowed_params))) collections[group]['collection_name'] = 'DEFAULT' collections[group]['contigs_mode'] = '' else: if not filesnpaths.is_file_exists( collections[group]['collection_file'], dont_raise=True): raise ConfigError( 'We encountered the following problem with your \ collections_txt file ("%s"): you did not specify \ a valid collection file for "%s".' % (self.collections_txt, group)) if not collections[group]['collection_name']: raise ConfigError( 'You must specify a name for each collection in your collections_txt' ) u.check_collection_name(collections[group]['collection_name']) if collections[group].get('bins_info'): filesnpaths.is_file_exists(collections[group]['bins_info']) collections[group][ 'bins_info'] = '--bins-info %s' % collections[group][ 'bins_info'] else: collections[group]['bins_info'] = '' if collections[group].get('contigs_mode'): collections[group]['contigs_mode'] = '--contigs-mode' else: collections[group]['contigs_mode'] = '' self.collections = collections
def load_from_user_files(self, args): if self.contigs_db_path: raise ConfigError("When you want to use the interactive interface in an ad hoc manner, you must\ not use a contigs database.") if not self.profile_db_path: raise ConfigError("Even when you want to use the interactive interface in an ad hoc manner by\ using the '--manual-mode' flag, you still need to declare a profile database.\ The profile database in this mode is only used to read or store the 'state' of\ the display for visualization purposes. You DO NOT need to point to an already\ existing database, as anvi'o will generate an empty one for you if there is no\ profile database.") if not self.tree: raise ConfigError("When you are running the interactive interface in manual mode, you must declare\ at least the tree file. Please see the documentation for help.") if self.view: raise ConfigError("You can't use the '--view' parameter when you are running the interactive interface\ in manual mode.") if self.show_views: raise ConfigError("Sorry, there are no views to show in manual mode :/") if self.show_states: raise ConfigError("Sorry, there are no states to show in manual mode :/") filesnpaths.is_file_exists(self.tree) tree = filesnpaths.is_proper_newick(self.tree) view_data_path = os.path.abspath( self.view_data_path) if self.view_data_path else None self.p_meta['splits_fasta'] = os.path.abspath( self.fasta_file) if self.fasta_file else None self.p_meta['output_dir'] = None self.p_meta['views'] = {} self.p_meta['merged'] = True self.p_meta['default_view'] = 'single' self.p_meta['default_clustering'] = 'default' self.p_meta['available_clusterings'] = ['default'] self.p_meta['clusterings'] = { 'default': { 'newick': open(os.path.abspath(self.tree)).read() } } self.default_view = self.p_meta['default_view'] if self.view_data_path: # sanity of the view data filesnpaths.is_file_tab_delimited(view_data_path) view_data_columns = utils.get_columns_of_TAB_delim_file( view_data_path, include_first_column=True) if not view_data_columns[0] == "contig": raise ConfigError("The first row of the first column of the view data file must\ say 'contig', which is not the case for your view data file\ ('%s'). Please make sure this is a properly formatted view data\ file." % (view_data_path)) # load view data as the default view: self.views[self.default_view] = { 'header': view_data_columns[1:], 'dict': utils.get_TAB_delimited_file_as_dictionary(view_data_path) } else: # no view data is provided... it is only the tree we have. we will create a mock 'view data dict' # here using what is in the tree. 
names_in_the_tree = [n.name for n in tree.get_leaves()] ad_hoc_dict = {} for item in names_in_the_tree: ad_hoc_dict[item] = {'names': item} self.views[self.default_view] = { 'header': ['names'], 'dict': ad_hoc_dict } self.split_names_ordered = list(self.views[self.default_view]['dict'].keys()) # we assume that the sample names are the header of the view data, so we might as well set it up: self.p_meta['samples'] = self.views[self.default_view]['header'] # if we have an input FASTA file, we will set up the split_sequences and splits_basic_info dicts, # otherwise we will leave them empty self.splits_basic_info = {} self.split_sequences = None if self.p_meta['splits_fasta']: filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta']) self.split_sequences = utils.get_FASTA_file_as_dictionary( self.p_meta['splits_fasta']) names_missing_in_FASTA = set(self.split_names_ordered) - set( self.split_sequences.keys()) num_names_missing_in_FASTA = len(names_missing_in_FASTA) if num_names_missing_in_FASTA: raise ConfigError('Some of the names in your view data do not have corresponding entries in the\ FASTA file you provided. Here is an example of one of those %d names that occur\ in your data file, but not in the FASTA file: "%s"' % ( num_names_missing_in_FASTA, names_missing_in_FASTA.pop())) # setup a mock splits_basic_info dict for split_id in self.split_names_ordered: self.splits_basic_info[split_id] = { 'length': len(self.split_sequences[split_id]), 'gc_content': utils.get_GC_content_for_sequence( self.split_sequences[split_id]) } # create a new, empty profile database for ad hoc operations if not os.path.exists(self.profile_db_path): profile_db = ProfileDatabase(self.profile_db_path) profile_db.create({ 'db_type': 'profile', 'merged': True, 'contigs_db_hash': None, 'samples': ','.join(self.p_meta['samples']) }) # create an instance of states table self.states_table = TablesForStates(self.profile_db_path, anvio.__profile__version__) # also populate collections, if there are any self.collections.populate_collections_dict(self.profile_db_path, anvio.__profile__version__) if self.title: self.title = self.title
def check_input_paths(self): """Check FASTQ file paths if running Illumina-utils for read merging, or FASTA file paths if considering merged or unpaired reads. Allow both absolute and relative paths in samples_txt.""" if self.run_iu_merge_pairs: fastq_paths = self.sample_info['r1'].tolist( ) + self.sample_info['r2'].tolist() bad_fastq_paths = [] for fastq_path in fastq_paths: if os.path.isabs(fastq_path): if not filesnpaths.is_file_exists(fastq_path, dont_raise=True): bad_fastq_paths.append(fastq_path) else: if not filesnpaths.is_file_exists(os.path.join( os.getcwd(), fastq_path), dont_raise=True): bad_fastq_paths.append(fastq_path) if bad_fastq_paths: raise ConfigError( "The following FASTQ files in the samples_txt file, '%s', cannot be found: %s." % (self.samples_txt_file, ', '.join(bad_fastq_paths))) bad_fastq_names = [ s for s in fastq_paths if (not s.endswith('.fq') and not s.endswith('.fq.gz') and not s.endswith('.fastq') and not s.endswith('.fastq.gz')) ] if bad_fastq_names: run.warning( "Some of the sequence files in the samples_txt file, '%s', " "do not end with '.fq', '.fq.gz', '.fastq' or '.fastq.gz'. " "That's okay, but anvi'o decided it should warn you. " "Here are the first 5 such files that have unconventional file extensions: %s." % (self.samples_txt_file, ', '.join(bad_fastq_names[:5]))) else: fasta_paths = self.sample_info['fasta'].tolist() bad_fasta_paths = [] for fasta_path in fasta_paths: if os.path.isabs(fasta_path): if not filesnpaths.is_file_exists(fasta_path, dont_raise=True): bad_fasta_paths.append(fasta_path) else: if not filesnpaths.is_file_exists(os.path.join( os.getcwd(), fasta_path), dont_raise=True): bad_fasta_paths.append(fasta_path) if bad_fasta_paths: raise ConfigError( "The following FASTA files in the samples_txt file, '%s', cannot be found: %s." % (self.samples_txt_file, ', '.join(bad_fasta_paths))) bad_fasta_names = [ s for s in fasta_paths if (not s.endswith('.fa') and not s.endswith('.fa.gz') and not s.endswith('.fasta') and not s.endswith('.fasta.gz')) ] if bad_fasta_names: run.warning( "Some of the FASTA files in the samples_txt file, '%s', " "do not end with '.fa', '.fa.gz', '.fasta' or '.fasta.gz'. " "That's okay, but anvi'o decided it should warn you. " "Here are the first 5 such files that have unconventional file extensions: %s." % (self.samples_txt_file, ', '.join(bad_fasta_names[:5])))
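# Both path-checking methods above resolve relative paths against the current working
# directory and warn about unconventional extensions. A compact stdlib sketch of those
# two checks (names are illustrative):
import os

FASTA_SUFFIXES = ('.fa', '.fa.gz', '.fasta', '.fasta.gz')

def find_missing_paths(paths):
    return [p for p in paths
            if not os.path.exists(p if os.path.isabs(p) else os.path.join(os.getcwd(), p))]

def find_odd_extensions(paths, suffixes=FASTA_SUFFIXES):
    return [p for p in paths if not p.endswith(suffixes)]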
def run_hmmer(self, source, alphabet, context, kind, domain, num_genes_in_model, hmm, ref, noise_cutoff_terms, desired_output='table', hmmer_output_dir=None): """Run the program Parameters ========== source : str A name for your HMM effort. alphabet : str Which alphabet are you using? Choose from {'AA', 'DNA', 'RNA'} context : str This will determine how your output is processed. FIXME Documentation is lacking. Choose from {'GENE', 'CONTIG', 'DOMAIN'}. kind : str Used for user stdout info. Don't by afraid to pass None domain : str Used for user stdout info. Don't by afraid to pass None num_genes_in_model : int Used for user stdout info. Don't by afraid to pass None hmm : str Path to the input .hmm file ref : int Used for user stdout info. Don't by afraid to pass None noise_cutoff_terms : str Filter out hits with built-in flags. e.g. '--cut_ga' desired_output : str OR list, 'table' HMMER programs have a couple of outputs. For the standard output (specified by the hmmer program flag `-o`), pass 'standard'. For the regular tabular output (specified by the hmmer program flag `--tblout`), pass 'table'. For the domain tabular output (specified by the hmmer program flag `--domtblout`), pass 'domtable'. If you want to use multiple, pass a tuple like ('standard', 'table') hmmer_output_dir : str The path at which to store the HMMER output files, if desired. After all HMMER workers are done and their partial output files have been combined into one (for each type), those combined output files will be moved to this location. """ target = ':'.join([alphabet, context]) if target not in self.target_files_dict: raise ConfigError( "You have an unknown target :/ Target, which defines an alphabet and context " "to clarify whether the HMM search is supposed to be done using alphabets DNA, " "RNA, or AA sequences, and contexts of GENEs or CONTIGs. Yours is %s, and it " "doesn't work for anvi'o." % target) if not self.target_files_dict[target]: raise ConfigError( "HMMer class does not know about Sequences file for the target %s :/" % target) if isinstance(desired_output, str): desired_output = (desired_output, ) for output in desired_output: if output not in ['standard', 'table', 'domtable']: raise ConfigError( "HMMer.run_hmmer :: Unknown desired_output, '%s'" % output) if hmmer_output_dir: if not os.path.exists(hmmer_output_dir): filesnpaths.gen_output_directory(hmmer_output_dir) else: filesnpaths.is_output_dir_writable(hmmer_output_dir) for output in desired_output: file_path = os.path.join(hmmer_output_dir, f"hmm.{output}") if filesnpaths.is_file_exists(file_path, dont_raise=True): raise ConfigError( f"The file {file_path} already exists, and anvi'o does not like to " "to overwrite things. Please either remove the file or rename your " "desired output.") self.run.warning('', header='HMM Profiling for %s' % source, lc='green') self.run.info('Reference', ref if ref else 'unknown') self.run.info('Kind', kind if kind else 'unknown') self.run.info('Alphabet', alphabet) self.run.info('Context', context) self.run.info('Domain', domain if domain else 'N/A') self.run.info('HMM model path', hmm) self.run.info('Number of genes in HMM model', num_genes_in_model or 'unknown') self.run.info('Noise cutoff term(s)', noise_cutoff_terms) self.run.info('Number of CPUs will be used for search', self.num_threads_to_use) if alphabet in ['DNA', 'RNA']: self.run.info('HMMer program used for search', 'nhmmscan') if 'domtable' in desired_output: raise ConfigError( "Oh, dear. 
Someone (probably a programmer) has requested domain table output from " f"the run_hmmer() function when the alphabet is {alphabet}. Sadly, this will not " "work because that alphabet requires the use of `nhmmscan`, which does not have " "the --domtblout parameter.") else: self.run.info('HMMer program used for search', self.program_to_use) tmp_dir = os.path.dirname(self.target_files_dict[target][0]) self.run.info('Temporary work dir', tmp_dir) # check if all hmmpress files are in the HMM directory self.verify_hmmpress_output(hmm) workers = [] manager = multiprocessing.Manager( ) # this dude holds the shared objects that will be modified by workers ret_value_queue = manager.Queue(maxsize=self.num_threads_to_use) output_queue = manager.Queue() # Holds buffer and write lock for each output merged_files_dict = {} for output in desired_output: merged_files_dict[output] = { 'buffer': io.StringIO(), 'lock': manager.Lock() } num_parts = len(self.target_files_dict[target]) cores_per_process = 1 if num_parts < self.num_threads_to_use: cores_per_process = self.num_threads_to_use // num_parts self.run.warning( f"You requested {P('core', self.num_threads_to_use)} but there were only {P('sequence', num_parts)} " f"in the FASTA file for the target '{target}'. Anvi'o will use {P('process', num_parts, sfp='es')} " f"with {P('core', cores_per_process)} instead. And that's that." ) self.num_threads_to_use = num_parts if alphabet in ['DNA', 'RNA'] and self.program_to_use == 'hmmsearch': self.run.warning( "You requested to use the program `%s`, but because you are working with %s sequences Anvi'o will use `nhmmscan` instead. " "We hope that is alright." % (self.program_to_use, alphabet)) thread_num = 0 for partial_input_file in self.target_files_dict[target]: log_file = partial_input_file + '_log' output_file = partial_input_file + '_output' table_file = partial_input_file + '_table' if 'domtable' in desired_output: domtable_file = partial_input_file + '_domtable' else: domtable_file = None self.run.info('Log file for thread %s' % thread_num, log_file) thread_num += 1 if noise_cutoff_terms: if 'domtable' in desired_output: cmd_line = [ 'nhmmscan' if alphabet in ['DNA', 'RNA'] else self.program_to_use, '-o', output_file, *noise_cutoff_terms.split(), '--cpu', cores_per_process, '--tblout', table_file, '--domtblout', domtable_file, hmm, partial_input_file ] else: cmd_line = [ 'nhmmscan' if alphabet in ['DNA', 'RNA'] else self.program_to_use, '-o', output_file, *noise_cutoff_terms.split(), '--cpu', cores_per_process, '--tblout', table_file, hmm, partial_input_file ] else: # if we didn't pass any noise cutoff terms, here we don't include them in the command line if 'domtable' in desired_output: cmd_line = [ 'nhmmscan' if alphabet in ['DNA', 'RNA'] else self.program_to_use, '-o', output_file, '--cpu', cores_per_process, '--tblout', table_file, '--domtblout', domtable_file, hmm, partial_input_file ] else: cmd_line = [ 'nhmmscan' if alphabet in ['DNA', 'RNA'] else self.program_to_use, '-o', output_file, '--cpu', cores_per_process, '--tblout', table_file, hmm, partial_input_file ] t = multiprocessing.Process( target=self.hmmer_worker, args=(partial_input_file, cmd_line, table_file, output_file, desired_output, log_file, output_queue, ret_value_queue, domtable_file)) t.start() workers.append(t) self.progress.new('Processing') self.progress.update( f'Running {self.program_to_use} in {P("thread", self.num_threads_to_use)}...' 
) finished_workers = 0 while finished_workers < self.num_threads_to_use: try: ret_value = ret_value_queue.get() if isinstance(ret_value, Exception): # If thread returns an exception, we raise it and kill the main thread. raise ret_value finished_workers += 1 if ret_value == 0: if anvio.DEBUG: self.run.info_single( f"{finished_workers} out of {self.num_threads_to_use} have finished" ) else: raise ConfigError( "An HMMER worker thread came back with an unexpected return value of {ret_value}. " "Something is probably wrong, so you should contact a developer for help." ) # if worker finished successfully we can take its individual output file(s) and append them to the main file(s) output_dict = output_queue.get() for file_type, file in output_dict.items(): main_file_buffer = merged_files_dict[file_type]['buffer'] main_file_lock = merged_files_dict[file_type]['lock'] worker_file = file if file_type == 'table': append_function = self.append_to_main_table_file elif file_type == 'standard': append_function = self.append_to_main_standard_file elif file_type == 'domtable': append_function = self.append_to_main_table_file append_function(main_file_buffer, worker_file, main_file_lock) except KeyboardInterrupt: self.run.info_single( "HMMER driver received SIGINT, terminating all threads...", nl_before=2) break except Exception as worker_error: # An exception was thrown in one of the threads so we kill all of them self.progress.end() self.run.warning( "An exception was thrown in one of the worker threads (see output below for details)." ) for worker in workers: worker.terminate() raise worker_error for worker in workers: worker.terminate() output_file_paths = [] for output in desired_output: if hmmer_output_dir: output_file_path = os.path.join(hmmer_output_dir, f"hmm.{output}") else: output_file_path = os.path.join(tmp_dir, f"hmm.{output}") with open(output_file_path, 'w') as out: merged_files_dict[output]['buffer'].seek(0) out.write(merged_files_dict[output]['buffer'].read()) if output == 'table' or output == 'domtable': num_raw_hits = filesnpaths.get_num_lines_in_file( output_file_path) self.run.info(f'Number of raw hits in {output} file', num_raw_hits, progress=self.progress) output_file_path = output_file_path if num_raw_hits else None output_file_paths.append(output_file_path) self.progress.end() # Return output path as string if desired_output is len 1. Else return tuple of output paths output = output_file_paths[0] if len( output_file_paths) == 1 else tuple(output_file_paths) return output
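# run_hmmer() fans the input out to several worker processes, each of which reports a
# return value and its partial output through managed queues, and the parent merges the
# partial outputs into per-type StringIO buffers. A much-reduced sketch of that
# fan-out/merge skeleton (the worker body is a stand-in, not the real HMMER call):
import io
import multiprocessing

def _worker(part, output_queue, ret_value_queue):
    output_queue.put({'table': "result for %s\n" % part})  # stand-in payload
    ret_value_queue.put(0)

def fan_out_and_merge(parts):
    manager = multiprocessing.Manager()
    output_queue, ret_value_queue = manager.Queue(), manager.Queue()
    merged = {'table': io.StringIO()}

    workers = [multiprocessing.Process(target=_worker, args=(p, output_queue, ret_value_queue))
               for p in parts]
    for w in workers:
        w.start()

    finished = 0
    while finished < len(parts):
        if ret_value_queue.get() != 0:
            raise RuntimeError("a worker reported failure")
        finished += 1
        for file_type, text in output_queue.get().items():
            merged[file_type].write(text)

    for w in workers:
        w.join()

    merged['table'].seek(0)
    return merged['table'].read()

if __name__ == '__main__':
    print(fan_out_and_merge(['part_1', 'part_2']))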
def __init__(self, config_file_path, input_directory=None, db_paths={}, row_ids_of_interest=[], r=run, p=progress): self.run = r self.progress = p self.input_directory = input_directory or os.path.abspath(os.getcwd()) self.config_file_path = config_file_path # `row_ids_of_interest` gives opportunity to filter out irrelevant entries quickly # while vectors are being obtained from each matrix described in the config file. # to see why it is important in the context of anvi'o, see # https://github.com/meren/anvio/issues/100 self.row_ids_of_interest = set(row_ids_of_interest) # these are the database files that may be referenced from within the config files # with !DATABASE.db::table notation. If a database entry has an exclamation mark, # it will be searched for in the db_paths dict to associate it with the relative # path that is only known to the client self.db_paths = db_paths # read the config filesnpaths.is_file_exists(self.config_file_path) config = configparser.ConfigParser() config.read(self.config_file_path) # this will keep the actual paths for each matrix: self.matrix_paths = {} self.set_default_paths(config) self.check_for_db_requests(config) # and sanity check. self.sanity_check(config) if self.get_option(config, 'general', 'output_file', str): self.output_file_name = self.get_option(config, 'general', 'output_file', str) self.output_file_path = os.path.join(self.input_directory, self.output_file_name) else: self.output_file_name = None self.output_file_path = None self.name = self.get_option( config, 'general', 'name', str) or filesnpaths.get_name_from_file_path(self.config_file_path) self.distance = self.get_option(config, 'general', 'distance', str) self.linkage = self.get_option(config, 'general', 'linkage', str) self.num_components = self.get_option(config, 'general', 'num_components', int) self.seed = self.get_option(config, 'general', 'seed', int) self.master = None self.matrices_dict = {} self.matrices = [] for section in self.get_other_sections(config): alias, matrix = section.split() self.matrices.append(alias) m = {} columns_to_use = self.get_option(config, section, 'columns_to_use', str) table_form = self.get_option(config, section, 'table_form', str) m['alias'] = alias m['matrix'] = matrix m['table_form'] = table_form m['columns_to_use'] = [ c.strip() for c in columns_to_use.split(',') ] if columns_to_use else None m['ratio'] = self.get_option(config, section, 'ratio', int) m['path'] = self.matrix_paths[alias] m['normalize'] = False if self.get_option( config, section, 'normalize', str) == 'False' else True m['log'] = True if self.get_option(config, section, 'log', str) == 'True' else False # next two variables are necessary to follow the order of vectors m['id_to_sample'], m['sample_to_id'], m['cols'], m[ 'vectors'] = get_vectors(m['path'], m['columns_to_use'], self.row_ids_of_interest) self.matrices_dict[alias] = m # make sure all matrices have identical rows: if len( set([ list(m['id_to_sample'].values()).__str__() for m in list(self.matrices_dict.values()) ])) > 1: master_rows, master_matrix = sorted([(len(self.matrices_dict[m]['id_to_sample']), list(self.matrices_dict[m]['id_to_sample'].values()), m)\ for m in self.matrices_dict])[0][1:] self.master = master_matrix self.master_rows = master_rows # the smallest matrix is 'master_matrix', and the rows it has is master_rows. so every other matrix # must match that, or we will throw a tantrum. 
for matrix in [m for m in self.matrices if m != master_matrix]: m = self.matrices_dict[matrix] # get reduced set of vectors from rows that match `master_rows`: m['id_to_sample'], m['sample_to_id'], m['cols'], m[ 'vectors'] = get_vectors(m['path'], m['columns_to_use'], master_rows) if len(m['vectors']) != len(master_rows): raise ConfigError( 'The content of rows differed between input matrices. So I tried to ' 'match all other matrices to the matrix with the smallest number of ' 'rows (which was "%s"). However, not all other matrices contained ' 'the small set of rows.' % (master_matrix)) else: self.master_rows = sorted( self.matrices_dict[self.matrices[0]]['sample_to_id'].keys()) self.num_matrices = len(self.matrices) self.multiple_matrices = self.num_matrices > 1
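# The loop above reduces every matrix to the row set of the smallest one ("master rows")
# and complains if a matrix cannot be matched. A toy sketch of that row-alignment idea
# with plain dictionaries (all names invented):
def align_to_master_rows(matrices):
    """`matrices` maps an alias to a dict of row_name -> vector."""
    master_alias = min(matrices, key=lambda a: len(matrices[a]))
    master_rows = sorted(matrices[master_alias])

    aligned = {}
    for alias, rows in matrices.items():
        missing = [r for r in master_rows if r not in rows]
        if missing:
            raise ValueError("matrix '%s' lacks rows found in '%s': %s"
                             % (alias, master_alias, ', '.join(missing)))
        aligned[alias] = {r: rows[r] for r in master_rows}
    return aligned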
def __init__(self, args, r=terminal.Run(width=35), p=terminal.Progress()): self.args = args self.progress = p self.run = r A = lambda x: args.__dict__[x] if x in args.__dict__ else None self.input_file_path = A('input_file') self.contigs_db_path = A('contigs_db') self.serialized_profile_path = A('serialized_profile') self.output_directory = A('output_dir') self.list_contigs_and_exit = A('list_contigs') self.min_contig_length = A('min_contig_length') or 0 self.max_contig_length = A('max_contig_length') or sys.maxsize self.min_mean_coverage = A('min_mean_coverage') self.min_coverage_for_variability = A('min_coverage_for_variability') self.contigs_shall_be_clustered = A('cluster_contigs') self.skip_hierarchical_clustering = A('skip_hierarchical_clustering') self.sample_id = A('sample_name') self.report_variability_full = A('report_variability_full') self.overwrite_output_destinations = A('overwrite_output_destinations') self.skip_SNV_profiling = A('skip_SNV_profiling') self.profile_SCVs = A('profile_SCVs') self.ignore_orphans = A('ignore_orphans') self.max_coverage_depth = A('max_coverage_depth') or 8000 self.gen_serialized_profile = A('gen_serialized_profile') self.distance = A('distance') or constants.distance_metric_default self.linkage = A('linkage') or constants.linkage_method_default self.num_threads = int(A('num_threads') or 1) self.queue_size = int( A('queue_size') if A('queue_size') is not None else 0) self.write_buffer_size = int( A('write_buffer_size') if A('write_buffer_size' ) is not None else 500) self.total_length_of_all_contigs = 0 self.total_coverage_values_for_all_contigs = 0 self.description_file_path = A('description') # make sure early on that both the distance and linkage is OK. clustering.is_distance_and_linkage_compatible(self.distance, self.linkage) # whehther the profile database is a blank (without any BAM files or reads): self.blank = A('blank_profile') if not self.blank and self.contigs_shall_be_clustered and self.skip_hierarchical_clustering: raise ConfigError( "You are confused, and confusing anvi'o, too. You can't as hierarchical clustering\ to be performed with one flag, and try to skip it with another one :(" ) if self.blank and self.contigs_shall_be_clustered and self.skip_hierarchical_clustering: raise ConfigError( "So you want to generate a blank profile, and you both want hierarchical clustering\ of your contigs to be performed, and skipped. No." ) if self.blank and self.contigs_shall_be_clustered: raise ConfigError( "When the blank profile is asked to be generated, there is no need to ask for the\ hierarchical clustering of contigs. It is going to be done by default. If it is\ not changing anything, why is anvi'o upset with you? Because. Let's don't use flags\ we don't need.") if self.max_coverage_depth >= auxiliarydataops.COVERAGE_MAX_VALUE: raise ConfigError("The value %s for the maximum coverage depth is not going to work :/ While the maximum\ depth of coverage for anvi'o to care about is a soft cut-off (hence you have some level\ of freedom through the parameter `--max-coverage-depth`), there are database limitations\ anvi'o must consider and can not change. The maximum value allowed in the database for\ coverage information is 65536. Hence, you should set your depth of coverage to something \ that is less than this value. In addition, it is also recommended to leave a little gap\ and don't go beyond 90%% of this hard limit (that's why anvi'o will keep telling you,\ \"%s is nice, but %s is the best I can do\" when you try to exceed that)." 
\ % (pp(self.max_coverage_depth), pp(self.max_coverage_depth), pp(auxiliarydataops.COVERAGE_MAX_VALUE))) if self.blank and not self.skip_hierarchical_clustering: self.contigs_shall_be_clustered = True if A('contigs_of_interest'): filesnpaths.is_file_exists(args.contigs_of_interest) self.contig_names_of_interest = set([c.strip() for c in open(args.contigs_of_interest).readlines()\ if c.strip() and not c.startswith('#')]) else: self.contig_names_of_interest = None if self.list_contigs_and_exit: self.list_contigs() sys.exit() if not self.contigs_db_path: raise ConfigError("No contigs database, no profilin'. Bye.") # Initialize contigs db dbops.ContigsSuperclass.__init__(self, self.args, r=self.run, p=self.progress) self.init_contig_sequences() self.contig_names_in_contigs_db = set(self.contigs_basic_info.keys()) self.bam = None self.contigs = [] self.database_paths = { 'CONTIGS.db': os.path.abspath(self.contigs_db_path) } self.profile_db_path = None self.clustering_configs = constants.clustering_configs[ 'blank' if self.blank else 'single'] # following variable will be populated during the profiling, and its content will eventually # be stored in t.variable_nts_table_name self.variable_nts_table_entries = [] # if genes are not called, yet the user is asking for codon frequencies to be profiled, we give # a warning and force-turn that flag off. if (not self.a_meta['genes_are_called']) and self.profile_SCVs: self.run.warning( "You asked the codon frequencies to be profiled, but genes were not called\ for your contigs database. Anvi'o is assigning `False` to the profile-codon-frequncies\ flag, overruling your request like a boss.") self.profile_SCVs = False # following variable will be populated while the variable positions table is computed self.codons_in_genes_to_profile_SCVs = set([]) # we don't know what we are about self.description = None # additional layer data will be filled later self.layer_additional_keys = [] self.layer_additional_data = {}
def __init__(self, db_path, client_version, new_database=False, ignore_version=False, skip_rowid_prepend=False, run=terminal.Run(), progress=terminal.Progress()): self.db_path = db_path self.version = None self.run = run self.progress = progress # these anonymous functions report whether the ROWID will be added # to its rows read from the database or not. if the first column of a given # table does not contain unique variables, anvi'o prepends the ROWID of each # column to index 0, unless `skip_rowid_prepend` is True self.ROWID_PREPENDS_ROW_DATA = lambda table_name: False if skip_rowid_prepend else tables.requires_unique_entry_id[ table_name] self.PROPER_SELECT_STATEMENT = lambda table_name: 'ROWID as "entry_id", *' if self.ROWID_PREPENDS_ROW_DATA( table_name) else '*' if new_database: filesnpaths.is_output_file_writable(db_path) else: filesnpaths.is_file_exists(db_path) if new_database and os.path.exists(self.db_path): os.remove(self.db_path) self.check_if_db_writable() try: self.conn = sqlite3.connect(self.db_path) except Exception as e: raise ConfigError( f"This one time someone was not happy with '{self.db_path}' and '{e}', they said." ) self.conn.text_factory = str self.cursor = self.conn.cursor() self.table_names_in_db = self.get_table_names() if new_database: self.create_self() self.set_version(client_version) else: self.version = self.get_version() if str(self.version) != str(client_version) and not ignore_version: if int(self.version) > int(client_version): progress.reset() raise ConfigError( "Bad news of the day: the database at %s was generated with an anvi'o version that is 'newer' than " "the one you are actively using right now. We know, you hate to hear this, but you need to upgrade " "your anvi'o :(" % self.db_path) else: progress.reset() raise ConfigError( f"The database at '{self.db_path}' is outdated (this database is v{self.version} and your anvi'o installation " f"wants to work with v{client_version}). You can migrate your database without losing any data using the " f"program `anvi-migrate` with either of the flags `--migrate-dbs-safely` or `--migrate-dbs-quickly`." ) bad_tables = [ table_name for table_name in self.table_names_in_db if table_name not in tables.requires_unique_entry_id ] if len(bad_tables): raise ConfigError( "You better be a programmer tinkering with anvi'o databases adding new tables or something. Otherwise we " "have quite a serious problem :/ Each table in a given anvi'o database must have an entry in the " "anvio/tables/__init__.py dictionary `requires_unique_entry_id` to explicitly define whether anvi'o " "should add a unique entry id for its contents upon retrieval as a dictionary. The following tables " "in this database do not satisfy that: '%s'. You can solve this problem by adding an entry into that " "dictionary." % (', '.join(bad_tables)))
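# The database wrapper above compares a version stamped into the file with the client's
# version and refuses to proceed on a mismatch. A stripped-down sqlite3 sketch of that
# stamp-and-compare idea (the 'self' table layout here is invented for illustration and
# is not anvi'o's actual schema):
import os
import sqlite3

CLIENT_DB_VERSION = '2'

def create_db(path):
    conn = sqlite3.connect(path)
    conn.execute("CREATE TABLE self (key TEXT, value TEXT)")
    conn.execute("INSERT INTO self VALUES ('version', ?)", (CLIENT_DB_VERSION,))
    conn.commit()
    return conn

def open_db(path):
    if not os.path.exists(path):
        raise FileNotFoundError(path)
    conn = sqlite3.connect(path)
    row = conn.execute("SELECT value FROM self WHERE key = 'version'").fetchone()
    if row is None or row[0] != CLIENT_DB_VERSION:
        raise RuntimeError("database is at v%s but this client expects v%s"
                           % (row[0] if row else '?', CLIENT_DB_VERSION))
    return conn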
def __init__(self, args, run=run, progress=progress): self.run = run self.progress = progress A = lambda x: args.__dict__[x] if x in args.__dict__ else None self.gene_coverages_data_file_path = A('data_file') self.gene_detections_data_file_path = A('gene_detection_data_file') self.profile_db_path = A('profile_db') self.output_file_prefix = A('output_file_prefix') self.alpha = A('alpha') self.beta = A('beta') self.gamma = A('gamma') self.eta = A('eta') self.zeta = A('zeta') self.additional_layers_to_append = A('additional_layers_to_append') self.samples_information_to_append = A('samples_information_to_append') self.collection_name = A('collection_name') self.bin_id = A('bin_id') self.bin_ids_file_path = A('bin_ids_file') self.store_gene_detections_and_gene_coverages_tables = A( 'store_gene_detections_and_gene_coverages_tables') self.exclude_samples = A('exclude_samples') self.gene_coverages = pd.DataFrame.empty self.gene_detections = pd.DataFrame.empty self.samples = {} self.positive_samples = pd.DataFrame.empty self.number_of_positive_samples = None self.negative_samples = pd.DataFrame.empty self.number_of_negative_samples = None self.gene_class_information = pd.DataFrame.empty self.samples_information = pd.DataFrame.empty self.profile_db = {} self.gene_presence_absence_in_samples = pd.DataFrame.empty self.gene_coverages_filtered = pd.DataFrame.empty # check that there is a file like this if self.exclude_samples: filesnpaths.is_file_exists(self.exclude_samples) self.samples_to_exclude = set([ l.split('\t')[0].strip() for l in open(args.exclude_samples, 'rU').readlines() ]) run.info( 'Excluding Samples', 'The following samples will be excluded: %s' % self.samples_to_exclude, ) else: self.samples_to_exclude = set([]) self.sanity_check() if self.profile_db_path is None: self.get_data_from_txt_file() else: # load sample list and gene_coverage_dict from the merged profile db args.init_gene_coverages = True if self.collection_name: self.summary = summarizer.ProfileSummarizer(args) self.summary.init() else: self.profile_db = ProfileSuperclass(args) self.profile_db.init_gene_coverages_and_detection_dicts() self.gene_coverages = pd.DataFrame.from_dict( self.profile_db.gene_coverages_dict, orient='index', dtype=float) self.gene_coverages.drop(self.samples_to_exclude, axis=1, inplace=True) self.Ng = len(self.gene_coverages.index) self.gene_detections = pd.DataFrame.from_dict( self.profile_db.gene_detection_dict, orient='index', dtype=float) self.gene_detections.drop(self.samples_to_exclude, axis=1, inplace=True) self.samples = set(self.gene_coverages.columns)