def load_from_files(self, args):
    # Bootstrap an interactive run directly from user files when no RUNINFO dict
    # exists: a splits FASTA (-f), a TAB-delimited metadata file (-m), a newick
    # tree (-t) and an output directory (-o) are all mandatory in this mode.
    if (not self.fasta_file) or (not self.metadata) or (not self.tree) or (not self.output_dir):
        raise ConfigError, "If you do not have a RUNINFO dict, you must declare each of\
                            '-f', '-m', '-t' and '-o' parameters. Please see '--help' for\
                            more detailed information on them."

    # views and states come from a RUNINFO.cp, so these flags make no sense here:
    if self.view:
        raise ConfigError, "You can't use '-v' parameter when this program is not called with a RUNINFO.cp"

    if self.show_views:
        raise ConfigError, "Sorry, there are no views to show when there is no RUNINFO.cp :/"

    metadata_path = os.path.abspath(self.metadata)
    self.p_meta['splits_fasta'] = os.path.abspath(self.fasta_file)
    self.p_meta['output_dir'] = os.path.abspath(self.output_dir)
    self.p_meta['views'] = {}
    # a single mock view/clustering is set up since there is no profile database:
    self.p_meta['default_view'] = 'single'
    self.p_meta['default_clustering'] = 'default'
    self.p_meta['available_clusterings'] = ['default']
    self.p_meta['clusterings'] = {'default': {'newick': open(os.path.abspath(self.tree)).read()}}

    self.default_view = self.p_meta['default_view']

    # an optional serialized summary index may be attached for inspection purposes
    if self.summary_index:
        self.p_meta['profile_summary_index'] = os.path.abspath(self.summary_index)
        self.splits_summary_index = dictio.read_serialized_object(self.p_meta['profile_summary_index'])

    # sanity of the metadata: must be TAB-delimited, and its first column header
    # must literally be 'contig'
    filesnpaths.is_file_tab_delimited(metadata_path)
    metadata_columns = utils.get_columns_of_TAB_delim_file(metadata_path, include_first_column=True)
    if not metadata_columns[0] == "contig":
        raise ConfigError, "The first row of the first column of the metadata file must\
                            say 'contig', which is not the case for your metadata file\
                            ('%s'). Please make sure this is a properly formatted metadata\
                            file." % (metadata_path)

    # store metadata as view:
    self.views[self.default_view] = {'header': metadata_columns[1:],
                                     'dict': utils.get_TAB_delimited_file_as_dictionary(metadata_path)}
    self.split_names_ordered = self.views[self.default_view]['dict'].keys()

    filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
    self.split_sequences = utils.get_FASTA_file_as_dictionary(self.p_meta['splits_fasta'])

    # setup a mock splits_basic_info dict (length and GC content come straight
    # from the FASTA sequences; every name in the metadata is assumed to be in
    # the FASTA — a missing one would raise a KeyError here)
    self.splits_basic_info = {}
    for split_id in self.split_names_ordered:
        self.splits_basic_info[split_id] = {'length': len(self.split_sequences[split_id]),
                                            'gc_content': utils.get_GC_content_for_sequence(self.split_sequences[split_id])}

    # reminder: this is being stored in the output dir provided as a commandline parameter:
    self.p_meta['self_path'] = os.path.join(self.p_meta['output_dir'], 'RUNINFO.cp')

    # NOTE(review): this is a no-op; presumably a placeholder mirroring sibling
    # loaders that derive a default title in the else branch
    if self.title:
        self.title = self.title

    filesnpaths.gen_output_directory(self.p_meta['output_dir'])
def populate_from_file(self, additional_data_file_path, skip_check_names=None):
    """Read a TAB-delimited additional data file and route it to the proper table.

    Layer order targets go through `OrderDataBaseClass.add`; everything else is
    handed to `AdditionalDataBaseClass.add` along with the data keys.
    """
    keys = utils.get_columns_of_TAB_delim_file(additional_data_file_path)
    entries = utils.get_TAB_delimited_file_as_dictionary(additional_data_file_path)

    # a file without a single data column is useless here
    if not len(keys):
        raise ConfigError("There is something wrong with the additional data file for %s at %s.\
                           It does not seem to have any additional keys for data :/" \
                                            % (self.target, additional_data_file_path))

    if self.target == 'layer_orders':
        OrderDataBaseClass.add(self, entries, skip_check_names)
        return

    AdditionalDataBaseClass.add(self, entries, keys, skip_check_names)
def get(engine, run=run):
    """Load every substitution scoring matrix (SSM) shipped for `engine`.

    Scans the `<this module's dir>/<engine>` directory for `*.txt` files, checks
    that each is a square matrix over the items the engine expects (amino acids,
    nucleotides, or codons), and returns a dict mapping matrix id to its parsed
    contents. Raises ConfigError for unknown engines or malformed matrices.
    """
    if engine not in engines:
        raise ConfigError("Anvi'o was about to populate the SSMs, but it does not know about the engine '%s'." % engine)

    matrices = {}
    engine_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), engine)
    matrix_file_paths = [p for p in glob.glob(os.path.join(engine_dir, '*')) if p.endswith('.txt')]

    for matrix_path in matrix_file_paths:
        matrix_id = os.path.basename(matrix_path).split('.txt')[0]

        # row names (first column, minus its header) and column names must match
        matrix_rows = u.get_column_data_from_TAB_delim_file(matrix_path, column_indices=[0])[0][1:]
        matrix_columns = u.get_columns_of_TAB_delim_file(matrix_path, include_first_column=False)

        if sorted(matrix_columns) != sorted(matrix_rows):
            raise ConfigError("Anvi'o found a substitution scoring matrix named '%s'. However, it doesn't look like\
                               a nicely done matrix. Substitution scoring matrices must contain the same row and column\
                               names (i.e., a square matrix that is equal to its transpose). Well. This one does not :/" \
                                            % (os.path.basename(matrix_path)))

        # the alphabet the matrix is allowed to describe depends on the engine
        if engine == 'AA':
            expected_items = set(list(constants.amino_acids))
        elif engine == 'NT':
            expected_items = set(list(constants.nucleotides))
        elif engine == 'CDN':
            expected_items = set(list(constants.codons))

        unexpected_items_in_matrix = [item for item in matrix_columns if item not in expected_items]
        if len(unexpected_items_in_matrix):
            raise ConfigError("It seems you have a poorly done substitution scoring matrix named '%s' in the data directory.\
                               Anvi'o expects an %s substitution matrix to describe one or more of these %d guys: '%s'. But\
                               the matrix %s had stuff anvi'o is not familiar with: '%s'." % \
                                        (matrix_id, engine, len(expected_items), ', '.join(expected_items),
                                         matrix_id, ', '.join(unexpected_items_in_matrix)))

        # first column holds item names (str), the rest are scores (float)
        matrices[matrix_id] = u.get_TAB_delimited_file_as_dictionary(matrix_path,
                                                                     column_mapping=[str] + [float] * len(expected_items))

    if len(matrices):
        run.warning('%d matri%s been loaded: "%s".' % \
                            (len(matrices), 'ces have' if len(matrices) > 1 else 'x has', ', '.join(list(matrices.keys()))),
                    header='%s substitution scoring matrices' % engine, lc="green")

    return matrices
def convert_view_data_into_json(self):
    '''This function's name must change to something more meaningful.'''

    # optional 'additional layers' come from a TAB-delimited file; when absent,
    # the dict stays None but the header list stays empty, so the step (6)
    # comprehension below never touches the None dict
    additional_layers_dict, additional_layer_headers = None, []
    if self.additional_layers_path:
        additional_layers_dict = utils.get_TAB_delimited_file_as_dictionary(self.additional_layers_path)
        additional_layer_headers = utils.get_columns_of_TAB_delim_file(self.additional_layers_path)

    for view in self.views:
        # here we will populate runinfo['views'] with json objects.
        view_dict = self.views[view]['dict']
        view_headers = self.views[view]['header']

        json_object = []

        # NOTE: the numbered steps below for the header row and the per-split
        # rows must stay in lockstep — both sides append columns in the same
        # (1)..(8) order.

        # (1) set the header line with the first entry:
        json_header = ['contigs']

        # (2) add taxonomy, if exitsts:
        if len(self.splits_taxonomy_dict):
            json_header.extend(['taxonomy'])

        # (3) then add split summaries from contigs db, if exists
        if len(self.genes_in_splits_summary_dict):
            json_header.extend(self.genes_in_splits_summary_headers[1:])

        # (4) then add length and GC content
        basic_info_headers = ['length', 'gc_content']
        json_header.extend(basic_info_headers)

        # (5) then add the view!
        json_header.extend(view_headers)

        # (6) then add 'additional' headers as the outer ring:
        if additional_layer_headers:
            json_header.extend(additional_layer_headers)

        # (7) finally add hmm search results
        if self.hmm_searches_header:
            json_header.extend([tpl[0] for tpl in self.hmm_searches_header])

        # (8) and finalize it (yay):
        json_object.append(json_header)

        for split_name in view_dict:
            # (1)
            json_entry = [split_name]

            # (2)
            if self.splits_taxonomy_dict:
                json_entry.extend([self.splits_taxonomy_dict[split_name]['t_species']])

            # (3)
            if self.genes_in_splits_summary_dict:
                json_entry.extend([self.genes_in_splits_summary_dict[split_name][header] for header in self.genes_in_splits_summary_headers[1:]])

            # (4)
            json_entry.extend([self.splits_basic_info[split_name][header] for header in basic_info_headers])

            # (5) adding essential data for the view
            json_entry.extend([view_dict[split_name][header] for header in view_headers])

            # (6) adding additional layers (None for splits absent from the file)
            json_entry.extend([additional_layers_dict[split_name][header] if additional_layers_dict.has_key(split_name) else None for header in additional_layer_headers])

            # (7) adding hmm stuff: either the raw per-split entries, or just
            # their counts, depending on the split_hmm_layers flag
            if self.hmm_searches_dict:
                if self.split_hmm_layers:
                    json_entry.extend([self.hmm_searches_dict[split_name][header] if self.hmm_searches_dict.has_key(split_name) else None for header in [tpl[0] for tpl in self.hmm_searches_header]])
                else:
                    json_entry.extend([len(self.hmm_searches_dict[split_name][header]) if self.hmm_searches_dict.has_key(split_name) else 0 for header in [tpl[1] for tpl in self.hmm_searches_header]])

            # (8) send it along!
            json_object.append(json_entry)

        # the view entry is replaced in place by its JSON-ready table
        self.views[view] = json_object
def load_from_anvio_files(self, args):
    # Load a full anvi'o run from a profile database plus a contigs database,
    # populating views, clusterings, states, collections and the display title.
    if not self.contigs_db_path:
        raise ConfigError, "Anvi'o needs the contigs database to make sense of this run."

    ProfileSuperclass.__init__(self, args)

    # this is a weird place to do it, but we are going to ask ContigsSuperclass function to load
    # all the split sequences since only now we know the mun_contig_length that was used to profile
    # this stuff
    self.init_split_sequences(self.p_meta['min_contig_length'])

    self.collections.populate_sources_dict(self.profile_db_path, anvio.__profile__version__)

    self.p_meta['self_path'] = self.profile_db_path
    self.p_meta['output_dir'] = os.path.join(os.getcwd(), os.path.dirname(self.profile_db_path))

    # create an instance of states table
    self.states_table = TablesForStates(self.profile_db_path, anvio.__profile__version__)

    # load views from the profile database
    self.load_views()
    self.default_view = self.p_meta['default_view']

    # if the user wants to see available views, show them and exit.
    if self.show_views:
        run.warning('', header = 'Available views (%d)' % len(self.views), lc = 'green')
        for view in self.views:
            run.info(view,
                     'Via "%s" table' % self.views[view]['table_name'],
                     lc='crimson',
                     mc='green' if view == self.default_view else 'crimson')
        print
        sys.exit()

    # same deal for states: list them and bail out.
    if self.show_states:
        run.warning('', header = 'Available states (%d)' % len(self.states_table.states), lc = 'green')
        for state in self.states_table.states:
            run.info(state,
                     'Last modified %s' % self.states_table.states[state]['last_modified'],
                     lc='crimson',
                     mc='crimson')
        print
        sys.exit()

    # if the user has an additional view data, load it up into the self.views dict.
    if self.additional_view_path:
        filesnpaths.is_file_tab_delimited(self.additional_view_path)
        additional_view_columns = utils.get_columns_of_TAB_delim_file(self.additional_view_path)

        if not additional_view_columns[-1] == '__parent__':
            raise ConfigError, "The last column of the additional view must be '__parent__' with the proper\
                                parent information for each split."

        # first column is the split name (str), last is '__parent__' (str),
        # everything in between is numerical view data (float)
        column_mapping = [str] + [float] * (len(additional_view_columns) - 1) + [str]

        self.views['user_view'] = {'table_name': 'NA',
                                   'header': additional_view_columns,
                                   'dict': utils.get_TAB_delimited_file_as_dictionary(self.additional_view_path, column_mapping = column_mapping)}

    # if the user specifies a view, set it as default:
    if self.view:
        if not self.view in self.views:
            raise ConfigError, "The requested view ('%s') is not available for this run. Please see\
                                available views by running this program with --show-views flag." % self.view

        self.default_view = self.view

    self.p_meta['clusterings'] = self.clusterings

    # an extra newick tree given on the command line either becomes the only
    # clustering (when the db has none) or is added alongside existing ones
    if self.tree:
        entry_id = os.path.basename(self.tree).split('.')[0]
        if not self.p_meta['clusterings']:
            self.p_meta['default_clustering'] = entry_id
            self.p_meta['available_clusterings'] = [entry_id]
            self.p_meta['clusterings'] = {entry_id: {'newick': open(os.path.abspath(self.tree)).read()}}
            run.info('Additional Tree', "Splits will be organized based on '%s'." % entry_id)
        else:
            self.p_meta['clusterings'][entry_id] = {'newick': open(os.path.abspath(self.tree)).read()}
            run.info('Additional Tree', "'%s' has been added to available trees." % entry_id)

    # set title (fall back to a prettified sample id)
    if self.title:
        self.title = self.title
    else:
        self.title = self.p_meta['sample_id'].replace('-', ' ').replace('_', ' ')

    # do we have auxiliary data available?
    if not self.auxiliary_data_available:
        summary_cp_available = os.path.exists(os.path.join(os.path.dirname(self.profile_db_path), 'SUMMARY.cp'))

        self.run.warning("Auxiliary data is not available; which means you will not be able to perform\
                          certain operations (i.e., the inspect menu in the interactive interface will\
                          not work, etc). %s" % ('' if not summary_cp_available else "Although, you have\
                          a SUMMARY.cp file in your work directory, which means you are working with an\
                          outdated anvi'o run. You can convert your SUMMARY.cp into an auxiliary data file\
                          by using `anvi-script-generate-auxiliary-data-from-summary-cp` script."))

    if self.state:
        if not self.state in self.states_table.states:
            raise ConfigError, "The requested state ('%s') is not available for this run. Please see\
                                available states by running this program with --show-states flag." % self.state
def load_from_user_files(self, args):
    # Ad hoc ('--manual-mode') loading: no contigs database allowed; a FASTA
    # (-f), a view data file (-d) and a tree (-t) are required, plus a profile
    # database path that will be created empty if it does not exist yet.
    if self.contigs_db_path:
        raise ConfigError, "When you want to use the interactive interface in an ad hoc manner, you must\
                            not use a contigs database."

    if not self.profile_db_path:
        raise ConfigError, "Even when you want to use the interactive interface in an ad hoc manner by\
                            using the '--manual-mode' flag, you still need to declare a profile database.\
                            The profile database in this mode only used to read or store the 'state' of\
                            the display for visualization purposes. You DO NOT need to point to an already\
                            existing database, as anvi'o will generate an empty one for your if there is no\
                            profile database."

    if (not self.fasta_file) or (not self.view_data_path) or (not self.tree):
        raise ConfigError, "When you are running the interactive interface in manual mode, you must declare\
                            each of '-f', '-d', and '-t' parameters. Please see the help menu for more info."

    # views/states only exist for real profile runs:
    if self.view:
        raise ConfigError, "You can't use '--view' parameter when you are running the interactive interface\
                            in manual mode"

    if self.show_views:
        raise ConfigError, "Sorry, there are no views to show in manual mode :/"

    if self.show_states:
        raise ConfigError, "Sorry, there are no states to show in manual mode :/"

    view_data_path = os.path.abspath(self.view_data_path)
    self.p_meta['splits_fasta'] = os.path.abspath(self.fasta_file)
    self.p_meta['output_dir'] = None
    self.p_meta['views'] = {}
    self.p_meta['merged'] = True
    # a single mock view/clustering backed by the user files:
    self.p_meta['default_view'] = 'single'
    self.p_meta['default_clustering'] = 'default'
    self.p_meta['available_clusterings'] = ['default']
    self.p_meta['clusterings'] = {'default': {'newick': open(os.path.abspath(self.tree)).read()}}

    self.default_view = self.p_meta['default_view']

    # sanity of the view data: TAB-delimited, with 'contig' as the first header
    filesnpaths.is_file_tab_delimited(view_data_path)
    view_data_columns = utils.get_columns_of_TAB_delim_file(view_data_path, include_first_column=True)
    if not view_data_columns[0] == "contig":
        raise ConfigError, "The first row of the first column of the view data file must\
                            say 'contig', which is not the case for your view data file\
                            ('%s'). Please make sure this is a properly formatted view data\
                            file." % (view_data_path)

    # load view data as the default view:
    self.views[self.default_view] = {'header': view_data_columns[1:],
                                     'dict': utils.get_TAB_delimited_file_as_dictionary(view_data_path)}
    self.split_names_ordered = self.views[self.default_view]['dict'].keys()

    # we assume that the sample names are the header of the view data, so we might as well set it up:
    self.p_meta['samples'] = self.views[self.default_view]['header']

    filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
    self.split_sequences = utils.get_FASTA_file_as_dictionary(self.p_meta['splits_fasta'])

    # setup a mock splits_basic_info dict
    self.splits_basic_info = {}
    for split_id in self.split_names_ordered:
        self.splits_basic_info[split_id] = {'length': len(self.split_sequences[split_id]),
                                            'gc_content': utils.get_GC_content_for_sequence(self.split_sequences[split_id])}

    # create a new, empty profile database for ad hoc operations
    if not os.path.exists(self.profile_db_path):
        profile_db = ProfileDatabase(self.profile_db_path)
        profile_db.create({'db_type': 'profile',
                           'merged': True,
                           'contigs_db_hash': None,
                           'samples': ','.join(self.p_meta['samples'])})

    # create an instance of states table
    self.states_table = TablesForStates(self.profile_db_path, anvio.__profile__version__)

    # also populate collections, if there are any
    self.collections.populate_sources_dict(self.profile_db_path, anvio.__profile__version__)

    # NOTE(review): no-op; presumably a placeholder mirroring sibling loaders
    # that derive a default title in the else branch
    if self.title:
        self.title = self.title
def load_from_profile_database(self, args): if self.p_meta['version'] != anvio.__profile__version__: raise ConfigError, "The profile database has a version number that differs from the version that is valid\ for this codebase (the profile database is at '%s', and the codebase is at '%s'). Very\ unfortunately, you need to re-profile and re-merge this project using the current anvi'o :(" self.p_meta['self_path'] = self.profile_db_path self.p_meta['output_dir'] = os.path.join(os.getcwd(), os.path.dirname(self.profile_db_path)) # create an instance of states table self.states_table = TablesForStates(self.profile_db_path, anvio.__profile__version__) # load views from the profile database self.load_views() self.default_view = self.p_meta['default_view'] # if the user wants to see available views, show them and exit. if self.show_views: run.warning('', header = 'Available views (%d)' % len(self.views), lc = 'green') for view in self.views: run.info(view, 'Via "%s" table' % self.views[view]['table_name'], lc='crimson', mc='green' if view == self.default_view else 'crimson') print sys.exit() # if the user has an additional view data, load it up into the self.views dict. if self.additional_view_path: filesnpaths.is_file_tab_delimited(self.additional_view_path) additional_view_columns = utils.get_columns_of_TAB_delim_file(self.additional_view_path) if not additional_view_columns[-1] == '__parent__': raise ConfigError, "The last column of the additional view must be '__parent__' with the proper\ parent information for each split." column_mapping = [str] + [float] * (len(additional_view_columns) - 1) + [str] self.views['user_view'] = {'table_name': 'NA', 'header': additional_view_columns, 'dict': utils.get_TAB_delimited_file_as_dictionary(self.additional_view_path, column_mapping = column_mapping)} # if the user specifies a view, set it as default: if self.view: if not self.view in self.views: raise ConfigError, "The requested view ('%s') is not available for this run. 
Please see\ available views by running this program with --show-views flag." % self.view self.default_view = self.view self.p_meta['clusterings'] = self.clusterings if self.tree: entry_id = os.path.basename(self.tree).split('.')[0] if not self.p_meta['clusterings']: self.p_meta['default_clustering'] = entry_id self.p_meta['available_clusterings'] = [entry_id] self.p_meta['clusterings'] = {entry_id: {'newick': open(os.path.abspath(self.tree)).read()}} run.info('Additional Tree', "Splits will be organized based on '%s'." % entry_id) else: self.p_meta['clusterings'][entry_id] = {'newick': open(os.path.abspath(self.tree)).read()} run.info('Additional Tree', "'%s' has been added to available trees." % entry_id) # is summary being overwritten? if self.summary_index: run.info('Warning', "The default summary index in RUNINFO is being overriden by '%s'." % self.summary_index) self.p_meta['profile_summary_index'] = os.path.abspath(self.summary_index) if os.path.exists(self.P('SUMMARY.cp')): self.splits_summary_index = dictio.read_serialized_object(self.P('SUMMARY.cp')) else: self.splits_summary_index = None run.warning("SUMMARY.cp is missing for your run. Anvi'o will continue working (well, at least\ it will attempt to do it), but things may behave badly with the absence of\ SUMMARY.cp (first and foremost, you will not be able to inspect individual\ contigs through any of the interactive interfaces). Please investigate it\ if you were not expecting this.") # set title if self.title: self.title = self.title + ' (%s)' % self.default_view else: self.title = self.p_meta['sample_id'] + ' (%s)' % self.default_view
def load_manual_mode(self, args):
    # Manual-mode loading: only a newick tree is mandatory; view data and a
    # FASTA of splits are optional. A profile database path is required, but
    # an empty one is created if it does not exist.
    if self.contigs_db_path:
        raise ConfigError, "When you want to use the interactive interface in manual mode, you must\
                            not use a contigs database."

    if not self.profile_db_path:
        raise ConfigError, "Even when you want to use the interactive interface in manual mode, you need\
                            to declare a profile database. The profile database in this mode only used to\
                            read or store the 'state' of the display for visualization purposes. You DO\
                            NOT need to point to an already existing database, as anvi'o will generate\
                            an empty one for your if there is no profile database."

    if not self.tree:
        raise ConfigError, "When you are running the interactive interface in manual mode, you must declare\
                            at least the tree file. Please see the documentation for help."

    # views/states only exist for real profile runs:
    if self.view:
        raise ConfigError, "You can't use '--view' parameter when you are running the interactive interface\
                            in manual mode"

    if self.show_views:
        raise ConfigError, "Sorry, there are no views to show in manual mode :/"

    if self.show_states:
        raise ConfigError, "Sorry, there are no states to show in manual mode :/"

    filesnpaths.is_file_exists(self.tree)
    tree = filesnpaths.is_proper_newick(self.tree)

    view_data_path = os.path.abspath(self.view_data_path) if self.view_data_path else None
    self.p_meta['splits_fasta'] = os.path.abspath(self.fasta_file) if self.fasta_file else None
    self.p_meta['output_dir'] = None
    self.p_meta['views'] = {}
    self.p_meta['merged'] = True
    self.p_meta['default_view'] = 'single'
    # clustering id carries the tree file's name with unknown distance/linkage:
    clustering_id = '%s:unknown:unknown' % filesnpaths.get_name_from_file_path(self.tree)
    self.p_meta['default_clustering'] = clustering_id
    self.p_meta['available_clusterings'] = [clustering_id]
    self.p_meta['clusterings'] = {clustering_id: {'newick': ''.join([l.strip() for l in open(os.path.abspath(self.tree)).readlines()])}}

    self.default_view = self.p_meta['default_view']

    if self.view_data_path:
        # sanity of the view data
        filesnpaths.is_file_tab_delimited(view_data_path)
        view_data_columns = utils.get_columns_of_TAB_delim_file(view_data_path, include_first_column=True)
        if not view_data_columns[0] == "contig":
            raise ConfigError, "The first row of the first column of the view data file must\
                                say 'contig', which is not the case for your view data file\
                                ('%s'). Please make sure this is a properly formatted view data\
                                file." % (view_data_path)

        # load view data as the default view:
        self.views[self.default_view] = {'header': view_data_columns[1:],
                                         'dict': utils.get_TAB_delimited_file_as_dictionary(view_data_path)}
    else:
        # no view data is provided... it is only the tree we have. we will creaet a mock 'view data dict'
        # here using what is in the tree.
        names_in_the_tree = [n.name for n in tree.get_leaves()]

        ad_hoc_dict = {}
        for item in names_in_the_tree:
            ad_hoc_dict[item] = {'names': item}

        self.views[self.default_view] = {'header': ['names'],
                                         'dict': ad_hoc_dict}

    self.split_names_ordered = self.views[self.default_view]['dict'].keys()

    # we assume that the sample names are the header of the view data, so we might as well set it up:
    self.p_meta['samples'] = self.views[self.default_view]['header']

    # if we have an input FASTA file, we will set up the split_sequences and splits_basic_info dicts,
    # otherwise we will leave them empty
    self.splits_basic_info = {}
    self.split_sequences = None
    if self.p_meta['splits_fasta']:
        filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
        self.split_sequences = utils.get_FASTA_file_as_dictionary(self.p_meta['splits_fasta'])

        # every name in the view data must have a sequence in the FASTA
        names_missing_in_FASTA = set(self.split_names_ordered) - set(self.split_sequences.keys())
        num_names_missing_in_FASTA = len(names_missing_in_FASTA)
        if num_names_missing_in_FASTA:
            raise ConfigError, 'Some of the names in your view data does not have corresponding entries in the\
                                FASTA file you provided. Here is an example to one of those %d names that occur\
                                in your data file, but not in the FASTA file: "%s"' % (num_names_missing_in_FASTA, names_missing_in_FASTA.pop())

        # setup a mock splits_basic_info dict
        for split_id in self.split_names_ordered:
            self.splits_basic_info[split_id] = {'length': len(self.split_sequences[split_id]),
                                                'gc_content': utils.get_GC_content_for_sequence(self.split_sequences[split_id])}

    # create a new, empty profile database for manual operations
    if not os.path.exists(self.profile_db_path):
        profile_db = ProfileDatabase(self.profile_db_path)
        profile_db.create({'db_type': 'profile',
                           'merged': True,
                           'contigs_db_hash': None,
                           'samples': ','.join(self.p_meta['samples'])})

    # create an instance of states table
    self.states_table = TablesForStates(self.profile_db_path, anvio.__profile__version__)

    # also populate collections, if there are any
    self.collections.populate_collections_dict(self.profile_db_path, anvio.__profile__version__)

    # NOTE(review): no-op; presumably a placeholder mirroring sibling loaders
    # that derive a default title in the else branch
    if self.title:
        self.title = self.title
def use_external_gene_calls_to_populate_genes_in_contigs_table(
        self,
        input_file_path,
        gene_calls_dict=None,
        ignore_internal_stop_codons=False,
        skip_predict_frame=False,
        skip_amino_acid_sequences=False):
    """Add genes to the contigs database.

    Primary input is either an `input_file_path` for external gene calls, or an
    external `gene_calls_dict` dictionary object.

    Parameters
    ==========
    input_file_path : str
        Path to file with one of the following structures.

        Option 1:

            gene_callers_id contig          start  stop  direction  partial  call_type  source    version
            0               CACHJY01_00016  0      693   r          1        1          prodigal  v2.6.3
            1               CACHJY01_00016  711    1140  r          0        1          prodigal  v2.6.3

        Option 2:

            gene_callers_id contig          start  stop  direction  partial  call_type  source    version  aa_sequence
            0               CACHJY01_00016  0      693   r          1        1          prodigal  v2.6.3   MSKKIYFTEYSKVNRLQTISNFTGSA
            1               CACHJY01_00016  711    1140  r          0        1          prodigal  v2.6.3   MVNVDYHGLIAGAGSGKTKVLTSRIAHIIK

    gene_calls_dict : dict, None
        Alternative to `input_file_path`. If provided, entries will be APPENDED to the database.
        So you need to make sure gene caller ids in your dict does not overlap with the ones in
        the database. Should look like:

            {
                "1": {
                    "contig": "contig_name",
                    "start": 20,
                    "stop": 1544,
                    "direction": "f",
                    "partial": 0,
                    "call_type": 1,
                    "source": "source_name",
                    "version": "unknown",
                    "aa_sequence": "MSKKIYFTEYSKVNRLQTISNFTGSA"
                },

                "2": {
                  (...)
                },

            (...)
            }

        All entries are required except "aa_sequence", which is optional. If provided, it should
        be present for ALL entries, even if it is an empty string. It's presence will be used to
        populate `gene_amino_acid_sequences`.

    ignore_internal_stop_codons : bool, False
        If False, ConfigError will be raised if a stop codon is found inside any gene. If True,
        this is suppressed and the stop codon is replaced with the character `X`.

    skip_predict_frame : bool, False
        If True, ConfigError will be raised if a gene is not divisible by 3. If False, anvi'o
        predicts the most likley open reading frame and trims the start/stop of the gene call to
        reflect this change so that the gene *is* divisible by 3. This flag allows the retention
        of amino acid sequences even if genes are not divisible by 3, or when it is flagged as
        partial.

    skip_amino_acid_sequences : bool, False
        Should the gene_amino_acid_sequences table be populated? This may be useful if genes
        that are not translated are being added, such as ribosomal RNA genes, etc.
    """

    # by default we assume that this is a pristine run. but if the user sends a dictionary
    append_to_the_db = False

    gene_calls_found = False

    # let's do a rigorous check whether the user provided a gene_calls_dict.
    if (gene_calls_dict is not None and gene_calls_dict is not False):
        if not isinstance(gene_calls_dict, dict):
            raise ConfigError(
                "'Use external gene calls' function received a non-empty gene_calls_dict object,\
                 but it is of type '%s', and not '%s'" % (type(gene_calls_dict), type({})))

        # congrats, we have a dict.
        gene_calls_found = True

        # enforce the all-or-nothing 'aa_sequence' rule documented above
        has_aa_seq = lambda x: True if 'aa_sequence' in x else False
        num_with_aa_seqs = sum([
            has_aa_seq(gene_call)
            for gene_call in gene_calls_dict.values()
        ])
        num_gene_calls = len(gene_calls_dict)
        if num_with_aa_seqs != 0 and num_with_aa_seqs != num_gene_calls:
            raise ConfigError(
                "The gene_calls_dict passed to use_external_gene_calls_to_populate_genes_in_contigs_table "
                "has %d entries with 'aa_sequence' and %d without. Either 0 or all (%d) should have "
                "'aa_sequence'" % (num_with_aa_seqs, num_gene_calls - num_with_aa_seqs, num_gene_calls))

        if not len(gene_calls_dict):
            # but it is empty ... silly user.
            self.run.info_single(
                "'Use external gene calls' function found an empty gene calls dict, returning "
                "prematurely and assuming you know what's up. If you don't, stop here and try to "
                "identify what decisions you've made might have led you to this weird point your "
                "workflow (or 'life', totally up to you and your mood, but anvi'o thinks you've "
                "done great so far.",
                nl_before=1,
                nl_after=1)
            return

    # exactly one of the two input channels must be used
    if (not input_file_path and not gene_calls_found) or (input_file_path and gene_calls_found):
        raise ConfigError(
            "You must provide either an input file, or an gene calls dict to process external "
            "gene calls. You called `use_external_gene_calls_to_populate_genes_in_contigs_table` "
            "with wrong parameters.")

    Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress, simple=True)

    # take care of gene calls dict
    if not gene_calls_found:
        # no dict was given: read the gene calls from the input file instead
        expected_fields = t.genes_in_contigs_table_structure
        column_mapping = [int, str, int, int, str, int, int, str, str]

        if 'aa_sequence' in utils.get_columns_of_TAB_delim_file(
                input_file_path):
            expected_fields = t.genes_in_contigs_table_structure + [
                'aa_sequence'
            ]
            column_mapping.append(
                lambda x: '' if x is None else str(x))  # str(None) is 'None', amazingly

        gene_calls_dict = utils.get_TAB_delimited_file_as_dictionary(
            input_file_path,
            expected_fields=expected_fields,
            only_expected_fields=True,
            column_mapping=column_mapping)

        if not len(gene_calls_dict):
            raise ConfigError(
                "You provided an external gene calls file, but it returned zero gene calls. Assuming that "
                "this is an error, anvi'o will stop here and complain. If this is not an error and you "
                "in fact expected this, the proper way of doing this is to use `--skip-gene-calls` flag, "
                "instead of providing an emtpy external gene calls file. You don't agree? You need this "
                "for some weird step for you weird pipeline? Let us know, and we will consider changing "
                "this.")

        self.run.info(
            "External gene calls",
            "%d gene calls recovered and will be processed." % len(gene_calls_dict))
    else:
        # FIXME: we need to make sure the gene caller ids in the incoming directory is not going to
        #        overwrite an existing gene call. Something like this would have returned the
        #        current max, which could be cross-checked with what's in the dict:
        #
        #            contigs_db = ContigsDatabase(self.db_path)
        #            next_id = contigs_db.db.get_max_value_in_column('genes_in_contigs', 'gene_callers_id') + 1
        #            contigs_db.disconnect()
        append_to_the_db = True

    # recover amino acid sequences or create a blank dictionary
    if skip_amino_acid_sequences:
        amino_acid_sequences = dict([(g, '') for g in gene_calls_dict])
    else:
        gene_calls_dict, amino_acid_sequences = self.get_amino_acid_sequences_for_genes_in_gene_calls_dict(
            gene_calls_dict,
            ignore_internal_stop_codons=ignore_internal_stop_codons,
            skip_predict_frame=skip_predict_frame,
        )

    # populate genes_in_contigs, and gene_amino_acid_sequences table in contigs db.
    self.populate_genes_in_contigs_table(gene_calls_dict,
                                         amino_acid_sequences,
                                         append_to_the_db=append_to_the_db)
def predict_from_TAB_delimited_file(self, file_path):
    """Load a TAB-delimited file and run `self.predict` on its contents.

    The first column is read as str (identifiers), every remaining column
    as float.
    """
    column_names = utils.get_columns_of_TAB_delim_file(file_path)
    mapping = [str] + [float] * len(column_names)
    entries = utils.get_TAB_delimited_file_as_dictionary(file_path, column_mapping=mapping)
    return self.predict(entries)
def parse_pubs_txt(self):
    """Parse the publications file into `self.pubs_dict`, keyed by year.

    Reads the optional info file at `self.pubs_info_file_path` into `self.info`,
    then parses `self.pubs_file_path` (TAB-delimited, indexed by 'doi'). Author
    strings are expected as 'Last, First; Last, First; ...'; a trailing '*' on
    the first-name part marks a co-first author, a trailing '+' a co-senior
    author.

    Raises
    ======
    ConfigError
        If any mandatory column is absent from the publications file.
    """
    # optional side file with extra per-publication info
    if os.path.exists(self.pubs_info_file_path):
        self.info = u.get_TAB_delimited_file_as_dictionary(
            self.pubs_info_file_path)

    pubs_header = u.get_columns_of_TAB_delim_file(
        self.pubs_file_path, include_first_column=True)
    headers_expected = [
        'Authors', 'Title', 'Publication', 'Volume', 'Number', 'Pages', 'Year',
        'doi'
    ]
    # BUGFIX: report the mandatory headers that are *absent from the file*.
    # The previous comprehension listed the file's extra headers instead
    # (`h for h in pubs_header if h not in headers_expected`), so a file
    # missing a mandatory column sailed through unnoticed.
    missing_headers = [h for h in headers_expected if h not in pubs_header]
    if len(missing_headers):
        raise ConfigError(
            "Sorry, the pubs.txt seems to be missing some of the headers that are mandatory. Each of \
             the columns in the following list must be present in this file: %s (hint: yours do not have\
             the following: %s)." %
            (', '.join(headers_expected), ', '.join(missing_headers)))

    self.pubs_txt = u.get_TAB_delimited_file_as_dictionary(
        self.pubs_file_path, indexing_field=pubs_header.index('doi'))

    for doi in self.pubs_txt:
        authors = []
        co_first_authors = []
        co_senior_authors = []
        p = self.pubs_txt[doi]

        for author in [_.strip() for _ in p['Authors'].split(';')]:
            if not len(author):
                continue

            # 'Last, First' -> 'Last FM' (initials only for the first names)
            author_last_name, author_first_name_raw = [
                _.strip() for _ in author.split(',')
            ]
            author_first_name = ''.join(
                [n[0] for n in author_first_name_raw.split()])
            author_final_name = '%s %s' % (author_last_name,
                                           author_first_name)

            # trailing markers on the raw first name flag authorship roles
            if author_first_name_raw.endswith('*'):
                co_first_authors.append(author_final_name)
            elif author_first_name_raw.endswith('+'):
                co_senior_authors.append(author_final_name)

            authors.append(author_final_name)

        # issue string: 'Volume(Number):Pages' or 'Volume:Pages' when no Number
        if p['Number']:
            issue = '%s(%s):%s' % (p['Volume'], p['Number'], p['Pages'])
        else:
            issue = '%s:%s' % (p['Volume'], p['Pages'])

        year = p['Year'].strip()
        pub_entry = {
            'authors': authors,
            'title': p['Title'],
            'journal': p['Publication'],
            'issue': issue,
            'doi': doi,
            'year': year,
            'co_first_authors': co_first_authors,
            'co_senior_authors': co_senior_authors
        }

        if year not in self.pubs_dict:
            self.pubs_dict[year] = [pub_entry]
        else:
            self.pubs_dict[year].append(pub_entry)
def convert_view_data_into_json(self):
    '''This function's name must change to something more meaningful.

       Converts every entry in `self.views` from its {'header': [...], 'dict': {...}}
       form into a flat JSON-style table (a list of rows, the first row being the
       header). Columns are assembled ring by ring for the interactive display:
       (1) item name, (2) taxonomy, (3) gene summaries from the contigs db,
       (4) length/GC content, (5) the view data itself, (6) additional layers,
       (7) HMM hits, (8) done. NOTE: this REPLACES the original dicts in
       self.views in place.
    '''
    additional_layers_dict, additional_layers_headers = self.additional_layers_dict, self.additional_layers_headers
    if self.additional_layers_path:
        # user-provided additional layers file is merged on top of whatever
        # additional layers we already have in memory
        additional_layers_dict = utils.get_TAB_delimited_file_as_dictionary(self.additional_layers_path, dict_to_append=additional_layers_dict, assign_none_for_missing=True)
        additional_layers_headers = additional_layers_headers + utils.get_columns_of_TAB_delim_file(self.additional_layers_path)

    for view in self.views:
        # here we will populate runinfo['views'] with json objects.
        view_dict = self.views[view]['dict']
        view_headers = self.views[view]['header']

        json_object = []

        # (1) set the header line with the first entry:
        json_header = ['contigs']

        # (2) add taxonomy, if exists:
        if len(self.splits_taxonomy_dict):
            json_header.extend(['taxonomy'])

        # (3) then add split summaries from contigs db, if exists
        if len(self.genes_in_splits_summary_dict):
            json_header.extend(self.genes_in_splits_summary_headers[1:])

        # (4) then add length and GC content IF we have sequences available
        # NOTE: basic_info_headers is intentionally defined only here; the per-row
        # code below reuses it under the same `self.splits_basic_info` guard.
        if self.splits_basic_info:
            basic_info_headers = ['length', 'gc_content']
            json_header.extend(basic_info_headers)

        # (5) then add the view!
        json_header.extend(view_headers)

        # (6) then add 'additional' headers as the outer ring:
        if additional_layers_headers:
            json_header.extend(additional_layers_headers)

        # (7) finally add hmm search results
        if self.hmm_searches_header:
            json_header.extend([tpl[0] for tpl in self.hmm_searches_header])

        # (8) and finalize it (yay):
        json_object.append(json_header)

        for split_name in view_dict:
            # (1)
            json_entry = [split_name]

            # (2)
            if self.splits_taxonomy_dict:
                if split_name in self.splits_taxonomy_dict:
                    json_entry.extend([self.splits_taxonomy_dict[split_name]])
                else:
                    json_entry.extend([None])

            # (3)
            if self.genes_in_splits_summary_dict:
                json_entry.extend([self.genes_in_splits_summary_dict[split_name][header] for header in self.genes_in_splits_summary_headers[1:]])

            # (4)
            if self.splits_basic_info:
                json_entry.extend([self.splits_basic_info[split_name][header] for header in basic_info_headers])

            # (5) adding essential data for the view
            json_entry.extend([view_dict[split_name][header] for header in view_headers])

            # (6) adding additional layers
            json_entry.extend([additional_layers_dict[split_name][header] if split_name in additional_layers_dict else None for header in additional_layers_headers])

            # (7) adding hmm stuff
            # NOTE(review): the two branches index hmm_searches_header tuples
            # differently (tpl[0] vs tpl[1]) — presumably raw values vs hit
            # counts keyed by a different tuple slot; confirm against the code
            # that builds self.hmm_searches_header before touching this.
            if self.hmm_searches_dict:
                if self.split_hmm_layers:
                    json_entry.extend([self.hmm_searches_dict[split_name][header] if split_name in self.hmm_searches_dict else None for header in [tpl[0] for tpl in self.hmm_searches_header]])
                else:
                    json_entry.extend([len(self.hmm_searches_dict[split_name][header]) if split_name in self.hmm_searches_dict else 0 for header in [tpl[1] for tpl in self.hmm_searches_header]])

            # (8) send it along!
            json_object.append(json_entry)

        self.views[view] = json_object
def load_full_mode(self, args):
    """Initialize the interactive display for a full anvi'o run.

       Requires both a contigs database and a profile database. Populates
       self.p_meta, self.views, self.states_table, collections, clusterings,
       and the display title, honoring --show-views / --show-states (which
       print and exit), an optional additional view file, an optional
       additional tree, and --state-autoload.
    """
    if not self.contigs_db_path:
        raise ConfigError, "Anvi'o needs the contigs database to make sense of this run (or maybe you\
                            should use the `--manual` flag if that's what your intention)."

    if not self.profile_db_path:
        raise ConfigError, "So you want to run anvi'o in full mode, but without a profile database?\
                            Well. This does not make any sense."

    if not args.skip_init_functions:
        self.init_functions()

    ProfileSuperclass.__init__(self, args)

    # this is a weird place to do it, but we are going to ask ContigsSuperclass function to load
    # all the split sequences since only now we know the min_contig_length that was used to profile
    # this stuff
    self.init_split_sequences(self.p_meta['min_contig_length'])

    self.collections.populate_collections_dict(self.profile_db_path)

    self.p_meta['self_path'] = self.profile_db_path
    self.p_meta['output_dir'] = os.path.join(os.getcwd(), os.path.dirname(self.profile_db_path))

    # create an instance of states table
    self.states_table = TablesForStates(self.profile_db_path)

    # load views from the profile database; a 'blank' profile gets a single
    # all-zeros mock view instead
    if self.p_meta['blank']:
        blank_dict = {}
        for split_name in self.splits_basic_info:
            blank_dict[split_name] = {'blank_view': 0}

        self.views['blank_view'] = {'header': ['blank_view'], 'dict': blank_dict}
        self.default_view = 'blank_view'
    else:
        self.load_views()
        self.default_view = self.p_meta['default_view']

    # if the user wants to see available views, show them and exit.
    if self.show_views:
        run.warning('', header='Available views (%d)' % len(self.views), lc='green')
        for view in self.views:
            run.info(view, 'Via "%s" table' % self.views[view]['table_name'], lc='crimson', mc='green' if view == self.default_view else 'crimson')
        print
        sys.exit()

    if self.show_states:
        run.warning('', header='Available states (%d)' % len(self.states_table.states), lc='green')
        for state in self.states_table.states:
            run.info(state, 'Last modified %s' % self.states_table.states[state]['last_modified'], lc='crimson', mc='crimson')
        print
        sys.exit()

    # if the user has an additional view data, load it up into the self.views dict.
    if self.additional_view_path:
        filesnpaths.is_file_tab_delimited(self.additional_view_path)
        additional_view_columns = utils.get_columns_of_TAB_delim_file(self.additional_view_path)

        if not additional_view_columns[-1] == '__parent__':
            raise ConfigError, "The last column of the additional view must be '__parent__' with the proper\
                                parent information for each split."

        # first column is the item name (str), last is '__parent__' (str),
        # everything in between is numerical layer data (float)
        column_mapping = [str] + [float] * (len(additional_view_columns) - 1) + [str]
        self.views['user_view'] = {'table_name': 'NA',
                                   'header': additional_view_columns,
                                   'dict': utils.get_TAB_delimited_file_as_dictionary(self.additional_view_path, column_mapping=column_mapping)}

    # if the user specifies a view, set it as default:
    if self.view:
        if not self.view in self.views:
            raise ConfigError, "The requested view ('%s') is not available for this run. Please see\
                                available views by running this program with --show-views flag." % self.view

        self.default_view = self.view

    self.p_meta['clusterings'] = self.clusterings

    # an additional tree given on the command line either becomes the default
    # clustering (if there were none) or is added to the available ones
    if self.tree:
        clustering_id = '%s:unknown:unknown' % filesnpaths.get_name_from_file_path(self.tree)
        if not self.p_meta['clusterings']:
            self.p_meta['default_clustering'] = clustering_id
            self.p_meta['available_clusterings'] = [clustering_id]
            self.p_meta['clusterings'] = {clustering_id: {'newick': open(os.path.abspath(self.tree)).read()}}
            run.info('Additional Tree', "Splits will be organized based on '%s'." % clustering_id)
        else:
            self.p_meta['clusterings'][clustering_id] = {'newick': open(os.path.abspath(self.tree)).read()}
            run.info('Additional Tree', "'%s' has been added to available trees." % clustering_id)

    # set title (NOTE(review): the first branch is a no-op kept so the else
    # only fires when no title was given by the user)
    if self.title:
        self.title = self.title
    else:
        self.title = self.p_meta['sample_id'].replace('-', ' ').replace('_', ' ')

    # do we have auxiliary data available?
    if not self.auxiliary_profile_data_available:
        summary_cp_available = os.path.exists(os.path.join(os.path.dirname(self.profile_db_path), 'SUMMARY.cp'))
        self.run.warning("Auxiliary data is not available; which means you will not be able to perform\
                          certain operations (i.e., the inspect menu in the interactive interface will\
                          not work, etc). %s" % ('' if not summary_cp_available else "Although, you have\
                          a SUMMARY.cp file in your work directory, which means you are working with an\
                          outdated anvi'o run. You can convert your SUMMARY.cp into an auxiliary data file\
                          by using `anvi-script-generate-auxiliary-data-from-summary-cp` script."))

    if self.state_autoload:
        if not self.state_autoload in self.states_table.states:
            raise ConfigError, "The requested state ('%s') is not available for this run. Please see\
                                available states by running this program with --show-states flag." % self.state_autoload
def load_manual_mode(self, args):
    """Initialize the interactive display in manual mode (no contigs db).

       Requires a tree file and a profile database path (the latter is only
       used to store display state and may not exist yet — an empty one is
       created). View data and a FASTA file of item sequences are optional;
       without view data, a mock single-column view is built from the tree
       leaves.
    """
    if self.contigs_db_path:
        raise ConfigError, "When you want to use the interactive interface in manual mode, you must\
                            not use a contigs database."

    # if the user is using an existing profile database, we need to make sure that it is not associated
    # with a contigs database, since it would mean that it is a full anvi'o profile database and should
    # not be included in manual operations.
    if os.path.exists(self.profile_db_path):
        profile_db = ProfileDatabase(self.profile_db_path)
        if profile_db.meta['contigs_db_hash']:
            raise ConfigError, "Well. It seems the profile database is associated with a contigs database,\
                                which means using it in manual mode is not the best way to use it. Probably\
                                what you wanted to do is to let the manual mode create a new profile database\
                                for you. Simply type in a new profile database path (it can be a file name\
                                that doesn't exist)."

    if not self.profile_db_path:
        raise ConfigError, "Even when you want to use the interactive interface in manual mode, you need\
                            to declare a profile database. The profile database in this mode only used to\
                            read or store the 'state' of the display for visualization purposes. You DO\
                            NOT need to point to an already existing database, as anvi'o will generate\
                            an empty one for your if there is no profile database."

    if not self.tree:
        raise ConfigError, "When you are running the interactive interface in manual mode, you must declare\
                            at least the tree file. Please see the documentation for help."

    if self.view:
        raise ConfigError, "You can't use '--view' parameter when you are running the interactive interface\
                            in manual mode"

    if self.show_views:
        raise ConfigError, "Sorry, there are no views to show in manual mode :/"

    if self.show_states:
        raise ConfigError, "Sorry, there are no states to show in manual mode :/"

    filesnpaths.is_file_exists(self.tree)
    tree = filesnpaths.is_proper_newick(self.tree)

    view_data_path = os.path.abspath(self.view_data_path) if self.view_data_path else None
    self.p_meta['splits_fasta'] = os.path.abspath(self.fasta_file) if self.fasta_file else None
    self.p_meta['output_dir'] = None
    self.p_meta['views'] = {}
    self.p_meta['merged'] = True
    self.p_meta['default_view'] = 'single'
    # the tree file name becomes the clustering id; distance/linkage are unknown
    clustering_id = '%s:unknown:unknown' % filesnpaths.get_name_from_file_path(self.tree)
    self.p_meta['default_clustering'] = clustering_id
    self.p_meta['available_clusterings'] = [clustering_id]
    # newick is stored with all line breaks/whitespace stripped out
    self.p_meta['clusterings'] = {clustering_id: {'newick': ''.join([l.strip() for l in open(os.path.abspath(self.tree)).readlines()])}}

    self.default_view = self.p_meta['default_view']

    if self.view_data_path:
        # sanity of the view data
        filesnpaths.is_file_tab_delimited(view_data_path)
        view_data_columns = utils.get_columns_of_TAB_delim_file(view_data_path, include_first_column=True)
        if not view_data_columns[0] == "contig":
            raise ConfigError, "The first row of the first column of the view data file must\
                                say 'contig', which is not the case for your view data file\
                                ('%s'). Please make sure this is a properly formatted view data\
                                file." % (view_data_path)

        # load view data as the default view:
        self.views[self.default_view] = {'header': view_data_columns[1:],
                                         'dict': utils.get_TAB_delimited_file_as_dictionary(view_data_path)}
    else:
        # no view data is provided... it is only the tree we have. we will create a mock 'view data dict'
        # here using what is in the tree.
        names_in_the_tree = [n.name for n in tree.get_leaves()]

        ad_hoc_dict = {}
        for item in names_in_the_tree:
            ad_hoc_dict[item] = {'names': item}

        self.views[self.default_view] = {'header': ['names'], 'dict': ad_hoc_dict}

    self.displayed_item_names_ordered = self.views[self.default_view]['dict'].keys()

    # we assume that the sample names are the header of the view data, so we might as well set it up:
    self.p_meta['samples'] = self.views[self.default_view]['header']

    # if we have an input FASTA file, we will set up the split_sequences and splits_basic_info dicts,
    # otherwise we will leave them empty
    self.splits_basic_info = {}
    self.split_sequences = None
    if self.p_meta['splits_fasta']:
        filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
        self.split_sequences = utils.get_FASTA_file_as_dictionary(self.p_meta['splits_fasta'])

        # every displayed item must have a sequence in the FASTA file
        names_missing_in_FASTA = set(self.displayed_item_names_ordered) - set(self.split_sequences.keys())
        num_names_missing_in_FASTA = len(names_missing_in_FASTA)
        if num_names_missing_in_FASTA:
            raise ConfigError, 'Some of the names in your view data does not have corresponding entries in the\
                                FASTA file you provided. Here is an example to one of those %d names that occur\
                                in your data file, but not in the FASTA file: "%s"' % (num_names_missing_in_FASTA, names_missing_in_FASTA.pop())

        # setup a mock splits_basic_info dict
        for split_id in self.displayed_item_names_ordered:
            self.splits_basic_info[split_id] = {'length': len(self.split_sequences[split_id]),
                                                'gc_content': utils.get_GC_content_for_sequence(self.split_sequences[split_id])}

    # create a new, empty profile database for manual operations
    if not os.path.exists(self.profile_db_path):
        profile_db = ProfileDatabase(self.profile_db_path)
        profile_db.create({'db_type': 'profile', 'merged': True, 'contigs_db_hash': None, 'samples': ','.join(self.p_meta['samples'])})

    # create an instance of states table
    self.states_table = TablesForStates(self.profile_db_path)

    # also populate collections, if there are any
    self.collections.populate_collections_dict(self.profile_db_path)

    # NOTE(review): this branch is a no-op; unlike load_full_mode there is no
    # else clause deriving a default title here
    if self.title:
        self.title = self.title
def predict_from_TAB_delimited_file(self, file_path):
    """Parse a TAB-delimited file (string key column followed by float
    data columns) into a dictionary and return `self.predict` over it."""
    cols = utils.get_columns_of_TAB_delim_file(file_path)

    mapping = [str]
    mapping.extend([float] * len(cols))

    return self.predict(utils.get_TAB_delimited_file_as_dictionary(file_path, column_mapping=mapping))
def append_dict_to_file(self, dict_to_append, file_handle):
    """This function adds a TAB-delimited dictionary to the end of the file.

       If the file is empty, it writes the header as well as adding the dictionary contents.
       Otherwise, it checks that the dictionary contains the same keys as the header and appends
       the dictionary contents to the end of the file.

       NOTE: on the append path this method mutates `self.key_header`, `self.headers`,
       and (if unset) `self.keys_order` to match what is already in the file.

       Parameters
       ==========
       dict_to_append : dictionary
           Holds the data you want to add to the end of the file. Keys should be headers of the file.
       file_handle : a file object
           Pointer to the file, opened in append mode. The calling function should take care of the
           open() and pass the handle here
    """
    # local import to avoid a circular dependency at module load time
    import anvio.utils as utils

    if is_file_empty(self.path):
        # fresh file: delegate header + body writing entirely to utils
        utils.store_dict_as_TAB_delimited_file(dict_to_append, None, headers=self.headers, file_obj=file_handle, \
                                               key_header=self.key_header, keys_order=self.keys_order, \
                                               header_item_conversion_dict=self.header_item_conversion_dict, \
                                               do_not_close_file_obj=True)
    else:
        # if dictionary is empty, just return
        if not dict_to_append:
            return

        file_headers = utils.get_columns_of_TAB_delim_file(self.path, include_first_column=True)
        # assumes all inner dicts share the same keys; we only inspect the first
        inner_dict_keys = list(dict_to_append.values())[0].keys()

        # figure out if the first column holds the keys of the outer dictionary or one of the inner dictionary keys
        if file_headers[0] in inner_dict_keys:
            self.key_header = None
            self.headers = file_headers
        else:
            self.key_header = file_headers[0]
            self.headers = file_headers[1:]

        # check that the inner dictionary has the file headers we need
        missing_headers = [h for h in self.headers if h not in inner_dict_keys]
        if len(missing_headers):
            if anvio.DEBUG:
                if len(missing_headers) > 10:
                    raise FilesNPathsError(f"Some headers from the file (n={len(missing_headers)}) are not in your dictionary :/ "
                                           f"Here are the first ten of them: {missing_headers[:10].__str__()}")
                else:
                    raise FilesNPathsError(f"Some headers from the file are not in your dictionary :/ Here they are: {missing_headers.__str__()}")
            else:
                raise FilesNPathsError("Some headers from the file are not in your dictionary :/ Use `--debug` to see where this "
                                       "error is coming from the codebase with a list of example keys that are missing.")

        # check that any requested outer dictionary keys are present
        if not self.keys_order:
            # no explicit order requested: write rows in sorted key order
            self.keys_order = sorted(dict_to_append.keys())
        else:
            missing_keys = [k for k in self.keys_order if k not in dict_to_append]
            if len(missing_keys):
                if anvio.DEBUG:
                    if len(missing_keys) > 10:
                        raise FilesNPathsError(f"Some keys (n={len(missing_keys)}) are not in your dictionary :/ Here are "
                                               f"the first ten of them: {missing_keys[:10].__str__()}")
                    else:
                        raise FilesNPathsError(f"Some keys are not in your dictionary :/ Here they are: {missing_keys.__str__()}")
                else:
                    raise FilesNPathsError("Some keys are not in your dictionary :/ Use `--debug` to see where this "
                                           "error is coming from the codebase with a list of example keys that are "
                                           "missing.")

        # dict looks okay, append it to file
        for k in self.keys_order:
            if self.key_header:  # first column is key of outer dict
                line = [str(k)]
            else:  # do not put the key of outer dict in the first column
                line = []

            for header in self.headers:
                try:
                    val = dict_to_append[k][header]
                except KeyError:
                    raise FilesNPathsError(f"Header '{header}' is not found in the dict for key '{k}':/")
                except TypeError:
                    raise FilesNPathsError("Your dictionary is not properly formatted to be exported "
                                           f"as a TAB-delimited file :/ You ask for '{header}', but it is not "
                                           "even a key in the dictionary")

                # None values are written as empty cells
                line.append(str(val) if not isinstance(val, type(None)) else '')

            if anvio.AS_MARKDOWN:
                file_handle.write(f"|{'|'.join(map(str, line))}|\n")
            else:
                file_handle.write('%s\n' % '\t'.join(line))