def __init__(self, fasta_file_path, gene_caller='prodigal', args=None, progress=progress, run=run, debug=False):
    """Validate the input FASTA file and remember which gene caller to use.

    `fasta_file_path` is checked with `filesnpaths.is_file_exists` and
    `filesnpaths.is_file_fasta_formatted` before anything is stored.
    `gene_caller` must be one of the keys of `self.gene_callers`; otherwise
    a ConfigError is raised. `args`, `progress`, `run` and `debug` are stored
    on the instance as they are.
    """
    filesnpaths.is_file_exists(fasta_file_path)
    filesnpaths.is_file_fasta_formatted(fasta_file_path)

    self.fasta_file_path = fasta_file_path
    self.run = run
    self.progress = progress
    self.args = args
    self.debug = debug
    self.tmp_dirs = []

    # gene caller name -> class implementing it
    self.gene_callers = {'prodigal': Prodigal}
    self.gene_caller = gene_caller

    if self.gene_caller not in self.gene_callers:
        # the message has two placeholders: the requested name and the list
        # of available names. both must be supplied, otherwise the string
        # formatting itself fails with a TypeError.
        raise ConfigError("The gene caller you requested ('%s') is not available at this point. "
                          "here is a list of what we have: %s."
                          % (self.gene_caller, ', '.join(self.gene_callers)))
def load_from_files(self, args):
    """Populate the run from individual input files when there is no RUNINFO dict.

    Reads `self.fasta_file`, `self.metadata`, `self.tree` and `self.output_dir`
    (all four are mandatory), fills `self.p_meta`, loads the metadata file as
    the default view, reads the FASTA file, and builds a mock
    `self.splits_basic_info` dict from the sequences. `args` is not used here.

    Raises ConfigError if a mandatory input is missing, if `self.view` or
    `self.show_views` is set, or if the first column header of the metadata
    file is not 'contig'.
    """
    if (not self.fasta_file) or (not self.metadata) or (not self.tree) or (not self.output_dir):
        raise ConfigError("If you do not have a RUNINFO dict, you must declare each of "
                          "'-f', '-m', '-t' and '-o' parameters. Please see '--help' for "
                          "more detailed information on them.")

    if self.view:
        raise ConfigError("You can't use '-v' parameter when this program is not called with a RUNINFO.cp")

    if self.show_views:
        raise ConfigError("Sorry, there are no views to show when there is no RUNINFO.cp :/")

    metadata_path = os.path.abspath(self.metadata)
    self.p_meta['splits_fasta'] = os.path.abspath(self.fasta_file)
    self.p_meta['output_dir'] = os.path.abspath(self.output_dir)
    self.p_meta['views'] = {}
    self.p_meta['default_view'] = 'single'
    self.p_meta['default_clustering'] = 'default'
    self.p_meta['available_clusterings'] = ['default']

    # read the tree through a context manager so the file handle is closed
    with open(os.path.abspath(self.tree)) as tree_file:
        newick = tree_file.read()
    self.p_meta['clusterings'] = {'default': {'newick': newick}}

    self.default_view = self.p_meta['default_view']

    if self.summary_index:
        self.p_meta['profile_summary_index'] = os.path.abspath(self.summary_index)
        self.splits_summary_index = dictio.read_serialized_object(self.p_meta['profile_summary_index'])

    # sanity of the metadata
    filesnpaths.is_file_tab_delimited(metadata_path)
    metadata_columns = utils.get_columns_of_TAB_delim_file(metadata_path, include_first_column=True)
    if not metadata_columns[0] == "contig":
        raise ConfigError("The first row of the first column of the metadata file must "
                          "say 'contig', which is not the case for your metadata file "
                          "('%s'). Please make sure this is a properly formatted metadata "
                          "file." % (metadata_path))

    # store metadata as view:
    self.views[self.default_view] = {'header': metadata_columns[1:],
                                     'dict': utils.get_TAB_delimited_file_as_dictionary(metadata_path)}
    self.split_names_ordered = self.views[self.default_view]['dict'].keys()

    filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
    self.split_sequences = utils.get_FASTA_file_as_dictionary(self.p_meta['splits_fasta'])

    # setup a mock splits_basic_info dict
    self.splits_basic_info = {}
    for split_id in self.split_names_ordered:
        sequence = self.split_sequences[split_id]
        self.splits_basic_info[split_id] = {'length': len(sequence),
                                            'gc_content': utils.get_GC_content_for_sequence(sequence)}

    # reminder: this is being stored in the output dir provided as a commandline parameter:
    self.p_meta['self_path'] = os.path.join(self.p_meta['output_dir'], 'RUNINFO.cp')

    # NOTE(review): self-assignment kept from the original; it is a no-op
    # unless `title` is a property with a setter -- confirm and drop.
    if self.title:
        self.title = self.title

    filesnpaths.gen_output_directory(self.p_meta['output_dir'])
def get_GC_content_for_FASTA_entries(file_path):
    """Return a dict that maps each entry id in the FASTA file at `file_path`
    to the value `get_GC_content_for_sequence` computes for its sequence.

    The file is first checked for existence and FASTA formatting.
    """
    filesnpaths.is_file_exists(file_path)
    filesnpaths.is_file_fasta_formatted(file_path)

    source = u.SequenceSource(file_path)
    gc_by_entry_id = {}

    # `next()` advances the reader and returns a falsy value once exhausted
    while True:
        if not source.next():
            break
        gc_by_entry_id[source.id] = get_GC_content_for_sequence(source.seq)

    return gc_by_entry_id
def get_FASTA_file_as_dictionary(file_path):
    """Return the contents of the FASTA file at `file_path` as a dict that
    maps each entry id to its sequence.

    The file is first checked for existence and FASTA formatting.
    """
    filesnpaths.is_file_exists(file_path)
    filesnpaths.is_file_fasta_formatted(file_path)

    source = u.SequenceSource(file_path)
    sequences_by_id = {}

    # `next()` advances the reader and returns a falsy value once exhausted
    while True:
        if not source.next():
            break
        sequences_by_id[source.id] = source.seq

    return sequences_by_id
def __init__(self, db_path, contigs_fasta=None, run=terminal.Run(), progress=terminal.Progress(), debug=False):
    """Store the parameters on the instance and validate the inputs.

    `db_path` is checked with `utils.is_contigs_db`. If `contigs_fasta` is
    given, it must exist and be FASTA formatted.
    """
    self.run, self.progress = run, progress
    self.db_path, self.contigs_fasta = db_path, contigs_fasta
    self.debug = debug

    utils.is_contigs_db(self.db_path)

    # the FASTA file is optional; only validate it when one was provided
    if self.contigs_fasta:
        for check in (filesnpaths.is_file_exists, filesnpaths.is_file_fasta_formatted):
            check(self.contigs_fasta)
def __init__(self, db_path, contigs_fasta=None, run=run, progress=progress, debug=False):
    """Keep the given parameters as attributes and sanity check the inputs.

    The database at `db_path` is checked with `utils.is_contigs_db`. When a
    `contigs_fasta` path is provided, the file must exist and be FASTA
    formatted.
    """
    self.debug = debug
    self.contigs_fasta = contigs_fasta
    self.db_path = db_path
    self.progress = progress
    self.run = run

    utils.is_contigs_db(self.db_path)

    fasta_path = self.contigs_fasta
    if fasta_path:
        # existence first, format second
        filesnpaths.is_file_exists(fasta_path)
        filesnpaths.is_file_fasta_formatted(fasta_path)
def load_references_for_removal(self):
    """Load and perform some sanity checks on the references for removal.

    Reads `self.references_for_removal_txt` as a TAB-delimited dictionary,
    merges it into `self.fasta_information`, checks every reference name with
    `u.check_sample_id`, and verifies the file behind each 'path' entry
    (existence only for '.gz' files, FASTA formatting otherwise). When the
    config does not ask to keep the reads (`dont_remove_just_map`),
    `self.remove_short_reads_based_on_references` is set to True.

    Raises ConfigError if a reference name is not acceptable, if an entry has
    no 'path' column, or (in references mode) if a name is also present in
    `self.contigs_information`.
    """
    self.references_for_removal = u.get_TAB_delimited_file_as_dictionary(
        self.references_for_removal_txt)

    # adding the references_for_removal to the fasta_information dict
    self.fasta_information.update(self.references_for_removal)

    for sample in self.references_for_removal.keys():
        try:
            u.check_sample_id(sample)
        except ConfigError as e:
            # report the file that is actually being processed here (the
            # references for removal txt), not the samples txt file
            raise ConfigError(
                "While processing the references for removal txt file ('%s'), anvi'o ran into the following error: "
                "%s" % (self.references_for_removal_txt, e))

    files_that_end_with_gz = []
    for ref_dict in self.references_for_removal.values():
        if 'path' not in ref_dict:
            raise ConfigError(
                'Yor references for removal txt file is not formatted properly. It must have only two columns '
                'with the headers "reference" and "path".')

        if ref_dict['path'].endswith('.gz'):
            filesnpaths.is_file_exists(ref_dict['path'])
            files_that_end_with_gz.append(ref_dict['path'])
        else:
            # if the file is not compressed then we can verify that it is a fasta file
            filesnpaths.is_file_fasta_formatted(ref_dict['path'])

    if files_that_end_with_gz:
        run.warning(
            'The following reference for removal files are compressed: %s. '
            'That\'s fine, but it means that we will skip the '
            'sanity check to verify that this is actually '
            'a properly formatted fasta file. Things are '
            'probably Ok, this is just one of these occasions '
            'in which anvi\'o is oversharing.' % ', '.join(files_that_end_with_gz))

    if self.references_mode:
        # Make sure that the user didn't give the same name to references and references_for_removal
        ref_name_in_both = [
            r for r in self.references_for_removal if r in self.contigs_information
        ]
        if ref_name_in_both:
            raise ConfigError(
                'You must have unique names for your fasta files in your fasta txt file '
                'and your references for removal txt file. These are the names that appear '
                'in both: %s' % ', '.join(ref_name_in_both))

    dont_remove = self.get_param_value_from_config(
        ['remove_short_reads_based_on_references', 'dont_remove_just_map'])
    if not dont_remove:
        self.remove_short_reads_based_on_references = True
def __init__(self, protein_sequences_fasta, progress=progress, run=run):
    """Remember the protein sequences FASTA file and reset the result slots.

    `protein_sequences_fasta` must pass
    `filesnpaths.is_file_fasta_formatted` before it is stored.
    """
    self.progress, self.run = progress, run

    filesnpaths.is_file_fasta_formatted(protein_sequences_fasta)
    self.protein_sequences_fasta = protein_sequences_fasta

    # hmm_scan_hits is the file to access later on for parsing:
    self.hmm_scan_output = self.hmm_scan_hits = None
    self.genes_in_contigs = None

    self.tmp_dirs = []
def load_references_for_removal(self):
    """Load and perform some sanity checks on the references for removal.

    Reads `self.references_for_removal_txt` as a TAB-delimited dictionary,
    merges it into `self.fasta_information`, checks every reference name with
    `u.check_sample_id`, and verifies the file behind each 'path' entry
    (existence only for '.gz' files, FASTA formatting otherwise). When the
    config does not ask to keep the reads (`dont_remove_just_map`),
    `self.remove_short_reads_based_on_references` is set to True.

    Raises ConfigError if a reference name is not acceptable, if an entry has
    no 'path' column, or (in references mode) if a name is also present in
    `self.contigs_information`.
    """
    self.references_for_removal = u.get_TAB_delimited_file_as_dictionary(self.references_for_removal_txt)

    # adding the references_for_removal to the fasta_information dict
    self.fasta_information.update(self.references_for_removal)

    for sample in self.references_for_removal.keys():
        try:
            u.check_sample_id(sample)
        except ConfigError as e:
            # report the file that is actually being processed here (the
            # references for removal txt), not the samples txt file
            raise ConfigError("While processing the references for removal txt file ('%s'), anvi'o ran into "
                              "the following error: %s" % (self.references_for_removal_txt, e))

    files_that_end_with_gz = []
    for ref_dict in self.references_for_removal.values():
        if 'path' not in ref_dict:
            raise ConfigError('Yor references for removal txt file is not formatted properly. It must have '
                              'only two columns with the headers "reference" and "path".')

        if ref_dict['path'].endswith('.gz'):
            filesnpaths.is_file_exists(ref_dict['path'])
            files_that_end_with_gz.append(ref_dict['path'])
        else:
            # if the file is not compressed then we can verify that it is a fasta file
            filesnpaths.is_file_fasta_formatted(ref_dict['path'])

    if files_that_end_with_gz:
        # adjacent literals instead of backslash continuations, so the
        # message does not carry the indentation of the source file
        run.warning("The following reference for removal files are compressed: %s. "
                    "That's fine, but it means that we will skip the "
                    "sanity check to verify that this is actually "
                    "a properly formatted fasta file. Things are "
                    "probably Ok, this is just one of these occasions "
                    "in which anvi'o is oversharing." % ', '.join(files_that_end_with_gz))

    if self.references_mode:
        # Make sure that the user didn't give the same name to references and references_for_removal
        ref_name_in_both = [r for r in self.references_for_removal if r in self.contigs_information]
        if ref_name_in_both:
            raise ConfigError('You must have unique names for your fasta files in your fasta txt file '
                              'and your references for removal txt file. These are the names that appear '
                              'in both: %s' % ', '.join(ref_name_in_both))

    dont_remove = self.get_param_value_from_config(['remove_short_reads_based_on_references',
                                                    'dont_remove_just_map'])
    if not dont_remove:
        self.remove_short_reads_based_on_references = True
def __init__(self, fasta_file_path, gene_caller = 'prodigal', progress = progress, run = run, debug = False):
    """Validate the input FASTA file and remember which gene caller to use.

    `fasta_file_path` is checked with `filesnpaths.is_file_exists` and
    `filesnpaths.is_file_fasta_formatted` before anything is stored.
    `gene_caller` must be one of the keys of `self.gene_callers`; otherwise
    a ConfigError is raised. `progress`, `run` and `debug` are stored on the
    instance as they are.
    """
    filesnpaths.is_file_exists(fasta_file_path)
    filesnpaths.is_file_fasta_formatted(fasta_file_path)

    self.fasta_file_path = fasta_file_path
    self.run = run
    self.progress = progress
    self.debug = debug
    self.tmp_dirs = []

    # gene caller name -> class implementing it
    self.gene_callers = {'prodigal': Prodigal}
    self.gene_caller = gene_caller

    if self.gene_caller not in self.gene_callers:
        # the message has two placeholders: the requested name and the list
        # of available names. both must be supplied, otherwise the string
        # formatting itself fails with a TypeError. the call form of `raise`
        # is valid in both Python 2 and Python 3.
        raise ConfigError("The gene caller you requested ('%s') is not available at this point. "
                          "here is a list of what we have: %s."
                          % (self.gene_caller, ', '.join(self.gene_callers)))
def sanity_check(self):
    """Validate the parameters stored on the instance before running.

    Makes sure only one of `contigs_db_path` / `fasta_file_path` is set,
    checks the output path (or turns on `self.verbose` when there is none),
    validates whichever input was given, converts `min_palindrome_length`
    and `max_num_mismatches` to int in place, and enforces the lower bounds
    for `blast_word_size` (4) and `min_palindrome_length` (5).

    Raises ConfigError when any of these checks fails.
    """
    if self.contigs_db_path and self.fasta_file_path:
        raise ConfigError(
            "You should either choose a FASTA file or a contigs db to send to this "
            "class, not both :/")

    if self.output_file_path:
        filesnpaths.is_output_file_writable(self.output_file_path, ok_if_exists=False)
    else:
        # nothing will be written to disk, so results go to the terminal
        self.verbose = True

    if self.contigs_db_path:
        utils.is_contigs_db(self.contigs_db_path)

    if self.fasta_file_path:
        filesnpaths.is_file_fasta_formatted(self.fasta_file_path)

    # only catch what int() raises for bad input, so that unrelated errors
    # (including KeyboardInterrupt) are not masked as a ConfigError
    try:
        self.min_palindrome_length = int(self.min_palindrome_length)
    except (TypeError, ValueError, OverflowError):
        raise ConfigError("Minimum palindrome length must be an integer.")

    try:
        self.max_num_mismatches = int(self.max_num_mismatches)
    except (TypeError, ValueError, OverflowError):
        raise ConfigError(
            "Maximum number of mismatches must be an integer.")

    # the numbers quoted in the two messages below match the checks: values
    # below 4 (word size) and below 5 (palindrome length) are rejected
    if self.blast_word_size < 4:
        raise ConfigError(
            "For everyone's sake, we set the minimum value for the minimum word size for BLAST to "
            "4. If you need this to change, please let us know (or run the same command with `--debug` "
            "flag, find the location of this control, and hack anvi'o by replacing that 4 with something "
            "smaller -- anvi'o doesn't mind being hacked).")

    if self.min_palindrome_length < 5:
        raise ConfigError(
            "For everyone's sake, we set the minimum value for the minimum palindrome length to "
            "5. You have a problem with that? WELL, WELCOME TO THE CLUB, YOU'LL FIT RIGHT IN -- "
            "WE HAVE A PROBLEM WITH LOGIC TOO.")
def load_manual_mode(self, args):
    """Set up the interactive interface in manual mode from user-provided files.

    Requires `self.profile_db_path` and `self.tree`; `self.view_data_path`
    and `self.fasta_file` are optional. Fills `self.p_meta`, loads the view
    data as the default view (or builds one from the leaf names of the tree),
    optionally loads split sequences from the FASTA file, creates an empty
    profile database if none exists at `self.profile_db_path`, and sets up
    the states table and collections. `args` is not used here.

    Raises ConfigError for parameter combinations that are not allowed in
    manual mode, for view data whose first column header is not 'contig',
    and for names that are missing from the FASTA file.
    """
    if self.contigs_db_path:
        raise ConfigError("When you want to use the interactive interface in manual mode, you must "
                          "not use a contigs database.")

    if not self.profile_db_path:
        raise ConfigError("Even when you want to use the interactive interface in manual mode, you need "
                          "to declare a profile database. The profile database in this mode only used to "
                          "read or store the 'state' of the display for visualization purposes. You DO "
                          "NOT need to point to an already existing database, as anvi'o will generate "
                          "an empty one for your if there is no profile database.")

    if not self.tree:
        raise ConfigError("When you are running the interactive interface in manual mode, you must declare "
                          "at least the tree file. Please see the documentation for help.")

    if self.view:
        raise ConfigError("You can't use '--view' parameter when you are running the interactive interface "
                          "in manual mode")

    if self.show_views:
        raise ConfigError("Sorry, there are no views to show in manual mode :/")

    if self.show_states:
        raise ConfigError("Sorry, there are no states to show in manual mode :/")

    filesnpaths.is_file_exists(self.tree)
    tree = filesnpaths.is_proper_newick(self.tree)

    view_data_path = os.path.abspath(self.view_data_path) if self.view_data_path else None
    self.p_meta['splits_fasta'] = os.path.abspath(self.fasta_file) if self.fasta_file else None
    self.p_meta['output_dir'] = None
    self.p_meta['views'] = {}
    self.p_meta['merged'] = True
    self.p_meta['default_view'] = 'single'

    clustering_id = '%s:unknown:unknown' % filesnpaths.get_name_from_file_path(self.tree)
    self.p_meta['default_clustering'] = clustering_id
    self.p_meta['available_clusterings'] = [clustering_id]

    # the newick text is stored as a single line; read it through a context
    # manager so the file handle is closed
    with open(os.path.abspath(self.tree)) as tree_file:
        newick = ''.join([line.strip() for line in tree_file])
    self.p_meta['clusterings'] = {clustering_id: {'newick': newick}}

    self.default_view = self.p_meta['default_view']

    if self.view_data_path:
        # sanity of the view data
        filesnpaths.is_file_tab_delimited(view_data_path)
        view_data_columns = utils.get_columns_of_TAB_delim_file(view_data_path, include_first_column=True)
        if not view_data_columns[0] == "contig":
            raise ConfigError("The first row of the first column of the view data file must "
                              "say 'contig', which is not the case for your view data file "
                              "('%s'). Please make sure this is a properly formatted view data "
                              "file." % (view_data_path))

        # load view data as the default view:
        self.views[self.default_view] = {'header': view_data_columns[1:],
                                         'dict': utils.get_TAB_delimited_file_as_dictionary(view_data_path)}
    else:
        # no view data is provided... it is only the tree we have. we will create a mock 'view data dict'
        # here using what is in the tree.
        names_in_the_tree = [n.name for n in tree.get_leaves()]

        ad_hoc_dict = {}
        for item in names_in_the_tree:
            ad_hoc_dict[item] = {'names': item}

        self.views[self.default_view] = {'header': ['names'],
                                         'dict': ad_hoc_dict}

    self.split_names_ordered = self.views[self.default_view]['dict'].keys()

    # we assume that the sample names are the header of the view data, so we might as well set it up:
    self.p_meta['samples'] = self.views[self.default_view]['header']

    # if we have an input FASTA file, we will set up the split_sequences and splits_basic_info dicts,
    # otherwise we will leave them empty
    self.splits_basic_info = {}
    self.split_sequences = None
    if self.p_meta['splits_fasta']:
        filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
        self.split_sequences = utils.get_FASTA_file_as_dictionary(self.p_meta['splits_fasta'])

        names_missing_in_FASTA = set(self.split_names_ordered) - set(self.split_sequences.keys())
        num_names_missing_in_FASTA = len(names_missing_in_FASTA)
        if num_names_missing_in_FASTA:
            raise ConfigError('Some of the names in your view data does not have corresponding entries in the '
                              'FASTA file you provided. Here is an example to one of those %d names that occur '
                              'in your data file, but not in the FASTA file: "%s"'
                              % (num_names_missing_in_FASTA, names_missing_in_FASTA.pop()))

        # setup a mock splits_basic_info dict
        for split_id in self.split_names_ordered:
            sequence = self.split_sequences[split_id]
            self.splits_basic_info[split_id] = {'length': len(sequence),
                                                'gc_content': utils.get_GC_content_for_sequence(sequence)}

    # create a new, empty profile database for manual operations
    if not os.path.exists(self.profile_db_path):
        profile_db = ProfileDatabase(self.profile_db_path)
        profile_db.create({'db_type': 'profile',
                           'merged': True,
                           'contigs_db_hash': None,
                           'samples': ','.join(self.p_meta['samples'])})

    # create an instance of states table
    self.states_table = TablesForStates(self.profile_db_path, anvio.__profile__version__)

    # also populate collections, if there are any
    self.collections.populate_collections_dict(self.profile_db_path, anvio.__profile__version__)

    # NOTE(review): self-assignment kept from the original; it is a no-op
    # unless `title` is a property with a setter -- confirm and drop.
    if self.title:
        self.title = self.title
def load_from_user_files(self, args):
    """Set up the interactive interface in an ad hoc manner from user files.

    Requires `self.profile_db_path`, `self.fasta_file`, `self.view_data_path`
    and `self.tree`. Fills `self.p_meta`, loads the view data as the default
    view, reads the FASTA file into `self.split_sequences`, builds a mock
    `self.splits_basic_info`, creates an empty profile database if none
    exists at `self.profile_db_path`, and sets up the states table and
    collections. `args` is not used here.

    Raises ConfigError for missing or disallowed parameters and for view
    data whose first column header is not 'contig'.
    """
    if self.contigs_db_path:
        raise ConfigError("When you want to use the interactive interface in an ad hoc manner, you must "
                          "not use a contigs database.")

    if not self.profile_db_path:
        raise ConfigError("Even when you want to use the interactive interface in an ad hoc manner by "
                          "using the '--manual-mode' flag, you still need to declare a profile database. "
                          "The profile database in this mode only used to read or store the 'state' of "
                          "the display for visualization purposes. You DO NOT need to point to an already "
                          "existing database, as anvi'o will generate an empty one for your if there is no "
                          "profile database.")

    if (not self.fasta_file) or (not self.view_data_path) or (not self.tree):
        raise ConfigError("When you are running the interactive interface in manual mode, you must declare "
                          "each of '-f', '-d', and '-t' parameters. Please see the help menu for more info.")

    if self.view:
        raise ConfigError("You can't use '--view' parameter when you are running the interactive interface "
                          "in manual mode")

    if self.show_views:
        raise ConfigError("Sorry, there are no views to show in manual mode :/")

    if self.show_states:
        raise ConfigError("Sorry, there are no states to show in manual mode :/")

    view_data_path = os.path.abspath(self.view_data_path)
    self.p_meta['splits_fasta'] = os.path.abspath(self.fasta_file)
    self.p_meta['output_dir'] = None
    self.p_meta['views'] = {}
    self.p_meta['merged'] = True
    self.p_meta['default_view'] = 'single'
    self.p_meta['default_clustering'] = 'default'
    self.p_meta['available_clusterings'] = ['default']

    # read the tree through a context manager so the file handle is closed
    with open(os.path.abspath(self.tree)) as tree_file:
        newick = tree_file.read()
    self.p_meta['clusterings'] = {'default': {'newick': newick}}

    self.default_view = self.p_meta['default_view']

    # sanity of the view data
    filesnpaths.is_file_tab_delimited(view_data_path)
    view_data_columns = utils.get_columns_of_TAB_delim_file(view_data_path, include_first_column=True)
    if not view_data_columns[0] == "contig":
        raise ConfigError("The first row of the first column of the view data file must "
                          "say 'contig', which is not the case for your view data file "
                          "('%s'). Please make sure this is a properly formatted view data "
                          "file." % (view_data_path))

    # load view data as the default view:
    self.views[self.default_view] = {'header': view_data_columns[1:],
                                     'dict': utils.get_TAB_delimited_file_as_dictionary(view_data_path)}
    self.split_names_ordered = self.views[self.default_view]['dict'].keys()

    # we assume that the sample names are the header of the view data, so we might as well set it up:
    self.p_meta['samples'] = self.views[self.default_view]['header']

    filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
    self.split_sequences = utils.get_FASTA_file_as_dictionary(self.p_meta['splits_fasta'])

    # setup a mock splits_basic_info dict
    self.splits_basic_info = {}
    for split_id in self.split_names_ordered:
        sequence = self.split_sequences[split_id]
        self.splits_basic_info[split_id] = {'length': len(sequence),
                                            'gc_content': utils.get_GC_content_for_sequence(sequence)}

    # create a new, empty profile database for ad hoc operations
    if not os.path.exists(self.profile_db_path):
        profile_db = ProfileDatabase(self.profile_db_path)
        profile_db.create({'db_type': 'profile',
                           'merged': True,
                           'contigs_db_hash': None,
                           'samples': ','.join(self.p_meta['samples'])})

    # create an instance of states table
    self.states_table = TablesForStates(self.profile_db_path, anvio.__profile__version__)

    # also populate collections, if there are any
    self.collections.populate_sources_dict(self.profile_db_path, anvio.__profile__version__)

    # NOTE(review): self-assignment kept from the original; it is a no-op
    # unless `title` is a property with a setter -- confirm and drop.
    if self.title:
        self.title = self.title
def load_from_user_files(self, args):
    """Set up the interactive interface in an ad hoc manner from user files.

    Requires `self.profile_db_path`, `self.view_data_path` and `self.tree`;
    `self.fasta_file` is optional. Fills `self.p_meta`, loads the view data
    as the default view, optionally loads split sequences from the FASTA
    file, creates an empty profile database if none exists at
    `self.profile_db_path`, and sets up the states table and collections.
    `args` is not used here.

    Raises ConfigError for missing or disallowed parameters, for view data
    whose first column header is not 'contig', and for names that are
    missing from the FASTA file.
    """
    if self.contigs_db_path:
        raise ConfigError("When you want to use the interactive interface in an ad hoc manner, you must "
                          "not use a contigs database.")

    if not self.profile_db_path:
        raise ConfigError("Even when you want to use the interactive interface in an ad hoc manner by "
                          "using the '--manual-mode' flag, you still need to declare a profile database. "
                          "The profile database in this mode only used to read or store the 'state' of "
                          "the display for visualization purposes. You DO NOT need to point to an already "
                          "existing database, as anvi'o will generate an empty one for your if there is no "
                          "profile database.")

    if (not self.view_data_path) or (not self.tree):
        raise ConfigError("When you are running the interactive interface in manual mode, you must declare "
                          "each of the '-d', and '-t' parameters. Please see the documentation for help.")

    if self.view:
        raise ConfigError("You can't use '--view' parameter when you are running the interactive interface "
                          "in manual mode")

    if self.show_views:
        raise ConfigError("Sorry, there are no views to show in manual mode :/")

    if self.show_states:
        raise ConfigError("Sorry, there are no states to show in manual mode :/")

    filesnpaths.is_file_exists(self.tree)
    filesnpaths.is_proper_newick(self.tree)

    view_data_path = os.path.abspath(self.view_data_path)
    self.p_meta['splits_fasta'] = os.path.abspath(self.fasta_file) if self.fasta_file else None
    self.p_meta['output_dir'] = None
    self.p_meta['views'] = {}
    self.p_meta['merged'] = True
    self.p_meta['default_view'] = 'single'
    self.p_meta['default_clustering'] = 'default'
    self.p_meta['available_clusterings'] = ['default']

    # read the tree through a context manager so the file handle is closed
    with open(os.path.abspath(self.tree)) as tree_file:
        newick = tree_file.read()
    self.p_meta['clusterings'] = {'default': {'newick': newick}}

    self.default_view = self.p_meta['default_view']

    # sanity of the view data
    filesnpaths.is_file_tab_delimited(view_data_path)
    view_data_columns = utils.get_columns_of_TAB_delim_file(view_data_path, include_first_column=True)
    if not view_data_columns[0] == "contig":
        raise ConfigError("The first row of the first column of the view data file must "
                          "say 'contig', which is not the case for your view data file "
                          "('%s'). Please make sure this is a properly formatted view data "
                          "file." % (view_data_path))

    # load view data as the default view:
    self.views[self.default_view] = {
        'header': view_data_columns[1:],
        'dict': utils.get_TAB_delimited_file_as_dictionary(view_data_path)
    }
    self.split_names_ordered = self.views[self.default_view]['dict'].keys()

    # we assume that the sample names are the header of the view data, so we might as well set it up:
    self.p_meta['samples'] = self.views[self.default_view]['header']

    # if we have an input FASTA file, we will set up the split_sequences and splits_basic_info dicts,
    # otherwise we will leave them empty
    self.splits_basic_info = {}
    self.split_sequences = None
    if self.p_meta['splits_fasta']:
        filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
        self.split_sequences = utils.get_FASTA_file_as_dictionary(self.p_meta['splits_fasta'])

        names_missing_in_FASTA = set(self.split_names_ordered) - set(self.split_sequences.keys())
        num_names_missing_in_FASTA = len(names_missing_in_FASTA)
        if num_names_missing_in_FASTA:
            raise ConfigError('Some of the names in your view data does not have corresponding entries in the '
                              'FASTA file you provided. Here is an example to one of those %d names that occur '
                              'in your data file, but not in the FASTA file: "%s"'
                              % (num_names_missing_in_FASTA, names_missing_in_FASTA.pop()))

        # setup a mock splits_basic_info dict
        for split_id in self.split_names_ordered:
            sequence = self.split_sequences[split_id]
            self.splits_basic_info[split_id] = {
                'length': len(sequence),
                'gc_content': utils.get_GC_content_for_sequence(sequence)
            }

    # create a new, empty profile database for ad hoc operations
    if not os.path.exists(self.profile_db_path):
        profile_db = ProfileDatabase(self.profile_db_path)
        profile_db.create({
            'db_type': 'profile',
            'merged': True,
            'contigs_db_hash': None,
            'samples': ','.join(self.p_meta['samples'])
        })

    # create an instance of states table
    self.states_table = TablesForStates(self.profile_db_path, anvio.__profile__version__)

    # also populate collections, if there are any
    self.collections.populate_sources_dict(self.profile_db_path, anvio.__profile__version__)

    # NOTE(review): self-assignment kept from the original; it is a no-op
    # unless `title` is a property with a setter -- confirm and drop.
    if self.title:
        self.title = self.title
def load_manual_mode(self, args):
    """Set up the interactive interface in manual mode from user-provided files.

    Requires `self.profile_db_path` and `self.tree`; `self.view_data_path`
    and `self.fasta_file` are optional. An already existing profile database
    is only accepted if it is not associated with a contigs database. Fills
    `self.p_meta`, loads the view data as the default view (or builds one
    from the leaf names of the tree), optionally loads split sequences from
    the FASTA file, creates an empty profile database if none exists, and
    sets up the states table and collections. `args` is not used here.

    Raises ConfigError for parameter combinations that are not allowed in
    manual mode, for a profile database that has a contigs db hash, for view
    data whose first column header is not 'contig', and for names that are
    missing from the FASTA file.
    """
    if self.contigs_db_path:
        raise ConfigError("When you want to use the interactive interface in manual mode, you must "
                          "not use a contigs database.")

    # this check must come before anything touches the path: handing an
    # empty path to os.path.exists would otherwise hide this message (and
    # can raise a TypeError when the path is None)
    if not self.profile_db_path:
        raise ConfigError("Even when you want to use the interactive interface in manual mode, you need "
                          "to declare a profile database. The profile database in this mode only used to "
                          "read or store the 'state' of the display for visualization purposes. You DO "
                          "NOT need to point to an already existing database, as anvi'o will generate "
                          "an empty one for your if there is no profile database.")

    # if the user is using an existing profile database, we need to make sure that it is not associated
    # with a contigs database, since it would mean that it is a full anvi'o profile database and should
    # not be included in manual operations.
    if os.path.exists(self.profile_db_path):
        profile_db = ProfileDatabase(self.profile_db_path)
        if profile_db.meta['contigs_db_hash']:
            raise ConfigError("Well. It seems the profile database is associated with a contigs database, "
                              "which means using it in manual mode is not the best way to use it. Probably "
                              "what you wanted to do is to let the manual mode create a new profile database "
                              "for you. Simply type in a new profile database path (it can be a file name "
                              "that doesn't exist).")

    if not self.tree:
        raise ConfigError("When you are running the interactive interface in manual mode, you must declare "
                          "at least the tree file. Please see the documentation for help.")

    if self.view:
        raise ConfigError("You can't use '--view' parameter when you are running the interactive interface "
                          "in manual mode")

    if self.show_views:
        raise ConfigError("Sorry, there are no views to show in manual mode :/")

    if self.show_states:
        raise ConfigError("Sorry, there are no states to show in manual mode :/")

    filesnpaths.is_file_exists(self.tree)
    tree = filesnpaths.is_proper_newick(self.tree)

    view_data_path = os.path.abspath(self.view_data_path) if self.view_data_path else None
    self.p_meta['splits_fasta'] = os.path.abspath(self.fasta_file) if self.fasta_file else None
    self.p_meta['output_dir'] = None
    self.p_meta['views'] = {}
    self.p_meta['merged'] = True
    self.p_meta['default_view'] = 'single'

    clustering_id = '%s:unknown:unknown' % filesnpaths.get_name_from_file_path(self.tree)
    self.p_meta['default_clustering'] = clustering_id
    self.p_meta['available_clusterings'] = [clustering_id]

    # the newick text is stored as a single line; read it through a context
    # manager so the file handle is closed
    with open(os.path.abspath(self.tree)) as tree_file:
        newick = ''.join([line.strip() for line in tree_file])
    self.p_meta['clusterings'] = {clustering_id: {'newick': newick}}

    self.default_view = self.p_meta['default_view']

    if self.view_data_path:
        # sanity of the view data
        filesnpaths.is_file_tab_delimited(view_data_path)
        view_data_columns = utils.get_columns_of_TAB_delim_file(view_data_path, include_first_column=True)
        if not view_data_columns[0] == "contig":
            raise ConfigError("The first row of the first column of the view data file must "
                              "say 'contig', which is not the case for your view data file "
                              "('%s'). Please make sure this is a properly formatted view data "
                              "file." % (view_data_path))

        # load view data as the default view:
        self.views[self.default_view] = {
            'header': view_data_columns[1:],
            'dict': utils.get_TAB_delimited_file_as_dictionary(view_data_path)
        }
    else:
        # no view data is provided... it is only the tree we have. we will create a mock 'view data dict'
        # here using what is in the tree.
        names_in_the_tree = [n.name for n in tree.get_leaves()]

        ad_hoc_dict = {}
        for item in names_in_the_tree:
            ad_hoc_dict[item] = {'names': item}

        self.views[self.default_view] = {
            'header': ['names'],
            'dict': ad_hoc_dict
        }

    self.split_names_ordered = self.views[self.default_view]['dict'].keys()

    # we assume that the sample names are the header of the view data, so we might as well set it up:
    self.p_meta['samples'] = self.views[self.default_view]['header']

    # if we have an input FASTA file, we will set up the split_sequences and splits_basic_info dicts,
    # otherwise we will leave them empty
    self.splits_basic_info = {}
    self.split_sequences = None
    if self.p_meta['splits_fasta']:
        filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
        self.split_sequences = utils.get_FASTA_file_as_dictionary(self.p_meta['splits_fasta'])

        names_missing_in_FASTA = set(self.split_names_ordered) - set(self.split_sequences.keys())
        num_names_missing_in_FASTA = len(names_missing_in_FASTA)
        if num_names_missing_in_FASTA:
            raise ConfigError('Some of the names in your view data does not have corresponding entries in the '
                              'FASTA file you provided. Here is an example to one of those %d names that occur '
                              'in your data file, but not in the FASTA file: "%s"'
                              % (num_names_missing_in_FASTA, names_missing_in_FASTA.pop()))

        # setup a mock splits_basic_info dict
        for split_id in self.split_names_ordered:
            sequence = self.split_sequences[split_id]
            self.splits_basic_info[split_id] = {
                'length': len(sequence),
                'gc_content': utils.get_GC_content_for_sequence(sequence)
            }

    # create a new, empty profile database for manual operations
    if not os.path.exists(self.profile_db_path):
        profile_db = ProfileDatabase(self.profile_db_path)
        profile_db.create({
            'db_type': 'profile',
            'merged': True,
            'contigs_db_hash': None,
            'samples': ','.join(self.p_meta['samples'])
        })

    # create an instance of states table
    self.states_table = TablesForStates(self.profile_db_path, anvio.__profile__version__)

    # also populate collections, if there are any
    self.collections.populate_collections_dict(self.profile_db_path, anvio.__profile__version__)

    # NOTE(review): self-assignment kept from the original; it is a no-op
    # unless `title` is a property with a setter -- confirm and drop.
    if self.title:
        self.title = self.title
def load_manual_mode(self, args):
    """Set up the interactive interface in manual mode from user-provided files.

    Requires `self.profile_db_path` and at least one of `self.tree` /
    `self.view_data_path`; `self.fasta_file` is optional. An already existing
    profile database is only accepted if it is not associated with a contigs
    database. Item names come from the tree when there is one, otherwise
    from the first column of the view data. Fills `self.p_meta` (including
    two basic orderings of the items and, if available, the tree), loads the
    default view, optionally loads split sequences from the FASTA file,
    creates an empty profile database if none exists, and sets up the states
    table, collections and description. `args` is not used here.

    Raises ConfigError for parameter combinations that are not allowed in
    manual mode, for a profile database that has a contigs db hash, and for
    names that are missing from the FASTA file.
    """
    if self.contigs_db_path:
        raise ConfigError("When you want to use the interactive interface in manual mode, you must "
                          "not use a contigs database.")

    if not self.profile_db_path:
        raise ConfigError("Even when you want to use the interactive interface in manual mode, you need "
                          "to provide a profile database path. But you DO NOT need an already existing "
                          "profile database, since anvi'o will generate an empty one for you. The profile "
                          "database in this mode only used to read or store the 'state' of the display "
                          "for visualization purposes, or to allow you to create and store collections.")

    # if the user is using an existing profile database, we need to make sure that it is not associated
    # with a contigs database, since it would mean that it is a full anvi'o profile database and should
    # not be included in manual operations.
    if filesnpaths.is_file_exists(self.profile_db_path, dont_raise=True):
        profile_db = ProfileDatabase(self.profile_db_path)
        if profile_db.meta['contigs_db_hash']:
            raise ConfigError("Well. It seems the profile database is associated with a contigs database, "
                              "which means using it in manual mode is not the best way to use it. Probably "
                              "what you wanted to do is to let the manual mode create a new profile database "
                              "for you. Simply type in a new profile database path (it can be a file name "
                              "that doesn't exist).")

    if not self.tree and not self.view_data_path:
        raise ConfigError("You must be joking Mr. Feynman. No tree file, and no data file? What is it that "
                          "anvi'o supposed to visualize? :(")

    if not self.tree:
        self.run.warning("You haven't declared a tree file. Anvi'o will do its best to come up with an "
                         "organization of your items.")

    if self.view:
        raise ConfigError("You can't use '--view' parameter when you are running the interactive interface "
                          "in manual mode")

    if self.show_views:
        raise ConfigError("Sorry, there are no views to show in manual mode :/")

    if self.show_states:
        raise ConfigError("Sorry, there are no states to show in manual mode :/")

    if self.tree:
        filesnpaths.is_file_exists(self.tree)
        # the newick text is stored as a single line; read it through a
        # context manager so the file handle is closed
        with open(os.path.abspath(self.tree)) as tree_file:
            newick_tree_text = ''.join([line.strip() for line in tree_file])
        item_names = utils.get_names_order_from_newick_tree(newick_tree_text)
    else:
        item_names = utils.get_column_data_from_TAB_delim_file(self.view_data_path, column_indices=[0])[0][1:]

    # try to convert item names into integer values for proper sorting later. it's OK if it does
    # not work. only the errors int() raises for non-numeric input are ignored.
    try:
        item_names = [int(n) for n in item_names]
    except (TypeError, ValueError):
        pass

    view_data_path = os.path.abspath(self.view_data_path) if self.view_data_path else None
    self.p_meta['splits_fasta'] = os.path.abspath(self.fasta_file) if self.fasta_file else None
    self.p_meta['output_dir'] = None
    self.p_meta['views'] = {}
    self.p_meta['merged'] = True
    self.p_meta['default_view'] = 'single'
    self.default_view = self.p_meta['default_view']

    # set some default organizations of data:
    # NOTE(review): the '(reverse)' label is paired with the ascending sort
    # and vice versa; kept as in the original -- confirm this is intended.
    self.p_meta['clusterings'] = {
        'Alphabetical_(reverse):none:none': {'basic': sorted(item_names)},
        'Alphabetical:none:none': {'basic': sorted(item_names, reverse=True)}
    }
    self.p_meta['available_clusterings'] = ['Alphabetical_(reverse):none:none', 'Alphabetical:none:none']
    self.p_meta['default_clustering'] = self.p_meta['available_clusterings'][0]

    # if we have a tree, let's make arrangements for it:
    if self.tree:
        clustering_id = '%s:unknown:unknown' % filesnpaths.get_name_from_file_path(self.tree)
        self.p_meta['default_clustering'] = clustering_id
        self.p_meta['available_clusterings'].append(clustering_id)
        self.p_meta['clusterings'][clustering_id] = {'newick': newick_tree_text}

    if self.view_data_path:
        # sanity of the view data
        filesnpaths.is_file_tab_delimited(view_data_path)
        view_data_columns = utils.get_columns_of_TAB_delim_file(view_data_path, include_first_column=True)

        # load view data as the default view:
        self.views[self.default_view] = {
            'header': view_data_columns[1:],
            'dict': utils.get_TAB_delimited_file_as_dictionary(view_data_path)
        }
    else:
        # no view data is provided... it is only the tree we have. we will create a mock 'view data dict'
        # here using what is in the tree.
        ad_hoc_dict = {}
        for item in item_names:
            ad_hoc_dict[item] = {'names': item}

        self.views[self.default_view] = {
            'header': ['names'],
            'dict': ad_hoc_dict
        }

    self.displayed_item_names_ordered = list(self.views[self.default_view]['dict'].keys())

    # we assume that the sample names are the header of the view data, so we might as well set it up:
    self.p_meta['samples'] = self.views[self.default_view]['header']

    # if we have an input FASTA file, we will set up the split_sequences and splits_basic_info dicts,
    # otherwise we will leave them empty
    self.splits_basic_info = {}
    self.split_sequences = None
    if self.p_meta['splits_fasta']:
        filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
        self.split_sequences = utils.get_FASTA_file_as_dictionary(self.p_meta['splits_fasta'])

        names_missing_in_FASTA = set(self.displayed_item_names_ordered) - set(self.split_sequences.keys())
        num_names_missing_in_FASTA = len(names_missing_in_FASTA)
        if num_names_missing_in_FASTA:
            raise ConfigError('Some of the names in your view data does not have corresponding entries in the '
                              'FASTA file you provided. Here is an example to one of those %d names that occur '
                              'in your data file, but not in the FASTA file: "%s"'
                              % (num_names_missing_in_FASTA, names_missing_in_FASTA.pop()))

        # setup a mock splits_basic_info dict
        for split_id in self.displayed_item_names_ordered:
            sequence = self.split_sequences[split_id]
            self.splits_basic_info[split_id] = {
                'length': len(sequence),
                'gc_content': utils.get_GC_content_for_sequence(sequence)
            }

    # create a new, empty profile database for manual operations
    if not os.path.exists(self.profile_db_path):
        profile_db = ProfileDatabase(self.profile_db_path)
        profile_db.create({
            'db_type': 'profile',
            'merged': True,
            'contigs_db_hash': None,
            'samples': ','.join(self.p_meta['samples'])
        })

    # create an instance of states table
    self.states_table = TablesForStates(self.profile_db_path)

    # also populate collections, if there are any
    self.collections.populate_collections_dict(self.profile_db_path)

    # read description from self table, if it is not available get_description function will return placeholder text
    self.p_meta['description'] = get_description_in_db(self.profile_db_path)

    # NOTE(review): self-assignment kept from the original; it is a no-op
    # unless `title` is a property with a setter -- confirm and drop.
    if self.title:
        self.title = self.title