def merge_split_summaries(self):
    """Merge per-sample split summaries into one summary file per split.

    For every split, reads that split's summary from each input run and
    combines them into a single serialized object under <output>/SUMMARY.
    Also writes a serialized index mapping split names to merged summary
    paths (with the output-directory prefix stripped).

    Returns:
        (summary_dir, merged_summary_index_path) tuple of paths.
    """
    merged_summary_index = {}
    merged_summary_index_path = os.path.join(self.output_directory, 'SUMMARY.cp')
    summary_dir = filesnpaths.gen_output_directory(os.path.join(self.output_directory, 'SUMMARY'), delete_if_exists = True)

    # read all index files per run into a dict here, so the access is easier from within
    # the for loop below
    run_sum_indices = {}
    for runinfo in self.input_runinfo_dicts.values():
        run_sum_indices[runinfo['sample_id']] = dictio.read_serialized_object(runinfo['profile_summary_index'])

    for i in range(0, len(self.split_names)):
        self.progress.update('merging summaries for splits %s of %s' % (i + 1, len(self.split_names)))
        split_name = self.split_names[i]

        # collect this split's summary from every run, keyed by sample id
        merged_summary = {}
        for runinfo in self.input_runinfo_dicts.values():
            run_split_summary = dictio.read_serialized_object(os.path.join(runinfo['input_dir'], run_sum_indices[runinfo['sample_id']][split_name]))
            merged_summary[runinfo['sample_id']] = run_split_summary[runinfo['sample_id']]

        # NOTE(review): 'runinfo' here is the leaked variable from the inner loop
        # above, i.e. whichever run happened to come last. This presumably works
        # because the summary file basename for a given split is identical across
        # runs — confirm against how the per-run indices are generated.
        merged_split_summary_path = os.path.join(summary_dir, os.path.basename(run_sum_indices[runinfo['sample_id']][split_name]))
        dictio.write_serialized_object(merged_summary, merged_split_summary_path)
        merged_summary_index[split_name] = merged_split_summary_path

    self.progress.update('Serializing merged split summary index ...')
    # paths are stored relative to the output directory so the run dir is relocatable
    dictio.write_serialized_object(dictio.strip_prefix_from_dict_values(merged_summary_index, self.output_directory),\
                                   merged_summary_index_path)

    return summary_dir, merged_summary_index_path
def list_contigs(self): import signal signal.signal(signal.SIGPIPE, signal.SIG_DFL) if self.input_file_path: self.progress.new('Init') self.progress.update('Reading BAM File') self.bam = pysam.Samfile(self.input_file_path, 'rb') self.progress.end() self.contig_names = self.bam.references self.contig_lengths = self.bam.lengths utils.check_contig_names(self.contig_names) for tpl in sorted(zip(self.contig_lengths, self.contig_names), reverse=True): print '%-40s %s' % (tpl[1], pp(int(tpl[0]))) else: self.progress.new('Init') self.progress.update('Reading serialized profile') self.contigs = dictio.read_serialized_object( self.serialized_profile_path) self.progress.end() self.run.info('profile_loaded_from', self.serialized_profile_path) self.run.info('num_contigs', pp(len(self.contigs))) for tpl in sorted([(int(self.contigs[contig].length), contig) for contig in self.contigs]): print '%-40s %s' % (tpl[1], pp(int(tpl[0])))
def init_p_id_to_cog_id_dict(self):
    """Load the NCBI protein-id -> COG-id mapping from its serialized file."""
    self.progress.new('Initializing COGs Data')
    self.progress.update('Reading NCBI Protein ID to COG id converter ...')
    converter_path = self.essential_files['PID-TO-CID.cPickle']
    self.p_id_to_cog_id = dictio.read_serialized_object(converter_path)
    self.progress.end()
def init(self):
    """Load COG functions, COG categories, and missing COG IDs.

    Populates self.cogs, self.categories and self.missing_cogs from the
    essential data files, normalizes the loaded structures, and marks the
    object initialized.
    """
    self.progress.new('Initializing COGs Data')

    self.progress.update('Reading COG functions ...')
    self.cogs = utils.get_TAB_delimited_file_as_dictionary(self.essential_files['COG.txt'],
                                                           no_header=True,
                                                           column_names=['COG', 'categories', 'annotation'])

    self.progress.update('Reading COG categories ...')
    self.categories = utils.get_TAB_delimited_file_as_dictionary(self.essential_files['CATEGORIES.txt'],
                                                                 no_header=True,
                                                                 column_names=['category', 'description'])

    self.progress.update('Reading missing COG IDs ...')
    self.missing_cogs = dictio.read_serialized_object(self.essential_files['MISSING_COG_IDs.cPickle'])

    self.progress.end()

    # turn each COG's comma-separated category string into a clean list
    for cog_id in self.cogs:
        self.cogs[cog_id]['categories'] = [c.strip() for c in self.cogs[cog_id]['categories'].split(',')]

    # flatten each category record down to its description string
    for category in self.categories:
        self.categories[category] = self.categories[category]['description']

    self.initialized = True
def list_contigs(self): import signal signal.signal(signal.SIGPIPE, signal.SIG_DFL) if self.input_file_path: self.progress.new('Init') self.progress.update('Reading BAM File') self.bam = pysam.Samfile(self.input_file_path, 'rb') self.progress.end() self.contig_names = self.bam.references self.contig_lenghts = self.bam.lengths utils.check_contig_names(self.contig_names) for tpl in sorted(zip(self.contig_lenghts, self.contig_names), reverse = True): print '%-40s %s' % (tpl[1], pp(int(tpl[0]))) else: self.progress.new('Init') self.progress.update('Reading serialized profile') self.contigs = dictio.read_serialized_object(self.serialized_profile_path) self.progress.end() self.run.info('profile_loaded_from', self.serialized_profile_path) self.run.info('num_contigs', pp(len(self.contigs))) for tpl in sorted([(int(self.contigs[contig].length), contig) for contig in self.contigs]): print '%-40s %s' % (tpl[1], pp(int(tpl[0])))
def init_serialized_profile(self):
    """Load contigs from a serialized profile and filter them.

    Restricts to self.contig_names_of_interest when given, then discards
    contigs shorter than self.min_contig_length, running sanity checks
    after each stage.
    """
    self.progress.new('Init')
    self.progress.update('Reading serialized profile')
    self.contigs = dictio.read_serialized_object(self.serialized_profile_path)
    self.progress.end()

    self.run.info('profile_loaded_from', self.serialized_profile_path)
    self.run.info('num_contigs', pp(len(self.contigs)))

    if self.contig_names_of_interest:
        # drop every contig the user did not ask for
        unwanted = set([c for c in self.contigs if c not in self.contig_names_of_interest])
        if len(unwanted):
            for c in unwanted:
                self.contigs.pop(c)
        self.run.info('num_contigs_selected_for_analysis', pp(len(self.contigs)))

    self.check_contigs()

    # it brings good karma to let the user know what the hell is wrong with their data:
    self.check_contigs_without_any_gene_calls(self.contigs.keys())

    # drop contigs that fall below the minimum length cutoff
    too_short = set([c.name for c in self.contigs.values() if c.length < self.min_contig_length])
    if len(too_short):
        for c in too_short:
            self.contigs.pop(c)
    self.run.info('contigs_raw_longer_than_M', len(self.contigs))

    self.check_contigs()
def init(self):
    """Load COG functions, categories, and missing COG IDs for the
    configured COG release.

    Raises:
        ConfigError: if self.COG_version is not a known release.
    """
    self.progress.new('Initializing COGs Data')
    self.progress.update('Reading COG functions ...')

    # the column layout of COG.txt depends on the COG release
    if self.COG_version == 'COG14':
        cog_columns = ['COG', 'categories', 'annotation']
    elif self.COG_version == 'COG20':
        cog_columns = ['COG', 'categories', 'annotation', 'pathway']
    else:
        raise ConfigError("You need to edit all the if/else statements with COG version checks to ensure proper "
                          "parsing of a new generation of COG files.")

    self.cogs = utils.get_TAB_delimited_file_as_dictionary(self.essential_files['COG.txt'],
                                                           no_header=True,
                                                           column_names=cog_columns)

    self.progress.update('Reading COG categories ...')
    self.categories = utils.get_TAB_delimited_file_as_dictionary(self.essential_files['CATEGORIES.txt'],
                                                                 no_header=True,
                                                                 column_names=['category', 'description'])

    self.progress.update('Reading missing COG IDs ...')
    self.missing_cogs = dictio.read_serialized_object(self.essential_files['MISSING_COG_IDs.cPickle'])

    self.progress.end()

    # turn each COG's comma-separated category string into a clean list
    for cog_id in self.cogs:
        self.cogs[cog_id]['categories'] = [c.strip() for c in self.cogs[cog_id]['categories'].split(',')]

    # flatten each category record down to its description string
    for category in self.categories:
        self.categories[category] = self.categories[category]['description']

    self.initialized = True
def load_from_files(self, args):
    """Initialize the interactive view directly from raw files (no RUNINFO).

    Requires fasta (-f), metadata (-m), tree (-t) and output dir (-o);
    builds a single 'default' view from the metadata file, loads split
    sequences, and fabricates a minimal splits_basic_info dict.

    Raises:
        ConfigError: when a required parameter is missing, when RUNINFO-only
            options (-v, --show-views) are used, or when the metadata file's
            first column header is not 'contig'.
    """
    if (not self.fasta_file) or (not self.metadata) or (not self.tree) or (not self.output_dir):
        raise ConfigError, "If you do not have a RUNINFO dict, you must declare each of\
                            '-f', '-m', '-t' and '-o' parameters. Please see '--help' for\
                            more detailed information on them."

    if self.view:
        raise ConfigError, "You can't use '-v' parameter when this program is not called with a RUNINFO.cp"

    if self.show_views:
        raise ConfigError, "Sorry, there are no views to show when there is no RUNINFO.cp :/"

    metadata_path = os.path.abspath(self.metadata)
    self.p_meta['splits_fasta'] = os.path.abspath(self.fasta_file)
    self.p_meta['output_dir'] = os.path.abspath(self.output_dir)
    self.p_meta['views'] = {}
    self.p_meta['default_view'] = 'single'
    self.p_meta['default_clustering'] = 'default'
    self.p_meta['available_clusterings'] = ['default']
    self.p_meta['clusterings'] = {'default': {'newick': open(os.path.abspath(self.tree)).read()}}
    self.default_view = self.p_meta['default_view']

    if self.summary_index:
        self.p_meta['profile_summary_index'] = os.path.abspath(self.summary_index)
        self.splits_summary_index = dictio.read_serialized_object(self.p_meta['profile_summary_index'])

    # sanity of the metadata
    filesnpaths.is_file_tab_delimited(metadata_path)
    metadata_columns = utils.get_columns_of_TAB_delim_file(metadata_path, include_first_column=True)
    if not metadata_columns[0] == "contig":
        raise ConfigError, "The first row of the first column of the metadata file must\
                            say 'contig', which is not the case for your metadata file\
                            ('%s'). Please make sure this is a properly formatted metadata\
                            file." % (metadata_path)

    # store metadata as view:
    self.views[self.default_view] = {'header': metadata_columns[1:],
                                     'dict': utils.get_TAB_delimited_file_as_dictionary(metadata_path)}
    self.split_names_ordered = self.views[self.default_view]['dict'].keys()

    filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
    self.split_sequences = utils.get_FASTA_file_as_dictionary(self.p_meta['splits_fasta'])

    # setup a mock splits_basic_info dict
    self.splits_basic_info = {}
    for split_id in self.split_names_ordered:
        self.splits_basic_info[split_id] = {'length': len(self.split_sequences[split_id]),
                                            'gc_content': utils.get_GC_content_for_sequence(self.split_sequences[split_id])}

    # reminder: this is being stored in the output dir provided as a commandline parameter:
    self.p_meta['self_path'] = os.path.join(self.p_meta['output_dir'], 'RUNINFO.cp')

    # NOTE(review): this assignment is a no-op; presumably a '(%s)' view suffix
    # (as done elsewhere when loading from a profile database) was intended —
    # confirm against the sibling loader before changing it
    if self.title:
        self.title = self.title

    filesnpaths.gen_output_directory(self.p_meta['output_dir'])
def read_runinfo_dict(self, path):
    """Deserialize a runinfo dict, register its sample, and resolve its paths.

    Records the sample id in self.merged_sample_ids (once) and the dict in
    self.input_runinfo_dicts, then stores absolute 'input_dir' and
    'profile_db' paths on the dict itself.

    Returns:
        (sample_id, runinfo) tuple.
    """
    runinfo = dictio.read_serialized_object(path)
    sample_id = runinfo['sample_id']

    if sample_id not in self.merged_sample_ids:
        self.merged_sample_ids.append(sample_id)
    self.input_runinfo_dicts[sample_id] = runinfo

    # resolve paths relative to where the runinfo file itself lives
    input_dir = os.path.dirname(os.path.abspath(path))
    runinfo['input_dir'] = input_dir
    runinfo['profile_db'] = os.path.join(input_dir, 'PROFILE.db')

    return sample_id, runinfo
def migrate(db_path):
    """Upgrade a version-14 profile database (and its runinfo file, if any)
    to version 15 by introducing the 'blank' meta value.

    Raises:
        ConfigError: if db_path is None or the database is not at version 14.
    """
    if db_path is None:
        raise ConfigError("No profile database is given.")

    # make sure someone is not being funny
    utils.is_profile_db(db_path)

    # make sure the version is 14
    profile_db = db.DB(db_path, None, ignore_version=True)
    if str(profile_db.get_version()) != '14':
        raise ConfigError(
            "Version of this profile database is not 14 (hence, this script cannot really do anything)."
        )

    is_merged = profile_db.get_meta_value('merged')

    # fix: '%' binds tighter than the conditional expression, so without the
    # parentheses the non-merged message collapsed to the bare string 'single'
    progress.new("Trying to upgrade the %s profile database" % ('merged' if is_merged else 'single'))

    # update the runinfo.cp
    input_dir = os.path.dirname(os.path.abspath(db_path))
    P = lambda x: os.path.join(input_dir, x)
    E = lambda x: os.path.exists(x)

    # fix: prefer RUNINFO.mcp when present, but fall back to a previously
    # found RUNINFO.cp instead of unconditionally clobbering it with None
    runinfo_path = P('RUNINFO.cp') if E(P('RUNINFO.cp')) else None
    runinfo_path = P('RUNINFO.mcp') if E(P('RUNINFO.mcp')) else runinfo_path

    if runinfo_path:
        runinfo = dictio.read_serialized_object(runinfo_path)
        if 'blank' not in runinfo:
            runinfo['blank'] = False
            dictio.write_serialized_object(runinfo, runinfo_path)

    # add the new value
    profile_db.set_meta_value('blank', False)

    # set the version
    profile_db.remove_meta_key_value_pair('version')
    profile_db.set_version('15')

    # bye
    profile_db.disconnect()

    progress.end()
    run.info_single("Database successfully upgraded to version 15!", nl_after=1, nl_before=1, mc='green')
def migrate(db_path):
    """Upgrade a version-14 profile database (and its runinfo file, if any)
    to version 15 by introducing the 'blank' meta value.

    Raises:
        ConfigError: if db_path is None or the database is not at version 14.
    """
    if db_path is None:
        raise ConfigError("No profile database is given.")

    # make sure someone is not being funny
    utils.is_profile_db(db_path)

    # make sure the version is 14
    profile_db = db.DB(db_path, None, ignore_version = True)
    if str(profile_db.get_version()) != '14':
        raise ConfigError("Version of this profile database is not 14 (hence, this script cannot really do anything).")

    is_merged = profile_db.get_meta_value('merged')

    # fix: '%' binds tighter than the conditional expression, so without the
    # parentheses the non-merged message collapsed to the bare string 'single'
    progress.new("Trying to upgrade the %s profile database" % ('merged' if is_merged else 'single'))

    # update the runinfo.cp
    input_dir = os.path.dirname(os.path.abspath(db_path))
    P = lambda x: os.path.join(input_dir, x)
    E = lambda x: os.path.exists(x)

    # fix: prefer RUNINFO.mcp when present, but fall back to a previously
    # found RUNINFO.cp instead of unconditionally clobbering it with None
    runinfo_path = P('RUNINFO.cp') if E(P('RUNINFO.cp')) else None
    runinfo_path = P('RUNINFO.mcp') if E(P('RUNINFO.mcp')) else runinfo_path

    if runinfo_path:
        runinfo = dictio.read_serialized_object(runinfo_path)
        if 'blank' not in runinfo:
            runinfo['blank'] = False
            dictio.write_serialized_object(runinfo, runinfo_path)

    # add the new value
    profile_db.set_meta_value('blank', False)

    # set the version
    profile_db.remove_meta_key_value_pair('version')
    profile_db.set_version('15')

    # bye
    profile_db.disconnect()

    progress.end()
    run.info_single("Database successfully upgraded to version 15!", nl_after=1, nl_before=1, mc='green')
def init(self):
    """Load COG functions, COG categories, and the set of missing COG IDs.

    Populates self.cogs, self.categories and self.missing_cogs from the
    essential data files, normalizes the loaded structures, and flags the
    object as initialized.
    """
    self.progress.new('Initializing COGs Data')

    self.progress.update('Reading COG functions ...')
    cog_functions_path = self.essential_files['COG.txt']
    self.cogs = utils.get_TAB_delimited_file_as_dictionary(cog_functions_path, no_header=True, column_names=['COG', 'categories', 'annotation'])

    self.progress.update('Reading COG categories ...')
    categories_path = self.essential_files['CATEGORIES.txt']
    self.categories = utils.get_TAB_delimited_file_as_dictionary(categories_path, no_header=True, column_names=['category', 'description'])

    self.progress.update('Reading missing COG IDs ...')
    self.missing_cogs = dictio.read_serialized_object(self.essential_files['MISSING_COG_IDs.cPickle'])

    self.progress.end()

    # each COG's category field arrives as a comma-separated string; split it
    for entry in self.cogs:
        raw_categories = self.cogs[entry]['categories']
        self.cogs[entry]['categories'] = [token.strip() for token in raw_categories.split(',')]

    # reduce each category record to its plain description string
    for entry in self.categories:
        self.categories[entry] = self.categories[entry]['description']

    self.initialized = True
def init_serialized_profile(self):
    """Load contigs from a serialized profile and filter them.

    Restricts to self.contig_names_of_interest when given, then discards
    contigs shorter than self.min_contig_length, running sanity checks
    after each stage.
    """
    self.progress.new('Init')
    self.progress.update('Reading serialized profile')
    self.contigs = dictio.read_serialized_object(self.serialized_profile_path)
    self.progress.end()

    self.run.info('profile_loaded_from', self.serialized_profile_path)
    self.run.info('num_contigs', pp(len(self.contigs)))

    if self.contig_names_of_interest:
        # keep only the contigs the user asked for
        names_to_drop = set()
        for name in self.contigs:
            if name not in self.contig_names_of_interest:
                names_to_drop.add(name)

        if len(names_to_drop):
            for name in names_to_drop:
                self.contigs.pop(name)

        self.run.info('num_contigs_selected_for_analysis', pp(len(self.contigs)))

    self.check_contigs()

    # it brings good karma to let the user know what the hell is wrong with their data:
    self.check_contigs_without_any_gene_calls(self.contigs.keys())

    # discard contigs below the minimum length cutoff
    short_ones = set()
    for contig_obj in self.contigs.values():
        if contig_obj.length < self.min_contig_length:
            short_ones.add(contig_obj.name)

    if len(short_ones):
        for name in short_ones:
            self.contigs.pop(name)

    self.run.info('contigs_raw_longer_than_M', len(self.contigs))
    self.check_contigs()
def load_from_profile_database(self, args): if self.p_meta['version'] != anvio.__profile__version__: raise ConfigError, "The profile database has a version number that differs from the version that is valid\ for this codebase (the profile database is at '%s', and the codebase is at '%s'). Very\ unfortunately, you need to re-profile and re-merge this project using the current anvi'o :(" self.p_meta['self_path'] = self.profile_db_path self.p_meta['output_dir'] = os.path.join(os.getcwd(), os.path.dirname(self.profile_db_path)) # create an instance of states table self.states_table = TablesForStates(self.profile_db_path, anvio.__profile__version__) # load views from the profile database self.load_views() self.default_view = self.p_meta['default_view'] # if the user wants to see available views, show them and exit. if self.show_views: run.warning('', header = 'Available views (%d)' % len(self.views), lc = 'green') for view in self.views: run.info(view, 'Via "%s" table' % self.views[view]['table_name'], lc='crimson', mc='green' if view == self.default_view else 'crimson') print sys.exit() # if the user has an additional view data, load it up into the self.views dict. if self.additional_view_path: filesnpaths.is_file_tab_delimited(self.additional_view_path) additional_view_columns = utils.get_columns_of_TAB_delim_file(self.additional_view_path) if not additional_view_columns[-1] == '__parent__': raise ConfigError, "The last column of the additional view must be '__parent__' with the proper\ parent information for each split." column_mapping = [str] + [float] * (len(additional_view_columns) - 1) + [str] self.views['user_view'] = {'table_name': 'NA', 'header': additional_view_columns, 'dict': utils.get_TAB_delimited_file_as_dictionary(self.additional_view_path, column_mapping = column_mapping)} # if the user specifies a view, set it as default: if self.view: if not self.view in self.views: raise ConfigError, "The requested view ('%s') is not available for this run. 
Please see\ available views by running this program with --show-views flag." % self.view self.default_view = self.view self.p_meta['clusterings'] = self.clusterings if self.tree: entry_id = os.path.basename(self.tree).split('.')[0] if not self.p_meta['clusterings']: self.p_meta['default_clustering'] = entry_id self.p_meta['available_clusterings'] = [entry_id] self.p_meta['clusterings'] = {entry_id: {'newick': open(os.path.abspath(self.tree)).read()}} run.info('Additional Tree', "Splits will be organized based on '%s'." % entry_id) else: self.p_meta['clusterings'][entry_id] = {'newick': open(os.path.abspath(self.tree)).read()} run.info('Additional Tree', "'%s' has been added to available trees." % entry_id) # is summary being overwritten? if self.summary_index: run.info('Warning', "The default summary index in RUNINFO is being overriden by '%s'." % self.summary_index) self.p_meta['profile_summary_index'] = os.path.abspath(self.summary_index) if os.path.exists(self.P('SUMMARY.cp')): self.splits_summary_index = dictio.read_serialized_object(self.P('SUMMARY.cp')) else: self.splits_summary_index = None run.warning("SUMMARY.cp is missing for your run. Anvi'o will continue working (well, at least\ it will attempt to do it), but things may behave badly with the absence of\ SUMMARY.cp (first and foremost, you will not be able to inspect individual\ contigs through any of the interactive interfaces). Please investigate it\ if you were not expecting this.") # set title if self.title: self.title = self.title + ' (%s)' % self.default_view else: self.title = self.p_meta['sample_id'] + ' (%s)' % self.default_view