def init(self):
    """Initialize the refinement session.

    Resolves the bins and their split names from the collection described by
    self.args, records them on the instance, and reads all collections from
    the profile database so refined selections can later be stored back.
    """
    self.progress.new('Initializing')
    self.progress.update('Getting split names')

    d = ccollections.GetSplitNamesInBins(self.args).get_dict()
    # cast to a list explicitly: under Python 3 `d.keys()` is a live view tied
    # to `d`, not an independent list (the sibling `init` elsewhere in this
    # file already does `list(d.keys())` -- this makes the two consistent)
    self.bins = list(d.keys())

    for split_names in d.values():
        self.split_names_of_interest.update(split_names)

    self.progress.end()

    # if the user updates the refinement of a single bin or bins, there shouldn't be multiple copies
    # of that stored in the database. so everytime 'store_refined_bins' function is called,
    # it will check this variable and, (1) if empty, continue updating stuff in db store updates
    # in it, (2) if not empty, remove items stored in this variable from collections dict, and continue
    # with step (1). the starting point is of course self.bins. when the store_refined_bins function is
    # called the first time, it will read collection data for collection_name, and remove the bin(s) in
    # analysis from it before it stores the data:
    self.ids_for_already_refined_bins = self.bins

    self.input_directory = os.path.dirname(os.path.abspath(self.profile_db_path))

    self.run.info('Input directory', self.input_directory)
    self.run.info('Collection ID', self.collection_name)
    self.run.info('Number of bins', len(self.bins))
    self.run.info('Number of splits', len(self.split_names_of_interest))

    self.collections = ccollections.Collections()
    self.collections.populate_collections_dict(self.profile_db_path)
def init(self):
    """Sanity-check the inputs, then resolve bins and split names of interest.

    Reads the bin -> split-names mapping for the collection in self.args and
    populates self.bins and self.split_names_of_interest.
    """
    self.sanity_check()

    self.run.info('Input BAM file(s)', ', '.join([os.path.basename(f) for f in self.input_bam_files]))

    d = ccollections.GetSplitNamesInBins(self.args).get_dict()
    # cast to a list explicitly: under Python 3 `d.keys()` is a view object,
    # not a list (matches the `list(d.keys())` usage elsewhere in this file)
    self.bins = list(d.keys())

    for split_names in d.values():
        self.split_names_of_interest.update(split_names)

    self.run.info('Collection ID', self.collection_id)
    self.run.info('Bin(s)', ', '.join(self.bins))
    self.run.info('Number of splits', pp(len(self.split_names_of_interest)))
def get_split_names_of_interest_for_internal_genome(self, entry):
    """Return the list of split names for the internal genome described by `entry`.

    `entry` is expected to carry 'profile_db_path', 'collection_id' and 'bin_id'
    keys; raises ConfigError if the bin turns out to contain no splits.
    """
    utils.is_profile_db(entry['profile_db_path'])

    # assemble a minimal args object for GetSplitNamesInBins:
    args = argparse.Namespace(profile_db=entry['profile_db_path'],
                              collection_name=entry['collection_id'],
                              bin_id=entry['bin_id'])

    split_names_of_interest = list(ccollections.GetSplitNamesInBins(args).get_split_names_only())

    if not split_names_of_interest:
        raise ConfigError("There are 0 splits defined for bin id %s in collection %s..." % (entry['bin_id'], entry['collection_id']))

    return split_names_of_interest
def init(self):
    """Validate the contigs database, then learn bins and splits from the collection.

    Populates self.bins and self.split_names_of_interest, reports the inputs
    through self.run, and marks the instance as initialized.
    """
    utils.is_contigs_db(self.contigs_db_path)

    bam_file_names = [os.path.basename(f) for f in self.input_bam_files]
    self.run.info('Input BAM file(s)', ', '.join(bam_file_names))

    split_names_per_bin = ccollections.GetSplitNamesInBins(self.args).get_dict()
    self.bins = list(split_names_per_bin.keys())

    for names in list(split_names_per_bin.values()):
        self.split_names_of_interest.update(names)

    self.run.info('Collection ID', self.collection_name)
    self.run.info('Bin(s)', ', '.join(self.bins))
    self.run.info('Number of splits', pp(len(self.split_names_of_interest)))

    self.initialized = True
def init_commons(self):
    """Common initialization for variability reporting.

    Validates output/input paths and databases, figures out the splits of
    interest (either from a collection/bin pair or from an explicit list of
    split names), and reads the variability data (NT or AA engine) from the
    profile database into self.data.

    Raises ConfigError for any missing or inconsistent input.
    """
    # NOTE: the original code used Python 2 `raise ConfigError, "..."` statement
    # syntax throughout this function, which is a SyntaxError under Python 3
    # (the rest of this file is Python 3). All raises below use call syntax.
    self.progress.new('Init')

    self.progress.update('Checking the output file path ..')
    if self.output_file_path:
        filesnpaths.is_output_file_writable(self.output_file_path)

    self.progress.update('Checking the samples of interest ..')
    if self.samples_of_interest_path:
        filesnpaths.is_file_exists(self.samples_of_interest_path)
        self.samples_of_interest = set([s.strip() for s in open(self.samples_of_interest_path).readlines()])
    else:
        self.samples_of_interest = set([])

    self.progress.update('Making sure our databases are here ..')
    if not self.profile_db_path:
        raise ConfigError('You need to provide a profile database.')

    if not self.contigs_db_path:
        raise ConfigError('You need to provide a contigs database.')

    self.progress.update('Making sure our databases are compatible ..')
    dbops.is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

    if self.min_coverage_in_each_sample and not self.quince_mode:
        self.progress.end()
        raise ConfigError("When you sepecify a coverage value through --min-coverage-in-each-sample, you must also\
                           use --quince-mode flag, since the former parameter needs to know the coverage values in all\
                           samples even if variation is reported for only one sample among otheres. This is the only way\
                           to figure out whether variation is not reported for other samples due to low or zero coverage,\
                           or there was no variation to report despite the high coverage. Anvi'o could turn --quince-mode\
                           flat automatically for you, but then it is much better if you have full control and understaning\
                           of what is going on.")

    if self.quince_mode:
        self.progress.update('Accessing auxiliary data file ...')
        auxiliary_data_file_path = os.path.join(os.path.dirname(self.profile_db_path), 'AUXILIARY-DATA.h5')
        if not os.path.exists(auxiliary_data_file_path):
            raise ConfigError("Anvi'o needs the auxiliary data file to run this program with '--quince-mode' flag.\
                               However it wasn't found at '%s' :/" % auxiliary_data_file_path)

        self.merged_split_coverage_values = auxiliarydataops.AuxiliaryDataForSplitCoverages(auxiliary_data_file_path, None, ignore_hash=True)

    self.progress.update('Attempting to get our splits of interest sorted ..')
    if self.collection_name:
        # the user wants to go with the collection id path. fine. we will get our split names from
        # the profile database.
        if not self.bin_id:
            self.progress.end()
            raise ConfigError('When you declare a collection id, you must also declare a bin name\
                               (from which the split names of interest will be acquired)')
        if self.splits_of_interest or self.splits_of_interest_path:
            self.progress.end()
            raise ConfigError("You declared a collection id and one or more bin names so anvi'o can find out\
                               splits of interest, but you also have specified informaiton for split names?\
                               This is confusing. You should choose one way or another :/")

        self.splits_of_interest = ccollections.GetSplitNamesInBins(self.args).get_split_names_only()
    else:
        # OK. no collection id. we will go oldschool. we whope to find what we are looking for in
        # self.splits_of_interst_path at this point (which may have been filled through the command
        # line client), or in self.splits_of_interest (which may have been filled in by another program)
        if not self.splits_of_interest:
            if not self.splits_of_interest_path:
                self.progress.end()
                raise ConfigError('You did not declare a source for split names. You either should give me\
                                   a file with split names you are interested in, or a collection id and\
                                   bin name so I can learn split names from the profile database.')

            filesnpaths.is_file_exists(self.splits_of_interest_path)
            self.splits_of_interest = set([c.strip().replace('\r', '') for c in open(self.splits_of_interest_path).readlines()])

    # despite its name, this holds the directory containing the profile database
    self.input_file_path = '/' + '/'.join(os.path.abspath(self.profile_db_path).split('/')[:-1])

    self.progress.update('Reading the data ...')
    profile_db = dbops.ProfileDatabase(self.profile_db_path)
    self.sample_ids = profile_db.samples  # we set this now, but we will overwrite it with args.samples_of_interest if necessary

    if not profile_db.meta['SNVs_profiled']:
        self.progress.end()
        raise ConfigError("Well well well. It seems SNVs were not characterized for this profile database.\
                           Sorry, there is nothing to report here!")

    if self.engine == 'NT':
        self.data = profile_db.db.get_table_as_dict(t.variable_nts_table_name)
    elif self.engine == 'AA':
        # AA specific stuff. first check whether things were profiled
        if not profile_db.meta['AA_frequencies_profiled']:
            raise ConfigError("It seems AA frequencies were not characterized for this profile database.\
                               There is nothing to report here for AAs!")

        # get the data.
        self.data = profile_db.db.get_table_as_dict(t.variable_aas_table_name)

        # append split_name information
        for e in self.data.values():
            e['split_name'] = self.gene_callers_id_to_split_name_dict[e['corresponding_gene_call']]
    else:
        raise ConfigError("VariabilitySuper :: Anvi'o doesn't know what to do with a engine on '%s' yet :/" % self.engine)

    profile_db.disconnect()

    self.progress.end()
def __init__(self, args, hmm_sources, run=terminal.Run(), progress=terminal.Progress()):
    """Merge HMM hits from multiple contigs databases into a single object.

    Loads the genome descriptions from `args`, verifies the requested
    `hmm_sources` are present in every genome, then walks each genome's
    contigs database and copies its HMM hits, splits, sequences, and gene
    calls into this instance, prefixing every identifier with the source
    contigs db hash so entries from different databases cannot collide.

    NOTE(review): the `run`/`progress` defaults are mutable instances shared
    across all calls that omit them -- presumably intentional (this pattern
    appears throughout the codebase), but worth confirming.
    """
    self.args = args
    self.run = run
    self.progress = progress
    self.hmm_sources = hmm_sources
    # genome_name -> list of hash-prefixed split names, filled in the loop below
    self.splits_dict = {}

    # process genome descriptions
    GenomeDescriptions.__init__(self, args, run=self.run, progress=self.progress)
    self.load_genomes_descriptions(skip_functions=True, init=False)

    # only HMM sources that occur in *every* genome can be used downstream
    hmm_sources_in_all_genomes = self.get_HMM_sources_common_to_all_genomes(sources_that_must_be_common=hmm_sources)

    if not len(hmm_sources_in_all_genomes):
        raise ConfigError("There are no HMM sources among your external genomes that occur in every genome :/")

    # initialize the super
    SequencesForHMMHits.__init__(self, None, sources=hmm_sources, run=self.run, progress=self.progress)

    # internal genomes are recognized by the presence of a profile db path
    num_internal_genomes = len(set([g['genome_hash'] for g in self.genomes.values() if 'profile_db_path' in g]))
    collection_names = set([g['collection_id'] for g in self.genomes.values() if 'collection_id' in g])

    if num_internal_genomes:
        self.run.warning("SequencesForHMMHitsWrapperForMultipleContigs class is speaking (yes, the class is "
                         "quite aware of its very long name thankyouverymuch). Of the total %d genome descriptions "
                         "it was given, %d seem to represent internal genomes with bins in collection(s) '%s'. Anvi'o "
                         "will make sure HMM hits to be used for downstream analyses are only those that match to contigs "
                         "that were included in those selections." % (len(self.genomes), num_internal_genomes, ', '.join(collection_names)), lc="green")

    # very hacky code follows. here we generate a self SequencesForHMMHits object,
    # and we will fill everything in it with slightly modified information so multiple
    # contigs databases could be processed by this talented class seamlessly.
    hmm_hits_splits_counter = 0
    for genome_name in self.genomes:
        g = self.genomes[genome_name]
        contigs_db_path = g['contigs_db_path']
        contigs_db_hash = g['contigs_db_hash']

        # this is an important variable and allows us to track origins of HMM hits for bins
        # and individual contigs databases seamlessly. if you want to understand truly what
        # the hell does this mean, look at `get_genome_hash_for_external_genome` and
        # `get_genome_hash_for_internal_genome` functions in `genomedescriptions.py`.
        genome_hash = None

        # here we check if the genome descriptions contain reference to a collection name,
        # because if it is the case, we need to focus only on hmm hits that are relevant
        # to splits in this collection:
        if 'collection_id' in g:
            if ('bin_id' not in g) or ('profile_db_path' not in g):
                raise ConfigError("There is something VERY weird going on. Your genome descriptions object contains "
                                  "a collection name, yet it doesn't know anything about a bin name or profile database "
                                  "path. While this is very interesting because it should never happen, anvi'o will say "
                                  "goodbye and abruptly quit in confusion :(")

            # setup an args object, and recover the split names of interest
            args = argparse.Namespace(profile_db=g['profile_db_path'],
                                      contigs_db=g['contigs_db_path'],
                                      bin_id=g['bin_id'],
                                      collection_name=g['collection_id'])
            split_names_of_interest = ccollections.GetSplitNamesInBins(args).get_split_names_only()

            # a 12-character hash derived from the bin's split names plus the
            # contigs db hash, so the same bin from different dbs stays distinct
            genome_hash = hashlib.sha224('_'.join([''.join(split_names_of_interest), contigs_db_hash]).encode('utf-8')).hexdigest()[0:12]

            # current hmm hits now will match to the collection
            current = SequencesForHMMHits(contigs_db_path, sources=hmm_sources, split_names_of_interest=split_names_of_interest)
        else:
            # external genome: the whole contigs db is the genome, so the db
            # hash doubles as the genome hash
            current = SequencesForHMMHits(contigs_db_path, sources=hmm_sources)
            genome_hash = contigs_db_hash

        # copy HMM hits over, prefixing ids with the contigs db hash to keep
        # them unique across databases
        for hmm_hit_id in current.hmm_hits:
            hit = current.hmm_hits[hmm_hit_id]
            hit['gene_callers_id'] = '%s_%d' % (contigs_db_hash, hit['gene_callers_id'])
            hit['genome_hash'] = genome_hash
            self.hmm_hits['%s_%d' % (contigs_db_hash, hmm_hit_id)] = hit

        # hits info is identical across genomes (sources are common to all),
        # so it only needs to be copied once
        if not self.hmm_hits_info:
            for hmm_source in hmm_sources_in_all_genomes:
                self.hmm_hits_info[hmm_source] = current.hmm_hits_info[hmm_source]

        # merge split-level hit entries under a fresh running counter
        for hit in current.hmm_hits_splits.values():
            hit['split'] = '%s_%s' % (contigs_db_hash, hit['split'])
            hit['hmm_hit_entry_id'] = '%s_%d' % (contigs_db_hash, hit['hmm_hit_entry_id'])
            self.hmm_hits_splits[hmm_hits_splits_counter] = hit
            hmm_hits_splits_counter += 1

        # merge sequences and gene calls, again hash-prefixing every key
        for seq in current.contig_sequences:
            self.contig_sequences['%s_%s' % (contigs_db_hash, seq)] = current.contig_sequences[seq]

        for seq in current.aa_sequences:
            self.aa_sequences['%s_%s' % (contigs_db_hash, seq)] = current.aa_sequences[seq]

        for gene_callers_id in current.genes_in_contigs:
            entry = current.genes_in_contigs[gene_callers_id]
            entry['contig'] = '%s_%s' % (contigs_db_hash, entry['contig'])
            self.genes_in_contigs['%s_%d' % (contigs_db_hash, gene_callers_id)] = entry

        self.splits_dict[genome_name] = ['%s_%s' % (contigs_db_hash, s) for s in current.splits_in_contigs]