def generate_variabile_nts_table(self):
    if self.skip_SNV_profiling:
        return

    variable_nts_table = TableForVariability(self.profile_db_path, progress=self.progress)

    for contig in self.contigs:
        for split in contig.splits:
            for column_profile in list(split.column_profiles.values()):
                # let's figure out more about this particular variable position
                pos_in_contig = column_profile['pos_in_contig']

                column_profile['in_partial_gene_call'], \
                column_profile['in_complete_gene_call'], \
                column_profile['base_pos_in_codon'] = self.get_nt_position_info(contig.name, pos_in_contig)

                column_profile['sample_id'] = self.sample_id

                # `-1` means there is no gene call that corresponds to this nt position;
                # it will be updated in the following lines if there is one. yeah, we use
                # '-1', because gene caller ids start from 0 :/
                column_profile['corresponding_gene_call'] = -1
                column_profile['codon_order_in_gene'] = -1

                # if this particular position (`pos_in_contig`) falls within a COMPLETE gene call,
                # we would like to find out which unique gene caller id(s) match this position.
                if column_profile['in_complete_gene_call']:
                    corresponding_gene_caller_ids = self.get_corresponding_gene_caller_ids_for_base_position(contig.name, pos_in_contig)

                    # if there is more than one corresponding gene call, it usually indicates an
                    # assembly error. just to be on the safe side, we will not report a corresponding
                    # unique gene callers id for this position
                    if len(corresponding_gene_caller_ids) == 1:
                        # this nucleotide position is in a single complete gene call. we will do two
                        # things here: first, store the gene_callers_id that corresponds to this nt
                        # position, and then store the order of the corresponding codon in the gene.
                        gene_callers_id = corresponding_gene_caller_ids[0]
                        column_profile['corresponding_gene_call'] = gene_callers_id
                        column_profile['codon_order_in_gene'] = self.get_corresponding_codon_order_in_gene(gene_callers_id, contig.name, pos_in_contig)

                        # save this information for later use
                        self.codons_in_genes_to_profile_SCVs.add((gene_callers_id, column_profile['codon_order_in_gene']))

                variable_nts_table.append(column_profile)

    variable_nts_table.store()

    self.layer_additional_data['num_SNVs_reported'] = variable_nts_table.num_entries
    self.layer_additional_keys.append('num_SNVs_reported')
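# `get_nt_position_info` and `get_corresponding_codon_order_in_gene` are helpers defined
# elsewhere in the class. As a rough sketch of the arithmetic such a mapping is commonly
# assumed to perform for a forward-strand gene call (hypothetical; not the actual anvi'o
# implementation, and a reverse-strand gene would need the offset taken from its stop position):
def codon_position_sketch(gene_start, pos_in_contig):
    """Map a 0-based contig position within a gene to (codon_order_in_gene, base_pos_in_codon)."""
    offset = pos_in_contig - gene_start
    codon_order_in_gene = offset // 3     # 0-based index of the codon in the gene
    base_pos_in_codon = offset % 3 + 1    # 1, 2, or 3 within that codon
    return codon_order_in_gene, base_pos_in_codon

# e.g. codon_position_sketch(100, 104) -> (1, 2): the 5th base of the gene sits at the
# 2nd position of its 2nd codon.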
def generate_variabile_nts_table(self):
    if self.skip_SNV_profiling:
        return

    variable_nts_table = TableForVariability(self.profile_db_path, progress=null_progress)

    for contig in self.contigs:
        for split in contig.splits:
            for column_profile in list(split.column_profiles.values()):
                variable_nts_table.append(column_profile)

    variable_nts_table.store()
def merge_variable_nts_tables(self):
    variable_nts_table = TableForVariability(self.merged_profile_db_path, progress=self.progress)

    for input_profile_db_path in self.profile_dbs_info_dict:
        sample_profile_db = dbops.ProfileDatabase(input_profile_db_path, quiet=True)
        sample_variable_nts_table = sample_profile_db.db.get_table_as_list_of_tuples(tables.variable_nts_table_name, tables.variable_nts_table_structure)
        sample_profile_db.disconnect()

        for tpl in sample_variable_nts_table:
            # replace the per-sample entry id (the first field) with a fresh id in the merged table
            entry = tuple([variable_nts_table.next_id(tables.variable_nts_table_name)] + list(tpl[1:]))
            variable_nts_table.db_entries.append(entry)

    variable_nts_table.store()
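# A minimal, self-contained illustration of the re-keying done in `merge_variable_nts_tables`
# above: each row keeps every field except its first column (the per-sample entry id), which
# is replaced with a fresh id so that ids never collide across samples. The names below are
# hypothetical and only for illustration; they are not part of the anvi'o API.
def merge_rows_with_fresh_ids(per_sample_rows):
    """`per_sample_rows` is a list of per-sample row lists; each row is a tuple whose first field is an entry id."""
    merged, next_id = [], 0
    for rows in per_sample_rows:
        for row in rows:
            merged.append((next_id,) + tuple(row[1:]))
            next_id += 1
    return merged

# e.g. merge_rows_with_fresh_ids([[(0, 'c1', 120), (1, 'c1', 344)], [(0, 'c2', 88)]])
# -> [(0, 'c1', 120), (1, 'c1', 344), (2, 'c2', 88)]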
def profile(self):
    manager = multiprocessing.Manager()
    available_index_queue = manager.Queue()
    output_queue = manager.Queue(self.queue_size)

    # put contig indices into the queue to be read from within the worker
    for i in range(0, self.num_contigs):
        available_index_queue.put(i)

    processes = []
    for i in range(0, self.num_threads):
        processes.append(multiprocessing.Process(target=BAMProfiler.profile_contig_worker, args=(self, available_index_queue, output_queue)))

    for proc in processes:
        proc.start()

    received_contigs = 0
    discarded_contigs = 0
    memory_usage = None

    self.progress.new('Profiling w/' + str(self.num_threads) + ' thread%s' % ('s' if self.num_threads > 1 else ''), progress_total_items=self.num_contigs)
    self.progress.update('initializing threads ...')

    # FIXME: memory usage should be generalized.
    last_memory_update = int(time.time())

    self.progress.update('contigs are being processed ...')
    self.progress.increment(received_contigs)

    while received_contigs < self.num_contigs:
        try:
            contig = output_queue.get()

            # if we have a contig back, it means we are good to go with it,
            # otherwise it is garbage.
            if contig:
                self.contigs.append(contig)
            else:
                discarded_contigs += 1

            received_contigs += 1

            if (int(time.time()) - last_memory_update) > 5:
                memory_usage = utils.get_total_memory_usage()
                last_memory_update = int(time.time())

            self.progress.update('%d of %d contigs ⚙ / MEM ☠️ %s' % (received_contigs, self.num_contigs, memory_usage or '??'))

            # here you're about to witness the poor side of Python (or our use of it).
            # the problem we ran into here was the lack of action from the garbage
            # collector on the processed objects. although we couldn't find any refs to
            # these objects, the garbage collector kept them in memory, and a `del` statement
            # on the `split` object did not yield any improvement either. so here we are
            # accessing the atomic data structures in our split objects to try to relieve
            # the memory by explicitly encouraging the garbage collector to realize what's up.
            if self.write_buffer_size > 0 and len(self.contigs) % self.write_buffer_size == 0:
                self.store_contigs_buffer()

                for c in self.contigs:
                    for split in c.splits:
                        del split.coverage
                        del split.auxiliary
                        del split
                    del c.splits[:]
                    del c.coverage
                    del c
                del self.contigs[:]
        except KeyboardInterrupt:
            self.run.info_single("Anvi'o profiler received SIGINT, terminating all processes...", nl_before=2)
            break

    for proc in processes:
        proc.terminate()

    self.store_contigs_buffer()
    self.auxiliary_db.close()

    self.progress.end()

    # FIXME: this needs to be checked:
    if discarded_contigs > 0:
        self.run.info('contigs_after_C', pp(received_contigs - discarded_contigs))

    overall_mean_coverage = 1
    if self.total_length_of_all_contigs != 0:
        overall_mean_coverage = self.total_coverage_values_for_all_contigs / self.total_length_of_all_contigs

    # FIXME: We know this is ugly. You can keep your opinion to yourself.
    if overall_mean_coverage > 0.0:
        # avoid dividing by zero
        dbops.ProfileDatabase(self.profile_db_path).db._exec("UPDATE atomic_data_splits SET abundance = abundance / " + str(overall_mean_coverage) + " * 1.0;")
        dbops.ProfileDatabase(self.profile_db_path).db._exec("UPDATE atomic_data_contigs SET abundance = abundance / " + str(overall_mean_coverage) + " * 1.0;")

    if not self.skip_SNV_profiling:
        self.layer_additional_data['num_SNVs_reported'] = TableForVariability(self.profile_db_path, progress=null_progress).num_entries
        self.layer_additional_keys.append('num_SNVs_reported')

    self.check_contigs(num_contigs=received_contigs - discarded_contigs)
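# `BAMProfiler.profile_contig_worker` is referenced in `profile` above but not shown here.
# Below is a self-contained sketch of the generic queue pattern that method relies on:
# indices go into one queue, workers drain it and push results to an output queue, and the
# parent collects exactly `num_items` results. This is a hypothetical, minimal stand-in for
# illustration, not the actual anvi'o worker.
import multiprocessing
import queue

def worker_sketch(available_index_queue, output_queue):
    while True:
        try:
            index = available_index_queue.get(block=False)
        except queue.Empty:
            break  # nothing left to process
        output_queue.put(index * index)  # stand-in for the real per-contig profiling work

def run_pattern_sketch(num_items=8, num_workers=2):
    manager = multiprocessing.Manager()
    available_index_queue = manager.Queue()
    output_queue = manager.Queue()

    # fill the work queue before starting the workers, exactly as `profile` does
    for i in range(num_items):
        available_index_queue.put(i)

    workers = [multiprocessing.Process(target=worker_sketch, args=(available_index_queue, output_queue))
               for _ in range(num_workers)]
    for w in workers:
        w.start()

    # collect one result per submitted item, then let the workers exit on their own
    results = [output_queue.get() for _ in range(num_items)]
    for w in workers:
        w.join()

    return results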