def profile(self):
    """Profile all contigs in parallel and store the results in the profile database.

    Spawns `self.num_threads` worker processes that pull contig indices from a
    shared queue, profile them, and push finished contig objects back through an
    output queue. Finished contigs are buffered in `self.contigs` and flushed to
    the database every `self.write_buffer_size` contigs. After all contigs are
    received (or a SIGINT arrives), workers are terminated, the final buffer is
    stored, and split/contig abundances are normalized by the overall mean
    coverage directly in the profile database.

    Side effects: mutates `self.contigs`, writes to the profile and auxiliary
    databases, and drives `self.progress` / `self.run` reporting.
    """
    manager = multiprocessing.Manager()

    # Static, read-only information every worker needs. NOTE(review): the
    # original code created a `manager.dict()` here and immediately rebound
    # the name to this plain dict literal, so the managed (shared) dict was
    # never used; the dead call has been removed. Workers receive a pickled
    # copy of this plain dict, which matches the original behavior.
    info_dict = {'input_file_path': self.input_file_path,
                 'contig_names': self.contig_names,
                 'contig_lengths': self.contig_lengths,
                 'splits_basic_info': self.splits_basic_info,
                 'split_length': self.a_meta['split_length'],
                 'min_coverage_for_variability': self.min_coverage_for_variability,
                 'skip_SNV_profiling': self.skip_SNV_profiling,
                 'report_variability_full': self.report_variability_full,
                 'contig_name_to_splits': self.contig_name_to_splits,
                 'contig_sequences': self.contig_sequences,
                 'min_mean_coverage': self.min_mean_coverage}

    available_index_queue = manager.Queue()
    # bounded queue so workers block instead of piling up finished contigs in memory
    output_queue = manager.Queue(self.queue_size)

    # put contig indices into the queue to be read from within the worker
    for i in range(self.num_contigs):
        available_index_queue.put(i)

    processes = []
    for _ in range(self.num_threads):
        processes.append(multiprocessing.Process(target=BAMProfiler.profile_contig_worker,
                                                 args=(available_index_queue, output_queue, info_dict)))

    for proc in processes:
        proc.start()

    received_contigs = 0
    discarded_contigs = 0
    memory_usage = None

    self.progress.new('Profiling using ' + str(self.num_threads) + ' thread%s' % ('s' if self.num_threads > 1 else ''))
    self.progress.update('initializing threads ...')

    # FIXME: memory usage should be generalized.
    last_memory_update = int(time.time())

    self.progress.update('contigs are being processed ...')

    while received_contigs < self.num_contigs:
        try:
            contig = output_queue.get()

            # if we have a contig back, it means we are good to go with it,
            # otherwise it is garbage.
            if contig:
                self.contigs.append(contig)
            else:
                discarded_contigs += 1

            received_contigs += 1

            # refresh the memory-usage figure at most once every 5 seconds
            if (int(time.time()) - last_memory_update) > 5:
                memory_usage = utils.get_total_memory_usage()
                last_memory_update = int(time.time())

            self.progress.update('Processed %d of %d contigs. Current memory usage: %s' % \
                                        (received_contigs, self.num_contigs, memory_usage or '...'))

            # here you're about to witness the poor side of Python (or our use of it).
            # the problem we run into here was the lack of action from the garbage
            # collector on the processed objects. although we couldn't find any refs to
            # these objects, the garbage collector kept them in the memory, and the `del`
            # statement on the `split` object did not yield any improvement either. so
            # here we are accessing the atomic data structures in our split objects to
            # try to relieve the memory by encouraging the garbage collector to realize
            # what's up explicitly.
            if self.write_buffer_size > 0 and len(self.contigs) % self.write_buffer_size == 0:
                self.store_contigs_buffer()
                for c in self.contigs:
                    for split in c.splits:
                        del split.coverage
                        del split.auxiliary
                        del split
                    del c.splits[:]
                    del c.coverage
                    del c
                del self.contigs[:]
        except KeyboardInterrupt:
            # FIX: message previously misspelled "recieved"
            print("Anvi'o profiler received SIGINT, terminating all processes...")
            break

    for proc in processes:
        proc.terminate()

    # flush whatever is left in the buffer, then close the auxiliary database
    self.store_contigs_buffer()
    self.auxiliary_db.close()

    self.progress.end()

    # FIXME: this needs to be checked:
    if discarded_contigs > 0:
        self.run.info('contigs_after_C', pp(received_contigs - discarded_contigs))

    overall_mean_coverage = 1
    if self.total_length_of_all_contigs != 0:
        overall_mean_coverage = self.total_coverage_values_for_all_contigs / self.total_length_of_all_contigs

    # FIXME: We know this is ugly. You can keep your opinion to yourself.
    if overall_mean_coverage > 0.0:
        # avoid dividing by zero
        dbops.ProfileDatabase(self.profile_db_path).db._exec("UPDATE atomic_data_splits SET abundance = abundance / " + str(overall_mean_coverage) + " * 1.0;")
        dbops.ProfileDatabase(self.profile_db_path).db._exec("UPDATE atomic_data_contigs SET abundance = abundance / " + str(overall_mean_coverage) + " * 1.0;")

    self.check_contigs(num_contigs=received_contigs - discarded_contigs)
def profile(self):
    """Profile all contigs in parallel and store the results in the profile database.

    Spawns `self.num_threads` worker processes that pull contig indices from a
    shared queue and push finished contig objects back through a bounded output
    queue (workers receive `self` directly as the first worker argument).
    Finished contigs are buffered in `self.contigs` and flushed to the database
    every `self.write_buffer_size` contigs. After all contigs are received (or a
    SIGINT arrives), workers are terminated, the final buffer is stored, split
    and contig abundances are normalized by the overall mean coverage directly
    in the profile database, and (unless SNV profiling was skipped) the number
    of reported SNVs is recorded as layer-additional data.

    Side effects: mutates `self.contigs`, `self.layer_additional_data` and
    `self.layer_additional_keys`, writes to the profile and auxiliary
    databases, and drives `self.progress` / `self.run` reporting.
    """
    manager = multiprocessing.Manager()
    available_index_queue = manager.Queue()
    # bounded queue so workers block instead of piling up finished contigs in memory
    output_queue = manager.Queue(self.queue_size)

    # put contig indices into the queue to be read from within the worker
    for i in range(self.num_contigs):
        available_index_queue.put(i)

    processes = []
    for _ in range(self.num_threads):
        processes.append(multiprocessing.Process(target=BAMProfiler.profile_contig_worker,
                                                 args=(self, available_index_queue, output_queue)))

    for proc in processes:
        proc.start()

    received_contigs = 0
    discarded_contigs = 0
    memory_usage = None

    self.progress.new('Profiling w/' + str(self.num_threads) + ' thread%s' % ('s' if self.num_threads > 1 else ''),
                      progress_total_items=self.num_contigs)
    self.progress.update('initializing threads ...')

    # FIXME: memory usage should be generalized.
    last_memory_update = int(time.time())

    self.progress.update('contigs are being processed ...')
    # NOTE(review): `increment` is called once here with 0 and never inside the
    # loop, so the progress bar's item counter may never advance even though
    # `progress_total_items` is set — confirm against the progress API.
    self.progress.increment(received_contigs)

    while received_contigs < self.num_contigs:
        try:
            contig = output_queue.get()

            # if we have a contig back, it means we are good to go with it,
            # otherwise it is garbage.
            if contig:
                self.contigs.append(contig)
            else:
                discarded_contigs += 1

            received_contigs += 1

            # refresh the memory-usage figure at most once every 5 seconds
            if (int(time.time()) - last_memory_update) > 5:
                memory_usage = utils.get_total_memory_usage()
                last_memory_update = int(time.time())

            self.progress.update('%d of %d contigs ⚙ / MEM ☠️ %s' % \
                                        (received_contigs, self.num_contigs, memory_usage or '??'))

            # here you're about to witness the poor side of Python (or our use of it).
            # the problem we run into here was the lack of action from the garbage
            # collector on the processed objects. although we couldn't find any refs to
            # these objects, the garbage collector kept them in the memory, and the `del`
            # statement on the `split` object did not yield any improvement either. so
            # here we are accessing the atomic data structures in our split objects to
            # try to relieve the memory by encouraging the garbage collector to realize
            # what's up explicitly.
            if self.write_buffer_size > 0 and len(self.contigs) % self.write_buffer_size == 0:
                self.store_contigs_buffer()
                for c in self.contigs:
                    for split in c.splits:
                        del split.coverage
                        del split.auxiliary
                        del split
                    del c.splits[:]
                    del c.coverage
                    del c
                del self.contigs[:]
        except KeyboardInterrupt:
            # FIX: message previously misspelled "recieved"
            self.run.info_single("Anvi'o profiler received SIGINT, terminating all processes...", nl_before=2)
            break

    for proc in processes:
        proc.terminate()

    # flush whatever is left in the buffer, then close the auxiliary database
    self.store_contigs_buffer()
    self.auxiliary_db.close()

    self.progress.end()

    # FIXME: this needs to be checked:
    if discarded_contigs > 0:
        self.run.info('contigs_after_C', pp(received_contigs - discarded_contigs))

    overall_mean_coverage = 1
    if self.total_length_of_all_contigs != 0:
        overall_mean_coverage = self.total_coverage_values_for_all_contigs / self.total_length_of_all_contigs

    # FIXME: We know this is ugly. You can keep your opinion to yourself.
    if overall_mean_coverage > 0.0:
        # avoid dividing by zero
        dbops.ProfileDatabase(self.profile_db_path).db._exec("UPDATE atomic_data_splits SET abundance = abundance / " + str(overall_mean_coverage) + " * 1.0;")
        dbops.ProfileDatabase(self.profile_db_path).db._exec("UPDATE atomic_data_contigs SET abundance = abundance / " + str(overall_mean_coverage) + " * 1.0;")

    if not self.skip_SNV_profiling:
        self.layer_additional_data['num_SNVs_reported'] = TableForVariability(self.profile_db_path, progress=null_progress).num_entries
        self.layer_additional_keys.append('num_SNVs_reported')

    self.check_contigs(num_contigs=received_contigs - discarded_contigs)