Example #1
    def profile(self):
        manager = multiprocessing.Manager()
        # pack everything the workers need into a managed dict; assigning a
        # plain dict over the proxy would never reach the worker processes.
        info_dict = manager.dict({
            'input_file_path': self.input_file_path,
            'contig_names': self.contig_names,
            'contig_lengths': self.contig_lengths,
            'splits_basic_info': self.splits_basic_info,
            'split_length': self.a_meta['split_length'],
            'min_coverage_for_variability': self.min_coverage_for_variability,
            'skip_SNV_profiling': self.skip_SNV_profiling,
            'report_variability_full': self.report_variability_full,
            'contig_name_to_splits': self.contig_name_to_splits,
            'contig_sequences': self.contig_sequences,
            'min_mean_coverage': self.min_mean_coverage
        })

        available_index_queue = manager.Queue()
        output_queue = manager.Queue(self.queue_size)

        # put contig indices into the queue to be read from within
        # the worker
        for i in range(self.num_contigs):
            available_index_queue.put(i)

        processes = []
        for _ in range(self.num_threads):
            processes.append(
                multiprocessing.Process(
                    target=BAMProfiler.profile_contig_worker,
                    args=(available_index_queue, output_queue, info_dict)))

        for proc in processes:
            proc.start()

        received_contigs = 0
        discarded_contigs = 0
        memory_usage = None

        self.progress.new('Profiling using %d thread%s' % (self.num_threads,
                          's' if self.num_threads > 1 else ''))
        self.progress.update('initializing threads ...')
        # FIXME: memory usage should be generalized.
        last_memory_update = int(time.time())

        self.progress.update('contigs are being processed ...')
        while received_contigs < self.num_contigs:
            try:
                contig = output_queue.get()

                # a contig object means the worker profiled it successfully;
                # anything falsy means the worker discarded it.
                if contig:
                    self.contigs.append(contig)
                else:
                    discarded_contigs += 1

                received_contigs += 1

                if (int(time.time()) - last_memory_update) > 5:
                    memory_usage = utils.get_total_memory_usage()
                    last_memory_update = int(time.time())

                self.progress.update('Processed %d of %d contigs. Current memory usage: %s' % \
                            (received_contigs, self.num_contigs, memory_usage or '...'))

                # here you're about to witness the poor side of Python (or our use of it).
                # the problem we ran into here was the lack of action from the garbage
                # collector on the processed objects: although we couldn't find any refs
                # to these objects, the garbage collector kept them in memory, and a `del`
                # statement on the `split` object did not yield any improvement either.
                # so here we are accessing the atomic data structures in our split objects
                # and deleting them explicitly to encourage the garbage collector to
                # reclaim the memory.
                # flush once the buffer is full; the non-empty check avoids a
                # redundant flush when a discarded contig arrives right after one
                if self.write_buffer_size > 0 and self.contigs and \
                        len(self.contigs) % self.write_buffer_size == 0:
                    self.store_contigs_buffer()
                    for c in self.contigs:
                        for split in c.splits:
                            del split.coverage
                            del split.auxiliary
                            del split
                        del c.splits[:]
                        del c.coverage
                        del c
                    del self.contigs[:]
            except KeyboardInterrupt:
                print(
                    "Anvi'o profiler received SIGINT, terminating all processes..."
                )
                break

        for proc in processes:
            proc.terminate()

        self.store_contigs_buffer()
        self.auxiliary_db.close()
        self.progress.end()

        # FIXME: this needs to be checked:
        if discarded_contigs > 0:
            self.run.info('contigs_after_C',
                          pp(received_contigs - discarded_contigs))

        overall_mean_coverage = 1
        if self.total_length_of_all_contigs != 0:
            overall_mean_coverage = self.total_coverage_values_for_all_contigs / self.total_length_of_all_contigs

        # FIXME: We know this is ugly. You can keep your opinion to yourself.
        if overall_mean_coverage > 0.0:
            # avoid dividing by zero
            dbops.ProfileDatabase(self.profile_db_path).db._exec(
                "UPDATE atomic_data_splits SET abundance = abundance / " +
                str(overall_mean_coverage) + " * 1.0;")
            dbops.ProfileDatabase(self.profile_db_path).db._exec(
                "UPDATE atomic_data_contigs SET abundance = abundance / " +
                str(overall_mean_coverage) + " * 1.0;")

        self.check_contigs(num_contigs=received_contigs - discarded_contigs)
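
Neither example includes BAMProfiler.profile_contig_worker itself, so the shape of the consumer loop is left implicit. Below is a minimal, self-contained sketch of the queue pattern both versions rely on: every index is enqueued up front, each worker drains the index queue until it is empty, and results (or a falsy placeholder for discarded items) come back through the output queue. The worker function and its doubling logic are illustrative stand-ins, not anvi'o code.

import multiprocessing
import queue

def worker(available_index_queue, output_queue, items):
    # drain indices until the shared queue is exhausted, then exit
    while True:
        try:
            i = available_index_queue.get(block=False)
        except queue.Empty:
            break
        # a falsy result is the signal the parent counts as "discarded"
        output_queue.put(items[i] * 2 if items[i] > 0 else None)

if __name__ == '__main__':
    items = [3, -1, 5]
    manager = multiprocessing.Manager()
    available_index_queue = manager.Queue()
    output_queue = manager.Queue()

    # enqueue all work before any worker starts, exactly as profile() does
    for i in range(len(items)):
        available_index_queue.put(i)

    processes = [multiprocessing.Process(target=worker,
                                         args=(available_index_queue, output_queue, items))
                 for _ in range(2)]
    for proc in processes:
        proc.start()

    received = 0
    while received < len(items):
        print(output_queue.get())
        received += 1

    # terminate() mirrors profile(); join() would suffice here, since the
    # workers exit on their own once the index queue is empty
    for proc in processes:
        proc.terminate()

Because every index is enqueued before the workers start, an empty queue unambiguously means the work is done; if items were produced concurrently, a per-worker sentinel value would be the safer exit signal.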
Example #2
    def profile(self):
        manager = multiprocessing.Manager()
        available_index_queue = manager.Queue()
        output_queue = manager.Queue(self.queue_size)

        # put contig indices into the queue to be read from within
        # the worker
        for i in range(self.num_contigs):
            available_index_queue.put(i)

        processes = []
        for _ in range(self.num_threads):
            processes.append(
                multiprocessing.Process(
                    target=BAMProfiler.profile_contig_worker,
                    args=(self, available_index_queue, output_queue)))
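        # NOTE: unlike Example #1, the worker receives `self` directly; under
        # the 'spawn' start method (macOS, Windows) the entire BAMProfiler
        # instance must then be picklable, which is why Example #1 packs only
        # the needed fields into info_dict instead.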

        for proc in processes:
            proc.start()

        received_contigs = 0
        discarded_contigs = 0
        memory_usage = None

        self.progress.new('Profiling using %d thread%s' % (self.num_threads, 's' if self.num_threads > 1 else ''), progress_total_items=self.num_contigs)
        self.progress.update('initializing threads ...')
        # FIXME: memory usage should be generalized.
        last_memory_update = int(time.time())

        self.progress.update('contigs are being processed ...')
        self.progress.increment(received_contigs)
        while received_contigs < self.num_contigs:
            try:
                contig = output_queue.get()

                # a contig object means the worker profiled it successfully;
                # anything falsy means the worker discarded it.
                if contig:
                    self.contigs.append(contig)
                else:
                    discarded_contigs += 1

                received_contigs += 1

                if (int(time.time()) - last_memory_update) > 5:
                    memory_usage = utils.get_total_memory_usage()
                    last_memory_update = int(time.time())

                self.progress.update('%d of %d contigs ⚙  / MEM ☠️  %s' % \
                            (received_contigs, self.num_contigs, memory_usage or '??'))

                # here you're about to witness the poor side of Python (or our use of it).
                # the problem we ran into here was the lack of action from the garbage
                # collector on the processed objects: although we couldn't find any refs
                # to these objects, the garbage collector kept them in memory, and a `del`
                # statement on the `split` object did not yield any improvement either.
                # so here we are accessing the atomic data structures in our split objects
                # and deleting them explicitly to encourage the garbage collector to
                # reclaim the memory.
                # flush once the buffer is full; the non-empty check avoids a
                # redundant flush when a discarded contig arrives right after one
                if self.write_buffer_size > 0 and self.contigs and len(self.contigs) % self.write_buffer_size == 0:
                    self.store_contigs_buffer()
                    for c in self.contigs:
                        for split in c.splits:
                            del split.coverage
                            del split.auxiliary
                            del split
                        del c.splits[:]
                        del c.coverage
                        del c
                    del self.contigs[:]
            except KeyboardInterrupt:
                self.run.info_single("Anvi'o profiler received SIGINT, terminating all processes...", nl_before=2)
                break

        for proc in processes:
            proc.terminate()

        self.store_contigs_buffer()
        self.auxiliary_db.close()

        self.progress.end()

        # FIXME: this needs to be checked:
        if discarded_contigs > 0:
            self.run.info('contigs_after_C', pp(received_contigs - discarded_contigs))

        overall_mean_coverage = 1
        if self.total_length_of_all_contigs != 0:
            overall_mean_coverage = self.total_coverage_values_for_all_contigs / self.total_length_of_all_contigs

        # FIXME: We know this is ugly. You can keep your opinion to yourself.
        if overall_mean_coverage > 0.0:
            # avoid dividing by zero
            dbops.ProfileDatabase(self.profile_db_path).db._exec("UPDATE atomic_data_splits SET abundance = abundance / " + str(overall_mean_coverage) + " * 1.0;")
            dbops.ProfileDatabase(self.profile_db_path).db._exec("UPDATE atomic_data_contigs SET abundance = abundance / " + str(overall_mean_coverage) + " * 1.0;")

        if not self.skip_SNV_profiling:
            self.layer_additional_data['num_SNVs_reported'] = TableForVariability(self.profile_db_path, progress=null_progress).num_entries
            self.layer_additional_keys.append('num_SNVs_reported')

        self.check_contigs(num_contigs=received_contigs - discarded_contigs)
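
The reference-deletion dance both examples perform after flushing the write buffer is a manual attempt to make the buffered contigs collectable. A more direct variant of the same idea, sketched here as an assumption rather than as what anvi'o actually ships, is to clear the buffer and invoke the collector explicitly; store_callback is a hypothetical stand-in for self.store_contigs_buffer.

import gc

def flush_contigs_buffer(contigs, store_callback):
    # persist whatever is buffered ...
    store_callback(contigs)
    # ... drop every reference this list holds ...
    contigs.clear()
    # ... and explicitly ask the collector to reclaim any reference cycles
    # that the per-attribute `del` chain in the examples breaks by hand
    gc.collect()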
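
Both examples also assemble their UPDATE statements by string concatenation. The interpolated value is a locally computed float, so there is no injection risk here, but a parameterized query avoids the pattern altogether. The sketch below talks to sqlite3 directly, since whether anvi'o's db wrapper accepts bound parameters is not something these excerpts establish.

import sqlite3

def normalize_abundance(profile_db_path, overall_mean_coverage):
    conn = sqlite3.connect(profile_db_path)
    for table in ('atomic_data_splits', 'atomic_data_contigs'):
        # table names cannot be bound as parameters, but the divisor can
        conn.execute('UPDATE %s SET abundance = abundance / ?' % table,
                     (overall_mean_coverage,))
    conn.commit()
    conn.close()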