Example #1
    def store_contigs_buffer(self):
        for contig in self.contigs:
            self.total_length_of_all_contigs += contig.length
            self.total_coverage_values_for_all_contigs += contig.coverage.mean * contig.length

            # we will divide every abundance after profiling is done.
            contig.abundance = contig.coverage.mean
            for split in contig.splits:
                split.abundance = split.coverage.mean

        self.progress.verbose = False
        self.generate_variabile_nts_table()
        self.generate_variabile_aas_table()
        self.store_split_coverages()
        self.progress.verbose = True

        # creating views in the database for atomic data we gathered during the profiling. Meren, please note
        # that the first entry has a view_id, and the second one does not have one. I know you will look at this
        # and be utterly confused 2 months from now. Please go read the description given in the dbops.py for the
        # function create_new_view defined in the class TablesForViews.
        view_data_splits, view_data_contigs = contigops.get_atomic_data_dicts(
            self.sample_id, self.contigs)

        dbops.TablesForViews(self.profile_db_path).create_new_view(
            data_dict=view_data_splits,
            table_name='atomic_data_splits',
            table_structure=t.atomic_data_table_structure,
            table_types=t.atomic_data_table_types,
            view_name='single',
            append_mode=True)

        dbops.TablesForViews(self.profile_db_path).create_new_view(
            data_dict=view_data_contigs,
            table_name='atomic_data_contigs',
            table_structure=t.atomic_data_table_structure,
            table_types=t.atomic_data_table_types,
            view_name=None,
            append_mode=True)
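
Every call in these examples hands `create_new_view` the same kind of payload: a dictionary keyed by item name (a split or a contig) whose values are dictionaries mapping the columns of `table_structure` to their values, plus a `view_name` that is either a label to register the table under ('single' here) or None to store the table without registering a view. The sketch below only illustrates that data shape being written to SQLite; `write_view_table`, the column names, and `example.db` are hypothetical and not part of the anvi'o API.

    import sqlite3

    # Hypothetical illustration only -- not anvi'o's dbops.TablesForViews. The point is
    # the shape of data_dict passed to create_new_view above:
    # {item_name: {column_name: value, ...}, ...}
    data_dict = {
        'contig_1_split_00001': {'length': 20000, 'mean_coverage': 14.2},
        'contig_1_split_00002': {'length': 18000, 'mean_coverage': 13.7},
    }

    table_structure = ['item', 'length', 'mean_coverage']  # first entry names the key column
    table_types = ['text', 'numeric', 'numeric']


    def write_view_table(db_path, table_name, structure, types, data):
        """Create a table matching `structure`/`types` and fill it from `data`."""
        columns = ', '.join('%s %s' % pair for pair in zip(structure, types))
        placeholders = ', '.join('?' * len(structure))

        with sqlite3.connect(db_path) as db:
            db.execute('CREATE TABLE IF NOT EXISTS %s (%s)' % (table_name, columns))
            for item, entry in data.items():
                row = [item] + [entry.get(column) for column in structure[1:]]
                db.execute('INSERT INTO %s VALUES (%s)' % (table_name, placeholders), row)


    write_view_table('example.db', 'atomic_data_splits', table_structure, table_types, data_dict)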
Example #2
    def _run(self):
        self.check_args()

        self.set_sample_id()

        self.init_dirs_and_dbs()

        self.run.log_file_path = self.generate_output_destination('RUNLOG.txt')
        self.run.info('anvio', anvio.__version__)
        self.run.info('profiler_version', anvio.__profile__version__)
        self.run.info('sample_id', self.sample_id)
        self.run.info('profile_db', self.profile_db_path, display_only=True)
        self.run.info('contigs_db', True if self.contigs_db_path else False)
        self.run.info('contigs_db_hash', self.a_meta['contigs_db_hash'])
        self.run.info('cmd_line', utils.get_cmd_line())
        self.run.info('merged', False)
        self.run.info('blank', self.blank)
        self.run.info('split_length', self.a_meta['split_length'])
        self.run.info('min_contig_length', self.min_contig_length)
        self.run.info('min_mean_coverage', self.min_mean_coverage)
        self.run.info('clustering_performed', self.contigs_shall_be_clustered)
        self.run.info('min_coverage_for_variability',
                      self.min_coverage_for_variability)
        self.run.info('skip_SNV_profiling', self.skip_SNV_profiling)
        self.run.info('profile_AA_frequencies', self.profile_AA_frequencies)
        self.run.info('report_variability_full', self.report_variability_full)
        self.run.info('gene_coverages_computed',
                      self.a_meta['genes_are_called'])

        # this is kinda important. we do not run full-blown profile function if we are dealing with a summarized
        # profile...
        if self.blank:
            self.init_mock_profile()
        elif self.input_file_path:
            self.init_profile_from_BAM()
            self.profile()
            if self.gen_serialized_profile:
                self.store_profile()
        elif self.serialized_profile_path:
            self.init_serialized_profile()
        else:
            raise ConfigError("What are you doing? :( Whatever it is, anvi'o will have none of it.")

        self.generate_variabile_nts_table()
        self.generate_variabile_aas_table()
        self.generate_gene_coverages_table()
        self.store_split_coverages()

        # creating views in the database for atomic data we gathered during the profiling. Meren, please note
        # that the first entry has a view_id, and the second one does not have one. I know you will look at this
        # and be utterly confused 2 months from now. Please go read the description given in the dbops.py for the
        # function create_new_view defined in the class TablesForViews.
        view_data_splits, view_data_contigs = self.atomic_data.get_data(
            self.sample_id, self.contigs)
        dbops.TablesForViews(self.profile_db_path, anvio.__profile__version__).create_new_view(
            data_dict=view_data_splits,
            table_name='atomic_data_splits',
            table_structure=t.atomic_data_table_structure,
            table_types=t.atomic_data_table_types,
            view_name='single')

        dbops.TablesForViews(self.profile_db_path, anvio.__profile__version__).create_new_view(
            data_dict=view_data_contigs,
            table_name='atomic_data_contigs',
            table_structure=t.atomic_data_table_structure,
            table_types=t.atomic_data_table_types,
            view_name=None)

        if self.contigs_shall_be_clustered:
            self.cluster_contigs()

        runinfo_serialized = self.generate_output_destination('RUNINFO.cp')
        self.run.info('runinfo', runinfo_serialized)
        self.run.store_info_dict(runinfo_serialized,
                                 strip_prefix=self.output_directory)

        if self.bam:
            self.bam.close()

        self.run.quit()
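
The inline comment flags the branching in the middle of `_run` as the important part: a blank profile only gets a mock profile, a BAM file triggers the full profiling pass (optionally serialized to disk afterwards), and an already serialized profile is simply loaded back. A condensed sketch of that decision, using hypothetical stand-in callables so it runs on its own, could look like this:

    # Hypothetical stand-ins; in the example above these are methods on the profiler object.
    def dispatch_profile_source(blank, input_file_path, serialized_profile_path,
                                init_mock_profile, init_profile_from_BAM, profile,
                                init_serialized_profile, store_profile=None):
        """Choose exactly one way to populate the profile, mirroring the if/elif chain above."""
        if blank:
            init_mock_profile()            # no reads at all: placeholder profile
        elif input_file_path:
            init_profile_from_BAM()        # set things up from the BAM file
            profile()                      # the expensive per-contig profiling pass
            if store_profile is not None:
                store_profile()            # optionally serialize the result to disk
        elif serialized_profile_path:
            init_serialized_profile()      # reuse a previously stored profile
        else:
            raise ValueError("neither a blank flag, a BAM file, nor a serialized profile was given")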
Example #3
    def gen_view_data_tables_from_atomic_data(self):
        essential_fields = [
            f for f in self.atomic_data_fields
            if constants.IS_ESSENTIAL_FIELD(f)
        ]
        auxiliary_fields = [
            f for f in self.atomic_data_fields
            if constants.IS_AUXILIARY_FIELD(f)
        ]

        # setting standard view table structure and types
        view_table_structure = [
            'contig'
        ] + self.sample_ids_found_in_input_dbs + auxiliary_fields
        view_table_types = [
            'text'
        ] + ['numeric'] * len(self.sample_ids_found_in_input_dbs) + ['text']

        # generate a dictionary for normalized coverage of each contig across samples per target
        self.normalized_coverages = {'contigs': {}, 'splits': {}}
        for target in ['contigs', 'splits']:
            for split_name in self.split_names:
                self.normalized_coverages[target][split_name] = {}
                for input_profile_db_path in self.profile_dbs_info_dict:
                    self.normalized_coverages[target][split_name][input_profile_db_path] = \
                        self.get_normalized_coverage_of_split(target, input_profile_db_path, split_name)

        # generate a dictionary for max normalized ratio of each contig across samples per target
        self.max_normalized_ratios = {'contigs': {}, 'splits': {}}
        for target in ['contigs', 'splits']:
            for split_name in self.split_names:
                self.max_normalized_ratios[target][split_name] = \
                    self.get_max_normalized_ratio_of_split(target, split_name)

        self.progress.new('Generating view data tables')
        for target in ['contigs', 'splits']:
            for essential_field in essential_fields:
                self.progress.update('Processing %s for %s ...' %
                                     (essential_field, target))

                data_dict = {}
                for split_name in self.split_names:
                    data_dict[split_name] = {
                        '__parent__': self.split_parents[split_name]
                    }

                    for input_profile_db_path in self.profile_dbs_info_dict:
                        sample_id = self.profile_dbs_info_dict[input_profile_db_path]['sample_id']
                        if essential_field == 'normalized_coverage':
                            data_dict[split_name][sample_id] = \
                                self.normalized_coverages[target][split_name][input_profile_db_path]
                        elif essential_field == 'max_normalized_ratio':
                            data_dict[split_name][sample_id] = \
                                self.max_normalized_ratios[target][split_name][input_profile_db_path]
                        elif essential_field == 'relative_abundance':
                            data_dict[split_name][sample_id] = \
                                self.get_relative_abundance_of_split(target, input_profile_db_path, split_name)
                        else:
                            data_dict[split_name][sample_id] = \
                                self.atomic_data_for_each_run[target][input_profile_db_path][split_name][essential_field]

                # time to store the data for this view in the profile database
                table_name = '_'.join([essential_field, target])
                dbops.TablesForViews(self.merged_profile_db_path).create_new_view(
                    data_dict=data_dict,
                    table_name=table_name,
                    table_structure=view_table_structure,
                    table_types=view_table_types,
                    view_name=essential_field if target == 'splits' else None)

        # if SNVs were not profiled, remove all entries from variability tables:
        if not self.SNVs_profiled:
            dbops.TablesForViews(self.merged_profile_db_path).remove(
                view_name='variability',
                table_names_to_blank=[
                    'variability_splits', 'variability_contigs'
                ])

        self.progress.end()
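
Both helper dictionaries built at the top of this function are keyed the same way: target ('contigs' or 'splits'), then split name, then input profile database, and their values later become the per-sample columns of the view tables. The sketch below illustrates one plausible reading of the max normalized ratio, assuming it means each sample's normalized coverage divided by the largest normalized coverage of that split in any sample; the function name and the zero guard are illustrative, not anvi'o's implementation.

    # A minimal sketch, assuming max-normalized ratio = normalized coverage / max across samples.
    def max_normalized_ratio_for_split(normalized_coverages_for_split):
        """normalized_coverages_for_split: {sample_id: normalized_coverage}"""
        maximum = max(normalized_coverages_for_split.values())
        if maximum == 0:
            # the split was not covered in any sample; report 0 everywhere
            return {sample: 0 for sample in normalized_coverages_for_split}
        return {sample: coverage / maximum
                for sample, coverage in normalized_coverages_for_split.items()}


    # example: one split observed in three samples
    print(max_normalized_ratio_for_split({'S01': 4.0, 'S02': 8.0, 'S03': 0.0}))
    # {'S01': 0.5, 'S02': 1.0, 'S03': 0.0}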
Example #4
    def _run(self):
        self.check_args()

        self.set_sample_id()

        self.init_dirs_and_dbs()

        self.run.log_file_path = self.generate_output_destination('RUNLOG.txt')
        self.run.info('anvio', anvio.__version__)
        self.run.info('profiler_version', anvio.__profile__version__)
        self.run.info('sample_id', self.sample_id)
        self.run.info(
            'description',
            ('Found (%d characters)' % len(self.description)) if self.description else None)
        self.run.info('profile_db', self.profile_db_path, display_only=True)
        self.run.info('contigs_db', True if self.contigs_db_path else False)
        self.run.info('contigs_db_hash', self.a_meta['contigs_db_hash'])
        self.run.info('cmd_line', utils.get_cmd_line())
        self.run.info('merged', False)
        self.run.info('blank', self.blank)
        self.run.info('split_length', self.a_meta['split_length'])
        self.run.info('min_contig_length', self.min_contig_length)
        self.run.info('min_mean_coverage', self.min_mean_coverage)
        self.run.info('clustering_performed', self.contigs_shall_be_clustered)
        self.run.info('min_coverage_for_variability',
                      self.min_coverage_for_variability)
        self.run.info('skip_SNV_profiling', self.skip_SNV_profiling)
        self.run.info('profile_AA_frequencies', self.profile_AA_frequencies)
        self.run.info('report_variability_full', self.report_variability_full)

        # this is kinda important. we do not run full-blown profile function if we are dealing with a summarized
        # profile...
        if self.blank:
            self.init_mock_profile()

            # creating a null view_data_splits dict: every split name maps to a dict in which
            # each atomic data column (everything after the first entry of the table structure)
            # is set to None:
            view_data_splits = dict(
                list(
                    zip(self.split_names, [
                        dict(
                            list(
                                zip(t.atomic_data_table_structure[1:], [None] *
                                    len(t.atomic_data_table_structure[1:]))))
                    ] * len(self.split_names))))
            dbops.TablesForViews(self.profile_db_path).remove(
                'single', table_names_to_blank=['atomic_data_splits'])
            dbops.TablesForViews(self.profile_db_path).create_new_view(
                data_dict=view_data_splits,
                table_name='atomic_data_splits',
                table_structure=t.atomic_data_table_structure,
                table_types=t.atomic_data_table_types,
                view_name='single')
        elif self.input_file_path:
            self.init_profile_from_BAM()
            self.profile()
        else:
            raise ConfigError(
                "What are you doing? :( Whatever it is, anvi'o will have none of it."
            )

        if self.contigs_shall_be_clustered:
            self.cluster_contigs()

        if self.bam:
            self.bam.close()

        self.run.quit()
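
The densest expression in this example is the "null view_data_splits dict" built for the blank profile. The snippet below is an equivalent, easier-to-read construction with illustrative stand-ins for `t.atomic_data_table_structure` and `self.split_names`; note that the original reuses one shared inner dict for every split, while the comprehension builds an independent dict per split, which makes no difference for the read-only use in `create_new_view`.

    # Illustrative stand-ins; the real values come from anvi'o's tables module and the profiler.
    atomic_data_table_structure = ['contig', 'std_coverage', 'mean_coverage', 'detection']
    split_names = ['split_00001', 'split_00002']

    # every split gets its own dict of all-None atomic data columns
    view_data_splits = {split_name: {column: None for column in atomic_data_table_structure[1:]}
                        for split_name in split_names}

    # {'split_00001': {'std_coverage': None, 'mean_coverage': None, 'detection': None},
    #  'split_00002': {'std_coverage': None, 'mean_coverage': None, 'detection': None}}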
Example #5
    def process_PCs(self, PCs_dict):
        self.progress.new('Generating view data')
        self.progress.update('...')

        PCs = list(PCs_dict.keys())

        for PC in PCs:
            self.view_data[PC] = dict([(genome_name, 0)
                                       for genome_name in self.genomes])
            self.view_data_presence_absence[PC] = dict([
                (genome_name, 0) for genome_name in self.genomes
            ])
            self.additional_view_data[PC] = {
                'num_genes_in_pc': 0,
                'num_genomes_pc_has_hits': 0,
                'SCG': 0
            }

            for gene_entry in PCs_dict[PC]:
                genome_name = gene_entry['genome_name']

                self.view_data[PC][genome_name] += 1
                self.view_data_presence_absence[PC][genome_name] = 1
                self.additional_view_data[PC]['num_genes_in_pc'] += 1

            self.additional_view_data[PC]['SCG'] = 1 if set(
                self.view_data[PC].values()) == set([1]) else 0

            self.additional_view_data[PC]['num_genomes_pc_has_hits'] = len([
                True for genome in self.view_data[PC]
                if self.view_data[PC][genome] > 0
            ])

        self.progress.end()

        ########################################################################################
        #                           FILTERING BASED ON OCCURRENCE
        ########################################################################################
        PCs_of_interest = set([])
        for PC in PCs:
            if self.additional_view_data[PC][
                    'num_genomes_pc_has_hits'] >= self.PC_min_occurrence:
                PCs_of_interest.add(PC)

        removed_PCs = 0
        for PC in PCs:
            if PC not in PCs_of_interest:
                self.view_data.pop(PC)
                self.view_data_presence_absence.pop(PC)
                self.additional_view_data.pop(PC)
                PCs_dict.pop(PC)
                removed_PCs += 1

        if self.PC_min_occurrence > 1:
            self.run.info(
                'PCs min occurrence', '%d (the filter removed %d PCs)' %
                (self.PC_min_occurrence, removed_PCs))

        ########################################################################################
        #            CAN WE CLUSTER THIS STUFF? DOES THE USER WANT US TO TRY REGARDLESS?
        ########################################################################################
        if len(PCs_dict) > self.max_num_PCs_for_hierarchical_clustering:
            if self.enforce_hierarchical_clustering:
                self.run.warning(
                    "You have %s PCs, which exceeds the number of PCs anvi'o is comfortable to cluster. But\
                                  since you have used the flag `--enforce-hierarchical-clustering`, anvi'o will attempt\
                                  to create a hierarchical clustering of your PCs anyway. It may take a bit of \
                                  time. Pour yourself a coffee. Or go to a nice vacation. See you in 10 mins, or next year \
                                  or never." % pp(len(PCs_dict)))
            else:
                self.run.warning("It seems you have %s protein clusters in your pangenome. This exceeds the soft limit\
                                  of %s for anvi'o to attempt to create a hierarchical clustering of your protein clusters\
                                  (which becomes the center tree in all anvi'o displays). If you want a hierarchical\
                                  clustering to be done anyway, please see the flag `--enforce-hierarchical-clustering`." \
                                            % (pp(len(PCs_dict)), pp(self.max_num_PCs_for_hierarchical_clustering)))
                self.skip_hierarchical_clustering = True

        ########################################################################################
        #                           STORING FILTERED DATA IN THE DB
        ########################################################################################
        table_structure = ['PC'] + sorted(self.genomes.keys())
        table_types = ['text'] + ['numeric'] * len(self.genomes)
        dbops.TablesForViews(self.pan_db_path).create_new_view(
            data_dict=self.view_data,
            table_name='PC_frequencies',
            table_structure=table_structure,
            table_types=table_types,
            view_name='PC_frequencies')

        dbops.TablesForViews(self.pan_db_path).create_new_view(
            data_dict=self.view_data_presence_absence,
            table_name='PC_presence_absence',
            table_structure=table_structure,
            table_types=table_types,
            view_name='PC_presence_absence')

        additional_data_structure = [
            'PC', 'num_genomes_pc_has_hits', 'num_genes_in_pc', 'SCG'
        ]
        dbops.TablesForViews(self.pan_db_path).create_new_view(
            data_dict=self.additional_view_data,
            table_name='additional_data',
            table_structure=additional_data_structure,
            table_types=['text', 'numeric', 'numeric', 'numeric'],
            view_name=None)

        # add additional data structure to the self table, so we can have them initially ordered
        # in the interface the way additional_data_structure suggests:
        pan_db = dbops.PanDatabase(self.pan_db_path, quiet=True)
        pan_db.db.set_meta_value('additional_data_headers',
                                 ','.join(additional_data_structure[1:]))
        pan_db.disconnect()

        ########################################################################################
        #                   RETURN THE -LIKELY- UPDATED PROTEIN CLUSTERS DICT
        ########################################################################################
        return PCs_dict
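
The first loop in this example is a straightforward counting pass over the protein clusters dict. The self-contained sketch below reproduces that pattern with a toy input: count genes per genome for each PC (frequencies), collapse the counts to presence/absence, and flag a PC as a single-copy core gene (SCG) when every genome contributes exactly one gene. All names and values here are illustrative, not anvi'o data structures.

    # Toy input: each PC maps to a list of gene entries, each tagged with its genome of origin.
    PCs_dict = {
        'PC_001': [{'genome_name': 'G1'}, {'genome_name': 'G2'}, {'genome_name': 'G3'}],
        'PC_002': [{'genome_name': 'G1'}, {'genome_name': 'G1'}],
    }
    genomes = ['G1', 'G2', 'G3']

    view_data, view_data_presence_absence, additional_view_data = {}, {}, {}
    for PC, gene_entries in PCs_dict.items():
        # frequencies: how many genes each genome contributes to this PC
        view_data[PC] = {genome: 0 for genome in genomes}
        for gene_entry in gene_entries:
            view_data[PC][gene_entry['genome_name']] += 1

        # presence/absence: 1 if the genome has at least one gene in this PC, else 0
        view_data_presence_absence[PC] = {g: int(count > 0) for g, count in view_data[PC].items()}
        additional_view_data[PC] = {
            'num_genes_in_pc': sum(view_data[PC].values()),
            'num_genomes_pc_has_hits': sum(view_data_presence_absence[PC].values()),
            'SCG': int(set(view_data[PC].values()) == {1}),
        }

    # PC_001 occurs exactly once in every genome -> SCG == 1;
    # PC_002 has two genes in G1 only -> SCG == 0, num_genomes_pc_has_hits == 1.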