Beispiel #1
0
    def store_contigs_buffer(self):
        """Flush the contigs currently buffered in `self.contigs` to the profile database.

        For every buffered contig this adds its length and its length-weighted
        mean coverage to the running totals kept on `self`, and records the mean
        coverage as the (not yet normalized) abundance of the contig and of each
        of its splits. It then calls the methods that store variability tables
        and split coverages, and finally appends the atomic data of the buffered
        contigs to the `atomic_data_splits` and `atomic_data_contigs` tables.
        """
        for buffered_contig in self.contigs:
            contig_mean_coverage = buffered_contig.coverage.mean
            contig_length = buffered_contig.length

            self.total_length_of_all_contigs += contig_length
            self.total_coverage_values_for_all_contigs += contig_mean_coverage * contig_length

            # abundances hold raw mean coverages for now; per the original note,
            # they are divided once profiling is complete.
            buffered_contig.abundance = contig_mean_coverage
            for buffered_split in buffered_contig.splits:
                buffered_split.abundance = buffered_split.coverage.mean

        self.generate_variabile_nts_table()
        self.generate_variabile_codons_table()
        self.store_split_coverages()

        splits_atomic_data, contigs_atomic_data = contigops.get_atomic_data_dicts(self.sample_id, self.contigs)

        # Each entry is (data, table name, view name). Only the splits table is
        # given a view name; the contigs table is stored with `view_name=None`.
        # See the description of `create_new_view` in the class TablesForViews
        # (dbops.py) for what that difference means.
        atomic_data_tables = (
            (splits_atomic_data, 'atomic_data_splits', 'single'),
            (contigs_atomic_data, 'atomic_data_contigs', None),
        )

        for atomic_data, atomic_table_name, atomic_view_name in atomic_data_tables:
            TablesForViews(self.profile_db_path).create_new_view(
                data_dict=atomic_data,
                table_name=atomic_table_name,
                table_structure=t.atomic_data_table_structure,
                table_types=t.atomic_data_table_types,
                view_name=atomic_view_name,
                append_mode=True)
Beispiel #2
0
    def gen_view_data_tables_from_atomic_data(self):
        """Create one view table per essential field and target in the merged profile database.

        Essential and auxiliary fields are selected from `self.atomic_data_fields`
        with the predicates in `constants`. Normalized coverages and max
        normalized ratios are computed up front for every split and stored on
        `self.normalized_coverages` and `self.max_normalized_ratios`. Then, for
        each target ('contigs', 'splits') and each essential field, a dictionary
        keyed by split name is built with one value per sample and stored through
        `TablesForViews.create_new_view` in a table named `<field>_<target>`.
        Only 'splits' tables are given a view name.

        If `self.SNVs_profiled` is false, the 'variability' view is removed and
        its two tables are blanked.
        """
        essential_fields = [f for f in self.atomic_data_fields if constants.IS_ESSENTIAL_FIELD(f)]
        auxiliary_fields = [f for f in self.atomic_data_fields if constants.IS_AUXILIARY_FIELD(f)]

        # setting standard view table structure and types. there is one type per
        # column: the item name, one numeric column per sample, and one text column
        # per auxiliary field, so the two lists always have the same length.
        # NOTE(review): auxiliary columns are assumed to be text, as the single
        # trailing 'text' of the previous implementation implied -- confirm.
        sample_ids = self.sample_ids_found_in_input_dbs
        view_table_structure = ['contig'] + sample_ids + auxiliary_fields
        view_table_types = ['text'] + ['numeric'] * len(sample_ids) + ['text'] * len(auxiliary_fields)

        # generate a dictionary for normalized coverage of each contig across samples per target
        self.normalized_coverages = {'contigs': {}, 'splits': {}}
        for target in ['contigs', 'splits']:
            for split_name in self.split_names:
                self.normalized_coverages[target][split_name] = {}
                for input_profile_db_path in self.profile_dbs_info_dict:
                    self.normalized_coverages[target][split_name][input_profile_db_path] = self.get_normalized_coverage_of_split(target, input_profile_db_path, split_name)

        # generate a dictionary for max normalized ratio of each contig across samples per target.
        # this is kept as a separate pass so that all normalized coverages are in place before
        # any ratio is requested.
        self.max_normalized_ratios = {'contigs': {}, 'splits': {}}
        for target in ['contigs', 'splits']:
            for split_name in self.split_names:
                self.max_normalized_ratios[target][split_name] = self.get_max_normalized_ratio_of_split(target, split_name)

        self.progress.new('Generating view data tables')
        for target in ['contigs', 'splits']:
            for essential_field in essential_fields:
                self.progress.update('Processing %s for %s ...' % (essential_field, target))

                data_dict = {}
                for split_name in self.split_names:
                    data_dict[split_name] = {'__parent__': self.split_parents[split_name]}

                    for input_profile_db_path in self.profile_dbs_info_dict:
                        sample_id = self.profile_dbs_info_dict[input_profile_db_path]['sample_id']

                        # three fields are derived here; every other field is read
                        # as-is from the atomic data of the input profile.
                        if essential_field == 'normalized_coverage':
                            data_dict[split_name][sample_id] = self.normalized_coverages[target][split_name][input_profile_db_path]
                        elif essential_field == 'max_normalized_ratio':
                            data_dict[split_name][sample_id] = self.max_normalized_ratios[target][split_name][input_profile_db_path]
                        elif essential_field == 'relative_abundance':
                            data_dict[split_name][sample_id] = self.get_relative_abundance_of_split(target, input_profile_db_path, split_name)
                        else:
                            data_dict[split_name][sample_id] = self.atomic_data_for_each_run[target][input_profile_db_path][split_name][essential_field]

                # time to store the data for this view in the profile database
                table_name = '_'.join([essential_field, target])
                TablesForViews(self.merged_profile_db_path).create_new_view(
                                                data_dict=data_dict,
                                                table_name=table_name,
                                                table_structure=view_table_structure,
                                                table_types=view_table_types,
                                                view_name=essential_field if target == 'splits' else None)

        # if SNVs were not profiled, remove all entries from variability tables:
        if not self.SNVs_profiled:
            TablesForViews(self.merged_profile_db_path).remove(view_name='variability', table_names_to_blank=['variability_splits', 'variability_contigs'])

        self.progress.end()
Beispiel #3
0
    def gen_view_data_tables_from_atomic_data(self):
        """Copy the view tables of every input profile database into the merged profile database.

        For each target ('contigs', 'splits') and each field listed in
        `constants.essential_data_fields_for_anvio_profiles`, the table named
        `<field>_<target>` is read in full from every input profile database and
        passed to `TablesForViews.create_new_view` on the merged database with
        `append_mode=True` and `skip_sanity_check=True`. Only 'splits' tables are
        given a view name.

        If `self.SNVs_profiled` is false, the 'variability' view is removed and
        its two tables are blanked.
        """
        essential_fields = constants.essential_data_fields_for_anvio_profiles

        self.progress.new('Views')
        for target in ['contigs', 'splits']:
            for essential_field in essential_fields:
                # the table carries the same name in the input databases and in
                # the merged database, so it is computed once per field/target.
                table_name = '_'.join([essential_field, target])
                view_name = essential_field if target == 'splits' else None

                for input_profile_db_path in self.profile_dbs_info_dict:
                    sample_id = self.profile_dbs_info_dict[
                        input_profile_db_path]['sample_id']
                    self.progress.update(
                        f"Reading '{essential_field}' of '{target}' in '{sample_id}'"
                    )

                    profile_db = db.DB(input_profile_db_path,
                                       utils.get_required_version_for_db(
                                           input_profile_db_path),
                                       skip_rowid_prepend=True)
                    try:
                        view_data_for_sample = profile_db.get_all_rows_from_table(
                            table_name)
                    finally:
                        # always release the connection, even when the read fails.
                        profile_db.disconnect()

                    self.progress.update(
                        f"Writing '{essential_field}' of '{target}' in '{sample_id}'"
                    )
                    TablesForViews(self.merged_profile_db_path,
                                   progress=self.progress) \
                                        .create_new_view(view_data=view_data_for_sample,
                                                         table_name=table_name,
                                                         append_mode=True,
                                                         view_name=view_name,
                                                         skip_sanity_check=True)

        # if SNVs were not profiled, remove all entries from variability tables:
        if not self.SNVs_profiled:
            TablesForViews(self.merged_profile_db_path).remove(
                view_name='variability',
                table_names_to_blank=[
                    'variability_splits', 'variability_contigs'
                ])

        self.progress.end()
Beispiel #4
0
    def process_gene_clusters(self, gene_clusters_dict):
        """Summarize gene clusters per genome, filter them by occurrence, and store the views.

        `gene_clusters_dict` maps a gene cluster name to an iterable of gene
        entries, each of which has a 'genome_name' key. For every gene cluster
        this fills `self.view_data` (number of genes per genome),
        `self.view_data_presence_absence` (0/1 per genome) and
        `self.additional_view_data` (summary numbers), and updates the
        'singleton_gene_clusters' and 'num_gene_clusters_raw' counters kept in
        `self.genomes`.

        Gene clusters found in fewer than `self.gene_cluster_min_occurrence`
        genomes are then removed from all three dictionaries and from
        `gene_clusters_dict` itself, which is modified in place. The surviving
        data are stored in the pan database.

        Returns the (possibly reduced) `gene_clusters_dict`.
        """
        self.progress.new('Generating view data')
        self.progress.update('...')

        gene_clusters = list(gene_clusters_dict.keys())

        # per-genome counters start from zero on every call
        for genome_name in self.genomes:
            genome_entry = self.genomes[genome_name]
            genome_entry['singleton_gene_clusters'] = 0
            genome_entry['num_gene_clusters_raw'] = 0

        for gene_cluster in gene_clusters:
            frequencies = dict.fromkeys(self.genomes, 0)
            presence_absence = dict.fromkeys(self.genomes, 0)
            summary = {'num_genes_in_gene_cluster': 0, 'num_genomes_gene_cluster_has_hits': 0, 'SCG': 0, 'max_num_paralogs': 0}

            self.view_data[gene_cluster] = frequencies
            self.view_data_presence_absence[gene_cluster] = presence_absence
            self.additional_view_data[gene_cluster] = summary

            for gene_entry in gene_clusters_dict[gene_cluster]:
                genome_name = gene_entry['genome_name']

                frequencies[genome_name] += 1
                presence_absence[genome_name] = 1
                summary['num_genes_in_gene_cluster'] += 1
                self.genomes[genome_name]['num_gene_clusters_raw'] += 1

            contributing_genomes = [name for name, is_present in presence_absence.items() if is_present]

            # a gene cluster with genes from a single genome is a singleton of that genome
            if len(contributing_genomes) == 1:
                self.genomes[contributing_genomes[0]]['singleton_gene_clusters'] += 1

            genes_per_genome = frequencies.values()

            # flagged as SCG only when every genome has exactly one gene in the cluster
            summary['SCG'] = int(set(genes_per_genome) == {1})
            summary['max_num_paralogs'] = max(genes_per_genome)
            summary['num_genomes_gene_cluster_has_hits'] = sum(1 for num_genes in genes_per_genome if num_genes > 0)

        self.progress.end()

        ########################################################################################
        #                           FILTERING BASED ON OCCURRENCE
        ########################################################################################
        removed_gene_clusters = 0
        for gene_cluster in gene_clusters:
            if self.additional_view_data[gene_cluster]['num_genomes_gene_cluster_has_hits'] >= self.gene_cluster_min_occurrence:
                continue

            # this one occurs in too few genomes: drop it everywhere, including the input dict
            for container in (self.view_data, self.view_data_presence_absence, self.additional_view_data, gene_clusters_dict):
                container.pop(gene_cluster)

            removed_gene_clusters += 1

        if self.gene_cluster_min_occurrence > 1:
            self.run.info('gene_clusters min occurrence', '%d (the filter removed %d gene_clusters)' % (self.gene_cluster_min_occurrence, removed_gene_clusters))

        ########################################################################################
        #            CAN WE CLUSTER THIS STUFF? DOES THE USER WANT US TO TRY REGARDLESS?
        ########################################################################################
        too_many_gene_clusters = len(gene_clusters_dict) > self.max_num_gene_clusters_for_hierarchical_clustering

        if too_many_gene_clusters:
            if self.enforce_hierarchical_clustering:
                self.run.warning("You have %s gene_clusters, which exceeds the number of gene_clusters anvi'o is comfortable to cluster. But\
                                  since you have used the flag `--enforce-hierarchical-clustering`, anvi'o will attempt\
                                  to create a hierarchical clustering of your gene_clusters anyway. It may take a bit of \
                                  time. Pour yourself a coffee. Or go to a nice vacation. See you in 10 mins, or next year \
                                  or never." % pp(len(gene_clusters_dict)))
            else:
                self.run.warning("It seems you have %s gene clusters in your pangenome. This exceeds the soft limit\
                                  of %s for anvi'o to attempt to create a hierarchical clustering of your gene clusters\
                                  (which becomes the center tree in all anvi'o displays). If you want a hierarchical\
                                  clustering to be done anyway, please see the flag `--enforce-hierarchical-clustering`." \
                                            % (pp(len(gene_clusters_dict)), pp(self.max_num_gene_clusters_for_hierarchical_clustering)))
                self.skip_hierarchical_clustering = True

        ########################################################################################
        #                           STORING FILTERED DATA IN THE DB
        ########################################################################################
        table_structure = ['gene_cluster'] + sorted(self.genomes.keys())
        table_types = ['text'] + ['numeric'] * len(self.genomes)

        # both views use their view name as the table name, too.
        views_to_store = (
            ('gene_cluster_frequencies', self.view_data),
            ('gene_cluster_presence_absence', self.view_data_presence_absence),
        )

        for view_name, view_data_dict in views_to_store:
            TablesForViews(self.pan_db_path).create_new_view(
                data_dict=view_data_dict,
                table_name=view_name,
                table_structure=table_structure,
                table_types=table_types,
                view_name=view_name)

        # `skip_check_names=True` is passed below because, per the original note,
        # the gene clusters table has not been generated at this point, while the
        # name check in dbops looks for that table. Skipping the check is fine
        # here since the gene cluster names are exactly the ones processed above.
        item_additional_data_keys = ['num_genomes_gene_cluster_has_hits', 'num_genes_in_gene_cluster', 'max_num_paralogs', 'SCG']
        item_additional_data_table = miscdata.TableForItemAdditionalData(self.args)
        item_additional_data_table.add(self.additional_view_data, item_additional_data_keys, skip_check_names=True)

        ########################################################################################
        #                   RETURN THE -LIKELY- UPDATED GENE CLUSTERS DICT
        ########################################################################################
        return gene_clusters_dict
Beispiel #5
0
    def _run(self):
        """Run the profiling workflow for a single sample.

        Checks arguments, sets the sample id, initializes directories and
        databases, and reports the run parameters. Then one of two paths is
        taken:

        - `self.blank` is true: a mock profile is initialized and an
          `atomic_data_splits` table is stored in which every field of every
          split is `None`.
        - `self.input_file_path` is set: the profile is initialized from the BAM
          file and `self.profile()` is called.

        Afterwards layer additional data are stored (if any), contigs are
        clustered (if requested), the BAM handle is closed (if there is one),
        and the run is finalized.

        Raises:
            ConfigError: if the profile is neither blank nor has an input file.
        """
        self.check_args()

        self.set_sample_id()

        self.init_dirs_and_dbs()

        # the conditional is spelled out to make clear that the whole formatted
        # message (and not just the length) is replaced with None when there is
        # no description.
        if self.description:
            description_summary = 'Found (%d characters)' % len(self.description)
        else:
            description_summary = None

        self.run.log_file_path = self.generate_output_destination('RUNLOG.txt')
        self.run.info('anvio', anvio.__version__)
        self.run.info('profiler_version', anvio.__profile__version__)
        self.run.info('sample_id', self.sample_id)
        self.run.info('description', description_summary)
        self.run.info('profile_db', self.profile_db_path, display_only=True)
        self.run.info('contigs_db', bool(self.contigs_db_path))
        self.run.info('contigs_db_hash', self.a_meta['contigs_db_hash'])
        self.run.info('cmd_line', utils.get_cmd_line())
        self.run.info('merged', False)
        self.run.info('blank', self.blank)
        self.run.info('split_length', self.a_meta['split_length'])
        self.run.info('min_contig_length', self.min_contig_length)
        self.run.info('min_mean_coverage', self.min_mean_coverage)
        self.run.info('clustering_performed', self.contigs_shall_be_clustered)
        self.run.info('min_coverage_for_variability',
                      self.min_coverage_for_variability)
        self.run.info('skip_SNV_profiling', self.skip_SNV_profiling)
        self.run.info('profile_SCVs', self.profile_SCVs)
        self.run.info('report_variability_full', self.report_variability_full)

        self.run.warning("Your minimum contig length is set to %s base pairs. So anvi'o will not take into\
                          consideration anything below that. If you need to kill this an restart your\
                          analysis with another minimum contig length value, feel free to press CTRL+C." \
                                                % (pp(self.min_contig_length)))

        # this is kinda important. we do not run the full-blown profile function if we are
        # dealing with a blank profile...
        if self.blank:
            self.init_mock_profile()

            # creating a null view_data_splits dict. every split gets its OWN dictionary
            # of `field -> None`, so that a later change to the entry of one split can
            # never leak into the entries of the others.
            null_fields = t.atomic_data_table_structure[1:]
            view_data_splits = {
                split_name: dict.fromkeys(null_fields)
                for split_name in self.split_names
            }

            TablesForViews(self.profile_db_path).remove(
                'single', table_names_to_blank=['atomic_data_splits'])
            TablesForViews(self.profile_db_path).create_new_view(
                data_dict=view_data_splits,
                table_name='atomic_data_splits',
                table_structure=t.atomic_data_table_structure,
                table_types=t.atomic_data_table_types,
                view_name='single')
        elif self.input_file_path:
            self.init_profile_from_BAM()
            self.profile()
        else:
            raise ConfigError(
                "What are you doing? :( Whatever it is, anvi'o will have none of it."
            )

        # update layer additional data table content
        if self.layer_additional_data:
            layer_additional_data_table = TableForLayerAdditionalData(
                argparse.Namespace(profile_db=self.profile_db_path))
            layer_additional_data_table.add(
                {self.sample_id: self.layer_additional_data},
                self.layer_additional_keys)

        if self.contigs_shall_be_clustered:
            self.cluster_contigs()

        if self.bam:
            self.bam.close()

        self.run.quit()