Example 1
    def cluster_contigs_anvio(self):
        # clustering of contigs is done for each configuration file under the static/clusterconfigs/merged
        # directory; at this point we don't care what those recipes really require, because we have
        # already merged and generated every data file that may be required.

        self.run.info_single("Anvi'o hierarchical clustering of contigs...", nl_before=1, nl_after=1, mc="blue")

        if not self.skip_hierarchical_clustering:
            dbops.do_hierarchical_clustering_of_items(self.merged_profile_db_path, self.clustering_configs, self.split_names, self.database_paths, \
                                                      input_directory=self.output_directory, default_clustering_config=constants.merged_default, \
                                                      distance=self.distance, linkage=self.linkage, run=self.run, progress=self.progress)
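For context, the call above boils down to building an item-by-sample matrix from the clustering recipe and running agglomerative clustering over it with the requested distance metric and linkage method. Below is a minimal, standalone sketch of that core step using scipy; the matrix, its values, and the euclidean/ward choices are illustrative stand-ins, not anvi'o internals.

import numpy as np
from scipy.cluster.hierarchy import linkage, leaves_list
from scipy.spatial.distance import pdist

# rows are items (e.g. contig splits), columns are per-sample signals (e.g. coverages):
items = np.array([[0.2, 1.1, 0.9],
                  [0.3, 1.0, 1.0],
                  [5.0, 0.1, 0.2],
                  [4.8, 0.2, 0.1]])

# pairwise distances under a chosen metric, then agglomerative clustering under a
# chosen linkage -- these play the role of the `distance` and `linkage` arguments above:
condensed = pdist(items, metric='euclidean')
dendrogram = linkage(condensed, method='ward')

# the leaf order is what decides which items end up drawn next to each other:
print(leaves_list(dendrogram))  # e.g. [0 1 2 3] -- similar rows become neighbors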
Example 2
    def gen_hierarchical_clustering_of_gene_clusters(self):
        """Uses a clustering configuration to add hierarchical clustering of gene clusters into the pan db

        Note how this function cheats the system to create an enchanced clustering configuration:
        We want to use the clustering configurations for pan genomomic analyses to order
        gene clusters. however, we want to add something into the clustering configuraiton
        file, which depends on the number of genomes we have. this addition is 'num_genomes_gene_cluster_has_hits'
        data, which pulls together gene clusters that are distributed across genomes similarly based
        on this extra bit of inofrmation. becasue the clustering configurations framework in anvi'o
        does not allow us to have variable information in these recipes, we are going to generate one
        on the fly to have a more capable one."""

        if self.skip_hierarchical_clustering:
            return

        updated_clustering_configs = {}

        for config_name in constants.clustering_configs['pan']:
            config_path = constants.clustering_configs['pan'][config_name]

            # now we have the config path. we first get a temporary file path:
            enhanced_config_path = filesnpaths.get_temp_file_path()

            # set up the additional section based on the number of genomes we have:
            if config_name == 'presence-absence':
                additional_config_section = """\n[AdditionalData !PAN.db::item_additional_data]\ntable_form=dataframe\ncolumns_to_use = %s\nnormalize = False\n""" \
                                        % ','.join(['num_genomes_gene_cluster_has_hits'] * (int(round(len(self.genomes) / 2))))
            elif config_name == 'frequency':
                additional_config_section = """\n[AdditionalData !PAN.db::item_additional_data]\ntable_form=dataframe\ncolumns_to_use = %s\nnormalize = False\nlog=True\n""" \
                                        % ','.join(['num_genes_in_gene_cluster'] * (int(round(math.sqrt(len(self.genomes))))))
            else:
                # any other config is used unmodified (this also avoids a NameError below):
                additional_config_section = ''

            # append the additional section to the original recipe and write the result
            # to the temporary path:
            with open(config_path) as source, open(enhanced_config_path, 'w') as enhanced:
                enhanced.write(source.read() + additional_config_section)

            # update the clustering configs:
            updated_clustering_configs[config_name] = enhanced_config_path

            dbops.do_hierarchical_clustering_of_items(self.pan_db_path, updated_clustering_configs, database_paths={'PAN.db': self.pan_db_path},\
                                                      input_directory=self.output_dir, default_clustering_config=constants.pan_default,\
                                                      distance=self.distance, linkage=self.linkage, run=self.run, progress=self.progress)
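Two details in this method are worth unpacking. First, repeating a column name in columns_to_use is a weighting trick: every repetition adds an identical dimension to the distance computation, so the repeated column pulls items together (or apart) proportionally harder. Second, because the static recipes cannot carry run-dependent values such as the number of genomes, the method concatenates an extra section onto a copy of the recipe written to a temporary path. A self-contained sketch of that pattern follows; the base config body and the genome count are made up for illustration, though the appended section name mirrors the one in the real recipes.

import tempfile

base_config = """[general]
name = presence-absence
"""

num_genomes = 9  # stand-in for len(self.genomes)

# repeat the column name to weight it in the distance computation, as the method
# above does with 'num_genomes_gene_cluster_has_hits':
weighted_columns = ','.join(['num_genomes_gene_cluster_has_hits'] * (num_genomes // 2))

additional_section = (
    "\n[AdditionalData !PAN.db::item_additional_data]\n"
    "table_form=dataframe\n"
    "columns_to_use = %s\n"
    "normalize = False\n" % weighted_columns
)

# write base + addition to a temporary file whose path can then be handed to the
# clustering machinery in place of the static recipe:
with tempfile.NamedTemporaryFile('w', suffix='.ini', delete=False) as f:
    f.write(base_config + additional_section)
    enhanced_config_path = f.name

print(enhanced_config_path)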
Example 3
    def cluster_contigs(self):
        default_clustering_config = constants.blank_default if self.blank else constants.single_default

        dbops.do_hierarchical_clustering_of_items(self.profile_db_path, self.clustering_configs, self.split_names, self.database_paths, \
                                                  input_directory=self.output_directory, default_clustering_config=default_clustering_config, \
                                                  distance=self.distance, linkage=self.linkage, run=self.run, progress=self.progress)
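Whatever configuration is chosen, the resulting clustering ends up stored in the profile database as a Newick-formatted tree, which is what the interactive interface reads back to order items. If you need the same kind of artifact outside of anvi'o, a scipy linkage result can be rendered as Newick with a short recursion; the data and split names below are made up for illustration.

import numpy as np
from scipy.cluster.hierarchy import linkage, to_tree
from scipy.spatial.distance import pdist

def to_newick(node, names):
    # recursively render a scipy ClusterNode as a Newick string:
    if node.is_leaf():
        return names[node.id]
    left = to_newick(node.get_left(), names)
    right = to_newick(node.get_right(), names)
    return '(%s,%s)' % (left, right)

data = np.array([[0.0, 1.0], [0.1, 1.1], [5.0, 5.0]])
names = ['split_001', 'split_002', 'split_003']

tree = to_tree(linkage(pdist(data), method='ward'))
print(to_newick(tree, names) + ';')  # e.g. (split_003,(split_001,split_002)); -- rotation may vary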
Example 4
    def do_profile_db(self):
        # are we working with a merged profile database?
        merged = self.summary.p_meta['merged']
        self.run.info('Merged database', 'True' if merged else 'False')

        self.progress.new('Splitting "%s"' % self.bin_id)
        self.progress.update('Subsetting the %s profile database' % ('merged' if merged else 'single'))

        bin_profile_db = dbops.ProfileDatabase(self.bin_profile_db_path)
        bin_profile_db.touch()

        # copy-paste tables that will largely stay the same from the parent
        bin_profile_db.db.copy_paste(table_name='self', source_db_path=self.profile_db_path)
        bin_profile_db.db.copy_paste(table_name='views', source_db_path=self.profile_db_path)
        bin_profile_db.db.copy_paste(table_name='states', source_db_path=self.profile_db_path)

        # update some values
        bin_profile_db.db.update_meta_value('contigs_db_hash', self.contigs_db_hash)
        bin_profile_db.db.update_meta_value('available_clusterings', None)
        bin_profile_db.db.update_meta_value('sample_id', self.bin_id)

        # set up the filtering rules for migrating data:
        tables = {}

        # this is to deal with the merged atomic data tables that are stored in merged profiles.
        # they are created on the fly during merging, so bin_profile_db.touch() did not create
        # them, and we have to do it here ourselves. while creating them in the target db, we
        # will also populate the tables dictionary for data migration:
        sample_names = self.summary.p_meta['samples']
        if merged:
            for table_name in t.atomic_data_table_structure[1:-1]:
                for target in ['splits', 'contigs']:
                    new_table_name = '_'.join([table_name, target])
                    new_table_structure = ['contig'] + sample_names + ['__parent__']
                    new_table_types = ['text'] + ['numeric'] * len(sample_names) + ['text']
                    bin_profile_db.db.create_table(new_table_name, new_table_structure, new_table_types)

                    tables[new_table_name] = ('contig', self.split_names)
        else:
            profile_db = dbops.ProfileDatabase(self.profile_db_path)
            table_structure = profile_db.db.get_table_structure('atomic_data_contigs')
            table_types = profile_db.db.get_table_column_types('atomic_data_contigs')
            for table_name in ['atomic_data_splits', 'atomic_data_contigs']:
                # both atomic data tables share the structure and types read above:
                bin_profile_db.db.create_table(table_name, table_structure, table_types)

                tables[table_name] = ('contig', self.split_names)


        # we need to migrate these tables, too. unless we don't need to: if we are migrating,
        # the values in the self table are already accurate; if we are skipping, we will set
        # the correct values explicitly regardless of what they were.
        if self.skip_variability_tables:
            bin_profile_db.db.update_meta_value('SNVs_profiled', False)
            bin_profile_db.db.update_meta_value('SCVs_profiled', False)
        else:
            tables[t.variable_nts_table_name] = ('split_name', self.split_names)
            tables[t.variable_codons_table_name] = ('corresponding_gene_call', self.gene_caller_ids)

        bin_profile_db.disconnect()

        self.migrate_data(tables, self.profile_db_path, self.bin_profile_db_path)

        self.progress.end()

        if not self.skip_hierarchical_clustering:
            dbops.do_hierarchical_clustering_of_items(self.bin_profile_db_path, constants.clustering_configs['merged' if merged else 'single'], self.split_names, \
                                                      self.database_paths, input_directory=self.bin_output_directory, \
                                                      default_clustering_config=constants.merged_default, distance=self.distance, \
                                                      linkage=self.linkage, run=terminal.Run(verbose=False), progress=self.progress)

        # add a collection
        collection_dict = {'ALL_SPLITS': self.split_names}
        bins_info_dict = {'ALL_SPLITS': {'html_color': '#FF0000', 'source': 'anvi-split'}}
        collections = TablesForCollections(self.bin_profile_db_path)
        collections.append('DEFAULT', collection_dict, bins_info_dict=bins_info_dict)
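The heavy lifting after the table setup happens in migrate_data, which, judging from the filter tuples it receives, copies only the rows that belong to the bin. The essence of that filter-while-copying step can be shown with nothing but the sqlite3 standard library; the table name, columns, and values below are made up for illustration.

import sqlite3

src = sqlite3.connect(':memory:')
src.execute('CREATE TABLE atomic_data_splits (contig text, mean_coverage numeric)')
src.executemany('INSERT INTO atomic_data_splits VALUES (?, ?)',
                [('split_001', 8.2), ('split_002', 7.9), ('split_999', 0.1)])

dst = sqlite3.connect(':memory:')
dst.execute('CREATE TABLE atomic_data_splits (contig text, mean_coverage numeric)')

# the bin's items, playing the role of self.split_names above:
split_names = ('split_001', 'split_002')

# filter on the key column while copying, mirroring tables[name] = ('contig', split_names):
placeholders = ','.join('?' * len(split_names))
rows = src.execute('SELECT * FROM atomic_data_splits WHERE contig IN (%s)' % placeholders,
                   split_names).fetchall()
dst.executemany('INSERT INTO atomic_data_splits VALUES (?, ?)', rows)

print(dst.execute('SELECT * FROM atomic_data_splits').fetchall())
# [('split_001', 8.2), ('split_002', 7.9)]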