Beispiel #1
0
    def add_ECG_EAG_ratio_per_gene_cluster_into_pan_database(self):
        """Tally gene statuses per gene cluster and store them in the pan database.

        For every gene cluster in `self.pan_summary.gene_clusters`, each gene is looked up
        in the dictionary returned by `self.get_gene_presence_in_the_environment_dict()`
        and counted under its status label ('EAG', 'ECG', or 'NA'). The three counts, plus
        the fraction EAG / (EAG + ECG) (0 when both counts are zero), are then handed to
        `TableForItemAdditionalData` as additional data for that gene cluster.

        Note that the fraction is stored under the key 'EAG_ECG_ratio', and that this
        method sets `self.args.just_do_it` to True before writing.

        Raises:
            ConfigError: if the `bin_id` of a genome found in a gene cluster has no entry
                in the gene presence dictionary.
        """
        if not self.pan_summary:
            self.init_pan_summary()

        presence_by_genome = self.get_gene_presence_in_the_environment_dict()

        self.progress.new('Working on ECG/EAG ratio per gene cluster')
        self.progress.update('...')

        status_labels = ['EAG', 'ECG', 'NA']
        status_counts_by_cluster = {}

        gene_clusters = self.pan_summary.gene_clusters
        cluster_names = list(gene_clusters.keys())
        total = len(cluster_names)
        for position, cluster_name in enumerate(cluster_names, start=1):
            # report progress as the percentage of gene clusters visited so far
            self.progress.update('%.2f' % (position * 100 / total))

            genomes_in_cluster = gene_clusters[cluster_name]
            counts = dict.fromkeys(status_labels, 0)
            for internal_genome_name in genomes_in_cluster:
                genome_name = self.descriptions.genomes[internal_genome_name]['bin_id']

                for gene_caller_id in genomes_in_cluster[internal_genome_name]:
                    # the check lives inside the gene loop on purpose: a genome that
                    # contributes no genes to this cluster never triggers the error.
                    if genome_name not in presence_by_genome:
                        self.progress.end()
                        raise ConfigError("Something is wrong... It seems you generated a pangenome with an internal genomes file\
                                           that is not identical to the internal genomes file you are using to run this program.")

                    gene_status = presence_by_genome[genome_name][gene_caller_id]
                    counts[gene_status] += 1

            status_counts_by_cluster[cluster_name] = counts

        # names of the additional data keys: one per status label, plus the ratio.
        ratio_key = 'EAG_ECG_ratio'
        frequency_keys = ['%s!%s' % ('ECGs_and_EAGs', label) for label in status_labels]

        self.progress.update('Setting up the items data dictionary ..')
        items_additional_data_dict = {}
        for cluster_name, counts in status_counts_by_cluster.items():
            # status frequencies for the gene cluster
            entry = {key: counts[label] for key, label in zip(frequency_keys, status_labels)}

            # EAG / (EAG + ECG); fall back to 0 when there is nothing to divide by
            eag_plus_ecg = counts['EAG'] + counts['ECG']
            entry[ratio_key] = counts['EAG'] / eag_plus_ecg if eag_plus_ecg else 0

            items_additional_data_dict[cluster_name] = entry

        self.progress.end()

        # write everything into the database
        self.args.just_do_it = True
        items_additional_data_keys = frequency_keys + [ratio_key]
        TableForItemAdditionalData(self.args).add(items_additional_data_dict, items_additional_data_keys)
Beispiel #2
0
    def check_for_db_requests(self, config):
        """Export database tables requested in `config` to temporary files.

        Each section name returned by `self.get_other_sections(config)` is split on
        whitespace into an alias and a matrix. A matrix that contains '::' is read as
        'database::table':

          - if it starts with '!', the database name (without the '!') must be a key of
            `self.db_paths`, whose value is used as the database path;
          - otherwise the path is taken from `self.db_paths` when the name is a key there,
            or the name itself is treated as a path; if that path does not exist, the name
            is tried relative to `self.input_directory`.

        The table contents are written to a temporary file, and the file path is stored
        in `self.matrix_paths` under the alias. Sections whose matrix has no '::' are
        left alone.

        Raises:
            ConfigError: if a '!' database is missing from `self.db_paths`, the database
                file does not exist, the table is not in the database, the table has no
                rows (after filtering), or row ids of interest are combined with a table
                declared to be in 'dataframe' form.
        """
        for section in self.get_other_sections(config):
            alias, matrix = section.split()

            if '::' in matrix:
                database, table_name = matrix.split('::')

                if matrix.startswith('!'):
                    # '!name' refers to an entry in the db_paths dictionary; drop the '!'
                    database = database[1:]

                    if database not in self.db_paths:
                        raise ConfigError('anvio could not recover the actual path of the database\
                                            (!%s) referenced in the config file, because the database\
                                            paths variable sent from the client does not have an entry\
                                            for it :( There are two options. One is to get a db_paths\
                                            dictionary sent to this class that contains a key for %s\
                                            with the full path to the dataase as a value. Or the table\
                                            "%s" can be exported to a TAB-delimited matrix and declared in\
                                            the config file. If you are experimenting and stuck here, please\
                                            see the documentation or send an e-mail to the developers.'\
                                                                                % (database, database, table_name))
                    database_path = self.db_paths[database]
                else:
                    if database in self.db_paths:
                        database_path = os.path.abspath(self.db_paths[database])
                    else:
                        database_path = os.path.abspath(database)

                    # last resort: look for it inside the input directory
                    if not os.path.exists(database_path):
                        database_path = os.path.abspath(os.path.join(self.input_directory, database))

                if not os.path.exists(database_path):
                    raise ConfigError("The database you requested (%s) is not where it was supposed to be ('%s') :/" % (database, database_path))

                dbc = db.DB(database_path, None, ignore_version=True)

                if table_name not in dbc.get_table_names():
                    raise ConfigError('The table you requested (%s) does not seem to be in %s :/' % (table_name, database))

                # at this point the table is known to exist. database tables come in two forms, though:
                # dataframe form (key/value pairs) and matrix form (MxN, one column per attribute). the
                # matrix form is easy to export as something the clustering module can work with, while
                # the dataframe form needs extra attention, so the recipe can declare the form through
                # the 'table_form' option. background: https://github.com/merenlab/anvio/issues/662
                table_form = config.get(section, 'table_form') if config.has_option(section, 'table_form') else None

                table_rows = dbc.get_all_rows_from_table(table_name)

                row_ids = self.row_ids_of_interest
                if row_ids:
                    if table_form == 'dataframe':
                        raise ConfigError("Oops .. anvi'o does not know how to deal with specific row ids of interest when a table\
                                           refernced from a clustering recipe is in dataframe form :(")
                    # keep only the rows whose first column is one of the requested ids
                    table_rows = [row for row in table_rows if row[0] in row_ids]

                if len(table_rows) == 0:
                    raise ConfigError("It seems the table '%s' in the database it was requested from is empty. This\
                                        is not good. Here is the section that is not working for you: '%s' :/" \
                                                                % (table_name, section))

                tmp_file_path = filesnpaths.get_temp_file_path()

                # how the contents get written depends on the form of the table
                if table_form == 'dataframe':
                    args = argparse.Namespace(pan_or_profile_db=database_path, table_name=table_name)
                    additional_data_table = TableForItemAdditionalData(args)
                    _, table_data_dict = additional_data_table.get()
                    store_dict_as_TAB_delimited_file(table_data_dict, tmp_file_path)
                else:
                    table_structure = dbc.get_table_structure(table_name)
                    excluded_columns = [column for column in ['entry_id', 'sample_id'] if column in table_structure]
                    store_array(table_rows, tmp_file_path, table_structure, exclude_columns=excluded_columns)

                self.matrix_paths[alias] = tmp_file_path