Example #1
    def append(self, collection_name, collection_dict, bins_info_dict=None, drop_collection=True):
        # a mutable default argument would be shared across calls (and this function
        # adds entries to bins_info_dict below), so default to None and create a
        # fresh dict per call:
        if bins_info_dict is None:
            bins_info_dict = {}

        utils.is_this_name_OK_for_database('collection name', collection_name, stringent=False)

        for bin_name in collection_dict:
            utils.is_this_name_OK_for_database('bin name', bin_name, stringent=False)

        if bins_info_dict:
            if set(collection_dict.keys()) - set(bins_info_dict.keys()):
                raise ConfigError('Bins in the collection dict do not match the ones in the bins info dict. '
                                   'They do not have to be identical, but for each bin id, there must be a unique '
                                   'entry in the bins information dict. There is something wrong with your input :/')

        if drop_collection:
            # remove any pre-existing information for 'collection_name'
            self.delete(collection_name)

        num_splits_in_collection_dict = sum([len(splits) for splits in list(collection_dict.values())])
        splits_in_collection_dict = set(list(chain.from_iterable(list(collection_dict.values()))))
        if len(splits_in_collection_dict) != num_splits_in_collection_dict:
            raise ConfigError("TablesForCollections::append: %d of the split or contig IDs appear more than once in "
                               "your collections input. It is unclear to anvi'o how did you manage to do this, but we "
                               "cannot go anywhere with this :/" % (num_splits_in_collection_dict - len(splits_in_collection_dict)))

        database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))

        # how many clusters are defined in 'collection_dict'?
        bin_names = list(collection_dict.keys())

        if drop_collection:
            db_entries = tuple([collection_name, num_splits_in_collection_dict, len(bin_names), ','.join(bin_names)])
            database._exec('''INSERT INTO %s VALUES (?,?,?,?)''' % t.collections_info_table_name, db_entries)

        if not bins_info_dict:
            colors = utils.get_random_colors_dict(bin_names)
            for bin_name in bin_names:
                bins_info_dict[bin_name] = {'html_color': colors[bin_name], 'source': 'UNKNOWN'}

        # populate bins info table.
        db_entries = [(collection_name, b, bins_info_dict[b]['source'], bins_info_dict[b]['html_color']) for b in bin_names]
        database._exec_many('''INSERT INTO %s VALUES (?,?,?,?)''' % t.collections_bins_info_table_name, db_entries)

        # populate splits table
        db_entries = []
        for bin_name in collection_dict:
            for split_name in collection_dict[bin_name]:
                db_entries.append(tuple([collection_name, split_name, bin_name]))
        database._exec_many('''INSERT INTO %s VALUES (?,?,?)''' % t.collections_splits_table_name, db_entries)
        num_splits = len(db_entries)

        # FIXME: This function can be called to populate the contigs database (via anvi-populate-collections), or
        # the profile database. When it is the contigs database, the superclass Table has the self.splits_info variable
        # set when it is initialized. However, the Table instance is missing self.splits_info when it is initialized with
        # the profile database. Hence some special controls for the contigs db (note that collections_contigs_table is
        # only populated in the contigs database):
        if self.db_type == 'contigs':
            splits_only_in_collection_dict = [c for c in splits_in_collection_dict if c not in self.splits_info]
            splits_only_in_db = [c for c in self.splits_info if c not in splits_in_collection_dict]

            if len(splits_only_in_collection_dict):
                self.run.warning('%d of %d splits found in "%s" results are not in the database. This may be OK,\
                                          but you must be the judge of it. If this is somewhat surprising, please use caution\
                                          and make sure all is fine before going forward with your analysis.'\
                                                % (len(splits_only_in_collection_dict), len(splits_in_collection_dict), collection_name))

            if len(splits_only_in_db):
                self.run.warning('%d of %d splits found in the database were missing from the "%s" results. If this '
                                         'does not make any sense, please make sure you know why before going any further.'\
                                                % (len(splits_only_in_db), len(self.splits_info), collection_name))

            # then populate contigs table.
            db_entries = self.process_contigs(collection_name, collection_dict)
            database._exec_many('''INSERT INTO %s VALUES (?,?,?,?)''' % t.collections_contigs_table_name, db_entries)

        database.disconnect()

        num_bins = len(bin_names)
        num_bins_to_report = 50
        if not drop_collection:
            bins_to_report = bin_names
            bin_report_msg = "Here is a full list of the bin names added to this collection: {}.".format(",".join(bins_to_report))
        elif num_bins <= num_bins_to_report:
            bins_to_report = bin_names
            bin_report_msg = "Here is a full list of the bin names in this collection: {}.".format(",".join(bins_to_report))
        else:
            bins_to_report = bin_names[:num_bins_to_report]
            bin_report_msg = "Here is a list of the first {} bin names in this collection: {}.".format(num_bins_to_report, ",".join(bins_to_report))

        if drop_collection:
            self.run.info('Collections', 'The collection "%s" that describes %s splits and %s bins has been successfully added to the\
                                          database at "%s". %s' % (collection_name, pp(num_splits), pp(num_bins), self.db_path, bin_report_msg), mc='green')
        else:
            self.run.info('Collections', 'The existing collection "%s" has been updated: %s splits and %s bins were successfully added to the\
                                          database at "%s". %s' % (collection_name, pp(num_splits), pp(num_bins), self.db_path, bin_report_msg), mc='green')
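For context, here is a hedged, self-contained sketch of the inputs this method expects; the bin names, split names, colors, and the `tables_for_collections` instance are all hypothetical:

# Hypothetical input for append(). Every bin id in collection_dict may have a
# matching entry in bins_info_dict; colors are auto-generated when it is omitted.
collection_dict = {'Bin_1': ['split_001', 'split_002'],
                   'Bin_2': ['split_003']}
bins_info_dict = {'Bin_1': {'html_color': '#3b8686', 'source': 'CONCOCT'},
                  'Bin_2': {'html_color': '#f38630', 'source': 'CONCOCT'}}
# tables_for_collections.append('MY_COLLECTION', collection_dict, bins_info_dict)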
Example #2
    def init_references_txt(self):
        if self.references_mode:
            try:
                filesnpaths.is_file_exists(self.fasta_txt_file)
            except FilesNPathsError as e:
                raise ConfigError(
                    'In references mode you must supply a fasta_txt file.')

        if not self.references_mode:
            # if it is reference mode then the group names have been assigned in the contigs Snakefile
            # if it is not reference mode and no groups are supplied in the samples_txt then group names are sample names
            self.group_names = self.sample_names

        if self.fasta_txt_file and not self.references_mode:
            raise ConfigError(
                "In order to use reference fasta files you must set\
                           \"'references_mode': true\" in your config file, yet\
                           you didn't, but at the same time you supplied the following\
                           fasta_txt: %s. So we don't know what to do with this\
                           fasta_txt" % self.fasta_txt_file)

        # Collecting information regarding groups.
        if "group" in self.samples_information.columns:
            # if groups were specified then members of a groups will be co-assembled.
            self.group_names = list(self.samples_information['group'].unique())
            # creating a dictionary with groups as keys and number of samples in
            # the groups as values
            self.group_sizes = self.samples_information['group'].value_counts(
            ).to_dict()

            if self.references_mode:
                # sanity check to see that groups specified in samples.txt match
                # the names of fasta.
                mismatch = set(self.group_names) - set(
                    self.contigs_information.keys())
                if mismatch:
                    raise ConfigError(
                        "Group names specified in the samples.txt \
                                       file must match the names of fasta \
                                       in the fasta.txt file. These are the \
                                       mismatches: %s" % mismatch)
                groups_in_contigs_information_but_not_in_samples_txt = set(
                    self.contigs_information.keys()) - set(self.group_names)
                if groups_in_contigs_information_but_not_in_samples_txt:
                    run.warning(
                        'The following group names appear in your fasta_txt\
                                 but do not appear in your samples_txt: %s. Maybe this is\
                                 ok with you, but we thought you should know. This means\
                                 that the metagenomics workflow will simply ignore these\
                                 groups.' % ', '.join(groups_in_contigs_information_but_not_in_samples_txt))

        else:
            if self.references_mode:
                # if the user didn't provide a group column in the samples.txt,
                # in references mode the default is 'all_against_all'.
                run.warning("No groups were provided in your samples_txt,\
                             hence 'all_against_all' mode has been automatically\
                             set to True.")
                self.set_config_param('all_against_all', True)
            else:
                # if no groups were specified then each sample would be assembled
                # separately
                run.warning(
                    "No groups were specified in your samples_txt. This is fine.\
                             But we thought you should know. Any assembly will be performed\
                             on individual samples (i.e. NO co-assembly).")
                self.samples_information['group'] = self.samples_information[
                    'sample']
                self.group_names = list(self.sample_names)
                self.group_sizes = dict.fromkeys(self.group_names, 1)

        if self.get_param_value_from_config('all_against_all',
                                            repress_default=True):
            # in all_against_all, the size of each group is as big as the number
            # of samples.
            self.group_sizes = dict.fromkeys(self.group_names,
                                             len(self.sample_names))

        if not self.references_mode and not (self.get_param_value_from_config(
            ['anvi_script_reformat_fasta', 'run']) == True):
            # in assembly mode (i.e. not in references mode) we always have
            # to run reformat_fasta. The only reason for this is that
            # the megahit output is temporary, and if we dont run
            # reformat_fasta we will delete the output of meghit at the
            # end of the workflow without saving a copy.
            raise ConfigError("You can't skip reformat_fasta in assembly mode \
                                please change your config.json file")
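A minimal, self-contained sketch of the pandas idioms used above for deriving group names and sizes; the sample and group names are made up:

import pandas as pd

# hypothetical samples_txt content with a 'group' column that drives co-assembly
samples_information = pd.DataFrame({'sample': ['s01', 's02', 's03'],
                                    'group':  ['G01', 'G01', 'G02']})

group_names = list(samples_information['group'].unique())            # ['G01', 'G02']
group_sizes = samples_information['group'].value_counts().to_dict()  # {'G01': 2, 'G02': 1}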
Example #3
    def get_gene_presence_in_the_environment_dict(self):
        if not isinstance(self.fraction_of_median_coverage, float):
            raise ConfigError(
                "Fraction of median coverage must of type `float`.")

        if not isinstance(self.min_detection, float):
            raise ConfigError("Minimum detection must be of type `float`")

        self.run.info('Fraction of median coverage for core genes',
                      self.fraction_of_median_coverage)
        self.run.info('Min detection of a genome in at least one metagenome',
                      self.min_detection)

        self.progress.new('Working on gene presence/absence')
        self.progress.update('...')

        gene_presence_in_the_environment_dict = {}
        for profile_db_path in self.unique_profile_db_path_to_internal_genome_name:
            self.progress.update('Collecting info from profile db at %s ...' %
                                 (profile_db_path))
            summary = self.get_summary_object_for_profile_db(profile_db_path)

            for internal_genome_name in self.unique_profile_db_path_to_internal_genome_name[
                    profile_db_path]:
                genome_name = self.descriptions.genomes[internal_genome_name][
                    'bin_id']

                self.progress.update(
                    'Working on genome %s in profile db %s ...' %
                    (internal_genome_name, profile_db_path))

                # for each genome, first we will see whether it is detected in at least one metagenome
                detection_across_metagenomes = summary.collection_profile[
                    genome_name]['detection']
                metagenomes_above_min_detection = [
                    m for m in detection_across_metagenomes
                    if detection_across_metagenomes[m] > self.min_detection
                ]
                not_enough_detection = len(metagenomes_above_min_detection) == 0

                gene_presence_in_the_environment_dict[genome_name] = {}
                split_names_of_interest = self.descriptions.get_split_names_of_interest_for_internal_genome(
                    self.descriptions.genomes[internal_genome_name])

                genome_bin_summary = summarizer.Bin(summary, genome_name,
                                                    split_names_of_interest)
                gene_coverages_across_samples = utils.get_values_of_gene_level_coverage_stats_as_dict(
                    genome_bin_summary.gene_level_coverage_stats_dict,
                    "mean_coverage")

                # at this point we have all the genes in the genome bin. what we need is to characterize their detection. first,
                # summarize the coverage of each gene in all samples:
                sum_gene_coverages_across_samples = dict([
                    (gene_callers_id,
                     sum(gene_coverages_across_samples[gene_callers_id].values(
                     ))) for gene_callers_id in gene_coverages_across_samples
                ])

                # now we will identify the median coverage
                median_coverage_across_samples = numpy.median(
                    list(sum_gene_coverages_across_samples.values()))

                # now we will decide whether a gene found in this genome is also found in the environment, store that
                # information in `gene_presence_in_the_environment_dict`, and move on to the next stage.
                for gene_caller_id in sum_gene_coverages_across_samples:
                    if not_enough_detection:
                        _class = 'NA'
                    elif sum_gene_coverages_across_samples[
                            gene_caller_id] < median_coverage_across_samples * self.fraction_of_median_coverage:
                        _class = 'EAG'
                    else:
                        _class = 'ECG'

                    gene_presence_in_the_environment_dict[genome_name][
                        gene_caller_id] = _class

        self.progress.end()

        return gene_presence_in_the_environment_dict
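The classification above reduces to a threshold on summed gene coverage; here is a self-contained sketch of the rule with made-up numbers:

# Hypothetical summed coverages across samples for three genes.
sum_gene_coverages_across_samples = {0: 120.0, 1: 10.0, 2: 55.0}
median_coverage_across_samples = 55.0    # numpy.median of the values above
fraction_of_median_coverage = 0.25

# a gene is 'EAG' below fraction * median, and 'ECG' otherwise; it would be 'NA'
# if the genome failed min_detection in every metagenome.
classes = {gene: ('EAG' if cov < median_coverage_across_samples * fraction_of_median_coverage else 'ECG')
           for gene, cov in sum_gene_coverages_across_samples.items()}
# {0: 'ECG', 1: 'EAG', 2: 'ECG'}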
Example #4
    def check_params(self):
        # check the project name:
        if not self.project_name:
            raise ConfigError(
                "Please set a project name, and be prepared to see it around, as anvi'o will use\
                                that name to set the output directory and to name various output files, such as the\
                                databases that will be generated at the end of the process. If you set your own output\
                                directory name, you can have multiple projects in it and all of those projects can use\
                                the same intermediate files whenever possible."
            )

        utils.is_this_name_OK_for_database('pan project name',
                                           self.project_name,
                                           stringent=False)

        # if the user did not set a specific output directory name, use the project name
        # for it:
        self.output_dir = self.output_dir if self.output_dir else self.project_name

        # deal with the output directory:
        try:
            filesnpaths.is_file_exists(self.output_dir)
        except FilesNPathsError:
            filesnpaths.gen_output_directory(
                self.output_dir,
                delete_if_exists=self.overwrite_output_destinations)

        filesnpaths.is_output_dir_writable(self.output_dir)
        self.output_dir = os.path.abspath(self.output_dir)

        if not self.log_file_path:
            self.log_file_path = self.get_output_file_path('log.txt')

        filesnpaths.is_output_file_writable(self.log_file_path)
        if os.path.exists(self.log_file_path):
            os.remove(self.log_file_path)

        if not isinstance(self.minbit, float):
            raise ConfigError("minbit value must be of type float :(")

        if self.minbit < 0 or self.minbit > 1:
            raise ConfigError(
                "Well. minbit must be between 0 and 1. Yes. Very boring.")

        if not isinstance(self.min_percent_identity, float):
            raise ConfigError(
                "Minimum percent identity value must be of type float :(")

        if self.min_percent_identity < 0 or self.min_percent_identity > 100:
            raise ConfigError(
                "Minimum percent identity must be between 0%% and 100%%. Although your %.2f%% is\
                               pretty cute, too." % self.min_percent_identity)

        if len(
            [c for c in list(self.genomes.values())
             if 'genome_hash' not in c]):
            raise ConfigError(
                "self.genomes does not seem to be a properly formatted dictionary for\
                               the anvi'o class Pangenome.")

        if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
            raise ConfigError(
                "You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering\
                               while also asking it to enforce it.")

        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            self.description = open(
                os.path.abspath(self.description_file_path), 'r').read()

        self.pan_db_path = self.get_output_file_path(self.project_name +
                                                     '-PAN.db')
Example #5
 def check(self, aligner):
     if aligner not in self.aligners:
         raise ConfigError(
             "Sorry, anvi'o knows nothing of the aligner '%s'. Nice try though :/"
             % aligner)
Example #6
    def check_for_db_requests(self, config):
        sections = self.get_other_sections(config)
        # look for requests from the database, create temporary tab delimited files:
        for section in sections:
            alias, matrix = section.split()
            if matrix.find('::') > -1:
                if matrix.startswith('!'):
                    database, table = matrix.split('::')
                    database = database[1:]

                    if database not in self.db_paths:
                        raise ConfigError(
                            f"Ehem. Anvi'o could not recover the actual path of the database (!{database}) referenced in the config file, "
                            f"because the database paths variable sent from the client does not have an entry for it :( There are two "
                            f"options. One is to get a `db_paths` dictionary sent to this class that contains a key for {database} with "
                            f"the full path to the dataase as a value. Or the table '{table}' can be exported to a TAB-delimited matrix "
                            f"and declared in the config file. If you are experimenting and got stuck here, we like you. Please send an "
                            f"e-mail to the developers.")

                    database_path = self.db_paths[database]
                else:
                    database, table = matrix.split('::')
                    database_path = os.path.abspath(
                        self.db_paths[database]
                    ) if database in self.db_paths else os.path.abspath(
                        database)

                    # if it's not there, let's try one more thing
                    if not os.path.exists(database_path):
                        database_path = os.path.abspath(
                            os.path.join(self.input_directory, database))

                if not os.path.exists(database_path):
                    raise ConfigError(
                        "The database you requested (%s) is not where it was supposed to be ('%s') :/"
                        % (database, database_path))

                dbc = db.DB(database_path, None, ignore_version=True)

                if table not in dbc.get_table_names():
                    raise ConfigError(
                        'The table you requested (%s) does not seem to be in %s :/'
                        % (table, database))

                # here we know we are working with a database table that we have access to. however, anvi'o database
                # tables come in two forms: dataframe form, and matrix form. in dataframe form, we have key/value pairs rather
                # than MxN matrices where each N is a column for an attribute. while the latter is easier to export as a
                # matrix the clustering module can work with, the former requires extra attention. so here we need to first
                # figure out which form the table is in. why did this even become necessary? taking a look at this issue
                # may help: https://github.com/merenlab/anvio/issues/662
                table_form = None
                if config.has_option(section, 'table_form'):
                    table_form = config.get(section, 'table_form')

                if self.row_ids_of_interest:
                    column_name = dbc.get_table_structure(table)[0]
                    where_clause = """%s IN (%s)""" % (column_name, ','.join(
                        ['"%s"' % _ for _ in self.row_ids_of_interest]))
                    table_rows = dbc.get_some_rows_from_table(
                        table, where_clause=where_clause)
                else:
                    table_rows = dbc.get_all_rows_from_table(table)

                if self.row_ids_of_interest:
                    if table_form == 'dataframe':
                        raise ConfigError(
                            "Oops .. anvi'o does not know how to deal with specific row ids of interest when a table "
                            "refernced from a clustering recipe is in dataframe form :("
                        )
                    table_rows = [
                        r for r in table_rows
                        if r[0] in self.row_ids_of_interest
                    ]

                if not len(table_rows):
                    raise ConfigError("It seems the table '%s' in the database it was requested from is empty. This "
                                       "is not good. Here is the section that is not working for you: '%s' :/" \
                                                                % (table, section))

                tmp_file_path = filesnpaths.get_temp_file_path()

                # time to differentially store table contents.
                if table_form == 'dataframe':
                    args = argparse.Namespace(pan_or_profile_db=database_path,
                                              table_name=table)
                    table = TableForItemAdditionalData(args)
                    table_keys_list, table_data_dict = table.get()
                    store_dict_as_TAB_delimited_file(table_data_dict,
                                                     tmp_file_path)
                else:
                    table_structure = dbc.get_table_structure(table)
                    columns_to_exclude = [
                        c for c in ['entry_id', 'sample_id']
                        if c in table_structure
                    ]
                    store_array(table_rows,
                                tmp_file_path,
                                table_structure,
                                exclude_columns=columns_to_exclude)

                self.matrix_paths[alias] = tmp_file_path

                dbc.disconnect()
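For reference, a hedged sketch of the kind of config section this parser handles: the section name holds '<alias> <matrix>', and a '!'-prefixed 'database::table' value requests the table from a database rather than a file. All names below are hypothetical:

import configparser

config = configparser.ConfigParser()
config.read_string("""
[kmers !CONTIGS.db::kmer_contigs]
table_form = matrix
""")

section = config.sections()[0]        # 'kmers !CONTIGS.db::kmer_contigs'
alias, matrix = section.split()       # 'kmers', '!CONTIGS.db::kmer_contigs'
database, table = matrix.split('::')  # '!CONTIGS.db', 'kmer_contigs'
database = database[1:]               # 'CONTIGS.db'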
Example #7
    def init(self):
        """This function is called from within the snakefile to initialize parameters."""
        super().init()

        self.run_iu_merge_pairs = self.get_param_value_from_config(
            ['iu_merge_pairs', 'run'])
        self.gzip_iu_merge_pairs_output = self.get_param_value_from_config(
            ['iu_merge_pairs', '--gzip-output'])
        self.run_anvi_reformat_fasta = self.get_param_value_from_config(
            ['anvi_reformat_fasta', 'run'])
        self.gzip_anvi_reformat_fasta_output = self.get_param_value_from_config(
            ['anvi_reformat_fasta', '--gzip-output'])
        self.run_anvi_trnaseq = self.get_param_value_from_config(
            ['anvi_trnaseq', 'run'])
        self.run_anvi_convert_trnaseq_database = self.get_param_value_from_config(
            ['anvi_convert_trnaseq_database', 'run'])
        self.run_anvi_run_trna_taxonomy = self.get_param_value_from_config(
            ['anvi_run_trna_taxonomy', 'run'])

        # Load table of sample info from samples_txt (sample names, treatments, paths to r1 and r2,
        # r1 and r2 prefixes).
        self.samples_txt_file = self.get_param_value_from_config(
            ['samples_txt'])
        filesnpaths.is_file_exists(self.samples_txt_file)
        try:
            # An error will subsequently be raised in `check_samples_txt` if there is no header.
            self.sample_info = pd.read_csv(self.samples_txt_file,
                                           sep='\t',
                                           index_col=False)
        except IndexError as e:
            raise ConfigError(
                "The samples_txt file, '%s', does not appear to be properly formatted. "
                "This is the error from trying to load it: '%s'" %
                (self.samples_txt_file, e))
        self.check_samples_txt()

        self.sample_names = self.sample_info['sample'].tolist()
        if 'treatment' in self.sample_info.columns:
            # The treatment is specified for each sample in samples_txt.
            self.treatments = self.sample_info['treatment'].tolist()
        else:
            # The treatment is the same for each sample and is set in the config file.
            self.treatments = [
                self.get_param_value_from_config(['anvi_trnaseq', 'treatment'])
            ] * len(self.sample_names)
        if self.run_iu_merge_pairs:
            self.r1_paths = self.sample_info['r1'].tolist()
            self.r2_paths = self.sample_info['r2'].tolist()
            self.r1_prefixes = self.get_r1_prefixes()
            self.r2_prefixes = self.get_r2_prefixes()
            self.fasta_paths = None
        else:
            self.r1_paths = None
            self.r2_paths = None
            self.r1_prefixes = None
            self.r2_prefixes = None
            self.fasta_paths = self.sample_info['fasta'].tolist()

        self.target_files = self.get_target_files()
Example #8
    def init_dirs_and_dbs(self):
        if not self.contigs_db_path:
            raise ConfigError(
                "You can not run profiling without a contigs database. You can create\
                                one using 'anvi-gen-contigs-database'. Not sure how? Please see the\
                                tutorial: http://merenlab.org/2015/05/02/anvio-tutorial/"
            )

        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            self.description = open(
                os.path.abspath(self.description_file_path), 'r').read()

        self.output_directory = filesnpaths.check_output_directory(self.output_directory or self.input_file_path + '-ANVIO_PROFILE',\
                                                                   ok_if_exists=self.overwrite_output_destinations)

        self.progress.new('Initializing')

        self.progress.update('Creating the output directory ...')
        filesnpaths.gen_output_directory(
            self.output_directory,
            self.progress,
            delete_if_exists=self.overwrite_output_destinations)

        self.progress.update(
            'Creating a new single profile database with contigs hash "%s" ...'
            % self.a_meta['contigs_db_hash'])
        self.profile_db_path = self.generate_output_destination('PROFILE.db')
        profile_db = dbops.ProfileDatabase(self.profile_db_path)

        if self.skip_SNV_profiling:
            self.profile_SCVs = False

        meta_values = {
            'db_type': 'profile',
            'anvio': __version__,
            'sample_id': self.sample_id,
            'samples': self.sample_id,
            'merged': False,
            'blank': self.blank,
            'contigs_ordered': self.contigs_shall_be_clustered,
            'default_view': 'single',
            'min_contig_length': self.min_contig_length,
            'max_contig_length': self.max_contig_length,
            'SNVs_profiled': not self.skip_SNV_profiling,
            'SCVs_profiled': self.profile_SCVs,
            'min_coverage_for_variability': self.min_coverage_for_variability,
            'report_variability_full': self.report_variability_full,
            'contigs_db_hash': self.a_meta['contigs_db_hash'],
            'description': self.description if self.description else '_No description is provided_'
        }
        profile_db.create(meta_values)

        self.progress.update(
            'Creating a new auxiliary database with contigs hash "%s" ...' %
            self.a_meta['contigs_db_hash'])
        self.auxiliary_db_path = self.generate_output_destination(
            'AUXILIARY-DATA.db')
        self.auxiliary_db = auxiliarydataops.AuxiliaryDataForSplitCoverages(
            self.auxiliary_db_path,
            self.a_meta['contigs_db_hash'],
            create_new=True)

        self.progress.end()

        if self.skip_SNV_profiling:
            self.run.warning(
                'Single-nucleotide variation will not be characterized for this profile.'
            )

        if not self.profile_SCVs:
            self.run.warning(
                'Amino acid linkmer frequencies will not be characterized for this profile.'
            )
Example #9
    def _run(self):
        self.check_args()

        self.set_sample_id()

        self.init_dirs_and_dbs()

        self.run.log_file_path = self.generate_output_destination('RUNLOG.txt')
        self.run.info('anvio', anvio.__version__)
        self.run.info('profiler_version', anvio.__profile__version__)
        self.run.info('sample_id', self.sample_id)
        self.run.info(
            'description', 'Found (%d characters)' %
            len(self.description) if self.description else None)
        self.run.info('profile_db', self.profile_db_path, display_only=True)
        self.run.info('contigs_db', True if self.contigs_db_path else False)
        self.run.info('contigs_db_hash', self.a_meta['contigs_db_hash'])
        self.run.info('cmd_line', utils.get_cmd_line())
        self.run.info('merged', False)
        self.run.info('blank', self.blank)
        self.run.info('split_length', self.a_meta['split_length'])
        self.run.info('min_contig_length', self.min_contig_length)
        self.run.info('max_contig_length', self.max_contig_length)
        self.run.info('min_mean_coverage', self.min_mean_coverage)
        self.run.info('clustering_performed', self.contigs_shall_be_clustered)
        self.run.info('min_coverage_for_variability',
                      self.min_coverage_for_variability)
        self.run.info('skip_SNV_profiling', self.skip_SNV_profiling)
        self.run.info('profile_SCVs', self.profile_SCVs)
        self.run.info('report_variability_full', self.report_variability_full)

        self.run.warning("Your minimum contig length is set to %s base pairs. So anvi'o will not take into\
                          consideration anything below that. If you need to kill this and restart your\
                          analysis with another minimum contig length value, feel free to press CTRL+C." \
                                                % (pp(self.min_contig_length)))

        if self.max_contig_length < sys.maxsize:
            self.run.warning(
                "Your maximum contig length is set to %s base pairs. Which means anvi'o will remove\
            any contigs that are longer than this value." %
                pp(self.max_contig_length))

        # this is kinda important. we do not run the full-blown profile function if we are dealing with a blank
        # profile...
        if self.blank:
            self.init_mock_profile()

            # creating a null view_data_splits dict:
            null_atomic_data = dict(zip(t.atomic_data_table_structure[1:],
                                        [None] * len(t.atomic_data_table_structure[1:])))
            view_data_splits = dict(zip(self.split_names,
                                        [null_atomic_data] * len(self.split_names)))
            TablesForViews(self.profile_db_path).remove(
                'single', table_names_to_blank=['atomic_data_splits'])
            TablesForViews(self.profile_db_path).create_new_view(
                data_dict=view_data_splits,
                table_name='atomic_data_splits',
                table_structure=t.atomic_data_table_structure,
                table_types=t.atomic_data_table_types,
                view_name='single')
        elif self.input_file_path:
            self.init_profile_from_BAM()
            self.profile()
        else:
            raise ConfigError(
                "What are you doing? :( Whatever it is, anvi'o will have none of it."
            )

        # update layer additional data table content
        if self.layer_additional_data:
            layer_additional_data_table = TableForLayerAdditionalData(
                argparse.Namespace(profile_db=self.profile_db_path),
                r=self.run,
                p=self.progress)
            layer_additional_data_table.add(
                {self.sample_id: self.layer_additional_data},
                self.layer_additional_keys)

        if self.contigs_shall_be_clustered:
            self.cluster_contigs()

        if self.bam:
            self.bam.close()

        self.run.quit()
Example #10
    def add(self, data_dict, data_keys_list, skip_check_names=False):
        """Function to add data into the item additional data table.

           * `data_dict`: a dictionary for items or layers additional data; it should follow this format:

                d = {
                        'item_or_layer_name_01': {'data_key_01': value,
                                                  'data_key_02': value,
                                                  'data_key_03': value
                                                  },
                        'item_or_layer_name_02': {'data_key_01': value,
                                                  'data_key_03': value,
                                                  },
                        (...)
                    }

           * `data_keys_list`: is a list of keys one or more of which should appear for each item
                               in `data_dict`.
        """

        self.data_dict_sanity_check(data_dict, data_keys_list=data_keys_list)

        if self.target not in ['items', 'layers']:
            raise ConfigError(
                "You are using an AdditionalDataBaseClass instance to add %s data into your %s database. But\
                               you know what? You can't do that :/ Someone made a mistake somewhere. If you are a user,\
                               check your flags to make sure you are targeting the right data table. If you are a programmer,\
                               you are fired." % (self.target, self.db_type))

        self.run.warning(None,
                         'New %s additional data...' % self.target,
                         lc="yellow")
        key_types = {}
        for key in data_keys_list:
            if '!' in key:
                predicted_key_type = "stackedbar"
            else:
                type_class = utils.get_predicted_type_of_items_in_a_dict(
                    data_dict, key)
                predicted_key_type = type_class.__name__ if type_class else None

            key_types[key] = predicted_key_type
            self.run.info('Data key "%s"' % key, 'Predicted type: %s' % (key_types[key]), \
                                            nl_after = 1 if key == data_keys_list[-1] else 0)

        # we be responsible here.
        keys_already_in_db = [
            c for c in data_keys_list if c in self.additional_data_keys
        ]
        if len(keys_already_in_db):
            if self.just_do_it:
                self.run.warning(
                    'The following keys in your data dict will replace the ones that are already\
                                  in your %s database: %s.' %
                    (self.db_type, ', '.join(keys_already_in_db)))

                self.remove(keys_already_in_db)
            else:
                run.info('Data keys already in the db',
                         ', '.join(keys_already_in_db),
                         nl_before=2,
                         mc='red')

                raise ConfigError(
                    "Some of the data keys in your new data appear to be in the database already. If you\
                                   want to replace those in the database with the ones in your new data use the\
                                   `--just-do-it` flag, and watch anvi'o make an exception just for you and complain\
                                   about nothin' for this once.")

        if skip_check_names:
            self.run.warning(
                "You (or the programmer) asked anvi'o to NOT check the consistency of the names of your %s\
                              between your additional data and the %s database you are attempting to update. So be it.\
                              Anvi'o will not check anything, but if things don't look the way you expected them to look,\
                              you will not blame anvi'o for your poorly prepared data, but choose between yourself or\
                              Obama." % (self.target, self.db_type))
        else:
            if self.target == 'layers':
                TableForLayerAdditionalData.check_names(self, data_dict)
            elif self.target == 'items':
                TableForItemAdditionalData.check_names(self, data_dict)
            else:
                raise ConfigError(
                    "Congratulations, you managed to hit an uncharted are in anvi'o. It is cerrtainly very\
                                   curious how you got here unless you are trying to implement a new functionality."
                )

        db_entries = []
        self.set_next_available_id(self.table_name)
        for item_name in data_dict:
            for key in data_keys_list:
                db_entries.append(
                    tuple([
                        self.next_id(self.table_name), item_name, key,
                        data_dict[item_name][key], key_types[key]
                    ]))

        database = db.DB(self.db_path,
                         utils.get_required_version_for_db(self.db_path))
        database._exec_many(
            '''INSERT INTO %s VALUES (?,?,?,?,?)''' % self.table_name,
            db_entries)
        database.disconnect()

        self.run.info('New data added to the db for your %s' % self.target,
                      '%s.' % (', '.join(data_keys_list)),
                      nl_after=1)
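A hedged sketch of inputs matching the docstring above; the item names, keys, and the `items_table` instance are made up (a '!' in a key marks it as a stacked-bar type, as the code above shows):

data_dict = {'split_001': {'gc_content': 0.42, 'taxonomy!A;B': '12;3'},
             'split_002': {'gc_content': 0.61, 'taxonomy!A;B': '7;9'}}
data_keys_list = ['gc_content', 'taxonomy!A;B']
# items_table.add(data_dict, data_keys_list)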
Example #11
    def remove(self, data_keys_list):
        '''Give this guy a list of keys for additional data, and watch their demise.'''

        if not isinstance(data_keys_list, list):
            raise ConfigError(
                "The remove function in AdditionalDataBaseClass wants you to watch\
                               yourself before you wreck yourself. In other words, can you please\
                               make sure the keys you send come in a `list` thankyouverymuch?"
            )

        database = db.DB(self.db_path,
                         utils.get_required_version_for_db(self.db_path))

        additional_data_keys = sorted(
            database.get_single_column_from_table(self.table_name,
                                                  'data_key',
                                                  unique=True))

        if not len(additional_data_keys):
            self.run.info_single(
                'There is nothing to remove -- the %s additional data table is already empty :('
                % self.target)
            database.disconnect()

            return

        missing_keys = [
            k for k in data_keys_list if k not in additional_data_keys
        ]
        if len(missing_keys) and not self.just_do_it:
            database.disconnect()
            raise ConfigError(
                "The following keys you wanted to remove from the items additional data table are\
                               not really in the table: '%s'. Anvi'o is confused :/"
                % (', '.join(missing_keys)))

        if data_keys_list:
            for key in data_keys_list:
                if key not in additional_data_keys:
                    # what the hell, user?
                    continue

                database._exec('''DELETE from %s WHERE data_key="%s"''' %
                               (self.table_name, key))

            self.run.warning(
                "%s data for the following keys removed from the database: '%s'. #SAD."
                % (self.target, ', '.join(data_keys_list)))
        else:
            if not self.just_do_it:
                raise ConfigError(
                    "You did not provide a list of data keys to remove, which means you are about to delete everything in the\
                                   %s additional data table. Just to be on the safe side, anvi'o is looking for a confirmation. If you\
                                   try again with the --just-do-it flag, anvi'o will put on its business socks, and burn this table\
                                   and everything in it to the ground." %
                    self.target)

            database._exec('''DELETE from %s''' % (self.table_name))

            self.run.warning(
                "All data from the %s additional data table is removed (ouch)."
                % self.target)

        database.disconnect()
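Usage is symmetric with `add`; a hedged sketch (the `items_table` instance and the key name are hypothetical):

# items_table.remove(['gc_content'])   # deletes one key from the table
# items_table.remove([])               # refuses to wipe the table without --just-do-it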
Example #12
    def add(self, data_dict, skip_check_names=False):
        """
            The default function to add data into the orders table.

             * `data_dict`: this variable for layer orders is expected to follow this format:

                  d = {
                          'data_key_01': {'data_type': 'newick',
                                          'data_value': '(item_or_layer_name_01:0.0370199,(item_or_layer_name_02:0.0227268,item_or_layer_name_01:0.0227268)Int3:0.0370199);'
                                          },
                          'data_key_02': {'data_type': 'basic',
                                          'data_value': 'item_or_layer_name_02,item_or_layer_name_01,item_or_layer_name_03'
                                          },
                          (...)
                  }
        """

        self.data_dict_sanity_check(data_dict)

        if self.target not in ['layer_orders']:
            raise ConfigError(
                "You are using an OrderDataBaseClass instance to add %s data into your %s database. This is\
                               illegal and if you are here, it means someone made a mistake somewhere. If you are a user,\
                               check your flags to make sure you are targeting the right data table. If you are a programmer,\
                               you are fired." % (self.target, self.db_type))

        self.run.warning(None, 'New %s data...' % self.target, lc="yellow")
        data_keys_list = list(data_dict.keys())
        data_key_types = {}
        for key in data_keys_list:
            predicted_key_type = data_dict[key]['data_type']

            data_key_types[key] = predicted_key_type
            self.run.info('Data key "%s"' % key, 'Type: %s' % (data_key_types[key]), \
                                            nl_after = 1 if key == data_keys_list[-1] else 0)

        # we be responsible here.
        keys_already_in_db = [
            c for c in data_keys_list if c in self.additional_data_keys
        ]
        if len(keys_already_in_db):
            if self.just_do_it:
                self.run.warning(
                    'The following keys in your data dict will replace the ones that are already\
                                  in your %s database: %s.' %
                    (self.db_type, ', '.join(keys_already_in_db)))

                self.remove(keys_already_in_db)
            else:
                run.info('Data keys already in the db',
                         ', '.join(keys_already_in_db),
                         nl_before=2,
                         mc='red')

                raise ConfigError(
                    "Some of the keys in your new order data appear to be in the database already. If you\
                                   want to replace those in the database with the ones in your new data use the\
                                   `--just-do-it` flag.")

        if skip_check_names:
            self.run.warning(
                "You (or the programmer) asked anvi'o to NOT check the consistency of the names of your %s\
                              between your additional data and the %s database you are attempting to update. So be it.\
                              Anvi'o will not check anything, but if things don't look the way you expected them to look,\
                              you will not blame anvi'o for your poorly prepared data, but choose between yourself or\
                              Obama." % (self.target, self.db_type))
        else:
            TableForLayerOrders.check_names(self, data_dict)

        db_entries = []
        for data_key in data_dict:
            db_entries.append(
                tuple([
                    data_key, data_dict[data_key]['data_type'],
                    data_dict[data_key]['data_value']
                ]))

        database = db.DB(self.db_path,
                         utils.get_required_version_for_db(self.db_path))
        database._exec_many(
            '''INSERT INTO %s VALUES (?,?,?)''' % self.table_name, db_entries)
        database.disconnect()

        self.run.info('New order data added to the db for %s' % self.target,
                      '%s.' % (', '.join(data_keys_list)))
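A hedged sketch of a `data_dict` for this method, following the docstring; the order names, sample names, and the `layer_orders_table` instance are made up:

data_dict = {'tree_of_samples': {'data_type': 'newick',
                                 'data_value': '(s01:0.01,(s02:0.01,s03:0.01):0.01);'},
             'alphabetical':    {'data_type': 'basic',
                                 'data_value': 's01,s02,s03'}}
# layer_orders_table.add(data_dict)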
Example #13
                    'APP_DIRS': False,
                }
            ]
        })
    except ImportError:
        local_settings.update({'TEMPLATE_DIRS': (template_dir,)})

    try:
        import django
    except ImportError:
        pass

    from django.template.loader import render_to_string
    from django.template.defaultfilters import register
except ImportError:
    raise ConfigError('You need to have Django module (http://djangoproject.com) installed on your system to generate HTML output.')

# It seems this really wants to be here in the global context :/ 
settings.configure(**local_settings)
django.setup()


__author__ = "Developers of anvi'o (see AUTHORS.txt)"
__copyright__ = "Copyleft 2015-2018, the Meren Lab (http://merenlab.org/)"
__credits__ = []
__license__ = "GPL 3.0"
__version__ = anvio.__version__
__maintainer__ = "A. Murat Eren"
__email__ = "*****@*****.**"
__status__ = "Development"
Example #14
def migrate(db_path):
    if db_path is None:
        raise ConfigError("No database path is given.")

    utils.is_profile_db(db_path)

    profile_db = db.DB(db_path, None, ignore_version=True)
    is_merged = profile_db.get_meta_value('merged')
    is_blank = profile_db.get_meta_value('blank')
    sample_name = profile_db.get_meta_value('sample_id')
    tables_in_db = profile_db.get_table_names()
    profile_db.disconnect()

    is_full_profile = 'mean_coverage_splits' in tables_in_db or 'atomic_data_splits' in tables_in_db

    run.info('Profile db type', 'Merged' if is_merged else 'Single')
    run.info('Full profile', is_full_profile)
    run.info('Is blank', is_blank)

    progress.new("Durr Durr")
    progress.update('...')

    if is_blank:
        ########################
        #     BLANK PROFILE    #
        ########################

        pass

    elif is_full_profile and not is_merged:
        #########################
        #     SINGLE PROFILE    #
        #########################
        profile_db = db.DB(db_path, None, ignore_version=True)

        # remove the default view variable in self, and add it back with 'mean_coverage'
        profile_db.remove_meta_key_value_pair('default_view')
        profile_db.set_meta_value('default_view', 'mean_coverage')

        for target in ['splits', 'contigs']:
            # get rid of the hideous view called 'single'.
            profile_db._exec('''DELETE FROM views WHERE view_id = "single"''')

            atomic_data = profile_db.get_table_as_dict(f'atomic_data_{target}')

            for view in essential_data_fields_for_anvio_profiles:
                table_name = f'{view}_{target}'

                # le creationeaux au de neuvo tabl
                profile_db._exec(
                    f'''CREATE TABLE {table_name} (item text, layer text, value numeric)'''
                )
                view_data = []
                for split_name in atomic_data:
                    view_data.append((split_name, sample_name,
                                      atomic_data[split_name][view]), )

                # populate the new view table
                profile_db._exec_many(
                    '''INSERT INTO %s VALUES (?,?,?)''' % (table_name),
                    view_data)

                # update the views table
                if target == 'splits':
                    profile_db._exec('''INSERT INTO views VALUES (?,?)''',
                                     (view, table_name))

        # баяртай
        profile_db.disconnect()

    elif is_full_profile and is_merged:
        #########################
        #     MERGED PROFILE    #
        #########################

        # open the profile database without rowid prepend.
        profile_db = db.DB(db_path,
                           None,
                           ignore_version=True,
                           skip_rowid_prepend=True)

        # learn your samples
        sample_names = [
            s.strip() for s in profile_db.get_meta_value('samples').split(',')
        ]

        # drop the contents of the view table.
        profile_db._exec("DELETE FROM views")

        for target in ['splits', 'contigs']:
            for view in essential_data_fields_for_anvio_profiles:
                table_name = f'{view}_{target}'

                progress.update(f"Working on table '{table_name} ...'")

                table_data = profile_db.get_table_as_dict(table_name)

                # drop the old view table
                profile_db._exec(f'DROP TABLE {table_name}')

                # create a new view table!
                profile_db._exec(
                    f'''CREATE TABLE {table_name} (item text, layer text, value numeric)'''
                )

                # fill in the new view data from the old format
                view_data = []
                for split_name in table_data:
                    for sample_name in sample_names:
                        view_data.append(
                            (split_name, sample_name,
                             table_data[split_name][sample_name]), )

                # populate new view table
                profile_db._exec_many(
                    '''INSERT INTO %s VALUES (?,?,?)''' % (table_name),
                    view_data)

                # if splits, I sits
                if target == 'splits':
                    profile_db._exec('''INSERT INTO views VALUES (?,?)''',
                                     (view, table_name))

        # さようなら
        profile_db.disconnect()

    else:
        ###########################
        #     SURPRISE PROFILE    #
        ###########################

        raise ConfigError(
            "Anvi'o is confuse. Your profile database does not fit into anything we have "
            "anticipated to run into here. For full disclosure, [the rest of the sentence "
            "was left blank intentionally just to drive you mad as you drive anvi'o mad -- "
            "eye for an eye].")

    # set the version
    profile_db = db.DB(db_path, None, ignore_version=True)
    profile_db.remove_meta_key_value_pair('version')
    profile_db.set_version(next_version)
    profile_db.disconnect()

    progress.end()
    run.info_single(
        f"The profile database is now {next_version}. This upgrade fixed one of the most annoying "
        f"early design decisions we have made (and when we say 'we', we actually mean 'Meren', and "
        f"the rest of us accept no blame for it). This design shortcoming prevented anvi'o to merge "
        f"more than 2,000 samples. The current update reflects a significant change in the structure "
        f"of the 'view' tables of anvi'o and not only removes this limitation, but also results in "
        f"significant speed and memory gains during `anvi-merge`. But this operation is similar to "
        f"changing the entire flooring of an apartment while having to make sure each piece of the "
        f"furniture put back to their place properly once the flooring is redone. The aim of this "
        f"migration script was to put the furniture back. If you are reading this message, you are "
        f"most likely ⭐",
        nl_after=1,
        nl_before=1,
        mc='green')
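The heart of this migration is a wide-to-long conversion of the view tables; a self-contained sketch with hypothetical values:

# old layout: one row per split, one column per sample
table_data = {'split_A': {'s1': 5, 's2': 9},
              'split_B': {'s1': 0, 's2': 2}}

# new layout: one (item, layer, value) row per split/sample pair
view_data = [(split_name, sample_name, table_data[split_name][sample_name])
             for split_name in table_data
             for sample_name in table_data[split_name]]
# [('split_A', 's1', 5), ('split_A', 's2', 9), ('split_B', 's1', 0), ('split_B', 's2', 2)]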
Example #15
def migrate(db_path):
    if db_path is None:
        raise ConfigError("No database path is given.")

    utils.check_h5py_module()
    import h5py

    fp = h5py.File(db_path, 'r')

    if int(fp.attrs['version']) != int(current_version):
        fp.close()
        raise ConfigError("Genome storage version is not %s." % current_version)

    old_storage_hash = str(fp.attrs['hash'])
    functions_are_available = fp.attrs['functions_are_available']

    run.info("Outdated genomes storage found (%s)" % old_storage_hash, db_path)

    genome_storage_db_path = db_path[:-3] + '.db'
    filesnpaths.is_output_file_writable(genome_storage_db_path, ok_if_exists=False)

    genomes_db = db.DB(genome_storage_db_path, next_version, new_database=True)
    genomes_db.create_table(genome_info_table_name, genome_info_table_structure, genome_info_table_types)
    genomes_db.create_table(gene_info_table_name, gene_info_table_structure, gene_info_table_types)
    genomes_db.create_table(genome_gene_function_calls_table_name, genome_gene_function_calls_table_structure, genome_gene_function_calls_table_types)
    genomes_db._exec("CREATE INDEX covering_index ON %s (gene_callers_id, genome_name);" % genome_gene_function_calls_table_name)

    genomes_db.set_meta_value('db_type', 'genomestorage')
    genomes_db.set_meta_value('creation_date', time.time())
    genomes_db.set_meta_value('hash', old_storage_hash)
    genomes_db.set_meta_value('functions_are_available', functions_are_available)

    I = lambda genome_name, key: fp['/info/genomes/%s/%s' % (genome_name, key)]

    genome_names = [d for d in fp['/info/genomes']]

    progress.new('Bleep bloop')
    progress.update('Adding genomes')
    genome_info_entries = []
    for genome_name in genome_names:
        values = (genome_name, )

        for column_name in genome_info_table_structure[1:]:
            # dirty workaround for backwards compatibility:
            # "percent_completion" may be "percent_complete" in some old genome storages,
            # because ozcan forgot to add that to the upgrade script :(
            if column_name == 'percent_completion' and '/info/genomes/%s/percent_completion' % genome_name not in fp:
                column_name = 'percent_complete'

            attr = I(genome_name, column_name)

            if attr.dtype == 'int64':
                values += (int(attr.value), )
            elif attr.dtype == 'float64':
                values += (float(attr.value), )
            else:
                values += ((attr.value), )

        genome_info_entries.append(values)
    genomes_db.insert_many(genome_info_table_name, entries=genome_info_entries)
    del genome_info_entries

    progress.update('Adding genes')
    gene_entries = []
    for genome_name in genome_names:
        for gene_callers_id in fp['/data/genomes/%s' % genome_name]:
            G = lambda key: fp['/data/genomes/%s/%s/%s' % (genome_name, gene_callers_id, key)].value
            gene_entries.append((genome_name, gene_callers_id, G('aa_sequence'), G('dna_sequence'), int(G('partial')), int(G('length')), ))
    genomes_db.insert_many(gene_info_table_name, entries=gene_entries)
    del gene_entries

    progress.update('Adding functions')
    functions_entries = []
    entry_id_counter = 0
    for genome_name in genome_names:
        for gene_callers_id in fp['/data/genomes/%s' % genome_name]:
            functions_path = '/data/genomes/%s/%s/functions' % (genome_name, gene_callers_id)
            if functions_path in fp:
                for source in fp[functions_path]:
                    annotation_list = str(fp['/data/genomes/%s/%s/functions/%s' % (genome_name, gene_callers_id, source)].value).split('|||')

                    functions_entries.append((genome_name, entry_id_counter, gene_callers_id, source, annotation_list[0], annotation_list[1], 0, ))
                    entry_id_counter += 1
    genomes_db.insert_many(genome_gene_function_calls_table_name, entries=functions_entries)

    genomes_db.disconnect()

    progress.end()

    os.remove(db_path)

    run.info_single("Your genomes storage is now at version %s. The new on is at %s, and anvi'o just removed\
                     the old one, which was at %s from your disk." % (next_version, genome_storage_db_path, db_path), nl_after=1, nl_before=1, mc='green')
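A caveat about the snippet above: the `.value` accessor it uses on h5py datasets was deprecated and then removed in h5py 3.0, so running this migration under a recent h5py would fail. The replacement is indexing with an empty tuple; a minimal sketch (the file and dataset paths are illustrative):

import h5py

# h5py >= 3.0: dataset[()] reads the full contents, replacing the removed dataset.value
with h5py.File('GENOMES.h5', 'r') as fp:
    dataset = fp['/info/genomes/G01/genome_hash']
    value = dataset[()]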
Example #16
0
    def __init__(self, args, r=terminal.Run(width=35), p=terminal.Progress()):
        self.args = args
        self.progress = p
        self.run = r

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.input_file_path = A('input_file')
        self.contigs_db_path = A('contigs_db')
        self.serialized_profile_path = A('serialized_profile')
        self.output_directory = A('output_dir')
        self.list_contigs_and_exit = A('list_contigs')
        self.min_contig_length = A('min_contig_length') or 0
        self.max_contig_length = A('max_contig_length') or sys.maxsize
        self.min_mean_coverage = A('min_mean_coverage')
        self.min_coverage_for_variability = A('min_coverage_for_variability')
        self.contigs_shall_be_clustered = A('cluster_contigs')
        self.skip_hierarchical_clustering = A('skip_hierarchical_clustering')
        self.sample_id = A('sample_name')
        self.report_variability_full = A('report_variability_full')
        self.overwrite_output_destinations = A('overwrite_output_destinations')
        self.skip_SNV_profiling = A('skip_SNV_profiling')
        self.profile_SCVs = A('profile_SCVs')
        self.gen_serialized_profile = A('gen_serialized_profile')
        self.distance = A('distance') or constants.distance_metric_default
        self.linkage = A('linkage') or constants.linkage_method_default
        self.num_threads = int(A('num_threads') or 1)
        self.queue_size = int(A('queue_size') if A('queue_size') is not None else 0)
        self.write_buffer_size = int(A('write_buffer_size') if A('write_buffer_size') is not None else 500)
        self.total_length_of_all_contigs = 0
        self.total_coverage_values_for_all_contigs = 0
        self.description_file_path = A('description')

        # make sure early on that both the distance and linkage are OK.
        clustering.is_distance_and_linkage_compatible(self.distance,
                                                      self.linkage)

        # whether the profile database is blank (without any BAM files or reads):
        self.blank = A('blank_profile')

        if not self.blank and self.contigs_shall_be_clustered and self.skip_hierarchical_clustering:
            raise ConfigError(
                "You are confused, and confusing anvi'o, too. You can't ask for hierarchical "
                "clustering to be performed with one flag, and try to skip it with another one :(")

        if self.blank and self.contigs_shall_be_clustered and self.skip_hierarchical_clustering:
            raise ConfigError(
                "So you want to generate a blank profile, and you both want hierarchical clustering "
                "of your contigs to be performed, and skipped. No.")

        if self.blank and self.contigs_shall_be_clustered:
            raise ConfigError(
                "When the blank profile is asked to be generated, there is no need to ask for the "
                "hierarchical clustering of contigs. It is going to be done by default. If it is "
                "not changing anything, why is anvi'o upset with you? Because. Let's not use flags "
                "we don't need.")

        if self.blank and not self.skip_hierarchical_clustering:
            self.contigs_shall_be_clustered = True

        if A('contigs_of_interest'):
            filesnpaths.is_file_exists(args.contigs_of_interest)
            self.contig_names_of_interest = set([c.strip() for c in open(args.contigs_of_interest).readlines()\
                                                                           if c.strip() and not c.startswith('#')])
        else:
            self.contig_names_of_interest = None

        if self.list_contigs_and_exit:
            self.list_contigs()
            sys.exit()

        if not self.contigs_db_path:
            raise ConfigError("No contigs database, no profilin'. Bye.")

        # Initialize contigs db
        dbops.ContigsSuperclass.__init__(self,
                                         self.args,
                                         r=self.run,
                                         p=self.progress)
        self.init_contig_sequences()
        self.contig_names_in_contigs_db = set(self.contigs_basic_info.keys())

        self.bam = None
        self.contigs = []

        self.database_paths = {
            'CONTIGS.db': os.path.abspath(self.contigs_db_path)
        }

        self.profile_db_path = None

        self.clustering_configs = constants.clustering_configs[
            'blank' if self.blank else 'single']

        # following variable will be populated during the profiling, and its content will eventually
        # be stored in t.variable_nts_table_name
        self.variable_nts_table_entries = []

        # if genes are not called, yet the user is asking for codon frequencies to be profiled, we give
        # a warning and force-turn that flag off.
        if (not self.a_meta['genes_are_called']) and self.profile_SCVs:
            self.run.warning(
                "You asked for codon frequencies to be profiled, but genes were not called "
                "for your contigs database. Anvi'o is assigning `False` to the profile-codon-frequencies "
                "flag, overruling your request like a boss.")
            self.profile_SCVs = False

        # following variable will be populated while the variable positions table is computed
        self.codons_in_genes_to_profile_SCVs = set([])

        # we don't know what we are about
        self.description = None

        # additional layer data will be filled later
        self.layer_additional_keys = []
        self.layer_additional_data = {}
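The `A = lambda x: ...` idiom at the top of this constructor is just a null-safe lookup on the argparse namespace; `getattr` with a default expresses the same thing. A minimal sketch (the namespace contents are illustrative):

import argparse

args = argparse.Namespace(input_file='SAMPLE.bam')   # illustrative namespace

A = lambda x: getattr(args, x, None)   # equivalent to the __dict__ lookup with a None fallback

print(A('input_file'))   # 'SAMPLE.bam'
print(A('contigs_db'))   # None, since the attribute is missing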
Example #17
0
    def __init__(self,
                 config_file_path,
                 input_directory=None,
                 db_paths={},
                 row_ids_of_interest=[],
                 r=run,
                 p=progress):
        self.run = r
        self.progress = p

        self.input_directory = input_directory or os.path.abspath(os.getcwd())
        self.config_file_path = config_file_path

        # `row_ids_of_interest` gives opportunity to filter out irrelevant entries quickly
        # while vectors are being obtained from each matrix described in the config file.
        # to see why it is important in the context of anvi'o, see
        # https://github.com/meren/anvio/issues/100
        self.row_ids_of_interest = set(row_ids_of_interest)

        # these are the database files that may be referenced from within the config files
        # with !DATABASE.db::table notation. If a database entry has an exclamation mark,
        # it will be searched for in the db_paths dict to associate it with the relative
        # path that is only known to the client
        self.db_paths = db_paths

        # read the config
        filesnpaths.is_file_exists(self.config_file_path)
        config = configparser.ConfigParser()
        config.read(self.config_file_path)

        # this will keep the actual paths for each matrix:
        self.matrix_paths = {}
        self.set_default_paths(config)

        self.check_for_db_requests(config)

        # and sanity check.
        self.sanity_check(config)

        if self.get_option(config, 'general', 'output_file', str):
            self.output_file_name = self.get_option(config, 'general',
                                                    'output_file', str)
            self.output_file_path = os.path.join(self.input_directory,
                                                 self.output_file_name)
        else:
            self.output_file_name = None
            self.output_file_path = None

        self.name = self.get_option(
            config, 'general', 'name',
            str) or filesnpaths.get_name_from_file_path(self.config_file_path)
        self.distance = self.get_option(config, 'general', 'distance', str)
        self.linkage = self.get_option(config, 'general', 'linkage', str)

        self.num_components = self.get_option(config, 'general',
                                              'num_components', int)
        self.seed = self.get_option(config, 'general', 'seed', int)
        self.master = None

        self.matrices_dict = {}
        self.matrices = []
        for section in self.get_other_sections(config):
            alias, matrix = section.split()

            self.matrices.append(alias)

            m = {}
            columns_to_use = self.get_option(config, section, 'columns_to_use',
                                             str)
            table_form = self.get_option(config, section, 'table_form', str)
            m['alias'] = alias
            m['matrix'] = matrix
            m['table_form'] = table_form
            m['columns_to_use'] = [
                c.strip() for c in columns_to_use.split(',')
            ] if columns_to_use else None
            m['ratio'] = self.get_option(config, section, 'ratio', int)
            m['path'] = self.matrix_paths[alias]
            m['normalize'] = False if self.get_option(
                config, section, 'normalize', str) == 'False' else True
            m['log'] = True if self.get_option(config, section, 'log',
                                               str) == 'True' else False
            # next two variables are necessary to follow the order of vectors
            m['id_to_sample'], m['sample_to_id'], m['cols'], m[
                'vectors'] = get_vectors(m['path'], m['columns_to_use'],
                                         self.row_ids_of_interest)
            self.matrices_dict[alias] = m

        # make sure all matrices have identical rows:
        if len(
                set([
                    list(m['id_to_sample'].values()).__str__()
                    for m in list(self.matrices_dict.values())
                ])) > 1:
            master_rows, master_matrix = sorted([(len(self.matrices_dict[m]['id_to_sample']), list(self.matrices_dict[m]['id_to_sample'].values()), m)\
                                                            for m in self.matrices_dict])[0][1:]
            self.master = master_matrix
            self.master_rows = master_rows
            # the smallest matrix is 'master_matrix', and the rows it has is master_rows. so every other matrix
            # must match that, or we will throw a tantrum.
            for matrix in [m for m in self.matrices if m != master_matrix]:
                m = self.matrices_dict[matrix]

                # get reduced set of vectors from rows that match `master_rows`:
                m['id_to_sample'], m['sample_to_id'], m['cols'], m[
                    'vectors'] = get_vectors(m['path'], m['columns_to_use'],
                                             master_rows)

                if len(m['vectors']) != len(master_rows):
                    raise ConfigError(
                        'The content of rows differed between input matrices. So I tried to '
                        'match all other matrices to the matrix with the smallest number of '
                        'rows (which was "%s"). However, not all other matrices contained '
                        'the small set of rows.' % (master_matrix))
        else:
            self.master_rows = sorted(
                self.matrices_dict[self.matrices[0]]['sample_to_id'].keys())

        self.num_matrices = len(self.matrices)
        self.multiple_matrices = self.num_matrices > 1
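Putting the constructor's expectations together, a config file for this class would have a `[general]` section plus one `[alias matrix]` section per input matrix, where the matrix may be a file path or a `!DATABASE.db::table` reference as described in the comments above. The following is an illustrative sketch, not a config taken from anvi'o's documentation; all names and values are hypothetical:

[general]
output_file = my_items_order.txt
name = example_config
num_components = 16
seed = 1

[tnf TNF.txt]
columns_to_use = c1, c2, c3
ratio = 3

[cov !PROFILE.db::mean_coverage_contigs]
normalize = True
log = False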
Example #18
0
    def init_profile_from_BAM(self):
        self.progress.new('Init')
        self.progress.update('Reading BAM File')
        self.bam = bamops.BAMFileObject(self.input_file_path,
                                        run=self.run,
                                        progress=self.progress).get()
        self.num_reads_mapped = self.bam.mapped
        self.progress.end()

        self.contig_names = self.bam.references
        self.contig_lengths = self.bam.lengths

        utils.check_contig_names(self.contig_names)

        self.run.info('input_bam', self.input_file_path)
        self.run.info('output_dir', self.output_directory, display_only=True)
        self.run.info('total_reads_mapped', pp(int(self.num_reads_mapped)))
        self.run.info('num_contigs', pp(len(self.contig_names)))

        if self.contig_names_of_interest:
            indexes = [
                self.contig_names.index(r)
                for r in self.contig_names_of_interest
                if r in self.contig_names
            ]
            self.contig_names = [self.contig_names[i] for i in indexes]
            self.contig_lengths = [self.contig_lengths[i] for i in indexes]
            self.run.info('num_contigs_selected_for_analysis',
                          pp(len(self.contig_names)))

        # it brings good karma to let the user know what the hell is wrong with their data:
        self.check_contigs_without_any_gene_calls(self.contig_names)

        # check for the -M parameter.
        self.remove_contigs_based_on_min_max_contig_length()

        # let's see whether the user screwed up to follow the simple instructions
        # mentioned here: http://merenlab.org/2015/05/01/anvio-tutorial/#preparation
        for contig_name in self.contig_names:
            if contig_name not in self.contig_names_in_contigs_db:
                raise ConfigError("At least one contig name in your BAM file does not match contig names stored in the\
                                    contigs database. For instance, this is one contig name found in your BAM file: '%s',\
                                    and this is another one found in your contigs database: '%s'. You may be using an\
                                    contigs database for profiling that has nothing to do with the BAM file you are\
                                    trying to profile, or you may have failed to fix your contig names in your FASTA file\
                                    prior to mapping, which is described here: %s"\
                                        % (contig_name, self.contig_names_in_contigs_db.pop(), 'http://goo.gl/Q9ChpS'))

        self.run.info('num_contigs_after_M',
                      self.num_contigs,
                      display_only=True)
        self.run.info('num_contigs', self.num_contigs, quiet=True)
        self.run.info('num_splits', self.num_splits)
        self.run.info('total_length', self.total_length)

        profile_db = dbops.ProfileDatabase(self.profile_db_path, quiet=True)
        profile_db.db.set_meta_value('num_splits', self.num_splits)
        profile_db.db.set_meta_value('num_contigs', self.num_contigs)
        profile_db.db.set_meta_value('total_length', self.total_length)
        profile_db.disconnect()

        self.layer_additional_data[
            'total_reads_mapped'] = self.num_reads_mapped
        self.layer_additional_keys.append('total_reads_mapped')
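`bamops.BAMFileObject` used above is a thin wrapper around pysam, and the attributes this method reads (`mapped`, `references`, `lengths`) map directly onto `pysam.AlignmentFile`. A minimal sketch with plain pysam (the BAM path is illustrative, and the file must be indexed for `mapped` to work):

import pysam

bam = pysam.AlignmentFile('SAMPLE.bam', 'rb')
print(bam.mapped)       # number of mapped reads (requires a .bai index)
print(bam.references)   # tuple of contig names
print(bam.lengths)      # tuple of contig lengths, in the same order as references
bam.close()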
Example #19
0
    def create_new_view(self,
                        data_dict,
                        table_name,
                        table_structure,
                        table_types,
                        view_name=None,
                        append_mode=False):
        """Creates a new view table, and adds an entry for it into the 'views' table.

        Entries in 'views' table appear in various places in the interface. However, we also generate
        view tables to store the type of data we do not wish to display on interfaces, but be able
        access from various other modules. A good example to this is the item_order recipes. When we
        profile a sample, we treat every stplit as their own entity with respect to their mean coverage.
        Although it is great for visualization purposes, it is not useful for item_order purposes since in
        most cases we wish splits to stay together in item_order output. Hence, we create a mean_coverage_splits
        table, where each split holds their own coverage, and we create a mean_coverage_contigs table where each
        split has the coverage of their parent. Clearly the second table is not useful to display. When a table
        is not added as an entry to the 'views' table, then it only exists in the database for other purposes
        than displaying it.

        If a new view does not have a 'view_id', it is not added the 'views' table to provide that flexibility.
        """

        anvio_db = DBClassFactory().get_db_object(self.db_path)

        views_in_db = anvio_db.db.get_table_as_dict(t.views_table_name)

        if not append_mode:
            if view_name and view_name in views_in_db:
                raise ConfigError("TablesForViews speaking: Yo yo yo. You already have a view in the db '%s' called '%s'.\
                                    You can't create another one before you get rid of the existing one, because rules."\
                                                                            % (self.db_path, view_name))

            # first create the data table:
            anvio_db.db.drop_table(table_name)

        try:
            anvio_db.db.create_table(table_name, table_structure, table_types)
        except Exception as e:
            # FIXME: the following if statement will omit errors and quietly continue even though
            # the table creation failed. I think we should remove it, give the `create_table`
            # function a new flag, such as `ok_if_exists`, and call it in this context as
            # `ok_if_exists=append_mode`.
            if not append_mode:
                raise ConfigError(
                    "Something bad happened when anvi'o was trying to create table `%s` in database "
                    "'%s'. Here is how the relevant part of the code described the "
                    "problem: '%s'." %
                    (table_name, self.db_path, str(e)))

        db_entries = [
            tuple([item] + [data_dict[item][h] for h in table_structure[1:]])
            for item in data_dict
        ]

        try:
            anvio_db.db._exec_many(
                '''INSERT INTO %s VALUES (%s)''' %
                (table_name, ','.join(['?'] * len(table_structure))),
                db_entries)
        except Exception as e:
            raise ConfigError("Something bad happened while anvi'o was trying to insert %d entries into the "
                              "table '%s', which has a structure with %d columns, in '%s' :( This "
                              "is the error we got back from the database module: \"%s\"." \
                                  % (len(db_entries), table_name, len(table_structure), self.db_path, e))

        if view_name and view_name not in views_in_db:
            anvio_db.db._exec(
                '''INSERT INTO %s VALUES (?,?)''' % t.views_table_name,
                (view_name, table_name))

        anvio_db.disconnect()
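As a usage sketch (all names below are illustrative, and `tables_for_views` stands in for a hypothetical TablesForViews instance), a caller would hand `create_new_view` a dict keyed by item whose inner keys match `table_structure[1:]`:

# hypothetical caller; table/view names and the data dict are illustrative
data_dict = {'split_001': {'sample_A': 4.2, 'sample_B': 0.0},
             'split_002': {'sample_A': 1.1, 'sample_B': 9.8}}

tables_for_views.create_new_view(data_dict=data_dict,
                                 table_name='mean_coverage_splits',
                                 table_structure=['item', 'sample_A', 'sample_B'],
                                 table_types=['text', 'numeric', 'numeric'],
                                 view_name='mean_coverage')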
Example #20
0
    def check_contigs(self, num_contigs=None):
        if not num_contigs:
            num_contigs = len(self.contigs)

        if not num_contigs:
            raise ConfigError("0 contigs to work with. Bye.")
Example #21
0
    def check_input_paths(self):
        """Check FASTQ file paths if running Illumina-utils for read merging, or FASTA file paths if
        considering merged or unpaired reads. Allow both absolute and relative paths in
        samples_txt."""
        if self.run_iu_merge_pairs:
            fastq_paths = self.sample_info['r1'].tolist() + self.sample_info['r2'].tolist()
            bad_fastq_paths = []
            for fastq_path in fastq_paths:
                if os.path.isabs(fastq_path):
                    if not filesnpaths.is_file_exists(fastq_path, dont_raise=True):
                        bad_fastq_paths.append(fastq_path)
                else:
                    if not filesnpaths.is_file_exists(os.path.join(os.getcwd(), fastq_path), dont_raise=True):
                        bad_fastq_paths.append(fastq_path)
            if bad_fastq_paths:
                raise ConfigError(
                    "The following FASTQ files in the samples_txt file, '%s', cannot be found: %s."
                    % (self.samples_txt_file, ', '.join(bad_fastq_paths)))
            bad_fastq_names = [
                s for s in fastq_paths
                if (not s.endswith('.fq') and not s.endswith('.fq.gz') and
                    not s.endswith('.fastq') and not s.endswith('.fastq.gz'))
            ]
            if bad_fastq_names:
                run.warning(
                    "Some of the sequence files in the samples_txt file, '%s', "
                    "do not end with '.fq', '.fq.gz', '.fastq' or '.fastq.gz'. "
                    "That's okay, but anvi'o decided it should warn you. "
                    "Here are the first 5 such files that have unconventional file extensions: %s."
                    % (self.samples_txt_file, ', '.join(bad_fastq_names[:5])))
        else:
            fasta_paths = self.sample_info['fasta'].tolist()
            bad_fasta_paths = []
            for fasta_path in fasta_paths:
                if os.path.isabs(fasta_path):
                    if not filesnpaths.is_file_exists(fasta_path, dont_raise=True):
                        bad_fasta_paths.append(fasta_path)
                else:
                    if not filesnpaths.is_file_exists(os.path.join(os.getcwd(), fasta_path), dont_raise=True):
                        bad_fasta_paths.append(fasta_path)
            if bad_fasta_paths:
                raise ConfigError(
                    "The following FASTA files in the samples_txt file, '%s', cannot be found: %s."
                    % (self.samples_txt_file, ', '.join(bad_fasta_paths)))
            bad_fasta_names = [
                s for s in fasta_paths
                if (not s.endswith('.fa') and not s.endswith('.fa.gz') and
                    not s.endswith('.fasta') and not s.endswith('.fasta.gz'))
            ]
            if bad_fasta_names:
                run.warning(
                    "Some of the FASTA files in the samples_txt file, '%s', "
                    "do not end with '.fa', '.fa.gz', '.fasta' or '.fasta.gz'. "
                    "That's okay, but anvi'o decided it should warn you. "
                    "Here are the first 5 such files that have unconventional file extensions: %s."
                    % (self.samples_txt_file, ', '.join(bad_fasta_names[:5])))
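Since `str.endswith` also accepts a tuple of suffixes, the chained checks above can be collapsed; a minimal sketch, assuming `fastq_paths` is the list built in the method:

FASTQ_EXTENSIONS = ('.fq', '.fq.gz', '.fastq', '.fastq.gz')

# same filter as the four chained endswith() calls above
bad_fastq_names = [s for s in fastq_paths if not s.endswith(FASTQ_EXTENSIONS)]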
Example #22
0
    def check_args(self):
        if self.blank:
            self.run.warning(
                "You are about to generate a blank profile. This is what we do when we have nothing "
                "but a contigs database to play with. Because anvi'o is lazy, it will not check the "
                "rest of the parameters you may have declared. Most of them will not matter.")

            if not self.output_directory:
                raise ConfigError(
                    "If you want to generate a blank profile, you need to declare an output directory path.")
            if not self.sample_id:
                raise ConfigError(
                    "Mock profiles require a sample name to be declared. Because :/"
                )
            return

        if (not self.input_file_path) and (not self.serialized_profile_path):
            raise ConfigError(
                "You didn't declare any input files :/ If you intend to create a blank profile without any "
                "input file, you should be a bit more explicit about your intention (you know, in the help "
                "there is a flag for it and all). Otherwise you should either provide an input BAM file, or "
                "a serialized anvi'o profile. See '--help' maybe?")
        if self.input_file_path and self.serialized_profile_path:
            raise ConfigError(
                "You can't declare both an input file and a serialized profile."
            )
        if self.serialized_profile_path and (not self.output_directory):
            raise ConfigError(
                "When loading serialized profiles, you need to declare an output directory."
            )
        if self.input_file_path and not os.path.exists(self.input_file_path):
            raise ConfigError("No such file: '%s'" % self.input_file_path)
        if self.serialized_profile_path and not os.path.exists(
                self.serialized_profile_path):
            raise ConfigError("No such file: '%s'" %
                              self.serialized_profile_path)
        if not self.min_coverage_for_variability >= 0:
            raise ConfigError(
                "Minimum coverage for variability must be 0 or larger.")
        if not self.min_mean_coverage >= 0:
            raise ConfigError("Minimum mean coverage must be 0 or larger.")
        if not self.min_contig_length >= 0:
            raise ConfigError("Minimum contig length must be 0 or larger.")
        if not self.max_contig_length >= 100:
            raise ConfigError(
                "Maximum contig length can't be less than 100 base pairs.")
        if self.min_contig_length >= self.max_contig_length:
            raise ConfigError(
                "Maximum contig length (%s) must be larger than the minimum "
                "contig length (%s). Seriously though." %
                (pp(self.max_contig_length), pp(self.min_contig_length)))

        if self.num_threads < 1:
            raise ConfigError(
                "Nice try. Obviously, the number of threads cannot be less than 1.")

        if not self.queue_size:
            self.queue_size = self.num_threads * 2

        if not self.write_buffer_size:
            self.run.warning(
                "You set the write buffer size to 0. Which means, the profiling data will be kept in memory "
                "until the very end of the processing.")

        if self.write_buffer_size < 0:
            raise ConfigError(
                'No. Write buffer size can not have a negative value.')
Example #23
0
    def gen_mcl_input(self, blastall_results):
        self.progress.new('Processing search results')
        self.progress.update('...')

        all_ids = set([])

        # mapping for the fields in the blast output
        mapping = [
            str, str, float, int, int, int, int, int, int, int, float, float
        ]

        # here we perform an initial pass on the blast results to fill the dict that will hold
        # the bit score for each gene when it was blasted against itself. this dictionary
        # will then be used to calculate the 'minbit' value between two genes, which I learned
        # from ITEP (Benedict MN et al, doi:10.1186/1471-2164-15-8). ITEP defines minbit as
        # 'bit score between target and query / min(selfbit for query, selfbit for target)'. This
        # heuristic approach provides a means to set a cutoff to eliminate weak matches between
        # two genes. The minbit value reaches 1 for hits between two genes that are almost identical.
        self_bit_scores = {}
        line_no = 1
        self.progress.update(
            '(initial pass of the search results to set the self bit scores ...)')
        for line in open(blastall_results):
            fields = line.strip().split('\t')

            try:
                query_id, subject_id, perc_id, aln_length, mismatches, gaps, q_start, q_end, s_start, s_end, e_val, bit_score = \
                    [mapping[i](fields[i]) for i in range(0, len(mapping))]
            except Exception as e:
                self.progress.end()
                raise ConfigError(
                    "Something went wrong while processing the blastall output file in line %d. "
                    "Here is the error from the upper management: '''%s'''"
                    % (line_no, e))
            line_no += 1
            all_ids.add(query_id)
            all_ids.add(subject_id)

            if query_id == subject_id:
                self_bit_scores[query_id] = bit_score

        self.progress.end()

        ids_without_self_search = all_ids - set(self_bit_scores.keys())
        if len(ids_without_self_search):
            search_tool = 'BLAST' if self.use_ncbi_blast else 'DIAMOND'
            self.run.warning("%s did not retun search results for %d of %d the amino acid sequences in your input FASTA file.\
                              Anvi'o will do some heuristic magic to complete the missing data in the search output to recover\
                              from this. But since you are a scientist, here are the amino acid sequence IDs for which %s\
                              failed to report self search results: %s." \
                                                    % (search_tool, len(ids_without_self_search), len(all_ids), \
                                                       search_tool, ', '.join(ids_without_self_search)))

        # HEURISTICS TO ADD MISSING SELF SEARCH RESULTS
        # we are here, because amino acid sequences in ids_without_self_search did not have any hits in the search output
        # although they were in the FASTA file the target database were built from. so we will make sure they are not
        # missing from self_bit_scores dict, or mcl_input (additional mcl inputs will be stored in the following dict)
        additional_mcl_input_lines = {}

        for id_without_self_search in ids_without_self_search:
            entry_hash, gene_caller_id = id_without_self_search.split('_')

            try:
                genome_name = self.hash_to_genome_name[entry_hash]
            except KeyError:
                raise ConfigError(
                    "Something horrible happened. This can only happen if you started a new analysis with "
                    "additional genomes without cleaning the previous work directory. Sounds familiar?")

            # divide the DNA length of the gene by three to get the AA length, and multiply that by two to get an approximate
            # bit score that would have recovered from a perfect match
            gene_amino_acid_sequence_length = len(
                self.genomes_storage.get_gene_sequence(
                    genome_name,
                    int(gene_caller_id),
                    report_DNA_sequences=False))
            self_bit_scores[
                id_without_self_search] = gene_amino_acid_sequence_length * 2

            # add this SOB into additional_mcl_input_lines dict.
            additional_mcl_input_lines[
                id_without_self_search] = '%s\t%s\t1.0\n' % (
                    id_without_self_search, id_without_self_search)

        # CONTINUE AS IF NOTHING HAPPENED
        self.run.info('Min percent identity', self.min_percent_identity)
        self.run.info('Minbit', self.minbit)
        self.progress.new('Processing search results')

        mcl_input_file_path = self.get_output_file_path('mcl-input.txt')
        mcl_input = open(mcl_input_file_path, 'w')

        line_no = 1
        num_edges_stored = 0
        for line in open(blastall_results):
            fields = line.strip().split('\t')

            query_id, subject_id, perc_id, aln_length, mismatches, gaps, q_start, q_end, s_start, s_end, e_val, bit_score = \
                [mapping[i](fields[i]) for i in range(0, len(mapping))]

            line_no += 1

            if line_no % 5000 == 0:
                self.progress.update('Lines processed %s ...' % pp(line_no))

            #
            # FILTERING BASED ON PERCENT IDENTITY
            #
            if perc_id < self.min_percent_identity:
                continue

            #
            # FILTERING BASED ON MINBIT
            #
            minbit = bit_score / min(self_bit_scores[query_id],
                                     self_bit_scores[subject_id])
            if minbit < self.minbit:
                continue

            mcl_input.write('%s\t%s\t%f\n' %
                            (query_id, subject_id, perc_id / 100.0))
            num_edges_stored += 1

        # add additional lines if there are any:
        for line in list(additional_mcl_input_lines.values()):
            mcl_input.write(line)
            num_edges_stored += 1

        mcl_input.close()

        self.progress.end()
        self.run.info('Filtered search results',
                      '%s edges stored' % pp(num_edges_stored))
        self.run.info('MCL input', '%s' % mcl_input_file_path)

        return mcl_input_file_path
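To make the minbit filter above concrete, here is the computation on toy numbers (all values are illustrative):

# self bit scores recorded during the first pass, and the bit score of a mutual hit
self_bit_scores = {'gene_A': 200.0, 'gene_B': 120.0}
bit_score = 90.0

# minbit = bit score between target and query / min(selfbit of query, selfbit of target)
minbit = bit_score / min(self_bit_scores['gene_A'], self_bit_scores['gene_B'])
print(minbit)   # 0.75 -- with a minbit cutoff of, say, 0.5, this edge would be kept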
Example #24
0
    def process(self):
        """The file to be parsed looks like this:

                    C	1	340016	248	340016,	ASF00353.1,	KPDGLIIDIEGLENVQLGKGGELQPLDLHDIYEQTGVFYYRSKNPEGG,	NA; NA; NA; NA; NA; uncultured virus;
                    C	2	745014	427	745014,	WP_009471064.1,	GISENIRAISVIDRYLEHPRVYIAYSRGEPKYYMGSADLMTRNIDYRVEVLCPVHDPKAQKTLQDVLDQQWNDNVKARVIDASQ,	Proteobacteria; Cellvibrionales; Gammaproteobacteria; Halieaceae; NA; gamma proteobacterium HIMB55;
                    C	3	745014	564	745014,	WP_009471063.1,	MSLIASMARGMDESNLKMNAAGNIQSENTSGVAERRMRMPPIPDDFFSLITHEQRIALNQKQQFGWFVKFVRRPLFQPIEVVLSNPEGSEFLLLETDGITRPFFNVRTDDLR,	Proteobacteria; Cellvibrionales; Gammaproteobacteria; Halieaceae; NA; gamma proteobacterium HIMB55;
                    C	4	745014	991	745014,	WP_009471062.1,	SNPRWEQLLRYRYIELIALWEGRLTTRQLCETFGIGRQQANKDLTSYRRGLTRGDLVYDAVAKYYSPSEDFAPTLTQGLASEYLQMAAQQSDVQQILGDLPVASANVEVIAAPLREVPASLLRPIIRAMAESRRIDVDYVSLNNPDREGRIIVPHTLVWTGYRWHVRAWCEKNLDFRDFVLSRFRGDADLMD,FSNPRWEQLLRYRYIELIALWEGRLTTRQLCETFGIGRQQANKDLTSYRRGLTRGDLVYDAVAKYYSPSEDFAPTLTQGLASEYLQMAAQQSDVQQILGDLPVASANVEVIAAPLREVPASLLRPIIRAMAESRRIDVDYVSLNNPDREGRIIVPHTLVWTGYRWHVRAWCEKNLDFRDFVLSRFRGDADLMD,	Proteobacteria; Cellvibrionales; Gammaproteobacteria; Halieaceae; NA; gamma proteobacterium HIMB55;
                    C	5	745014	737	745014,	WP_009471061.1,	GLGGLEALRLFTDVLAPACISTDHPKFLAFVPAAPTEAATLFDLIVGASSICGTSWLESAGATYAENQALQWIADLAGFGPEAGGTFVSGGTAGNLSALVAARHKWRRGNESRDALRGLVISSKGAHASIKQATYVMDVDLLEVGGD,Proteobacteria; Cellvibrionales; Gammaproteobacteria; Halieaceae; NA; gamma proteobacterium HIMB55;
                    C	6	745014	462	745014,	WP_009471060.1,	SFRECLSNPLFVGYQLSGTFSFCGVFVYISTVAFFLRDVFDVSTEFFGVVFAMTAAGFIVGSLSSSRLVLKWGADRTLRRGAFICALSTTSAL,	Proteobacteria; Cellvibrionales; Gammaproteobacteria; Halieaceae; NA; gamma proteobacterium HIMB55;
                    C	7	745014	583	745014,	WP_009471059.1,	MSTNRSYVSATLTADENKAAIEAHLHEILERSLTPMEPGQAKVYMEHTAVRMAEEAGAGVTTFQMVEVKHANTAYMIRLAVLTNGSAIGLDLMDMENGQFFIPEVCPVIPLETPTVN,	Proteobacteria; Cellvibrionales; Gammaproteobacteria; Halieaceae; NA; gamma proteobacterium HIMB55;
                    U	8	0
                    C	11	1898104	296	1898104,	PTL97260.1,	YFEPWVKGGNSIIRAIHYPPITTDPGDSVRAGQHEDINLITLLMGASAEGLEVLNKQG,	Bacteroidetes; NA; NA; NA; NA; Bacteroidetes bacterium;
                    U	13	0

            Where, according to https://github.com/bioinformatics-centre/kaiju, each column corresponds to,

                    1. either C or U, indicating whether the read is classified or unclassified.
                    2. name of the read
                    3. NCBI taxon identifier of the assigned taxon
                    4. the length or score of the best match used for classification
                    5. the taxon identifiers of all database sequences with the best match
                    6. the accession numbers of all database sequences with the best match
                    7. matching fragment sequence(s)
        """

        if not self.just_do_it:
            raise ConfigError(
                "Anvi'o assumes you used this exact parameter during your kaiju run: "
                "'-r superkingdom,phylum,order,class,family,genus,species'. If you "
                "haven't, you will run into trouble later. If you are positive that "
                "you did include that parameter in your run, re-run this program with "
                "the `--just-do-it` flag.")

        # THIS IS IMPORTANT.
        levels_of_taxonomy = constants.levels_of_taxonomy

        taxonomy_dict = {}

        kaiju_output = self.dicts['kaiju_output']

        self.run.info('Total num hits found', len(kaiju_output))

        self.progress.new('Bleep kaiju stuff bloop')
        self.progress.update('Processing the input data ...')

        for entry in kaiju_output.values():
            tax_string_list = [e.strip() for e in entry['taxonomy'].split(';')]
            gene_callers_id = entry['gene_callers_id']

            last_known = ''
            for i in range(0, len(tax_string_list)):
                if tax_string_list[i] == 'NA':
                    if last_known:
                        tax_string_list[i] = 'Unknown_' + last_known
                    else:
                        tax_string_list[i] = 'Unknown'
                else:
                    last_known = tax_string_list[i].replace(' ', '_')

            taxonomy_dict[gene_callers_id] = {}
            for i in range(0, len(levels_of_taxonomy)):
                level = levels_of_taxonomy[i]
                taxonomy_dict[gene_callers_id][level] = tax_string_list[i]

        self.progress.end()

        random_phylum_names = set([
            taxonomy_dict[e]['t_phylum']
            for e in random.sample(list(taxonomy_dict.keys()), 20)
        ])

        self.run.warning(
            "Good news: anvi'o finished parsing the kaiju taxonomy output. Bad news: it has no idea whether "
            "it did well or not. Since the user can specify which taxonomic levels kaiju should report, "
            "anvi'o can't rely on that information. So anvi'o always assumes you started from "
            "the domain level, and followed the conventional levels of taxonomy. Here is your question. We "
            "randomly picked some phylum names from your input taxonomy as anvi'o parsed them. Here they are: "
            "'%s'. Do they look like phylum names to you? If they don't, you are in very big trouble :( The "
            "best way to get yourself out of trouble is to immediately press CTRL-C, turn your computer off, "
            "and move permanently to Cuba since you are done here (if you are already in Cuba, please let "
            "us know for more instructions)." %
            ', '.join(random_phylum_names),
            header="Good news, bad news, and a question",
            lc="yellow")

        return TaxonomyHelper(
            taxonomy_dict).get_genes_taxonomy_and_taxon_names_dicts()
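A minimal sketch of taking one classified kaiju line apart into the columns the docstring describes (the line content is abbreviated and illustrative):

line = ('C\t2\t745014\t427\t745014,\tWP_009471064.1,\tGISENIRAISV,\t'
        'Proteobacteria; Cellvibrionales; Gammaproteobacteria; Halieaceae; NA; gamma proteobacterium HIMB55;')

# the eight tab-separated columns: classification status, read name, taxon id, match score,
# best-match taxon ids, best-match accessions, matching fragment(s), and the taxonomy string
status, read_name, taxon_id, match_score, taxon_ids, accessions, fragments, taxonomy = line.split('\t')
tax_string_list = [e.strip() for e in taxonomy.split(';')]

print(status, read_name, tax_string_list[0])   # C 2 Proteobacteria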
Example #25
0
    def init(self):
        super().init()

        # loading the samples.txt file
        self.samples_txt_file = self.get_param_value_from_config(
            ['samples_txt'])
        filesnpaths.is_file_exists(self.samples_txt_file)
        try:
            # getting the samples information (names, [group], path to r1, path to r2) from samples.txt
            self.samples_information = pd.read_csv(self.samples_txt_file,
                                                   sep='\t',
                                                   index_col=False)
        except IndexError as e:
            raise ConfigError(
                "Looks like your samples_txt file, '%s', is not properly formatted. "
                "This is what we know: '%s'" %
                (self.samples_txt_file, e))
        if 'sample' not in list(self.samples_information.columns):
            raise ConfigError(
                "Looks like your samples_txt file, '%s', is not properly formatted. "
                "We are not sure what's wrong, but we can't find a column with title 'sample'."
                % self.samples_txt_file)

        # get a list of the sample names
        self.sample_names = list(self.samples_information['sample'])
        self.run_metaspades = self.get_param_value_from_config(
            ['metaspades', 'run'])
        self.use_scaffold_from_metaspades = self.get_param_value_from_config(
            ['metaspades', 'use_scaffolds'])
        self.run_qc = self.get_param_value_from_config(
            ['iu_filter_quality_minoche', 'run']) == True
        self.run_summary = self.get_param_value_from_config(
            ['anvi_summarize', 'run']) == True
        self.run_split = self.get_param_value_from_config(
            ['anvi_split', 'run']) == True
        self.references_mode = self.get_param_value_from_config(
            'references_mode', repress_default=True)
        self.fasta_txt_file = self.get_param_value_from_config(
            'fasta_txt', repress_default=True)

        self.references_for_removal_txt = self.get_param_value_from_config(['remove_short_reads_based_on_references',\
                                                                            'references_for_removal_txt'],\
                                                                           repress_default=True)
        if self.references_for_removal_txt:
            self.load_references_for_removal()

        self.collections_txt = self.get_param_value_from_config(
            'collections_txt')
        if self.collections_txt:
            self.load_collections()
        elif self.run_summary:
            raise ConfigError(
                'If you want to run anvi-summarize you must provide a collections_txt file'
            )
        elif self.run_split:
            raise ConfigError(
                'If you want to run anvi-split you must provide a collections_txt file'
            )

        self.init_samples_txt()
        self.init_kraken()
        self.init_refereces_txt()
        self.init_target_files()
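For reference, the samples_txt this `init` reads is a TAB-delimited table with at least a `sample` column; following the comment above (names, [group], path to r1, path to r2), a paired-end setup would look roughly like this (the file names and the optional `group` column are illustrative):

sample	group	r1	r2
S01	G01	S01_R1.fastq.gz	S01_R2.fastq.gz
S02	G02	S02_R1.fastq.gz	S02_R2.fastq.gz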
Example #26
0
def migrate(db_path):
    if db_path is None:
        raise ConfigError("No database path is given.")

    # make sure someone is not being funny
    utils.is_pan_db(db_path)

    # make sure the version is accurate
    pan_db = db.DB(db_path, None, ignore_version=True)
    if str(pan_db.get_version()) != current_version:
        raise ConfigError(
            "Version of this pan database is not %s (hence, this script cannot really do anything)."
            % current_version)

    # update keys
    for old_key, new_key in [('maxbit', 'minbit')]:
        try:
            pan_db.set_meta_value(new_key, pan_db.get_meta_value(old_key))
        except:
            pass

    # remove keys that are now irrelevant
    try:
        pan_db.remove_meta_key_value_pair('maxbit')
    except:
        pass

    # learn additional_data_headers for later:
    additional_data_headers = pan_db.get_meta_value(
        'additional_data_headers').split(',')

    # take care of the self table
    self_table = pan_db.get_table_as_list_of_tuples('self')
    pan_db.cursor.execute('ALTER TABLE self RENAME TO self_TEMP;')
    pan_db.cursor.execute('CREATE TABLE self (key text, value text);')

    for key, val in self_table:
        new_key = key.replace('PC',
                              'gene_cluster').replace('pc', 'gene_cluster')
        new_val = val.replace('PC',
                              'gene_cluster').replace('pc', 'gene_cluster')

        pan_db.set_meta_value(new_key, new_val)

    pan_db.cursor.execute('DROP TABLE self_TEMP;')

    # take care of the views table
    views_table = pan_db.get_table_as_list_of_tuples('views')
    pan_db.cursor.execute('ALTER TABLE views RENAME TO views_TEMP;')
    pan_db.cursor.execute(
        'CREATE TABLE views (view_id str, target_table str);')

    values = []
    for view, target in views_table:
        new_view = view.replace('PC',
                                'gene_cluster').replace('pc', 'gene_cluster')
        new_target = target.replace('PC', 'gene_cluster').replace(
            'pc', 'gene_cluster')
        values.append((new_view, new_target), )

    pan_db.insert_many('views', values)

    pan_db.cursor.execute('DROP TABLE views_TEMP;')

    # rename tables
    pan_db._exec(
        'ALTER TABLE PC_frequencies RENAME TO gene_cluster_frequencies;')
    pan_db._exec(
        'ALTER TABLE PC_presence_absence RENAME TO gene_cluster_presence_absence;'
    )
    pan_db._exec('ALTER TABLE protein_clusters RENAME TO gene_clusters;')

    # protein_cluster_id -> gene_cluster_id in table gene_clusters.
    pan_db.cursor.execute(
        'ALTER TABLE gene_clusters RENAME TO gene_clusters_TEMP;')
    pan_db.cursor.execute(
        'CREATE TABLE gene_clusters (entry_id numeric, gene_caller_id numeric, gene_cluster_id str, genome_name str, alignment_summary str);'
    )
    pan_db.cursor.execute(
        'INSERT INTO gene_clusters(entry_id, gene_caller_id, gene_cluster_id, genome_name, alignment_summary) SELECT entry_id, gene_caller_id, protein_cluster_id, genome_name, alignment_summary FROM gene_clusters_TEMP;'
    )
    pan_db.cursor.execute('DROP TABLE gene_clusters_TEMP;')

    # commit
    try:
        pan_db._exec('COMMIT')
    except:
        pass

    # we also added a totally new table to this version:
    pan_db.create_table(item_additional_data_table_name,
                        item_additional_data_table_structure,
                        item_additional_data_table_types)

    # set the version
    pan_db.remove_meta_key_value_pair('version')
    pan_db.set_version(next_version)

    # we have one more thing to do: getting rid of the 'additional_data' table without losing data, by carrying
    # its content into our new item_additional_data_table
    additional_data_table_dict = pan_db.get_table_as_dict('additional_data')

    # close the db temporarily
    pan_db.disconnect()

    # update the contents of the item_additional_data_table
    args = argparse.Namespace(pan_db=db_path,
                              just_do_it=True,
                              ignore_db_version=True)
    item_additional_data_table = TableForItemAdditionalData(args)
    item_additional_data_table.add(additional_data_headers,
                                   additional_data_table_dict)

    # open the database again to remove stuff
    pan_db = db.DB(db_path, None, ignore_version=True)
    pan_db.remove_meta_key_value_pair('additional_data_headers')
    pan_db._exec("DROP TABLE additional_data")

    # now bye for real!
    pan_db.disconnect()

    progress.end()

    run.info_single('Your pan db is now %s.' % next_version,
                    nl_after=1,
                    nl_before=1,
                    mc='green')
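The rename/create/copy/drop sequence applied to the `self`, `views`, and `gene_clusters` tables above is the standard SQLite recipe for changing a table's schema in place. A self-contained sketch of the pattern with Python's sqlite3 (table and column names are illustrative):

import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE gene_clusters (entry_id NUMERIC, protein_cluster_id TEXT)')
conn.execute("INSERT INTO gene_clusters VALUES (1, 'PC_00000001')")

# rename the old table aside, create the new schema, copy the rows over, drop the old table
conn.execute('ALTER TABLE gene_clusters RENAME TO gene_clusters_TEMP')
conn.execute('CREATE TABLE gene_clusters (entry_id NUMERIC, gene_cluster_id TEXT)')
conn.execute('INSERT INTO gene_clusters (entry_id, gene_cluster_id) '
             'SELECT entry_id, protein_cluster_id FROM gene_clusters_TEMP')
conn.execute('DROP TABLE gene_clusters_TEMP')
conn.commit()

print(conn.execute('SELECT * FROM gene_clusters').fetchall())   # [(1, 'PC_00000001')]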
Example #27
0
    def load_collections(self):
        ''' Load the collections_txt file, run some sanity checks, and figure out params for anvi_import_collection'''
        collections = u.get_TAB_delimited_file_as_dictionary(
            self.collections_txt)
        bad_groups = [g for g in collections if g not in self.group_names]
        if bad_groups:
            raise ConfigError('Some of the names in your collections_txt '
                              'file ("%s") don\'t match the names of the '
                              'groups in your samples_txt/fasta_txt. '
                              'Here are the names that don\'t match: %s. '
                              'And here are the group names we expect to find: '
                              '%s' %
                              (self.collections_txt, ', '.join(bad_groups),
                               ', '.join(self.group_names)))
        for group in collections:
            default_collection = collections[group].get('default_collection')

            if default_collection:
                # User can specify either a default collection OR collection from file
                not_allowed_params = {
                    'collection_name', 'collection_file', 'bins_info',
                    'contigs_mode'
                }
                if any([
                        collections[group][key] for key in not_allowed_params
                        if key in collections[group].keys()
                ]):
                    raise ConfigError(
                        'We encountered the following problem with your '
                        'collections_txt file ("%s"): you can either use '
                        'a default collection OR import a collection from '
                        'a file. Yet, for "%s", you specify '
                        'a default collection AND also specify some of the following '
                        'parameters: %s.' %
                        (self.collections_txt, group,
                         ", ".join(not_allowed_params)))

                collections[group]['collection_name'] = 'DEFAULT'
                collections[group]['contigs_mode'] = ''

            else:
                if not filesnpaths.is_file_exists(
                        collections[group]['collection_file'],
                        dont_raise=True):
                    raise ConfigError(
                        'We encountered the following problem with your '
                        'collections_txt file ("%s"): you did not specify '
                        'a valid collection file for "%s".' %
                        (self.collections_txt, group))

                if not collections[group]['collection_name']:
                    raise ConfigError(
                        'You must specify a name for each collection in your collections_txt'
                    )
                u.check_collection_name(collections[group]['collection_name'])
                if collections[group].get('bins_info'):
                    filesnpaths.is_file_exists(collections[group]['bins_info'])
                    collections[group][
                        'bins_info'] = '--bins-info %s' % collections[group][
                            'bins_info']
                else:
                    collections[group]['bins_info'] = ''
                if collections[group].get('contigs_mode'):
                    collections[group]['contigs_mode'] = '--contigs-mode'
                else:
                    collections[group]['contigs_mode'] = ''
        self.collections = collections
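Based on the keys this method reads, a collections_txt is a TAB-delimited table keyed by group name, where each group either points at a collection file (with optional bins_info and contigs_mode columns) or requests the default collection. An illustrative sketch; the first column header and all file names are hypothetical:

name	collection_name	collection_file	bins_info	contigs_mode	default_collection
G01	MY_BINS	G01_collection.txt	G01_bins_info.txt		
G02					1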
Example #28
0
    def __init__(self,
                 contigs_db_path,
                 scg_domain_classifier_path=None,
                 source_requested=None,
                 run=run,
                 progress=progress):
        self.run = run
        self.progress = progress
        self.initialized_properly = True

        self.SCG_domain_predictor = scgdomainclassifier.Predict(
            argparse.Namespace(),
            run=terminal.Run(verbose=False),
            progress=self.progress)

        # hi db
        contigs_db = dbops.ContigsDatabase(contigs_db_path)

        # read info table to get what is available in the db
        info_table = contigs_db.db.get_table_as_dict(
            t.hmm_hits_info_table_name)

        # identify and remove non-single-copy sources of hmm search results:
        non_singlecopy_sources = set([
            k for k in list(info_table.keys())
            if info_table[k]['search_type'] != 'singlecopy'
        ])
        singlecopy_sources = set([
            k for k in list(info_table.keys())
            if info_table[k]['search_type'] == 'singlecopy'
        ])
        for non_singlecopy_source in non_singlecopy_sources:
            info_table.pop(non_singlecopy_source)

        # get the hmm hits table
        self.hmm_hits_table = contigs_db.db.get_table_as_dict(
            t.hmm_hits_table_name)

        # read search table (which holds hmmscan hits for splits).
        self.hmm_hits_splits_table = utils.get_filtered_dict(
            contigs_db.db.get_table_as_dict(t.hmm_hits_splits_table_name),
            'source', singlecopy_sources)

        # an example entry in self.hmm_hits_splits_table looks like this:
        #
        # {
        #    'percentage_in_split'   : 69.6763202725724,
        #    'source'                : u'Bacteria_74',
        #    'split'                 : u'ANTARCTICAAQUATIC_SMPL_SITE231_3.0UMcontig18439_split_00001',
        #    'hmm_hit_entry_id'      : 1
        # }
        #

        # a little convenience for potential clients:
        self.http_refs = {}
        for source_in_db in info_table:
            self.http_refs[source_in_db] = [
                h for h in info_table[source_in_db]['ref'].split()
                if h.startswith('http')
            ][0]

        self.genes_in_db = dict([(s, info_table[s]['genes'].split(', '))
                                 for s in info_table])

        # we're done with the db
        contigs_db.disconnect()

        self.sources = list(info_table.keys())
        self.domains = set(
            [info_table[source]['domain'] for source in self.sources])
        self.source_to_domain = dict([(source, info_table[source]['domain'])
                                      for source in self.sources])
        self.domain_to_sources = [(domain, [
            source for source in self.sources
            if info_table[source]['domain'] == domain
        ]) for domain in self.domains]

        self.domains_missing_in_SCG_domain_predictor = [
            d for d in self.domains
            if d not in self.SCG_domain_predictor.SCG_domains
        ]
        if len(self.domains_missing_in_SCG_domain_predictor):
            num_domains_missing = len(
                self.domains_missing_in_SCG_domain_predictor)
            self.run.warning("OK. We have a problem. You seem to have single-copy core gene collections for among your HMM hits %s that\
                              are not included when the anvi'o domain predictor was trained :/ Here is the list of domains that are making\
                              us upset here: \"%s\". This means either you put a new HMM single-copy core gene collection to the anvi'o HMMs\
                              directory, or gave it as a parameter, and run `anvi-run-hmms` without updating the classifier anvi'o uses to\
                              resolve domains for proper completion/redundancy estimates."                                                                                           % \
                                           ('a domain' if num_domains_missing == 1 else '%s domains' % num_domains_missing,
                                            ', '.join(self.domains_missing_in_SCG_domain_predictor)))
            self.initialized_properly = False

        if source_requested:
            if source_requested not in self.sources:
                raise ConfigError('Requested source "%s" is not one of the single-copy gene sources found in the database.' % source_requested)

            # filter out sources that are not requested
            self.sources = [source_requested]
            self.genes_in_db = {source_requested: self.genes_in_db[source_requested]}
            self.hmm_hits_splits_table = utils.get_filtered_dict(self.hmm_hits_splits_table, 'source', set([source_requested]))

        # these will be very useful later. trust me.
        self.unique_gene_id_to_gene_name = {}
        self.splits_unique_gene_id_occurs = {}
        for entry in list(self.hmm_hits_splits_table.values()):
            hmm_hit = self.hmm_hits_table[entry['hmm_hit_entry_id']]
            gene_unique_identifier = hmm_hit['gene_unique_identifier']

            if gene_unique_identifier not in self.unique_gene_id_to_gene_name:
                self.unique_gene_id_to_gene_name[gene_unique_identifier] = hmm_hit['gene_name']

            if gene_unique_identifier not in self.splits_unique_gene_id_occurs:
                self.splits_unique_gene_id_occurs[gene_unique_identifier] = [entry['split']]
            else:
                self.splits_unique_gene_id_occurs[gene_unique_identifier].append(entry['split'])
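A minimal usage sketch for the constructor above. The class name is not visible in this snippet, so `Completeness` and the database path below are assumptions for illustration only:

    # hypothetical usage; the class name and the db path are assumptions:
    c = Completeness('CONTIGS.db')
    if c.initialized_properly:
        print(c.sources)          # single-copy HMM sources found in the db
        print(sorted(c.domains))  # domains covered by those sources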
Example #29
    def add_ECG_EAG_ratio_per_gene_cluster_into_pan_database(self):
        if not self.pan_summary:
            self.init_pan_summary()

        gene_presence_in_the_environment_dict = self.get_gene_presence_in_the_environment_dict()

        self.progress.new('Working on ECG/EAG ratio per gene cluster')
        self.progress.update('...')

        gene_status_frequencies_in_gene_cluster = {}

        gene_cluster_names = list(self.pan_summary.gene_clusters.keys())
        num_gene_clusters = len(gene_cluster_names)
        for i in range(0, num_gene_clusters):
            self.progress.update('%.2f' % ((i + 1) * 100 / num_gene_clusters))
            gene_cluster_name = gene_cluster_names[i]

            status = {'EAG': 0, 'ECG': 0, 'NA': 0}
            for internal_genome_name in self.pan_summary.gene_clusters[gene_cluster_name]:
                genome_name = self.descriptions.genomes[internal_genome_name]['bin_id']

                for gene_caller_id in self.pan_summary.gene_clusters[gene_cluster_name][internal_genome_name]:
                    if genome_name not in gene_presence_in_the_environment_dict:
                        self.progress.end()
                        raise ConfigError("Something is wrong... It seems you generated a pangenome with an internal genomes file "
                                          "that is not identical to the internal genomes file you are using to run this program.")

                    status[gene_presence_in_the_environment_dict[genome_name][gene_caller_id]] += 1
            gene_status_frequencies_in_gene_cluster[gene_cluster_name] = status

        # set up some boring variable names.
        items_additional_data_dict = {}
        key_EAG_ECG_ratio = 'EAG_ECG_ratio'
        key_ECGs_and_EAGs = 'ECGs_and_EAGs'
        list_ECG_EAG_keys = ['EAG', 'ECG', 'NA']

        self.progress.update('Setting up the items data dictionary ..')
        for gene_cluster_name in gene_status_frequencies_in_gene_cluster:
            r = gene_status_frequencies_in_gene_cluster[gene_cluster_name]

            # add ECG and EAG frequencies for the gene cluster
            items_additional_data_dict[gene_cluster_name] = dict([('%s!%s' % (key_ECGs_and_EAGs, status), r[status]) for status in list_ECG_EAG_keys])

            # add the EAG/ECG ratio
            items_additional_data_dict[gene_cluster_name][key_EAG_ECG_ratio] = (r['EAG'] / (r['EAG'] + r['ECG']) if (r['EAG'] + r['ECG']) else 0)

        self.progress.end()

        # add that bad boy to the database
        self.args.just_do_it = True
        items_additional_data_keys = ['%s!%s' % (key_ECGs_and_EAGs, status) for status in list_ECG_EAG_keys] + [key_EAG_ECG_ratio]
        TableForItemAdditionalData(self.args).add(items_additional_data_dict, items_additional_data_keys)
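The value written into the database above is simply EAG / (EAG + ECG), falling back to 0 when neither class was observed. A self-contained sketch of that arithmetic (the function name is invented for illustration):

    def eag_ecg_ratio(status):
        """Fraction of environmental accessory genes among classified genes.

        `status` is a dict of counts, e.g. {'EAG': 3, 'ECG': 1, 'NA': 2}.
        """
        denominator = status['EAG'] + status['ECG']
        return status['EAG'] / denominator if denominator else 0

    assert eag_ecg_ratio({'EAG': 3, 'ECG': 1, 'NA': 2}) == 0.75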
Example #30
    def sanity_check_for_kraken(self):
        '''Making sure the sample names and file paths in the provided kraken.txt file are valid'''
        kraken_txt = self.get_param_value_from_config('kraken_txt')

        if kraken_txt:
            if self.get_param_value_from_config(['krakenhll', 'run']) == False:
                raise ConfigError("You supplied a kraken_txt file, %s, but you set krakenhll "
                                  "not to run in the config file. anvi'o is confused and "
                                  "is officially going on a strike." % kraken_txt)

            if 'krakenhll' not in self.config:
                raise ConfigError("You provided a kraken_txt, but you didn't set any parameters "
                                  "for krakenhll. As a minimum, you must provide the path to "
                                  "the krakenhll database using the --db parameter in the config file.")

            # if a kraken_txt was supplied then let's run kraken by default
            self.config['krakenhll']['run'] = True

            kraken_annotation_dict = u.get_TAB_delimited_file_as_dictionary(kraken_txt)
            if next(iter(next(iter(kraken_annotation_dict.values())).keys())) != "path":
                raise ConfigError("Your kraken annotation file, '%s', is not formatted properly. "
                                  "anvi'o expects it to have only two columns, and the second "
                                  "column should have the header 'path'." % kraken_txt)
            samples_in_kraken_txt = set(kraken_annotation_dict.keys())
            # get a list of the sample names
            sample_names = set(self.samples_information['sample'])

            wrong_samples_in_kraken_txt = samples_in_kraken_txt - sample_names
            if wrong_samples_in_kraken_txt:
                raise ConfigError("Your kraken annotation file, '%s', contains samples that "
                                  "are not in your samples_txt file, '%s'. Here is an example "
                                  "of such a sample: %s." % (kraken_txt,
                                                             self.get_param_value_from_config('samples_txt'),
                                                             next(iter(wrong_samples_in_kraken_txt))))

            missing_samples_in_kraken_txt = sample_names - samples_in_kraken_txt
            if missing_samples_in_kraken_txt:
                raise ConfigError("Your kraken annotation file, '%s', is missing samples that "
                                  "are in your samples_txt file, '%s'. This is not allowed. "
                                  "Here is an example of such a sample: %s." % (kraken_txt,
                                                                                self.get_param_value_from_config('samples_txt'),
                                                                                next(iter(missing_samples_in_kraken_txt))))
            self.kraken_annotation_dict = kraken_annotation_dict

        if self.get_param_value_from_config(['krakenhll', 'run']):
            if not self.get_param_value_from_config(['krakenhll', '--db']):
                raise ConfigError("In order to run krakenhll, you must provide a path to "
                                  "a database using the --db parameter in the config file.")
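Taken together, the checks above expect kraken_txt to be a TAB-delimited file with exactly two columns, the second headed 'path', and sample names that match samples_txt. A hedged sketch of a valid file and a standalone version of the header check; the file contents and function name are hypothetical:

    # hypothetical kraken_txt contents (TAB-delimited):
    #
    #   sample  path
    #   S01     /path/to/S01-kraken-report.txt
    #   S02     /path/to/S02-kraken-report.txt

    import csv

    def kraken_txt_header_is_valid(path):
        """Return True if the header row has exactly two columns and the second is 'path'."""
        with open(path) as f:
            header = next(csv.reader(f, delimiter='\t'))
        return len(header) == 2 and header[1] == 'path'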