Example #1
0
    def init_dirs_and_dbs(self):
        """Prepare the output directory and create the profile and auxiliary databases.

        Reads the optional description file, sets up the output directory
        (honoring `overwrite_output_destinations`), creates a fresh single
        profile database populated with run metadata, and opens a new
        auxiliary data database for split coverages.

        Raises:
            ConfigError: if no contigs database path was provided.
        """
        if not self.contigs_db_path:
            raise ConfigError("You can not run profiling without a contigs database. You can create\
                                one using 'anvi-gen-contigs-database'. Not sure how? Please see the\
                                tutorial: http://merenlab.org/2015/05/02/anvio-tutorial/")

        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            # the 'rU' open mode was removed in Python 3.11; plain 'r' already
            # performs universal newline translation, and the context manager
            # guarantees the file handle is closed.
            with open(os.path.abspath(self.description_file_path)) as description_file:
                self.description = description_file.read()

        self.output_directory = filesnpaths.check_output_directory(self.output_directory or self.input_file_path + '-ANVIO_PROFILE',\
                                                                   ok_if_exists=self.overwrite_output_destinations)

        self.progress.new('Initializing')

        self.progress.update('Creating the output directory ...')
        filesnpaths.gen_output_directory(self.output_directory, self.progress, delete_if_exists=self.overwrite_output_destinations)

        self.progress.update('Creating a new single profile database with contigs hash "%s" ...' % self.a_meta['contigs_db_hash'])
        self.profile_db_path = self.generate_output_destination('PROFILE.db')
        profile_db = dbops.ProfileDatabase(self.profile_db_path)

        # SCV profiling depends on SNV profiling, so skipping the latter
        # implies skipping the former as well:
        if self.skip_SNV_profiling:
            self.profile_SCVs = False

        meta_values = {'db_type': 'profile',
                       'anvio': __version__,
                       'sample_id': self.sample_id,
                       'samples': self.sample_id,
                       'merged': False,
                       'blank': self.blank,
                       'items_ordered': False,
                       'default_view': 'single',
                       'min_contig_length': self.min_contig_length,
                       'max_contig_length': self.max_contig_length,
                       'SNVs_profiled': not self.skip_SNV_profiling,
                       'SCVs_profiled': self.profile_SCVs,
                       'min_coverage_for_variability': self.min_coverage_for_variability,
                       'report_variability_full': self.report_variability_full,
                       'contigs_db_hash': self.a_meta['contigs_db_hash'],
                       'description': self.description if self.description else '_No description is provided_'}
        profile_db.create(meta_values)

        self.progress.update('Creating a new auxiliary database with contigs hash "%s" ...' % self.a_meta['contigs_db_hash'])
        self.auxiliary_db_path = self.generate_output_destination('AUXILIARY-DATA.db')
        self.auxiliary_db = auxiliarydataops.AuxiliaryDataForSplitCoverages(self.auxiliary_db_path,
                                                                            self.a_meta['contigs_db_hash'],
                                                                            create_new=True,
                                                                            run=null_run,
                                                                            progress=null_progress)

        self.progress.end()

        if self.skip_SNV_profiling:
            self.run.warning('Single-nucleotide variation will not be characterized for this profile.')

        if not self.profile_SCVs:
            self.run.warning('Amino acid linkmer frequencies will not be characterized for this profile.')
Example #2
0
    def check_params(self):
        """Validate user-supplied pangenome parameters and set up output paths.

        Raises:
            ConfigError: if any parameter is of the wrong type, out of range,
                or inconsistent with another parameter.
        """
        # if the user did not set a specific output directory name, use the project name
        # for it:
        self.output_dir = self.output_dir if self.output_dir else self.project_name

        # deal with the output directory:
        try:
            filesnpaths.is_file_exists(self.output_dir)
        except FilesNPathsError:
            filesnpaths.gen_output_directory(
                self.output_dir,
                delete_if_exists=self.overwrite_output_destinations)

        filesnpaths.is_output_dir_writable(self.output_dir)
        self.output_dir = os.path.abspath(self.output_dir)

        if not self.log_file_path:
            self.log_file_path = self.get_output_file_path('log.txt')

        filesnpaths.is_output_file_writable(self.log_file_path)
        # remove any leftover log file from a previous run (a plain `if`
        # statement instead of a conditional expression used for its side effect):
        if os.path.exists(self.log_file_path):
            os.remove(self.log_file_path)

        if not isinstance(self.minbit, float):
            raise ConfigError("minbit value must be of type float :(")

        if self.minbit < 0 or self.minbit > 1:
            raise ConfigError(
                "Well. minbit must be between 0 and 1. Yes. Very boring.")

        if not isinstance(self.min_percent_identity, float):
            raise ConfigError(
                "Minimum percent identity value must be of type float :(")

        if self.min_percent_identity < 0 or self.min_percent_identity > 100:
            raise ConfigError(
                "Minimum percent identity must be between 0%% and 100%%. Although your %.2f%% is "
                "pretty cute, too." % self.min_percent_identity)

        # every genome entry must carry a 'genome_hash' key:
        if any('genome_hash' not in c for c in self.genomes.values()):
            raise ConfigError(
                "self.genomes does not seem to be a properly formatted dictionary for "
                "the anvi'o class Pangenome.")

        if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
            raise ConfigError(
                "You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering "
                "while also asking it to enforce it.")

        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            # the 'rU' open mode was removed in Python 3.11; plain 'r' already
            # performs universal newline translation, and the context manager
            # guarantees the file handle is closed.
            with open(os.path.abspath(self.description_file_path)) as description_file:
                self.description = description_file.read()

        self.pan_db_path = self.get_output_file_path(self.project_name +
                                                     '-PAN.db')
Example #3
0
    def init_dirs_and_dbs(self):
        """Prepare the output directory and create the profile and auxiliary databases.

        Reads the optional description file, sets up the output directory
        (honoring `overwrite_output_destinations`), creates a fresh single
        profile database populated with run metadata, and opens a new
        auxiliary data database for split coverages.

        Raises:
            ConfigError: if no contigs database path was provided.
        """
        if not self.contigs_db_path:
            raise ConfigError("You can not run profiling without a contigs database. You can create\
                                one using 'anvi-gen-contigs-database'. Not sure how? Please see the\
                                tutorial: http://merenlab.org/2015/05/02/anvio-tutorial/")

        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            # the 'rU' open mode was removed in Python 3.11; plain 'r' already
            # performs universal newline translation, and the context manager
            # guarantees the file handle is closed.
            with open(os.path.abspath(self.description_file_path)) as description_file:
                self.description = description_file.read()

        self.output_directory = filesnpaths.check_output_directory(self.output_directory or self.input_file_path + '-ANVIO_PROFILE',\
                                                                   ok_if_exists=self.overwrite_output_destinations)

        self.progress.new('Initializing')

        self.progress.update('Creating the output directory ...')
        filesnpaths.gen_output_directory(self.output_directory, self.progress, delete_if_exists=self.overwrite_output_destinations)

        self.progress.update('Creating a new single profile database with contigs hash "%s" ...' % self.a_meta['contigs_db_hash'])
        self.profile_db_path = self.generate_output_destination('PROFILE.db')
        profile_db = dbops.ProfileDatabase(self.profile_db_path)

        # SCV profiling depends on SNV profiling, so skipping the latter
        # implies skipping the former as well:
        if self.skip_SNV_profiling:
            self.profile_SCVs = False

        meta_values = {'db_type': 'profile',
                       'anvio': __version__,
                       'sample_id': self.sample_id,
                       'samples': self.sample_id,
                       'merged': False,
                       'blank': self.blank,
                       'items_ordered': False,
                       'default_view': 'single',
                       'min_contig_length': self.min_contig_length,
                       'max_contig_length': self.max_contig_length,
                       'SNVs_profiled': not self.skip_SNV_profiling,
                       'SCVs_profiled': self.profile_SCVs,
                       'min_coverage_for_variability': self.min_coverage_for_variability,
                       'report_variability_full': self.report_variability_full,
                       'contigs_db_hash': self.a_meta['contigs_db_hash'],
                       'description': self.description if self.description else '_No description is provided_'}
        profile_db.create(meta_values)

        self.progress.update('Creating a new auxiliary database with contigs hash "%s" ...' % self.a_meta['contigs_db_hash'])
        self.auxiliary_db_path = self.generate_output_destination('AUXILIARY-DATA.db')
        self.auxiliary_db = auxiliarydataops.AuxiliaryDataForSplitCoverages(self.auxiliary_db_path,
                                                                            self.a_meta['contigs_db_hash'],
                                                                            create_new=True,
                                                                            run=null_run,
                                                                            progress=null_progress)

        self.progress.end()

        if self.skip_SNV_profiling:
            self.run.warning('Single-nucleotide variation will not be characterized for this profile.')

        if not self.profile_SCVs:
            self.run.warning('Amino acid linkmer frequencies will not be characterized for this profile.')
Example #4
0
    def check_params(self):
        """Validate user-supplied pangenome parameters and set up output paths.

        Raises:
            ConfigError: if the project name is missing, or any parameter is
                of the wrong type, out of range, or inconsistent with another.
        """
        # check the project name:
        if not self.project_name:
            raise ConfigError("Please set a project name, and be prepared to see it around as (1) anvi'o will use\
                                that name to set the output directory and to name various output files such as the\
                                databases that will be generated at the end of the process. If you set your own output\
                                directory name, you can have multiple projects in it and all of those projects can use\
                                the same intermediate files whenever possible.")

        utils.is_this_name_OK_for_database('pan project name', self.project_name, stringent=False)

        # if the user did not set a specific output directory name, use the project name
        # for it:
        self.output_dir = self.output_dir if self.output_dir else self.project_name

        # deal with the output directory:
        try:
            filesnpaths.is_file_exists(self.output_dir)
        except FilesNPathsError:
            filesnpaths.gen_output_directory(self.output_dir, delete_if_exists=self.overwrite_output_destinations)

        filesnpaths.is_output_dir_writable(self.output_dir)
        self.output_dir = os.path.abspath(self.output_dir)

        if not self.log_file_path:
            self.log_file_path = self.get_output_file_path('log.txt')

        filesnpaths.is_output_file_writable(self.log_file_path)
        # remove any leftover log file from a previous run (a plain `if`
        # statement instead of a conditional expression used for its side effect):
        if os.path.exists(self.log_file_path):
            os.remove(self.log_file_path)

        if not isinstance(self.maxbit, float):
            raise ConfigError("maxbit value must be of type float :(")

        if self.maxbit < 0 or self.maxbit > 1:
            raise ConfigError("Well. maxbit must be between 0 and 1. Yes. Very boring.")

        if not isinstance(self.min_percent_identity, float):
            raise ConfigError("Minimum percent identity value must be of type float :(")

        if self.min_percent_identity < 0 or self.min_percent_identity > 100:
            raise ConfigError("Minimum percent identity must be between 0%% and 100%%. Although your %.2f%% is\
                               pretty cute, too." % self.min_percent_identity)

        # every genome entry must carry a 'genome_hash' key:
        if any('genome_hash' not in c for c in self.genomes.values()):
            raise ConfigError("self.genomes does not seem to be a properly formatted dictionary for\
                               the anvi'o class Pangenome.")

        if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
            raise ConfigError("You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering\
                               while also asking it to enforce it.")

        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            # the 'rU' open mode was removed in Python 3.11; plain 'r' already
            # performs universal newline translation, and the context manager
            # guarantees the file handle is closed.
            with open(os.path.abspath(self.description_file_path)) as description_file:
                self.description = description_file.read()

        self.pan_db_path = self.get_output_file_path(self.project_name + '-PAN.db')
Example #5
0
    def process_single_order_data(self, single_order_path, single_order_name):
        """Just inject a single order into the `self.samples_order_dict`.

        Args:
            single_order_path: path to a plain text file with exactly one line
                (a newick tree or a comma-separated list of sample names).
            single_order_name: the name under which the order is registered.

        Raises:
            SamplesError: if a path is given without a name, or if the file
                does not contain exactly one line.
        """

        if not single_order_path:
            return

        if not single_order_name:
            raise SamplesError("You provided a file for a single order, but not a name for it. This is a no no :/")

        filesnpaths.is_file_plain_text(single_order_path)

        # the 'rU' open mode was removed in Python 3.11; plain 'r' already
        # performs universal newline translation, and the context manager
        # guarantees the file handle is closed.
        with open(single_order_path) as single_order_file:
            single_order_file_content = [l.strip('\n') for l in single_order_file.readlines()]

        if len(single_order_file_content) != 1:
            raise SamplesError("The single order file should contain a single line of information. It can't have nothing,\
                                it can't have too much. Just a single newick tree, or a comma-separated list of sample\
                                names.")

        _order = single_order_file_content.pop()

        # if you are reading this line, please brace yourself to possibly one of the silliest
        # bunch of lines in the anvi'o codebase. the reason we are doing this this way is quite
        # a long story, and deserves a FIXME, but in order to utilize the excellent function
        # in the filesnpaths module to check the contents of the samples order dict rigorously,
        # we need to have this information in a file. a better way could have been implementing
        # a filesnpaths.is_proper_samples_order_content function next to the currently available
        # filesnpaths.is_proper_samples_order_file (the latter would call the former with a dict
        # and it would be much more flexible), but we can't import utils from within filesnpaths.
        # without utils we don't have a get_TAB_delimited_file_as_dictionary function, and we are
        # definitely not going to implement it in two places :( recovering from a poor design by
        # doing something even poorer? couldn't have we fixed this once and for all instead of
        # writing this paragraph? well. just remember that you are thinking about a rhetorical
        # question in a comment section. so sometimes we do things that are not quite productive.
        temp_samples_order_file_path = filesnpaths.get_temp_file_path()
        with open(temp_samples_order_file_path, 'w') as temp_samples_order_file:
            temp_samples_order_file.write('\t'.join(['attributes', 'basic', 'newick']) + '\n')

            if filesnpaths.is_proper_newick(_order, dont_raise=True):
                temp_samples_order_file.write('\t'.join([single_order_name, '', _order]) + '\n')
                self.samples_order_dict[single_order_name] = {'newick': _order, 'basic': None}
            else:
                temp_samples_order_file.write('\t'.join([single_order_name, _order, '']) + '\n')
                self.samples_order_dict[single_order_name] = {'basic': _order, 'newick': None}

        sample_names_in_samples_order_file = filesnpaths.is_proper_samples_order_file(temp_samples_order_file_path)
        os.remove(temp_samples_order_file_path)

        if not self.sample_names_in_samples_information_file:
            self.sample_names_in_samples_order_file = sample_names_in_samples_order_file

        self.available_orders.add(single_order_name)

        self.run.info('Samples order', "A single order for '%s' is also loaded" % single_order_name, quiet=self.quiet)
Example #6
0
    def process_single_order_data(self, single_order_path, single_order_name):
        """Just inject a single order into the `self.samples_order_dict`.

        Args:
            single_order_path: path to a plain text file with exactly one line
                (a newick tree or a comma-separated list of sample names).
            single_order_name: the name under which the order is registered.

        Raises:
            SamplesError: if a path is given without a name, or if the file
                does not contain exactly one line.
        """

        if not single_order_path:
            return

        if not single_order_name:
            raise SamplesError("You provided a file for a single order, but not a name for it. This is a no no :/")

        filesnpaths.is_file_plain_text(single_order_path)

        # the 'rU' open mode was removed in Python 3.11; plain 'r' already
        # performs universal newline translation, and the context manager
        # guarantees the file handle is closed.
        with open(single_order_path) as single_order_file:
            single_order_file_content = [l.strip('\n') for l in single_order_file.readlines()]

        if len(single_order_file_content) != 1:
            raise SamplesError("The single order file should contain a single line of information. It can't have nothing,\
                                it can't have too much. Just a single newick tree, or a comma-separated list of sample\
                                names.")

        _order = single_order_file_content.pop()

        # if you are reading this line, please brace yourself to possibly one of the silliest
        # bunch of lines in the anvi'o codebase. the reason we are doing this this way is quite
        # a long story, and deserves a FIXME, but in order to utilize the excellent function
        # in the filesnpaths module to check the contents of the samples order dict rigorously,
        # we need to have this information in a file. a better way could have been implementing
        # a filesnpaths.is_proper_samples_order_content function next to the currently available
        # filesnpaths.is_proper_samples_order_file (the latter would call the former with a dict
        # and it would be much more flexible), but we can't import utils from within filesnpaths.
        # without utils we don't have a get_TAB_delimited_file_as_dictionary function, and we are
        # definitely not going to implement it in two places :( recovering from a poor design by
        # doing something even poorer? couldn't have we fixed this once and for all instead of
        # writing this paragraph? well. just remember that you are thinking about a rhetorical
        # question in a comment section. so sometimes we do things that are not quite productive.
        temp_samples_order_file_path = filesnpaths.get_temp_file_path()
        with open(temp_samples_order_file_path, 'w') as temp_samples_order_file:
            temp_samples_order_file.write('\t'.join(['attributes', 'basic', 'newick']) + '\n')

            if filesnpaths.is_proper_newick(_order, dont_raise=True):
                temp_samples_order_file.write('\t'.join([single_order_name, '', _order]) + '\n')
                self.samples_order_dict[single_order_name] = {'newick': _order, 'basic': None}
            else:
                temp_samples_order_file.write('\t'.join([single_order_name, _order, '']) + '\n')
                self.samples_order_dict[single_order_name] = {'basic': _order, 'newick': None}

        sample_names_in_samples_order_file = filesnpaths.is_proper_samples_order_file(temp_samples_order_file_path)
        os.remove(temp_samples_order_file_path)

        if not self.sample_names_in_samples_information_file:
            self.sample_names_in_samples_order_file = sample_names_in_samples_order_file

        self.available_orders.add(single_order_name)

        self.run.info('Samples order', "A single order for '%s' is also loaded" % single_order_name, quiet=self.quiet)
Example #7
0
    def check_params(self):
        """Validate user-supplied pangenome parameters and set up output paths.

        Raises:
            ConfigError: if any parameter is of the wrong type, out of range,
                or inconsistent with another parameter.
        """
        # if the user did not set a specific output directory name, use the project name
        # for it:
        self.output_dir = self.output_dir if self.output_dir else self.project_name

        # deal with the output directory:
        try:
            filesnpaths.is_file_exists(self.output_dir)
        except FilesNPathsError:
            filesnpaths.gen_output_directory(self.output_dir, delete_if_exists=self.overwrite_output_destinations)

        filesnpaths.is_output_dir_writable(self.output_dir)
        self.output_dir = os.path.abspath(self.output_dir)

        if not self.log_file_path:
            self.log_file_path = self.get_output_file_path('log.txt')

        filesnpaths.is_output_file_writable(self.log_file_path)
        # remove any leftover log file from a previous run (a plain `if`
        # statement instead of a conditional expression used for its side effect):
        if os.path.exists(self.log_file_path):
            os.remove(self.log_file_path)

        if not isinstance(self.minbit, float):
            raise ConfigError("minbit value must be of type float :(")

        if self.minbit < 0 or self.minbit > 1:
            raise ConfigError("Well. minbit must be between 0 and 1. Yes. Very boring.")

        if not isinstance(self.min_percent_identity, float):
            raise ConfigError("Minimum percent identity value must be of type float :(")

        if self.min_percent_identity < 0 or self.min_percent_identity > 100:
            raise ConfigError("Minimum percent identity must be between 0%% and 100%%. Although your %.2f%% is\
                               pretty cute, too." % self.min_percent_identity)

        # every genome entry must carry a 'genome_hash' key:
        if any('genome_hash' not in c for c in self.genomes.values()):
            raise ConfigError("self.genomes does not seem to be a properly formatted dictionary for\
                               the anvi'o class Pangenome.")

        if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
            raise ConfigError("You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering\
                               while also asking it to enforce it.")

        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            # the 'rU' open mode was removed in Python 3.11; plain 'r' already
            # performs universal newline translation, and the context manager
            # guarantees the file handle is closed.
            with open(os.path.abspath(self.description_file_path)) as description_file:
                self.description = description_file.read()

        self.pan_db_path = self.get_output_file_path(self.project_name + '-PAN.db')
Example #8
0
    def sanity_check(self):
        """Check that the single profile databases to be merged are compatible.

        Verifies the output directory and the contigs database, checks that
        sample ids across input profiles are unique, that all profiles were
        generated with identical parameters, and that every profile was run
        against the same contigs database (matched by hash).

        Raises:
            ConfigError: if any compatibility check fails (unless `--force`
                downgrades parameter mismatches to warnings).
        """
        self.output_directory = filesnpaths.check_output_directory(
            self.output_directory,
            ok_if_exists=self.overwrite_output_destinations)

        if not self.contigs_db_path:
            raise ConfigError(
                "You must provide a contigs database for this operation.")

        if not os.path.exists(self.contigs_db_path):
            raise ConfigError(
                "Anvi'o couldn't find the contigs database where you said it would be :/"
            )

        if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
            raise ConfigError(
                "You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering\
                                while also asking it to enforce it.")

        self.check_dbs_to_be_merged()

        self.populate_profile_dbs_info_dict()

        self.populate_layer_additional_data_dict()

        # sample ids must be unique across all single profiles:
        self.sample_ids_found_in_input_dbs = sorted([
            v['sample_id'] for v in list(self.profile_dbs_info_dict.values())
        ])
        if len(self.profile_dbs_info_dict) != len(
                set(self.sample_ids_found_in_input_dbs)):
            raise ConfigError(
                "Sample ids in each single profile database to be merged must be unique. But it is not the case\
                               with your input :/ Here are the sample names in case you would like to find out which ones occur\
                               more than once: '%s'" %
                (', '.join(self.sample_ids_found_in_input_dbs)))

        # test open the contigs database (and learn its hash while doing it) to make sure we don't have
        # a deal breaker just yet
        contigs_db = dbops.ContigsDatabase(self.contigs_db_path, quiet=True)
        contigs_db_hash = contigs_db.meta['contigs_db_hash']
        contigs_db.disconnect()

        # every parameter below must be identical across all profiles; `--force`
        # downgrades a mismatch from an error to a warning:
        for k, p in [
            ('total_length', 'The number of nucleotides described'),
            ('num_contigs', 'The number of contigs'),
            ('version', 'The version number'),
            ('num_splits', 'The number of splits'),
            ('min_contig_length', 'The minimum contig length (-M) values'),
            ('max_contig_length',
             'The maximum contig length (--max-contig-length) values'),
            ('min_coverage_for_variability',
             'The minimum coverage values to report variability (-V)'),
            ('report_variability_full',
             'Whether to report full variability (--report-variability-full) flags'
             ), ('SCVs_profiled', 'Profile SCVs flags (--profile-SCVs)'),
            ('SNVs_profiled', 'SNV profiling flags (--skip-SNV-profiling)')
        ]:
            v = set([r[k] for r in list(self.profile_dbs_info_dict.values())])
            if len(v) > 1:
                if anvio.FORCE:
                    self.run.warning(
                        "Anvio'o found out that %s is not identical across all your profiles, but since you\
                                      have used the `--force` flag, it will continue with the merge. This is very\
                                      dangerous, and even if merging finishes succesfully, it does not mean you can trust\
                                      your results to be error free. We believe you are prepared to deal with potential\
                                      implications of forcing things because you are awesome."
                        % p,
                        lc="cyan")
                else:
                    raise ConfigError(
                        "Ouch. %s are not identical for all profiles to be merged, which is a \
                                       deal breaker. All profiles that are going to be merged must be\
                                       run with identical flags and parameters :/ You really shouldn't but if you want to\
                                       try to force things because you believe this is due to a misunderstanding, you can\
                                       use the flag --force. While you are considering this as an option, please also\
                                       remember that this we advice against it.."
                        % p)

        # get split names from one of the profile databases. split names must be identical across all
        self.split_names = sorted(
            list(
                utils.get_all_item_names_from_the_database(
                    list(self.profile_dbs_info_dict.keys())[0])))

        # make sure all runs were profiled using the same contigs database (if one used):
        hashes_for_profile_dbs = set([
            r['contigs_db_hash'] for r in self.profile_dbs_info_dict.values()
        ])
        if len(hashes_for_profile_dbs) != 1:
            if None in hashes_for_profile_dbs:
                raise ConfigError(
                    "It seems there is at least one run in the mix that was profiled using an\
                                          contigs database, and at least one other that was profiled without using\
                                          one. This is not good. All runs must be profiled using the same contigs\
                                          database, or all runs must be profiled without a contigs database :/"
                )
            else:
                raise ConfigError(
                    "It seems these runs were profiled using different contigs databases (or\
                                          different versions of the same contigs database). All runs must be\
                                          profiled using the same contigs database, or all runs must be profiled\
                                          without a contigs database :/")

        # make sure the hash for contigs db is identical across all profile databases:
        if list(hashes_for_profile_dbs)[0] != contigs_db_hash:
            raise ConfigError(
                "The contigs database you provided, which is identified with hash '%s', does\
                                      not seem to match the run profiles you are trying to merge, which share the\
                                      hash identifier of '%s'. What's up with that?"
                % (contigs_db_hash, list(hashes_for_profile_dbs)[0]))

        # do we have a description file?
        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            # the 'rU' open mode was removed in Python 3.11; plain 'r' already
            # performs universal newline translation, and the context manager
            # guarantees the file handle is closed.
            with open(os.path.abspath(self.description_file_path)) as description_file:
                self.description = description_file.read()
Example #9
0
    def read_anvio_markdown(self, file_path):
        """Reads markdown descriptions filling in anvi'o variables.

        Basically a lot of l_l83Я 1337 Я0XX0ЯZ stuff's going on down there, so you better run while you can.

        Args:
            file_path: path to a plain-text markdown file whose `%(...)s`
                placeholders are filled from
                `self.anvio_markdown_variables_conversion_dict`.

        Returns:
            The processed markdown content as a single string.

        Raises:
            ConfigError: if a placeholder variable is unknown, the formatting
                fails, or the {{ codestart }} / {{ codestop }} tags are unbalanced.
        """

        filesnpaths.is_file_plain_text(file_path)

        if not len(self.anvio_markdown_variables_conversion_dict):
            self.init_anvio_markdown_variables_conversion_dict()

        # a context manager guarantees the file handle is closed:
        with open(file_path) as markdown_file:
            markdown_content = markdown_file.read()

        # this is quite a big deal thing to do here:
        try:
            markdown_content = markdown_content % self.anvio_markdown_variables_conversion_dict
        except KeyError as e:
            self.progress.end()
            raise ConfigError(
                "One of the variables, %s, in '%s' is not yet described anywhere :/ If it is not a typo but "
                "a new artifact, you can add it to the file `anvio/docs/__init__.py`. After which everything "
                "should work. But please also remember to update provides / requires statements of programs "
                "for everything to be linked together." % (e, file_path))
        except Exception as e:
            self.progress.end()
            raise ConfigError(
                "Something went wrong while working with '%s' :/ This is what we know: '%s'."
                % (file_path, e))

        # now we have replaced anvi'o variables with markdown links, it is time to replace
        # hyphens in anvi'o codeblocks with HTML hyphens so markdown does not freak out when it is
        # time to visualize these and replace -- characters with en dash.
        markdown_lines = markdown_content.split('\n')
        line_nums_for_codestart_tags = [
            i for i, line in enumerate(markdown_lines)
            if line.strip() == "{{ codestart }}"
        ]
        line_nums_for_codestop_tags = [
            i for i, line in enumerate(markdown_lines)
            if line.strip() == "{{ codestop }}"
        ]

        if len(line_nums_for_codestart_tags) != len(
                line_nums_for_codestop_tags):
            raise ConfigError(
                "In %s, the number of {{ codestart }} tags do not match to the number of {{ codestop }} tags :/"
                % file_path)

        # escape markdown-significant characters inside each code block:
        for line_start, line_end in zip(line_nums_for_codestart_tags,
                                        line_nums_for_codestop_tags):
            for line_num in range(line_start + 1, line_end):
                markdown_lines[line_num] = markdown_lines[line_num].replace(
                    "-", "&#45;").replace("*",
                                          "&#42;").replace("==", "&#61;&#61;")

        # all lines are processed: merge them back into a single text:
        markdown_content = '\n'.join(markdown_lines)

        # now we have a proper markdown, it is time to remove anvi'o {{ codestart }} and {{ codestop }} blocks.
        markdown_content = markdown_content.replace(
            """{{ codestart }}""", """<div class="codeblock" markdown="1">""")
        markdown_content = markdown_content.replace("""{{ codestop }}""",
                                                    """</div>""")

        # return it like a pro.
        return markdown_content
Example #10
0
    def sanity_check(self):
        """Validate all inputs before merging single profile databases.

        Checks, in order: the output directory, presence of the contigs
        database, mutually exclusive clustering flags, uniqueness of sample
        ids across input profiles, consistency of run parameters across
        profiles, identical split names source, and that every profile was
        generated from the same contigs database. Also reads the optional
        description file into `self.description`.

        Raises ConfigError if any precondition for merging is violated.
        """
        self.output_directory = filesnpaths.check_output_directory(
            self.output_directory,
            ok_if_exists=self.overwrite_output_destinations)

        if not self.contigs_db_path:
            raise ConfigError(
                "You must provide a contigs database for this operation.")

        if not os.path.exists(self.contigs_db_path):
            raise ConfigError(
                "Anvi'o couldn't find the contigs database where you said it would be :/"
            )

        # these two flags are mutually exclusive: one forbids, one demands
        if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
            raise ConfigError(
                "You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering\
                                while also asking it to enforce it.")

        self.populate_profile_dbs_info_dict()

        # sample ids must be unique across all single profiles to be merged
        self.sample_ids_found_in_input_dbs = sorted([
            v['sample_id'] for v in list(self.profile_dbs_info_dict.values())
        ])
        if len(self.profile_dbs_info_dict) != len(
                set(self.sample_ids_found_in_input_dbs)):
            raise ConfigError(
                "Sample ids in each single profile database to be merged must be unique. But it is not the case\
                               with your input :/ Here are the sample names in case you would like to find out which ones occur\
                               more than once: '%s'" %
                (', '.join(self.sample_ids_found_in_input_dbs)))

        # test open the contigs database (and learn its hash while doing it) to make sure we don't have
        # a deal breaker just yet
        contigs_db = dbops.ContigsDatabase(self.contigs_db_path, quiet=True)
        contigs_db_hash = contigs_db.meta['contigs_db_hash']
        contigs_db.disconnect()

        # every run parameter below must be identical across all profiles;
        # k is the metadata key, p is the human-readable description used
        # in the error message when values disagree.
        for k, p in [
            ('total_length', 'The number of nucleotides described'),
            ('num_contigs', 'The number of contigs'),
            ('version', 'The version number'),
            ('num_splits', 'The number of splits'),
            ('min_contig_length', 'The minimum contig length (-M) values'),
            ('min_coverage_for_variability',
             'The minimum coverage values to report variability (-V)'),
            ('report_variability_full',
             'Whether to report full variability (--report-variability-full) flags'
             ),
            ('AA_frequencies_profiled',
             'Profile AA frequencies flags (--profile-AA-frequencies)'),
            ('SNVs_profiled', 'SNV profiling flags (--skip-SNV-profiling)')
        ]:
            v = set([r[k] for r in list(self.profile_dbs_info_dict.values())])
            if len(v) > 1:
                raise ConfigError(
                    "%s are not identical for all profiles to be merged, which is a \
                                    deal breaker. All profiles that are going to be merged must be\
                                    run with identical flags and parameters :/"
                    % p)

        # get split names from one of the profile databases. split names must be identical across all
        self.split_names = sorted(
            list(
                dbops.get_split_names_in_profile_db(
                    list(self.profile_dbs_info_dict.keys())[0])))

        # make sure all runs were profiled using the same contigs database (if one used):
        hashes_for_profile_dbs = set([
            r['contigs_db_hash'] for r in self.profile_dbs_info_dict.values()
        ])
        if len(hashes_for_profile_dbs) != 1:
            if None in hashes_for_profile_dbs:
                raise ConfigError(
                    "It seems there is at least one run in the mix that was profiled using an\
                                          contigs database, and at least one other that was profiled without using\
                                          one. This is not good. All runs must be profiled using the same contigs\
                                          database, or all runs must be profiled without a contigs database :/"
                )
            else:
                raise ConfigError(
                    "It seems these runs were profiled using different contigs databases (or\
                                          different versions of the same contigs database). All runs must be\
                                          profiled using the same contigs database, or all runs must be profiled\
                                          without a contigs database :/")

        # make sure the hash for contigs db is identical across all profile databases:
        if list(hashes_for_profile_dbs)[0] != contigs_db_hash:
            raise ConfigError(
                "The contigs database you provided, which is identified with hash '%s', does\
                                      not seem to match the run profiles you are trying to merge, which share the\
                                      hash identifier of '%s'. What's up with that?"
                % (contigs_db_hash, list(hashes_for_profile_dbs)[0]))

        # do we have a description file?
        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            # 'r' replaces the long-deprecated 'rU' mode (the 'U' flag was
            # removed in Python 3.11 and raises ValueError there); the
            # context manager guarantees the file handle is closed.
            with open(os.path.abspath(self.description_file_path), 'r') as description_file:
                self.description = description_file.read()
Example #11
0
    def sanity_check(self):
        """Validate all inputs before merging single profile databases.

        Checks, in order: the output directory, presence of the contigs
        database, mutually exclusive clustering flags, compatibility of the
        databases to be merged, uniqueness of sample ids, consistency of run
        parameters across profiles, identical split names source, and that
        every profile was generated from the same contigs database. Also
        reads the optional description file into `self.description`.

        Raises ConfigError if any precondition for merging is violated.
        """
        self.output_directory = filesnpaths.check_output_directory(self.output_directory, ok_if_exists=self.overwrite_output_destinations)

        if not self.contigs_db_path:
            raise ConfigError("You must provide a contigs database for this operation.")

        if not os.path.exists(self.contigs_db_path):
            raise ConfigError("Anvi'o couldn't find the contigs database where you said it would be :/")

        # these two flags are mutually exclusive: one forbids, one demands
        if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
            raise ConfigError("You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering\
                                while also asking it to enforce it.")

        self.check_dbs_to_be_merged()

        self.populate_profile_dbs_info_dict()

        self.populate_layer_additional_data_dict()

        # sample ids must be unique across all single profiles to be merged
        self.sample_ids_found_in_input_dbs = sorted([v['sample_id'] for v in list(self.profile_dbs_info_dict.values())])
        if len(self.profile_dbs_info_dict) != len(set(self.sample_ids_found_in_input_dbs)):
            raise ConfigError("Sample ids in each single profile database to be merged must be unique. But it is not the case\
                               with your input :/ Here are the sample names in case you would like to find out which ones occur\
                               more than once: '%s'" % (', '.join(self.sample_ids_found_in_input_dbs)))

        # test open the contigs database (and learn its hash while doing it) to make sure we don't have
        # a deal breaker just yet
        contigs_db = dbops.ContigsDatabase(self.contigs_db_path, quiet=True)
        contigs_db_hash = contigs_db.meta['contigs_db_hash']
        contigs_db.disconnect()

        # every run parameter below must be identical across all profiles;
        # k is the metadata key, p is the human-readable description used
        # in the error message when values disagree.
        for k, p in [('total_length', 'The number of nucleotides described'),
                     ('num_contigs', 'The number of contigs'),
                     ('version', 'The version number'),
                     ('num_splits', 'The number of splits'),
                     ('min_contig_length', 'The minimum contig length (-M) values'),
                     ('max_contig_length', 'The maximum contig length (--max-contig-length) values'),
                     ('min_coverage_for_variability', 'The minimum coverage values to report variability (-V)'),
                     ('report_variability_full', 'Whether to report full variability (--report-variability-full) flags'),
                     ('SCVs_profiled', 'Profile SCVs flags (--profile-SCVs)'),
                     ('SNVs_profiled', 'SNV profiling flags (--skip-SNV-profiling)')]:
            v = set([r[k] for r in list(self.profile_dbs_info_dict.values())])
            if len(v) > 1:
                raise ConfigError("%s are not identical for all profiles to be merged, which is a \
                                    deal breaker. All profiles that are going to be merged must be\
                                    run with identical flags and parameters :/" % p)

        # get split names from one of the profile databases. split names must be identical across all
        self.split_names = sorted(list(utils.get_all_item_names_from_the_database(list(self.profile_dbs_info_dict.keys())[0])))

        # make sure all runs were profiled using the same contigs database (if one used):
        hashes_for_profile_dbs = set([r['contigs_db_hash'] for r in self.profile_dbs_info_dict.values()])
        if len(hashes_for_profile_dbs) != 1:
            if None in hashes_for_profile_dbs:
                raise ConfigError("It seems there is at least one run in the mix that was profiled using an\
                                          contigs database, and at least one other that was profiled without using\
                                          one. This is not good. All runs must be profiled using the same contigs\
                                          database, or all runs must be profiled without a contigs database :/")
            else:
                raise ConfigError("It seems these runs were profiled using different contigs databases (or\
                                          different versions of the same contigs database). All runs must be\
                                          profiled using the same contigs database, or all runs must be profiled\
                                          without a contigs database :/")


        # make sure the hash for contigs db is identical across all profile databases:
        if list(hashes_for_profile_dbs)[0] != contigs_db_hash:
            raise ConfigError("The contigs database you provided, which is identified with hash '%s', does\
                                      not seem to match the run profiles you are trying to merge, which share the\
                                      hash identifier of '%s'. What's up with that?" % (contigs_db_hash, list(hashes_for_profile_dbs)[0]))

        # do we have a description file?
        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            # 'r' replaces the long-deprecated 'rU' mode (the 'U' flag was
            # removed in Python 3.11 and raises ValueError there); the
            # context manager guarantees the file handle is closed.
            with open(os.path.abspath(self.description_file_path), 'r') as description_file:
                self.description = description_file.read()