Example #1
    def init_internal_genomes(self):
        self.progress.new('Initializing internal genomes')

        # to not initialize things over and over again:
        unique_profile_db_path_to_internal_genome_name = self.get_unique_profile_db_path_to_internal_genome_name_dict()

        for profile_db_path in unique_profile_db_path_to_internal_genome_name:
            self.collections = ccollections.Collections()
            self.collections.populate_collections_dict(profile_db_path)

            for genome_name in unique_profile_db_path_to_internal_genome_name[profile_db_path]:
                self.progress.update('working on %s' % (genome_name))
                c = self.genomes[genome_name]
                c['external_genome'] = False

                dbops.is_profile_db_and_contigs_db_compatible(c['profile_db_path'], c['contigs_db_path'])

                split_names_of_interest = self.get_split_names_of_interest_for_internal_genome(c)

                # here we are using the get_contigs_db_info_dict function WITH the split names we found in the collection,
                # which returns a partial summary from the contigs database focusing only on those splits. a small workaround
                # to be able to use the same function for bins in collections:
                contigs_summary = summarizer.ContigSummarizer(c['contigs_db_path'])
                summary_from_contigs_db_summary = contigs_summary.get_contigs_db_info_dict(split_names=split_names_of_interest,
                                                                                           gene_caller_to_use=self.gene_caller)

                for key in summary_from_contigs_db_summary:
                    c[key] = summary_from_contigs_db_summary[key]

        self.progress.end()

        self.run.info('Internal genomes', '%d have been initialized.' % len(self.internal_genome_names))
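
Note: Example #1 relies on a helper, get_unique_profile_db_path_to_internal_genome_name_dict(), that is not shown here. Based on the inlined logic in Examples #4 and #8, a minimal sketch of such a helper could look like the following (the standalone signature and the plain `genomes` dict-of-dicts are assumptions made for illustration, not the anvi'o implementation):

def get_unique_profile_db_path_to_internal_genome_name_dict(genomes, internal_genome_names):
    """Group internal genome names by the profile database they come from.

    `genomes` is assumed to be a dict of dicts that carries a 'profile_db_path'
    entry per genome, as in the examples above.
    """
    unique_profile_db_path_to_internal_genome_name = {}

    for profile_path in set(genomes[g]['profile_db_path'] for g in internal_genome_names):
        unique_profile_db_path_to_internal_genome_name[profile_path] = \
            [g for g in internal_genome_names if genomes[g]['profile_db_path'] == profile_path]

    return unique_profile_db_path_to_internal_genome_name
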
Example #2
    def sanity_check(self):
        self.output_directory = filesnpaths.check_output_directory(
            self.output_directory, ok_if_exists=False)

        if not self.contigs_db_path:
            raise ConfigError(
                "You must provide a contigs database for this operation.")

        if not self.profile_db_path:
            raise ConfigError("No profile db no cookie. Bye.")

        dbops.is_profile_db_and_contigs_db_compatible(self.profile_db_path,
                                                      self.contigs_db_path)

        profile_db = dbops.ProfileDatabase(self.profile_db_path)
        if profile_db.meta['db_type'] != 'profile' or profile_db.meta[
                'blank'] or not profile_db.meta['merged']:
            raise ConfigError(
                "You can only split merged profiles :/ We hope this is not a moment of terrible disappointment.\
                               If it is, you should consider writing to us.")
        profile_db.disconnect()

        self.summary = summarizer.ProfileSummarizer(self.args)
        self.summary.init()

        self.bin_names_of_interest = sorted(self.summary.bin_ids)
        if self.bin_name:
            if self.bin_name not in self.bin_names_of_interest:
                raise ConfigError(
                    "The bin name you wish to split from this profile databse is not in the collection. Busted!"
                )
            else:
                self.bin_names_of_interest = [self.bin_name]
Example #3
    def sanity_check(self):
        self.output_directory = filesnpaths.check_output_directory(self.output_directory, ok_if_exists=False)

        if not self.contigs_db_path:
            raise ConfigError("You must provide a contigs database for this operation.")

        if not self.profile_db_path:
            raise ConfigError("No profile db no cookie. Bye.")

        dbops.is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

        profile_db = dbops.ProfileDatabase(self.profile_db_path)
        if profile_db.meta['blank']:
            raise ConfigError("The anvi-split workflow is not prepared to deal with blank profiles :/ Sorry!")

        if profile_db.meta['db_type'] != 'profile':
            raise ConfigError("Anvi'o was trying to split this profile, but it just realized that it is not a profile\
                               database. There is something wrong here.")
        profile_db.disconnect()

        self.summary = summarizer.ProfileSummarizer(self.args)
        self.summary.init()

        self.bin_names_of_interest = sorted(self.summary.bin_ids)
        if self.bin_name:
            if self.bin_name not in self.bin_names_of_interest:
                raise ConfigError("The bin name you wish to split from this profile databse is not in the collection. Busted!")
            else:
                self.bin_names_of_interest = [self.bin_name]
Example #4
    def init_internal_genomes(self):
        self.progress.new('Initializing internal genomes')

        # to not initialize things over and over again:
        unique_profile_db_path_to_internal_genome_name = {}
        for profile_path in set([self.genomes[g]['profile_db_path'] for g in self.internal_genome_names]):
            unique_profile_db_path_to_internal_genome_name[profile_path] = [g for g in self.internal_genome_names if self.genomes[g]['profile_db_path'] == profile_path]

        for profile_db_path in unique_profile_db_path_to_internal_genome_name:
            self.collections = ccollections.Collections()
            self.collections.populate_collections_dict(profile_db_path, anvio.__profile__version__)

            for genome_name in unique_profile_db_path_to_internal_genome_name[profile_db_path]:
                self.progress.update('working on %s' % (genome_name))
                c = self.genomes[genome_name]

                dbops.is_profile_db_and_contigs_db_compatible(c['profile_db_path'], c['contigs_db_path'])

                # set name
                c['name'] = genome_name

                collection_dict = self.collections.get_collection_dict(c['collection_id'])
                bins_info_dict = self.collections.get_bins_info_dict(c['collection_id'])

                if c['bin_id'] not in bins_info_dict:
                    self.progress.end()
                    raise ConfigError, "You betrayed us :( Genome %s does not appear to be a valid bin in collection %s in %s"\
                                % (c['bin_id'], c['collection_id'], c['profile_db_path'])


                split_names_of_interest = collection_dict[c['bin_id']]
                if not len(split_names_of_interest):
                    raise ConfigError, "There are 0 splits defined for bin id %s in collection %s..." % (c['bin_id'], c['collection_id'])


                contigs_db_summary = summarizer.get_contigs_db_info_dict(c['contigs_db_path'], split_names = split_names_of_interest, include_AA_counts = True)
                for key in contigs_db_summary:
                    c[key] = contigs_db_summary[key]

                # set hash
                c['genome_entry_hash'] = hashlib.sha224('_'.join([split_names_of_interest[0], split_names_of_interest[-1], c['contigs_db_hash']]).encode('utf-8')).hexdigest()
                self.hash_to_genome_name[c['genome_entry_hash']] = genome_name

        self.progress.end()

        if len(set([self.genomes[genome_name]['genome_entry_hash'] for genome_name in self.internal_genome_names])) != len(self.internal_genome_names):
            raise ConfigError, "Not all hash values are unique across internal genomes. This is almost impossible to happen unless something very\
                                wrong with your workflow :/ Please let the developers know if you can't figure this one out"

        # make sure genes are called in every contigs db:
        genomes_missing_gene_calls = [g for g in self.internal_genome_names if not self.genomes[g]['genes_are_called']]
        if len(genomes_missing_gene_calls):
            raise ConfigError('Genes must have been called during the generation of the contigs database for this workflow to work. However,\
                                these internal genomes do not have gene calls: %s' % (', '.join(genomes_missing_gene_calls)))

        self.run.info('Internal genomes', '%d have been initialized.' % len(self.internal_genome_names))
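
Note on the hash computation above: in Python 3, hashlib.sha224() requires a bytes-like object, which is why the joined string is encoded before hashing. A minimal standalone illustration (the split names and contigs db hash below are made-up placeholders):

import hashlib

# made-up stand-ins for the first/last split names and the contigs db hash
split_names_of_interest = ['split_00001', 'split_00042']
contigs_db_hash = 'hash8337d1ab'

# join, encode to bytes (required by hashlib under Python 3), then hash
genome_entry_hash = hashlib.sha224(
    '_'.join([split_names_of_interest[0], split_names_of_interest[-1], contigs_db_hash]).encode('utf-8')
).hexdigest()

print(genome_entry_hash)
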
Example #5
    def __init__(self, args, r=run, p=progress):
        self.run = r
        self.progress = p

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.profile_db_path = A('profile_db')
        self.contigs_db_path = A('contigs_db')
        self.num_clusters_requested = A('num_clusters_requested') or 80

        dbops.is_profile_db_and_contigs_db_compatible(self.profile_db_path,
                                                      self.contigs_db_path)

        self.clusters = {}

        self.lengths = {}
        self.kmers = {}
        self.coverages = {}

        self.debug = args.debug

        self.progress.new('Init')

        self.progress.update('accessing the profile database ...')
        profile_db = dbops.ProfileDatabase(args.profile_db)

        if not int(profile_db.meta['merged']):
            self.progress.end()
            raise ConfigError(
                'CONCOCT can only be used to cluster merged runs...')

        self.coverages = profile_db.db.get_table_as_dict(
            'mean_coverage_contigs', columns_of_interest=profile_db.samples)
        profile_db.disconnect()

        self.progress.update('accessing the contigs database ...')
        contigs_db = dbops.ContigsDatabase(args.contigs_db, quiet=True)
        self.kmers = contigs_db.db.get_table_as_dict(
            'kmer_contigs', keys_of_interest=list(self.coverages.keys()))
        splits_basic_info = contigs_db.db.get_table_as_dict(
            'splits_basic_info', keys_of_interest=list(self.coverages.keys()))
        contigs_db.disconnect()

        self.progress.update('computing split lengths ...')
        for split_name in splits_basic_info:
            self.lengths[split_name] = splits_basic_info[split_name]['length']

        self.progress.end()
Example #6
    def __init__(self, args, r=run, p=progress):
        self.run = r
        self.progress = p
        self.profile_db_path = args.profile_db
        self.contigs_db_path = args.contigs_db

        dbops.is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

        self.clusters = {}

        self.lengths = {}
        self.kmers = {}
        self.coverages = {}

        self.debug = args.debug

        self.progress.new("Init")

        self.progress.update("accessing the profile database ...")
        profile_db = dbops.ProfileDatabase(args.profile_db)

        if not int(profile_db.meta["merged"]):
            self.progress.end()
            raise ConfigError, "CONCOCT can only be used to cluster merged runs..."

        self.coverages = profile_db.db.get_table_as_dict(
            "mean_coverage_contigs", columns_of_interest=profile_db.samples
        )
        profile_db.disconnect()

        self.progress.update("accessing the profile database ...")
        contigs_db = dbops.ContigsDatabase(args.contigs_db, quiet=True)
        self.kmers = contigs_db.db.get_table_as_dict("kmer_contigs", keys_of_interest=self.coverages.keys())
        splits_basic_info = contigs_db.db.get_table_as_dict("splits_basic_info", keys_of_interest=self.coverages.keys())
        contigs_db.disconnect()

        self.progress.update("computing split lengths ...")
        for split_name in splits_basic_info:
            self.lengths[split_name] = splits_basic_info[split_name]["length"]

        self.progress.end()
Example #7
    def __init__(self, args, external_clustering = None):
        self.args = args
        self.views = {}
        self.states_table = None
        self.p_meta = {}
        self.title = 'Unknown Project'

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.profile_db_path = A('profile_db')
        self.contigs_db_path = A('contigs_db')
        self.manual_mode = A('manual_mode')
        self.split_hmm_layers = A('split_hmm_layers')
        self.additional_layers_path = A('additional_layers')
        self.additional_view_path = A('additional_view')
        self.samples_information_db_path = A('samples_information_db')
        self.view = A('view')
        self.fasta_file = A('fasta_file')
        self.view_data_path = A('view_data')
        self.tree = A('tree')
        self.title = A('title')
        self.output_dir = A('output_dir')
        self.show_views = A('show_views')
        self.state = A('state')
        self.show_states = A('show_states')
        self.skip_check_names = A('skip_check_names')

        self.split_names_ordered = None
        self.additional_layers = None

        self.samples_information_dict = {}
        self.samples_order_dict = {}
        self.samples_information_default_layer_order = {}


        self.external_clustering = external_clustering

        self.collections = ccollections.Collections()

        ContigsSuperclass.__init__(self, self.args)

        if self.samples_information_db_path:
            samples_information_db = SamplesInformationDatabase(self.samples_information_db_path)
            self.samples_information_dict, self.samples_order_dict = samples_information_db.get_samples_information_and_order_dicts()
            self.samples_information_default_layer_order = samples_information_db.get_samples_information_default_layer_order()
            samples_information_db.disconnect()

        if self.contigs_db_path:
            self.completeness = completeness.Completeness(self.contigs_db_path)
            self.collections.populate_sources_dict(self.contigs_db_path, anvio.__contigs__version__)
        else:
            self.completeness = None

        if 'skip_init_functions' in args and not args.skip_init_functions:
            self.init_functions()

        # make sure we are not dealing with apples and oranges here.
        if self.contigs_db_path and self.profile_db_path:
            is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

        self.P = lambda x: os.path.join(self.p_meta['output_dir'], x)
        self.cwd = os.getcwd()

        # here is where the big deal stuff takes place:
        if self.manual_mode:
            self.load_from_user_files(args)
        else:
            self.load_from_anvio_files(args)

        # make sure the samples information database, if there is one, is in fact compatible with the profile database
        # the reason we are doing this here is because when we are in 'self.manual_mode', the self.p_meta['samples'] is
        # being filled within the self.load_from_user_files function based on the headers of the view data.
        if self.profile_db_path and self.samples_information_db_path:
            is_profile_db_and_samples_db_compatible(self.profile_db_path, self.samples_information_db_path)

        if self.external_clustering:
            self.p_meta['clusterings'] = self.clusterings = self.external_clustering['clusterings']
            self.p_meta['available_clusterings'] = self.clusterings.keys()
            self.p_meta['default_clustering'] = self.external_clustering['default_clustering']

        if not self.p_meta['clusterings']:
            if self.p_meta['merged']:
                raise ConfigError, "This merged profile database does not seem to have any hierarchical clustering\
                                    that is required by the interactive interface. It may have been generated\
                                    by anvi-merge with `--skip-hierarchical-clustering` flag, or hierarchical\
                                    clustering step may have been skipped automatically by the platform. Please\
                                    read the help menu for anvi-merge, and/or refer to the tutorial: \
                                    http://merenlab.org/2015/05/01/anvio-tutorial/#clustering-during-merging"
            else:
                raise ConfigError, "This single profile database does not seem to have any hierarchical clustering\
                                    that is required by the interactive interface. You must use `--cluster-contigs`\
                                    flag for single profiles to access to this functionality. Please read the help\
                                    menu for anvi-profile, and/or refer to the tutorial."

        tree = Tree(self.p_meta['clusterings'][self.p_meta['default_clustering']]['newick'], format = 1)

        # self.split_names_ordered is going to be the 'master' names list. everything else is going to
        # need to match these names:
        self.split_names_ordered = [n.name for n in tree.get_leaves()]

        # now that we know what splits we are interested in (self.split_names_ordered), we can get rid of all the
        # unnecessary splits stored in views dicts.
        self.prune_view_dicts()

        # if there are any HMM search results in the contigs database other than 'singlecopy' sources,
        # we would like to visualize them as additional layers. following function is inherited from
        # Contigs DB superclass and will fill self.hmm_searches_dict if appropriate data is found in
        # search tables:
        self.init_non_singlecopy_gene_hmm_sources(self.split_names_ordered, return_each_gene_as_a_layer = self.split_hmm_layers)

        if self.additional_layers_path:
            filesnpaths.is_file_tab_delimited(self.additional_layers_path)
            self.additional_layers = self.additional_layers_path

        self.check_names_consistency()
        self.convert_view_data_into_json()
Example #8
    def init_internal_genomes(self):
        self.progress.new('Initializing internal genomes')

        # to not initialize things over and over again:
        unique_profile_db_path_to_internal_genome_name = {}
        for profile_path in set([
                self.genomes[g]['profile_db_path']
                for g in self.internal_genome_names
        ]):
            unique_profile_db_path_to_internal_genome_name[profile_path] = [
                g for g in self.internal_genome_names
                if self.genomes[g]['profile_db_path'] == profile_path
            ]

        for profile_db_path in unique_profile_db_path_to_internal_genome_name:
            self.collections = ccollections.Collections()
            self.collections.populate_collections_dict(
                profile_db_path, anvio.__profile__version__)

            for genome_name in unique_profile_db_path_to_internal_genome_name[
                    profile_db_path]:
                self.progress.update('working on %s' % (genome_name))
                c = self.genomes[genome_name]

                dbops.is_profile_db_and_contigs_db_compatible(
                    c['profile_db_path'], c['contigs_db_path'])

                # set name
                c['name'] = genome_name

                collection_dict = self.collections.get_collection_dict(
                    c['collection_id'])
                bins_info_dict = self.collections.get_bins_info_dict(
                    c['collection_id'])

                if c['bin_id'] not in bins_info_dict:
                    self.progress.end()
                    raise ConfigError, "You betrayed us :( Genome %s does not appear to be a valid bin in collection %s in %s"\
                                % (c['bin_id'], c['collection_id'], c['profile_db_path'])

                split_names_of_interest = collection_dict[c['bin_id']]
                if not len(split_names_of_interest):
                    raise ConfigError, "There are 0 splits defined for bin id %s in collection %s..." % (
                        c['bin_id'], c['collection_id'])

                contigs_db_summary = summarizer.get_contigs_db_info_dict(
                    c['contigs_db_path'],
                    split_names=split_names_of_interest,
                    exclude_partial_gene_calls=self.exclude_partial_gene_calls)
                for key in contigs_db_summary:
                    c[key] = contigs_db_summary[key]

                # set hash
                c['genome_entry_hash'] = hashlib.sha224('_'.join([
                    split_names_of_interest[0], split_names_of_interest[-1],
                    c['contigs_db_hash']
                ]).encode('utf-8')).hexdigest()
                self.hash_to_genome_name[c['genome_entry_hash']] = genome_name

        self.progress.end()

        if len(
                set([
                    self.genomes[genome_name]['genome_entry_hash']
                    for genome_name in self.internal_genome_names
                ])) != len(self.internal_genome_names):
            raise ConfigError, "Not all hash values are unique across internal genomes. This is almost impossible to happen unless something very\
                                wrong with your workflow :/ Please let the developers know if you can't figure this one out"

        # make sure genes are called in every contigs db:
        genomes_missing_gene_calls = [
            g for g in self.internal_genome_names
            if not self.genomes[g]['genes_are_called']
        ]
        if len(genomes_missing_gene_calls):
            raise ConfigError('Genes must have been called during the generation of the contigs database for this workflow to work. However,\
                                these internal genomes do not have gene calls: %s' % (
                ', '.join(genomes_missing_gene_calls)))

        self.run.info(
            'Internal genomes',
            '%d have been initialized.' % len(self.internal_genome_names))
Example #9
    def init_commons(self):
        self.progress.new('Init')

        self.progress.update('Checking the output file path ..')
        if self.output_file_path:
            filesnpaths.is_output_file_writable(self.output_file_path)

        self.progress.update('Checking the samples of interest ..')
        if self.samples_of_interest_path:
            filesnpaths.is_file_exists(self.samples_of_interest_path)
            self.samples_of_interest = set([
                s.strip()
                for s in open(self.samples_of_interest_path).readlines()
            ])
        else:
            self.samples_of_interest = set([])

        self.progress.update('Making sure our databases are here ..')
        if not self.profile_db_path:
            raise ConfigError('You need to provide a profile database.')

        if not self.contigs_db_path:
            raise ConfigError('You need to provide a contigs database.')

        self.progress.update('Making sure our databases are compatible ..')
        dbops.is_profile_db_and_contigs_db_compatible(self.profile_db_path,
                                                      self.contigs_db_path)

        if self.min_coverage_in_each_sample and not self.quince_mode:
            self.progress.end()
            raise ConfigError, "When you sepecify a coverage value through --min-coverage-in-each-sample, you must also\
                                use --quince-mode flag, since the former parameter needs to know the coverage values in all\
                                samples even if variation is reported for only one sample among otheres. This is the only way\
                                to figure out whether variation is not reported for other samples due to low or zero coverage,\
                                or there was no variation to report despite the high coverage. Anvi'o could turn --quince-mode\
                                flat automatically for you, but then it is much better if you have full control and understaning\
                                of what is going on."

        if self.quince_mode:
            self.progress.update('Accessing auxiliary data file ...')
            auxiliary_data_file_path = os.path.join(
                os.path.dirname(self.profile_db_path), 'AUXILIARY-DATA.h5')
            if not os.path.exists(auxiliary_data_file_path):
                raise ConfigError, "Anvi'o needs the auxiliary data file to run this program with '--quince-mode' flag.\
                                    However it wasn't found at '%s' :/" % auxiliary_data_file_path
            self.merged_split_coverage_values = auxiliarydataops.AuxiliaryDataForSplitCoverages(
                auxiliary_data_file_path, None, ignore_hash=True)

        self.progress.update(
            'Attempting to get our splits of interest sorted ..')
        if self.collection_name:
            # the user wants to go with the collection id path. fine. we will get our split names from
            # the profile database.
            if not self.bin_id:
                self.progress.end()
                raise ConfigError('When you declare a collection id, you must also declare a bin name\
                                    (from which the split names of interest will be acquired).')

            if self.splits_of_interest or self.splits_of_interest_path:
                self.progress.end()
                raise ConfigError, "You declared a collection id and one or more bin names so anvi'o can find out\
                                    splits of interest, but you also have specified informaiton for split names?\
                                    This is confusing. You should choose one way or another :/"

            self.splits_of_interest = ccollections.GetSplitNamesInBins(
                self.args).get_split_names_only()
        else:
            # OK. no collection id. we will go old school. we hope to find what we are looking for in
            # self.splits_of_interest_path at this point (which may have been filled through the command
            # line client), or in self.splits_of_interest (which may have been filled in by another program)
            if not self.splits_of_interest:
                if not self.splits_of_interest_path:
                    self.progress.end()
                    raise ConfigError('You did not declare a source for split names. You should either give me\
                                        a file with the split names you are interested in, or a collection id and\
                                        bin name so I can learn the split names from the profile database.')

                filesnpaths.is_file_exists(self.splits_of_interest_path)
                self.splits_of_interest = set([
                    c.strip().replace('\r', '')
                    for c in open(self.splits_of_interest_path).readlines()
                ])

        self.input_file_path = '/' + '/'.join(
            os.path.abspath(self.profile_db_path).split('/')[:-1])

        self.progress.update('Reading the data ...')
        profile_db = dbops.ProfileDatabase(self.profile_db_path)
        self.sample_ids = profile_db.samples  # we set this now, but we will overwrite it with args.samples_of_interest if necessary

        if not profile_db.meta['SNVs_profiled']:
            self.progress.end()
            raise ConfigError, "Well well well. It seems SNVs were not characterized for this profile database.\
                                Sorry, there is nothing to report here!"

        if self.engine == 'NT':
            self.data = profile_db.db.get_table_as_dict(
                t.variable_nts_table_name)
        elif self.engine == 'AA':
            # AA specific stuff. first check whether things were profiled
            if not profile_db.meta['AA_frequencies_profiled']:
                raise ConfigError, "It seems AA frequencies were not characterized for this profile database.\
                                    There is nothing to report here for AAs!"

            # get the data.
            self.data = profile_db.db.get_table_as_dict(
                t.variable_aas_table_name)

            # append split_name information
            for e in self.data.values():
                e['split_name'] = self.gene_callers_id_to_split_name_dict[
                    e['corresponding_gene_call']]
        else:
            raise ConfigError, "VariabilitySuper :: Anvi'o doesn't know what to do with a engine on '%s' yet :/" % self.engine

        profile_db.disconnect()

        self.progress.end()
Example #10
    def init_commons(self):
        self.progress.new('Init')

        self.progress.update('Checking the output file path ..')
        if self.output_file_path:
            filesnpaths.is_output_file_writable(self.output_file_path)

        self.progress.update('Checking the samples of interest ..')
        if self.samples_of_interest_path:
            filesnpaths.is_file_exists(self.samples_of_interest_path)
            self.samples_of_interest = set([s.strip() for s in open(self.samples_of_interest_path).readlines()])
        else:
            self.samples_of_interest = set([])

        self.progress.update('Making sure our databases are here ..')
        if not self.profile_db_path:
            raise ConfigError('You need to provide a profile database.')

        if not self.contigs_db_path:
            raise ConfigError('You need to provide a contigs database.')

        self.progress.update('Making sure our databases are compatible ..')
        dbops.is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

        if self.min_coverage_in_each_sample and not self.quince_mode:
            self.progress.end()
            raise ConfigError, "When you sepecify a coverage value through --min-coverage-in-each-sample, you must also\
                                use --quince-mode flag, since the former parameter needs to know the coverage values in all\
                                samples even if variation is reported for only one sample among otheres. This is the only way\
                                to figure out whether variation is not reported for other samples due to low or zero coverage,\
                                or there was no variation to report despite the high coverage. Anvi'o could turn --quince-mode\
                                flat automatically for you, but then it is much better if you have full control and understaning\
                                of what is going on."

        if self.quince_mode:
            self.progress.update('Accessing auxiliary data file ...')
            auxiliary_data_file_path = os.path.join(os.path.dirname(self.profile_db_path), 'AUXILIARY-DATA.h5')
            if not os.path.exists(auxiliary_data_file_path):
                raise ConfigError, "Anvi'o needs the auxiliary data file to run this program with '--quince-mode' flag.\
                                    However it wasn't found at '%s' :/" % auxiliary_data_file_path
            self.merged_split_coverage_values = auxiliarydataops.AuxiliaryDataForSplitCoverages(auxiliary_data_file_path, None, ignore_hash = True)


        self.progress.update('Attempting to get our splits of interest sorted ..')
        if self.collection_name:
            # the user wants to go with the collection id path. fine. we will get our split names from
            # the profile database.
            if not self.bin_id:
                self.progress.end()
                raise ConfigError('When you declare a collection id, you must also declare a bin name\
                                    (from which the split names of interest will be acquired).')
            if self.splits_of_interest or self.splits_of_interest_path:
                self.progress.end()
                raise ConfigError, "You declared a collection id and one or more bin names so anvi'o can find out\
                                    splits of interest, but you also have specified informaiton for split names?\
                                    This is confusing. You should choose one way or another :/"

            self.splits_of_interest = ccollections.GetSplitNamesInBins(self.args).get_split_names_only()
        else:
            # OK. no collection id. we will go old school. we hope to find what we are looking for in
            # self.splits_of_interest_path at this point (which may have been filled through the command
            # line client), or in self.splits_of_interest (which may have been filled in by another program)
            if not self.splits_of_interest:
                if not self.splits_of_interest_path:
                    self.progress.end()
                    raise ConfigError('You did not declare a source for split names. You should either give me\
                                        a file with the split names you are interested in, or a collection id and\
                                        bin name so I can learn the split names from the profile database.')
                filesnpaths.is_file_exists(self.splits_of_interest_path)
                self.splits_of_interest = set([c.strip().replace('\r', '') for c in open(self.splits_of_interest_path).readlines()])

        self.input_file_path = '/' + '/'.join(os.path.abspath(self.profile_db_path).split('/')[:-1])

        self.progress.update('Reading the data ...')
        profile_db = dbops.ProfileDatabase(self.profile_db_path)
        self.sample_ids = profile_db.samples # we set this now, but we will overwrite it with args.samples_of_interest if necessary

        if not profile_db.meta['SNVs_profiled']:
            self.progress.end()
            raise ConfigError, "Well well well. It seems SNVs were not characterized for this profile database.\
                                Sorry, there is nothing to report here!"

        if self.focus == 'NT':
            self.data = profile_db.db.get_table_as_dict(t.variable_nts_table_name)
        else:
            raise ConfigError, "The superclass is inherited with an unknown focus. Anvi'o needs an adult :("

        profile_db.disconnect()

        self.progress.end()
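
Note on Examples #9 and #10: the splits-of-interest resolution follows a simple precedence, where a collection id plus bin name wins, then an in-memory set, then a file of split names. A minimal standalone sketch of that precedence (the function name, plain-argument signature, and ValueError are illustrative simplifications, not the anvi'o API):

def resolve_splits_of_interest(collection_splits=None, splits_of_interest=None, splits_of_interest_path=None):
    """Return a set of split names following the precedence used in Examples #9 and #10.

    `collection_splits` stands in for what GetSplitNamesInBins(args).get_split_names_only()
    would return when a collection id and bin name were provided.
    """
    if collection_splits is not None:
        if splits_of_interest or splits_of_interest_path:
            raise ValueError("Declare either a collection/bin or split names, not both.")
        return set(collection_splits)

    if splits_of_interest:
        return set(splits_of_interest)

    if not splits_of_interest_path:
        raise ValueError("No source for split names was declared.")

    with open(splits_of_interest_path) as input_file:
        return set(line.strip() for line in input_file if line.strip())
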
Example #11
    def __init__(self, args, external_clustering=None):
        self.args = args
        self.views = {}
        self.states_table = None
        self.p_meta = {}
        self.title = 'Unknown Project'

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.mode = A('mode')
        self.profile_db_path = A('profile_db')
        self.contigs_db_path = A('contigs_db')
        self.collection_name = A('collection_name')
        self.manual_mode = A('manual_mode')
        self.split_hmm_layers = A('split_hmm_layers')
        self.taxonomic_level = A('taxonomic_level')
        self.additional_layers_path = A('additional_layers')
        self.additional_view_path = A('additional_view')
        self.samples_information_db_path = A('samples_information_db')
        self.view = A('view')
        self.fasta_file = A('fasta_file')
        self.view_data_path = A('view_data')
        self.tree = A('tree')
        self.title = A('title')
        self.output_dir = A('output_dir')
        self.show_views = A('show_views')
        self.state_autoload = A('state_autoload')
        self.collection_autoload = A('collection_autoload')
        self.show_states = A('show_states')
        self.skip_check_names = A('skip_check_names')
        self.list_collections = A('list_collections')
        self.distance = A('distance') or constants.distance_metric_default
        self.linkage = A('linkage') or constants.linkage_method_default

        # make sure early on that both the distance and linkage are OK.
        clustering.is_distance_and_linkage_compatible(self.distance,
                                                      self.linkage)

        self.split_names_ordered = None
        self.additional_layers = None
        self.auxiliary_profile_data_available = False

        self.samples_information_dict = {}
        self.samples_order_dict = {}
        self.samples_information_default_layer_order = {}

        # make sure the mode will be set properly
        if self.collection_name and self.manual_mode:
            raise ConfigError, "You can't anvi-interactive in manual mode with a collection name."

        self.external_clustering = external_clustering

        self.collections = ccollections.Collections()

        ContigsSuperclass.__init__(self, self.args)
        self.init_splits_taxonomy(self.taxonomic_level)

        if self.samples_information_db_path:
            samples_information_db = SamplesInformationDatabase(
                self.samples_information_db_path)
            self.samples_information_dict, self.samples_order_dict = samples_information_db.get_samples_information_and_order_dicts(
            )
            self.samples_information_default_layer_order = samples_information_db.get_samples_information_default_layer_order(
            )
            samples_information_db.disconnect()

        if self.contigs_db_path:
            self.completeness = Completeness(self.contigs_db_path)
            self.collections.populate_collections_dict(
                self.contigs_db_path, anvio.__contigs__version__)
        else:
            self.completeness = None

        if 'skip_init_functions' in args and not args.skip_init_functions:
            self.init_functions()

        # make sure we are not dealing with apples and oranges here.
        if self.contigs_db_path and self.profile_db_path:
            is_profile_db_and_contigs_db_compatible(self.profile_db_path,
                                                    self.contigs_db_path)

        self.P = lambda x: os.path.join(self.p_meta['output_dir'], x)
        self.cwd = os.getcwd()

        # here is where the big deal stuff takes place:
        if not self.mode and self.manual_mode:
            self.mode = 'manual'
            self.run.info('Mode', self.mode, mc='red')
            self.load_manual_mode(args)
        elif self.mode == 'refine':
            self.load_full_mode(args)
        elif self.collection_name or self.list_collections:
            self.mode = 'collection'
            self.run.info('Mode', self.mode, mc='green')
            self.load_collection_mode(args)
        else:
            self.mode = 'full'
            self.load_full_mode(args)

        # make sure the samples information database, if there is one, is in fact compatible with the profile database
        # the reason we are doing this here is because when we are in 'self.manual_mode', the self.p_meta['samples'] is
        # being filled within the self.load_manual_mode function based on the headers of the view data.
        if self.profile_db_path and self.samples_information_db_path:
            is_profile_db_and_samples_db_compatible(
                self.profile_db_path,
                self.samples_information_db_path,
                manual_mode_exception=self.manual_mode)

        if self.external_clustering:
            self.p_meta[
                'clusterings'] = self.clusterings = self.external_clustering[
                    'clusterings']
            self.p_meta['available_clusterings'] = self.clusterings.keys()
            self.p_meta['default_clustering'] = self.external_clustering[
                'default_clustering']

        if not self.state_autoload and 'default' in self.states_table.states:
            self.state_autoload = 'default'

        if not self.collection_autoload and 'default' in self.collections.collections_dict:
            self.collection_autoload = 'default'

        if not self.p_meta['clusterings']:
            if self.p_meta['merged']:
                raise ConfigError, "This merged profile database does not seem to have any hierarchical clustering\
                                    of splits that is required by the interactive interface. It may have been generated\
                                    by anvi-merge with the `--skip-hierarchical-clustering` flag, or hierarchical\
                                    clustering step may have been skipped by anvi-merge because you had too many stplits\
                                    to get the clustering in a reasonable amount of time. Please read the help menu for\
                                    anvi-merge, and/or refer to the tutorial: \
                                    http://merenlab.org/2015/05/01/anvio-tutorial/#clustering-during-merging"

            else:
                raise ConfigError, "This single profile database does not seem to have any hierarchical clustering\
                                    that is required by the interactive interface. You must use `--cluster-contigs`\
                                    flag for single profiles to access to this functionality. Please read the help\
                                    menu for anvi-profile, and/or refer to the tutorial."

        # self.split_names_ordered is going to be the 'master' names list. everything else is going to
        # need to match these names:
        self.split_names_ordered = utils.get_names_order_from_newick_tree(
            self.p_meta['clusterings'][
                self.p_meta['default_clustering']]['newick'])

        # now that we know what splits we are interested in (self.split_names_ordered), we can get rid of all the
        # unnecessary splits stored in views dicts.
        self.prune_view_dicts()

        # if there are any HMM search results in the contigs database other than 'singlecopy' sources,
        # we would like to visualize them as additional layers. following function is inherited from
        # Contigs DB superclass and will fill self.hmm_searches_dict if appropriate data is found in
        # search tables:
        if self.mode == 'full':
            self.init_non_singlecopy_gene_hmm_sources(
                self.split_names_ordered,
                return_each_gene_as_a_layer=self.split_hmm_layers)

        if self.additional_layers_path:
            filesnpaths.is_file_tab_delimited(self.additional_layers_path)
            self.additional_layers = self.additional_layers_path

        self.check_names_consistency()
        self.convert_view_data_into_json()
Example #12
    def __init__(self, args, external_clustering=None):
        self.args = args
        self.views = {}
        self.states_table = None
        self.p_meta = {}
        self.title = 'Unknown Project'

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.mode = A('mode')
        self.profile_db_path = A('profile_db')
        self.contigs_db_path = A('contigs_db')
        self.collection_name = A('collection_name')
        self.manual_mode = A('manual_mode')
        self.split_hmm_layers = A('split_hmm_layers')
        self.additional_layers_path = A('additional_layers')
        self.additional_view_path = A('additional_view')
        self.samples_information_db_path = A('samples_information_db')
        self.view = A('view')
        self.fasta_file = A('fasta_file')
        self.view_data_path = A('view_data')
        self.tree = A('tree')
        self.title = A('title')
        self.output_dir = A('output_dir')
        self.show_views = A('show_views')
        self.state = A('state')
        self.show_states = A('show_states')
        self.skip_check_names = A('skip_check_names')
        self.list_collections = A('list_collections')
        self.distance = A('distance') or constants.distance_metric_default
        self.linkage = A('linkage') or constants.linkage_method_default

        # make sure early on that both the distance and linkage are OK.
        clustering.is_distance_and_linkage_compatible(self.distance, self.linkage)

        self.split_names_ordered = None
        self.additional_layers = None
        self.auxiliary_profile_data_available = False

        self.samples_information_dict = {}
        self.samples_order_dict = {}
        self.samples_information_default_layer_order = {}

        # make sure the mode will be set properly
        if self.collection_name and self.manual_mode:
            raise ConfigError, "You can't anvi-interactive in manual mode with a collection name."

        self.external_clustering = external_clustering

        self.collections = ccollections.Collections()

        ContigsSuperclass.__init__(self, self.args)
        self.init_splits_taxonomy()

        if self.samples_information_db_path:
            samples_information_db = SamplesInformationDatabase(self.samples_information_db_path)
            self.samples_information_dict, self.samples_order_dict = samples_information_db.get_samples_information_and_order_dicts()
            self.samples_information_default_layer_order = samples_information_db.get_samples_information_default_layer_order()
            samples_information_db.disconnect()

        if self.contigs_db_path:
            self.completeness = Completeness(self.contigs_db_path)
            self.collections.populate_collections_dict(self.contigs_db_path, anvio.__contigs__version__)
        else:
            self.completeness = None

        if 'skip_init_functions' in args and not args.skip_init_functions:
            self.init_functions()

        # make sure we are not dealing with apples and oranges here.
        if self.contigs_db_path and self.profile_db_path:
            is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

        self.P = lambda x: os.path.join(self.p_meta['output_dir'], x)
        self.cwd = os.getcwd()

        # here is where the big deal stuff takes place:
        if not self.mode and self.manual_mode:
            self.mode = 'manual'
            self.run.info('Mode', self.mode, mc='red')
            self.load_manual_mode(args)
        elif self.mode == 'refine':
            self.load_full_mode(args)
        elif self.collection_name or self.list_collections:
            self.mode = 'collection'
            self.run.info('Mode', self.mode, mc='green')
            self.load_collection_mode(args)
        else:
            self.mode = 'full'
            self.load_full_mode(args)

        # make sure the samples information database, if there is one, is in fact compatible with the profile database
        # the reason we are doing this here is because when we are in 'self.manual_mode', the self.p_meta['samples'] is
        # being filled within the self.load_manual_mode function based on the headers of the view data.
        if self.profile_db_path and self.samples_information_db_path:
            is_profile_db_and_samples_db_compatible(self.profile_db_path, self.samples_information_db_path, manual_mode_exception=self.manual_mode)

        if self.external_clustering:
            self.p_meta['clusterings'] = self.clusterings = self.external_clustering['clusterings']
            self.p_meta['available_clusterings'] = self.clusterings.keys()
            self.p_meta['default_clustering'] = self.external_clustering['default_clustering']

        if not self.state and 'default' in self.states_table.states:
            self.state = 'default'

        if not self.p_meta['clusterings']:
            if self.p_meta['merged']:
                raise ConfigError, "This merged profile database does not seem to have any hierarchical clustering\
                                    of splits that is required by the interactive interface. It may have been generated\
                                    by anvi-merge with the `--skip-hierarchical-clustering` flag, or hierarchical\
                                    clustering step may have been skipped by anvi-merge because you had too many stplits\
                                    to get the clustering in a reasonable amount of time. Please read the help menu for\
                                    anvi-merge, and/or refer to the tutorial: \
                                    http://merenlab.org/2015/05/01/anvio-tutorial/#clustering-during-merging"
            else:
                raise ConfigError, "This single profile database does not seem to have any hierarchical clustering\
                                    that is required by the interactive interface. You must use `--cluster-contigs`\
                                    flag for single profiles to access to this functionality. Please read the help\
                                    menu for anvi-profile, and/or refer to the tutorial."

        # self.split_names_ordered is going to be the 'master' names list. everything else is going to
        # need to match these names:
        self.split_names_ordered = utils.get_names_order_from_newick_tree(self.p_meta['clusterings'][self.p_meta['default_clustering']]['newick'])

        # now that we know what splits we are interested in (self.split_names_ordered), we can get rid of all the
        # unnecessary splits stored in views dicts.
        self.prune_view_dicts()

        # if there are any HMM search results in the contigs database other than 'singlecopy' sources,
        # we would like to visualize them as additional layers. following function is inherited from
        # Contigs DB superclass and will fill self.hmm_searches_dict if appropriate data is found in
        # search tables:
        if self.mode == 'full':
            self.init_non_singlecopy_gene_hmm_sources(self.split_names_ordered, return_each_gene_as_a_layer=self.split_hmm_layers)

        if self.additional_layers_path:
            filesnpaths.is_file_tab_delimited(self.additional_layers_path)
            self.additional_layers = self.additional_layers_path

        self.check_names_consistency()
        self.convert_view_data_into_json()
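
Note: Examples #11 and #12 call utils.get_names_order_from_newick_tree(), which is not shown here, while Example #7 does the same job inline with an ete Tree object. A minimal sketch of such a helper along the lines of Example #7 (the ete3 import and the standalone signature are assumptions for illustration; the real anvi'o helper may differ):

from ete3 import Tree

def get_names_order_from_newick_tree(newick):
    """Return leaf names in the order they appear in a newick string.

    Modeled on the inline logic in Example #7: Tree(newick, format=1),
    then collecting the names of the leaves.
    """
    tree = Tree(newick, format=1)
    return [leaf.name for leaf in tree.get_leaves()]
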