Example #1
    def init_internal_genomes(self):
        self.progress.new('Initializing internal genomes')

        # so we don't initialize things over and over again:
        unique_profile_db_path_to_internal_genome_name = self.get_unique_profile_db_path_to_internal_genome_name_dict()

        for profile_db_path in unique_profile_db_path_to_internal_genome_name:
            self.collections = ccollections.Collections()
            self.collections.populate_collections_dict(profile_db_path)

            for genome_name in unique_profile_db_path_to_internal_genome_name[profile_db_path]:
                self.progress.update('working on %s' % (genome_name))
                c = self.genomes[genome_name]
                c['external_genome'] = False

                utils.is_profile_db_and_contigs_db_compatible(c['profile_db_path'], c['contigs_db_path'])

                split_names_of_interest = self.get_split_names_of_interest_for_internal_genome(c)

                # here we are using the get_contigs_db_info_dict function WITH the split names we found in the
                # collection, which returns a partial summary from the contigs database focusing only on those
                # splits. a small workaround to be able to use the same function for bins in collections:
                contigs_summary = summarizer.ContigSummarizer(c['contigs_db_path'])
                summary_from_contigs_db_summary = contigs_summary.get_contigs_db_info_dict(split_names=split_names_of_interest,
                                                                                           gene_caller_to_use=self.gene_caller)

                for key in summary_from_contigs_db_summary:
                    c[key] = summary_from_contigs_db_summary[key]

        self.progress.end()

        self.run.info('Internal genomes', '%d have been initialized.' % len(self.internal_genome_names))
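
The helper `get_unique_profile_db_path_to_internal_genome_name_dict` (shown expanded in Example #5 below) groups genome names by their profile database so each database is opened only once. A minimal, self-contained sketch of that grouping, with hypothetical genome entries standing in for `self.genomes`:

# Hypothetical stand-in for self.genomes; only the grouping logic matters here.
genomes = {'bin_1': {'profile_db_path': '/data/A/PROFILE.db'},
           'bin_2': {'profile_db_path': '/data/A/PROFILE.db'},
           'bin_3': {'profile_db_path': '/data/B/PROFILE.db'}}

unique_profile_db_path_to_genome_names = {}
for name, entry in genomes.items():
    # every genome is filed under the profile database it lives in:
    unique_profile_db_path_to_genome_names.setdefault(entry['profile_db_path'], []).append(name)

# {'/data/A/PROFILE.db': ['bin_1', 'bin_2'], '/data/B/PROFILE.db': ['bin_3']}
print(unique_profile_db_path_to_genome_names)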
Example #2
    def init(self):
        self.progress.new('Initializing')

        self.progress.update('Getting split names')
        d = ccollections.GetSplitNamesInBins(self.args).get_dict()
        self.bins = d.keys()
        for split_names in d.values():
            self.split_names_of_interest.update(split_names)
        self.progress.end()

        # if the user updates the refinement of a single bin or bins, there shouldn't be multiple copies
        # of that stored in the database. so every time the 'store_refined_bins' function is called,
        # it will check this variable and, (1) if it is empty, store the updates in the db and record them
        # in this variable, (2) if it is not empty, remove the items stored in this variable from the
        # collections dict, and continue with step (1). the starting point is of course self.bins. when
        # store_refined_bins is called the first time, it will read the collection data for collection_name,
        # and remove the bin(s) under analysis from it before it stores the data:
        self.ids_for_already_refined_bins = self.bins

        self.input_directory = os.path.dirname(os.path.abspath(self.profile_db_path))

        self.run.info('Input directory', self.input_directory)
        self.run.info('Collection ID', self.collection_name)
        self.run.info('Number of bins', len(self.bins))
        self.run.info('Number of splits', len(self.split_names_of_interest))

        self.collections = ccollections.Collections()
        self.collections.populate_collections_dict(self.profile_db_path)
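
The split-name bookkeeping at the top of this example is easy to test in isolation: the per-bin dict collapses into one set, so splits shared by more than one bin are counted once. A sketch with made-up bin data in place of `ccollections.GetSplitNamesInBins(...).get_dict()`:

# Hypothetical output of GetSplitNamesInBins: bin name -> split names in that bin.
d = {'Bin_1': {'split_0001', 'split_0002'},
     'Bin_2': {'split_0002', 'split_0003'}}

bins = list(d.keys())
split_names_of_interest = set()
for split_names in d.values():
    split_names_of_interest.update(split_names)

print(len(bins))                        # 2
print(sorted(split_names_of_interest))  # the shared split_0002 appears once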
Example #3
    def __init__(self, args, run=run, progress=progress):
        self.args = args
        self.run = run
        self.progress = progress

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.profile_db_path = A('profile_db')
        self.contigs_db_path = A('contigs_db')
        self.collection_name = A('collection_name')
        self.bin_name = A('bin_id')
        self.output_directory = A('output_dir')

        self.collections = ccollections.Collections()
        self.summary = None
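
The `A = lambda x: ...` idiom used here (and in the other examples) pulls values off an argparse namespace without raising for attributes that were never defined. A minimal demonstration:

import argparse

args = argparse.Namespace(profile_db='PROFILE.db', collection_name='MY_BINS')

# returns None instead of raising for arguments that were never set:
A = lambda x: args.__dict__[x] if x in args.__dict__ else None

print(A('profile_db'))  # 'PROFILE.db'
print(A('output_dir'))  # None (absent from the namespace)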
Example #4
    def __init__(self, args=None, r=run, p=progress):
        self.summary = {}

        self.debug = False
        self.quick = False
        self.profile_db_path = None
        self.contigs_db_path = None
        self.output_directory = None
        self.split_names_per_bin = None
        self.completeness_data_available = False
        self.gene_coverages_data_available = False
        self.non_single_copy_gene_hmm_data_available = False

        self.run = r
        self.progress = p

        DatabasesMetaclass.__init__(self, args, self.run, self.progress)

        # databases are initiated; let's make sure we have gene coverages data available.
        if self.gene_coverages_dict:
            self.gene_coverages_data_available = True

        self.collections = ccollections.Collections()
        self.collections.populate_collections_dict(self.contigs_db_path,
                                                   anvio.__contigs__version__)
        self.collections.populate_collections_dict(self.profile_db_path,
                                                   anvio.__profile__version__)

        self.collection_name = None

        if args:
            if args.list_collections:
                self.collections.list_collections()
                sys.exit()

            self.collection_name = args.collection_name
            self.output_directory = args.output_dir
            self.quick = args.quick_summary
            self.debug = args.debug

        self.sanity_check()

        filesnpaths.gen_output_directory(self.output_directory,
                                         delete_if_exists=True)
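
Example #4 sets conservative defaults first and lets `args` override them afterwards, so the object stays usable even when constructed without arguments. A reduced, illustrative sketch of that ordering (the class and attribute names here are made up):

import argparse

class SummarizerSketch:
    """Illustrative only: conservative defaults first, args overrides second."""

    def __init__(self, args=None):
        # defaults that keep the object usable without any args:
        self.quick = False
        self.debug = False
        self.output_directory = None

        # override only what the caller actually provided:
        if args:
            self.quick = getattr(args, 'quick_summary', self.quick)
            self.debug = getattr(args, 'debug', self.debug)
            self.output_directory = getattr(args, 'output_dir', None)

s = SummarizerSketch(argparse.Namespace(quick_summary=True, debug=False, output_dir='SUMMARY'))
print(s.quick, s.output_directory)  # True SUMMARY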
Example #5
    def init_internal_genomes(self):
        self.progress.new('Initializing internal genomes')

        # so we don't initialize things over and over again:
        unique_profile_db_path_to_internal_genome_name = {}
        for profile_path in set([
                self.genomes[g]['profile_db_path']
                for g in self.internal_genome_names
        ]):
            unique_profile_db_path_to_internal_genome_name[profile_path] = [
                g for g in self.internal_genome_names
                if self.genomes[g]['profile_db_path'] == profile_path
            ]

        for profile_db_path in unique_profile_db_path_to_internal_genome_name:
            self.collections = ccollections.Collections()
            self.collections.populate_collections_dict(
                profile_db_path, anvio.__profile__version__)

            for genome_name in unique_profile_db_path_to_internal_genome_name[profile_db_path]:
                self.progress.update('working on %s' % (genome_name))
                c = self.genomes[genome_name]

                dbops.is_profile_db_and_contigs_db_compatible(
                    c['profile_db_path'], c['contigs_db_path'])

                # set name
                c['name'] = genome_name

                collection_dict = self.collections.get_collection_dict(
                    c['collection_id'])
                bins_info_dict = self.collections.get_bins_info_dict(
                    c['collection_id'])

                if c['bin_id'] not in bins_info_dict:
                    self.progress.end()
                    raise ConfigError("You betrayed us :( Genome %s does not appear to be a valid bin in collection %s in %s"
                                      % (c['bin_id'], c['collection_id'], c['profile_db_path']))

                split_names_of_interest = collection_dict[c['bin_id']]
                if not len(split_names_of_interest):
                    raise ConfigError("There are 0 splits defined for bin id %s in collection %s..." %
                                      (c['bin_id'], c['collection_id']))

                contigs_db_summary = summarizer.get_contigs_db_info_dict(
                    c['contigs_db_path'],
                    split_names=split_names_of_interest,
                    exclude_partial_gene_calls=self.exclude_partial_gene_calls)
                for key in contigs_db_summary:
                    c[key] = contigs_db_summary[key]

                # set hash
                c['genome_entry_hash'] = hashlib.sha224('_'.join([
                    split_names_of_interest[0], split_names_of_interest[-1],
                    c['contigs_db_hash']
                ]).encode('utf-8')).hexdigest()
                self.hash_to_genome_name[c['genome_entry_hash']] = genome_name

        self.progress.end()

        hashes = set([self.genomes[genome_name]['genome_entry_hash']
                      for genome_name in self.internal_genome_names])
        if len(hashes) != len(self.internal_genome_names):
            raise ConfigError("Not all hash values are unique across internal genomes. This is almost impossible "
                              "to happen unless something is very wrong with your workflow :/ Please let the "
                              "developers know if you can't figure this one out")

        # make sure genes are called in every contigs db:
        genomes_missing_gene_calls = [
            g for g in self.internal_genome_names
            if not self.genomes[g]['genes_are_called']
        ]
        if len(genomes_missing_gene_calls):
            raise ConfigError("Genes must have been called during the generation of the contigs database for this "
                              "workflow to work. However, these internal genomes do not have gene calls: %s" %
                              ', '.join(genomes_missing_gene_calls))

        self.run.info(
            'Internal genomes',
            '%d have been initialized.' % len(self.internal_genome_names))
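
The `genome_entry_hash` fingerprints each bin by its first split, its last split, and the parent contigs database hash, and the later sanity check requires every fingerprint to be unique. A standalone Python 3 sketch of both steps (`hashlib` needs bytes, hence the `.encode()`; the bin data is invented):

import hashlib

def genome_entry_hash(split_names, contigs_db_hash):
    # first split + last split + parent db hash together identify a bin:
    key = '_'.join([split_names[0], split_names[-1], contigs_db_hash])
    return hashlib.sha224(key.encode('utf-8')).hexdigest()

bins = {'bin_1': ['split_0001', 'split_0009'], 'bin_2': ['split_0010', 'split_0042']}
hashes = {name: genome_entry_hash(splits, 'db_hash_xyz') for name, splits in bins.items()}

# the uniqueness check from the example, reduced to its essence:
assert len(set(hashes.values())) == len(hashes), "hash collision across internal genomes"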
Example #6
    def __init__(self, args, external_clustering=None):
        self.args = args
        self.views = {}
        self.states_table = None
        self.p_meta = {}
        self.title = 'Unknown Project'

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.mode = A('mode')
        self.profile_db_path = A('profile_db')
        self.contigs_db_path = A('contigs_db')
        self.collection_name = A('collection_name')
        self.manual_mode = A('manual_mode')
        self.split_hmm_layers = A('split_hmm_layers')
        self.taxonomic_level = A('taxonomic_level')
        self.additional_layers_path = A('additional_layers')
        self.additional_view_path = A('additional_view')
        self.samples_information_db_path = A('samples_information_db')
        self.view = A('view')
        self.fasta_file = A('fasta_file')
        self.view_data_path = A('view_data')
        self.tree = A('tree')
        self.title = A('title')
        self.output_dir = A('output_dir')
        self.show_views = A('show_views')
        self.state_autoload = A('state_autoload')
        self.collection_autoload = A('collection_autoload')
        self.show_states = A('show_states')
        self.skip_check_names = A('skip_check_names')
        self.list_collections = A('list_collections')
        self.distance = A('distance') or constants.distance_metric_default
        self.linkage = A('linkage') or constants.linkage_method_default

        # make sure early on that both the distance and the linkage are OK.
        clustering.is_distance_and_linkage_compatible(self.distance,
                                                      self.linkage)

        self.split_names_ordered = None
        self.additional_layers = None
        self.auxiliary_profile_data_available = False

        self.samples_information_dict = {}
        self.samples_order_dict = {}
        self.samples_information_default_layer_order = {}

        # make sure the mode will be set properly
        if self.collection_name and self.manual_mode:
            raise ConfigError("You can't run anvi-interactive in manual mode with a collection name.")

        self.external_clustering = external_clustering

        self.collections = ccollections.Collections()

        ContigsSuperclass.__init__(self, self.args)
        self.init_splits_taxonomy(self.taxonomic_level)

        if self.samples_information_db_path:
            samples_information_db = SamplesInformationDatabase(
                self.samples_information_db_path)
            self.samples_information_dict, self.samples_order_dict = \
                samples_information_db.get_samples_information_and_order_dicts()
            self.samples_information_default_layer_order = \
                samples_information_db.get_samples_information_default_layer_order()
            samples_information_db.disconnect()

        if self.contigs_db_path:
            self.completeness = Completeness(self.contigs_db_path)
            self.collections.populate_collections_dict(
                self.contigs_db_path, anvio.__contigs__version__)
        else:
            self.completeness = None

        if 'skip_init_functions' in args and not args.skip_init_functions:
            self.init_functions()

        # make sure we are not dealing with apples and oranges here.
        if self.contigs_db_path and self.profile_db_path:
            is_profile_db_and_contigs_db_compatible(self.profile_db_path,
                                                    self.contigs_db_path)

        self.P = lambda x: os.path.join(self.p_meta['output_dir'], x)
        self.cwd = os.getcwd()

        # here is where the big deal stuff takes place:
        if not self.mode and self.manual_mode:
            self.mode = 'manual'
            self.run.info('Mode', self.mode, mc='red')
            self.load_manual_mode(args)
        elif self.mode == 'refine':
            self.load_full_mode(args)
        elif self.collection_name or self.list_collections:
            self.mode = 'collection'
            self.run.info('Mode', self.mode, mc='green')
            self.load_collection_mode(args)
        else:
            self.mode = 'full'
            self.load_full_mode(args)

        # make sure the samples information database, if there is one, is in fact compatible with the profile database
        # the reason we are doing this here is because when we are in 'self.manual_mode', the self.p_meta['samples'] is
        # being filled within the self.load_manual_mode function based on the headers of the view data.
        if self.profile_db_path and self.samples_information_db_path:
            is_profile_db_and_samples_db_compatible(
                self.profile_db_path,
                self.samples_information_db_path,
                manual_mode_exception=self.manual_mode)

        if self.external_clustering:
            self.p_meta['clusterings'] = self.clusterings = self.external_clustering['clusterings']
            self.p_meta['available_clusterings'] = self.clusterings.keys()
            self.p_meta['default_clustering'] = self.external_clustering['default_clustering']

        if not self.state_autoload and 'default' in self.states_table.states:
            self.state_autoload = 'default'

        if not self.collection_autoload and 'default' in self.collections.collections_dict:
            self.collection_autoload = 'default'

        if not self.p_meta['clusterings']:
            if self.p_meta['merged']:
                raise ConfigError("This merged profile database does not seem to have any hierarchical clustering "
                                  "of splits, which is required by the interactive interface. It may have been "
                                  "generated by anvi-merge with the `--skip-hierarchical-clustering` flag, or the "
                                  "hierarchical clustering step may have been skipped by anvi-merge because you "
                                  "had too many splits to get the clustering done in a reasonable amount of time. "
                                  "Please read the help menu for anvi-merge, and/or refer to the tutorial: "
                                  "http://merenlab.org/2015/05/01/anvio-tutorial/#clustering-during-merging")
            else:
                raise ConfigError("This single profile database does not seem to have any hierarchical clustering, "
                                  "which is required by the interactive interface. You must use the "
                                  "`--cluster-contigs` flag for single profiles to access this functionality. "
                                  "Please read the help menu for anvi-profile, and/or refer to the tutorial.")

        # self.split_names_ordered is going to be the 'master' names list. everything else is going to
        # need to match these names:
        self.split_names_ordered = utils.get_names_order_from_newick_tree(
            self.p_meta['clusterings'][
                self.p_meta['default_clustering']]['newick'])

        # now that we know what splits we are interested in (self.split_names_ordered), we can get rid
        # of all the unnecessary splits stored in the views dicts.
        self.prune_view_dicts()

        # if there are any HMM search results in the contigs database other than 'singlecopy' sources,
        # we would like to visualize them as additional layers. the following function is inherited from
        # the ContigsSuperclass and will fill self.hmm_searches_dict if appropriate data is found in the
        # search tables:
        if self.mode == 'full':
            self.init_non_singlecopy_gene_hmm_sources(
                self.split_names_ordered,
                return_each_gene_as_a_layer=self.split_hmm_layers)

        if self.additional_layers_path:
            filesnpaths.is_file_tab_delimited(self.additional_layers_path)
            self.additional_layers = self.additional_layers_path

        self.check_names_consistency()
        self.convert_view_data_into_json()
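
Buried in the middle of Example #6 is the mode dispatch: manual mode wins when no explicit mode is set, 'refine' is honored as given, a collection name (or a request to list collections) forces collection mode, and everything else falls back to full mode. Reduced to a standalone function for clarity (the function itself is illustrative, not part of the original code):

def resolve_mode(mode=None, manual_mode=False, collection_name=None, list_collections=False):
    """Illustrative reduction of the mode dispatch in Example #6."""
    if not mode and manual_mode:
        return 'manual'
    elif mode == 'refine':
        return 'refine'
    elif collection_name or list_collections:
        return 'collection'
    return 'full'

print(resolve_mode(manual_mode=True))           # manual
print(resolve_mode(collection_name='MY_BINS'))  # collection
print(resolve_mode())                           # full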
Example #7
    def cluster(self, input_files, args, work_dir, threads=1):
        J = lambda p: os.path.join(work_dir, p)

        cwd_backup = os.getcwd()
        os.chdir(work_dir)
        log_path = J('logs.txt')

        c = ccollections.Collections(r=run, p=progress)
        c.populate_collections_dict(input_files.profile_db)

        source_collections = set(
            map(str.strip, args.source_collections.split(',')))

        missing_collections = source_collections - set(
            c.collections_dict.keys())

        if len(missing_collections):
            raise ConfigError(
                "Some of the collections you wanted are missing in the database. "
                "Here is the list of missing collections: %s" %
                (", ".join(missing_collections)))

        c_names = []
        c_files = []

        for collection_name in source_collections:
            prefix = J(collection_name)

            c_names.append(collection_name)
            c_files.append(prefix + '.txt')

            c.export_collection(collection_name,
                                output_file_prefix=prefix,
                                include_unbinned=False)

        cmd_line = [
            self.program_name, '-c', input_files.splits_fasta, '-i',
            ','.join(c_files), '-l', ','.join(c_names), '-o',
            J('OUTPUT'), '--threads',
            str(threads), *utils.serialize_args(
                args, use_underscore=True, skip_keys=['source_collections'])
        ]

        self.progress.new(self.program_name)
        self.progress.update('Running using %d threads...' % threads)
        utils.run_command(cmd_line, log_path)
        self.progress.end()

        output_file_name = 'OUTPUT_DASTool_scaffolds2bin.txt'
        output_file_path = J(output_file_name)
        if not os.path.exists(output_file_path):
            raise ConfigError(
                "One of the critical output files is missing ('%s'). Please take a look at the "
                "log file: %s" % (output_file_name, log_path))

        clusters = {}
        with open(output_file_path, 'r') as f:
            lines = f.readlines()

            for entry in lines:
                contig, bin_name = map(str.strip, entry.split())

                pretty_bin_name = 'Bin_' + bin_name.replace('.', '_')

                if pretty_bin_name not in clusters:
                    clusters[pretty_bin_name] = []

                clusters[pretty_bin_name].append(contig)

        # restore cwd
        os.chdir(cwd_backup)

        return clusters
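
The final parsing loop converts DAS Tool's two-column scaffolds2bin output into a bin-name-to-contigs dictionary, sanitizing bin names along the way. A self-contained sketch of that parse, with inline sample lines standing in for the file contents:

# Sample of what OUTPUT_DASTool_scaffolds2bin.txt looks like: contig <tab> bin.
lines = ['contig_001\tmaxbin.001\n',
         'contig_002\tmaxbin.001\n',
         'contig_003\tconcoct.7\n']

clusters = {}
for entry in lines:
    contig, bin_name = map(str.strip, entry.split())

    # '.' in bin names is unfriendly downstream, so it is replaced:
    pretty_bin_name = 'Bin_' + bin_name.replace('.', '_')

    clusters.setdefault(pretty_bin_name, []).append(contig)

# {'Bin_maxbin_001': ['contig_001', 'contig_002'], 'Bin_concoct_7': ['contig_003']}
print(clusters)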