Beispiel #1
0
    def init(self):
        """Collect the bins and split names to work on and load the stored collections.

        Sets `self.bins`, `self.ids_for_already_refined_bins`, `self.input_directory`
        and `self.collections`, adds every split name found in the bins to
        `self.split_names_of_interest` (which must already exist and support
        `.update()`), and reports a short summary through `self.run`.
        """
        self.progress.new('Initializing')

        self.progress.update('Getting split names')
        d = ccollections.GetSplitNamesInBins(self.args).get_dict()
        # materialize the keys: on Python 3 `d.keys()` is only a live view tied to `d`,
        # whereas this attribute is kept around (and aliased below) after `d` is gone.
        self.bins = list(d.keys())
        for split_names in d.values():
            self.split_names_of_interest.update(split_names)
        self.progress.end()

        # if the user updates the refinement of a single bin or bins, there shouldn't be multiple copies
        # of that stored in the database. so every time the 'store_refined_bins' function is called,
        # it will check this variable and, (1) if empty, continue updating stuff in db and store updates
        # in it, (2) if not empty, remove items stored in this variable from collections dict, and continue
        # with step (1). the starting point is of course self.bins. when the store_refined_bins function is
        # called the first time, it will read collection data for collection_name, and remove the bin(s) in
        # analysis from it before it stores the data.
        # NOTE(review): this is the same list object as self.bins, not a copy.
        self.ids_for_already_refined_bins = self.bins

        self.input_directory = os.path.dirname(os.path.abspath(self.profile_db_path))

        self.run.info('Input directory', self.input_directory)
        self.run.info('Collection ID', self.collection_name)
        self.run.info('Number of bins', len(self.bins))
        self.run.info('Number of splits', len(self.split_names_of_interest))

        self.collections = ccollections.Collections()
        self.collections.populate_collections_dict(self.profile_db_path)
Beispiel #2
0
    def init(self):
        """Run the sanity check, then collect the bins and split names to work on.

        Sets `self.bins` and adds every split name found in the bins to
        `self.split_names_of_interest` (which must already exist and support
        `.update()`). A short summary is reported through `self.run`.
        """
        self.sanity_check()

        self.run.info('Input BAM file(s)', ', '.join([os.path.basename(f) for f in self.input_bam_files]))

        d = ccollections.GetSplitNamesInBins(self.args).get_dict()
        # keep a real list rather than a dict view: on Python 3 `d.keys()` stays tied to
        # `d` and is not a list, while this attribute outlives the local dictionary.
        self.bins = list(d.keys())

        for split_names in d.values():
            self.split_names_of_interest.update(split_names)

        self.run.info('Collection ID', self.collection_id)
        self.run.info('Bin(s)', ', '.join(self.bins))
        self.run.info('Number of splits', pp(len(self.split_names_of_interest)))
Beispiel #3
0
    def get_split_names_of_interest_for_internal_genome(self, entry):
        """Return the split names of the bin described by `entry` as a list.

        `entry` is read through the keys 'profile_db_path', 'collection_id' and
        'bin_id'. The profile database path is checked with `utils.is_profile_db`
        before anything else happens.

        Raises:
            ConfigError: when no split names come back for the bin.
        """
        profile_db_path = entry['profile_db_path']
        utils.is_profile_db(profile_db_path)

        # GetSplitNamesInBins is handed an args-like object, so put a bare one together here
        class Args:
            pass

        bin_args = Args()
        bin_args.profile_db = profile_db_path
        bin_args.collection_name = entry['collection_id']
        bin_args.bin_id = entry['bin_id']

        splits_in_bin = ccollections.GetSplitNamesInBins(bin_args).get_split_names_only()
        split_names_of_interest = list(splits_in_bin)

        if len(split_names_of_interest) == 0:
            raise ConfigError("There are 0 splits defined for bin id %s in collection %s..." % (entry['bin_id'], entry['collection_id']))

        return split_names_of_interest
Beispiel #4
0
    def init(self):
        """Check the contigs database, gather bins and split names, and mark the object initialized.

        Sets `self.bins`, adds every split name found in the bins to
        `self.split_names_of_interest`, reports a summary through `self.run`
        and finally sets `self.initialized` to True.
        """
        utils.is_contigs_db(self.contigs_db_path)

        bam_file_names = ', '.join(os.path.basename(path) for path in self.input_bam_files)
        self.run.info('Input BAM file(s)', bam_file_names)

        splits_by_bin = ccollections.GetSplitNamesInBins(self.args).get_dict()
        self.bins = list(splits_by_bin.keys())

        # pool the split names of all bins together
        for splits_in_bin in splits_by_bin.values():
            self.split_names_of_interest.update(splits_in_bin)

        num_splits = len(self.split_names_of_interest)

        self.run.info('Collection ID', self.collection_name)
        self.run.info('Bin(s)', ', '.join(self.bins))
        self.run.info('Number of splits', pp(num_splits))

        self.initialized = True
Beispiel #5
0
    def init_commons(self):
        """Validate the run configuration and load the variability data for `self.engine`.

        In order, the method checks the output file path, reads the optional
        samples-of-interest file into `self.samples_of_interest`, makes sure a profile
        and a contigs database were given and are compatible, opens the auxiliary data
        file when quince mode is on, resolves `self.splits_of_interest` (from a
        collection/bin pair, from an already filled attribute, or from a file), sets
        `self.input_file_path` and `self.sample_ids`, and finally reads the 'NT' or 'AA'
        variability table of the profile database into `self.data`.

        Raises:
            ConfigError: when a database path is missing, when the options given are
                conflicting or insufficient, when the auxiliary data file required by
                quince mode is not found, when the profile database lacks the requested
                profiling data, or when `self.engine` is neither 'NT' nor 'AA'.
        """
        self.progress.new('Init')

        self.progress.update('Checking the output file path ..')
        if self.output_file_path:
            filesnpaths.is_output_file_writable(self.output_file_path)

        self.progress.update('Checking the samples of interest ..')
        if self.samples_of_interest_path:
            filesnpaths.is_file_exists(self.samples_of_interest_path)
            # read through a context manager so the file handle is closed right away
            with open(self.samples_of_interest_path) as samples_file:
                self.samples_of_interest = set(s.strip() for s in samples_file)
        else:
            self.samples_of_interest = set([])

        self.progress.update('Making sure our databases are here ..')
        if not self.profile_db_path:
            self.progress.end()
            raise ConfigError('You need to provide a profile database.')

        if not self.contigs_db_path:
            self.progress.end()
            raise ConfigError('You need to provide a contigs database.')

        self.progress.update('Making sure our databases are compatible ..')
        dbops.is_profile_db_and_contigs_db_compatible(self.profile_db_path,
                                                      self.contigs_db_path)

        if self.min_coverage_in_each_sample and not self.quince_mode:
            self.progress.end()
            raise ConfigError("When you sepecify a coverage value through --min-coverage-in-each-sample, you must also\
                                use --quince-mode flag, since the former parameter needs to know the coverage values in all\
                                samples even if variation is reported for only one sample among otheres. This is the only way\
                                to figure out whether variation is not reported for other samples due to low or zero coverage,\
                                or there was no variation to report despite the high coverage. Anvi'o could turn --quince-mode\
                                flat automatically for you, but then it is much better if you have full control and understaning\
                                of what is going on.")

        if self.quince_mode:
            self.progress.update('Accessing auxiliary data file ...')
            auxiliary_data_file_path = os.path.join(
                os.path.dirname(self.profile_db_path), 'AUXILIARY-DATA.h5')
            if not os.path.exists(auxiliary_data_file_path):
                self.progress.end()
                raise ConfigError("Anvi'o needs the auxiliary data file to run this program with '--quince-mode' flag.\
                                    However it wasn't found at '%s' :/" % auxiliary_data_file_path)
            self.merged_split_coverage_values = auxiliarydataops.AuxiliaryDataForSplitCoverages(
                auxiliary_data_file_path, None, ignore_hash=True)

        self.progress.update(
            'Attempting to get our splits of interest sorted ..')
        if self.collection_name:
            # the user wants to go with the collection id path. fine. we will get our split names from
            # the profile database.
            if not self.bin_id:
                self.progress.end()
                raise ConfigError('When you declare a collection id, you must also declare a bin name\
                                    (from which the split names of interest will be acquired)')

            if self.splits_of_interest or self.splits_of_interest_path:
                self.progress.end()
                raise ConfigError("You declared a collection id and one or more bin names so anvi'o can find out\
                                    splits of interest, but you also have specified informaiton for split names?\
                                    This is confusing. You should choose one way or another :/")

            self.splits_of_interest = ccollections.GetSplitNamesInBins(
                self.args).get_split_names_only()
        else:
            # OK. no collection id. we will go oldschool. we hope to find what we are looking for in
            # self.splits_of_interest_path at this point (which may have been filled through the command
            # line client), or in self.splits_of_interest (which may have been filled in by another program)
            if not self.splits_of_interest:
                if not self.splits_of_interest_path:
                    self.progress.end()
                    raise ConfigError('You did not declare a source for split names. You either should give me\
                                        a file with split names you are interested in, or a collection id and\
                                        bin name so I can learn split names from the profile database.')

                filesnpaths.is_file_exists(self.splits_of_interest_path)
                # same as above: do not leave the file handle open
                with open(self.splits_of_interest_path) as splits_file:
                    self.splits_of_interest = set(
                        c.strip().replace('\r', '') for c in splits_file)

        # directory that holds the profile database. the previous hand-rolled
        # split/join on '/' produced a doubled leading slash ('//a/b').
        self.input_file_path = os.path.dirname(
            os.path.abspath(self.profile_db_path))

        self.progress.update('Reading the data ...')
        profile_db = dbops.ProfileDatabase(self.profile_db_path)
        try:
            self.sample_ids = profile_db.samples  # we set this now, but we will overwrite it with args.samples_of_interest if necessary

            if not profile_db.meta['SNVs_profiled']:
                self.progress.end()
                raise ConfigError("Well well well. It seems SNVs were not characterized for this profile database.\
                                Sorry, there is nothing to report here!")

            if self.engine == 'NT':
                self.data = profile_db.db.get_table_as_dict(
                    t.variable_nts_table_name)
            elif self.engine == 'AA':
                # AA specific stuff. first check whether things were profiled
                if not profile_db.meta['AA_frequencies_profiled']:
                    self.progress.end()
                    raise ConfigError("It seems AA frequencies were not characterized for this profile database.\
                                    There is nothing to report here for AAs!")

                # get the data.
                self.data = profile_db.db.get_table_as_dict(
                    t.variable_aas_table_name)

                # append split_name information
                for e in self.data.values():
                    e['split_name'] = self.gene_callers_id_to_split_name_dict[
                        e['corresponding_gene_call']]
            else:
                self.progress.end()
                raise ConfigError("VariabilitySuper :: Anvi'o doesn't know what to do with a engine on '%s' yet :/" % self.engine)
        finally:
            # release the database on the error paths too, not only on success
            profile_db.disconnect()

        self.progress.end()
Beispiel #6
0
    def __init__(self, args, hmm_sources, run=terminal.Run(), progress=terminal.Progress()):
        """Merge the HMM hits of every described genome into this single object.

        The genome descriptions are loaded first, then the SequencesForHMMHits part of
        the object is initialized without a contigs database. After that, each genome
        gets its own SequencesForHMMHits instance whose hits, split entries, sequences
        and gene entries are copied into this object under keys prefixed with the
        contigs database hash of that genome. For genomes that carry a 'collection_id',
        only the split names of the corresponding bin are considered.

        Raises:
            ConfigError: when no HMM source is common to all genomes, or when a genome
                names a collection but lacks 'bin_id' or 'profile_db_path'.
        """
        self.args = args
        self.run = run
        self.progress = progress
        self.hmm_sources = hmm_sources

        self.splits_dict = {}

        # genome descriptions go first
        GenomeDescriptions.__init__(self, args, run=self.run, progress=self.progress)
        self.load_genomes_descriptions(skip_functions=True, init=False)

        hmm_sources_in_all_genomes = self.get_HMM_sources_common_to_all_genomes(sources_that_must_be_common=hmm_sources)

        if len(hmm_sources_in_all_genomes) == 0:
            raise ConfigError("There are no HMM sources among your external genomes that occur in every genome :/")

        # then the super, with no contigs database of its own
        SequencesForHMMHits.__init__(self, None, sources=hmm_sources, run=self.run, progress=self.progress)

        internal_genome_hashes = {genome['genome_hash'] for genome in self.genomes.values() if 'profile_db_path' in genome}
        num_internal_genomes = len(internal_genome_hashes)
        collection_names = {genome['collection_id'] for genome in self.genomes.values() if 'collection_id' in genome}

        if num_internal_genomes:
            warning_text = ("SequencesForHMMHitsWrapperForMultipleContigs class is speaking (yes, the class is "
                            "quite aware of its very long name thankyouverymuch). Of the total %d genome descriptions "
                            "it was given, %d seem to represent internal genomes with bins in collection(s) '%s'. Anvi'o "
                            "will make sure HMM hits to be used for downstream analyses are only those that match to contigs "
                            "that were included in those selections.") % (len(self.genomes),
                                                                          num_internal_genomes,
                                                                          ', '.join(collection_names))
            self.run.warning(warning_text, lc="green")

        def prefixed(db_hash, name):
            # '<contigs db hash>_<name>'
            return '%s_%s' % (db_hash, name)

        def prefixed_id(db_hash, number):
            # '<contigs db hash>_<integer id>'
            return '%s_%d' % (db_hash, number)

        # hacky part: this object plays the role of one big SequencesForHMMHits instance, and
        # everything below fills it with slightly renamed entries so that several contigs
        # databases can live side by side in it.
        next_split_entry_id = 0
        for genome_name, genome in self.genomes.items():
            contigs_db_path = genome['contigs_db_path']
            contigs_db_hash = genome['contigs_db_hash']

            # `genome_hash` is what lets downstream code tell where a hit came from, for bins
            # and for whole contigs databases alike. see `get_genome_hash_for_external_genome`
            # and `get_genome_hash_for_internal_genome` in `genomedescriptions.py`.
            if 'collection_id' not in genome:
                # whole contigs database: every hit counts
                current = SequencesForHMMHits(contigs_db_path, sources=hmm_sources)
                genome_hash = contigs_db_hash
            else:
                # a collection is involved, so only hits in the splits of that bin are relevant
                if not ('bin_id' in genome and 'profile_db_path' in genome):
                    raise ConfigError("There is something VERY weird going on. Your genome descriptions object contains "
                                      "a collection name, yet it doesn't know anything about a bin name or profile database "
                                      "path. While this is very interesting because it should never happen, anvi'o will say "
                                      "goodbye and abruptly quit in confusion :(")

                # args object to recover the split names of interest
                bin_args = argparse.Namespace(profile_db=genome['profile_db_path'],
                                              contigs_db=genome['contigs_db_path'],
                                              bin_id=genome['bin_id'],
                                              collection_name=genome['collection_id'])
                split_names_of_interest = ccollections.GetSplitNamesInBins(bin_args).get_split_names_only()

                # NOTE(review): the hash depends on the iteration order of the split names -- confirm
                # it agrees with the genome hash functions mentioned above.
                hash_input = '_'.join([''.join(split_names_of_interest), contigs_db_hash])
                genome_hash = hashlib.sha224(hash_input.encode('utf-8')).hexdigest()[0:12]

                current = SequencesForHMMHits(contigs_db_path,
                                              sources=hmm_sources,
                                              split_names_of_interest=split_names_of_interest)

            for hmm_hit_id, hmm_hit in current.hmm_hits.items():
                hmm_hit['gene_callers_id'] = prefixed_id(contigs_db_hash, hmm_hit['gene_callers_id'])
                hmm_hit['genome_hash'] = genome_hash
                self.hmm_hits[prefixed_id(contigs_db_hash, hmm_hit_id)] = hmm_hit

            # source info is taken over only once, from the first genome processed
            if not self.hmm_hits_info:
                for hmm_source in hmm_sources_in_all_genomes:
                    self.hmm_hits_info[hmm_source] = current.hmm_hits_info[hmm_source]

            for split_hit in current.hmm_hits_splits.values():
                split_hit['split'] = prefixed(contigs_db_hash, split_hit['split'])
                split_hit['hmm_hit_entry_id'] = prefixed_id(contigs_db_hash, split_hit['hmm_hit_entry_id'])
                self.hmm_hits_splits[next_split_entry_id] = split_hit
                next_split_entry_id += 1

            for contig_name in current.contig_sequences:
                self.contig_sequences[prefixed(contigs_db_hash, contig_name)] = current.contig_sequences[contig_name]

            for sequence_id in current.aa_sequences:
                self.aa_sequences[prefixed(contigs_db_hash, sequence_id)] = current.aa_sequences[sequence_id]

            for gene_callers_id, gene_entry in current.genes_in_contigs.items():
                gene_entry['contig'] = prefixed(contigs_db_hash, gene_entry['contig'])
                self.genes_in_contigs[prefixed_id(contigs_db_hash, gene_callers_id)] = gene_entry

            self.splits_dict[genome_name] = [prefixed(contigs_db_hash, split_name)
                                             for split_name in current.splits_in_contigs]