Exemple #1
0
    def __init__(self, args, external_clustering=None):
        """Set up everything needed before view data is handed to the interface.

        The steps, in order: read settings from `args`, initialize the contigs
        superclass, optionally read the samples information database, load
        data through `load_from_user_files` (manual mode) or
        `load_from_anvio_files`, settle on the clusterings to use, derive the
        master list of split names from the default clustering tree, and
        finally call `check_names_consistency` and
        `convert_view_data_into_json`.

        Parameters
        ----------
        args : object exposing `__dict__` (and supporting `in` lookups)
            Settings are looked up by name in `args.__dict__`; a setting that
            is not there is treated as `None`.
        external_clustering : dict, optional
            When given, its 'clusterings' and 'default_clustering' entries
            replace the corresponding entries of `self.p_meta`.

        Raises
        ------
        ConfigError
            If `self.p_meta['clusterings']` is empty after loading.
        """
        self.args = args
        self.views = {}
        self.states_table = None
        self.p_meta = {}
        self.title = 'Unknown Project'

        # look settings up in the namespace dict so that a missing setting
        # becomes None instead of raising.
        A = lambda x: args.__dict__.get(x)
        self.profile_db_path = A('profile_db')
        self.contigs_db_path = A('contigs_db')
        self.manual_mode = A('manual_mode')
        self.split_hmm_layers = A('split_hmm_layers')
        self.additional_layers_path = A('additional_layers')
        self.additional_view_path = A('additional_view')
        self.samples_information_db_path = A('samples_information_db')
        self.view = A('view')
        self.fasta_file = A('fasta_file')
        self.view_data_path = A('view_data')
        self.tree = A('tree')
        # NOTE(review): this replaces the 'Unknown Project' default assigned
        # above, with None when no title was given -- confirm that a later
        # step fills the title in.
        self.title = A('title')
        self.output_dir = A('output_dir')
        self.show_views = A('show_views')
        self.state = A('state')
        self.show_states = A('show_states')
        self.skip_check_names = A('skip_check_names')

        self.split_names_ordered = None
        self.additional_layers = None
        self.auxiliary_profile_data_available = False

        self.samples_information_dict = {}
        self.samples_order_dict = {}
        self.samples_information_default_layer_order = {}

        self.external_clustering = external_clustering

        self.collections = ccollections.Collections()

        ContigsSuperclass.__init__(self, self.args)

        if self.samples_information_db_path:
            samples_information_db = SamplesInformationDatabase(
                self.samples_information_db_path)
            self.samples_information_dict, self.samples_order_dict = \
                samples_information_db.get_samples_information_and_order_dicts()
            self.samples_information_default_layer_order = \
                samples_information_db.get_samples_information_default_layer_order()
            samples_information_db.disconnect()

        if self.contigs_db_path:
            self.completeness = completeness.Completeness(self.contigs_db_path)
            self.collections.populate_sources_dict(self.contigs_db_path,
                                                   anvio.__contigs__version__)
        else:
            self.completeness = None

        # functions are initialized only when the setting is present in args
        # AND it is falsy; when the setting is absent nothing is initialized.
        if 'skip_init_functions' in args and not args.skip_init_functions:
            self.init_functions()

        # make sure we are not dealing with apples and oranges here.
        if self.contigs_db_path and self.profile_db_path:
            is_profile_db_and_contigs_db_compatible(self.profile_db_path,
                                                    self.contigs_db_path)

        # self.P resolves a name relative to p_meta['output_dir']; the lookup
        # happens at call time, so it is fine that p_meta is still empty here.
        self.P = lambda x: os.path.join(self.p_meta['output_dir'], x)
        self.cwd = os.getcwd()

        # here is where the big deal stuff takes place:
        if self.manual_mode:
            self.load_from_user_files(args)
        else:
            self.load_from_anvio_files(args)

        # make sure the samples information database, if there is one, is in fact compatible with the profile database
        # the reason we are doing this here is because when we are in 'self.manual_mode', the self.p_meta['samples'] is
        # being filled within the self.load_from_user_files function based on the headers of the view data.
        if self.profile_db_path and self.samples_information_db_path:
            is_profile_db_and_samples_db_compatible(
                self.profile_db_path, self.samples_information_db_path)

        # clusterings handed in by the caller take precedence over whatever
        # the load step put into p_meta.
        if self.external_clustering:
            self.clusterings = self.external_clustering['clusterings']
            self.p_meta['clusterings'] = self.clusterings
            self.p_meta['available_clusterings'] = list(self.clusterings.keys())
            self.p_meta['default_clustering'] = self.external_clustering[
                'default_clustering']

        # NOTE(review): self.states_table starts out as None in this method;
        # this line assumes one of the load functions above replaced it.
        if not self.state and 'default' in self.states_table.states:
            self.state = 'default'

        if not self.p_meta['clusterings']:
            if self.p_meta['merged']:
                raise ConfigError("This merged profile database does not seem to have any hierarchical clustering\
                                    that is required by the interactive interface. It may have been generated\
                                    by anvi-merge with `--skip-hierarchical-clustering` flag, or hierarchical\
                                    clustering step may have been skipped automatically by the platform. Please\
                                    read the help menu for anvi-merge, and/or refer to the tutorial: \
                                    http://merenlab.org/2015/05/01/anvio-tutorial/#clustering-during-merging")

            else:
                raise ConfigError("This single profile database does not seem to have any hierarchical clustering\
                                    that is required by the interactive interface. You must use `--cluster-contigs`\
                                    flag for single profiles to access to this functionality. Please read the help\
                                    menu for anvi-profile, and/or refer to the tutorial.")

        default_clustering = self.p_meta['default_clustering']
        tree = Tree(self.p_meta['clusterings'][default_clustering]['newick'],
                    format=1)

        # self.split_names_ordered is going to be the 'master' names list. everything else is going to
        # need to match these names:
        self.split_names_ordered = [n.name for n in tree.get_leaves()]

        # now we know what splits we are interested in (self.split_names_ordered), we can get rid of all the
        # unnecessary splits stored in views dicts.
        self.prune_view_dicts()

        # if there are any HMM search results in the contigs database other than 'singlecopy' sources,
        # we would like to visualize them as additional layers. following function is inherited from
        # Contigs DB superclass and will fill self.hmm_searches_dict if appropriate data is found in
        # search tables:
        self.init_non_singlecopy_gene_hmm_sources(
            self.split_names_ordered,
            return_each_gene_as_a_layer=self.split_hmm_layers)

        if self.additional_layers_path:
            filesnpaths.is_file_tab_delimited(self.additional_layers_path)
            self.additional_layers = self.additional_layers_path

        self.check_names_consistency()
        self.convert_view_data_into_json()
Exemple #2
0
def get_contigs_db_info_dict(contigs_db_path,
                             run=run,
                             progress=progress,
                             include_AA_counts=False,
                             split_names=None):
    """Returns an info dict for a given contigs db

    The dict starts with 'path' and every key found in the `a_meta` of the
    contigs database, and then gets 'gc_content', 'num_genes',
    'gene_caller_ids', 'avg_gene_length' and 'num_genes_per_kb'. When
    `split_names` is given, numbers are computed for those splits only, and
    'total_length' and 'num_splits' are set from them. 'percent_complete' and
    'percent_redundancy' are added only if the completeness results contain
    'Campbell_et_al'; 'AA_counts' and 'total_AAs' only if `include_AA_counts`
    is true.
    """
    class Args:
        def __init__(self):
            self.contigs_db = contigs_db_path

    args = Args()

    # NOTE(review): this silences the very objects that were passed in (by
    # default the module-level `run` and `progress`), and the previous
    # verbosity is never restored -- confirm callers are fine with that.
    run.verbose = False
    progress.verbose = False
    c = ContigsSuperclass(args, r=run, p=progress)

    info_dict = {'path': contigs_db_path}

    for key in c.a_meta:
        info_dict[key] = c.a_meta[key]

    # a truthy input that turns out to be empty once it is a set (e.g. an
    # exhausted iterator) falls back to the whole database, just like None.
    if split_names:
        split_names = set(split_names)

    if split_names:
        c.init_split_sequences()
        seq = ''.join(
            [c.split_sequences[split_name] for split_name in split_names])
        info_dict['total_length'] = len(seq)
        info_dict['gc_content'] = sequence.Composition(seq).GC_content
        info_dict['gene_caller_ids'] = {
            e['gene_callers_id'] for e in c.genes_in_splits.values()
            if e['split'] in split_names
        }
        info_dict['num_genes'] = len(info_dict['gene_caller_ids'])
        # NOTE(review): partial gene calls are included in this average, while
        # the whole-database branch below leaves them out.
        info_dict['avg_gene_length'] = numpy.mean([
            (c.genes_in_contigs_dict[gene_caller_id]['stop'] -
             c.genes_in_contigs_dict[gene_caller_id]['start'])
            for gene_caller_id in info_dict['gene_caller_ids']
        ])
        info_dict['num_genes_per_kb'] = info_dict[
            'num_genes'] * 1000.0 / info_dict['total_length']
        info_dict['num_splits'] = len(split_names)
    else:
        c.init_contig_sequences()
        seq = ''.join([e['sequence'] for e in c.contig_sequences.values()])
        info_dict['gc_content'] = sequence.Composition(seq).GC_content
        info_dict['num_genes'] = len(c.genes_in_contigs_dict)
        info_dict['gene_caller_ids'] = set(c.genes_in_contigs_dict.keys())
        info_dict['avg_gene_length'] = numpy.mean([
            (gene['stop'] - gene['start'])
            for gene in c.genes_in_contigs_dict.values() if not gene['partial']
        ])
        # NOTE(review): 'total_length' is not set in this branch; the value
        # used here must have been copied from c.a_meta above (and must be a
        # number for the division to work) -- confirm.
        info_dict['num_genes_per_kb'] = info_dict[
            'num_genes'] * 1000.0 / info_dict['total_length']

    # get completeness / contamination estimates, either for the splits of
    # interest or for every split in the database
    completeness_estimator = completeness.Completeness(contigs_db_path)
    comp = completeness_estimator.get_info_for_splits(
        split_names if split_names else set(c.splits_basic_info.keys()))

    if 'Campbell_et_al' in comp:
        info_dict['percent_complete'] = comp['Campbell_et_al'][
            'percent_complete']
        info_dict['percent_redundancy'] = comp['Campbell_et_al'][
            'percent_redundancy']

    # lets get all amino acids used in all complete gene calls:
    if include_AA_counts:
        if split_names:
            AA_counts_dict = c.get_AA_counts_dict(split_names=split_names)
        else:
            AA_counts_dict = c.get_AA_counts_dict()

        info_dict['AA_counts'] = AA_counts_dict['AA_counts']
        info_dict['total_AAs'] = AA_counts_dict['total_AAs']

    return info_dict
Exemple #3
0
    def process(self):
        """Summarize the collection `self.collection_name` and build the HTML output.

        Fills `self.summary` with meta information and one summary per bin
        (created through `Bin.create`). Unless `self.quick` is set, it also
        writes TAB-delimited matrices to the file objects returned by
        `self.get_output_file_handle`. At the end, the result of
        `SummaryHTMLOutput(...).generate(...)` is stored in `self.index_html`.
        """
        # learn who you are:
        collection_dict = self.collections.get_collection_dict(
            self.collection_name)
        bins_info_dict = self.collections.get_bins_info_dict(
            self.collection_name)

        # init profile data for collection.
        self.init_collection_profile(collection_dict)

        # load completeness information if available
        self.completeness = completeness.Completeness(self.contigs_db_path)
        if len(self.completeness.sources):
            self.completeness_data_available = True

        # load HMM sources for non-single-copy genes if available
        if self.non_singlecopy_gene_hmm_sources and not self.quick:
            self.init_non_singlecopy_gene_hmm_sources()
            self.non_single_copy_gene_hmm_data_available = True

        # load gene functions from contigs db superclass
        self.init_functions()

        # set up the initial summary dictionary. the counters and percentages
        # that start at zero are updated once the bins are summarized below.
        self.summary['meta'] = {
            'quick': self.quick,
            'output_directory': self.output_directory,
            'collection': list(collection_dict.keys()),
            'num_bins': len(collection_dict.keys()),
            'collection_name': self.collection_name,
            'total_nts_in_collection': 0,
            'num_contigs_in_collection': 0,
            'anvio_version': __version__,
            'profile': self.p_meta,
            'contigs': self.a_meta,
            'gene_coverages_data_available':
                self.gene_coverages_data_available,
            'completeness_data_available':
                self.completeness_data_available,
            'non_single_copy_gene_hmm_data_available':
                self.non_single_copy_gene_hmm_data_available,
            'percent_contigs_nts_described_by_collection': 0.0,
            'percent_profile_nts_described_by_collection': 0.0,
            'percent_contigs_nts_described_by_profile':
                P(self.p_meta['total_length'], self.a_meta['total_length']),
            'percent_contigs_contigs_described_by_profile':
                P(self.p_meta['num_contigs'], self.a_meta['num_contigs']),
            'percent_contigs_splits_described_by_profile':
                P(self.p_meta['num_splits'], self.a_meta['num_splits']),
        }

        # I am not sure whether this is the best place to do this,
        self.summary['basics_pretty'] = {
            'profile': [
                ('Created on', self.p_meta['creation_date']),
                ('Version', self.p_meta['version']),
                ('Minimum contig length',
                 pretty(self.p_meta['min_contig_length'])),
                ('Number of contigs', pretty(int(self.p_meta['num_contigs']))),
                ('Number of splits', pretty(int(self.p_meta['num_splits']))),
                ('Total nucleotides',
                 humanize_n(int(self.p_meta['total_length']))),
            ],
            'contigs': [
                # NOTE(review): this is the creation date of the profile
                # (p_meta), shown under contigs -- confirm it is intended.
                ('Created on', self.p_meta['creation_date']),
                ('Version', self.a_meta['version']),
                ('Split length', pretty(int(self.a_meta['split_length']))),
                ('Number of contigs', pretty(int(self.a_meta['num_contigs']))),
                ('Number of splits', pretty(int(self.a_meta['num_splits']))),
                ('Total nucleotides',
                 humanize_n(int(self.a_meta['total_length']))),
                ('K-mer size', self.a_meta['kmer_size']),
            ],
        }

        self.summary['max_shown_header_items'] = 10
        self.summary['slice_header_items_tmpl'] = '0:%d' % self.summary[
            'max_shown_header_items']
        self.summary['num_not_shown_samples'] = len(
            self.p_meta['samples']) - self.summary['max_shown_header_items']
        self.summary['num_not_shown_hmm_items'] = {
            hmm_search_source:
                len(self.hmm_sources_info[hmm_search_source]['genes']) -
                self.summary['max_shown_header_items']
            for _, hmm_search_source in self.hmm_searches_header
        }

        self.summary['files'] = {}
        self.summary['collection'] = {}
        # reminder; collection_profile comes from ProfileSuperclass!
        self.summary['collection_profile'] = self.collection_profile
        # the keys of the first bin's profile are taken to stand for all bins
        self.summary['collection_profile_items'] = list(
            list(self.collection_profile.values())[0].keys())

        # add hmm items for each search type:
        if self.non_single_copy_gene_hmm_data_available:
            self.summary['meta']['hmm_items'] = {
                hmm_search_source:
                    self.hmm_sources_info[hmm_search_source]['genes']
                for _, hmm_search_source in self.hmm_searches_header
            }

        # summarize bins:
        for bin_id in collection_dict:
            bin_obj = Bin(self, bin_id, collection_dict[bin_id], self.run,
                          self.progress)
            bin_obj.output_directory = os.path.join(self.output_directory,
                                                    'bin_by_bin', bin_id)
            bin_obj.bin_profile = self.collection_profile[bin_id]

            bin_summary = bin_obj.create()
            self.summary['collection'][bin_id] = bin_summary
            bin_summary['color'] = bins_info_dict[bin_id][
                'html_color'] or '#212121'
            bin_summary['source'] = bins_info_dict[bin_id][
                'source'] or 'unknown_source'
            self.summary['meta']['total_nts_in_collection'] += bin_summary[
                'total_length']
            self.summary['meta']['num_contigs_in_collection'] += bin_summary[
                'num_contigs']

        # bins are computed, add some relevant meta info:
        self.summary['meta'][
            'percent_contigs_nts_described_by_collection'] = '%.2f' % (
                self.summary['meta']['total_nts_in_collection'] * 100.0 /
                int(self.a_meta['total_length']))
        self.summary['meta'][
            'percent_profile_nts_described_by_collection'] = '%.2f' % (
                self.summary['meta']['total_nts_in_collection'] * 100.0 /
                int(self.p_meta['total_length']))
        self.summary['meta'][
            'bins'] = self.get_bins_ordered_by_completeness_and_size()

        if not self.quick:
            # generate a TAB-delimited text output file for bin summaries
            properties = [
                'taxon', 'total_length', 'num_contigs', 'N50', 'GC_content',
                'percent_complete', 'percent_redundancy'
            ]

            summary_of_bins_matrix_output = {
                bin_name: {
                    prop: self.summary['collection'][bin_name][prop]
                    for prop in properties
                }
                for bin_name in self.summary['collection']
            }

            output_file_obj = self.get_output_file_handle(
                prefix='general_bins_summary.txt')
            utils.store_dict_as_TAB_delimited_file(
                summary_of_bins_matrix_output,
                None,
                headers=['bins'] + properties,
                file_obj=output_file_obj)

            # save merged matrices for bins x samples
            for table_name in list(self.collection_profile.values())[0].keys():
                d = {
                    bin_id: self.collection_profile[bin_id][table_name]
                    for bin_id in self.collection_profile
                }

                output_file_obj = self.get_output_file_handle(
                    sub_directory='bins_across_samples',
                    prefix='%s.txt' % table_name)
                utils.store_dict_as_TAB_delimited_file(
                    d,
                    None,
                    headers=['bins'] + sorted(self.p_meta['samples']),
                    file_obj=output_file_obj)

            # merge and store matrices for hmm hits
            if self.non_single_copy_gene_hmm_data_available:
                for hmm_search_source in self.summary['meta']['hmm_items']:
                    # this is to keep numbers per hmm item:
                    d = {
                        bin_id: self.summary['collection'][bin_id]['hmms'][
                            hmm_search_source]
                        for bin_id in self.summary['meta']['bins']
                    }

                    output_file_obj = self.get_output_file_handle(
                        sub_directory='bins_across_samples',
                        prefix='%s.txt' % hmm_search_source,
                        within='hmms')
                    utils.store_dict_as_TAB_delimited_file(
                        d,
                        None,
                        headers=['bins'] +
                        sorted(self.summary['meta']['hmm_items']
                               [hmm_search_source]),
                        file_obj=output_file_obj)

                # this is to keep number of hmm hits per bin:
                n = {bin_id: {} for bin_id in self.summary['meta']['bins']}
                for hmm_search_source in self.summary['meta']['hmm_items']:
                    for bin_id in self.summary['meta']['bins']:
                        n[bin_id][hmm_search_source] = sum(
                            self.summary['collection'][bin_id]['hmms']
                            [hmm_search_source].values())

                output_file_obj = self.get_output_file_handle(
                    sub_directory='bins_across_samples',
                    prefix='hmm_hit_totals.txt')
                utils.store_dict_as_TAB_delimited_file(
                    n,
                    None,
                    headers=['bins'] +
                    sorted(self.summary['meta']['hmm_items']),
                    file_obj=output_file_obj)

            # store percent abundance of each bin
            self.summary[
                'bin_percent_recruitment'] = self.bin_percent_recruitment_per_sample
            self.summary['bin_percent_abundance_items'] = sorted(
                list(self.bin_percent_recruitment_per_sample.values())[0].keys())
            output_file_obj = self.get_output_file_handle(
                sub_directory='bins_across_samples',
                prefix='bins_percent_recruitment.txt')
            utils.store_dict_as_TAB_delimited_file(
                self.bin_percent_recruitment_per_sample,
                None,
                headers=['samples'] + sorted(self.collection_profile.keys()) +
                ['__splits_not_binned__'],
                file_obj=output_file_obj)

        if self.debug:
            import json
            print(json.dumps(self.summary, sort_keys=True, indent=4))

        self.index_html = SummaryHTMLOutput(
            self.summary, r=self.run,
            p=self.progress).generate(quick=self.quick)
Exemple #4
0
def get_contigs_db_info_dict(contigs_db_path, run=run, progress=progress, include_AA_counts=False, split_names=None, exclude_partial_gene_calls=True):
    """Returns an info dict for a given contigs db

    The dict starts with 'path' and every key found in the `a_meta` of the
    contigs database. On top of that come 'gc_content' and 'total_length'
    (both computed from the sequences that were read), 'gene_caller_ids',
    'excluded_gene_ids', 'num_genes', 'gene_lengths', 'avg_gene_length',
    'num_genes_per_kb', and the completeness results 'percent_complete',
    'percent_redundancy', 'scg_domain' and 'scg_domain_confidence'.
    'AA_counts' and 'total_AAs' are added only if `include_AA_counts` is true.

    When `split_names` is given, only those splits and the gene calls in them
    are considered. With `exclude_partial_gene_calls` (the default), gene
    calls marked as partial go to 'excluded_gene_ids' and are left out of all
    gene statistics.
    """

    class Args:
        def __init__(self):
            self.contigs_db = contigs_db_path

    args = Args()

    # NOTE(review): this silences the very objects that were passed in (by
    # default the module-level `run` and `progress`), and the previous
    # verbosity is never restored -- confirm callers are fine with that.
    run.verbose = False
    progress.verbose = False
    c = ContigsSuperclass(args, r=run, p=progress)

    info_dict = {'path': contigs_db_path}

    for key in c.a_meta:
        info_dict[key] = c.a_meta[key]

    # Two different strategies here depending on whether we work with a given set of split ids or
    # everything in the contigs database.
    if split_names:
        split_names = set(split_names)
        c.init_split_sequences()
        seq = ''.join([c.split_sequences[split_name] for split_name in split_names])
        candidate_gene_caller_ids = {e['gene_callers_id'] for e in c.genes_in_splits.values() if e['split'] in split_names}
    else:
        c.init_contig_sequences()
        seq = ''.join([e['sequence'] for e in c.contig_sequences.values()])
        candidate_gene_caller_ids = c.genes_in_contigs_dict.keys()

    info_dict['gc_content'] = sequence.Composition(seq).GC_content
    info_dict['total_length'] = len(seq)

    # sort candidate gene calls into the ones we keep and the ones we leave out
    gene_caller_ids = set()
    excluded_gene_ids = set()
    for gene_caller_id in candidate_gene_caller_ids:
        if c.genes_in_contigs_dict[gene_caller_id]['partial'] and exclude_partial_gene_calls:
            excluded_gene_ids.add(gene_caller_id)
        else:
            gene_caller_ids.add(gene_caller_id)

    gene_lengths = {}
    for gene_caller_id in gene_caller_ids:
        gene_call = c.genes_in_contigs_dict[gene_caller_id]
        gene_lengths[gene_caller_id] = gene_call['stop'] - gene_call['start']

    info_dict['gene_caller_ids'] = gene_caller_ids
    info_dict['excluded_gene_ids'] = excluded_gene_ids
    info_dict['num_genes'] = len(gene_caller_ids)
    info_dict['gene_lengths'] = gene_lengths
    # list() is needed because numpy cannot average a dict view directly
    info_dict['avg_gene_length'] = numpy.mean(list(gene_lengths.values()))
    info_dict['num_genes_per_kb'] = info_dict['num_genes'] * 1000.0 / info_dict['total_length']

    # get completeness / contamination estimates, either for the splits of
    # interest or for every split in the database
    completeness_estimator = completeness.Completeness(contigs_db_path)
    p_completion, p_redundancy, domain, domain_confidence, results_dict = completeness_estimator.get_info_for_splits(split_names if split_names else set(c.splits_basic_info.keys()))

    info_dict['percent_complete'] = p_completion
    info_dict['percent_redundancy'] = p_redundancy
    info_dict['scg_domain'] = domain
    info_dict['scg_domain_confidence'] = domain_confidence

    # lets get all amino acids used in all complete gene calls:
    if include_AA_counts:
        if split_names:
            AA_counts_dict = c.get_AA_counts_dict(split_names=split_names)
        else:
            AA_counts_dict = c.get_AA_counts_dict()

        info_dict['AA_counts'] = AA_counts_dict['AA_counts']
        info_dict['total_AAs'] = AA_counts_dict['total_AAs']

    return info_dict