def __init__(self, args, external_clustering=None):
    self.args = args
    self.views = {}
    self.states_table = None
    self.p_meta = {}
    self.title = 'Unknown Project'

    A = lambda x: args.__dict__[x] if x in args.__dict__ else None
    self.profile_db_path = A('profile_db')
    self.contigs_db_path = A('contigs_db')
    self.manual_mode = A('manual_mode')
    self.split_hmm_layers = A('split_hmm_layers')
    self.additional_layers_path = A('additional_layers')
    self.additional_view_path = A('additional_view')
    self.samples_information_db_path = A('samples_information_db')
    self.view = A('view')
    self.fasta_file = A('fasta_file')
    self.view_data_path = A('view_data')
    self.tree = A('tree')
    self.title = A('title')
    self.output_dir = A('output_dir')
    self.show_views = A('show_views')
    self.state = A('state')
    self.show_states = A('show_states')
    self.skip_check_names = A('skip_check_names')

    self.split_names_ordered = None
    self.additional_layers = None
    self.auxiliary_profile_data_available = False

    self.samples_information_dict = {}
    self.samples_order_dict = {}
    self.samples_information_default_layer_order = {}

    self.external_clustering = external_clustering

    self.collections = ccollections.Collections()

    ContigsSuperclass.__init__(self, self.args)

    if self.samples_information_db_path:
        samples_information_db = SamplesInformationDatabase(self.samples_information_db_path)
        self.samples_information_dict, self.samples_order_dict = samples_information_db.get_samples_information_and_order_dicts()
        self.samples_information_default_layer_order = samples_information_db.get_samples_information_default_layer_order()
        samples_information_db.disconnect()

    if self.contigs_db_path:
        self.completeness = completeness.Completeness(self.contigs_db_path)
        self.collections.populate_sources_dict(self.contigs_db_path, anvio.__contigs__version__)
    else:
        self.completeness = None

    if 'skip_init_functions' in args.__dict__ and not args.skip_init_functions:
        self.init_functions()

    # make sure we are not dealing with apples and oranges here.
    if self.contigs_db_path and self.profile_db_path:
        is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

    self.P = lambda x: os.path.join(self.p_meta['output_dir'], x)
    self.cwd = os.getcwd()

    # here is where the big deal stuff takes place:
    if self.manual_mode:
        self.load_from_user_files(args)
    else:
        self.load_from_anvio_files(args)

    # make sure the samples information database, if there is one, is in fact compatible with the
    # profile database. the reason we are doing this here is that in 'self.manual_mode',
    # self.p_meta['samples'] is filled within self.load_from_user_files based on the headers of
    # the view data.
    if self.profile_db_path and self.samples_information_db_path:
        is_profile_db_and_samples_db_compatible(self.profile_db_path, self.samples_information_db_path)

    if self.external_clustering:
        self.p_meta['clusterings'] = self.clusterings = self.external_clustering['clusterings']
        self.p_meta['available_clusterings'] = list(self.clusterings.keys())
        self.p_meta['default_clustering'] = self.external_clustering['default_clustering']

    if not self.state and 'default' in self.states_table.states:
        self.state = 'default'

    if not self.p_meta['clusterings']:
        if self.p_meta['merged']:
            raise ConfigError("This merged profile database does not seem to have any hierarchical clustering "
                              "that is required by the interactive interface. It may have been generated "
                              "by anvi-merge with the `--skip-hierarchical-clustering` flag, or the hierarchical "
                              "clustering step may have been skipped automatically by the platform. Please "
                              "read the help menu for anvi-merge, and/or refer to the tutorial: "
                              "http://merenlab.org/2015/05/01/anvio-tutorial/#clustering-during-merging")
        else:
            raise ConfigError("This single profile database does not seem to have any hierarchical clustering "
                              "that is required by the interactive interface. You must use the `--cluster-contigs` "
                              "flag for single profiles to access this functionality. Please read the help "
                              "menu for anvi-profile, and/or refer to the tutorial.")

    tree = Tree(self.p_meta['clusterings'][self.p_meta['default_clustering']]['newick'], format=1)

    # self.split_names_ordered is going to be the 'master' names list. everything else is going to
    # need to match these names:
    self.split_names_ordered = [n.name for n in tree.get_leaves()]

    # now that we know which splits we are interested in (self.split_names_ordered), we can get rid
    # of all the unnecessary splits stored in views dicts.
    self.prune_view_dicts()

    # if there are any HMM search results in the contigs database other than 'singlecopy' sources,
    # we would like to visualize them as additional layers. the following function is inherited from
    # the ContigsSuperclass and will fill self.hmm_searches_dict if appropriate data is found in
    # search tables:
    self.init_non_singlecopy_gene_hmm_sources(self.split_names_ordered, return_each_gene_as_a_layer=self.split_hmm_layers)

    if self.additional_layers_path:
        filesnpaths.is_file_tab_delimited(self.additional_layers_path)
        self.additional_layers = self.additional_layers_path

    self.check_names_consistency()
    self.convert_view_data_into_json()
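
# A minimal, self-contained sketch (not part of the original module) of the A()
# lookup pattern the constructor above relies on: attributes that were never
# added to the args object resolve to None instead of raising AttributeError,
# so a sparse argparse Namespace is enough. All names below are illustrative.
def _example_optional_args_lookup():
    from argparse import Namespace

    args = Namespace(profile_db='PROFILE.db', title='My Project')
    A = lambda x: args.__dict__[x] if x in args.__dict__ else None

    assert A('profile_db') == 'PROFILE.db'
    assert A('tree') is None  # never set on args, so it falls back to None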
def get_contigs_db_info_dict(contigs_db_path, run=run, progress=progress, include_AA_counts=False, split_names=None):
    """Returns an info dict for a given contigs db"""

    class Args:
        def __init__(self):
            self.contigs_db = contigs_db_path

    args = Args()
    run.verbose = False
    progress.verbose = False
    c = ContigsSuperclass(args, r=run, p=progress)

    info_dict = {'path': contigs_db_path}

    for key in c.a_meta:
        info_dict[key] = c.a_meta[key]

    if split_names:
        split_names = set(split_names)

    if split_names:
        c.init_split_sequences()
        seq = ''.join([c.split_sequences[split_name] for split_name in split_names])
        info_dict['total_length'] = len(seq)
        info_dict['gc_content'] = sequence.Composition(seq).GC_content

        info_dict['gene_caller_ids'] = set([e['gene_callers_id'] for e in c.genes_in_splits.values() if e['split'] in split_names])
        info_dict['num_genes'] = len(info_dict['gene_caller_ids'])
        info_dict['avg_gene_length'] = numpy.mean([(c.genes_in_contigs_dict[gene_caller_id]['stop'] - c.genes_in_contigs_dict[gene_caller_id]['start'])
                                                       for gene_caller_id in info_dict['gene_caller_ids']])
        info_dict['num_genes_per_kb'] = info_dict['num_genes'] * 1000.0 / info_dict['total_length']
        info_dict['num_splits'] = len(split_names)
    else:
        c.init_contig_sequences()
        seq = ''.join([e['sequence'] for e in c.contig_sequences.values()])
        info_dict['gc_content'] = sequence.Composition(seq).GC_content

        info_dict['num_genes'] = len(c.genes_in_contigs_dict)
        info_dict['gene_caller_ids'] = set(c.genes_in_contigs_dict.keys())
        info_dict['avg_gene_length'] = numpy.mean([(gene['stop'] - gene['start']) for gene in c.genes_in_contigs_dict.values() if not gene['partial']])
        info_dict['num_genes_per_kb'] = info_dict['num_genes'] * 1000.0 / info_dict['total_length']

    # get completeness / contamination estimates
    if split_names:
        comp = completeness.Completeness(contigs_db_path).get_info_for_splits(split_names)
    else:
        comp = completeness.Completeness(contigs_db_path).get_info_for_splits(set(c.splits_basic_info.keys()))

    if 'Campbell_et_al' in comp:
        info_dict['percent_complete'] = comp['Campbell_et_al']['percent_complete']
        info_dict['percent_redundancy'] = comp['Campbell_et_al']['percent_redundancy']

    # lets get all amino acids used in all complete gene calls:
    if include_AA_counts:
        if split_names:
            AA_counts_dict = c.get_AA_counts_dict(split_names=split_names)
        else:
            AA_counts_dict = c.get_AA_counts_dict()

        info_dict['AA_counts'] = AA_counts_dict['AA_counts']
        info_dict['total_AAs'] = AA_counts_dict['total_AAs']

    return info_dict
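
# Illustrative usage (not part of the original module) for the function above;
# `contigs_db_path` must point to a real anvi'o contigs database for this to
# run. Passing split_names would restrict every statistic to those splits;
# omitting it, as here, summarizes the entire database.
def _example_contigs_db_summary(contigs_db_path):
    info = get_contigs_db_info_dict(contigs_db_path, include_AA_counts=True)
    print('%d genes over %d nts (GC: %.4f; ~%.1f genes/kb)' % (info['num_genes'],
                                                               info['total_length'],
                                                               info['gc_content'],
                                                               info['num_genes_per_kb']))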
def process(self):
    # learn who you are:
    collection_dict = self.collections.get_collection_dict(self.collection_name)
    bins_info_dict = self.collections.get_bins_info_dict(self.collection_name)

    # init profile data for the collection.
    self.init_collection_profile(collection_dict)

    # load completeness information if available
    self.completeness = completeness.Completeness(self.contigs_db_path)
    if len(self.completeness.sources):
        self.completeness_data_available = True

    # load HMM sources for non-single-copy genes if available
    if self.non_singlecopy_gene_hmm_sources and not self.quick:
        self.init_non_singlecopy_gene_hmm_sources()
        self.non_single_copy_gene_hmm_data_available = True

    # load gene functions from the contigs db superclass
    self.init_functions()

    # set up the initial summary dictionary
    self.summary['meta'] = {
        'quick': self.quick,
        'output_directory': self.output_directory,
        'collection': list(collection_dict.keys()),
        'num_bins': len(collection_dict),
        'collection_name': self.collection_name,
        'total_nts_in_collection': 0,
        'num_contigs_in_collection': 0,
        'anvio_version': __version__,
        'profile': self.p_meta,
        'contigs': self.a_meta,
        'gene_coverages_data_available': self.gene_coverages_data_available,
        'completeness_data_available': self.completeness_data_available,
        'non_single_copy_gene_hmm_data_available': self.non_single_copy_gene_hmm_data_available,
        'percent_contigs_nts_described_by_collection': 0.0,
        'percent_profile_nts_described_by_collection': 0.0,
        'percent_contigs_nts_described_by_profile': P(self.p_meta['total_length'], self.a_meta['total_length']),
        'percent_contigs_contigs_described_by_profile': P(self.p_meta['num_contigs'], self.a_meta['num_contigs']),
        'percent_contigs_splits_described_by_profile': P(self.p_meta['num_splits'], self.a_meta['num_splits']),
    }

    # I am not sure whether this is the best place to do this,
    self.summary['basics_pretty'] = {
        'profile': [
            ('Created on', self.p_meta['creation_date']),
            ('Version', self.p_meta['version']),
            ('Minimum contig length', pretty(self.p_meta['min_contig_length'])),
            ('Number of contigs', pretty(int(self.p_meta['num_contigs']))),
            ('Number of splits', pretty(int(self.p_meta['num_splits']))),
            ('Total nucleotides', humanize_n(int(self.p_meta['total_length']))),
        ],
        'contigs': [
            ('Created on', self.a_meta['creation_date']),
            ('Version', self.a_meta['version']),
            ('Split length', pretty(int(self.a_meta['split_length']))),
            ('Number of contigs', pretty(int(self.a_meta['num_contigs']))),
            ('Number of splits', pretty(int(self.a_meta['num_splits']))),
            ('Total nucleotides', humanize_n(int(self.a_meta['total_length']))),
            ('K-mer size', self.a_meta['kmer_size']),
        ],
    }

    self.summary['max_shown_header_items'] = 10
    self.summary['slice_header_items_tmpl'] = '0:%d' % self.summary['max_shown_header_items']
    self.summary['num_not_shown_samples'] = len(self.p_meta['samples']) - self.summary['max_shown_header_items']
    self.summary['num_not_shown_hmm_items'] = dict([(hmm_search_source, len(self.hmm_sources_info[hmm_search_source]['genes']) - self.summary['max_shown_header_items'])
                                                        for hmm_search_type, hmm_search_source in self.hmm_searches_header])

    self.summary['files'] = {}
    self.summary['collection'] = {}
    self.summary['collection_profile'] = self.collection_profile  # reminder: collection_profile comes from ProfileSuperclass!
    self.summary['collection_profile_items'] = list(list(self.collection_profile.values())[0].keys())

    # add hmm items for each search type:
    if self.non_single_copy_gene_hmm_data_available:
        self.summary['meta']['hmm_items'] = dict([(hmm_search_source, self.hmm_sources_info[hmm_search_source]['genes'])
                                                      for hmm_search_type, hmm_search_source in self.hmm_searches_header])

    # summarize bins:
    for bin_id in collection_dict:
        bin = Bin(self, bin_id, collection_dict[bin_id], self.run, self.progress)
        bin.output_directory = os.path.join(self.output_directory, 'bin_by_bin', bin_id)
        bin.bin_profile = self.collection_profile[bin_id]

        self.summary['collection'][bin_id] = bin.create()
        self.summary['collection'][bin_id]['color'] = bins_info_dict[bin_id]['html_color'] or '#212121'
        self.summary['collection'][bin_id]['source'] = bins_info_dict[bin_id]['source'] or 'unknown_source'
        self.summary['meta']['total_nts_in_collection'] += self.summary['collection'][bin_id]['total_length']
        self.summary['meta']['num_contigs_in_collection'] += self.summary['collection'][bin_id]['num_contigs']

    # bins are computed. add some relevant meta info:
    self.summary['meta']['percent_contigs_nts_described_by_collection'] = '%.2f' % (self.summary['meta']['total_nts_in_collection'] * 100.0 / int(self.a_meta['total_length']))
    self.summary['meta']['percent_profile_nts_described_by_collection'] = '%.2f' % (self.summary['meta']['total_nts_in_collection'] * 100.0 / int(self.p_meta['total_length']))
    self.summary['meta']['bins'] = self.get_bins_ordered_by_completeness_and_size()

    if not self.quick:
        # generate a TAB-delimited text output file for bin summaries
        summary_of_bins_matrix_output = {}
        properties = ['taxon', 'total_length', 'num_contigs', 'N50', 'GC_content', 'percent_complete', 'percent_redundancy']

        for bin_name in self.summary['collection']:
            summary_of_bins_matrix_output[bin_name] = dict([(prop, self.summary['collection'][bin_name][prop]) for prop in properties])

        output_file_obj = self.get_output_file_handle(prefix='general_bins_summary.txt')
        utils.store_dict_as_TAB_delimited_file(summary_of_bins_matrix_output, None, headers=['bins'] + properties, file_obj=output_file_obj)

        # save merged matrices for bins x samples
        for table_name in list(self.collection_profile.values())[0]:
            d = {}
            for bin_id in self.collection_profile:
                d[bin_id] = self.collection_profile[bin_id][table_name]

            output_file_obj = self.get_output_file_handle(sub_directory='bins_across_samples', prefix='%s.txt' % table_name)
            utils.store_dict_as_TAB_delimited_file(d, None, headers=['bins'] + sorted(self.p_meta['samples']), file_obj=output_file_obj)

        # merge and store matrices for hmm hits
        if self.non_single_copy_gene_hmm_data_available:
            for hmm_search_source in self.summary['meta']['hmm_items']:
                # this is to keep numbers per hmm item:
                d = {}

                for bin_id in self.summary['meta']['bins']:
                    d[bin_id] = self.summary['collection'][bin_id]['hmms'][hmm_search_source]

                output_file_obj = self.get_output_file_handle(sub_directory='bins_across_samples', prefix='%s.txt' % hmm_search_source, within='hmms')
                utils.store_dict_as_TAB_delimited_file(d, None, headers=['bins'] + sorted(self.summary['meta']['hmm_items'][hmm_search_source]), file_obj=output_file_obj)

            # this is to keep the number of hmm hits per bin:
            n = dict([(bin_id, {}) for bin_id in self.summary['meta']['bins']])
            for hmm_search_source in self.summary['meta']['hmm_items']:
                for bin_id in self.summary['meta']['bins']:
                    n[bin_id][hmm_search_source] = sum(self.summary['collection'][bin_id]['hmms'][hmm_search_source].values())

            output_file_obj = self.get_output_file_handle(sub_directory='bins_across_samples', prefix='hmm_hit_totals.txt')
            utils.store_dict_as_TAB_delimited_file(n, None, headers=['bins'] + sorted(self.summary['meta']['hmm_items']), file_obj=output_file_obj)

        # store percent abundance of each bin
        self.summary['bin_percent_recruitment'] = self.bin_percent_recruitment_per_sample
        self.summary['bin_percent_abundance_items'] = sorted(list(self.bin_percent_recruitment_per_sample.values())[0].keys())
        output_file_obj = self.get_output_file_handle(sub_directory='bins_across_samples', prefix='bins_percent_recruitment.txt')
        utils.store_dict_as_TAB_delimited_file(self.bin_percent_recruitment_per_sample, None,
                                               headers=['samples'] + sorted(self.collection_profile.keys()) + ['__splits_not_binned__'],
                                               file_obj=output_file_obj)

    if self.debug:
        import json
        print(json.dumps(self.summary, sort_keys=True, indent=4))

    self.index_html = SummaryHTMLOutput(self.summary, r=self.run, p=self.progress).generate(quick=self.quick)
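
# A self-contained sketch (with made-up numbers) of the "merged matrices for
# bins x samples" step inside process() above: for every per-bin table, rows
# become bins and columns become samples before the matrix is written out as a
# TAB-delimited file.
def _example_bins_x_samples_merge():
    collection_profile = {'Bin_1': {'mean_coverage': {'s01': 12.4, 's02': 8.1}},
                          'Bin_2': {'mean_coverage': {'s01': 3.3, 's02': 5.9}}}

    for table_name in list(collection_profile.values())[0]:
        d = dict([(bin_id, profile[table_name]) for bin_id, profile in collection_profile.items()])
        # d is now a bins x samples dict, e.g.:
        # {'Bin_1': {'s01': 12.4, 's02': 8.1}, 'Bin_2': {'s01': 3.3, 's02': 5.9}}
        print(table_name, d)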
def get_contigs_db_info_dict(contigs_db_path, run=run, progress=progress, include_AA_counts=False, split_names=None, exclude_partial_gene_calls=True):
    """Returns an info dict for a given contigs db"""

    class Args:
        def __init__(self):
            self.contigs_db = contigs_db_path

    args = Args()
    run.verbose = False
    progress.verbose = False
    c = ContigsSuperclass(args, r=run, p=progress)

    info_dict = {'path': contigs_db_path}

    for key in c.a_meta:
        info_dict[key] = c.a_meta[key]

    # two different strategies here depending on whether we work with a given set of split ids or
    # everything in the contigs database.
    if split_names:
        split_names = set(split_names)
        c.init_split_sequences()
        seq = ''.join([c.split_sequences[split_name] for split_name in split_names])
        candidate_gene_caller_ids = set([e['gene_callers_id'] for e in c.genes_in_splits.values() if e['split'] in split_names])
    else:
        c.init_contig_sequences()
        seq = ''.join([e['sequence'] for e in c.contig_sequences.values()])
        candidate_gene_caller_ids = list(c.genes_in_contigs_dict.keys())

    info_dict['gc_content'] = sequence.Composition(seq).GC_content
    info_dict['total_length'] = len(seq)

    gene_caller_ids = set([])
    excluded_gene_ids = set([])
    for gene_caller_id in candidate_gene_caller_ids:
        if c.genes_in_contigs_dict[gene_caller_id]['partial'] and exclude_partial_gene_calls:
            excluded_gene_ids.add(gene_caller_id)
        else:
            gene_caller_ids.add(gene_caller_id)

    info_dict['gene_caller_ids'] = gene_caller_ids
    info_dict['excluded_gene_ids'] = excluded_gene_ids
    info_dict['num_genes'] = len(gene_caller_ids)
    info_dict['gene_lengths'] = dict([(gene_caller_id, (c.genes_in_contigs_dict[gene_caller_id]['stop'] - c.genes_in_contigs_dict[gene_caller_id]['start']))
                                          for gene_caller_id in gene_caller_ids])
    info_dict['avg_gene_length'] = numpy.mean(list(info_dict['gene_lengths'].values()))
    info_dict['num_genes_per_kb'] = info_dict['num_genes'] * 1000.0 / info_dict['total_length']

    # get completeness / contamination estimates
    p_completion, p_redundancy, domain, domain_confidence, results_dict = completeness.Completeness(contigs_db_path).get_info_for_splits(split_names if split_names else set(c.splits_basic_info.keys()))
    info_dict['percent_complete'] = p_completion
    info_dict['percent_redundancy'] = p_redundancy
    info_dict['scg_domain'] = domain
    info_dict['scg_domain_confidence'] = domain_confidence

    # lets get all amino acids used in all complete gene calls:
    if include_AA_counts:
        if split_names:
            AA_counts_dict = c.get_AA_counts_dict(split_names=split_names)
        else:
            AA_counts_dict = c.get_AA_counts_dict()

        info_dict['AA_counts'] = AA_counts_dict['AA_counts']
        info_dict['total_AAs'] = AA_counts_dict['total_AAs']

    return info_dict
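
# Illustrative comparison (not part of the original module) of the
# exclude_partial_gene_calls flag introduced in the variant above. Requires a
# real anvi'o contigs database at `contigs_db_path`.
def _example_partial_gene_call_filtering(contigs_db_path):
    strict = get_contigs_db_info_dict(contigs_db_path)  # partial calls excluded by default
    lenient = get_contigs_db_info_dict(contigs_db_path, exclude_partial_gene_calls=False)

    print('%d partial gene calls excluded; avg gene length %.1f vs %.1f' % (
        len(strict['excluded_gene_ids']),
        strict['avg_gene_length'],
        lenient['avg_gene_length']))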