def test(column, quiet=False, test_class=variability_test_class):
    """Benchmark the 'C' and 'Python' ColumnProfile implementations on `column`.

    Runs both backends on the same column (coverage = len(column), pos = 0),
    records each profile and its wall-clock runtime in a results dict, and
    reports how much faster the C backend was.

    Parameters
    ==========
    column : sequence of nucleotides to profile (its length is used as coverage)
    quiet : bool, suppress per-method progress output when True
    test_class : test factory handed to ColumnProfile (default set at import time)
    """
    coverage = len(column)
    results = {}

    if not quiet:
        run.warning('', 'Profiling results for %s nts [TestFactory = %s]' \
                            % (pretty(len(column)), 'True' if test_class else 'False'))

    for method in ('C', 'Python'):
        started = time.time()
        profile = ColumnProfile[method](column,
                                        coverage=coverage,
                                        pos=0,
                                        test_class=test_class).profile
        elapsed = time.time() - started

        results[method] = {'profile': profile, 'delta_time': elapsed}

        if not quiet:
            run.info('%s profile' % method, profile)
            run.info('%s response time' % method, elapsed)

    run.info('Result', 'C is ~%.2f times faster' % (results['Python']['delta_time'] / results['C']['delta_time']), mc='green')
def test(column, quiet=False, test_class=variability_test_class):
    """Time the 'C' vs 'Python' ColumnProfile backends over a single column.

    Both engines profile the identical input; per-engine profiles and
    runtimes are collected, then the C-over-Python speedup is reported.

    Parameters
    ==========
    column : sequence of nucleotides; len(column) doubles as the coverage value
    quiet : bool, when True only the final speedup line is printed
    test_class : factory object passed through to ColumnProfile
    """
    coverage = len(column)
    results = {}

    if not quiet:
        factory_flag = 'True' if test_class else 'False'
        run.warning('', 'Profiling results for %s nts [TestFactory = %s]' % (pretty(len(column)), factory_flag))

    for engine in ['C', 'Python']:
        results[engine] = {}

        t0 = time.time()
        results[engine]['profile'] = ColumnProfile[engine](column,
                                                           coverage=coverage,
                                                           pos=0,
                                                           test_class=test_class).profile
        results[engine]['delta_time'] = time.time() - t0

        if not quiet:
            run.info('%s profile' % engine, results[engine]['profile'])
            run.info('%s response time' % engine, results[engine]['delta_time'])

    speedup = results['Python']['delta_time'] / results['C']['delta_time']
    run.info('Result', 'C is ~%.2f times faster' % speedup, mc='green')
def process(self):
    """Summarize each bin in the collection and generate the summary output.

    Fills in self.summary with collection-level meta info, per-bin
    statistics, and (unless self.quick) writes a set of TAB-delimited
    matrices, then builds the HTML index via SummaryHTMLOutput.

    NOTE(review): this method reads a lot of state initialized elsewhere
    on self (collections, p_meta, a_meta, collection_profile, summary,
    hmm_searches_header, ...) -- it is only meaningful after the
    superclass initialization has run; verify against the caller.
    """
    # learn who you are: recover the splits-per-bin mapping and the
    # display colors for this collection id.
    collection_dict = self.collections.get_collection_dict(self.collection_id)
    collection_colors = self.collections.get_collection_colors(self.collection_id)

    # init profile data for collection.
    self.init_collection_profile(collection_dict)

    # load completeness information if available
    self.completeness = completeness.Completeness(self.contigs_db_path)
    if len(self.completeness.sources):
        self.completeness_data_available = True

    # load HMM sources for non-single-copy genes if available (skipped in
    # quick mode to save time)
    if self.non_singlecopy_gene_hmm_sources and not self.quick:
        self.init_non_singlecopy_gene_hmm_sources()
        self.non_single_copy_gene_hmm_data_available = True

    # load gene functions from contigs db superclass
    self.init_functions()

    # set up the initial summary dictionary. the three 'percent_*' keys set
    # to 0.0 here are filled in after the per-bin loop below.
    self.summary['meta'] = {'quick': self.quick,
                            'output_directory': self.output_directory,
                            'collection': collection_dict.keys(),
                            'num_bins': len(collection_dict.keys()),
                            'collection_id': self.collection_id,
                            'total_nts_in_collection': 0,
                            'num_contigs_in_collection': 0,
                            'anvio_version': __version__,
                            'profile': self.p_meta,
                            'contigs': self.a_meta,
                            'gene_coverages_data_available': self.gene_coverages_data_available,
                            'completeness_data_available': self.completeness_data_available,
                            'non_single_copy_gene_hmm_data_available': self.non_single_copy_gene_hmm_data_available,
                            'percent_contigs_nts_described_by_collection': 0.0,
                            'percent_profile_nts_described_by_collection': 0.0,
                            'percent_contigs_nts_described_by_profile': P(self.p_meta['total_length'], self.a_meta['total_length']),
                            'percent_contigs_contigs_described_by_profile': P(self.p_meta['num_contigs'], self.a_meta['num_contigs']),
                            'percent_contigs_splits_described_by_profile': P(self.p_meta['num_splits'], self.a_meta['num_splits']),
                            }

    # I am not sure whether this is the best place to do this,
    # NOTE(review): 'Minimum conting length' below is a user-facing label
    # with a typo ('conting'); left untouched since it is runtime output.
    self.summary['basics_pretty'] = {'profile': [
                                                 ('Created on', self.p_meta['creation_date']),
                                                 ('Version', self.p_meta['version']),
                                                 ('Minimum conting length', pretty(self.p_meta['min_contig_length'])),
                                                 ('Number of contigs', pretty(int(self.p_meta['num_contigs']))),
                                                 ('Number of splits', pretty(int(self.p_meta['num_splits']))),
                                                 ('Total nucleotides', humanize_n(int(self.p_meta['total_length']))),
                                                ],
                                     'contigs': [
                                                 ('Created on', self.p_meta['creation_date']),
                                                 ('Version', self.a_meta['version']),
                                                 ('Split length', pretty(int(self.a_meta['split_length']))),
                                                 ('Number of contigs', pretty(int(self.a_meta['num_contigs']))),
                                                 ('Number of splits', pretty(int(self.a_meta['num_splits']))),
                                                 ('Total nucleotides', humanize_n(int(self.a_meta['total_length']))),
                                                 ('K-mer size', self.a_meta['kmer_size']),
                                                ],
                                    }

    # presentation limits for headers in the HTML output; anything past
    # max_shown_header_items is summarized as "N more".
    self.summary['max_shown_header_items'] = 10
    self.summary['slice_header_items_tmpl'] = '0:%d' % self.summary['max_shown_header_items']
    self.summary['num_not_shown_samples'] = len(self.p_meta['samples']) - self.summary['max_shown_header_items']
    self.summary['num_not_shown_hmm_items'] = dict([(hmm_search_source, len(self.hmm_sources_info[hmm_search_source]['genes']) - self.summary['max_shown_header_items']) for hmm_search_type, hmm_search_source in self.hmm_searches_header])

    self.summary['files'] = {}
    self.summary['collection'] = {}
    self.summary['collection_profile'] = self.collection_profile  # reminder; collection_profile comes from ProfileSuperclass!
    # NOTE(review): .values()[0] is Python-2-only (list view); would need
    # list(...) under Python 3.
    self.summary['collection_profile_items'] = self.collection_profile.values()[0].keys()

    # add hmm items for each seach type:
    if self.non_single_copy_gene_hmm_data_available:
        self.summary['meta']['hmm_items'] = dict([(hmm_search_source, self.hmm_sources_info[hmm_search_source]['genes']) for hmm_search_type, hmm_search_source in self.hmm_searches_header])

    # summarize bins: each Bin writes its own per-bin output directory and
    # returns a stats dict that is accumulated into the collection totals.
    # NOTE(review): `bin` shadows the builtin; kept as-is here.
    for bin_id in collection_dict:
        bin = Bin(self, bin_id, collection_dict[bin_id], self.run, self.progress)
        bin.output_directory = os.path.join(self.output_directory, 'bin_by_bin', bin_id)
        bin.bin_profile = self.collection_profile[bin_id]

        self.summary['collection'][bin_id] = bin.create()
        # fall back to a default gray when no color is defined for the bin
        self.summary['collection'][bin_id]['color'] = collection_colors[bin_id] or '#212121'
        self.summary['meta']['total_nts_in_collection'] += self.summary['collection'][bin_id]['total_length']
        self.summary['meta']['num_contigs_in_collection'] += self.summary['collection'][bin_id]['num_contigs']

    # bins are computed, add some relevant meta info:
    self.summary['meta']['percent_contigs_nts_described_by_collection'] = '%.2f' % (self.summary['meta']['total_nts_in_collection'] * 100.0 / int(self.a_meta['total_length']))
    self.summary['meta']['percent_profile_nts_described_by_collection'] = '%.2f' % (self.summary['meta']['total_nts_in_collection'] * 100.0 / int(self.p_meta['total_length']))
    self.summary['meta']['bins'] = self.get_bins_ordered_by_completeness_and_size()

    if not self.quick:
        # generate a TAB-delimited text output file for bin summaries
        summary_of_bins_matrix_output = {}
        properties = ['taxon', 'total_length', 'num_contigs', 'N50', 'GC_content', 'percent_complete', 'percent_redundancy']

        for bin_name in self.summary['collection']:
            summary_of_bins_matrix_output[bin_name] = dict([(prop, self.summary['collection'][bin_name][prop]) for prop in properties])

        output_file_obj = self.get_output_file_handle(prefix='general_bins_summary.txt')
        utils.store_dict_as_TAB_delimited_file(summary_of_bins_matrix_output, None, headers=['bins'] + properties, file_obj=output_file_obj)

        # save merged matrices for bins x samples (one file per profile table)
        for table_name in self.collection_profile.values()[0].keys():
            d = {}
            for bin_id in self.collection_profile:
                d[bin_id] = self.collection_profile[bin_id][table_name]

            output_file_obj = self.get_output_file_handle(sub_directory='bins_across_samples', prefix='%s.txt' % table_name)
            utils.store_dict_as_TAB_delimited_file(d, None, headers=['bins'] + sorted(self.p_meta['samples']), file_obj=output_file_obj)

        # merge and store matrices for hmm hits
        if self.non_single_copy_gene_hmm_data_available:
            for hmm_search_source in self.summary['meta']['hmm_items']:
                # this is to keep numbers per hmm item:
                d = {}
                for bin_id in self.summary['meta']['bins']:
                    d[bin_id] = self.summary['collection'][bin_id]['hmms'][hmm_search_source]

                output_file_obj = self.get_output_file_handle(sub_directory='bins_across_samples', prefix='%s.txt' % hmm_search_source, within='hmms')
                utils.store_dict_as_TAB_delimited_file(d, None, headers=['bins'] + sorted(self.summary['meta']['hmm_items'][hmm_search_source]), file_obj=output_file_obj)

            # this is to keep number of hmm hits per bin:
            n = dict([(bin_id, {}) for bin_id in self.summary['meta']['bins']])
            for hmm_search_source in self.summary['meta']['hmm_items']:
                for bin_id in self.summary['meta']['bins']:
                    n[bin_id][hmm_search_source] = sum(self.summary['collection'][bin_id]['hmms'][hmm_search_source].values())

            output_file_obj = self.get_output_file_handle(sub_directory='bins_across_samples', prefix='hmm_hit_totals.txt')
            utils.store_dict_as_TAB_delimited_file(n, None, headers=['bins'] + sorted(self.summary['meta']['hmm_items']), file_obj=output_file_obj)

        # store percent abundance of each bin
        self.summary['bin_percent_recruitment'] = self.bin_percent_recruitment_per_sample
        self.summary['bin_percent_abundance_items'] = sorted(self.bin_percent_recruitment_per_sample.values()[0].keys())
        output_file_obj = self.get_output_file_handle(sub_directory='bins_across_samples', prefix='bins_percent_recruitment.txt')
        utils.store_dict_as_TAB_delimited_file(self.bin_percent_recruitment_per_sample, None, headers=['samples'] + sorted(self.collection_profile.keys()) + ['__splits_not_binned__'], file_obj=output_file_obj)

    if self.debug:
        import json
        # Python 2 print statement; dumps the whole summary for inspection
        print json.dumps(self.summary, sort_keys=True, indent=4)

    self.index_html = SummaryHTMLOutput(self.summary, r=self.run, p=self.progress).generate(quick=self.quick)
def process(self):
    """Summarize each bin in the named collection and generate the output.

    Fills in self.summary with collection-level meta info, per-bin
    statistics (including per-bin color/source from the bins info dict),
    and (unless self.quick) writes TAB-delimited matrices, then builds
    the HTML index via SummaryHTMLOutput.

    NOTE(review): reads a lot of state initialized elsewhere on self
    (collections, p_meta, a_meta, collection_profile, summary,
    hmm_searches_header, ...) -- only meaningful after superclass
    initialization; verify against the caller.
    """
    # learn who you are: splits-per-bin mapping plus per-bin display
    # metadata (color, source) for this collection name.
    collection_dict = self.collections.get_collection_dict(self.collection_name)
    bins_info_dict = self.collections.get_bins_info_dict(self.collection_name)

    # init profile data for collection.
    self.init_collection_profile(collection_dict)

    # load completeness information if available
    self.completeness = completeness.Completeness(self.contigs_db_path)
    if len(self.completeness.sources):
        self.completeness_data_available = True

    # load HMM sources for non-single-copy genes if available (skipped in
    # quick mode to save time)
    if self.non_singlecopy_gene_hmm_sources and not self.quick:
        self.init_non_singlecopy_gene_hmm_sources()
        self.non_single_copy_gene_hmm_data_available = True

    # load gene functions from contigs db superclass
    self.init_functions()

    # set up the initial summary dictionary. the 'percent_*' keys set to
    # 0.0 here are filled in after the per-bin loop below.
    self.summary['meta'] = {'quick': self.quick,
                            'output_directory': self.output_directory,
                            'collection': collection_dict.keys(),
                            'num_bins': len(collection_dict.keys()),
                            'collection_name': self.collection_name,
                            'total_nts_in_collection': 0,
                            'num_contigs_in_collection': 0,
                            'anvio_version': __version__,
                            'profile': self.p_meta,
                            'contigs': self.a_meta,
                            'gene_coverages_data_available': self.gene_coverages_data_available,
                            'completeness_data_available': self.completeness_data_available,
                            'non_single_copy_gene_hmm_data_available': self.non_single_copy_gene_hmm_data_available,
                            'percent_contigs_nts_described_by_collection': 0.0,
                            'percent_profile_nts_described_by_collection': 0.0,
                            'percent_contigs_nts_described_by_profile': P(self.p_meta['total_length'], self.a_meta['total_length']),
                            'percent_contigs_contigs_described_by_profile': P(self.p_meta['num_contigs'], self.a_meta['num_contigs']),
                            'percent_contigs_splits_described_by_profile': P(self.p_meta['num_splits'], self.a_meta['num_splits']),
                            }

    # I am not sure whether this is the best place to do this,
    # NOTE(review): 'Minimum conting length' below is a user-facing label
    # with a typo ('conting'); left untouched since it is runtime output.
    self.summary['basics_pretty'] = {'profile': [
                                                 ('Created on', self.p_meta['creation_date']),
                                                 ('Version', self.p_meta['version']),
                                                 ('Minimum conting length', pretty(self.p_meta['min_contig_length'])),
                                                 ('Number of contigs', pretty(int(self.p_meta['num_contigs']))),
                                                 ('Number of splits', pretty(int(self.p_meta['num_splits']))),
                                                 ('Total nucleotides', humanize_n(int(self.p_meta['total_length']))),
                                                ],
                                     'contigs': [
                                                 ('Created on', self.p_meta['creation_date']),
                                                 ('Version', self.a_meta['version']),
                                                 ('Split length', pretty(int(self.a_meta['split_length']))),
                                                 ('Number of contigs', pretty(int(self.a_meta['num_contigs']))),
                                                 ('Number of splits', pretty(int(self.a_meta['num_splits']))),
                                                 ('Total nucleotides', humanize_n(int(self.a_meta['total_length']))),
                                                 ('K-mer size', self.a_meta['kmer_size']),
                                                ],
                                    }

    # presentation limits for headers in the HTML output; anything past
    # max_shown_header_items is summarized as "N more".
    self.summary['max_shown_header_items'] = 10
    self.summary['slice_header_items_tmpl'] = '0:%d' % self.summary['max_shown_header_items']
    self.summary['num_not_shown_samples'] = len(self.p_meta['samples']) - self.summary['max_shown_header_items']
    self.summary['num_not_shown_hmm_items'] = dict([(hmm_search_source, len(self.hmm_sources_info[hmm_search_source]['genes']) - self.summary['max_shown_header_items']) for hmm_search_type, hmm_search_source in self.hmm_searches_header])

    self.summary['files'] = {}
    self.summary['collection'] = {}
    self.summary['collection_profile'] = self.collection_profile  # reminder; collection_profile comes from ProfileSuperclass!
    # NOTE(review): .values()[0] is Python-2-only (list view); would need
    # list(...) under Python 3.
    self.summary['collection_profile_items'] = self.collection_profile.values()[0].keys()

    # add hmm items for each seach type:
    if self.non_single_copy_gene_hmm_data_available:
        self.summary['meta']['hmm_items'] = dict([(hmm_search_source, self.hmm_sources_info[hmm_search_source]['genes']) for hmm_search_type, hmm_search_source in self.hmm_searches_header])

    # summarize bins: each Bin writes its own per-bin output directory and
    # returns a stats dict that is accumulated into the collection totals.
    # NOTE(review): `bin` shadows the builtin; kept as-is here.
    for bin_id in collection_dict:
        bin = Bin(self, bin_id, collection_dict[bin_id], self.run, self.progress)
        bin.output_directory = os.path.join(self.output_directory, 'bin_by_bin', bin_id)
        bin.bin_profile = self.collection_profile[bin_id]

        self.summary['collection'][bin_id] = bin.create()
        # fall back to defaults when bins_info_dict carries empty values
        self.summary['collection'][bin_id]['color'] = bins_info_dict[bin_id]['html_color'] or '#212121'
        self.summary['collection'][bin_id]['source'] = bins_info_dict[bin_id]['source'] or 'unknown_source'
        self.summary['meta']['total_nts_in_collection'] += self.summary['collection'][bin_id]['total_length']
        self.summary['meta']['num_contigs_in_collection'] += self.summary['collection'][bin_id]['num_contigs']

    # bins are computed, add some relevant meta info:
    self.summary['meta']['percent_contigs_nts_described_by_collection'] = '%.2f' % (self.summary['meta']['total_nts_in_collection'] * 100.0 / int(self.a_meta['total_length']))
    self.summary['meta']['percent_profile_nts_described_by_collection'] = '%.2f' % (self.summary['meta']['total_nts_in_collection'] * 100.0 / int(self.p_meta['total_length']))
    self.summary['meta']['bins'] = self.get_bins_ordered_by_completeness_and_size()

    if not self.quick:
        # generate a TAB-delimited text output file for bin summaries
        summary_of_bins_matrix_output = {}
        properties = ['taxon', 'total_length', 'num_contigs', 'N50', 'GC_content', 'percent_complete', 'percent_redundancy']

        for bin_name in self.summary['collection']:
            summary_of_bins_matrix_output[bin_name] = dict([(prop, self.summary['collection'][bin_name][prop]) for prop in properties])

        output_file_obj = self.get_output_file_handle(prefix='general_bins_summary.txt')
        utils.store_dict_as_TAB_delimited_file(summary_of_bins_matrix_output, None, headers=['bins'] + properties, file_obj=output_file_obj)

        # save merged matrices for bins x samples (one file per profile table)
        for table_name in self.collection_profile.values()[0].keys():
            d = {}
            for bin_id in self.collection_profile:
                d[bin_id] = self.collection_profile[bin_id][table_name]

            output_file_obj = self.get_output_file_handle(sub_directory='bins_across_samples', prefix='%s.txt' % table_name)
            utils.store_dict_as_TAB_delimited_file(d, None, headers=['bins'] + sorted(self.p_meta['samples']), file_obj=output_file_obj)

        # merge and store matrices for hmm hits
        if self.non_single_copy_gene_hmm_data_available:
            for hmm_search_source in self.summary['meta']['hmm_items']:
                # this is to keep numbers per hmm item:
                d = {}
                for bin_id in self.summary['meta']['bins']:
                    d[bin_id] = self.summary['collection'][bin_id]['hmms'][hmm_search_source]

                output_file_obj = self.get_output_file_handle(sub_directory='bins_across_samples', prefix='%s.txt' % hmm_search_source, within='hmms')
                utils.store_dict_as_TAB_delimited_file(d, None, headers=['bins'] + sorted(self.summary['meta']['hmm_items'][hmm_search_source]), file_obj=output_file_obj)

            # this is to keep number of hmm hits per bin:
            n = dict([(bin_id, {}) for bin_id in self.summary['meta']['bins']])
            for hmm_search_source in self.summary['meta']['hmm_items']:
                for bin_id in self.summary['meta']['bins']:
                    n[bin_id][hmm_search_source] = sum(self.summary['collection'][bin_id]['hmms'][hmm_search_source].values())

            output_file_obj = self.get_output_file_handle(sub_directory='bins_across_samples', prefix='hmm_hit_totals.txt')
            utils.store_dict_as_TAB_delimited_file(n, None, headers=['bins'] + sorted(self.summary['meta']['hmm_items']), file_obj=output_file_obj)

        # store percent abundance of each bin
        self.summary['bin_percent_recruitment'] = self.bin_percent_recruitment_per_sample
        self.summary['bin_percent_abundance_items'] = sorted(self.bin_percent_recruitment_per_sample.values()[0].keys())
        output_file_obj = self.get_output_file_handle(sub_directory='bins_across_samples', prefix='bins_percent_recruitment.txt')
        utils.store_dict_as_TAB_delimited_file(self.bin_percent_recruitment_per_sample, None, headers=['samples'] + sorted(self.collection_profile.keys()) + ['__splits_not_binned__'], file_obj=output_file_obj)

    if self.debug:
        import json
        # Python 2 print statement; dumps the whole summary for inspection
        print json.dumps(self.summary, sort_keys=True, indent=4)

    self.index_html = SummaryHTMLOutput(self.summary, r=self.run, p=self.progress).generate(quick=self.quick)