def gen_view_data_tables_from_atomic_data(self):
    essential_fields = [f for f in self.atomic_data_fields if constants.IS_ESSENTIAL_FIELD(f)]
    auxiliary_fields = [f for f in self.atomic_data_fields if constants.IS_AUXILIARY_FIELD(f)]

    views_table = dbops.TableForViews(self.profile_db_path, anvio.__profile__version__, progress=self.progress)

    # setting standard view table structure and types
    view_table_structure = ['contig'] + self.merged_sample_ids + auxiliary_fields
    view_table_types = ['text'] + ['numeric'] * len(self.merged_sample_ids) + ['text']

    # generate a dictionary for normalized coverage of each contig across samples per target
    self.normalized_coverages = {'contigs': {}, 'splits': {}}
    for target in ['contigs', 'splits']:
        for split_name in self.split_names:
            self.normalized_coverages[target][split_name] = {}
            for sample_id in self.merged_sample_ids:
                self.normalized_coverages[target][split_name][sample_id] = self.get_normalized_coverage_of_split(target, sample_id, split_name)

    # generate a dictionary for max normalized ratio of each contig across samples per target
    self.max_normalized_ratios = {'contigs': {}, 'splits': {}}
    for target in ['contigs', 'splits']:
        for split_name in self.split_names:
            self.max_normalized_ratios[target][split_name] = self.get_max_normalized_ratio_of_split(target, split_name)

    self.progress.new('Generating view data tables')
    profile_db = dbops.ProfileDatabase(self.profile_db_path, quiet=True)
    for target in ['contigs', 'splits']:
        for essential_field in essential_fields:
            self.progress.update('Processing %s for %s ...' % (essential_field, target))

            target_table = '_'.join([essential_field, target])

            m = {}
            for split_name in self.split_names:
                m[split_name] = {'__parent__': self.split_parents[split_name]}

                for sample_id in self.merged_sample_ids:
                    if essential_field == 'normalized_coverage':
                        m[split_name][sample_id] = self.normalized_coverages[target][split_name][sample_id]
                    elif essential_field == 'max_normalized_ratio':
                        m[split_name][sample_id] = self.max_normalized_ratios[target][split_name][sample_id]
                    elif essential_field == 'relative_abundance':
                        m[split_name][sample_id] = self.get_relative_abundance_of_split(target, sample_id, split_name)
                    else:
                        m[split_name][sample_id] = self.atomic_data_for_each_run[target][sample_id][split_name][essential_field]

            # variable 'm' for the essential field is now ready to be its own table:
            profile_db.db.create_table(target_table, view_table_structure, view_table_types)
            db_entries = [tuple([split_name] + [m[split_name][h] for h in view_table_structure[1:]]) for split_name in self.split_names]
            profile_db.db._exec_many('''INSERT INTO %s VALUES (%s)''' % (target_table, ','.join(['?'] * len(view_table_structure))), db_entries)

            if target == 'splits':
                views_table.append(essential_field, target_table)

    profile_db.disconnect()
    self.progress.end()

    # store views in the database
    views_table.store()
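
# --------------------------------------------------------------------------------------------------
# Illustrative sketch (not part of anvi'o): the method above builds, for every essential field, a
# view table whose columns are 'contig', one numeric column per merged sample, and the auxiliary
# '__parent__' column, then bulk-inserts one row per split. The snippet below reproduces that
# create-table / parameterized-INSERT pattern with the standard sqlite3 module, assuming that
# profile_db.db._exec_many behaves like a thin wrapper around cursor.executemany. The function
# name, table name, and sample ids here are made up for the example.
# --------------------------------------------------------------------------------------------------
import sqlite3

def _sketch_store_view_table(db_path, table_name, sample_ids, rows_per_split):
    """rows_per_split: {split_name: {sample_id: value, ..., '__parent__': parent_name}}"""
    structure = ['contig'] + sample_ids + ['__parent__']
    types = ['text'] + ['numeric'] * len(sample_ids) + ['text']

    db = sqlite3.connect(db_path)
    # create the view table with one column per entry in `structure`:
    db.execute('CREATE TABLE %s (%s)' % (table_name,
               ', '.join('%s %s' % (c, t) for c, t in zip(structure, types))))

    # one tuple per split, ordered to match the table structure:
    db_entries = [tuple([split_name] + [rows_per_split[split_name][h] for h in structure[1:]])
                  for split_name in rows_per_split]
    db.executemany('INSERT INTO %s VALUES (%s)' % (table_name, ','.join(['?'] * len(structure))),
                   db_entries)
    db.commit()
    db.close()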
def _run(self):
    self.check_args()

    self.set_sample_id()

    self.init_dirs_and_dbs()

    self.run.info('anvio', anvio.__version__)
    self.run.info('profiler_version', anvio.__profile__version__)
    self.run.info('sample_id', self.sample_id)
    self.run.info('profile_db', self.profile_db_path, display_only=True)
    self.run.info('contigs_db', True if self.contigs_db_path else False)
    self.run.info('contigs_db_hash', self.a_meta['contigs_db_hash'])
    self.run.info('cmd_line', utils.get_cmd_line())
    self.run.info('merged', False)
    self.run.info('split_length', self.a_meta['split_length'])
    self.run.info('min_contig_length', self.min_contig_length)
    self.run.info('min_mean_coverage', self.min_mean_coverage)
    self.run.info('clustering_performed', self.contigs_shall_be_clustered)
    self.run.info('min_coverage_for_variability', self.min_coverage_for_variability)
    self.run.info('skip_SNV_profiling', self.skip_SNV_profiling)
    self.run.info('skip_AA_frequencies', self.skip_AA_frequencies)
    self.run.info('report_variability_full', self.report_variability_full)
    self.run.info('gene_coverages_computed', self.a_meta['genes_are_called'])

    # this is kinda important. we do not run the full-blown profile function if we are
    # dealing with a summarized profile...
    if self.input_file_path:
        self.init_profile_from_BAM()
        self.profile()
        if self.gen_serialized_profile:
            self.store_profile()
    else:
        self.init_serialized_profile()

    self.generate_variabile_positions_table()
    self.profile_AA_frequencies()
    self.generate_gene_coverages_table()
    self.store_split_coverages()

    # here we store atomic data for contigs and splits into the database:
    profile_db = dbops.ProfileDatabase(self.profile_db_path, quiet=True)
    self.atomic_contig_split_data.store_atomic_data_for_contigs_and_splits(self.sample_id, self.contigs, profile_db.db)
    profile_db.disconnect()

    # the only view for the single PROFILE database is ready, and already
    # set as the default view. store the info in the db:
    views_table = dbops.TableForViews(self.profile_db_path, anvio.__profile__version__)
    views_table.append('single', 'atomic_data_splits')
    views_table.store()

    if self.contigs_shall_be_clustered:
        self.cluster_contigs()

    runinfo_serialized = self.generate_output_destination('RUNINFO.cp')
    self.run.info('runinfo', runinfo_serialized)
    self.run.store_info_dict(runinfo_serialized, strip_prefix=self.output_directory)

    self.bam.close()
    self.run.quit()
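
# --------------------------------------------------------------------------------------------------
# Illustrative sketch (not part of anvi'o): after _run() finishes, the single profile database is
# expected to carry a view registry mapping the view name ('single') to the data table that backs
# it ('atomic_data_splits'), which is what the TableForViews.append()/store() calls above write.
# The snippet below reads that registry back with the standard sqlite3 module. The table name
# 'views' and its column layout are assumptions about what TableForViews.store() produces, not a
# documented interface; the helper name is hypothetical.
# --------------------------------------------------------------------------------------------------
import sqlite3

def _sketch_list_registered_views(profile_db_path):
    db = sqlite3.connect(profile_db_path)
    try:
        # expected to yield a row linking 'single' to 'atomic_data_splits' if the assumptions hold:
        for row in db.execute('SELECT * FROM views'):
            print(row)
    finally:
        db.close()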