def gen_view_data_tables_from_atomic_data(self):
    """Create a view table in the merged profile database for each essential data field.

    For every essential atomic data field and every target ('contigs' and
    'splits'), a data dict is assembled with one row per split (the field's
    value per input profile db, keyed by sample id, plus the '__parent__'
    auxiliary column) and handed to TablesForViews to be stored as the table
    '<field>_<target>'. If SNVs were not profiled, the variability view
    tables are blanked at the end.
    """
    essential_fields = [f for f in self.atomic_data_fields if constants.IS_ESSENTIAL_FIELD(f)]
    auxiliary_fields = [f for f in self.atomic_data_fields if constants.IS_AUXILIARY_FIELD(f)]

    # setting standard view table structure and types: one 'contig' key
    # column, one numeric column per sample, then auxiliary text column(s).
    # NOTE(review): the types list appends a single 'text' entry, so this
    # presumably assumes auxiliary_fields is exactly ['__parent__'] — confirm
    # whether more than one auxiliary field is possible.
    view_table_structure = ['contig'] + self.sample_ids_found_in_input_dbs + auxiliary_fields
    view_table_types = ['text'] + ['numeric'] * len(self.sample_ids_found_in_input_dbs) + ['text']

    # generate a dictionary for normalized coverage of each contig across samples per target,
    # keyed as [target][split_name][input_profile_db_path]
    self.normalized_coverages = {'contigs': {}, 'splits': {}}
    for target in ['contigs', 'splits']:
        for split_name in self.split_names:
            self.normalized_coverages[target][split_name] = {}
            for input_profile_db_path in self.profile_dbs_info_dict:
                self.normalized_coverages[target][split_name][input_profile_db_path] = self.get_normalized_coverage_of_split(target, input_profile_db_path, split_name)

    # generate a dictionary for max normalized ratio of each contig across samples per target
    # (computed after the loop above; presumably derived from
    # self.normalized_coverages — confirm in get_max_normalized_ratio_of_split)
    self.max_normalized_ratios = {'contigs': {}, 'splits': {}}
    for target in ['contigs', 'splits']:
        for split_name in self.split_names:
            self.max_normalized_ratios[target][split_name] = self.get_max_normalized_ratio_of_split(target, split_name)

    self.progress.new('Generating view data tables')
    for target in ['contigs', 'splits']:
        for essential_field in essential_fields:
            self.progress.update('Processing %s for %s ...' % (essential_field, target))

            data_dict = {}
            for split_name in self.split_names:
                data_dict[split_name] = {'__parent__': self.split_parents[split_name]}

                for input_profile_db_path in self.profile_dbs_info_dict:
                    sample_id = self.profile_dbs_info_dict[input_profile_db_path]['sample_id']
                    # the three derived fields come from the dicts/helpers above;
                    # everything else is read straight from the per-run atomic data
                    if essential_field == 'normalized_coverage':
                        data_dict[split_name][sample_id] = self.normalized_coverages[target][split_name][input_profile_db_path]
                    elif essential_field == 'max_normalized_ratio':
                        data_dict[split_name][sample_id] = self.max_normalized_ratios[target][split_name][input_profile_db_path]
                    elif essential_field == 'relative_abundance':
                        data_dict[split_name][sample_id] = self.get_relative_abundance_of_split(target, input_profile_db_path, split_name)
                    else:
                        data_dict[split_name][sample_id] = self.atomic_data_for_each_run[target][input_profile_db_path][split_name][essential_field]

            # time to store the data for this view in the profile database;
            # only 'splits' tables are registered under a view name
            table_name = '_'.join([essential_field, target])
            TablesForViews(self.merged_profile_db_path).create_new_view(data_dict=data_dict,
                                                                        table_name=table_name,
                                                                        table_structure=view_table_structure,
                                                                        table_types=view_table_types,
                                                                        view_name=essential_field if target == 'splits' else None)

    # if SNVs were not profiled, remove all entries from variability tables:
    if not self.SNVs_profiled:
        TablesForViews(self.merged_profile_db_path).remove(view_name='variability',
                                                           table_names_to_blank=['variability_splits', 'variability_contigs'])

    self.progress.end()
def populate_layers_additional_data_and_layer_orders(self):
    """Store layer orders and per-sample additional data in the merged profile db.

    For every essential atomic data field, samples are clustered based on the
    corresponding view data, and the resulting newick trees are stored as
    layer orders. The number of mapped reads per sample is then stored as
    layer additional data under the key 'num_mapped_reads'.
    """
    self.run.info_single("Additional data and layer orders...", nl_before=1, nl_after=1, mc="blue")

    essential_fields = [f for f in self.atomic_data_fields if constants.IS_ESSENTIAL_FIELD(f)]

    # initialize views.
    args = argparse.Namespace(profile_db=self.merged_profile_db_path)
    profile_db_super = dbops.ProfileSuperclass(args)
    profile_db_super.load_views(omit_parent_column=True)

    # figure out sample orders dictionary: one newick clustering of samples
    # per essential field. failures are collected and reported, not fatal.
    layer_orders_data_dict = {}
    failed_attempts = []
    self.progress.new('Working on layer orders')
    for essential_field in essential_fields:
        self.progress.update('recovering order for "%s"' % (essential_field))
        try:
            data_value = clustering.get_newick_tree_data_for_dict(profile_db_super.views[essential_field]['dict'],
                                                                  distance=self.distance,
                                                                  linkage=self.linkage,
                                                                  transpose=True)
            layer_orders_data_dict[essential_field] = {'data_value': data_value, 'data_type': 'newick'}
        except Exception:
            # was a bare `except:`, which would also have swallowed
            # KeyboardInterrupt/SystemExit; clustering failures for degenerate
            # view data are expected and reported below.
            failed_attempts.append(essential_field)
    self.progress.end()

    if not len(layer_orders_data_dict):
        self.run.warning("This may or may not be important: anvi'o attempted to generate orders for your "
                         "samples based on the view data, however, it failed :/")
        return

    if len(failed_attempts):
        self.run.warning("While anvi'o was trying to generate clusterings of samples based on view data "
                         "available in the merged profile, clustering of some of the essential data "
                         "failed. It is likely not a very big deal, but you shall be the judge of it. "
                         "Anvi'o now proceeds to store layers order information for those view items "
                         "the clustering in fact worked. Here is the list of stuff that failed: '%s'"
                         % (', '.join(failed_attempts)))

    self.progress.new('Working on layer additional data')
    self.progress.update('...')

    layer_additional_data_dict = {}
    for sample_name in self.sample_ids_found_in_input_dbs:
        layer_additional_data_dict[sample_name] = {}

    # figure out num reads mapped per sample:
    for sample_name in self.sample_ids_found_in_input_dbs:
        layer_additional_data_dict[sample_name]['num_mapped_reads'] = self.total_reads_mapped_per_sample[sample_name]

    self.progress.end()

    TableForLayerOrders(args).add(layer_orders_data_dict)
    TableForLayerAdditionalData(args).add(layer_additional_data_dict, ['num_mapped_reads'])
def populate_misc_data_tables(self):
    """Store layer orders and all layer additional data groups in the merged profile db.

    Samples are clustered per essential atomic data field to produce newick
    layer orders, which are added quietly. Then every data group found in
    self.layer_additional_data_dict is stored via TableForLayerAdditionalData,
    and a summary of what was added is printed.
    """
    self.run.info_single("Additional data and layer orders...", nl_before=1, nl_after=1, mc="blue")

    essential_fields = [f for f in self.atomic_data_fields if constants.IS_ESSENTIAL_FIELD(f)]

    # initialize views.
    args = argparse.Namespace(profile_db=self.merged_profile_db_path)
    profile_db_super = dbops.ProfileSuperclass(args)
    profile_db_super.load_views(omit_parent_column=True)

    # figure out layer orders dictionary: one newick clustering of samples
    # per essential field. failures are collected and reported, not fatal.
    layer_orders_data_dict = {}
    failed_attempts = []
    self.progress.new('Working on layer orders')
    for essential_field in essential_fields:
        self.progress.update('recovering order for "%s"' % (essential_field))
        try:
            data_value = clustering.get_newick_tree_data_for_dict(profile_db_super.views[essential_field]['dict'],
                                                                  distance=self.distance,
                                                                  linkage=self.linkage,
                                                                  transpose=True)
            layer_orders_data_dict[essential_field] = {'data_value': data_value, 'data_type': 'newick'}
        except Exception:
            # was a bare `except:`, which would also have swallowed
            # KeyboardInterrupt/SystemExit.
            failed_attempts.append(essential_field)
    self.progress.end()

    if not len(layer_orders_data_dict):
        self.run.warning("This may or may not be important: anvi'o attempted to generate orders for your "
                         "samples based on the view data, however, it failed :/")
        return

    if len(failed_attempts):
        self.run.warning("While anvi'o was trying to generate clusterings of samples based on view data "
                         "available in the merged profile, clustering of some of the essential data "
                         "failed. It is likely not a very big deal, but you shall be the judge of it. "
                         "Anvi'o now proceeds to store layers order information for those view items "
                         "the clustering in fact worked. Here is the list of stuff that failed: '%s'"
                         % (', '.join(failed_attempts)))

    # add the layer orders quietly
    TableForLayerOrders(args, r=terminal.Run(verbose=False)).add(layer_orders_data_dict)
    self.run.warning(None, header="Layer orders added", lc='cyan')
    for layer_order in layer_orders_data_dict:
        self.run.info_single(layer_order, mc='cyan')

    # done with layer orders. let's add our layer additional data and call it a day.
    for data_group_name in self.layer_additional_data_dict:
        args.target_data_group = data_group_name
        TableForLayerAdditionalData(args, r=terminal.Run(verbose=False)).add(self.layer_additional_data_dict[data_group_name],
                                                                             list(self.layer_additional_data_keys[data_group_name]))

    self.run.warning(None, header="Data groups added", lc='cyan')
    for data_group in self.layer_additional_data_dict:
        self.run.info_single('%s (w/%d items)' % (data_group, len(self.layer_additional_data_keys[data_group])), mc='cyan')
def gen_samples_db_for_the_merged_profile(self):
    """Generate a samples db for the merged profile.

    We use the ProfileSuperclass to load all the views we added into the
    merged profile, and generate clusterings of samples for each view to
    generate a default samples database.
    """
    self.run.info_single("SAMPLES.db stuff...", nl_before=1, nl_after=1, mc="blue")

    essential_fields = [f for f in self.atomic_data_fields if constants.IS_ESSENTIAL_FIELD(f)]

    # initialize views (argparse.Namespace, consistent with sibling methods,
    # replaces the ad-hoc `class Args: pass` holder).
    args = argparse.Namespace(profile_db=self.merged_profile_db_path)
    profile_db_super = dbops.ProfileSuperclass(args)
    profile_db_super.load_views(omit_parent_column=True)

    # figure out sample orders dictionary: one newick clustering of samples
    # per essential field. failures are collected and reported, not fatal.
    sample_orders = {}
    failed_attempts = []
    self.progress.new('Working on SAMPLES.db')
    for essential_field in essential_fields:
        self.progress.update('recovering samples order for "%s"' % (essential_field))
        try:
            sample_orders[essential_field] = \
                clustering.get_newick_tree_data_for_dict(profile_db_super.views[essential_field]['dict'],
                                                         distance=self.distance,
                                                         linkage=self.linkage,
                                                         transpose=True)
        except Exception:
            # was a bare `except:`, which would also have swallowed
            # KeyboardInterrupt/SystemExit.
            failed_attempts.append(essential_field)
    self.progress.end()

    if not len(sample_orders):
        self.run.warning("This may or may not be important: anvi'o attempted to generate a samples "
                         "database for this merged profile, however, all attempts to cluster samples "
                         "based on view data available in the merged profile failed. No samples db "
                         "for you :/")
        return

    if len(failed_attempts):
        self.run.warning("While anvi'o was trying to generate clusterings of samples based on view data "
                         "available in the merged profile, clustering of some of the essential data "
                         "failed. It is likely not a very big deal, but you shall be the judge of it. "
                         "Anvi'o now proceeds to generate a samples db with clusterings it generated "
                         "using the view data that worked. Here is the list of stuff that failed: '%s'"
                         % (', '.join(failed_attempts)))

    # generate the samples order file (context manager guarantees the file is
    # closed before the samples db reads it)
    samples_order_file_path = filesnpaths.get_temp_file_path()
    with open(samples_order_file_path, 'w') as samples_order_file:
        samples_order_file.write('attributes\tbasic\tnewick\n')
        for sample_order in sample_orders:
            samples_order_file.write('%s\t%s\t%s\n' % (sample_order, '', sample_orders[sample_order]))

    # figure out samples information stuff
    samples_information = {}
    # NOTE(review): headers stays empty here; presumably
    # store_dict_as_TAB_delimited_file derives headers from the dict when the
    # list is empty — confirm against utils.
    headers = []
    for sample_name in self.sample_ids_found_in_input_dbs:
        samples_information[sample_name] = {}

    self.progress.new('Working on SAMPLES.db')
    self.progress.update('...')

    # figure out num reads mapped per sample:
    for sample_name in self.sample_ids_found_in_input_dbs:
        samples_information[sample_name]['num_mapped_reads'] = self.total_reads_mapped_per_sample[sample_name]

    self.progress.end()

    # generate the samples information file
    samples_information_file_path = filesnpaths.get_temp_file_path()
    utils.store_dict_as_TAB_delimited_file(samples_information, samples_information_file_path, headers=headers)

    # generate the samples database; temp files are removed even if create()
    # raises, so failed runs do not leak temporary files.
    try:
        samples_db = dbops.SamplesInformationDatabase(self.samples_db_path, quiet=False)
        samples_db.create(samples_order_path=samples_order_file_path,
                          samples_information_path=samples_information_file_path)
    finally:
        os.remove(samples_order_file_path)
        os.remove(samples_information_file_path)

    self.run.info('Samples database', self.samples_db_path)
def gen_view_data_tables_from_atomic_data(self):
    """Create one view table per (essential field, target) pair in the profile db.

    For each essential atomic data field and each target ('contigs' and
    'splits'), a table named '<field>_<target>' is created and filled with one
    row per split: the split name, one value per merged sample, and the
    '__parent__' auxiliary column. Tables built for the 'splits' target are
    also registered in the views table, which is stored at the end.
    """
    essential_fields = [field for field in self.atomic_data_fields if constants.IS_ESSENTIAL_FIELD(field)]
    auxiliary_fields = [field for field in self.atomic_data_fields if constants.IS_AUXILIARY_FIELD(field)]

    views_table = dbops.TableForViews(self.profile_db_path, anvio.__profile__version__, progress=self.progress)

    # standard view table layout: key column, one numeric column per sample,
    # then the auxiliary text column(s)
    view_table_structure = ['contig'] + self.merged_sample_ids + auxiliary_fields
    view_table_types = ['text'] + ['numeric'] * len(self.merged_sample_ids) + ['text']

    # normalized coverage of every split in every merged sample, per target
    self.normalized_coverages = {'contigs': {}, 'splits': {}}
    for target in ['contigs', 'splits']:
        for split_name in self.split_names:
            per_sample = {}
            for sample_id in self.merged_sample_ids:
                per_sample[sample_id] = self.get_normalized_coverage_of_split(target, sample_id, split_name)
            self.normalized_coverages[target][split_name] = per_sample

    # max normalized ratio of every split, per target (computed in a second
    # pass, after all normalized coverages above are available)
    self.max_normalized_ratios = {'contigs': {}, 'splits': {}}
    for target in ['contigs', 'splits']:
        for split_name in self.split_names:
            self.max_normalized_ratios[target][split_name] = self.get_max_normalized_ratio_of_split(target, split_name)

    self.progress.new('Generating view data tables')
    profile_db = dbops.ProfileDatabase(self.profile_db_path, quiet=True)

    for target in ['contigs', 'splits']:
        for essential_field in essential_fields:
            self.progress.update('Processing %s for %s ...' % (essential_field, target))

            target_table = '_'.join([essential_field, target])

            # assemble the per-split data for this field: the three derived
            # fields come from the dicts/helpers above, everything else is
            # read straight from the per-run atomic data
            view_data = {}
            for split_name in self.split_names:
                entry = {'__parent__': self.split_parents[split_name]}
                for sample_id in self.merged_sample_ids:
                    if essential_field == 'normalized_coverage':
                        value = self.normalized_coverages[target][split_name][sample_id]
                    elif essential_field == 'max_normalized_ratio':
                        value = self.max_normalized_ratios[target][split_name][sample_id]
                    elif essential_field == 'relative_abundance':
                        value = self.get_relative_abundance_of_split(target, sample_id, split_name)
                    else:
                        value = self.atomic_data_for_each_run[target][sample_id][split_name][essential_field]
                    entry[sample_id] = value
                view_data[split_name] = entry

            # 'view_data' for the essential field is now ready to be its own table:
            profile_db.db.create_table(target_table, view_table_structure, view_table_types)
            db_entries = []
            for split_name in self.split_names:
                row = [split_name]
                for column in view_table_structure[1:]:
                    row.append(view_data[split_name][column])
                db_entries.append(tuple(row))
            profile_db.db._exec_many('''INSERT INTO %s VALUES (%s)''' % (target_table, ','.join(['?'] * len(view_table_structure))), db_entries)

            # only 'splits' tables are registered as views
            if target == 'splits':
                views_table.append(essential_field, target_table)

    profile_db.disconnect()
    self.progress.end()

    # store views in the database
    views_table.store()