def store_gene_clusters(self, gene_clusters_dict):
    """Add every gene entry of every gene cluster to the gene clusters table.

    After the table is stored, the number of gene clusters and the total
    number of gene entries are recorded as meta values in the pan database.

    `gene_clusters_dict` maps each gene cluster name to an iterable of gene
    entries; each entry is passed unchanged to the table's `add` method.
    """
    self.progress.new('Storing gene clusters in the database')
    self.progress.update('...')

    gene_clusters_table = TableForGeneClusters(self.pan_db_path,
                                               run=self.run,
                                               progress=self.progress)

    # Count gene entries while they are queued for storage.
    gene_counter = 0
    for entries in gene_clusters_dict.values():
        for entry in entries:
            gene_clusters_table.add(entry)
            gene_counter += 1

    self.progress.end()

    gene_clusters_table.store()

    cluster_count = len(gene_clusters_dict)

    database = dbops.PanDatabase(self.pan_db_path, quiet=True)
    database.db.set_meta_value('num_gene_clusters', cluster_count)
    database.db.set_meta_value('num_genes_in_gene_clusters', gene_counter)
    database.disconnect()

    self.run.info('gene clusters info',
                  '%d gene_clusters stored in the database' % cluster_count)
def store_PCs(self, PCs_dict):
    """Add every gene entry of every protein cluster to the protein clusters table.

    After the table is stored, the number of PCs and the total number of
    gene entries are recorded as meta values in the pan database.

    `PCs_dict` maps each protein cluster name to an iterable of gene
    entries; each entry is passed unchanged to the table's `add` method.
    """
    self.progress.new('Storing protein clusters in the database')
    self.progress.update('...')

    pcs_table = dbops.TableForProteinClusters(self.pan_db_path,
                                              run=self.run,
                                              progress=self.progress)

    # Count gene entries while they are queued for storage.
    gene_counter = 0
    for entries in PCs_dict.values():
        for entry in entries:
            pcs_table.add(entry)
            gene_counter += 1

    self.progress.end()

    pcs_table.store()

    pc_count = len(PCs_dict)

    database = dbops.PanDatabase(self.pan_db_path, quiet=True)
    database.db.set_meta_value('num_PCs', pc_count)
    database.db.set_meta_value('num_genes_in_PCs', gene_counter)
    database.disconnect()

    self.run.info('protein clusters info',
                  '%d PCs stored in the database' % pc_count)
def generate_pan_db(self):
    """Collect the run's parameters and pass them to `PanDatabase.create`.

    The meta values are read from attributes of this object. A placeholder
    text is used when no description is set, and `PCs_clustered` always
    starts as False.
    """
    # Alignments count as computed unless the user asked to skip them.
    alignments_computed = not self.skip_alignments
    description_text = self.description or '_No description is provided_'

    meta_values = {
        'internal_genome_names': ','.join(self.internal_genome_names),
        'external_genome_names': ','.join(self.external_genome_names),
        'num_genomes': len(self.genomes),
        'min_percent_identity': self.min_percent_identity,
        'pc_min_occurrence': self.PC_min_occurrence,
        'mcl_inflation': self.mcl_inflation,
        'default_view': 'PC_presence_absence',
        'use_ncbi_blast': self.use_ncbi_blast,
        'diamond_sensitive': self.sensitive,
        'maxbit': self.maxbit,
        'exclude_partial_gene_calls': self.exclude_partial_gene_calls,
        'gene_alignments_computed': alignments_computed,
        'genomes_storage_hash': self.genomes_storage_hash,
        'project_name': self.project_name,
        'PCs_clustered': False,
        'description': description_text,
    }

    new_pan_db = dbops.PanDatabase(self.pan_db_path, quiet=False)
    new_pan_db.create(meta_values)
def process_PCs(self, PCs_dict):
    """Build per-PC view data, filter PCs by occurrence, and store the views.

    For every protein cluster (PC) this fills `self.view_data` (gene counts
    per genome), `self.view_data_presence_absence` (0/1 per genome), and
    `self.additional_view_data` (summary numbers). PCs found in fewer
    genomes than `self.PC_min_occurrence` are then dropped, the decision on
    hierarchical clustering is made, and the remaining data is written to
    the pan database as views.

    Parameters
    ----------
    PCs_dict : dict
        Maps each PC name to an iterable of gene entries. Every gene entry
        is indexed with 'genome_name', and that value must be one of the
        names in `self.genomes`. This dictionary is modified in place: PCs
        removed by the occurrence filter are popped from it.

    Returns
    -------
    dict
        The same `PCs_dict` object, without the PCs the filter removed.
    """
    self.progress.new('Generating view data')
    self.progress.update('...')

    # Snapshot the names: PCs_dict is mutated by the filter further below.
    PCs = list(PCs_dict.keys())

    for PC in PCs:
        frequencies = {genome_name: 0 for genome_name in self.genomes}
        presence_absence = {genome_name: 0 for genome_name in self.genomes}
        additional = {'num_genes_in_pc': 0, 'num_genomes_pc_has_hits': 0, 'SCG': 0}

        self.view_data[PC] = frequencies
        self.view_data_presence_absence[PC] = presence_absence
        self.additional_view_data[PC] = additional

        for gene_entry in PCs_dict[PC]:
            genome_name = gene_entry['genome_name']

            frequencies[genome_name] += 1
            presence_absence[genome_name] = 1
            additional['num_genes_in_pc'] += 1

        # The summaries depend only on the final counts, so they are
        # computed once per PC after all of its gene entries are tallied.
        # SCG is 1 only when every genome has exactly one gene in the PC.
        additional['SCG'] = 1 if set(frequencies.values()) == {1} else 0
        additional['num_genomes_pc_has_hits'] = sum(
            1 for count in frequencies.values() if count > 0)

    self.progress.end()

    ########################################################################################
    #                           FILTERING BASED ON OCCURRENCE
    ########################################################################################
    removed_PCs = 0
    for PC in PCs:
        if self.additional_view_data[PC]['num_genomes_pc_has_hits'] >= self.PC_min_occurrence:
            continue

        self.view_data.pop(PC)
        self.view_data_presence_absence.pop(PC)
        self.additional_view_data.pop(PC)
        PCs_dict.pop(PC)
        removed_PCs += 1

    if self.PC_min_occurrence > 1:
        self.run.info('PCs min occurrence',
                      '%d (the filter removed %d PCs)' % (self.PC_min_occurrence, removed_PCs))

    ########################################################################################
    #            CAN WE CLUSTER THIS STUFF? DOES THE USER WANT US TO TRY REGARDLESS?
    ########################################################################################
    # The messages are assembled from adjacent string literals, so their text
    # does not depend on how the source code is indented or wrapped.
    if len(PCs_dict) > self.max_num_PCs_for_hierarchical_clustering:
        if self.enforce_hierarchical_clustering:
            self.run.warning(
                "You have %s PCs, which exceeds the number of PCs anvi'o is comfortable to cluster. But "
                "since you have used the flag `--enforce-hierarchical-clustering`, anvi'o will attempt "
                "to create a hierarchical clustering of your PCs anyway. It may take a bit of "
                "time. Pour yourself a coffee. Or go to a nice vacation. See you in 10 mins, or next year "
                "or never." % pp(len(PCs_dict)))
        else:
            self.run.warning(
                "It seems you have %s protein clusters in your pangenome. This exceeds the soft limit "
                "of %s for anvi'o to attempt to create a hierarchical clustering of your protein clusters "
                "(which becomes the center tree in all anvi'o displays). If you want a hierarchical "
                "clustering to be done anyway, please see the flag `--enforce-hierarchical-clustering`."
                % (pp(len(PCs_dict)), pp(self.max_num_PCs_for_hierarchical_clustering)))
            self.skip_hierarchical_clustering = True

    ########################################################################################
    #                           STORING FILTERED DATA IN THE DB
    ########################################################################################
    table_structure = ['PC'] + sorted(self.genomes.keys())
    table_types = ['text'] + ['numeric'] * len(self.genomes)

    dbops.TablesForViews(self.pan_db_path).create_new_view(
        data_dict=self.view_data,
        table_name='PC_frequencies',
        table_structure=table_structure,
        table_types=table_types,
        view_name='PC_frequencies')

    dbops.TablesForViews(self.pan_db_path).create_new_view(
        data_dict=self.view_data_presence_absence,
        table_name='PC_presence_absence',
        table_structure=table_structure,
        table_types=table_types,
        view_name='PC_presence_absence')

    additional_data_structure = ['PC', 'num_genomes_pc_has_hits', 'num_genes_in_pc', 'SCG']
    dbops.TablesForViews(self.pan_db_path).create_new_view(
        data_dict=self.additional_view_data,
        table_name='additional_data',
        table_structure=additional_data_structure,
        table_types=['text', 'numeric', 'numeric', 'numeric'],
        view_name=None)

    # add additional data structure to the self table, so we can have them initially ordered
    # in the interface the way additional_data_structure suggests:
    pan_db = dbops.PanDatabase(self.pan_db_path, quiet=True)
    pan_db.db.set_meta_value('additional_data_headers', ','.join(additional_data_structure[1:]))
    pan_db.disconnect()

    ########################################################################################
    #                   RETURN THE -LIKELY- UPDATED PROTEIN CLUSTERS DICT
    ########################################################################################
    return PCs_dict