def process(self):
    """Run every step of the pan db workflow, in order, then quit the run.

    The steps are: load genomes and sanity-check them, create the pan db,
    write and unique the combined amino acid FASTA, run the sequence search,
    cluster the filtered hits with MCL, post-process and store the gene
    clusters, and finally compute the optional clusterings, orderings,
    layer data and homogeneity indices. Nothing is returned; the method
    finishes by calling ``self.run.quit()``.
    """
    # --- setup -----------------------------------------------------------
    self.load_genomes()
    self.sanity_check()
    self.generate_pan_db()

    # --- amino acid sequences: combined, then uniqued ---------------------
    combined_fasta = self.get_output_file_path('combined-aas.fa')
    self.genomes_storage.gen_combined_aa_sequences_FASTA(
        combined_fasta,
        exclude_partial_gene_calls=self.exclude_partial_gene_calls,
    )

    self.progress.new('Uniquing the output FASTA file')
    self.progress.update('...')
    # The middle element (path of the names file) is not needed here.
    unique_fasta, _, unique_names = utils.unique_FASTA_file(
        combined_fasta, store_frequencies_in_deflines=False
    )
    self.progress.end()
    self.run.info('Unique AA sequences FASTA', unique_fasta)

    # --- search, then MCL clustering on the filtered search results -------
    search_results = self.run_search(unique_fasta, unique_names)
    mcl_input = self.gen_mcl_input(search_results)
    raw_clusters = self.run_mcl(mcl_input)

    # Re-shape the raw MCL output into the dict the later steps expect; the
    # raw clusters are dropped right away since nothing else reads them.
    gene_clusters_dict = self.gen_gene_clusters_dict_from_mcl_clusters(raw_clusters)
    del raw_clusters

    # --- gene cluster post-processing -------------------------------------
    # Each step hands back the dict that the next step consumes.
    gene_clusters_dict = self.compute_alignments_for_gene_clusters(gene_clusters_dict)
    gene_clusters_dict = self.process_gene_clusters(gene_clusters_dict)
    self.store_gene_clusters(gene_clusters_dict)

    # --- clusterings, orderings and additional data -----------------------
    self.gen_hierarchical_clustering_of_gene_clusters()
    self.gen_synteny_based_ordering_of_gene_clusters(gene_clusters_dict)
    self.populate_layers_additional_data_and_orders()
    self.populate_gene_cluster_homogeneity_index(gene_clusters_dict)

    # --- wrap up ----------------------------------------------------------
    self.run.info('log file', self.run.log_file_path)
    self.run.quit()
def gen_combined_aa_sequences_FASTA(self, output_file_path, exclude_partial_gene_calls=False):
    """Write the amino acid sequences of all genomes to one FASTA, then unique it.

    For every genome returned by ``self.get_genomes_dict()``, gene calls are
    visited in ascending gene caller id order and written to
    ``output_file_path`` with the defline ``>{genome_hash}_{gene_caller_id}``.
    The combined file is then passed to ``utils.unique_FASTA_file``.

    Args:
        output_file_path: Path of the combined FASTA file to (over)write.
        exclude_partial_gene_calls: When true, gene calls whose ``partial``
            value is truthy are skipped and counted as excluded.

    Returns:
        A tuple ``(unique_aas_FASTA_path, unique_aas_names_dict)`` as produced
        by ``utils.unique_FASTA_file`` for the combined file.
    """
    self.run.info('Exclude partial gene calls', exclude_partial_gene_calls, nl_after=1)

    genomes = self.get_genomes_dict()

    total_num_aa_sequences = 0
    total_num_excluded_aa_sequences = 0

    # The context manager guarantees the handle is closed (and buffered data
    # flushed) even if a lookup or write fails part-way through.
    with open(output_file_path, 'w') as output_file:
        for genome_name in genomes:
            self.progress.new('Storing aa sequences')
            self.progress.update('%s ...' % genome_name)

            genome_data = self.D(genome_name)
            gene_caller_ids = sorted(int(key) for key, _ in genome_data.items())

            for gene_caller_id in gene_caller_ids:
                # Look the gene call up once and read both fields from it.
                gene_call = self.G(gene_caller_id, genome_data)
                partial = gene_call['partial'].value

                if exclude_partial_gene_calls and partial:
                    total_num_excluded_aa_sequences += 1
                    continue

                aa_sequence = gene_call['aa_sequence'].value

                output_file.write('>%s_%d\n' % (genomes[genome_name]['genome_hash'], gene_caller_id))
                output_file.write('%s\n' % aa_sequence)

                total_num_aa_sequences += 1

            self.progress.end()

    self.progress.new('Uniquing the output FASTA file')
    self.progress.update('...')
    # The second element (path of the names file) is not used here.
    unique_aas_FASTA_path, _, unique_aas_names_dict = utils.unique_FASTA_file(
        output_file_path, store_frequencies_in_deflines=False
    )
    self.progress.end()

    # Report the uniqued file under this label, not the combined input file.
    self.run.info('Unique AA sequences FASTA', unique_aas_FASTA_path)
    self.run.info('Num AA sequences reported', '%s' % pp(total_num_aa_sequences), nl_before=1)
    self.run.info('Num excluded gene calls', '%s' % pp(total_num_excluded_aa_sequences))

    return unique_aas_FASTA_path, unique_aas_names_dict
def gen_combined_proteins_unique_FASTA(self):
    """Write all protein sequences to one FASTA file and unique it.

    Every gene caller id listed under ``self.genomes[name]['gene_caller_ids']``
    is written to ``combined-proteins.fa`` (path resolved through
    ``self.get_output_file_path``) with the defline
    ``>{genome_entry_hash}_{gene_caller_id}``; sequences come from
    ``self.protein_sequences_dict``. The combined file is then passed to
    ``utils.unique_FASTA_file``.

    NOTE(review): a second definition with this exact name appears later in
    this file; if both live in the same class the later one shadows this one.
    Confirm which is meant to survive.

    Returns:
        A tuple ``(unique_proteins_FASTA_path, unique_proteins_names_dict)``
        as produced by ``utils.unique_FASTA_file``.
    """
    self.progress.new('Storing combined protein sequences')

    combined_proteins_FASTA_path = self.get_output_file_path('combined-proteins.fa')

    # Use a context manager so the handle is closed even when a lookup or a
    # write raises in the middle of the loop.
    with open(combined_proteins_FASTA_path, 'w') as output_file:
        for genome_name in self.genomes:
            g = self.genomes[genome_name]
            self.progress.update('Working on %s ...' % genome_name)

            for gene_caller_id in g['gene_caller_ids']:
                output_file.write('>%s_%d\n' % (g['genome_entry_hash'], gene_caller_id))
                output_file.write('%s\n' % self.protein_sequences_dict[genome_name][gene_caller_id])

    self.progress.end()

    # unique the FASTA file (the path of the names file is not needed here)
    unique_proteins_FASTA_path, _, unique_proteins_names_dict = utils.unique_FASTA_file(
        combined_proteins_FASTA_path, store_frequencies_in_deflines=False
    )

    self.run.info('Num unique protein sequences', '%s' % pp(len(unique_proteins_names_dict)))
    self.run.info('Combined protein sequences FASTA', combined_proteins_FASTA_path)
    self.run.info('Unique protein sequences FASTA', unique_proteins_FASTA_path)

    return unique_proteins_FASTA_path, unique_proteins_names_dict
def gen_combined_proteins_unique_FASTA(self):
    """Write all protein sequences to one FASTA file and unique it.

    Every gene caller id listed under ``self.genomes[name]['gene_caller_ids']``
    is written to ``combined-proteins.fa`` (path resolved through
    ``self.get_output_file_path``) with the defline
    ``>{genome_entry_hash}_{gene_caller_id}``; sequences come from
    ``self.protein_sequences_dict``. The combined file is then passed to
    ``utils.unique_FASTA_file``.

    NOTE(review): an earlier definition with this exact name appears in this
    file; if both live in the same class this later one is the one that takes
    effect. Confirm whether the earlier copy can be removed.

    Returns:
        A tuple ``(unique_proteins_FASTA_path, unique_proteins_names_dict)``
        as produced by ``utils.unique_FASTA_file``.
    """
    self.progress.new('Storing combined protein sequences')

    combined_proteins_FASTA_path = self.get_output_file_path('combined-proteins.fa')

    # Use a context manager so the handle is closed even when a lookup or a
    # write raises in the middle of the loop.
    with open(combined_proteins_FASTA_path, 'w') as output_file:
        for genome_name in self.genomes:
            g = self.genomes[genome_name]
            self.progress.update('Working on %s ...' % genome_name)

            for gene_caller_id in g['gene_caller_ids']:
                output_file.write('>%s_%d\n' % (g['genome_entry_hash'], gene_caller_id))
                output_file.write('%s\n' % self.protein_sequences_dict[genome_name][gene_caller_id])

    self.progress.end()

    # unique the FASTA file (the path of the names file is not needed here)
    unique_proteins_FASTA_path, _, unique_proteins_names_dict = utils.unique_FASTA_file(
        combined_proteins_FASTA_path, store_frequencies_in_deflines=False
    )

    self.run.info('Num unique protein sequences', '%s' % pp(len(unique_proteins_names_dict)))
    self.run.info('Combined protein sequences FASTA', combined_proteins_FASTA_path)
    self.run.info('Unique protein sequences FASTA', unique_proteins_FASTA_path)

    return unique_proteins_FASTA_path, unique_proteins_names_dict