Exemple #1
0
    def process(self):
        """Drive the full workflow, from loading genomes to storing gene clusters.

        The steps run strictly in sequence, each one feeding the next: genomes
        are loaded and sanity-checked, the pan db is generated, all amino acid
        sequences are written to one FASTA file and uniqued, the unique
        sequences are searched, the search results are clustered with MCL, and
        the resulting gene clusters are aligned (optionally), processed,
        stored, ordered, and annotated. The run is closed at the very end.
        """
        # preparation: genomes first, then the checks, then the pan db
        self.load_genomes()
        self.sanity_check()
        self.generate_pan_db()

        # every amino acid sequence goes into a single combined FASTA file
        all_aas_fasta = self.get_output_file_path('combined-aas.fa')
        self.genomes_storage.gen_combined_aa_sequences_FASTA(
            all_aas_fasta,
            exclude_partial_gene_calls=self.exclude_partial_gene_calls)

        # reduce the combined file to its unique sequences
        self.progress.new('Uniquing the output FASTA file')
        self.progress.update('...')
        unique_fasta, _names_file, unique_names = utils.unique_FASTA_file(
            all_aas_fasta, store_frequencies_in_deflines=False)
        self.progress.end()
        self.run.info('Unique AA sequences FASTA', unique_fasta)

        # search with the unique sequences
        search_hits = self.run_search(unique_fasta, unique_names)

        # search results -> MCL input -> MCL clusters -> gene clusters dict.
        # the raw MCL clusters are handed straight to the re-formatting step,
        # so no reference to them is kept around afterwards.
        mcl_input = self.gen_mcl_input(search_hits)
        gene_clusters = self.gen_gene_clusters_dict_from_mcl_clusters(
            self.run_mcl(mcl_input))

        # within-cluster alignments (or not), then the processing step that
        # populates the pan db with results
        gene_clusters = self.compute_alignments_for_gene_clusters(gene_clusters)
        gene_clusters = self.process_gene_clusters(gene_clusters)

        # persist the gene clusters dict in the db
        self.store_gene_clusters(gene_clusters)

        # hierarchical clustering of gene clusters (or not), followed by the
        # synteny-based orderings
        self.gen_hierarchical_clustering_of_gene_clusters()
        self.gen_synteny_based_ordering_of_gene_clusters(gene_clusters)

        # layers additional data and orders
        self.populate_layers_additional_data_and_orders()

        # gene cluster homogeneity index
        self.populate_gene_cluster_homogeneity_index(gene_clusters)

        # wrap up
        self.run.info('log file', self.run.log_file_path)
        self.run.quit()
Exemple #2
0
    def gen_combined_aa_sequences_FASTA(self,
                                        output_file_path,
                                        exclude_partial_gene_calls=False):
        """Write the amino acid sequences of every genome to one FASTA file and unique it.

        Each reported gene call gets a defline of the form
        `<genome_hash>_<gene_caller_id>`, followed by its amino acid sequence
        on the next line. Genomes are visited in the iteration order of the
        genomes dict; gene calls within a genome in ascending numeric order.

        Parameters
        ==========
        output_file_path : str
            Where the combined (non-unique) FASTA file is written. An existing
            file at this path is overwritten.
        exclude_partial_gene_calls : bool
            When True, gene calls flagged as partial are left out of the
            output and only counted as excluded.

        Returns
        =======
        (unique_aas_FASTA_path, unique_aas_names_dict) : tuple
            The first and third items returned by `utils.unique_FASTA_file`
            for the combined file.
        """
        self.run.info('Exclude partial gene calls',
                      exclude_partial_gene_calls,
                      nl_after=1)

        genomes = self.get_genomes_dict()

        total_num_aa_sequences = 0
        total_num_excluded_aa_sequences = 0

        # the context manager guarantees the file is closed even when a lookup
        # or a write fails part-way through
        with open(output_file_path, 'w') as output_file:
            for genome_name in genomes:
                self.progress.new('Storing aa sequences')
                self.progress.update('%s ...' % genome_name)

                genome_data = self.D(genome_name)
                gene_caller_ids = sorted(
                    int(key) for key, _ in genome_data.items())

                for gene_caller_id in gene_caller_ids:
                    # one lookup per gene serves both the `partial` flag and
                    # the sequence (assumes self.G is a plain lookup into
                    # genome_data -- confirm against its definition)
                    gene_call = self.G(gene_caller_id, genome_data)

                    if exclude_partial_gene_calls and gene_call['partial'].value:
                        total_num_excluded_aa_sequences += 1
                        continue

                    aa_sequence = gene_call['aa_sequence'].value

                    output_file.write(
                        '>%s_%d\n' %
                        (genomes[genome_name]['genome_hash'], gene_caller_id))
                    output_file.write('%s\n' % aa_sequence)

                    total_num_aa_sequences += 1

                self.progress.end()

        self.progress.new('Uniquing the output FASTA file')
        self.progress.update('...')
        unique_aas_FASTA_path, _names_file_path, unique_aas_names_dict = utils.unique_FASTA_file(
            output_file_path, store_frequencies_in_deflines=False)
        self.progress.end()

        # report the uniqued file under this label, not the combined one
        self.run.info('Unique AA sequences FASTA', unique_aas_FASTA_path)
        self.run.info('Num AA sequences reported',
                      '%s' % pp(total_num_aa_sequences),
                      nl_before=1)
        self.run.info('Num excluded gene calls',
                      '%s' % pp(total_num_excluded_aa_sequences))

        return unique_aas_FASTA_path, unique_aas_names_dict
Exemple #3
0
    def gen_combined_proteins_unique_FASTA(self):
        """Write all protein sequences to one FASTA file and unique it.

        For every genome in `self.genomes`, each of its `gene_caller_ids` is
        written with the defline `<genome_entry_hash>_<gene_caller_id>` and
        the sequence found in `self.protein_sequences_dict`. The combined
        file ('combined-proteins.fa' in the output location) is then passed
        to `utils.unique_FASTA_file`.

        Returns
        =======
        (unique_proteins_FASTA_path, unique_proteins_names_dict) : tuple
            The first and third items returned by `utils.unique_FASTA_file`.
        """
        self.progress.new('Storing combined protein sequences')
        combined_proteins_FASTA_path = self.get_output_file_path(
            'combined-proteins.fa')

        # `with` closes the file even if a sequence lookup raises mid-way
        with open(combined_proteins_FASTA_path, 'w') as output_file:
            for genome_name in self.genomes:
                g = self.genomes[genome_name]
                self.progress.update('Working on %s ...' % genome_name)

                for gene_caller_id in g['gene_caller_ids']:
                    output_file.write('>%s_%d\n' %
                                      (g['genome_entry_hash'], gene_caller_id))
                    output_file.write(
                        '%s\n' %
                        self.protein_sequences_dict[genome_name][gene_caller_id])

        self.progress.end()

        # unique the FASTA file; the names file path is not needed here
        unique_proteins_FASTA_path, _names_file_path, unique_proteins_names_dict = utils.unique_FASTA_file(
            combined_proteins_FASTA_path, store_frequencies_in_deflines=False)

        self.run.info('Num unique protein sequences',
                      '%s' % pp(len(unique_proteins_names_dict)))
        self.run.info('Combined protein sequences FASTA',
                      combined_proteins_FASTA_path)
        self.run.info('Unique protein sequences FASTA',
                      unique_proteins_FASTA_path)

        return unique_proteins_FASTA_path, unique_proteins_names_dict
Exemple #4
0
    def gen_combined_proteins_unique_FASTA(self):
        """Store every protein sequence in a combined FASTA file, then unique that file.

        The combined file is 'combined-proteins.fa' at the path given by
        `self.get_output_file_path`. Entries are written genome by genome,
        one per id in the genome's `gene_caller_ids`, with deflines of the
        form `<genome_entry_hash>_<gene_caller_id>` and sequences taken from
        `self.protein_sequences_dict`.

        Returns
        =======
        (unique_proteins_FASTA_path, unique_proteins_names_dict) : tuple
            Path of the uniqued FASTA file and the names dict, both as
            returned by `utils.unique_FASTA_file`.
        """
        self.progress.new('Storing combined protein sequences')
        combined_proteins_FASTA_path = self.get_output_file_path('combined-proteins.fa')

        # the file handle is released on any exit from this block, including
        # a failed lookup in one of the dicts below
        with open(combined_proteins_FASTA_path, 'w') as output_file:
            for genome_name in self.genomes:
                g = self.genomes[genome_name]
                self.progress.update('Working on %s ...' % genome_name)

                for gene_caller_id in g['gene_caller_ids']:
                    output_file.write('>%s_%d\n' % (g['genome_entry_hash'], gene_caller_id))
                    output_file.write('%s\n' % self.protein_sequences_dict[genome_name][gene_caller_id])

        self.progress.end()

        # unique the FASTA file (the names file path it returns is unused here)
        unique_proteins_FASTA_path, _names_file_path, unique_proteins_names_dict = utils.unique_FASTA_file(combined_proteins_FASTA_path, store_frequencies_in_deflines=False)

        self.run.info('Num unique protein sequences', '%s' % pp(len(unique_proteins_names_dict)))
        self.run.info('Combined protein sequences FASTA', combined_proteins_FASTA_path)
        self.run.info('Unique protein sequences FASTA', unique_proteins_FASTA_path)

        return unique_proteins_FASTA_path, unique_proteins_names_dict