Python Diamond Examples

Programming Language: Python

Namespace/Package Name: anvio.drivers.diamond

Method/Function: Diamond

Examples at hotexamples.com: 3

Python Diamond - 3 examples found. These are the top-rated real-world Python examples of anvio.drivers.diamond.Diamond, extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: MODELLER.py Project: shiyi-pan/anvio

    def run_search_and_parse_results(self):
        """Search the target sequence against the DIAMOND database and select templates.

        Runs a DIAMOND blastp of `self.target_fasta_path` against
        `<self.database_dir>/<self.modeller_database>.dmnd`, then ranks the hits
        by "proper percent identity" (`pident * length / gene_length`). Hits
        below `self.percent_identical_cutoff` are dropped, only the best hit
        per PDB code is kept, and at most `self.max_number_templates` remain.

        Side effects:
            - sets `self.list_of_template_code_and_chain_ids` to a list of
              (code, chain) tuples
            - appends to `self.out["templates"]["pdb_id" | "chain_id" | "ppi"]`
            - temporarily changes the working directory to `self.directory`

        Raises:
            ConfigError: if `percent_identical_cutoff` or `max_number_templates`
                is unset (falsy), or if no sequence could be read from the
                target FASTA.
            self.EndModeller: if the search returns no hits, or no hit passes
                the percent identity cutoff.
        """

        if not self.percent_identical_cutoff or not self.max_number_templates:
            raise ConfigError(
                "run_search_and_parse_results :: You initiated this class without providing values for percent_identical_cutoff "
                "and max_number_templates, which is required for this function."
            )

        # Change to MODELLER working directory. The try/finally guarantees we
        # change back to the user directory even if the search fails.
        os.chdir(self.directory)
        try:
            driver = diamond.Diamond(
                query_fasta=self.target_fasta_path,
                target_fasta=J(self.database_dir,
                               self.modeller_database + '.dmnd'),
                run=terminal.Run(verbose=False),
                progress=terminal.Progress(verbose=False),
            )
            driver.blastp()
        finally:
            os.chdir(self.start_dir)

        search_df = driver.view_as_dataframe(
            J(self.directory, driver.tabular_output_path))

        matches_found = search_df.shape[0]

        if not matches_found:
            self.run.warning(
                "No proteins with homologous sequence were found for {}. No structure will be modelled"
                .format(self.corresponding_gene_call))
            raise self.EndModeller

        # We need the gene length for proper_pident. The loop walks the whole
        # file, so the length of the last sequence read is the one used.
        gene_length = None
        target_fasta = u.SequenceSource(self.target_fasta_path,
                                        lazy_init=False)
        try:
            while next(target_fasta):
                gene_length = len(target_fasta.seq)
        finally:
            target_fasta.close()

        if gene_length is None:
            # Previously this surfaced as an unbound local variable further down.
            raise ConfigError(
                "run_search_and_parse_results :: No sequence could be read from the target FASTA file at '{}'."
                .format(self.target_fasta_path))

        # add some useful columns
        search_df["proper_pident"] = search_df["pident"] * search_df[
            "length"] / gene_length
        # sseqid is split into everything but its last character (code) and
        # its last character (chain)
        search_df["code"] = search_df["sseqid"].str[:-1]
        search_df["chain"] = search_df["sseqid"].str[-1]

        # Remember the best hit *before* filtering so it can be reported if
        # nothing survives the cutoff.
        max_pident_found = search_df["proper_pident"].max()
        id_of_max_pident = tuple(
            search_df.loc[search_df["proper_pident"].idxmax(),
                          ["code", "chain"]].values)

        # filter results by self.percent_identical_cutoff.
        search_df = search_df[
            search_df["proper_pident"] >= self.percent_identical_cutoff]

        search_df = search_df.sort_values("proper_pident", ascending=False)

        # If more than 1 template in 1 PDB id, just choose 1 (the best, since
        # the frame is sorted by proper_pident in descending order)
        search_df = search_df.drop_duplicates('code', keep='first')

        matches_after_filter = len(search_df)
        if not matches_after_filter:
            self.run.warning(
                "Gene {} did not have a search result with proper percent identicalness above or equal "
                "to {}%. The best match was chain {} of https://www.rcsb.org/structure/{}, which had a "
                "proper percent identicalness of {:.2f}%. No structure will be modelled."
                .format(self.corresponding_gene_call,
                        self.percent_identical_cutoff,
                        id_of_max_pident[1],
                        id_of_max_pident[0],
                        max_pident_found))
            raise self.EndModeller

        # get up to self.max_number_templates of those with the highest proper_pident scores.
        search_df = search_df.iloc[:min(len(search_df),
                                        self.max_number_templates)]

        # Get their chain and 4-letter ids
        self.list_of_template_code_and_chain_ids = list(
            zip(search_df["code"], search_df["chain"]))

        self.run.info("Max number of templates allowed",
                      self.max_number_templates)
        self.run.info("Number of candidate templates", matches_found)
        self.run.info(
            "After >{}% identical filter".format(
                self.percent_identical_cutoff), matches_after_filter)
        self.run.info("Number accepted as templates",
                      len(self.list_of_template_code_and_chain_ids))

        # update user on which templates are used, and write the templates to self.out
        templates_out = self.out["templates"]
        for i, (pdb_id, chain_id) in enumerate(
                self.list_of_template_code_and_chain_ids):
            ppi = search_df["proper_pident"].iloc[i]

            templates_out["pdb_id"].append(pdb_id)
            templates_out["chain_id"].append(chain_id)
            templates_out["ppi"].append(ppi)

            self.run.info(
                "Template {}".format(i + 1),
                "Protein ID: {}, Chain {} ({:.1f}% identical)".format(
                    pdb_id, chain_id, ppi))

Example #2

Show file

File: MODELLER.py Project: shiyi-pan/anvio

    def check_database(self):
        """Make sure the database files needed for the search exist.

        Steps, each performed only when the corresponding file is missing:

        - downloads and decompresses the .pir file if it is missing
        - binarizes the .pir file if either the .bin or the .pir was missing
        - creates the .dmnd (DIAMOND) database if it is missing, by converting
          the .pir to FASTA with the `pir_to_fasta.py` script, cleaning the
          sequences, and running `makedb`

        All paths are built from `self.database_dir` and
        `self.modeller_database`. Returns None.
        """

        bin_db_path = J(self.database_dir, self.modeller_database + ".bin")
        pir_db_path = J(self.database_dir, self.modeller_database + ".pir")
        bin_exists = utils.filesnpaths.is_file_exists(bin_db_path,
                                                      dont_raise=True)
        pir_exists = utils.filesnpaths.is_file_exists(pir_db_path,
                                                      dont_raise=True)

        if not (bin_exists and pir_exists):
            if not pir_exists:
                # Download .pir
                self.run.warning(
                    "Anvi'o looked in {} for a database with the name {} and with an extension "
                    "of either .bin or .pir, but didn't find anything matching that "
                    "criteria. Anvi'o will try and download the best database it knows of from "
                    "https://salilab.org/modeller/downloads/pdb_95.pir.gz and use that. "
                    "You can checkout https://salilab.org/modeller/ for more info about the pdb_95 "
                    "database".format(self.database_dir,
                                      self.modeller_database))

                # NOTE(review): the download is always saved under the name
                # pdb_95, while the binarize step below reads pir_db_path
                # (built from self.modeller_database). Presumably
                # modeller_database is "pdb_95" whenever this branch runs --
                # confirm against the caller.
                db_download_path = os.path.join(self.database_dir,
                                                "pdb_95.pir.gz")
                utils.download_file(
                    "https://salilab.org/modeller/downloads/pdb_95.pir.gz",
                    db_download_path)
                utils.run_command(
                    ['gzip', '-d', db_download_path],
                    log_file_path=filesnpaths.get_temp_file_path())

            # Binarize .pir (make .bin)
            self.run.warning(
                "Your database is not in binary format. That means accessing its contents is slower "
                "than it could be. Anvi'o is going to make a binary format. Just FYI"
            )
            self.run_binarize_database(pir_db_path, bin_db_path)

        dmnd_db_path = J(self.database_dir, self.modeller_database + '.dmnd')

        if os.path.exists(dmnd_db_path):
            return

        self.run.warning(
            "Your diamond database does not exist. It will be created.")

        script_name = "pir_to_fasta.py"

        self.copy_script_to_directory(script_name)

        fasta_path = J(self.database_dir, self.modeller_database + '.fa')
        # makedb is given the path without an extension; the existence check
        # above looks for the same path with '.dmnd' appended.
        dmnd_path = J(self.database_dir, self.modeller_database)

        command = [self.executable, script_name, pir_db_path, fasta_path]

        self.run_command(command, script_name=script_name, rename_log=False)

        # Rewrite the FASTA into a temporary file with '-' removed and '.'
        # replaced by 'X' in every sequence.
        temp = u.FastaOutput(filesnpaths.get_temp_file_path())
        fasta = u.SequenceSource(fasta_path)

        try:
            while next(fasta):
                temp.write_id(fasta.id)
                temp.write_seq(fasta.seq.replace('-', '').replace('.', 'X'))
        finally:
            # Close both handles BEFORE moving the file: shutil.move falls
            # back to copy-and-delete across filesystems, and anything still
            # buffered in an open writer would be missing from the copy.
            fasta.close()
            temp.close()

        shutil.move(temp.output_file_path, fasta_path)

        driver = diamond.Diamond(
            query_fasta=fasta_path,
            run=terminal.Run(verbose=False),
            progress=terminal.Progress(verbose=False),
        )
        driver.makedb(output_file_path=dmnd_path)

        # The intermediate FASTA is removed once makedb has run
        os.remove(fasta_path)

Example #3

Show file

File: MODELLER.py Project: simatei/anvio

    def run_search_and_parse_results(self):
        """Search the target sequence against the DIAMOND database and select templates.

        Runs a DIAMOND blastp of `self.target_fasta_path` against
        `<self.database_dir>/<self.modeller_database>.dmnd` with an explicit
        tabular output format, then for each hit computes:

        - `align_fraction` = (length - gaps) / gene_length
        - `proper_pident`  = pident * align_fraction

        Hits must satisfy `pident >= self.percent_cutoff` and
        `align_fraction >= self.alignment_fraction_cutoff`. Only the best hit
        per PDB code is kept, hits whose `proper_pident` is more than 5 below
        the best are dropped, and at most `self.max_number_templates` remain.

        Side effects:
            - sets `self.list_of_template_code_and_chain_ids` to a list of
              (code, chain) tuples
            - appends to the `pdb_id`, `chain_id`, `proper_percent_similarity`,
              `percent_similarity` and `align_fraction` lists in
              `self.out["templates"]`
            - temporarily changes the working directory to `self.directory`

        Raises:
            self.EndModeller: if the search returns no hits, or no hit passes
                both cutoffs.
        """

        columns = [
            'qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gaps',
            'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore'
        ]

        # Change to MODELLER working directory. The try/finally guarantees we
        # change back to the user directory even if the search fails.
        os.chdir(self.directory)
        try:
            driver = diamond.Diamond(
                query_fasta=self.target_fasta_path,
                target_fasta=J(self.database_dir,
                               self.modeller_database + '.dmnd'),
                outfmt=' '.join(['6'] + columns),
                run=terminal.Run(verbose=False),
                progress=terminal.Progress(verbose=False),
            )
            driver.blastp()
        finally:
            os.chdir(self.start_dir)

        search_df = driver.view_as_dataframe(
            J(self.directory, driver.tabular_output_path))

        matches_found = search_df.shape[0]

        if not matches_found:
            self.run.warning(
                "No proteins with homologous sequence were found for {}. No structure will be modelled"
                .format(self.corresponding_gene_call))
            raise self.EndModeller

        # We need the gene length for align_fraction. The loop walks the whole
        # file, so the length of the last sequence read is the one used.
        target_fasta = u.SequenceSource(self.target_fasta_path,
                                        lazy_init=False)
        try:
            while next(target_fasta):
                gene_length = len(target_fasta.seq)
        finally:
            target_fasta.close()

        # add some useful columns
        # sseqid is split into everything but its last character (code) and
        # its last character (chain)
        search_df["code"] = search_df["sseqid"].str[:-1]
        search_df["chain"] = search_df["sseqid"].str[-1]
        search_df["align_fraction"] = (search_df["length"] -
                                       search_df["gaps"]) / gene_length
        search_df["proper_pident"] = search_df["pident"] * search_df[
            "align_fraction"]

        # Find best match for align fraction and pident. The row is taken
        # directly at the argmax position: looking it up again by code/chain
        # could return a different (worse) row when several rows share the
        # same code and chain.
        best_row = search_df.iloc[search_df['proper_pident'].argmax()]
        code_chain_id_of_best = (best_row['code'], best_row['chain'])
        best_hit = best_row[['pident', 'align_fraction']]

        # filter results by self.percent_cutoff and self.alignment_fraction_cutoff
        search_df = search_df[search_df["pident"] >= self.percent_cutoff]
        search_df = search_df[
            search_df["align_fraction"] >= self.alignment_fraction_cutoff]

        # Rank by the alignment fraction times the percent id
        search_df = search_df.sort_values("proper_pident", ascending=False)

        # If more than 1 template in 1 PDB id, just choose 1 (the best, since
        # the frame is sorted by proper_pident in descending order)
        search_df = search_df.drop_duplicates('code', keep='first')

        matches_after_filter = len(search_df)
        if not matches_after_filter:
            # The alignment fraction cutoff is compared against a 0-1 fraction,
            # so it is reported without a percent sign.
            self.run.warning(
                "Gene {} did not have a search result with percent identicalness above or equal "
                "to {}% and alignment fraction above or equal to {}. The best match was chain {} of https://www.rcsb.org/structure/{}, which had a "
                "percent identicalness of {:.2f}% and an alignment fraction of {:.3f}. No structure will be modelled."
                .format(self.corresponding_gene_call,
                        self.percent_cutoff,
                        self.alignment_fraction_cutoff,
                        code_chain_id_of_best[1],
                        code_chain_id_of_best[0],
                        best_hit['pident'],
                        best_hit['align_fraction']))
            raise self.EndModeller

        # Filter out templates with proper_pident more than 5% less than best match
        # http://merenlab.org/2018/09/04/getting-started-with-anvi-structure/#how-much-do-templates-matter
        search_df = search_df[search_df['proper_pident'] >= (
            search_df['proper_pident'].max() - 5)]

        # get up to self.max_number_templates of those with the highest proper_pident scores.
        search_df = search_df.iloc[:min(len(search_df),
                                        self.max_number_templates)]

        # Get their chain and 4-letter ids
        self.list_of_template_code_and_chain_ids = list(
            zip(search_df["code"], search_df["chain"]))

        self.run.info("Max number of templates allowed",
                      self.max_number_templates)
        self.run.info("Number of candidate templates", matches_found)
        self.run.info(
            "After >{}% identical filter".format(self.percent_cutoff),
            matches_after_filter)
        self.run.info("Number accepted as templates",
                      len(self.list_of_template_code_and_chain_ids))

        # update user on which templates are used, and write the templates to self.out
        templates_out = self.out["templates"]
        for i, (pdb_id, chain_id) in enumerate(
                self.list_of_template_code_and_chain_ids):
            proper_percent_similarity = search_df["proper_pident"].iloc[i]
            percent_similarity = search_df["pident"].iloc[i]
            align_fraction = search_df["align_fraction"].iloc[i]

            templates_out["pdb_id"].append(pdb_id)
            templates_out["chain_id"].append(chain_id)
            templates_out["proper_percent_similarity"].append(
                proper_percent_similarity)
            templates_out["percent_similarity"].append(percent_similarity)
            templates_out["align_fraction"].append(align_fraction)

            self.run.info(
                "Template {}".format(i + 1),
                "Protein ID: {}, Chain {} ({:.1f}% identical, {:.2f} align fraction)"
                .format(pdb_id, chain_id, percent_similarity, align_fraction))