Ejemplo n.º 1
0
    def run_search_and_parse_results(self):
        """Search the target sequence against the database and select templates.

        Runs a Diamond ``blastp`` of ``self.target_fasta_path`` against
        ``<database_dir>/<modeller_database>.dmnd`` and loads the tabular output
        as a dataframe. For every hit a length-normalized percent identity is
        computed (``proper_pident = pident * length / gene_length``). Hits at or
        above ``self.percent_identical_cutoff`` are kept, only the best hit per
        PDB code is retained, and at most ``self.max_number_templates`` of the
        highest scoring hits are accepted as templates.

        Side effects:
            * Temporarily changes the working directory to ``self.directory``
              and always restores ``self.start_dir`` afterwards.
            * Sets ``self.list_of_template_code_and_chain_ids`` to a list of
              ``(code, chain)`` tuples.
            * Appends every accepted template to ``self.out["templates"]``
              (keys ``pdb_id``, ``chain_id`` and ``ppi``).

        Raises:
            ConfigError: if ``percent_identical_cutoff`` or
                ``max_number_templates`` is unset, or if no sequence could be
                read from the target FASTA file.
            self.EndModeller: if the search returned no hits, or no hit passed
                the percent identity cutoff.
        """

        if not self.percent_identical_cutoff or not self.max_number_templates:
            raise ConfigError(
                "run_search_and_parse_results :: You initiated this class without providing values for percent_identical_cutoff "
                "and max_number_templates, which is required for this function."
            )

        # Run the search from the MODELLER working directory, and make sure we
        # return to the user directory even if the search itself fails.
        os.chdir(self.directory)
        try:
            driver = diamond.Diamond(
                query_fasta=self.target_fasta_path,
                target_fasta=J(self.database_dir,
                               self.modeller_database + '.dmnd'),
                run=terminal.Run(verbose=False),
                progress=terminal.Progress(verbose=False),
            )
            driver.blastp()
        finally:
            os.chdir(self.start_dir)

        search_df = driver.view_as_dataframe(
            J(self.directory, driver.tabular_output_path))

        matches_found = search_df.shape[0]

        if not matches_found:
            self.run.warning(
                "No proteins with homologous sequence were found for {}. No structure will be modelled"
                .format(self.corresponding_gene_call))
            raise self.EndModeller

        # We need the gene length for proper_pident. The loop keeps the length
        # of the last sequence read from the target FASTA.
        gene_length = 0
        target_fasta = u.SequenceSource(self.target_fasta_path,
                                        lazy_init=False)
        try:
            while next(target_fasta):
                gene_length = len(target_fasta.seq)
        finally:
            target_fasta.close()

        if not gene_length:
            # Without a gene length the normalization below is undefined.
            raise ConfigError(
                "run_search_and_parse_results :: No sequence could be read from the target FASTA file {}, "
                "so the percent identity cannot be normalized by gene length."
                .format(self.target_fasta_path))

        # add some useful columns
        search_df["proper_pident"] = (
            search_df["pident"] * search_df["length"] / gene_length)
        search_df["code"] = search_df["sseqid"].str[:-1]
        search_df["chain"] = search_df["sseqid"].str[-1]

        # Remember the best hit before filtering so it can be reported if
        # nothing passes the cutoff.
        max_pident_found = search_df["proper_pident"].max()
        id_of_max_pident = tuple(
            search_df.loc[search_df["proper_pident"].idxmax(),
                          ["code", "chain"]].values)

        # filter results by self.percent_identical_cutoff.
        search_df = search_df[
            search_df["proper_pident"] >= self.percent_identical_cutoff]

        search_df = search_df.sort_values("proper_pident", ascending=False)

        # If more than 1 template in 1 PDB id, just choose 1 (the best one,
        # since the frame is sorted by proper_pident)
        search_df = search_df.drop_duplicates('code', keep='first')

        matches_after_filter = len(search_df)
        if not matches_after_filter:
            self.run.warning(
                ("Gene {} did not have a search result with proper percent identicalness above or equal "
                 "to {}%. The best match was chain {} of https://www.rcsb.org/structure/{}, which had a "
                 "proper percent identicalness of {:.2f}%. No structure will be modelled.").format(
                     self.corresponding_gene_call,
                     self.percent_identical_cutoff,
                     id_of_max_pident[1],
                     id_of_max_pident[0],
                     max_pident_found))
            raise self.EndModeller

        # get up to self.max_number_templates of those with the highest
        # proper_pident scores (slicing past the end is harmless).
        search_df = search_df.iloc[:self.max_number_templates]

        # Get their chain and 4-letter ids
        self.list_of_template_code_and_chain_ids = list(
            zip(search_df["code"], search_df["chain"]))

        self.run.info("Max number of templates allowed",
                      self.max_number_templates)
        self.run.info("Number of candidate templates", matches_found)
        self.run.info(
            "After >{}% identical filter".format(
                self.percent_identical_cutoff), matches_after_filter)
        self.run.info("Number accepted as templates",
                      len(self.list_of_template_code_and_chain_ids))

        # update user on which templates are used, and write the templates to self.out
        templates = self.out["templates"]
        accepted = zip(self.list_of_template_code_and_chain_ids,
                       search_df["proper_pident"].values)
        for number, ((pdb_id, chain_id), ppi) in enumerate(accepted, start=1):
            templates["pdb_id"].append(pdb_id)
            templates["chain_id"].append(chain_id)
            templates["ppi"].append(ppi)

            self.run.info(
                "Template {}".format(number),
                "Protein ID: {}, Chain {} ({:.1f}% identical)".format(
                    pdb_id, chain_id, ppi))
Ejemplo n.º 2
0
    def check_database(self):
        """Make sure the .pir, .bin and .dmnd database files exist.

        Steps, each performed only when needed:

        1. If the ``.pir`` file is missing, download ``pdb_95.pir.gz`` into
           ``self.database_dir`` and decompress it with ``gzip -d``.
        2. If either the ``.pir`` or the ``.bin`` file was missing, (re)create
           the ``.bin`` file with ``self.run_binarize_database``.
        3. If the ``.dmnd`` file is missing, convert the ``.pir`` file to
           FASTA with the ``pir_to_fasta.py`` script, strip ``-`` characters
           and replace ``.`` with ``X`` in every sequence, build the Diamond
           database from it, and delete the intermediate FASTA.
        """

        bin_db_path = J(self.database_dir, self.modeller_database + ".bin")
        pir_db_path = J(self.database_dir, self.modeller_database + ".pir")
        bin_exists = utils.filesnpaths.is_file_exists(bin_db_path,
                                                      dont_raise=True)
        pir_exists = utils.filesnpaths.is_file_exists(pir_db_path,
                                                      dont_raise=True)

        if not (bin_exists and pir_exists):
            if not pir_exists:
                # Download .pir
                self.run.warning(
                    "Anvi'o looked in {} for a database with the name {} and with an extension "
                    "of either .bin or .pir, but didn't find anything matching that "
                    "criteria. Anvi'o will try and download the best database it knows of from "
                    "https://salilab.org/modeller/downloads/pdb_95.pir.gz and use that. "
                    "You can checkout https://salilab.org/modeller/ for more info about the pdb_95 "
                    "database".format(self.database_dir,
                                      self.modeller_database))

                # NOTE(review): the download is always stored as pdb_95.pir,
                # while the binarize step below reads
                # <modeller_database>.pir -- confirm modeller_database is
                # guaranteed to be "pdb_95" whenever this branch runs.
                db_download_path = os.path.join(self.database_dir,
                                                "pdb_95.pir.gz")
                utils.download_file(
                    "https://salilab.org/modeller/downloads/pdb_95.pir.gz",
                    db_download_path)
                utils.run_command(
                    ['gzip', '-d', db_download_path],
                    log_file_path=filesnpaths.get_temp_file_path())

            # Binarize .pir (make .bin)
            self.run.warning(
                "Your database is not in binary format. That means accessing its contents is slower "
                "than it could be. Anvi'o is going to make a binary format. Just FYI"
            )
            self.run_binarize_database(pir_db_path, bin_db_path)

        dmnd_db_path = J(self.database_dir, self.modeller_database + '.dmnd')

        if os.path.exists(dmnd_db_path):
            return

        self.run.warning(
            "Your diamond database does not exist. It will be created.")

        script_name = "pir_to_fasta.py"

        self.copy_script_to_directory(script_name)

        fasta_path = J(self.database_dir, self.modeller_database + '.fa')
        # makedb receives the path without the .dmnd extension
        dmnd_path = J(self.database_dir, self.modeller_database)

        command = [self.executable, script_name, pir_db_path, fasta_path]

        self.run_command(command, script_name=script_name, rename_log=False)

        # Rewrite the FASTA with cleaned-up sequences into a temporary file.
        temp = u.FastaOutput(filesnpaths.get_temp_file_path())
        fasta = u.SequenceSource(fasta_path)
        try:
            while next(fasta):
                temp.write_id(fasta.id)
                temp.write_seq(fasta.seq.replace('-', '').replace('.', 'X'))
        finally:
            # Close both handles BEFORE moving: shutil.move copies the file
            # when source and destination are on different filesystems, and
            # any output still buffered in an open writer would be lost.
            fasta.close()
            temp.close()

        shutil.move(temp.output_file_path, fasta_path)

        driver = diamond.Diamond(
            query_fasta=fasta_path,
            run=terminal.Run(verbose=False),
            progress=terminal.Progress(verbose=False),
        )
        driver.makedb(output_file_path=dmnd_path)

        # The FASTA was only needed to build the Diamond database
        os.remove(fasta_path)
Ejemplo n.º 3
0
    def run_search_and_parse_results(self):
        """Search the target sequence against the database and select templates.

        Runs a Diamond ``blastp`` of ``self.target_fasta_path`` against
        ``<database_dir>/<modeller_database>.dmnd`` with an explicit tabular
        column list, then derives for every hit:

        * ``align_fraction = (length - gaps) / gene_length``
        * ``proper_pident  = pident * align_fraction``

        Hits must have ``pident >= self.percent_cutoff`` and
        ``align_fraction >= self.alignment_fraction_cutoff``. Of the remaining
        hits only the best per PDB code is kept, hits whose ``proper_pident``
        is more than 5 below the best one are dropped, and at most
        ``self.max_number_templates`` are accepted.

        Side effects:
            * Temporarily changes the working directory to ``self.directory``
              and always restores ``self.start_dir`` afterwards.
            * Sets ``self.list_of_template_code_and_chain_ids`` to a list of
              ``(code, chain)`` tuples.
            * Appends every accepted template to ``self.out["templates"]``
              (keys ``pdb_id``, ``chain_id``, ``proper_percent_similarity``,
              ``percent_similarity`` and ``align_fraction``).

        Raises:
            ValueError: if no sequence could be read from the target FASTA
                file.
            self.EndModeller: if the search returned no hits, or no hit passed
                both cutoffs.
        """

        columns = [
            'qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gaps',
            'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore'
        ]

        # Run the search from the MODELLER working directory, and make sure we
        # return to the user directory even if the search itself fails.
        os.chdir(self.directory)
        try:
            driver = diamond.Diamond(
                query_fasta=self.target_fasta_path,
                target_fasta=J(self.database_dir,
                               self.modeller_database + '.dmnd'),
                outfmt=' '.join(['6'] + columns),
                run=terminal.Run(verbose=False),
                progress=terminal.Progress(verbose=False),
            )
            driver.blastp()
        finally:
            os.chdir(self.start_dir)

        search_df = driver.view_as_dataframe(
            J(self.directory, driver.tabular_output_path))

        matches_found = search_df.shape[0]

        if not matches_found:
            self.run.warning(
                "No proteins with homologous sequence were found for {}. No structure will be modelled"
                .format(self.corresponding_gene_call))
            raise self.EndModeller

        # We need the gene length for the alignment fraction. The loop keeps
        # the length of the last sequence read from the target FASTA.
        gene_length = 0
        target_fasta = u.SequenceSource(self.target_fasta_path,
                                        lazy_init=False)
        try:
            while next(target_fasta):
                gene_length = len(target_fasta.seq)
        finally:
            target_fasta.close()

        if not gene_length:
            # Without a gene length the normalization below is undefined.
            raise ValueError(
                "run_search_and_parse_results :: No sequence could be read from the target FASTA file {}, "
                "so the alignment fraction cannot be calculated."
                .format(self.target_fasta_path))

        # add some useful columns
        search_df["code"] = search_df["sseqid"].str[:-1]
        search_df["chain"] = search_df["sseqid"].str[-1]
        search_df["align_fraction"] = (
            search_df["length"] - search_df["gaps"]) / gene_length
        search_df["proper_pident"] = (
            search_df["pident"] * search_df["align_fraction"])

        # Find best match for align fraction and pident. The values reported
        # in the warning below are taken from that very row, rather than from
        # the first row that happens to share its code and chain.
        best_row = search_df.iloc[search_df['proper_pident'].argmax()]
        code_chain_id_of_best = tuple(best_row[['code', 'chain']].values)
        best_hit = best_row[['pident', 'align_fraction']]

        # filter results by self.percent_cutoff and self.alignment_fraction_cutoff
        search_df = search_df[search_df["pident"] >= self.percent_cutoff]
        search_df = search_df[
            search_df["align_fraction"] >= self.alignment_fraction_cutoff]

        # Rank by the alignment fraction times the percent id
        search_df = search_df.sort_values("proper_pident", ascending=False)

        # If more than 1 template in 1 PDB id, just choose 1 (the best one,
        # since the frame is sorted by proper_pident)
        search_df = search_df.drop_duplicates('code', keep='first')

        matches_after_filter = len(search_df)
        if not matches_after_filter:
            self.run.warning(
                ("Gene {} did not have a search result with percent identicalness above or equal "
                 "to {}% and alignment fraction above {}%. The best match was chain {} of https://www.rcsb.org/structure/{}, which had a "
                 "percent identicalness of {:.2f}% and an alignment fraction of {:.3f}. No structure will be modelled.").format(
                     self.corresponding_gene_call,
                     self.percent_cutoff,
                     self.alignment_fraction_cutoff,
                     code_chain_id_of_best[1],
                     code_chain_id_of_best[0],
                     best_hit['pident'],
                     best_hit['align_fraction']))
            raise self.EndModeller

        # Filter out templates with proper_pident more than 5% less than best match
        # http://merenlab.org/2018/09/04/getting-started-with-anvi-structure/#how-much-do-templates-matter
        search_df = search_df[search_df['proper_pident'] >= (
            search_df['proper_pident'].max() - 5)]

        # get up to self.max_number_templates of those with the highest
        # proper_pident scores (slicing past the end is harmless).
        search_df = search_df.iloc[:self.max_number_templates]

        # Get their chain and 4-letter ids
        self.list_of_template_code_and_chain_ids = list(
            zip(search_df["code"], search_df["chain"]))

        self.run.info("Max number of templates allowed",
                      self.max_number_templates)
        self.run.info("Number of candidate templates", matches_found)
        self.run.info(
            "After >{}% identical filter".format(self.percent_cutoff),
            matches_after_filter)
        self.run.info("Number accepted as templates",
                      len(self.list_of_template_code_and_chain_ids))

        # update user on which templates are used, and write the templates to self.out
        templates = self.out["templates"]
        accepted = zip(self.list_of_template_code_and_chain_ids,
                       search_df["proper_pident"].values,
                       search_df["pident"].values,
                       search_df["align_fraction"].values)
        for number, hit in enumerate(accepted, start=1):
            (pdb_id, chain_id), proper_percent_similarity, \
                percent_similarity, align_fraction = hit

            templates["pdb_id"].append(pdb_id)
            templates["chain_id"].append(chain_id)
            templates["proper_percent_similarity"].append(
                proper_percent_similarity)
            templates["percent_similarity"].append(percent_similarity)
            templates["align_fraction"].append(align_fraction)

            self.run.info(
                "Template {}".format(number),
                "Protein ID: {}, Chain {} ({:.1f}% identical, {:.2f} align fraction)"
                .format(pdb_id, chain_id, percent_similarity, align_fraction))