Beispiel #1
0
def run_blast(species_id_path):
    """Run blastn for one species file and export the hits as XML and text.

    Nothing is executed when the XML report for this species already exists.
    When the ``.ndb`` file of the species database is missing, ``blastdb`` is
    called with ``species_id_path`` before the search starts.

    The function reads the module-level names ``query_file``,
    ``species_db_dir``, ``species_out_asn_dir``, ``species_out_xml_dir`` and
    ``species_out_txt_dir``.

    :param species_id_path: path object whose ``stem`` is used as the database
        name and as the base name of every output file
    :return: None
    """
    blastn_exe = '/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.10.1+/bin/blastn'
    formatter_exe = '/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.10.1+/bin/blast_formatter'

    name = species_id_path.stem
    asn_path = species_out_asn_dir / (name + ".asn")
    xml_path = species_out_xml_dir / (name + ".xml")
    txt_path = species_out_txt_dir / (name + ".txt")

    # outfmt=11 writes a BLAST archive; both reports below are derived from
    # it. No perc_identity filter is passed to blastn.
    search = NcbiblastnCommandline(
        cmd=blastn_exe,
        query=query_file,
        db=species_db_dir / name,
        outfmt=11,
        out=asn_path,
    )
    to_xml = NcbiblastformatterCommandline(
        archive=asn_path,
        outfmt=5,
        out=xml_path,
        cmd=formatter_exe,
    )
    to_txt = NcbiblastformatterCommandline(
        archive=asn_path,
        outfmt=7,
        out=txt_path,
        cmd=formatter_exe,
    )
    ndb_path = species_db_dir / (name + ".ndb")

    # The XML report doubles as the "already done" marker.
    if xml_path.exists():
        return
    if not ndb_path.exists():
        blastdb(species_id_path)
    search()
    to_txt()
    to_xml()
def blast(db_file, query_file, blast_out_xml_file, blast_out_asn_file,
          blast_out_txt_file):
    """Search ``query_file`` against ``db_file`` and write three result files.

    blastn is run first and writes a BLAST archive (``outfmt=11``) to
    ``blast_out_asn_file``. ``blast_formatter`` is then run twice on that
    archive: once with ``outfmt=5`` into ``blast_out_xml_file`` and once with
    ``outfmt=7`` into ``blast_out_txt_file``.

    :param db_file: value passed to blastn as ``db``
    :param query_file: value passed to blastn as ``query``
    :param blast_out_xml_file: destination of the ``outfmt=5`` report
    :param blast_out_asn_file: destination of the archive, and input of both
        formatter runs
    :param blast_out_txt_file: destination of the ``outfmt=7`` report
    :return: None
    """
    blastn_exe = '/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.10.1+/bin/blastn'
    formatter_exe = '/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.10.1+/bin/blast_formatter'

    search = NcbiblastnCommandline(
        cmd=blastn_exe,
        query=query_file,
        db=db_file,
        outfmt=11,
        out=blast_out_asn_file,
    )
    search()

    # XML first, then the text report; each command is built right before it
    # is executed.
    for fmt_code, destination in ((5, blast_out_xml_file),
                                  (7, blast_out_txt_file)):
        convert = NcbiblastformatterCommandline(
            archive=blast_out_asn_file,
            outfmt=fmt_code,
            out=destination,
            cmd=formatter_exe,
        )
        convert()
Beispiel #3
0
def psiBlastScoring(
        f_in='seq3.txt',
        b_db='db/samples/TAIR8cds',
        blast_exe='/home/oohnohnoh1/Desktop/ACADEMIA/Papermaking/OPTIMUS_BIND/ZIP/ncbi-blast-2.9.0+/bin/blastn',
):
    """Run a local blastn search and return its XML output.

    JB's instructions on psi-blast:

    1. We find all similar interfaces.
    2. Make an MSA (multiple sequence alignment) of structurally aligned
       sequences.
    3. Form a score from the probability of a particular mutation showing up
       in the MSA.
    4. Evaluate the mutation with this score - this is where we need the
       mutation.

    Notes:

    -> Should not require an HPC to run each blast computation.

    Some BLAST programs:
    ---------------------------------------------------------
    blastn  -> nucleotide vs nucleotide
    blastp  -> protein vs protein
    blastx  -> translated nucleotide vs protein
    tblastn -> protein vs translated nucleotide
    tblastx -> translated nucleotide vs translated nucleotide
    ---------------------------------------------------------

    Only the blastn call is implemented here; steps 2-4 above are not.

    :param f_in: path of the query sequence file
    :param b_db: BLAST database to search
    :param blast_exe: path of the blastn executable
    :return: the text blastn wrote to standard output (``outfmt=5``, i.e.
        BLAST XML)
    :raises ImportError: if Biopython's BLAST wrappers cannot be imported
    """
    try:
        # The search wrapper is needed here; the blast_formatter wrapper has
        # no query/db/evalue options and cannot be used for this call.
        from Bio.Blast.Applications import NcbiblastnCommandline
    except ImportError as err:
        # Continuing without the wrapper could only end in a NameError below.
        raise ImportError(
            "Error - cannot import Bio.Blast.Applications") from err

    blastn_cline = NcbiblastnCommandline(cmd=blast_exe,
                                         query=f_in,
                                         db=b_db,
                                         evalue=.0005,
                                         outfmt=5)
    # Calling the wrapper runs the program and returns (stdout, stderr) as
    # strings, not as file handles.
    xml_output, _ = blastn_cline()
    return xml_output
Beispiel #4
0
def blast(db_file, query_file, blast_out_xml_file, blast_out_asn_file,
          blast_out_txt_file):
    """Run blastn, then convert its archive to XML and to a text report.

    Both programs are invoked by bare name (``blastn``, ``blast_formatter``).

    :param db_file: value passed to blastn as ``db``
    :param query_file: value passed to blastn as ``query``
    :param blast_out_xml_file: destination of the ``outfmt=5`` conversion
    :param blast_out_asn_file: destination of the blastn archive
        (``outfmt=11``) and input of both conversions
    :param blast_out_txt_file: destination of the ``outfmt=7`` conversion
    :return: None
    """

    def _export(fmt_code, destination):
        # Build and immediately run one blast_formatter conversion.
        NcbiblastformatterCommandline(
            archive=blast_out_asn_file,
            outfmt=fmt_code,
            out=destination,
            cmd='blast_formatter',
        )()

    run_search = NcbiblastnCommandline(
        cmd='blastn',
        query=query_file,
        db=db_file,
        outfmt=11,
        out=blast_out_asn_file,
    )
    run_search()
    _export(5, blast_out_xml_file)
    _export(7, blast_out_txt_file)
def run_blast(query_file, species_id_path, species_out_path):
    """Search ``query_file`` against one species and write ASN/XML/text output.

    Files are placed below ``species_out_path`` in the sub-directories
    ``blastdb``, ``asn``, ``xml`` and ``txt``, all named after
    ``species_id_path.stem``. Nothing is done when the XML report already
    exists. The database is built with makeblastdb when its ``.ndb`` file is
    missing.

    A failing BLAST command does not raise: the command line that failed is
    printed instead. If the blastn search itself fails, the two formatter
    steps are skipped because they can only read the archive blastn writes.

    :param query_file: value passed to blastn as ``query``
    :param species_id_path: path object of the species sequence file; used as
        makeblastdb input, and its ``stem`` names all outputs
    :param species_out_path: path object of the output root directory
    :return: None
    """
    name = species_id_path.stem
    db_prefix = species_out_path / "blastdb" / name
    asn_file = species_out_path / "asn" / (name + ".asn")
    xml_file = species_out_path / "xml" / (name + ".xml")
    txt_file = species_out_path / "txt" / (name + ".txt")

    # The XML report doubles as the "already done" marker.
    if xml_file.exists():
        return

    make_db_cmd = NcbimakeblastdbCommandline(
        cmd='/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.10.1+/bin/makeblastdb',
        dbtype='nucl',
        input_file=species_id_path,
        out=str(db_prefix),
    )
    blast_cmd = NcbiblastnCommandline(
        cmd='/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.10.1+/bin/blastn',
        query=query_file,
        db=db_prefix,
        outfmt=11,
        out=asn_file,
    )
    blast_xml_cmd = NcbiblastformatterCommandline(
        archive=asn_file,
        outfmt=5,
        out=xml_file,
        cmd='/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.10.1+/bin/blast_formatter',
    )
    blast_txt_cmd = NcbiblastformatterCommandline(
        archive=asn_file,
        outfmt=7,
        out=txt_file,
        cmd='/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.10.1+/bin/blast_formatter',
    )

    if not (species_out_path / "blastdb" / (name + ".ndb")).exists():
        make_db_cmd()

    try:
        blast_cmd()
    except ApplicationError:
        # Report the command that actually failed, then stop: without a fresh
        # archive the formatters would fail too or convert a stale file.
        print(blast_cmd)
        return

    # Text report first, XML last, so the XML marker only appears at the end.
    for formatter_cmd in (blast_txt_cmd, blast_xml_cmd):
        try:
            formatter_cmd()
        except ApplicationError:
            print(formatter_cmd)
Beispiel #6
0
def conserved_domain_search(
        nucleotide_seq_path: str,
        cd_ans_path: str,
        cd_xml_path: Optional[str],
        cd_txt_path: Optional[str],
        cd_csv_path: Optional[str],
) -> None:
    """conserved domain search for a given nucleotide sequence

    Every stage is skipped when its output file already exists. The CSV
    stage is additionally skipped when no rpsbproc text report is available
    to parse (``cd_txt_path`` not given, or the file was not produced).

    :param nucleotide_seq_path: path to nucleotide sequence to be searched
    for conserved domains
    :type nucleotide_seq_path: str
    :param cd_ans_path: path to the result in BLAST archive (ASN.1) format,
    preferably ended with '.ans'
    :type cd_ans_path: str
    :param cd_xml_path: optional path to the result in BLAST XML format,
    preferably ended with '.xml'
    :type cd_xml_path: Optional[str]
    :param cd_txt_path: optional path to the result in post-rpsblast in text
    format, preferably ended with '.txt'
    :type cd_txt_path: Optional[str]
    :param cd_csv_path: optional path to the result in post-rpsblast in CSV
    format, preferably ended with '.csv'; requires the text report at
    ``cd_txt_path``
    :type cd_csv_path: Optional[str]
    :return: None
    """

    # run the search only if the result BLAST archive (ASN.1) does not exist
    if not os.path.exists(cd_ans_path):
        rpstblastn_cmd = NcbirpstblastnCommandline(
            query=nucleotide_seq_path,
            **RPSTBLASTN_KWARGS,
        )
        # the wrapper returns (stdout, stderr); only stdout is kept
        cd_ans, _ = rpstblastn_cmd()

        # write to result ASN.1 file if given
        if cd_ans_path:
            with open(cd_ans_path, 'w+') as _fh:
                _fh.write(cd_ans)

    # translate ASN.1 to XML format for easier Biopython parsing
    if cd_xml_path and (not os.path.exists(cd_xml_path)):
        formatter_cmd = NcbiblastformatterCommandline(
            archive=cd_ans_path,
            out=cd_xml_path,
            outfmt=5,
        )
        formatter_cmd()

    # post-rpsblast processing with rpsbproc and store in text format
    if cd_txt_path and (not os.path.exists(cd_txt_path)):
        rpsbproc_cmd = Popen(
            [
                'rpsbproc',
                '--infile', f'{cd_ans_path}',
                '--outfile', f'{cd_txt_path}',
                '--data-path', f'{CDD_DATA_DIR_PATH}',
                '--data-mode', 'full',
                '--evalue', f'{RPSTBLASTN_KWARGS["evalue"]}',
                '--show-families',
                '--quiet',
            ],
            stdout=PIPE,
            stderr=PIPE,
            stdin=PIPE,
        )
        # communicate() drains both pipes and waits for the process to exit
        rpsbproc_cmd.communicate()

    # parse the post-rpsblast processing results and store in CSV format
    if cd_csv_path and (not os.path.exists(cd_csv_path)):
        # the CSV is derived from the text report; without that file there
        # is nothing to parse, so skip instead of failing inside open()
        if not cd_txt_path or not os.path.exists(cd_txt_path):
            return
        with open(cd_txt_path, 'r') as _fh:
            rpsbproc_output = _fh.read()
        rpsbproc_output_df = parse_rpsbproc_output(rpsbproc_output)
        rpsbproc_output_df.to_csv(cd_csv_path, index=False)
Beispiel #7
0
def search_conserved_domains(
    fasta_file_path: str,
    cd_ans_path: str,
    fasta_file_type: Optional[Union[FASTAType, str]] = None,
    cd_xml_path: Optional[str] = None,
    cd_txt_path: Optional[str] = None,
    cd_csv_path: Optional[str] = None,
) -> None:
    """perform conserved domain search for a given FASTA sequence and parse
    the results into multiple formats

    Every stage is skipped when its output file already exists. If the
    search command fails, a warning is logged and the function returns
    without producing any further output. The CSV stage is skipped with a
    warning when no rpsbproc text report is available to parse.

    :param fasta_file_path: path to the FASTA file for domain search
    :type fasta_file_path: str
    :param fasta_file_type: FASTA file type, could be string or FASTAType
    defined in this file, or None, in which case the function will infer
    the FASTA type from the file extension
    :type fasta_file_type: Optional[Union[FASTAType, str]]
    :param cd_ans_path: path to the result in BLAST archive (ASN.1) format,
    preferably ended with '.ans'
    :type cd_ans_path: str
    :param cd_xml_path: optional path to the result in BLAST XML format,
    preferably ended with '.xml'
    :type cd_xml_path: Optional[str]
    :param cd_txt_path: optional path to the result in post-rpsblast in text
    format, preferably ended with '.txt'
    :type cd_txt_path: Optional[str]
    :param cd_csv_path: optional path to the result in post-rpsblast in CSV
    format, preferably ended with '.csv'; requires the text report at
    ``cd_txt_path``
    :type cd_csv_path: Optional[str]
    :return: None
    :raises ValueError: if the FASTA file type cannot be converted to a
    FASTAType
    :raises NotImplementedError: if the FASTA file type is neither
    FASTAType.fna nor FASTAType.faa
    """

    # infer the FASTA file type if not explicitly given or given as string
    if not fasta_file_type:
        fasta_file_type = os.path.splitext(fasta_file_path)[1]
    if not isinstance(fasta_file_type, FASTAType):
        try:
            fasta_file_type = FASTAType(fasta_file_type)
        except ValueError:
            _error_msg = \
                f'cannot parse the given FASTA file with extension ' \
                f'\'{fasta_file_type}\', which must be one of ' \
                f'{list(FASTAType.__members__.keys())}.'
            raise ValueError(_error_msg)

    # run the search only if the result BLAST archive (ASN.1) does not exist
    if not os.path.exists(cd_ans_path):
        if fasta_file_type == FASTAType.fna:
            rpsblast_cmd = NcbirpstblastnCommandline(
                query=fasta_file_path,
                **RPSTBLASTN_KWARGS,
            )
        elif fasta_file_type == FASTAType.faa:
            rpsblast_cmd = NcbirpsblastCommandline(
                query=fasta_file_path,
                **RPSTBLASTN_KWARGS,
            )
        else:
            _error_msg = \
                f'conserved domains search has not been implemented for ' \
                f'FASTA file type with extension \'{fasta_file_type.value}\'.'
            raise NotImplementedError(_error_msg)

        try:
            # the wrapper returns (stdout, stderr); only stdout is kept
            cd_ans, _ = rpsblast_cmd()
        except ApplicationError as __error:
            _warning_msg = f'error from rpsblast: {__error}; skipping ...'
            _LOGGER.warning(_warning_msg)
            return

        # write to result ASN.1 file if given
        if cd_ans_path:
            with open(cd_ans_path, 'w+') as _fh:
                _fh.write(cd_ans)

    # translate ASN.1 to XML format for easier Biopython parsing
    if cd_xml_path and (not os.path.exists(cd_xml_path)):
        formatter_cmd = NcbiblastformatterCommandline(
            archive=cd_ans_path,
            out=cd_xml_path,
            outfmt=5,
        )
        formatter_cmd()

    # post-rpsblast processing with rpsbproc and store in text format
    if cd_txt_path and (not os.path.exists(cd_txt_path)):
        rpsbproc_cmd = Popen(
            [
                'rpsbproc',
                '--infile',
                f'{cd_ans_path}',
                '--outfile',
                f'{cd_txt_path}',
                '--data-path',
                f'{CDD_DATA_DIR_PATH}',
                '--data-mode',
                'full',
                '--evalue',
                f'{RPSTBLASTN_KWARGS["evalue"]}',
                '--show-families',
                '--quiet',
            ],
            stdout=PIPE,
            stderr=PIPE,
            stdin=PIPE,
        )
        # communicate() alone: it drains the pipes and waits for exit.
        # Calling wait() first can block forever once the child has filled
        # a pipe buffer that nobody is reading.
        rpsbproc_cmd.communicate()

    # parse the post-rpsblast processing results and store in CSV format
    if cd_csv_path and (not os.path.exists(cd_csv_path)):
        # the CSV is derived from the text report; without that file there
        # is nothing to parse, so skip instead of failing inside open()
        if not cd_txt_path or not os.path.exists(cd_txt_path):
            _warning_msg = \
                f'no rpsbproc text output available for ' \
                f'\'{cd_csv_path}\'; skipping ...'
            _LOGGER.warning(_warning_msg)
            return
        with open(cd_txt_path, 'r') as _fh:
            rpsbproc_output = _fh.read()
        rpsbproc_output_df = __parse_rpsbproc_output(rpsbproc_output)
        if rpsbproc_output_df is not None:
            rpsbproc_output_df.to_csv(cd_csv_path, index=False)