Ejemplo n.º 1
0
def search_and_process(rpsblast, cdd_name, tmp_dir, evalue,
                       translation_id, translation):
    """
    Run rpsblast for one translation and return the processed output.

    :param rpsblast: path to rpsblast binary
    :param cdd_name: CDD database path/name
    :param tmp_dir: directory in which the query and XML files are written
    :param evalue: evalue cutoff given to rpsblast and to process_rps_output
    :param translation_id: identifier used for the FASTA header and the
        names of the query/output files
    :param translation: protein sequence to query
    :return: one-element list containing a dict with the keys
        "Translation" and "Data"
    """
    # Both files share the same stem: <tmp_dir>/<translation_id>
    path_stem = "{}/{}".format(tmp_dir, translation_id)
    query_path = path_stem + ".txt"
    xml_path = path_stem + ".xml"

    # The query is a single FASTA record
    with open(query_path, "w") as query_file:
        query_file.write(">{}\n{}".format(translation_id, translation))

    # outfmt=5 requests XML output from rpsblast
    run_search = NcbirpsblastCommandline(
        cmd=rpsblast,
        db=cdd_name,
        query=query_path,
        out=xml_path,
        outfmt=5,
        evalue=evalue,
    )
    run_search()

    hit = {
        "Translation": translation,
        "Data": process_rps_output(xml_path, evalue),
    }

    # Wrapped in a list because of a filter in
    # parallelize.start_processes()
    return [hit]
Ejemplo n.º 2
0
def local_rpsblast_2file(query_file, dbfile_path, outfile, prefs):
    """Run rpsblast on query_file against a local database.

    The e-value cutoff is read from ``prefs['evalue']`` and the results are
    written to ``outfile``. The call blocks until the child process has
    finished and returns nothing.
    """
    options = {
        'query': query_file,
        'db': dbfile_path,
        'out': outfile,
        'evalue': prefs['evalue'],
        'outfmt': 5,  # XML output is mandatory
    }
    command_line = str(NcbirpsblastCommandline(**options))
    process = subprocess.Popen(command_line, stdout=subprocess.PIPE,
                               shell=True)
    # communicate() makes the calling script wait for the child to exit
    process.communicate()
Ejemplo n.º 3
0
def rpsblast(blast_file, rpsblast_db, blast_path, evalue, blastoutput_xml):
    """
    Run an rpsBLAST search of blast_file against rpsblast_db.

    The executable is first run from ``blast_path``. If that raises an
    OSError, the binary is copied into the current directory and the search
    is retried with the bare command name.

    Args:
        blast_file (str):      Path to fasta file used to BLAST against
                               rpsblast_db
        rpsblast_db (str):     Name of the database to BLAST against
        blast_path (str):      Directory containing the rpsblast executable
        evalue (float):        Evalue to use as threshold
        blastoutput_xml (str): Path of the XML file rpsblast writes; it is
                               parsed once the search has finished

    Returns:
        A list of the blast records yielded by NCBIXML.parse
    """
    def run_search(executable):
        # outfmt=5 requests XML, which is what NCBIXML.parse expects
        command = NcbirpsblastCommandline(executable,
                                          query=blast_file,
                                          db=rpsblast_db,
                                          evalue=evalue,
                                          outfmt=5,
                                          out=blastoutput_xml)
        command()

    executable = blast_path + '/rpsblast'
    try:
        # first try to run it with location of blast program
        run_search(executable)
    except OSError as e:
        print(e)
        # else, try to copy it to current folder and run it directly
        shutil.copy2(executable, 'rpsblast')
        run_search('rpsblast')

    # Materialise the records so the file can be closed before returning
    with open(blastoutput_xml) as result_handle:
        return list(NCBIXML.parse(result_handle))
Ejemplo n.º 4
0
 def do_cdd_rps(self, record):
     """Run rpsblast for ``record`` against the CDD database.

     The tabular hits (query and subject accessions) are written next to
     ``record`` with a ``.cdd`` suffix; whatever the command returns for
     stdout and stderr is printed.
     """
     options = {
         'query': record,
         'db': self.cdd_dbase,
         'out': pathlib.Path(record).with_suffix('.cdd'),
         'evalue': self.evalue,
         'outfmt': "6 qacc sacc",
         'max_target_seqs': 50,
     }
     run_search = NcbirpsblastCommandline(**options)
     captured_out, captured_err = run_search()
     print(captured_out, captured_err)
Ejemplo n.º 5
0
def find_domains(cnx, gene_ids, sequences, num_threads=1):
    """Search the given genes against the conserved domain database.

    All sequences are written to one temporary FASTA file, rpsblast is run
    on it, the XML output is handed to ``read_domains_from_xml`` and finally
    ``gene.cdd_status`` is set to 1 for every gene id.

    :param cnx: database connection; passed to ``read_domains_from_xml`` and
        used to open the cursor for the status update
    :param gene_ids: sized sequence of gene ids, paired with ``sequences``
    :param sequences: sequences to search, in the same order as ``gene_ids``
    :param num_threads: value passed to rpsblast's ``num_threads`` option
    :return: None
    """
    # Nothing to search or flag; this also avoids the invalid SQL "IN ( )".
    if len(gene_ids) == 0:
        return

    fasta_filename = None
    try:
        # build fasta file of all genes
        with tempfile.NamedTemporaryFile(mode='w', delete=False) as fasta_file:
            fasta_filename = fasta_file.name
            for gene_id, sequence in izip(gene_ids, sequences):
                _write_fasta_record(fasta_file, sequence, gene_id)

        output_directory = tempfile.mkdtemp(suffix='-blast')
        try:
            # run rpsblast
            output_filename = os.path.join(output_directory, 'rpsblast.xml')
            expectation_value_cutoff = 0.001
            cdd_database = os.path.join(_DATA_DIR, 'conserved-domain-database',
                                        'Cdd', 'Cdd')
            rpsblast_bin = os.path.join(_DATA_DIR, 'ncbi-blast', 'rpsblast')
            cline = NcbirpsblastCommandline(rpsblast_bin,
                                            query=fasta_filename,
                                            db=cdd_database,
                                            out=output_filename,
                                            outfmt=_OUTPUT_FORMAT_XML,
                                            evalue=expectation_value_cutoff,
                                            num_threads=num_threads)
            cline()

            # parse rpsblast output
            read_domains_from_xml(cnx, output_filename)

        finally:
            shutil.rmtree(output_directory)
    finally:
        if fasta_filename is not None:
            try:
                os.remove(fasta_filename)
            except (IOError, OSError):
                # Best-effort cleanup. os.remove raises OSError, which
                # IOError alone does not cover on Python 2.
                pass

    # set the gene.cdd_status flag to True
    # this flag is used by legacy k_phamerate scripts
    with closing(cnx.cursor()) as cursor:
        id_parameters = ','.join(['%s'] * len(gene_ids))
        query = '''
            UPDATE gene
            SET cdd_status = 1
            WHERE gene.GeneID IN ( %s )
                       ''' % id_parameters
        cursor.execute(query, params=gene_ids)
Ejemplo n.º 6
0
def search_and_process2(rpsblast, cdd_name, tmp_dir, evalue, translation_id,
                        translation):
    """
    Run rpsblast for one translation and turn the hits into a flat list.

    For every HSP whose expect value is within the cutoff, two items are
    appended to the result, in this order: an INSERT_INTO_DOMAIN statement
    string, then a dict describing the gene/domain hit.

    :param rpsblast: path to rpsblast binary
    :param cdd_name: CDD database path/name
    :param tmp_dir: directory in which the query and XML files are written
    :param evalue: evalue cutoff for rpsblast and for the HSP filter
    :param translation_id: identifier used for the FASTA header and the
        names of the query/output files
    :param translation: protein sequence to query
    :return: list mixing statement strings and hit dicts
    """
    query_path = "{}/{}.txt".format(tmp_dir, translation_id)
    xml_path = "{}/{}.xml".format(tmp_dir, translation_id)

    # The query is a single FASTA record
    with open(query_path, "w") as query_file:
        query_file.write(">{}\n{}".format(translation_id, translation))

    # outfmt=5 requests XML output, which NCBIXML parses below
    NcbirpsblastCommandline(cmd=rpsblast, db=cdd_name, query=query_path,
                            out=xml_path, outfmt=5, evalue=evalue)()

    results = []

    with open(xml_path, "r") as xml_file:
        for record in NCBIXML.parse(xml_file):
            # Records without alignments contribute nothing
            if not record.alignments:
                continue

            for align in record.alignments:
                for hsp in align.hsps:
                    if not hsp.expect <= evalue:
                        continue

                    align.hit_def = align.hit_def.replace("\"", "\'")

                    # hit_def is "<id>, <name>, <description...>" with the
                    # leading fields being optional
                    fields = align.hit_def.split(",")
                    domain_id = None
                    name = None
                    if len(fields) == 1:
                        description = fields[0].strip()
                    elif len(fields) == 2:
                        domain_id = fields[0].strip()
                        description = fields[1].strip()
                    else:
                        domain_id = fields[0].strip()
                        # The name can exceed the length allowed by the
                        # database; truncating it avoids a MySQL error.
                        name = basic.truncate_value(fields[1].strip(), 25,
                                                    "...")
                        description = ",".join(fields[2:]).strip()

                    # Statement for the domain table
                    domain_statement = INSERT_INTO_DOMAIN.format(
                        align.hit_id, domain_id, name, description)
                    results.append(domain_statement)

                    # Data for the gene_domain table
                    results.append({
                        "Translation": translation,
                        "HitID": align.hit_id,
                        "Expect": float(hsp.expect),
                        "QueryStart": int(hsp.query_start),
                        "QueryEnd": int(hsp.query_end)
                    })

    return results
Ejemplo n.º 7
0
    def search(self):
        """Run rpsblast on the proteins of self.PhageIDs and store the hits.

        The amino-acid FASTA obtained from phamerator_manage_db is written
        to self.query_filename and searched against self.rpsblast_db. Each
        HSP is inserted into the ``domain`` and ``gene_domain`` tables
        through the cursor self.c. A duplicate-key error (MySQL 1062) is
        printed and skipped; any other MySQL error terminates the process.
        """
        from Bio.Blast import NCBIXML
        from Bio.Blast.Applications import NcbirpsblastCommandline
        try:
            from StringIO import StringIO  # Python 2
        except ImportError:
            from io import StringIO  # Python 3

        def run_insert(sql_query):
            """Execute and commit one INSERT, tolerating duplicates only."""
            try:
                self.c.execute(sql_query)
                self.c.execute('COMMIT')
            except MySQLdb.Error as e:
                print(sql_query)
                print(e)
                # 1062 is a duplicate-key error: report it and carry on
                if e.args[0] != 1062:
                    print('exiting on error.')
                    sys.exit()

        fasta = phamerator_manage_db.get_fasta_aa(self.c,
                                                  self.PhageIDs,
                                                  include_drafts=True)
        with open(self.query_filename, 'w') as query_file:
            query_file.write(fasta)

        E_VALUE_THRESH = 0.001  # Adjust the expectation cut-off here

        # outfmt=5 is XML; no "out" is given, so the XML comes back as the
        # first element of the command's return value
        rps_output = NcbirpsblastCommandline(cmd='rpsblast+',
                                             query=self.query_filename,
                                             db=self.rpsblast_db,
                                             evalue=E_VALUE_THRESH,
                                             outfmt=5)()[0]

        for record in NCBIXML.parse(StringIO(rps_output)):
            # We want to ignore any queries with no search results:
            if not record.alignments:
                continue

            print("QUERY: %s..." % record.query.split(':')[0])
            for align in record.alignments:
                for hsp in align.hsps:
                    print(" %s HSP, e=%f, from position %i to %i" % (
                        align.hit_id, hsp.expect, hsp.query_start,
                        hsp.query_end))
                    print('inserting into database')
                    # Double quotes would terminate the quoted SQL values
                    align.hit_def = align.hit_def.replace("\"", "\'")

                    # hit_def is "<DomainID>, <Name>, <description...>" with
                    # the leading fields being optional
                    descList = align.hit_def.split(',')
                    DomainID = None
                    Name = None
                    if len(descList) >= 3:
                        DomainID, Name = descList[0], descList[1]
                        description = ','.join(descList[2:])
                    elif len(descList) == 2:
                        DomainID, description = descList[0], descList[1]
                    else:
                        description = descList[0]

                    # Strip each field on its own so a missing (None) field
                    # does not prevent the others from being stripped
                    if DomainID is not None:
                        DomainID = DomainID.strip()
                    if Name is not None:
                        Name = Name.strip()
                    description = description.strip()

                    # NOTE(review): the statements are built by string
                    # formatting, so a missing field is stored as the text
                    # "None"; consider parameterized queries.
                    run_insert(
                        """insert into domain (hit_id, DomainID, Name, description) VALUES ("%s", "%s", "%s", "%s")""" % (
                            align.hit_id, DomainID, Name, description))

                    run_insert(
                        """insert into gene_domain (GeneID, hit_id, expect, query_start, query_end) VALUES ("%s", "%s", %s, %s, %s)""" % (
                            record.query.split(':')[1], align.hit_id,
                            float(hsp.expect), int(hsp.query_start),
                            int(hsp.query_end)))

                    print(align.hit_def + "\n")
                    assert hsp.expect <= E_VALUE_THRESH
Ejemplo n.º 8
0
def search_conserved_domains(
    fasta_file_path: str,
    cd_ans_path: str,
    fasta_file_type: Optional[Union[FASTAType, str]] = None,
    cd_xml_path: Optional[str] = None,
    cd_txt_path: Optional[str] = None,
    cd_csv_path: Optional[str] = None,
) -> None:
    """perform conserved domain search for a given FASTA sequence and parse
    the results into multiple formats

    Every output that already exists on disk is left untouched, so the
    function can be re-run to produce only the missing files. The CSV output
    is derived from the text output and therefore reads ``cd_txt_path``.

    :param fasta_file_path: path to the FASTA file for domain search
    :type fasta_file_path: str
    :param cd_ans_path: path to the result in BLAST archive (ASN.1) format,
    preferably ended with '.ans'
    :type cd_ans_path: str
    :param fasta_file_type: FASTA file type, could be string or FASTAType
    defined in this file, or None, in which case the function will infer
    the FASTA type from the file extension
    :type fasta_file_type: Optional[Union[FASTAType, str]]
    :param cd_xml_path: optional path to the result in BLAST XML format,
    preferably ended with '.xml'
    :type cd_xml_path: Optional[str]
    :param cd_txt_path: optional path to the result in post-rpsblast in text
    format, preferably ended with '.txt'
    :type cd_txt_path: Optional[str]
    :param cd_csv_path: optional path to the result in post-rpsblast in CSV
    format, preferably ended with '.csv'
    :type cd_csv_path: Optional[str]
    :raises ValueError: if the FASTA file type cannot be converted to a
    FASTAType
    :raises NotImplementedError: if the FASTA file type is neither fna nor
    faa
    :return: None
    """

    # infer the FASTA file type if not explicitly given or given as string
    if not fasta_file_type:
        fasta_file_type = os.path.splitext(fasta_file_path)[1]
    if not isinstance(fasta_file_type, FASTAType):
        try:
            fasta_file_type = FASTAType(fasta_file_type)
        except ValueError:
            _error_msg = \
                'cannot parse the given FASTA file with extension ' \
                f'\'{fasta_file_type}\', which must be one of ' \
                f'{list(FASTAType.__members__.keys())}.'
            raise ValueError(_error_msg)

    # run the search only if the result BLAST archive (ASN.1) does not
    # already exist
    if not os.path.exists(cd_ans_path):
        if fasta_file_type == FASTAType.fna:
            rpsblast_cmd = NcbirpstblastnCommandline(
                query=fasta_file_path,
                **RPSTBLASTN_KWARGS,
            )
        elif fasta_file_type == FASTAType.faa:
            # NOTE(review): the same keyword arguments are used for rpsblast
            # and rpstblastn - confirm that this is intended
            rpsblast_cmd = NcbirpsblastCommandline(
                query=fasta_file_path,
                **RPSTBLASTN_KWARGS,
            )
        else:
            _error_msg = \
                'conserved domains search has not been implemented for ' \
                f'FASTA file type with extension \'{fasta_file_type.value}\'.'
            raise NotImplementedError(_error_msg)

        try:
            cd_ans, _ = rpsblast_cmd()
        except ApplicationError as __error:
            _warning_msg = f'error from rpsblast: {__error}; skipping ...'
            _LOGGER.warning(_warning_msg)
            return

        # write to result ASN.1 file if given
        if cd_ans_path:
            with open(cd_ans_path, 'w+') as _fh:
                _fh.write(cd_ans)

    # translate ASN.1 to XML format for easier Biopython parsing
    if cd_xml_path and (not os.path.exists(cd_xml_path)):
        formatter_cmd = NcbiblastformatterCommandline(
            archive=cd_ans_path,
            out=cd_xml_path,
            outfmt=5,
        )
        formatter_cmd()

    # post-rpsblast processing with rpsbproc and store in text format
    if cd_txt_path and (not os.path.exists(cd_txt_path)):
        rpsbproc_cmd = Popen(
            [
                'rpsbproc',
                '--infile',
                f'{cd_ans_path}',
                '--outfile',
                f'{cd_txt_path}',
                '--data-path',
                f'{CDD_DATA_DIR_PATH}',
                '--data-mode',
                'full',
                '--evalue',
                f'{RPSTBLASTN_KWARGS["evalue"]}',
                '--show-families',
                '--quiet',
            ],
            stdout=PIPE,
            stderr=PIPE,
            stdin=PIPE,
        )
        # communicate() drains the pipes while waiting for the child to
        # exit; calling wait() first can deadlock once a pipe buffer fills.
        rpsbproc_cmd.communicate()

    # parse the post-rpsblast processing results and store in CSV format
    if cd_csv_path and (not os.path.exists(cd_csv_path)):
        with open(cd_txt_path, 'r') as _fh:
            rpsbproc_output = _fh.read()
        rpsbproc_output_df = __parse_rpsbproc_output(rpsbproc_output)
        if rpsbproc_output_df is not None:
            rpsbproc_output_df.to_csv(cd_csv_path, index=False)
Ejemplo n.º 9
0
def search(geneid, translation, database, username, password, cd_db):
    """Search one translation against a CDD database and store the hits.

    The translation is written to /tmp/<geneid>.txt, rpsblast writes its XML
    output to /tmp/<geneid>_rps_out.xml, and every HSP is inserted into the
    ``domain`` and ``gene_domain`` tables of ``database`` on localhost.
    A duplicate-key error (MySQL 1062) is reported and skipped; any other
    MySQL error terminates the process with exit status 1.

    :param geneid: gene identifier; used for the file names, the FASTA
        header and the gene_domain row
    :param translation: protein sequence to query
    :param database: name of the MySQL database
    :param username: MySQL user name
    :param password: MySQL password
    :param cd_db: path/name of the conserved domain database
    """
    # IMPORT STUFF
    import sys

    import Bio

    from Bio.Blast.Applications import NcbirpsblastCommandline
    from Bio.Blast import NCBIXML
    import MySQLdb as mdb

    # DEFINE STUFF - Change variables here for executable
    rpsblast_exe = "/usr/bin/rpsblast+"
    query_filename = "/tmp/" + geneid + ".txt"
    output_filename = "/tmp/" + geneid + "_rps_out.xml"
    E_VALUE_THRESH = 0.001  # Adjust the expectation cut-off here

    # WRITE STUFF
    with open(query_filename, 'w') as query_file:
        query_file.write(">" + geneid + "\n" + translation)

    # Compile the rpsblast command that will be executed.
    # outfmt sets the format of the cdd data. 5 = XML format
    rps_command = NcbirpsblastCommandline(cmd=rpsblast_exe, db=cd_db,
                                          query=query_filename,
                                          evalue=E_VALUE_THRESH, outfmt=5,
                                          out=output_filename)
    rps_command()

    # PARSE STUFF
    with open(output_filename, "r") as output_handle:
        for record in NCBIXML.parse(output_handle):
            if not record.alignments:
                continue
            for align in record.alignments:
                for hsp in align.hsps:
                    # Double quotes would terminate the quoted SQL values
                    align.hit_def = align.hit_def.replace("\"", "\'")

                    # hit_def is "<DomainID>, <Name>, <description...>" with
                    # the leading fields being optional
                    descList = align.hit_def.split(',')
                    DomainID = None
                    Name = None
                    if len(descList) >= 3:
                        DomainID, Name = descList[0], descList[1]
                        description = ','.join(descList[2:])
                    elif len(descList) == 2:
                        DomainID, description = descList[0], descList[1]
                    else:
                        description = descList[0]

                    # Strip each field on its own so a missing (None) field
                    # does not prevent the others from being stripped
                    if DomainID is not None:
                        DomainID = DomainID.strip()
                    if Name is not None:
                        Name = Name.strip()
                    description = description.strip()

                    # NOTE(review): the statements are built by string
                    # formatting, so a missing field is stored as the text
                    # "None"; consider parameterized queries.
                    domain_sql = """insert ignore into domain (hit_id, DomainID, Name, description) VALUES ("%s", "%s", "%s", "%s")""" % (align.hit_id, DomainID, Name, description)
                    gene_domain_sql = """insert into gene_domain (geneid, hit_id, expect, query_start, query_end) VALUES ("%s", "%s", %s, %s, %s)""" % (geneid, align.hit_id, float(hsp.expect), int(hsp.query_start), int(hsp.query_end))

                    # Start from None so the cleanup below never touches a
                    # connection or cursor left over from an earlier hit
                    con = None
                    cur = None
                    try:
                        # Connect to mysql and post hit
                        con = mdb.connect('localhost', username, password,
                                          database)
                        cur = con.cursor()
                        cur.execute(domain_sql)
                        cur.execute(gene_domain_sql)
                    except mdb.Error as e:
                        if e.args[0] == 1062:
                            print("%s. This hit will be ignored." % e.args[1])
                        else:
                            sys.exit(1)
                    finally:
                        try:
                            assert hsp.expect <= E_VALUE_THRESH
                            if cur is not None:
                                cur.execute('COMMIT')
                        finally:
                            # Close the connection even if the check or the
                            # commit above fails
                            if con is not None:
                                con.close()