Example #1
0
	def runBlast(self, result_handle=None):
		"""Return the parsed BLAST records for this object's query.

		If result_handle is None, blastall is run with self.blast_exe,
		self.blast_prog, self.blast_db, self.blast_query and
		self.blast_processors, and its output handle is parsed. Otherwise
		the supplied handle (e.g. from a pre-run BLAST) is parsed as is.

		Returns whatever NCBIXML.parse yields for the handle.
		"""
		# Identity test: "== None" would dispatch to any __eq__ the handle defines.
		if result_handle is None:
			# The second handle returned by blastall is not used here.
			result_handle, _error_handle = NCBIStandalone.blastall(self.blast_exe, self.blast_prog, self.blast_db, self.blast_query, nprocessors=self.blast_processors)

		#kdrew: if we want to pre-run blast, just run this line
		return NCBIXML.parse(result_handle)
Example #2
0
def blast_2_files(input_filename,input_db):
    """Blast a stored query file against a stored database.

    Both names are appended to fastafile.PERMANENT_STORE. If no ".nin"
    file exists next to the database, fastafile.formatdb is called on it
    first. Returns the first of the two handles produced by blastall.
    """
    store = fastafile.PERMANENT_STORE
    db_path = store + input_db
    query_path = store + input_filename

    # No ".nin" file next to the database: format it before searching.
    index_path = db_path + ".nin"
    if not os.path.exists(index_path):
        fastafile.formatdb(db_path)

    output_handle, _unused_errors = NCBIStandalone.blastall(
        BLAST_EXE, BLAST_PROGRAM, db_path, query_path)
    return output_handle
Example #3
0
    def blastfile(self, filename):
        """Run blastall on filename and return its raw output.

        The executable, program and database come from self.blastexe,
        self.mode and self.dbname.

        Raises ValueError, carrying whatever blastall wrote to its error
        handle, when the output handle yields no data.
        """
        # run blast
        b_out, e_info = NCBIStandalone.blastall(self.blastexe, self.mode,
                                                self.dbname, filename)
        data = b_out.read()
        if not data:
            # Call form of raise: valid on Python 2 and 3, unlike "raise E, msg".
            raise ValueError('BLAST error: %s' % e_info.read())

        return data
Example #4
0
    def __find_partials(self, minimum_blast_length=0):
        """find partial IS elements by blasting the sequences against the
        genome

        Writes the genome and one sample IS per annotation set to temporary
        fasta files, formats the genome as a BLAST database, runs blastn and
        rebuilds self.annotations from the hits. A hit is kept when its
        e-value is below E_VALUE_CUTOFF and its subject length is at least
        MIN_PARTIAL_LEN and greater than minimum_blast_length.

        Returns None. Does nothing if self.annotations is empty.
        """
        #if there are no IS elements, skip this step
        if len(self.annotations) == 0:
            return

        blast_db = os.path.join(TEMPORARY_DIRECTORY, "OASIS_temp_genome.fasta")
        blast_file = os.path.join(TEMPORARY_DIRECTORY, "OASIS_temp_IS.fasta")

        try:
            #write a temporary genome fasta file
            with open(blast_db, "w") as outf:
                SeqIO.write(self.as_records(), outf, "fasta")
            #turn it into a database
            os.system(FORMAT_EXE + " -p F -i " + blast_db)

            #write a temporary IS fasta file
            self.__write_singles(blast_file)

            #get the directions of these sample IS's
            directions = [is_set.lst[0].direction
                          for is_set in self.annotations]

            #clear annotations
            self.annotations = []

            #perform a blast
            result_handle, error_handle = NCBIStandalone.blastall(
                BLAST_EXE, "blastn", blast_db, blast_file)
            blast_records = NCBIXML.parse(result_handle)

            #iterate over the results and the directions of the queries
            for record, sample_direction in zip(blast_records, directions):
                ISlist = []
                for alignment in record.alignments:
                    for hsp in alignment.hsps:
                        hit_len = len(hsp.sbjct)
                        if not (hsp.expect < E_VALUE_CUTOFF
                                and hit_len >= MIN_PARTIAL_LEN
                                and hit_len > minimum_blast_length):
                            continue
                        chromosome = alignment.title.split(" ")[1]
                        #sbjct_start is shifted down by one to get the start
                        start = hsp.sbjct_start - 1
                        end = start + hit_len
                        #find out what the gene is
                        f = self.get_feature(chromosome, start, end)
                        thisdir = hsp.frame[1] * sample_direction
                        ISlist.append(
                            IS.IS(f, chromosome, start, end, self,
                                  dir=thisdir))
                if len(ISlist) > 0:
                    self.annotations.append(ISSet.ISSet(ISlist, self.profile))
        finally:
            #clean up- remove the temporary files, also when a step above
            #failed; files that were never created are skipped
            leftovers = [blast_db, blast_file]
            leftovers.extend(glob.glob(blast_db + ".n*"))
            leftovers.append("formatdb.log")
            for path in leftovers:
                if os.path.exists(path):
                    os.remove(path)
Example #5
0
  def blast(self):
    '''aligns sequences using blast'''
    blastAppDir = self.blastAppDir
    blastDB = os.path.join(self.blastDataDir, 'blastDB.fasta')
    blastQueryFile = os.path.join(self.blastDataDir, 'filetoblast.txt')
    print 'path to filetoblast.txt:', blastQueryFile
    if sys.platform == 'win32':
      blastall_name = 'Blastall.exe'
    else:
      blastall_name = 'blastall'
    blast_exe = os.path.join(blastAppDir, blastall_name)
    if sys.platform == 'win32':
       import win32api
       blastDB = win32api.GetShortPathName(blast_db)
       blastQueryFile = win32api.GetShortPathName(blastQueryFile)
       blast_exe = win32api.GetShortPathName(blast_exe)
    blast_out, error_info = NCBIStandalone.blastall(blast_exe, 'blastp', blastDB, blastQueryFile, align_view=7)
    #print error_info.read()
    #print blast_out.read()
    blast_records = NCBIXML.parse(blast_out)
    results = []
    recordnumber = 0
    nonmatchingQueries = []
    while 1:
      recordnumber += 1
      try: b_record = blast_records.next()
      except StopIteration: break

      if not b_record:
        continue
      print 'query:', b_record.query
      e_value_thresh = 0.0001
      significant = False
      for alignment in b_record.alignments:
        bestHsp = None
        for hsp in alignment.hsps:
          if not bestHsp: bestHsp = hsp.expect
          elif bestHsp < hsp.expect: continue
          if hsp.expect < e_value_thresh:
            alignment.title = alignment.title.replace(">","")
            #if b_record.query != alignment.title:
            #print 'dir(alignment):', dir(alignment)
            #print 'hsps: ',alignment.hsps, 'accession:', alignment.accession, 'title:', alignment.title, 'length:', alignment.length
            if b_record.query != alignment.accession:
              significant = True
              print 'adding', b_record.query, 'and', alignment.accession, 'to matches (e value: ',hsp.expect, ', bit score: ', hsp.bits, ')'
              results.append((b_record.query, alignment.accession, hsp.expect, hsp.bits))
      print b_record.query, significant
      #if not significant:
      #  print 'adding', b_record.query, 'to the list of queries without matches'
      #  results.append((b_record.query, None, None))
    return results
Example #6
0
    def __find_partials(self, minimum_blast_length=0):
        """find partial IS elements by blasting the sequences against the
        genome

        The genome (self.as_records()) and the sample IS sequences are
        written to temporary fasta files, the genome is formatted as a BLAST
        database and blastn is run. self.annotations is then replaced by one
        ISSet per query that produced at least one accepted hit: e-value
        below E_VALUE_CUTOFF, subject length at least MIN_PARTIAL_LEN and
        greater than minimum_blast_length.

        Returns None; returns immediately if there are no annotations.
        """
        #if there are no IS elements, skip this step
        if len(self.annotations) == 0:
            return

        blast_db = os.path.join(TEMPORARY_DIRECTORY, "OASIS_temp_genome.fasta")
        blast_file = os.path.join(TEMPORARY_DIRECTORY, "OASIS_temp_IS.fasta")

        try:
            #write a temporary genome fasta file
            with open(blast_db, "w") as outf:
                SeqIO.write(self.as_records(), outf, "fasta")
            #turn it into a database
            os.system(FORMAT_EXE + " -p F -i " + blast_db)

            #write a temporary IS fasta file
            self.__write_singles(blast_file)

            #get the directions of these sample IS's
            directions = [is_set.lst[0].direction
                          for is_set in self.annotations]

            #clear annotations
            self.annotations = []

            #perform a blast
            result_handle, error_handle = NCBIStandalone.blastall(
                BLAST_EXE, "blastn", blast_db, blast_file)
            blast_records = NCBIXML.parse(result_handle)

            #iterate over the results and the directions of the queries
            for record, sample_direction in zip(blast_records, directions):
                ISlist = []
                for alignment in record.alignments:
                    for hsp in alignment.hsps:
                        sbjct_len = len(hsp.sbjct)
                        accepted = (hsp.expect < E_VALUE_CUTOFF
                                    and sbjct_len >= MIN_PARTIAL_LEN
                                    and sbjct_len > minimum_blast_length)
                        if not accepted:
                            continue
                        chromosome = alignment.title.split(" ")[1]
                        start = hsp.sbjct_start - 1
                        end = start + sbjct_len
                        #find out what the gene is
                        f = self.get_feature(chromosome, start, end)
                        thisdir = hsp.frame[1] * sample_direction
                        ISlist.append(IS.IS(f, chromosome, start, end, self,
                                            dir=thisdir))
                if len(ISlist) > 0:
                    self.annotations.append(ISSet.ISSet(ISlist, self.profile))
        finally:
            #clean up- always remove the temporary files, skipping any that
            #were never created
            temp_paths = [blast_db, blast_file]
            temp_paths.extend(glob.glob(blast_db + ".n*"))
            temp_paths.append("formatdb.log")
            for path in temp_paths:
                if os.path.exists(path):
                    os.remove(path)
Example #7
0
    def runBlast(self, result_handle=None):
        """Parse BLAST results, running blastall first when no handle is given.

        If a filehandle is given as input, its contents are parsed into
        blast records. If no filehandle is given (or it is None), a new
        blast is run with this object's settings (blast_exe, blast_prog,
        blast_db, blast_query, blast_processors) and its output is parsed.

        Output: the iterator over Record objects produced by NCBIXML.parse.
        """
        # "is None" rather than "== None": identity cannot be overridden by
        # the handle's own __eq__.
        if result_handle is None:
            result_handle, _error_handle = NCBIStandalone.blastall(
                self.blast_exe, self.blast_prog, self.blast_db, self.blast_query, nprocessors=self.blast_processors
            )

        # Parse and return blast records from given filehandle or new blast run.
        return NCBIXML.parse(result_handle)
Example #8
0
    def runBlast(self, result_handle=None):
        """Return parsed BLAST records from a handle or from a fresh run.

        With a filehandle, the BLAST results are read and parsed from it.
        Without one (or with None), blastall is invoked using self.blast_exe,
        self.blast_prog, self.blast_db, self.blast_query and
        self.blast_processors, and the resulting output handle is parsed.

        Output: the result of NCBIXML.parse on the chosen handle.
        """
        # Compare by identity; "== None" can be hijacked by a custom __eq__.
        if result_handle is None:
            blast_handles = NCBIStandalone.blastall(
                self.blast_exe,
                self.blast_prog,
                self.blast_db,
                self.blast_query,
                nprocessors=self.blast_processors)
            # Only the output handle is needed; the error handle is ignored.
            result_handle, _error_handle = blast_handles

        # Parse and return blast records from given filehandle or new blast run.
        return NCBIXML.parse(result_handle)
Example #9
0
    def identify_family(self, aaseq):
        """given an amino acid sequence, identify its family

        The sequence is written to a temporary fasta file and searched with
        blastp against self.tpase_file. Among HSPs with an e-value below
        TPASE_MAX_E_VALUE the one with the highest score is chosen, and the
        family and group are read from the 3rd and 4th "|"-separated fields
        of the second whitespace-separated token of its alignment title.

        Returns (family, group); both are None when no HSP qualifies.
        Raises Exception with the BLAST error output when the results cannot
        be parsed or contain no record.
        """
        blast_file = os.path.join(TEMPORARY_DIRECTORY, "profile_temp.fasta")

        try:
            with open(blast_file, "w") as outf:
                temp_record = SeqRecord.SeqRecord(id="temp", seq=aaseq)
                SeqIO.write([temp_record], outf, "fasta")

            result_handle, error_handle = NCBIStandalone.blastall(BLAST_EXE,
                                            "blastp", self.tpase_file, blast_file)

            try:
                record = next(NCBIXML.parse(result_handle))
            except (ValueError, StopIteration):
                # StopIteration (no record at all) is reported the same way
                # instead of leaking out of this method.
                raise Exception("BLAST Exception: " + error_handle.read())

            best_hsp = None
            best_alignment = None

            #pick the highest scoring hsp below the e-value cutoff
            for alignment in record.alignments:
                for hsp in alignment.hsps:
                    if hsp.expect < TPASE_MAX_E_VALUE:
                        if not best_hsp or hsp.score > best_hsp.score:
                            best_alignment = alignment
                            best_hsp = hsp

            #find family and group
            family = None
            group = None

            if best_hsp:
                fields = re.split(r"\s+", best_alignment.title)[1].split("|")
                #best_IS = self.__fetch_by_name(fields[0])
                family, group = fields[2], fields[3]
        finally:
            #clean up by removing temporary blast file, also on failure
            if os.path.exists(blast_file):
                os.remove(blast_file)

        return family, group
Example #10
0
    def identify_family(self, aaseq):
        """given an amino acid sequence, identify its family

        Writes aaseq to a temporary fasta file, runs blastp against
        self.tpase_file and keeps the highest-scoring HSP whose e-value is
        below TPASE_MAX_E_VALUE. Family and group are taken from fields 2 and
        3 (0-based) of the "|"-separated second token of that alignment's
        title.

        Returns (family, group), or (None, None) if nothing qualifies.
        Raises Exception carrying the BLAST error output if the result cannot
        be parsed or is empty.
        """
        blast_file = os.path.join(TEMPORARY_DIRECTORY, "profile_temp.fasta")

        try:
            with open(blast_file, "w") as outf:
                temp_record = SeqRecord.SeqRecord(id="temp", seq=aaseq)
                SeqIO.write([temp_record], outf, "fasta")

            result_handle, error_handle = NCBIStandalone.blastall(
                BLAST_EXE, "blastp", self.tpase_file, blast_file)

            try:
                record = next(NCBIXML.parse(result_handle))
            except (ValueError, StopIteration):
                # An empty result would otherwise surface as a bare
                # StopIteration in the caller.
                raise Exception("BLAST Exception: " + error_handle.read())

            best_hsp = None
            best_alignment = None

            #scan all hsps for the best one under the e-value cutoff
            for alignment in record.alignments:
                for hsp in alignment.hsps:
                    if hsp.expect >= TPASE_MAX_E_VALUE:
                        continue
                    if not best_hsp or hsp.score > best_hsp.score:
                        best_alignment = alignment
                        best_hsp = hsp

            #find family and group
            family = None
            group = None

            if best_hsp:
                fields = re.split(r"\s+", best_alignment.title)[1].split("|")
                #best_IS = self.__fetch_by_name(fields[0])
                family, group = fields[2], fields[3]
        finally:
            #clean up by removing temporary blast file, even after an error
            if os.path.exists(blast_file):
                os.remove(blast_file)

        return family, group
Example #11
0
    def blastOneBatchProbes(self, probe_id_seq_ls, blast_bin_path, database_fname, \
         tmp_blast_infname, min_no_of_identities=15, node_rank=0):
        """
        2010-4-14
            Blast one batch of probes and collect the sufficiently identical hits.

            probe_id_seq_ls: iterable of (probe_id, probe_seq) pairs, written
                to tmp_blast_infname in fasta format.
            blast_bin_path, database_fname: passed to blastall (blastn,
                align_view=7).
            min_no_of_identities: an hsp is kept when hsp.identities is at
                least this value.
            node_rank: only used in the progress messages.

            Returns a list of [query, alignment title, query_start,
            query_end, identities, sbjct_start, sbjct_end] entries, taken
            from at most the first 1000 alignments of every blast record.
        """
        result_ls = []
        # "with" closes the query file even if a write fails.
        with open(tmp_blast_infname, 'w') as inf:
            for probe_id, probe_seq in probe_id_seq_ls:
                inf.write(">%s\n" % probe_id)  # write the probe id
                inf.write("%s\n" % probe_seq)
        if self.report:
            sys.stderr.write("I'm %s, finished generating blast file for %s probes.\n"%\
               (node_rank, len(probe_id_seq_ls)))
        result_handle, error_info = NCBIStandalone.blastall(blast_bin_path,
                                                            "blastn",
                                                            database_fname,
                                                            tmp_blast_infname,
                                                            align_view=7)
        #error_info = error_info.read()	#2010-4-14 this read() causes program to hang out forever. ???
        #if error_info:
        #	sys.stderr.write("%s"%error_info)
        blast_records = NCBIXML.parse(result_handle)

        if self.report:
            sys.stderr.write("I'm %s, finished blasting.\n" % node_rank)
        for blast_record in blast_records:
            # top 1000 or the number of available alignments
            for alignment in blast_record.alignments[:1000]:
                alignment_title = alignment.title
                for hsp in alignment.hsps:
                    if hsp.identities >= min_no_of_identities:
                        #20104-25 hsp.strand is always (None, None), hsp.frame is either (1,1) or (1, -1) when the query's end < start
                        #[query name (probe id and pos) , alignment title , number of matches, pos in contig ]
                        result_ls.append([blast_record.query, alignment_title,
                                          hsp.query_start, hsp.query_end,
                                          hsp.identities, hsp.sbjct_start,
                                          hsp.sbjct_end])
        if self.report:
            sys.stderr.write("I'm %s, finished with %s blasts, got %s returns.\n"%\
                (node_rank, len(probe_id_seq_ls), len(result_ls)))
        return result_ls
Example #12
0
    def seqBlast(self, seqFile, blastType = "blastn", scoreMin = 1e-3, logFile = None):
        '''
        command line blast
        blastall -d database -i query -p blastn -o blastout
        '''
        
        if not os.path.exists(os.path.expanduser(seqFile)):
            print "(ignore) %s file not found" %(seqFile)
        
        if not os.path.exists(os.path.expanduser(self.blastDB + ".nsq")):
            print "(ignore) %s file not found" % (self.blastDB)
            
        (resultHandle,errorHandle) = NCBIStandalone.blastall(self.blastExe, blastType, self.blastDB, seqFile)       
        time.sleep(5)
        blastRecords = NCBIXML.parse(resultHandle)

        blastRecords = list(blastRecords)
        resultHandle.close()
        errorHandle.close()

        return blastRecords
Example #13
0
query_sequences = {} 
it = Bio.Fasta.Iterator(handle, Bio.Fasta.SequenceParser())
seq = it.next()
while seq:
  query_sequences[seq.description] = {} 
  query_sequences[seq.description]["number_of_hits"] = 0 
  print seq.description
  print query_sequences[seq.description]["number_of_hits"] 
  seq = it.next()

handle.close()




blast_out, error_handle = NCBIStandalone.blastall(blast_exe, blast_program, blast_db, blast_file)

#print error_handle 

records = NCBIXML.parse(blast_out)

#b_record = records.next() 
#
#
#E_VALUE_THRESH = 0.000000004
#
#print  dir(b_record)
#print  b_record.num_sequences
#print "Query = %s"  % b_record.query

#b_record = records.next() 
Example #14
0
documentation.
"""
# standard library
import os
import sys

# biopython
from Bio.Blast import NCBIStandalone

# Locations of the BLAST database, the query file and the blastall binary,
# all built relative to the current working directory.
my_blast_db = os.path.join(os.getcwd(), 'at-est', 'a_cds-10-7.fasta')
my_blast_file = os.path.join(os.getcwd(), 'at-est', 'test_blast',
                             'sorghum_est-test.fasta')
my_blast_exe = os.path.join(os.getcwd(), 'blast', 'blastall')

print 'Running blastall...'
# blastall hands back two handles, bound here as the output and the error info.
blast_out, error_info = NCBIStandalone.blastall(my_blast_exe, 'blastn',
                                                my_blast_db, my_blast_file)


# Parser used by the iterator below for each record.
b_parser = NCBIStandalone.BlastParser()

# Iterator over the blastall output, built with that parser.
b_iterator = NCBIStandalone.Iterator(blast_out, b_parser)

while 1:
    b_record = b_iterator.next()

    if b_record is None:
        break
    
    E_VALUE_THRESH = 0.04
    for alignment in b_record.alignments:
        for hsp in alignment.hsps:
Example #15
0
 def __init__(self, blastcmd, program, database, infile, **kargs):
     """Run blastall and initialise the reader on its output handle.

     Any 'align_view' keyword is discarded; all other keywords are passed
     through to NCBIStandalone.blastall unchanged.
     """
     # Discard a caller-supplied align_view before delegating.
     kargs.pop('align_view', None)
     handles = NCBIStandalone.blastall(
         blastcmd, program, database, infile, **kargs)
     blastout, blasterr = handles
     BlastOutputReader.__init__(self, blastout)
Example #16
0
    def runBlast(self, inputFname=None, databaseFname=None, outputFname=None, outputFnamePrefix=None, \
       blastallPath=None, minNoOfIdentities=None, \
       maxNoOfMismatches=None,\
       minIdentityPercentage=None, maxNoOfHits=10):
        """
        2012.8.19
            output xml dump if outputFnamePrefix is given.
        2012.5.23
            Run blastall (blastn, XML output) for inputFname against
            databaseFname and write the hsps that pass the filters to
            outputFname as a tab-delimited table with a header row.

            For every blast record at most maxNoOfHits alignments are
            examined. An hsp is written unless one of the filters that is
            not None rejects it:
                minNoOfIdentities: hsp.identities must not be smaller.
                maxNoOfMismatches: query_length - identities must not be larger.
                minIdentityPercentage: identities / query_length must not be
                    smaller.

            Returns None.

          -p  Program Name [String]
          -d  Database [String]
            default = nr
          -i  Query File [File In]
            default = stdin
          -e  Expectation value (E) [Real]
            default = 10.0

            blastall align_view option values:
                0 = pairwise,
                1 = query-anchored showing identities,
                2 = query-anchored no identities,
                3 = flat query-anchored, show identities,
                4 = flat query-anchored, no identities,
                5 = query-anchored no identities and blunt ends,
                6 = flat query-anchored, no identities and blunt ends,
                7 = XML Blast output,
                8 = tabular,
                9 tabular with comment lines
                10 ASN, text
                11 ASN, binary [Integer]
                    default = 0
                    range from 0 to 11

        """

        result_handle, error_info = NCBIStandalone.blastall(blastallPath,
                                                            "blastn",
                                                            databaseFname,
                                                            inputFname,
                                                            align_view=7)

        #blastn_cline = NcbiblastnCommandline(cmd=self.blastnPath, query=inputFname, db=databaseFname, evalue=0.001,\
        #									outfmt=5, out="opuntia.xml")	#outfmt 5 is xml output.

        #error_info = error_info.read()	#2010-4-14 this read() causes program to hang out forever. ???
        #if error_info:
        #	sys.stderr.write("%s"%error_info)
        if outputFnamePrefix:
            # Keep a copy of the raw XML on disk, then parse from memory
            # because the original handle has been consumed by read().
            blastContent = result_handle.read()
            with open('%s.xml' % (outputFnamePrefix), 'w') as xml_outf:
                xml_outf.write(blastContent)
            result_handle = cStringIO.StringIO(blastContent)
        blast_records = NCBIXML.parse(result_handle)

        if self.report:
            sys.stderr.write("finished blasting.\n")

        counter = 0
        # blast_records is an iterator, so the records are counted while they
        # are consumed (len() on it raises TypeError).
        no_of_records = 0
        header = ['queryID', "queryStart", "queryEnd", 'queryLength', 'targetChr', 'targetStart', 'targetStop', \
          'targetLength', 'noOfIdentities', \
          'noOfMismatches', 'identityPercentage']
        with open(outputFname, 'w') as outf:
            writer = csv.writer(outf, delimiter='\t')
            writer.writerow(header)
            for blast_record in blast_records:
                no_of_records += 1
                # top maxNoOfHits or the number of available alignments
                no_of_hits = min(maxNoOfHits, len(blast_record.alignments))
                # each alignment is one chromosome (=one fasta record).
                for i in range(no_of_hits):
                    alignment = blast_record.alignments[i]
                    targetChr = alignment.hit_def
                    targetLength = alignment.length
                    for hsp in alignment.hsps:
                        hitIsGood = True
                        noOfMismatches = blast_record.query_length - hsp.identities
                        identityPercentage = float(hsp.identities) / float(
                            blast_record.query_length)
                        if minNoOfIdentities is not None and hsp.identities < minNoOfIdentities:
                            hitIsGood = False
                        if maxNoOfMismatches is not None and noOfMismatches > maxNoOfMismatches:
                            hitIsGood = False
                        if minIdentityPercentage is not None and identityPercentage < minIdentityPercentage:
                            hitIsGood = False
                        if hitIsGood:
                            counter += 1
                            result_entry = [blast_record.query, hsp.query_start, hsp.query_end, blast_record.query_length,\
                               targetChr, hsp.sbjct_start, hsp.sbjct_end, targetLength, hsp.identities, noOfMismatches,\
                               identityPercentage]
                            #20104-25 hsp.strand is always (None, None), hsp.frame is either (1,1) or (1, -1) when the query's end < start
                            #[query name (probe id and pos) , alignment title , number of matches, pos in contig ]
                            writer.writerow(result_entry)
        if self.report:
            sys.stderr.write("%s blast records, %s pass the filter.\n"%\
                (no_of_records, counter))
 def do_blast_search(self):
   """Run blastp via blastall and keep both returned handles on self.

   Uses self.blast_exe, self.blast_db and self.blast_file; the handles are
   stored as self.result_handle and self.error_handle.
   """
   from Bio.Blast import NCBIStandalone
   handles = NCBIStandalone.blastall(self.blast_exe, "blastp",
                                     self.blast_db, self.blast_file)
   self.result_handle, self.error_handle = handles
Example #18
0
    def localBlast( self, seqFile, db, method='blastp',
                    resultOut=None, e='0.01', **kw ):
        """
        Perform a local blast search (requires that the blast binaries
        and databases are installed locally).
        Uses Bio.Blast.NCBIStandalone.blastall (Biopython) for the search.

        The raw blast output is copied to resultOut, the error output to
        self.outFolder + self.F_BLAST_ERROR, and the first parsed record is
        handed to __blast2dict together with db.

        @param seqFile: file name with search sequence in FASTA format
        @type  seqFile: str
        @param db: database(s) to search, e.g. ['swissprot', 'pdb']
        @type  db: [str]
        @param method: search program to use, e.g. 'blastp', 'fasta'
                       (default: blastp)
        @type  method: str
        @param e: expectation value cutoff (converted with str() before use)
        @type  e: float
        @param resultOut: save blast output to this new file
                          (default: self.outFolder + self.F_BLAST_RAW_OUT)
        @type  resultOut: str
        @param kw: optional keywords::
                --- Scoring ---
                matrix         Matrix to use (default BLOSUM62).
                gap_open       Gap open penalty (default 0).
                gap_extend     Gap extension penalty (default 0).

                --- Algorithm ---
                gapped         Whether to do a gapped alignment. T/F 
                                (default T)
                wordsize       Word size (blastp default 11).
                keep_hits      Number of best hits from a region to keep
                                (default off).
                xdrop          Dropoff value (bits) for gapped alignments
                                (blastp default 25).
                hit_extend     Threshold for extending hits (blastp default 11)

                --- Processing ---
                filter         Filter query sequence? (T/F, default F)
                restrict_gi    Restrict search to these GI's.
                believe_query  Believe the query defline? (T/F, default F)
                nprocessors    Number of processors to use (default 1).

                --- Formatting ---
                alignments     Number of alignments. (default 250)
        @type  kw: any

        @raise BlastError: if program call fails
        """
        results = err = p = None
        resultOut = resultOut or self.outFolder+ self.F_BLAST_RAW_OUT
        kw = self.__dictvalues2str( kw )
        e = str(e)

        try:
            if self.verbose:
                self.log.add('running blast...')

            results, err = NCBIStandalone.blastall( settings.blast_bin,
                                                    method, db, seqFile,
                                                    expectation=e,
                                                    align_view='7', ## XML output
                                                    **kw)

            results = self.__copyFileHandle(results, resultOut)
            err = self.__copyFileHandle(err, self.outFolder+self.F_BLAST_ERROR)

            if self.verbose:
                self.log.writeln('Raw blast output copied to: ' + resultOut  )

            ## only the first record of the parsed output is used
            parsed = next( NCBIXML.parse( results ) )

            self.__blast2dict( parsed, db )

        except Exception as why:
            self.log.add( T.lastErrorTrace() )
            ## NOTE(review): debugging aid kept from the original -- copies
            ## every local of this call into the module globals.
            globals().update( locals() )
            self.log.writeln('local namespace is pushed into global ')
            raise BlastError( str(why) ) 
Example #19
0
def getUniverseFromMSA(fname, propid, blastdb, ecutoff, use_subfamily_seeds):

    # executables
    uniqueseq = '/usr/bin/uniqueseq'
    blastall = '/usr/bin/blastall'
    fastacmd = '/usr/bin/fastacmd'

    if use_subfamily_seeds:
        f = open("/home/ruchira/pfam_subdir_dict.pkl")
        pfam_subdir_dict = cPickle.load(f)
        f.close()
        fam = os.path.split(os.getcwd())[1]
        if fam not in pfam_subdir_dict:
            print "Unknown PFAM family %s, exiting..." % fam
            sys.exit(1)
        seq_path_str = ' '.join(list(pfam_subdir_dict[fam]))
        os.system('cat %s > gufmsa.seeds.fa' % seq_path_str)
    else:
        # remove redundant sequences

        print 'making %s unique at %.4f proportion identical...' % (fname,
                                                                    propid),
        sys.stdout.flush()

        cmd = '%s gufmsa -alignfile %s -percentid %f &> /dev/null' % (
            uniqueseq, fname, propid)
        os.system(cmd)

        handle = open('gufmsa.a2m', 'r')
        outf = open('gufmsa.seeds.fa', 'w')

        seq_count = 0

        for x in SeqIO.parse(handle, 'fasta'):
            seq_count += 1

            header = x.description
            sequence = x.seq.tostring()

            print >> outf, '>%s' % header

            sequence = sequence.replace('-', '')
            sequence = sequence.replace('.', '')
            sequence = sequence.upper()

            print >> outf, sequence

        outf.close()
        handle.close()

        print 'done; %d sequences remaining.' % seq_count

    # blast non-redundant sequences

    print 'blasting remaining sequences against %s...' % blastdb,
    sys.stdout.flush()

    result_handle, error_handle = NCBIStandalone.blastall(blastall,
                                                          'blastp',
                                                          blastdb,
                                                          'gufmsa.seeds.fa',
                                                          expectation=ecutoff,
                                                          descriptions=10000,
                                                          alignments=10000,
                                                          filter=False)

    blasthit_ids = set([])

    for blast_record in NCBIXML.parse(result_handle):
        for alignment in blast_record.alignments:
            blasthit_ids.add(alignment.hit_id)

    outf = open('gufmsa.blasthits.ids', 'w')

    for x in blasthit_ids:
        print >> outf, x

    outf.close()

    print 'done; %d potential homologs found.' % len(blasthit_ids)

    # retrieve full-length sequences from the db

    print 'Retrieving full-length potential homolog sequences...',
    sys.stdout.flush()

    cmd = '%s -i gufmsa.blasthits.ids -d %s > TheUniverse.fa' % (fastacmd,
                                                                 blastdb)
    os.system(cmd)

    print 'done; universe is in universe.fa'
def blast(blastRootDirectory):
    if sys.platform == 'win32':
        blast_db = os.path.join(blastRootDirectory, 'blastDB.fasta')
    else:
        if not os.path.isdir('/tmp/BLAST'):
            print "making directory '/tmp/BLAST'"
            os.mkdir('/tmp/BLAST/')
        if not os.path.exists('/tmp/BLAST/formatdb'):
            shutil.copy(os.path.join(blastRootDirectory, 'formatdb'),
                        '/tmp/BLAST')
            print "copying 'formatdb' to '/tmp/BLAST/'"
        blast_db = os.path.join('/tmp/BLAST', 'blastDB.fasta')
    #print 'path to blastDB.fasta:', blast_db

    blast_file = os.path.join(blastRootDirectory, 'filetoblast.txt')
    #print 'path to filetoblast.txt:', blast_file

    if sys.platform == 'win32':
        blastall_name = 'Blastall.exe'
        blast_exe = os.path.join(blastRootDirectory, blastall_name)
    else:
        blastall_name = 'blastall'
        blast_exe = os.path.join(os.getcwd(), '../../BLAST/bin/',
                                 blastall_name)

    #print 'path to blastall:', blast_exe

    if sys.platform == 'win32':
        import win32api
        blast_db = win32api.GetShortPathName(blast_db)
        blast_file = win32api.GetShortPathName(blast_file)
        blast_exe = win32api.GetShortPathName(blast_exe)

    #cont = raw_input('blah')
    #try:
    blast_out, error_info = NCBIStandalone.blastall(blast_exe,
                                                    'blastp',
                                                    blast_db,
                                                    blast_file,
                                                    align_view=7)
    #except:
    #  f = open(blast_file, 'r')
    #  s = file.read()
    #  print s

    #print 'done BLASTing'

    print 'errors:', error_info.read()
    print 'blast output:', blast_out.read()

    b_parser = NCBIXML.BlastParser()
    #print 'got parser'

    b_record = b_parser.parse(blast_out)
    b_iterator = NCBIStandalone.Iterator(blast_out, b_parser)
    #print 'got iterator'
    results = []
    recordnumber = 0
    nonmatchingQueries = []
    while 1:
        recordnumber += 1
        b_record = b_iterator.next()

        if not b_record: break
        print 'query:', b_record.query
        if b_record is None:
            break
        e_value_thresh = 0.001
        print 'number of alignments:', len(b_record.alignments)
        significant = False
        for alignment in b_record.alignments:
            for hsp in alignment.hsps:
                if hsp.expect < e_value_thresh:
                    alignment.title = alignment.title.replace(">", "")
                    if b_record.query != alignment.title:
                        significant = True
                        print 'adding', b_record.query, 'and', alignment.title, 'to the list of matches'
                        results.append(
                            (b_record.query, alignment.title, hsp.expect))
        print b_record.query, significant
        if not significant:
            print 'adding', b_record.query, 'to the list of queries without matches'
            nonmatchingQueries.append(b_record.query)

    return nonmatchingQueries, results
def blast(blastRootDirectory):
  if sys.platform == 'win32':
    blast_db = os.path.join(blastRootDirectory, 'blastDB.fasta')
  else:
    if not os.path.isdir('/tmp/BLAST'):
      print "making directory '/tmp/BLAST'"
      os.mkdir('/tmp/BLAST/')
    if not os.path.exists('/tmp/BLAST/formatdb'):
      shutil.copy(os.path.join(blastRootDirectory,'formatdb'), '/tmp/BLAST')
      print "copying 'formatdb' to '/tmp/BLAST/'"
    blast_db = os.path.join('/tmp/BLAST', 'blastDB.fasta')
  #print 'path to blastDB.fasta:', blast_db
  
  blast_file = os.path.join(blastRootDirectory, 'filetoblast.txt')
  #print 'path to filetoblast.txt:', blast_file
  
  if sys.platform == 'win32':
    blastall_name = 'Blastall.exe'
    blast_exe = os.path.join(blastRootDirectory, blastall_name)
  else:
    blastall_name = 'blastall'
    blast_exe = os.path.join(os.getcwd(), '../../BLAST/bin/', blastall_name)

  #print 'path to blastall:', blast_exe
  
  if sys.platform == 'win32':
     import win32api
     blast_db = win32api.GetShortPathName(blast_db)
     blast_file = win32api.GetShortPathName(blast_file)
     blast_exe = win32api.GetShortPathName(blast_exe)
  
  #cont = raw_input('blah')
  #try: 
  blast_out, error_info = NCBIStandalone.blastall(blast_exe, 'blastp', blast_db, blast_file,  align_view=7)
  #except:
  #  f = open(blast_file, 'r')
  #  s = file.read()
  #  print s
  
  #print 'done BLASTing'
  
  print 'errors:', error_info.read()
  print 'blast output:', blast_out.read()
  
  b_parser = NCBIXML.BlastParser()
  #print 'got parser'
  
  b_record = b_parser.parse(blast_out)
  b_iterator = NCBIStandalone.Iterator(blast_out, b_parser)
  #print 'got iterator'
  results = []
  recordnumber = 0
  nonmatchingQueries = []
  while 1:
    recordnumber += 1
    b_record = b_iterator.next()
    
    if not b_record: break
    print 'query:', b_record.query
    if b_record is None:
      break
    e_value_thresh = 0.001
    print 'number of alignments:', len(b_record.alignments)
    significant = False
    for alignment in b_record.alignments:
      for hsp in alignment.hsps:
        if hsp.expect < e_value_thresh:
          alignment.title = alignment.title.replace(">","")
          if b_record.query != alignment.title:
            significant = True
            print 'adding', b_record.query, 'and', alignment.title, 'to the list of matches'
            results.append((b_record.query, alignment.title, hsp.expect))
    print b_record.query, significant
    if not significant:
      print 'adding', b_record.query, 'to the list of queries without matches'
      nonmatchingQueries.append(b_record.query)

  return nonmatchingQueries, results
Example #22
0
    def localBlast(self,
                   seqFile,
                   db,
                   method='blastp',
                   resultOut=None,
                   e='0.01',
                   **kw):
        """
        Perform a local blast search (requires that the blast binaries
        and databases are installed locally).
        Uses Bio.Blast.NCBIStandalone.blastall (Biopython) for the search.

        The search is run with XML output; only the first record of the
        parsed output is passed on to __blast2dict.

        @param seqFile: file name with search sequence in FASTA format
        @type  seqFile: str
        @param db: database(s) to search, e.g. ['swissprot', 'pdb']
        @type  db: [str]
        @param method: search program to use, e.g. 'blastp', 'fasta'
                       (default: blastp)
        @type  method: str
        @param e: expectation value cutoff; converted with str() before it
                  is handed to blastall (default: '0.01')
        @type  e: float or str
        @param resultOut: save blast output to this new file
                          (default: self.outFolder + self.F_BLAST_RAW_OUT)
        @type  resultOut: str
        @param kw: optional keywords::
                --- Scoring ---
                matrix         Matrix to use (default BLOSUM62).
                gap_open       Gap open penalty (default 0).
                gap_extend     Gap extension penalty (default 0).

                --- Algorithm ---
                gapped         Whether to do a gapped alignment. T/F
                                (default T)
                wordsize       Word size (blastp default 11).
                keep_hits      Number of best hits from a region to keep
                                (default off).
                xdrop          Dropoff value (bits) for gapped alignments
                                (blastp default 25).
                hit_extend     Threshold for extending hits (blastp default 11)

                --- Processing ---
                filter         Filter query sequence? (T/F, default F)
                restrict_gi    Restrict search to these GI's.
                believe_query  Believe the query defline? (T/F, default F)
                nprocessors    Number of processors to use (default 1).

                --- Formatting ---
                alignments     Number of alignments. (default 250)
        @type  kw: any

        @raise BlastError: if the program call fails; any Exception raised
                           while running, copying or parsing is converted
        """
        # Bound up front so these names are present in locals() when the
        # except branch below copies the local namespace into globals().
        # `p` is not used anywhere else in this method.
        results = err = p = None
        resultOut = resultOut or self.outFolder + self.F_BLAST_RAW_OUT
        # NOTE(review): judging by its name the helper stringifies the
        # option values -- confirm against its definition.
        kw = self.__dictvalues2str(kw)
        e = str(e)

        try:
            if self.verbose:
                self.log.add('running blast...')

            results, err = NCBIStandalone.blastall(
                settings.blast_bin,
                method,
                db,
                seqFile,
                expectation=e,
                align_view='7',  ## XML output
                **kw)

            # NOTE(review): presumably writes each stream to the given file
            # and returns a handle that can be read again -- verify.
            results = self.__copyFileHandle(results, resultOut)
            err = self.__copyFileHandle(err,
                                        self.outFolder + self.F_BLAST_ERROR)

            if self.verbose:
                self.log.writeln('Raw blast output copied to: ' + resultOut)

            # Only the first record yielded by the XML parser is used.
            parsed = NCBIXML.parse(results).next()

            self.__blast2dict(parsed, db)

        except Exception, why:
            self.log.add(T.lastErrorTrace())
            # Debugging aid: every local variable of this call (by name) is
            # copied into the module globals before the error is re-raised,
            # so local names here are visible at module level afterwards.
            globals().update(locals())
            self.log.writeln('local namespace is pushed into global ')
            raise BlastError(str(why))
Example #23
0
def dazheMpiBlast():
    """
	2010-4-14
		wrap all of dazhe's old code into this function
	"""
    blast_bin_path = '/home/cmb-01/yuhuang/bin/blast/bin/blastall'
    database_fname = '/home/cmb-01/dazhemen/Data/Ler/Cereon_Ath_Ler.fasta'

    probes = vardata.vardata()

    comm = MPI.world.duplicate()
    ppp = 10  # probes per processor
    accs = []

    genome = ref_genome()
    genome.load_chr()
    probes.readfromfile("/home/cmb-01/dazhemen/CNV/CNV_probelist.csv",
                        format=2)
    ppp = len(probes.data) / (comm.size - 1) + 1

    print "I'm %s of %s, finished loading\n" % (comm.rank, comm.size)

    if comm.rank == 0:  #Master node, reads the file etc
        outf = open("/home/cmb-01/dazhemen/CNV/ler_raw.csv", "w")
        outf.write(
            "Chromosome,Position,Probe_ID,Alignment_title,Number_matches\n")
        for dest in range(1, comm.size):
            data, source, tag = comm.receiveString(dest, None)
            print "I'm 0, collected data from %s\n" % source
            partial_result_list = cPickle.loads(data)
            for result_entry in partial_result_list:
                outf.write(result_entry[0].replace("_", ",") + "," +
                           ",".join([str(a) for a in result_entry[1:]]) + "\n")
    else:
        probe_index_start = (comm.rank - 1) * ppp
        result_ls = []
        if probe_index_start + ppp >= len(probes.data):
            ppp = len(probes.data) - probe_index_start
        if ppp != 0:
            tmp_blast_infname = '/home/cmb-01/dazhemen/CNV/tmp_blast/' + str(
                comm.rank)
            inf = open(tmp_blast_infname, 'w')
            for i in xrange(probe_index_start, probe_index_start + ppp):
                inf.write(">%s_%s_%s\n" %
                          (probes.data[i][0], probes.data[i][1],
                           probes.data[i][2][0]))  # write the probe id
                inf.write(
                    "%s\n" %
                    genome.readprobe(probes.data[i][0], probes.data[i][1]))
            inf.close()
            print "I'm %s, finished with generating blast file from probes %s to %s\n" % (
                comm.rank, probe_index_start, probe_index_start + ppp)
            result_handle, error_info = NCBIStandalone.blastall(
                blast_bin_path,
                "blastn",
                database_fname,
                tmp_blast_infname,
                align_view=7)
            blast_records = NCBIXML.parse(result_handle)
            print "I'm %s, finished with blasting\n" % comm.rank
            while 1:
                try:
                    blast_record = blast_records.next()
                except:
                    "I'm %s, finished with all records\n" % comm.rank
                    break
                no_of_hits = min(1000, len(blast_record.alignments))
                for i in range(no_of_hits):
                    alignment_title = blast_record.alignments[i].title
                    for hsp in blast_record.alignments[i].hsps:
                        result_entry = [
                            blast_record.query, alignment_title, 0, 0
                        ]  #[query name (probe id and pos) , alignment title , number of matches, pos in contig ]
                        if hsp.identities >= 15:
                            result_entry[2] = hsp.identities
                            result_entry[3] = hsp.sbjct_start
                            result_ls.append(result_entry)
            print "I'm %s, finished with parsing blast result\n" % comm.rank
        result_data = cPickle.dumps(result_ls)
        comm.send(result_data, 0, 0)
Example #24
0
#!/usr/bin/python

my_blast_db = "/home/kenglish/Data/Genomes/Databases/EST_Clade_A"
my_blast_file = "Record1.fasta"
my_blast_exe = "/usr/bin/blastall"

from Bio.Blast import NCBIStandalone
from Bio.Blast import NCBIXML


result_handle, error_handle = NCBIStandalone.blastall(my_blast_exe, "blastn", my_blast_db, my_blast_file)

#$blast_results = result_handle.read()
#print blast_results

from Bio.Blast import NCBIXML
blast_records = NCBIXML.parse(result_handle)

blast_record = blast_records.next()
print blast_record.alignments


Example #25
0
 def __init__(self, blastcmd, program, database, infile, **kargs):
     """Run blastall on infile and feed its output to BlastOutputReader.

     A caller-supplied 'align_view' option is discarded before the call;
     all other keyword options are passed through to blastall unchanged.
     """
     # Discard any caller-chosen output format.
     kargs.pop('align_view', None)
     report_handle, error_handle = NCBIStandalone.blastall(
         blastcmd, program, database, infile, **kargs)
     # Only the report stream is used; the error stream is not read here.
     BlastOutputReader.__init__(self, report_handle)