Example #1
0
def blast2data(filehandle):  ###This should be for blast-txt
    """Parse a plain-text BLAST report into a per-reference hit dictionary.

    Every HSP whose e-value does not exceed ``EVALUE_CUTOFF`` is stored under
    the identifier of the subject (reference) sequence it belongs to.

    Side effects on module-level state: a reference seen for the first time
    is appended to ``references`` and handed to ``refgenome2json``; its
    length is cached in ``refLengths`` through ``gi2length``.

    Args:
        filehandle: open handle on a BLAST text report.

    Returns:
        dict mapping reference id -> list of
        ``[start, identity_percent, end, readname]`` with ``start <= end``
        and the identity rounded to one decimal place.
    """
    data = {}
    blast_parser = NCBIStandalone.BlastParser()
    blast_iterator = NCBIStandalone.Iterator(filehandle, blast_parser)
    for blast_record in blast_iterator:
        readname = blast_record.query.split()[0]
        for alignment in blast_record.alignments:
            # title[1:] drops the first character of the title (the '>').
            title = alignment.title[1:].strip()
            if '|' in alignment.title:
                # Pipe-delimited title: the id is the second field.
                # The original overwrote this with fields[0] because the
                # fallback assignment was dedented out of the else branch.
                refgi = title.split('|')[1]
            else:
                refgi = title.split(' ')[0]
            for hsp in alignment.hsps:
                if hsp.expect > EVALUE_CUTOFF:
                    continue
                matched, aligned = hsp.identities[0], hsp.identities[1]
                identity = round(float(matched) * 100 / aligned, 1)
                # Subject coordinates may be reversed; store them ascending.
                start, end = hsp.sbjct_start, hsp.sbjct_end
                if start > end:
                    start, end = end, start
                if refgi not in data:
                    data[refgi] = []
                if refgi not in references:
                    references.append(refgi)
                    refgenome2json(refgi)
                if refgi not in refLengths:
                    refLengths[refgi] = gi2length(refgi)
                data[refgi].append([start, identity, end, readname])
    return data
def getCoordinatesFromBlo(bloFname, padding):
    '''
    Extract subject coordinates for the hits of a BLAST text report.

    NOTE(review): the report is always read from 'temp.blo'; ``bloFname`` and
    ``padding`` are accepted but never used. Confirm with the callers before
    wiring ``bloFname`` in.

    Returns a dict mapping each hit title (with every '>' removed) to
    ``[sbjct_start, sbjct_end]``. An HSP replaces the stored pair unless it
    lies entirely inside it.
    '''
    coord = {}
    blast_parser = NCBIStandalone.BlastParser()
    # The iterator reads lazily, so the handle must stay open for the whole
    # loop; the with-block guarantees it is closed afterwards.
    with open('temp.blo') as blo_handle:
        blast_iterator = NCBIStandalone.Iterator(blo_handle, blast_parser)
        # TODO: SearchIO.parse(handle, 'blast-txt') was tried as a
        # replacement but was not working yet.
        for hit in blast_iterator:
            for alignment in hit.alignments:
                fullName = alignment.title.replace('>', '')
                for hsp in alignment.hsps:
                    known = coord.get(fullName)
                    if (known is not None
                            and hsp.sbjct_start >= known[0]
                            and hsp.sbjct_end <= known[1]):
                        # Already covered by the stored range.
                        continue
                    coord[fullName] = [hsp.sbjct_start, hsp.sbjct_end]
    return coord
Example #3
0
def getHits(gene):
    ''' Report the first BLAST hits of every requested query.

    Reads the BLAST text report named by the module-level ``o.blast`` and
    writes tab-separated lines to the module-level ``out`` handle.

    Input: ``gene`` - dict mapping query id -> label. Entries are deleted
    from it as their query is met in the report, so after the call it only
    holds the queries that were not found.

    For each found query a header line with its label is written, then
    either one ``NA`` line (no alignments, or first HSP with e-value >= 5)
    or up to three hit lines tagged by e-value:
    ``High`` (< 0.0001), ``Low`` (< 1.0) or ``Scare`` (< 5.0).
    '''
    inf = open(o.blast, 'rU')
    try:
        error_parser = NCBIStandalone.BlastErrorParser(inf)
        iterator = NCBIStandalone.Iterator(inf, error_parser)

        ## *** Parsing *** ##
        lg = len(gene)
        if o.verbose == True:
            sys.stderr.write("\nGetting hits...\n")
        for record in iterator:
            query = record.query.split(" ")[0]
            if query not in gene:
                continue

            label = gene[query]
            out.write("%s\n" % label)
            if not record.alignments:
                # `is []` never matched, so this line was never written.
                out.write("%s\tNA\tNA\tNA\n" % label)
            else:
                flag = 0  # number of lines already written for this query
                for alignment in record.alignments:
                    for hsp in alignment.hsps:
                        expect = float(hsp.expect)
                        if flag < 3 and expect < 5.0:
                            if expect < 0.0001:
                                tier = "High"
                            elif expect < 1.0:
                                tier = "Low"
                            else:
                                tier = "Scare"
                            out.write("%s\t%s\t%s\t%s\n" %
                                      (label, alignment.title.split(">")[1],
                                       expect, tier))
                            flag += 1
                        elif expect > 1.0 and flag < 1:
                            out.write("%s\tNA\tNA\tNA\n" % label)
                            flag += 1
            del gene[query]
            if o.verbose == True:
                # Redraw the progress percentage in place.
                sys.stderr.write('\r')
                sys.stderr.write(str(int((lg - len(gene)) * 100 / lg)) + '%')
                sys.stderr.flush()
    finally:
        inf.close()

    # Whatever is left in `gene` never appeared in the report.
    if gene:
        sys.stderr.write("\nGenes not found:\n%s" % list(gene))
Example #4
0
	def runBlast(self, result_handle=None):
		"""Return an iterator over the BLAST records of a result handle.

		If ``result_handle`` is None a new blastall run is started with
		``self.blast_exe``, ``self.blast_prog``, ``self.blast_db``,
		``self.blast_query`` and ``self.blast_processors``, and its output
		handle is parsed. Otherwise the given handle is parsed as is.
		"""
		if result_handle is None:
			result_handle, error_handle = NCBIStandalone.blastall(self.blast_exe, self.blast_prog, self.blast_db, self.blast_query, nprocessors=self.blast_processors)

		#kdrew: if we want to pre-run blast, just run this line
		return NCBIXML.parse(result_handle)
Example #5
0
def blast_2_files(input_filename,input_db):
    """BLAST one stored file against a stored database.

    Both names are appended to ``fastafile.PERMANENT_STORE``. If the
    database has no ``.nin`` companion file yet, ``fastafile.formatdb`` is
    run on it first. Returns the output handle of the blastall run.
    """
    store = fastafile.PERMANENT_STORE
    database_path = store + input_db
    query_path = store + input_filename

    # presumably the .nin file marks an already formatted database
    index_path = database_path + ".nin"
    if not os.path.exists(index_path):
        fastafile.formatdb(database_path)

    blast_out, _unused_err = NCBIStandalone.blastall(
        BLAST_EXE, BLAST_PROGRAM, database_path, query_path)
    return blast_out
Example #6
0
    def blastfile(self, filename):
        """Run BLAST on ``filename`` and return the complete report text.

        blastall is invoked with ``self.blastexe``, ``self.mode`` and
        ``self.dbname``.

        Raises:
            ValueError: if BLAST produced no output; the message carries
                whatever BLAST wrote to its error handle.
        """
        # run blast
        b_out, e_info = NCBIStandalone.blastall(self.blastexe, self.mode,
                                                self.dbname, filename)
        data = b_out.read()
        if not data:
            # Call-style raise is valid on both Python 2 and Python 3.
            raise ValueError('BLAST error: %s' % e_info.read())

        return data
Example #7
0
    def __find_partials(self, minimum_blast_length=0):
        """Find partial IS elements by blasting the sequences against the
        genome.

        The genome (``self.as_records()``) is written to a temporary fasta
        file and formatted as a BLAST database, the sample IS sequences are
        written with ``self.__write_singles`` and blasted against it.
        ``self.annotations`` is then rebuilt: one ``ISSet`` per query that
        has at least one HSP passing ``E_VALUE_CUTOFF``, ``MIN_PARTIAL_LEN``
        and ``minimum_blast_length``.

        All temporary files are removed even if a step fails.
        """
        #if there are no IS elements, skip this step
        if len(self.annotations) == 0: return

        blast_db = os.path.join(TEMPORARY_DIRECTORY, "OASIS_temp_genome.fasta")
        blast_file = os.path.join(TEMPORARY_DIRECTORY, "OASIS_temp_IS.fasta")

        try:
            #write a temporary genome fasta file
            outf = open(blast_db, "w")
            try:
                SeqIO.write(self.as_records(), outf, "fasta")
            finally:
                outf.close()
            #turn it into a database
            os.system(FORMAT_EXE + " -p F -i " + blast_db)

            #write a temporary IS fasta file
            self.__write_singles(blast_file)

            #get the directions of these sample IS's
            directions = [is_set.lst[0].direction
                          for is_set in self.annotations]

            #clear annotations
            self.annotations = []

            #perform a blast
            result_handle, error_handle = NCBIStandalone.blastall(
                BLAST_EXE, "blastn", blast_db, blast_file)
            blast_records = NCBIXML.parse(result_handle)

            #iterate over the results and the directions of the queries
            for record, sample_direction in zip(blast_records, directions):
                ISlist = []
                for alignment in record.alignments:
                    for hsp in alignment.hsps:
                        if hsp.expect < E_VALUE_CUTOFF and len(hsp.sbjct) >= MIN_PARTIAL_LEN and \
                            len(hsp.sbjct) > minimum_blast_length:
                            chromosome = alignment.title.split(" ")[1]
                            # convert the 1-based BLAST start to 0-based
                            start = hsp.sbjct_start - 1
                            end = start + len(hsp.sbjct)
                            #find out what the gene is
                            f = self.get_feature(chromosome, start, end)
                            thisdir = hsp.frame[1] * sample_direction
                            ISlist.append(
                                IS.IS(f, chromosome, start, end, self,
                                      dir=thisdir))
                if len(ISlist) > 0:
                    self.annotations.append(ISSet.ISSet(ISlist, self.profile))
        finally:
            #clean up- remove the temporary files, skipping any that were
            #never created (e.g. formatdb.log when formatting failed)
            leftovers = [blast_db, blast_file] + glob.glob(blast_db + ".n*")
            leftovers.append("formatdb.log")
            for path in leftovers:
                if os.path.exists(path):
                    os.remove(path)
Example #8
0
def extract_sequences(file):
    """Return the sequences collected from the BLAST report at path ``file``.

    The report is fed through ``NCBIStandalone._Scanner`` into a
    ``SequencesExtractor`` consumer, whose ``sequences_list`` is returned.
    The file is closed even if scanning raises.
    """
    scanner = NCBIStandalone._Scanner()
    consumer = SequencesExtractor()

    with open(file, 'r') as file_to_parse:
        scanner.feed(file_to_parse, consumer)

    return consumer.sequences_list
Example #9
0
    def __find_partials(self, minimum_blast_length=0):
        """Find partial IS elements by blasting the sequences against the
        genome.

        The genome (``self.as_records()``) is written to a temporary fasta
        file and formatted as a BLAST database, the sample IS sequences are
        written with ``self.__write_singles`` and blasted against it.
        ``self.annotations`` is then rebuilt: one ``ISSet`` per query that
        has at least one HSP passing ``E_VALUE_CUTOFF``, ``MIN_PARTIAL_LEN``
        and ``minimum_blast_length``.

        All temporary files are removed even if a step fails.
        """
        #if there are no IS elements, skip this step
        if len(self.annotations) == 0: return

        blast_db = os.path.join(TEMPORARY_DIRECTORY, "OASIS_temp_genome.fasta")
        blast_file = os.path.join(TEMPORARY_DIRECTORY, "OASIS_temp_IS.fasta")

        try:
            #write a temporary genome fasta file
            outf = open(blast_db, "w")
            try:
                SeqIO.write(self.as_records(), outf, "fasta")
            finally:
                outf.close()
            #turn it into a database
            os.system(FORMAT_EXE + " -p F -i " + blast_db)

            #write a temporary IS fasta file
            self.__write_singles(blast_file)

            #get the directions of these sample IS's
            directions = [is_set.lst[0].direction
                          for is_set in self.annotations]

            #clear annotations
            self.annotations = []

            #perform a blast
            result_handle, error_handle = NCBIStandalone.blastall(BLAST_EXE,
                                            "blastn", blast_db, blast_file)
            blast_records = NCBIXML.parse(result_handle)

            #iterate over the results and the directions of the queries
            for record, sample_direction in zip(blast_records, directions):
                ISlist = []
                for alignment in record.alignments:
                    for hsp in alignment.hsps:
                        if hsp.expect < E_VALUE_CUTOFF and len(hsp.sbjct) >= MIN_PARTIAL_LEN and \
                            len(hsp.sbjct) > minimum_blast_length:
                            chromosome = alignment.title.split(" ")[1]
                            # convert the 1-based BLAST start to 0-based
                            start = hsp.sbjct_start-1
                            end = start + len(hsp.sbjct)
                            #find out what the gene is
                            f = self.get_feature(chromosome, start, end)
                            thisdir = hsp.frame[1] * sample_direction
                            ISlist.append(IS.IS(f, chromosome, start, end, self, dir=thisdir))
                if len(ISlist) > 0:
                    self.annotations.append(ISSet.ISSet(ISlist, self.profile))
        finally:
            #clean up- remove the temporary files, skipping any that were
            #never created (e.g. formatdb.log when formatting failed)
            leftovers = [blast_db, blast_file] + glob.glob(blast_db + ".n*")
            leftovers.append("formatdb.log")
            for path in leftovers:
                if os.path.exists(path):
                    os.remove(path)
Example #10
0
  def blast(self):
    '''aligns sequences using blast

    Runs blastp (XML output, align_view=7) for ``filetoblast.txt`` against
    ``blastDB.fasta``, both located in ``self.blastDataDir``, using the
    blastall binary found in ``self.blastAppDir``.

    Returns a list of ``(query, accession, e_value, bit_score)`` tuples for
    every HSP with an e-value below 0.0001 whose hit accession differs from
    the query. Progress is printed to stdout.
    '''
    blastAppDir = self.blastAppDir
    blastDB = os.path.join(self.blastDataDir, 'blastDB.fasta')
    blastQueryFile = os.path.join(self.blastDataDir, 'filetoblast.txt')
    print('path to filetoblast.txt: %s' % blastQueryFile)
    on_windows = sys.platform == 'win32'
    if on_windows:
      blastall_name = 'Blastall.exe'
    else:
      blastall_name = 'blastall'
    blast_exe = os.path.join(blastAppDir, blastall_name)
    if on_windows:
      import win32api
      # The original passed the undefined name `blast_db` here, which
      # raised NameError on Windows.
      blastDB = win32api.GetShortPathName(blastDB)
      blastQueryFile = win32api.GetShortPathName(blastQueryFile)
      blast_exe = win32api.GetShortPathName(blast_exe)
    blast_out, error_info = NCBIStandalone.blastall(blast_exe, 'blastp', blastDB, blastQueryFile, align_view=7)
    results = []
    e_value_thresh = 0.0001
    for b_record in NCBIXML.parse(blast_out):
      if not b_record:
        continue
      print('query: %s' % b_record.query)
      significant = False
      for alignment in b_record.alignments:
        # e-value of the first HSP of this alignment; later HSPs with a
        # larger e-value are skipped.
        bestHsp = None
        for hsp in alignment.hsps:
          # `is None` rather than truthiness: an e-value of 0.0 is falsy.
          if bestHsp is None:
            bestHsp = hsp.expect
          elif bestHsp < hsp.expect:
            continue
          if hsp.expect < e_value_thresh:
            alignment.title = alignment.title.replace(">","")
            if b_record.query != alignment.accession:
              significant = True
              print('adding %s and %s to matches (e value:  %s , bit score:  %s )'
                    % (b_record.query, alignment.accession, hsp.expect, hsp.bits))
              results.append((b_record.query, alignment.accession, hsp.expect, hsp.bits))
      print('%s %s' % (b_record.query, significant))
    return results
Example #11
0
 def __init__(self, dbname=None, blastexe=None, mode=None, parser=None):
     """Store the BLAST settings, filling in module defaults for None.

     dbname   -> DEFAULT_BLAST_DB
     blastexe -> DEFAULT_BLAST_EXE
     mode     -> DEFAULT_BLAST_MODE
     parser   -> a fresh NCBIStandalone.BlastParser()
     """
     chosen_db = DEFAULT_BLAST_DB if dbname is None else dbname
     chosen_exe = DEFAULT_BLAST_EXE if blastexe is None else blastexe
     chosen_mode = DEFAULT_BLAST_MODE if mode is None else mode
     chosen_parser = parser
     if chosen_parser is None:
         chosen_parser = NCBIStandalone.BlastParser()

     self.dbname = chosen_db
     self.blastexe = chosen_exe
     self.parser = chosen_parser
     self.mode = chosen_mode
Example #12
0
    def runBlast(self, result_handle=None):
        """Return an iterator over the BLAST records of a result handle.

        If ``result_handle`` is None a new blastall run is started with
        ``self.blast_exe``, ``self.blast_prog``, ``self.blast_db``,
        ``self.blast_query`` and ``self.blast_processors``, and its output
        handle is parsed. Otherwise the given handle is parsed as is.
        """
        # If no filehandle given, or filehandle given is None, run new blast.
        if result_handle is None:
            result_handle, error_handle = NCBIStandalone.blastall(
                self.blast_exe, self.blast_prog, self.blast_db, self.blast_query, nprocessors=self.blast_processors
            )

        # Parse and return blast records from given filehandle or new blast run.
        return NCBIXML.parse(result_handle)
Example #13
0
def blast_parse(file, e, output):
    """Write a tab-separated summary of the significant hits of a report.

    Args:
        file: path of a plain-text BLAST report.
        e: e-value threshold; only HSPs with ``expect < e`` are written.
        output: path of the summary file to create (overwritten).

    One header line is written, then one line per qualifying HSP holding
    the first 18 characters of the query, the hit title, the hit length
    and the e-value.
    """
    blast_parser = NCBIStandalone.BlastParser()

    with open(file) as result_handle:
        blast_iterator = NCBIStandalone.Iterator(result_handle, blast_parser)
        with open(output, 'w') as report:
            report.write('query title\tdescription\tlength\te value' + '\n')
            # Iterate from the very first record; the original consumed one
            # record with next() before the loop and so never reported it.
            for blast_record in blast_iterator:
                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        if hsp.expect < e:
                            report.write('%s \t%s\t%s\t%s\n' %
                                         (blast_record.query[:18],
                                          alignment.title,
                                          alignment.length,
                                          hsp.expect))
Example #14
0
def blastall_seq2db(header,
                    sequence,
                    dbname="",
                    blastprogram="blastp",
                    output="ncbiparsed",
                    extra_blastp_params={
                        'F': 'F',
                        'e': '10'
                    }):
    """
    BLAST a single sequence against a database through the blastall binary.

    The sequence is written to a temporary fasta file in the current
    working directory, blastall is run on it through a shell command, and
    the temporary file is removed again.

    Args:
        header: fasta header for the query (also used in the file name).
        sequence: query sequence string.
        dbname: database passed to ``-d``.
        blastprogram: one of blastp, tblastn, blastn, blastx.
        output: "ncbiparsed" to get the result of
            ``NCBIStandalone.BlastParser().parse``; anything else returns
            the raw report text.
        extra_blastp_params: extra ``-key value`` blastall options. The
            default dict is shared between calls and is only read here.

    Returns:
        The parsed record or raw text, or False when running or parsing
        BLAST failed.

    Raises:
        ValueError: if ``blastprogram`` is not supported.
    """
    supported = ['blastp', 'tblastn', 'blastn', 'blastx']
    if blastprogram not in supported:
        # String exceptions are a TypeError on Python >= 2.6.
        raise ValueError("unsupported blastprogram %r; expected one of %s" %
                         (blastprogram, ", ".join(supported)))

    extra_params = " ".join(
        ["-%s %s" % (k, v) for k, v in extra_blastp_params.items()])
    # generate (semi ;-) unique filename
    uniquetag = get_random_string_tag()
    fname = "_".join(
        [uniquetag,
         str(header).replace(" ", "_"), sequence[0:10] + ".fa"])
    fname = osPathJoin(OSgetcwd(), fname)
    fh = open(fname, 'w')
    try:
        fh.write(">%s\n%s\n" % (header, sequence))
    finally:
        fh.close()
    command = "%s -p %s %s -i %s -d %s " % (BLASTALL_PATH, blastprogram,
                                            extra_params, fname, dbname)
    try:
        ci, co, ce = osPopen3(command)
        ci.close()
        if output == "ncbiparsed":
            b_parser = NCBIStandalone.BlastParser()
            blastallout = b_parser.parse(co)
        else:
            blastallout = co.read()
        co.close()
        ce.close()
    except Exception:
        # for some kind of - obvious or freak accident case -
        # Blast or parsing of the blast record failed
        # No debugging here; just cleanup and return False
        print("BLAST CRASHED::")
        print(command)
        blastallout = False
    finally:
        # remove the created Query file, whatever happened above
        osRemove(fname)
    # and return!
    return blastallout
Example #15
0
    def runBlast(self, result_handle=None):
        """Return an iterator over the BLAST records of a result handle.

        If ``result_handle`` is None a new blastall run is started with
        ``self.blast_exe``, ``self.blast_prog``, ``self.blast_db``,
        ``self.blast_query`` and ``self.blast_processors``, and its output
        handle is parsed. Otherwise the given handle is parsed as is.
        """
        # If no filehandle given, or filehandle given is None, run new blast.
        if result_handle is None:
            result_handle, error_handle = NCBIStandalone.blastall(
                self.blast_exe,
                self.blast_prog,
                self.blast_db,
                self.blast_query,
                nprocessors=self.blast_processors)

        # Parse and return blast records from given filehandle or new blast run.
        return NCBIXML.parse(result_handle)
Example #16
0
    def identify_family(self, aaseq):
        """Given an amino acid sequence, identify its family.

        The sequence is written to a temporary fasta file and blasted
        (blastp) against ``self.tpase_file``. Among the HSPs with an e-value
        below ``TPASE_MAX_E_VALUE`` the one with the highest score wins; the
        family and group are taken from the 3rd and 4th '|'-separated fields
        of the second whitespace-separated token of its hit title.

        Returns:
            ``(family, group)``, or ``(None, None)`` if no HSP qualifies.

        Raises:
            Exception: if the BLAST output cannot be parsed (ValueError from
                the parser); the message carries BLAST's error output.

        The temporary file is removed even when BLAST or parsing fails.
        """
        blast_file = os.path.join(TEMPORARY_DIRECTORY, "profile_temp.fasta")
        outf = open(blast_file, "w")
        try:
            try:
                temp_record = SeqRecord.SeqRecord(id="temp", seq=aaseq)
                SeqIO.write([temp_record], outf, "fasta")
            finally:
                outf.close()

            #perform blast
            result_handle, error_handle = NCBIStandalone.blastall(BLAST_EXE,
                                            "blastp", self.tpase_file, blast_file)

            try:
                record = next(NCBIXML.parse(result_handle))
            except ValueError:
                raise Exception("BLAST Exception: " + error_handle.read())

            #pick the highest scoring HSP under the e-value cutoff
            best_hsp = None
            best_alignment = None
            for alignment in record.alignments:
                for hsp in alignment.hsps:
                    if hsp.expect < TPASE_MAX_E_VALUE:
                        if best_hsp is None or hsp.score > best_hsp.score:
                            best_alignment = alignment
                            best_hsp = hsp

            #find family and group
            family = None
            group = None

            if best_hsp is not None:
                fields = re.split(r"[\s\t]+", best_alignment.title)[1].split("|")
                family, group = fields[2], fields[3]

            return family, group
        finally:
            #clean up by removing temporary blast file
            if os.path.exists(blast_file):
                os.remove(blast_file)
Example #17
0
    def identify_family(self, aaseq):
        """Given an amino acid sequence, identify its family.

        The sequence is written to a temporary fasta file and blasted
        (blastp) against ``self.tpase_file``. Among the HSPs with an e-value
        below ``TPASE_MAX_E_VALUE`` the one with the highest score wins; the
        family and group are taken from the 3rd and 4th '|'-separated fields
        of the second whitespace-separated token of its hit title.

        Returns:
            ``(family, group)``, or ``(None, None)`` if no HSP qualifies.

        Raises:
            Exception: if the BLAST output cannot be parsed (ValueError from
                the parser); the message carries BLAST's error output.

        The temporary file is removed even when BLAST or parsing fails.
        """
        blast_file = os.path.join(TEMPORARY_DIRECTORY, "profile_temp.fasta")
        outf = open(blast_file, "w")
        try:
            try:
                temp_record = SeqRecord.SeqRecord(id="temp", seq=aaseq)
                SeqIO.write([temp_record], outf, "fasta")
            finally:
                outf.close()

            #perform blast
            result_handle, error_handle = NCBIStandalone.blastall(
                BLAST_EXE, "blastp", self.tpase_file, blast_file)

            try:
                record = next(NCBIXML.parse(result_handle))
            except ValueError:
                raise Exception("BLAST Exception: " + error_handle.read())

            #pick the highest scoring HSP under the e-value cutoff
            best_hsp = None
            best_alignment = None
            for alignment in record.alignments:
                for hsp in alignment.hsps:
                    if hsp.expect < TPASE_MAX_E_VALUE:
                        if best_hsp is None or hsp.score > best_hsp.score:
                            best_alignment = alignment
                            best_hsp = hsp

            #find family and group
            family = None
            group = None

            if best_hsp is not None:
                fields = re.split(r"[\s\t]+", best_alignment.title)[1].split("|")
                family, group = fields[2], fields[3]

            return family, group
        finally:
            #clean up by removing temporary blast file
            if os.path.exists(blast_file):
                os.remove(blast_file)
Example #18
0
 def blast(self,fasta,output):
     """
     Run blastpgp on ``fasta`` with ``self.alignment`` as the input
     alignment, drain the result handle with ``consume`` and return
     ``output``, the name that was passed as ``align_outfile``.
     """
     logger = runtime()
     logger.debug("Blasting %s with alignment %s using %s" %(fasta, self.alignment,self.blast_exe))
     # self.expect is used both as the expectation value and as the
     # model threshold.
     report_handle, _error_handle = NCBIStandalone.blastpgp(
         self.blast_exe, self.db, fasta,
         align_infile=self.alignment,
         align_outfile=output,
         expectation=self.expect,
         model_threshold=self.expect,
         npasses=3,
         nprocessors=1,
         **self.kwargs)
     consume(report_handle)
     return output
Example #19
0
 def blast(self, fasta, output):
     """
     Run blastpgp on ``fasta`` with ``self.alignment`` as the input
     alignment, drain the result handle with ``consume`` and return
     ``output``, the name that was passed as ``align_outfile``.
     """
     logger = runtime()
     logger.debug("Blasting %s with alignment %s using %s" %
                  (fasta, self.alignment, self.blast_exe))
     # self.expect is used both as the expectation value and as the
     # model threshold.
     report_handle, _error_handle = NCBIStandalone.blastpgp(
         self.blast_exe, self.db, fasta,
         align_infile=self.alignment,
         align_outfile=output,
         expectation=self.expect,
         model_threshold=self.expect,
         npasses=3,
         nprocessors=1,
         **self.kwargs)
     consume(report_handle)
     return output
Example #20
0
    def blastOneBatchProbes(self, probe_id_seq_ls, blast_bin_path, database_fname, \
         tmp_blast_infname, min_no_of_identities=15, node_rank=0):
        """
        2010-4-14
            Blast one batch of probes (blastn, XML output) against
            database_fname.

            probe_id_seq_ls: iterable of (probe_id, probe_seq) pairs, written
                as fasta to tmp_blast_infname.
            min_no_of_identities: an HSP is kept only if hsp.identities is
                at least this number.
            node_rank: only used in the progress messages, which are written
                to stderr when self.report is set.

            Returns a list of
            [query, alignment title, query_start, query_end, identities,
            sbjct_start, sbjct_end] entries, taken from at most the first
            1000 alignments of each record.
        """
        result_ls = []
        # with-block: the query file is closed even if a write fails
        with open(tmp_blast_infname, 'w') as inf:
            for probe_id, probe_seq in probe_id_seq_ls:
                inf.write(">%s\n" % probe_id)  # write the probe id
                inf.write("%s\n" % probe_seq)
        if self.report:
            sys.stderr.write("I'm %s, finished generating blast file for %s probes.\n"%\
               (node_rank, len(probe_id_seq_ls)))
        result_handle, error_info = NCBIStandalone.blastall(blast_bin_path,
                                                            "blastn",
                                                            database_fname,
                                                            tmp_blast_infname,
                                                            align_view=7)
        # 2010-4-14 calling error_info.read() here made the program hang
        # forever, so the error handle is deliberately left unread.
        blast_records = NCBIXML.parse(result_handle)

        if self.report:
            sys.stderr.write("I'm %s, finished blasting.\n" % node_rank)
        for blast_record in blast_records:
            # top 1000 or the number of available alignments
            for alignment in blast_record.alignments[:1000]:
                alignment_title = alignment.title
                for hsp in alignment.hsps:
                    if hsp.identities >= min_no_of_identities:
                        #20104-25 hsp.strand is always (None, None), hsp.frame is either (1,1) or (1, -1) when the query's end < start
                        #[query name (probe id and pos) , alignment title , number of matches, pos in contig ]
                        result_ls.append(
                            [blast_record.query, alignment_title,
                             hsp.query_start, hsp.query_end,
                             hsp.identities, hsp.sbjct_start, hsp.sbjct_end])
        if self.report:
            sys.stderr.write("I'm %s, finished with %s blasts, got %s returns.\n"%\
                (node_rank, len(probe_id_seq_ls), len(result_ls)))
        return result_ls
Example #21
0
    def seqBlast(self, seqFile, blastType = "blastn", scoreMin = 1e-3, logFile = None):
        '''
        command line blast
        blastall -d database -i query -p blastn -o blastout

        Runs blastall (self.blastExe) for seqFile against self.blastDB and
        returns the parsed records as a list.

        A missing query file or a missing "<blastDB>.nsq" file is only
        reported on stdout; the run is attempted anyway.
        scoreMin and logFile are accepted but not used.
        Both blastall handles are closed even if parsing fails.
        '''

        if not os.path.exists(os.path.expanduser(seqFile)):
            print("(ignore) %s file not found" %(seqFile))

        if not os.path.exists(os.path.expanduser(self.blastDB + ".nsq")):
            print("(ignore) %s file not found" % (self.blastDB))

        (resultHandle,errorHandle) = NCBIStandalone.blastall(self.blastExe, blastType, self.blastDB, seqFile)
        try:
            # NOTE(review): fixed delay kept from the original; its purpose
            # is not visible here - confirm whether it is still needed.
            time.sleep(5)
            blastRecords = list(NCBIXML.parse(resultHandle))
        finally:
            resultHandle.close()
            errorHandle.close()

        return blastRecords
Example #22
0
def blastall_file2db(fname,
                     dbname="",
                     blastprogram="blastp",
                     output="ncbiparsed",
                     extra_blastp_params={
                         'F': 'F',
                         'e': '10'
                     }):
    """
    Blast the sequence(s) in file `fname` against database `dbname`.

    fname               -- query file passed to blastall via -i (never removed)
    dbname              -- database passed to blastall via -d
    blastprogram        -- one of 'blastp', 'tblastn', 'blastn', 'tblastx'
    output              -- "ncbiparsed" returns the result of
                           NCBIStandalone.BlastParser().parse(); any other
                           value returns the raw blastall stdout text
    extra_blastp_params -- mapping of extra one-letter blastall flags to
                           values (read only, never modified here)

    Raises ValueError for an unsupported blastprogram and RuntimeError when
    reading or parsing the blastall output fails (the command and its stderr
    are printed first).
    """
    supported_programs = ['blastp', 'tblastn', 'blastn', 'tblastx']
    if blastprogram not in supported_programs:
        # String exceptions are not raisable on Python >= 2.6; use a real one.
        raise ValueError("blastprogram %r is not supported; choose one of: %s" %
                         (blastprogram, ", ".join(supported_programs)))

    extra_params = " ".join(
        ["-%s %s" % (k, v) for k, v in extra_blastp_params.items()])
    command = "%s -p %s %s -i %s -d %s " % (BLASTALL_PATH, blastprogram,
                                            extra_params, fname, dbname)
    # Start the process outside the try block: if this fails there are no
    # pipes to report from or to close.
    ci, co, ce = osPopen3(command)
    try:
        ci.close()
        if output == "ncbiparsed":
            b_parser = NCBIStandalone.BlastParser()
            blastallout = b_parser.parse(co)
        else:
            blastallout = co.read()
        # do NOT remove the input fname
    except Exception:
        error = ce.read().strip()
        print(command)
        print("ERROR: '%s'" % error)
        raise RuntimeError("BLAST CRASHED....")
    finally:
        co.close()
        ce.close()
    # and return!
    return blastallout
Example #23
0
 def __init__(self, blastcmd, program, database, infile, **kargs):
     """Run blastall and initialise the reader on its output handle.

     A caller-supplied 'align_view' keyword is dropped before the call, so
     blastall is always invoked without an explicit align_view.
     """
     kargs.pop('align_view', None)
     blastout, _blasterr = NCBIStandalone.blastall(
         blastcmd, program, database, infile, **kargs)
     # Only the first handle of the returned pair is handed to the reader.
     BlastOutputReader.__init__(self, blastout)
Example #24
0
 def __init__(self, blastcmd, program, database, infile, **kargs):
     """Launch blastall and set up BlastOutputReader on the result handle.

     Any 'align_view' entry in the keyword arguments is discarded first, so
     it is never forwarded to NCBIStandalone.blastall.
     """
     # pop() with a default removes the key when present and is a no-op
     # otherwise.
     kargs.pop('align_view', None)
     handles = NCBIStandalone.blastall(blastcmd, program, database, infile,
                                       **kargs)
     blastout, _unused_error_handle = handles
     BlastOutputReader.__init__(self, blastout)
Example #25
0
    def localPSIBlast( self, seqFile, db, method='blastp',
                       resultOut=None, e='0.001', **kw ):
        """
        Perform a local psi-blast search (requires that the blast binaries
        and databases are installed locally).
        Uses Bio.Blast.NCBIStandalone.blastpgp (Biopython) for the search.
        The first parsed record is passed, together with db, to the
        private __blast2dict helper; nothing is returned.

        @param seqFile: file name with search sequence in FASTA format
        @type  seqFile: str
        @param db: database(s) to search e.g. ['swissprot', 'pdb']
        @type  db: [str]
        @param method: accepted but not referenced in the body of this method
        @type  method: str
        @param e: expectation value cutoff (default: 0.001); converted with
                  str() before it is passed to blastpgp
        @type  e: float or str
        @param resultOut: save blast output to this new file (default:
                          self.outFolder + self.F_BLAST_RAW_OUT)
        @type  resultOut: str

        @param kw: optional keywords::
            --- New Blast+ routine ---
            (see NcbipsiblastCommandline)

            num_iterations   Number of passes (default 1).
            matrix           Matrix to use (default BLOSUM62).

            --- old blastall routine ---
            --- Scoring ---
            matrix           Matrix to use (default BLOSUM62).
            gap_open         Gap open penalty (default 11).
            gap_extend       Gap extension penalty (default 1).
            window_size      Multiple hits window size (default 40).
            npasses          Number of passes (default 1).
            passes           Hits/passes (Integer 0-2, default 1).

            --- Algorithm ---
            gapped           Whether to do a gapped alignment (T/F, default T).
            wordsize         Word size (default 3).
            keep_hits        Number of best hits from a region to keep (def 0)
            xdrop            Dropoff value (bits) for gapped alignments
                             (def 15)
            hit_extend       Threshold for extending hits (default 11).
            nbits_gapping    Number of bits to trigger gapping (default 22).
            pseudocounts     Pseudocounts constants for multiple passes
                             (def 9).
            xdrop_final      X dropoff for final gapped alignment (default 25).
            xdrop_extension  Dropoff for blast extensions (default 7).
            model_threshold  E-value threshold to include in multipass model
                             (default 0.005).
            required_start   Start of required region in query (default 1).
            required_end     End of required region in query (default -1).

            --- Processing ---
            filter           Filter query sequence with SEG? (T/F, default F)
            believe_query    Believe the query defline? (T/F, default F)
            nprocessors      Number of processors to use (default 1).

            --- Formatting ---
            alignments       Number of alignments (default 250).
        @type  kw: any

        @raise BlastError: if program call fails (any exception raised in
                           the search/parse section is re-raised as
                           BlastError carrying str() of the original)
        """
        ## the following should work for new Blast+ tools:
        
        #from Bio.Blast.Applications import NcbipsiblastCommandline

        #resultOut = resultOut or self.outFolder+ self.F_BLAST_RAW_OUT
        #blastx_cline = NcbipsiblastCommandline(query=seqFile, 
                                               #db=db, 
                                               #evalue=e,
                                               #outfmt=5, 
                                               #out=resultOut,
                                               #**kw)
        #stdout, stderr = blastx_cline()
        #parsed = NCBIXML.parse( results ).next()
        #self.__blast2dict( parsed, db )
        
        ## pre-bind both names so they exist even if blastpgp itself raises
        results = err = None
        resultOut = resultOut or self.outFolder+ self.F_BLAST_RAW_OUT
        ## presumably converts the keyword values to strings (judging by the
        ## helper's name) -- verify against __dictvalues2str
        kw = self.__dictvalues2str( kw )
        e = str(e)

        try:
            results, err = NCBIStandalone.blastpgp( settings.psi_blast_bin,
                                                    db, seqFile,
                                                    program='blastpgp',
                                                    align_view='7', ## XML output
                                                    expectation=e, **kw)

            ## both handles are replaced by whatever __copyFileHandle returns;
            ## the log message below indicates the data is copied to the
            ## given file name
            results = self.__copyFileHandle(results,resultOut )
            err = self.__copyFileHandle(err, self.outFolder+self.F_BLAST_ERROR)

            if self.verbose:
                self.log.writeln('Raw blast output copied to: ' + resultOut )

            ## only the first record yielded by the XML parser is used
            parsed = NCBIXML.parse( results ).next()

            self.__blast2dict( parsed, db )

        except Exception, why:
            self.log.add( T.lastErrorTrace() )
            ## debugging aid: every local name of this call is copied into the
            ## module globals, so local names here are visible module-wide
            globals().update( locals() )
            self.log.writeln('local namespace is pushed into global ')
            raise BlastError( str(why) ) 
Example #26
0
        handle = func("/somewhere/blast", "blastz", "nr",
                      "/tmp/example.fasta", \
                      nprocessors=4,
                      expectation="0.001",
                      filter= "F > /etc/passwd'")
        assert False, "Attempted output redirection not caught!"
    except ValueError, e:
        assert str(e) == "Rejecting suspicious argument for filter"
        #Good


### _Scanner

print("Running tests on _Scanner")

scanner = NCBIStandalone._Scanner()
# Every file in all_tests is fed through a plain AbstractConsumer first,
# then every file in detailed_tests through a TaggingConsumer.
for test_names, consumer_factory in ((all_tests, ParserSupport.AbstractConsumer),
                                     (detailed_tests, ParserSupport.TaggingConsumer)):
    for test in test_names:
        print("%s TESTING %s" % ("*" * 50, test))
        datafile = os.path.join("Blast", test)
        scanner.feed(open(datafile), consumer_factory())

### BlastParser

print("Running tests on BlastParser")

parser = NCBIStandalone.BlastParser()
Example #27
0
    def localBlast(self,
                   seqFile,
                   db,
                   method='blastp',
                   resultOut=None,
                   e='0.01',
                   **kw):
        """
        Perform a local blast search (requires that the blast binaries
        and databases are installed locally).
        Uses Bio.Blast.NCBIStandalone.blastall (Biopython) for the search.
        The first parsed record is passed, together with db, to the
        private __blast2dict helper; nothing is returned.

        @param seqFile: file name with search sequence in FASTA format
        @type  seqFile: str
        @param db: database(s) to search, e.g. ['swissprot', 'pdb']
        @type  db: [str]
        @param method: search program to use, e.g. 'blastp', 'fasta'
                       (default: blastp)
        @type  method: str
        @param e: expectation value cutoff; converted with str() before it
                  is passed to blastall
        @type  e: float or str
        @param resultOut: save blast output to this new file (default:
                          self.outFolder + self.F_BLAST_RAW_OUT)
        @type  resultOut: str
        @param kw: optional keywords::
                --- Scoring ---
                matrix         Matrix to use (default BLOSUM62).
                gap_open       Gap open penalty (default 0).
                gap_extend     Gap extension penalty (default 0).

                --- Algorithm ---
                gapped         Whether to do a gapped alignment. T/F
                                (default T)
                wordsize       Word size (blastp default 11).
                keep_hits      Number of best hits from a region to keep
                                (default off).
                xdrop          Dropoff value (bits) for gapped alignments
                                (blastp default 25).
                hit_extend     Threshold for extending hits (blastp default 11)

                --- Processing ---
                filter         Filter query sequence? (T/F, default F)
                restrict_gi    Restrict search to these GI's.
                believe_query  Believe the query defline? (T/F, default F)
                nprocessors    Number of processors to use (default 1).

                --- Formatting ---
                alignments     Number of alignments. (default 250)
        @type  kw: any

        @raise BlastError: if program call fails (any exception raised in
                           the search/parse section is re-raised as
                           BlastError carrying str() of the original)
        """
        ## pre-bind the names so they exist even if blastall itself raises;
        ## p is not used anywhere else in this method
        results = err = p = None
        resultOut = resultOut or self.outFolder + self.F_BLAST_RAW_OUT
        ## presumably converts the keyword values to strings (judging by the
        ## helper's name) -- verify against __dictvalues2str
        kw = self.__dictvalues2str(kw)
        e = str(e)

        try:
            if self.verbose:
                self.log.add('running blast...')

            results, err = NCBIStandalone.blastall(
                settings.blast_bin,
                method,
                db,
                seqFile,
                expectation=e,
                align_view='7',  ## XML output
                **kw)

            ## both handles are replaced by whatever __copyFileHandle returns;
            ## the log message below indicates the data is copied to the
            ## given file name
            results = self.__copyFileHandle(results, resultOut)
            err = self.__copyFileHandle(err,
                                        self.outFolder + self.F_BLAST_ERROR)

            if self.verbose:
                self.log.writeln('Raw blast output copied to: ' + resultOut)

            ## only the first record yielded by the XML parser is used
            parsed = NCBIXML.parse(results).next()

            self.__blast2dict(parsed, db)

        except Exception, why:
            self.log.add(T.lastErrorTrace())
            ## debugging aid: every local name of this call is copied into the
            ## module globals, so local names here are visible module-wide
            globals().update(locals())
            self.log.writeln('local namespace is pushed into global ')
            raise BlastError(str(why))
Example #28
0
 def __init__(self, handle):
     """Store the handle and build a record iterator over it."""
     self.handle = handle
     # The parser is only needed by the iterator, so it is created inline.
     self.blast_iter = NCBIStandalone.Iterator(handle,
                                               NCBIStandalone.BlastParser())
Example #29
0
 def __init__(self, handle):
     """Keep the input handle and wrap it in an NCBIStandalone.Iterator
     that uses a fresh BlastParser."""
     self.handle = handle
     record_parser = NCBIStandalone.BlastParser()
     self.blast_iter = NCBIStandalone.Iterator(self.handle, record_parser)
Example #30
0
import time
from multiprocessing import Process, Queue
import string
from Bio.Seq import Seq
from Bio.Blast import NCBIStandalone

import fileinput
import glob

# Scan a BLAST report and write one tab-separated line (query, hit title,
# mismatch count, gap count) for every HSP that has at least one mismatch.
OutFile = r'Blast\out\02.blast_result_total.txt'

# Both files are opened with `with` so the report is flushed and closed even
# if parsing fails part-way (the output file was never closed before).
with open(OutFile, 'r') as f:
    with open('Mismatch_total_LOD v3_171103.txt', 'w') as Mismatch_total_file:
        blast_parser = NCBIStandalone.BlastParser()
        print(blast_parser)
        iterator = NCBIStandalone.Iterator(f, blast_parser)

        for record in iterator:
            for alignment in record.alignments:
                for hsp in alignment.hsps:
                    # NOTE(review): hsp.gaps[0] appears to be None when the
                    # report line carries no "Gaps" field; count that as 0
                    # instead of failing on int + None -- confirm.
                    Gaps_Number = hsp.gaps[0] or 0
                    # aligned length minus (identities + gaps)
                    mismatch_Number = (hsp.identities[1] -
                                       (hsp.identities[0] + Gaps_Number))
                    if mismatch_Number != 0:
                        Mismatch_total_file.write(
                            '%s\t%s\t%s\t%s\n' %
                            (record.query, alignment.title,
                             mismatch_Number, Gaps_Number))
Example #31
0
starttime = time.time()

print 'retrieving homologous sequences from UniProt using PSI-BLAST...',
sys.stdout.flush()

## execute blastpgp ##

blastexe = '/usr/bin/blastpgp'
blastdb = '/clusterfs/ohana/external/UniProt/current/protein'
iters = 4
eval = 0.0001
maxseqs = 1000

results, errors = NCBIStandalone.blastpgp(blastexe,
                                          blastdb,
                                          seedfname,
                                          expectation=eval,
                                          alignments=maxseqs,
                                          npasses=iters)

## parse psiblast hits to get ids ##

blasthits = set([])

for blast_record in NCBIXML.parse(results):
    for alignment in blast_record.alignments:
        blasthits.add(alignment.hit_id)

if len(blasthits) < 3:
    print 'Sorry, only %d homologs were retrieved from UniProt, too few sequences to determine patterns of evolutionary conservation.' % len(
        blasthits)
    sys.exit(0)
Example #32
0
def dazheMpiBlast():
    """
    2010-4-14
        wrap all of dazhe's old code into this function

    Every rank loads the genome and the probe list.  Rank 0 then collects
    one pickled result list from each other rank and writes it to
    ler_raw.csv.  Every other rank writes its slice of probes to a FASTA
    file, runs blastall (blastn, XML output) on it and sends rank 0 a
    pickled list of
    [query name, alignment title, number of matches, position in contig]
    for every HSP with at least 15 identities.
    """
    blast_bin_path = '/home/cmb-01/yuhuang/bin/blast/bin/blastall'
    database_fname = '/home/cmb-01/dazhemen/Data/Ler/Cereon_Ath_Ler.fasta'

    probes = vardata.vardata()

    comm = MPI.world.duplicate()

    genome = ref_genome()
    genome.load_chr()
    probes.readfromfile("/home/cmb-01/dazhemen/CNV/CNV_probelist.csv",
                        format=2)
    # probes per processor; rank 0 only collects, hence comm.size - 1
    ppp = len(probes.data) // (comm.size - 1) + 1

    print("I'm %s of %s, finished loading\n" % (comm.rank, comm.size))

    if comm.rank == 0:  #Master node, reads the file etc
        outf = open("/home/cmb-01/dazhemen/CNV/ler_raw.csv", "w")
        try:
            outf.write(
                "Chromosome,Position,Probe_ID,Alignment_title,Number_matches\n")
            for dest in range(1, comm.size):
                data, source, tag = comm.receiveString(dest, None)
                print("I'm 0, collected data from %s\n" % source)
                # NOTE(review): cPickle.loads executes whatever the peer
                # sent; acceptable only because the sender is this program.
                partial_result_list = cPickle.loads(data)
                for result_entry in partial_result_list:
                    # the query name is "<a>_<b>_<c>"; split it into columns
                    outf.write(result_entry[0].replace("_", ",") + "," +
                               ",".join([str(a) for a in result_entry[1:]]) + "\n")
        finally:
            # make sure the csv is flushed and closed (it never was before)
            outf.close()
    else:
        probe_index_start = (comm.rank - 1) * ppp
        result_ls = []
        if probe_index_start + ppp >= len(probes.data):
            ppp = len(probes.data) - probe_index_start
        # ppp goes negative for ranks past the end of the probe list; those
        # ranks have nothing to blast and just send an empty result.
        if ppp > 0:
            tmp_blast_infname = '/home/cmb-01/dazhemen/CNV/tmp_blast/' + str(
                comm.rank)
            inf = open(tmp_blast_infname, 'w')
            try:
                for i in xrange(probe_index_start, probe_index_start + ppp):
                    inf.write(">%s_%s_%s\n" %
                              (probes.data[i][0], probes.data[i][1],
                               probes.data[i][2][0]))  # write the probe id
                    inf.write(
                        "%s\n" %
                        genome.readprobe(probes.data[i][0], probes.data[i][1]))
            finally:
                inf.close()
            print("I'm %s, finished with generating blast file from probes %s to %s\n" % (
                comm.rank, probe_index_start, probe_index_start + ppp))
            result_handle, error_info = NCBIStandalone.blastall(
                blast_bin_path,
                "blastn",
                database_fname,
                tmp_blast_infname,
                align_view=7)
            blast_records = NCBIXML.parse(result_handle)
            print("I'm %s, finished with blasting\n" % comm.rank)
            while 1:
                try:
                    blast_record = blast_records.next()
                except StopIteration:
                    # this message was a bare expression and never printed
                    print("I'm %s, finished with all records\n" % comm.rank)
                    break
                except Exception:
                    # Stay best-effort as before (rank 0 blocks until every
                    # rank has sent something) but no longer hide the error.
                    import traceback
                    traceback.print_exc()
                    break
                no_of_hits = min(1000, len(blast_record.alignments))
                for alignment in blast_record.alignments[:no_of_hits]:
                    for hsp in alignment.hsps:
                        if hsp.identities >= 15:
                            #[query name (probe id and pos) , alignment title , number of matches, pos in contig ]
                            result_ls.append([
                                blast_record.query, alignment.title,
                                hsp.identities, hsp.sbjct_start
                            ])
            print("I'm %s, finished with parsing blast result\n" % comm.rank)
        result_data = cPickle.dumps(result_ls)
        comm.send(result_data, 0, 0)
Example #33
0
def _write_single_fasta(path, header, sequence):
    """Write one FASTA record (no trailing newline) to `path`."""
    fh = open(path, 'w')
    try:
        fh.write(">%s\n%s" % (header, sequence))
    finally:
        fh.close()


def blastall_seq2seq(fastadata=(),
                     filenames=(),
                     output="ncbiparsed",
                     blastprogram="blastp",
                     remove_files=True,
                     extra_blastp_params={
                         'F': 'F',
                         'e': '10'
                     }):
    """
    Blast one query sequence against one subject sequence.

    choose proper input (exactly one of the two, as a 2-tuple):
    fastadata   ( ( headerQUERY, seqQUERY ) , ( headerSBJCT, seqSBJCT ) )
     or
    filenames   ( filenameQUERY, filenameSBJCT )

    output              -- "ncbiparsed" returns the result of
                           NCBIStandalone.BlastParser().parse(); any other
                           value returns the raw blastall stdout text
    blastprogram        -- 'blastp', 'tblastn', 'tblastx' or 'blastx'
    remove_files        -- when true, the query file, the subject file and
                           every "<subject>.*" file are deleted afterwards;
                           this also applies to files passed via `filenames`
    extra_blastp_params -- mapping of extra one-letter blastall flags to
                           values (read only, never modified here)

    Raises ValueError for an unsupported blastprogram or improper input.
    """
    supported_programs = ['blastp', 'tblastn', 'tblastx', 'blastx']
    if blastprogram not in supported_programs:
        # String exceptions are not raisable on Python >= 2.6; use a real one.
        raise ValueError("blastprogram %r is not supported; choose one of: %s" %
                         (blastprogram, ", ".join(supported_programs)))
    elif blastprogram in ['tblastn', 'tblastx']:
        # value for formatdb's -p flag when formatting the subject file
        dna_or_prot = "F"
    else:
        dna_or_prot = "T"

    if fastadata and isinstance(fastadata, tuple) and \
            len(fastadata) == 2 and not filenames:
        # input is fasta headers and sequence
        # write input filenames
        # NOTE(review): the headers become part of file names and of shell
        # command lines below; they must not contain shell metacharacters.
        uniquetag = get_random_string_tag()
        fname_q = "_".join([uniquetag, str(fastadata[0][0]), 'Q.fa'])
        fname_s = "_".join([uniquetag, str(fastadata[1][0]), 'S.fa'])
        _write_single_fasta(fname_q, fastadata[0][0], fastadata[0][1])
        _write_single_fasta(fname_s, fastadata[1][0], fastadata[1][1])
    elif filenames and isinstance(filenames, tuple) and \
            len(filenames) == 2 and not fastadata:
        # input is (supposed to be) filenames
        fname_q, fname_s = filenames
    elif not filenames and not fastadata:
        raise ValueError("no input!")
    else:
        raise ValueError("inproper input!")

    # formatdb
    OSsystem("%s -i %s -p %s" % (FORMATDB_PATH, fname_s, dna_or_prot))
    # and blastall!
    extra_params = " ".join(
        ["-%s %s" % (k, v) for k, v in extra_blastp_params.items()])
    ci, co, ce = osPopen3(
        "%s -p %s %s -i %s -d %s " %
        (BLASTALL_PATH, blastprogram, extra_params, fname_q, fname_s))
    try:
        ci.close()
        if output == "ncbiparsed":
            b_parser = NCBIStandalone.BlastParser()
            blastallout = b_parser.parse(co)
        else:
            blastallout = co.read()
    finally:
        # close the pipes even when reading or parsing fails
        co.close()
        ce.close()
    if remove_files:
        OSsystem("rm %s.*" % fname_s)
        osRemove("%s" % fname_s)
        osRemove("%s" % fname_q)
    # and return!
    return blastallout
#

starttime = time.time()

print 'retrieving homologous sequences from UniProt using PSI-BLAST...',
sys.stdout.flush()

## execute blastpgp ##

blastexe = '/usr/bin/blastpgp'
blastdb  = '/clusterfs/ohana/external/UniProt/current/protein'
iters = 4
eval = 0.0001
maxseqs = 1000

results,errors = NCBIStandalone.blastpgp(blastexe, blastdb, seedfname, expectation=eval, alignments=maxseqs, npasses=iters)

## parse psiblast hits to get ids ##

blasthits = set([])

for blast_record in NCBIXML.parse(results):
    for alignment in blast_record.alignments:
        blasthits.add(alignment.hit_id)

if len(blasthits) < 3:
    print 'Sorry, only %d homologs were retrieved from UniProt, too few sequences to determine patterns of evolutionary conservation.' % len(blasthits)
    sys.exit(0)

handle = open('intrepid-psiblast-ids.txt', 'w')
Example #35
0
documentation.
"""
# standard library
import os
import sys

# biopython
from Bio.Blast import NCBIStandalone

my_blast_db = os.path.join(os.getcwd(), 'at-est', 'a_cds-10-7.fasta')
my_blast_file = os.path.join(os.getcwd(), 'at-est', 'test_blast',
                             'sorghum_est-test.fasta')
my_blast_exe = os.path.join(os.getcwd(), 'blast', 'blastall')

print 'Running blastall...'
blast_out, error_info = NCBIStandalone.blastall(my_blast_exe, 'blastn',
                                                my_blast_db, my_blast_file)


b_parser = NCBIStandalone.BlastParser()

b_iterator = NCBIStandalone.Iterator(blast_out, b_parser)

while 1:
    b_record = b_iterator.next()

    if b_record is None:
        break
    
    E_VALUE_THRESH = 0.04
    for alignment in b_record.alignments:
        for hsp in alignment.hsps:
Example #36
0
def predict(input_sequence):
    """
    Run the prediction pipeline for one sequence and return the parsed
    results.

    Results are cached in settings["RESULTS_DIR"] under the sequence hash;
    a cached file is parsed and returned directly.  Otherwise, while the
    per-sequence file lock is held, the external tools (netsurf, fastacmd,
    muscle, dynamine, Rscript) are run in a temporary directory that is
    always removed afterwards.

    Raises Exception when the PSI-BLAST report has no rounds, when fastacmd
    exits non-zero, or when the prediction script leaves no output file.
    """
    # Do not grow sys.path by one duplicate entry per call.
    if settings["SERENDIP_DIR"] not in sys.path:
        sys.path.append(settings["SERENDIP_DIR"])
    from sequence.entropy.lib.seq_lib import FastaParser
    from Bio.Blast import NCBIStandalone

    sequence_hash = get_sequence_hash(input_sequence)
    results_path = os.path.join(settings["RESULTS_DIR"], sequence_hash)
    lock_path = results_path + ".lock"

    with FileLock(lock_path):

        if os.path.isfile(results_path):
            with open(results_path, 'r') as f:
                return parse_serendip_results(f.read())

        input_id = 'input'
        out_dir = tempfile.mkdtemp()

        try:
            out_file = os.path.join(out_dir, 'output.myrsa')
            input_fasta_path = os.path.join(out_dir, input_id + '.fa')

            # Netsurf
            with open(input_fasta_path, 'w') as f:
                f.write(">%s\n%s" % (input_id, input_sequence))
            cmd = [settings["NETSURF_EXE"],
                   "-i", input_fasta_path,
                   "-d", settings["NR70_DB"], "-a", "-k",
                   "-T", out_dir, "-o", out_file]
            _log.info(cmd)
            subprocess.call(cmd)

            blast_parser = NCBIStandalone.PSIBlastParser()
            with open(os.path.join(out_dir, input_id + '.blastout'), 'r') as f:
                blast_record = blast_parser.parse(f)
            # `rounds` is indexed as a sequence below; the previous
            # `rounds <= 0` compared it with an int and never triggered.
            if not blast_record.rounds:
                raise Exception("no netsurf hits")
            hit_titles = [alignment.title[1:]
                          for alignment in blast_record.rounds[-1].alignments]

            id_path = os.path.join(out_dir, input_id + '.blastout_id')
            with open(id_path, 'w') as f:
                for hit_title in hit_titles:
                    f.write(hit_title + '\n')

            blast_hits_path = os.path.join(out_dir, 'output_seqs.fa')
            cmd = [settings["FASTACMD_EXE"],
                   "-d", settings["NR70_DB"],
                   '-i', id_path,
                   '-o', blast_hits_path]
            _log.info(cmd)
            result = subprocess.call(cmd)

            if result != 0:
                raise Exception("No blast hits for input sequence")

            # We have blast hits

            # Netsurf on hits
            netsurf_append_path = os.path.join(out_dir, 'output_other.myrsa')
            with open(blast_hits_path, 'r') as f:
                subtasks = [netsurf_hit.delay(str(seq))
                            for seq in FastaParser(f)]

            with open(netsurf_append_path, 'w') as f:
                for subtask in subtasks:
                    f.write(subtask.get())

            # Append input sequence to blast hits for alignment as input for entropy and DynaMine
            with open(blast_hits_path, 'a') as f:
                f.write('>input\n' + input_sequence + '\n')

            # Make alignment using muscle
            alignment_path = os.path.join(out_dir, "output.ali")
            cmd = [settings["MUSCLE_EXE"],
                   "-in", blast_hits_path,
                   "-out", alignment_path]
            _log.info(cmd)
            subprocess.call(cmd)

            # Alignment position entropies
            entropy_path = os.path.join(out_dir, "output.entropy")
            with open(alignment_path, 'r') as f:
                hit_sequences = FastaParser(f)
                hit_sequences.frequencies().normalize()
                hit_entropies = hit_sequences.frequencies().entropies()
            with open(entropy_path, 'w') as f:
                n = 0
                for entropy in hit_entropies:
                    n += 1
                    f.write(str(n) + ' ' + str(entropy) + '\n')

            # Run dynamine on each sequence
            dynamine_fasta_path = os.path.join(out_dir, "output_seq.fasta")
            dynamine_env = dict(
                os.environ,
                **{"PYTHONPATH": "/usr/local/lib/python2.7/site-packages/"})
            with open(blast_hits_path, 'r') as hits_file:
                for seq in FastaParser(hits_file):
                    # We use this file name, to avoid confusing the rest of the script:
                    with open(dynamine_fasta_path, 'w') as f:
                        f.write(str(seq))
                    cmd = [settings["DYNAMINE_EXE"], "-a", dynamine_fasta_path]
                    _log.info(cmd)
                    subprocess.call(cmd, env=dynamine_env)

            # Run prediction script
            result_testing_path = os.path.join(settings["SERENDIP_DIR"], "sequence", "Result_Testing")
            combined_path = os.path.join(settings["SERENDIP_DIR"], "sequence", "five_models_combined")
            dynamine_path = os.path.splitext(dynamine_fasta_path)[0]
            cmd = [settings["RSCRIPT_EXE"], settings["RF_SCRIPT"], input_id,
                   alignment_path, entropy_path, netsurf_append_path,
                   dynamine_path, out_file, result_testing_path, combined_path]
            _log.info(cmd)
            # Run the script inside out_dir without changing the working
            # directory of this process: out_dir is deleted in `finally`,
            # which would leave the process in a removed directory.
            subprocess.call(cmd, cwd=out_dir)

            output_result_path = os.path.join(out_dir, input_id + '.out')
            if not os.path.isfile(output_result_path):
                raise Exception("No ouput generated")

            shutil.copyfile(output_result_path, results_path)

            with open(results_path, 'r') as f:
                data = parse_serendip_results(f.read())

            # Start making the scene:
            yasara_scene.delay(data)

            return data

        finally:
            if os.path.isdir(out_dir):
                shutil.rmtree(out_dir)
Example #37
0
dbFile = argv[2]
outFile = argv[3]
#padding = int(argv[4])

# Format dbFile (skipped when a "<dbFile>.nin" file already exists)
if os.path.exists("%s.nin" % dbFile):
    print("--[WARNING]blastdb already formated, using the existing one.")
else:
    print("Formatting database...")
    os.system('formatdb -i %s -p F' % dbFile)

# Run BLAST; the report is written to temp.blo in the working directory
# NOTE(review): the exit status of both os.system calls is ignored.
os.system(
    'blastall -p blastn -i %s -d %s -e 1e-10 -v 100000 -b 100000 -m 0 -o temp.blo -q -2'
    % (queryFile, dbFile))

# Extract sequences from blo file: one "<title>_<start>-<end>" line followed
# by the subject sequence for every HSP.
outf = open(outFile, 'w')
blast_handle = open('temp.blo')
try:
    blast_parser = NCBIStandalone.BlastParser()
    blast_iterator = NCBIStandalone.Iterator(blast_handle, blast_parser)
    #blast_iterator = SearchIO.parse(open('temp.blo'),'blast-txt') #if switch to SearchIO, this is the way to go (not working yet)
    for hit in blast_iterator:
        for alignment in hit.alignments:
            for hsp in alignment.hsps:
                outf.write(
                    "%s_%s-%s\n%s\n\n" %
                    (alignment.title, hsp.sbjct_start, hsp.sbjct_end, hsp.sbjct))
finally:
    # Neither file was closed before; closing guarantees the output is
    # flushed even when parsing stops early.
    blast_handle.close()
    outf.close()
Example #38
0
# CL Structure: ./blastpquerydb.py [QUERY FASTA FILE] [DATABASE TO SEARCH (MAKE WITH MAKEBLASTDB)]

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Blast.Applications import NcbiblastpCommandline
from Bio.SubsMat.MatrixInfo import blosum62
from Bio.Blast import NCBIStandalone
from sys import argv

queryList = argv[1]

# Scratch files reused for every query.  The BLAST output is now written to
# the same absolute path it is read from (it was written to '~/temp/...'
# before and read from the absolute path).
seq_path = '/home/underasail/temp/seq.fa'
output_path = '/home/underasail/temp/output.txt'

for seq_record in SeqIO.parse(queryList, "fasta"):
    # Write the current query on its own and blast it against argv[2].
    SeqIO.write(SeqRecord(seq_record.seq, id = seq_record.id), seq_path, 'fasta')
    blastp_cli = NcbiblastpCommandline(cmd = "/home/underasail/ncbi-blast/ncbi-blast-2.7.1+/bin/blastp", query = seq_path, db = argv[2], matrix = 'blosum62', evalue = 0.01, num_descriptions = 1, num_alignments = 1, out = output_path)
    blastp_cli()
    blast_parser = NCBIStandalone.BlastParser()
    # Close the report as soon as it is parsed (it was left open before).
    with open(output_path, 'r') as result_handle:
        blast_record = blast_parser.parse(result_handle)
    print('Human ID: ', seq_record.id)
    for description in blast_record.descriptions:
        print('Mouse Match ID: ', description.title)
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            print('Score: ', hsp.score)
            print('Bits: ', hsp.bits)
            print('E-value: ', hsp.expect)
            print('Alignment (Query/Match): ')
            print(hsp.query)
            print(hsp.match)
            # The blank-line separator must be a string literal; the bare
            # \n\n that stood here was a syntax error.
            print(hsp.sbjct, '\n\n')
 def do_blast_search(self):
   """Run blastall (blastp) with this object's executable, database and
   query file, and keep the two returned handles on the instance."""
   from Bio.Blast import NCBIStandalone
   handles = NCBIStandalone.blastall(self.blast_exe, "blastp",
                                     self.blast_db, self.blast_file)
   self.result_handle, self.error_handle = handles
def blast(blastRootDirectory):
    """
    Blast 'filetoblast.txt' (blastp, XML output) against 'blastDB.fasta'.

    blastRootDirectory -- directory holding filetoblast.txt and, on win32,
                          blastDB.fasta and Blastall.exe; elsewhere the
                          database is expected in /tmp/BLAST and blastall in
                          ../../BLAST/bin relative to the working directory

    Returns (nonmatchingQueries, results):
      nonmatchingQueries -- queries with no hit below the e-value threshold
                            other than a hit whose title equals the query
      results            -- (query, alignment title, e-value) for every HSP
                            below the threshold whose title differs from
                            the query

    The raw stderr and stdout of blastall are printed.  The output is read
    once, printed, and then parsed from the text already read: previously
    the print consumed the handle and left the parser nothing to read.
    """
    try:
        from StringIO import StringIO  # Python 2
    except ImportError:
        from io import StringIO

    on_windows = sys.platform == 'win32'

    if on_windows:
        blast_db = os.path.join(blastRootDirectory, 'blastDB.fasta')
        blast_exe = os.path.join(blastRootDirectory, 'Blastall.exe')
    else:
        if not os.path.isdir('/tmp/BLAST'):
            print("making directory '/tmp/BLAST'")
            os.mkdir('/tmp/BLAST/')
        if not os.path.exists('/tmp/BLAST/formatdb'):
            shutil.copy(os.path.join(blastRootDirectory, 'formatdb'),
                        '/tmp/BLAST')
            print("copying 'formatdb' to '/tmp/BLAST/'")
        blast_db = os.path.join('/tmp/BLAST', 'blastDB.fasta')
        blast_exe = os.path.join(os.getcwd(), '../../BLAST/bin/', 'blastall')

    blast_file = os.path.join(blastRootDirectory, 'filetoblast.txt')

    if on_windows:
        # blastall is given the short (8.3) form of every path
        import win32api
        blast_db = win32api.GetShortPathName(blast_db)
        blast_file = win32api.GetShortPathName(blast_file)
        blast_exe = win32api.GetShortPathName(blast_exe)

    blast_out, error_info = NCBIStandalone.blastall(blast_exe,
                                                    'blastp',
                                                    blast_db,
                                                    blast_file,
                                                    align_view=7)

    # Read each stream exactly once and keep the text for parsing.
    error_text = error_info.read()
    output_text = blast_out.read()
    print('errors: %s' % error_text)
    print('blast output: %s' % output_text)

    results = []
    nonmatchingQueries = []
    if not output_text.strip():
        # blastall produced nothing to parse; its errors are printed above
        return nonmatchingQueries, results

    e_value_thresh = 0.001
    # align_view=7 requests XML, so the XML record iterator is used (the
    # plain-text Iterator previously combined with it cannot read XML).
    for b_record in NCBIXML.parse(StringIO(output_text)):
        print('query: %s' % b_record.query)
        print('number of alignments: %s' % len(b_record.alignments))
        significant = False
        for alignment in b_record.alignments:
            for hsp in alignment.hsps:
                if hsp.expect < e_value_thresh:
                    alignment.title = alignment.title.replace(">", "")
                    # a hit titled exactly like the query is skipped
                    if b_record.query != alignment.title:
                        significant = True
                        print('adding %s and %s to the list of matches' %
                              (b_record.query, alignment.title))
                        results.append(
                            (b_record.query, alignment.title, hsp.expect))
        print('%s %s' % (b_record.query, significant))
        if not significant:
            print('adding %s to the list of queries without matches' %
                  b_record.query)
            nonmatchingQueries.append(b_record.query)

    return nonmatchingQueries, results
Example #41
0
# Register every FASTA record of `handle` with a hit counter of zero, then
# run blastall on blast_file and open an XML record parser on its output.
query_sequences = {}
it = Bio.Fasta.Iterator(handle, Bio.Fasta.SequenceParser())
while True:
  seq = it.next()
  # the loop ends on the first falsy value returned by the iterator
  if not seq:
    break
  query_sequences[seq.description] = {"number_of_hits": 0}
  print(seq.description)
  print(query_sequences[seq.description]["number_of_hits"])

handle.close()


blast_out, error_handle = NCBIStandalone.blastall(
    blast_exe, blast_program, blast_db, blast_file)

#print error_handle 

records = NCBIXML.parse(blast_out)

#b_record = records.next() 
#
#
#E_VALUE_THRESH = 0.000000004
#
#print  dir(b_record)
#print  b_record.num_sequences
#print "Query = %s"  % b_record.query

#b_record = records.next() 
Example #42
0
def main():
    parser = OptionParser()
    parser.add_option("-i",
                      "--input",
                      action="store",
                      dest="input",
                      help="input file to make phylotree")
    parser.add_option("-g",
                      "--germline",
                      action="store",
                      dest="germline",
                      help="germline fasta")
    parser.add_option("-o",
                      "--output",
                      action="store",
                      dest="output",
                      help="the file where you want all your data")
    (options, args) = parser.parse_args()
    if len(sys.argv) < 2:
        dowhat()
        parser.print_help()
        exit()

    open(options.output, 'w').write("Your Sequence Results:\n\n")
    copy(options.input, "workable.fasta")
    copy(options.germline, "germ.fasta")

    list_of_database_files = SeqIO.to_dict(
        SeqIO.parse("workable.fasta", "fasta"))

    while list_of_database_files:
        list_of_database_files = SeqIO.to_dict(
            SeqIO.parse("workable.fasta", "fasta"))
        populate_database("workable.fasta")
        print "***DatabasePopulated***"

        newsequence_search = open("germ.fasta", "r")
        cline = NcbiblastpCommandline(matrix="PAM30",
                                      evalue="20",
                                      word_size="2",
                                      query="germ.fasta",
                                      cmd='blastp',
                                      db="temporary_database",
                                      out="blastout")
        newsequence_search.close

        print "****Cline = *** --->", cline

        call_blast(cline)
        print "***Call_blast_successful***"

        result_handle = open('blastout')
        print "***result handle successful***"

        blast_parser = NCBIStandalone.BlastParser()
        print "***blast_parser****"

        blast_record = blast_parser.parse(result_handle)
        print "***blast_record***"

        newsequence_search = open("germ.fasta", 'w')
        newsequence_search.write(">" +
                                 str(blast_record.alignments[0].title[2:]) +
                                 "\n" +
                                 str(blast_record.alignments[0].hsps[0].sbjct))

        current_object = blast_record.alignments[0].title[2:]
        print current_object

        newfile = open(options.output, 'a')
        newfile.write(
            str(blast_record.alignments[0].hsps[0].query[:]) + "----> Query\n")
        newfile.write(
            str(blast_record.alignments[0].hsps[0].match[:]) +
            "----> Score of: " +
            str(blast_record.alignments[0].hsps[0].score) + "\n")
        newfile.write(
            str(blast_record.alignments[0].hsps[0].sbjct[:]) +
            "----> Template\n\n")

        list_of_database_files.pop(current_object)
        SeqIO.write(list_of_database_files.values(), "workable.fasta", "fasta")
Example #43
0
    def localPSIBlast(self,
                      seqFile,
                      db,
                      method='blastp',
                      resultOut=None,
                      e='0.001',
                      **kw):
        """
        Perform a local psi-blast search (requires that the blast binaries
        and databases are installed locally).
        Uses Bio.Blast.NCBIStandalone.blastpgp (Biopython) for the search.

        The first record parsed from the XML output is handed to
        __blast2dict together with db; the method itself returns nothing.

        @param seqFile: file name with search sequence in FASTA format
        @type  seqFile: str
        @param db: database(s) to search e.g. ['swissprot', 'pdb']
        @type  db: [str]
        @param method: not referenced in the body; the program passed to
                       blastpgp is always 'blastpgp'
        @type  method: str
        @param e: expectation value cutoff (default: 0.001); converted with
                  str() before it is passed on
        @type  e: float or str
        @param resultOut: save blast output to this new file (default:
                          self.outFolder + self.F_BLAST_RAW_OUT)
        @type  resultOut: str

        @param kw: optional keywords::
            --- New Blast+ routine ---
            (see NcbipsiblastCommandline)

            num_iterations   Number of passes (default 1).
            matrix           Matrix to use (default BLOSUM62).

            --- old blastall routine ---
            --- Scoring ---
            matrix           Matrix to use (default BLOSUM62).
            gap_open         Gap open penalty (default 11).
            gap_extend       Gap extension penalty (default 1).
            window_size      Multiple hits window size (default 40).
            npasses          Number of passes (default 1).
            passes           Hits/passes (Integer 0-2, default 1).

            --- Algorithm ---
            gapped           Whether to do a gapped alignment (T/F, default T).
            wordsize         Word size (default 3).
            keep_hits        Number of best hits from a region to keep (def 0)
            xdrop            Dropoff value (bits) for gapped alignments
                             (def 15)
            hit_extend       Threshold for extending hits (default 11).
            nbits_gapping    Number of bits to trigger gapping (default 22).
            pseudocounts     Pseudocounts constants for multiple passes
                             (def 9).
            xdrop_final      X dropoff for final gapped alignment (default 25).
            xdrop_extension  Dropoff for blast extensions (default 7).
            model_threshold  E-value threshold to include in multipass model
                             (default 0.005).
            required_start   Start of required region in query (default 1).
            required_end     End of required region in query (default -1).

            --- Processing ---
            filter           Filter query sequence with SEG? (T/F, default F)
            believe_query    Believe the query defline? (T/F, default F)
            nprocessors      Number of processors to use (default 1).

            --- Formatting ---
            alignments       Number of alignments (default 250).
        @type  kw: any

        @raise BlastError: if the program call fails (any exception raised
                           in the body is re-raised as BlastError carrying
                           only its message)
        """
        ## the following should work for new Blast+ tools:

        #from Bio.Blast.Applications import NcbipsiblastCommandline

        #resultOut = resultOut or self.outFolder+ self.F_BLAST_RAW_OUT
        #blastx_cline = NcbipsiblastCommandline(query=seqFile,
        #db=db,
        #evalue=e,
        #outfmt=5,
        #out=resultOut,
        #**kw)
        #stdout, stderr = blastx_cline()
        #parsed = NCBIXML.parse( results ).next()
        #self.__blast2dict( parsed, db )

        results = err = None
        resultOut = resultOut or self.outFolder + self.F_BLAST_RAW_OUT
        ## presumably turns every keyword value into a string -- the helper
        ## is not visible here; TODO confirm
        kw = self.__dictvalues2str(kw)
        e = str(e)

        try:
            results, err = NCBIStandalone.blastpgp(
                settings.psi_blast_bin,
                db,
                seqFile,
                program='blastpgp',
                align_view='7',  ## XML output
                expectation=e,
                **kw)

            ## both handles are replaced by what __copyFileHandle returns;
            ## per the log message below the raw output ends up in resultOut
            results = self.__copyFileHandle(results, resultOut)
            err = self.__copyFileHandle(err,
                                        self.outFolder + self.F_BLAST_ERROR)

            if self.verbose:
                self.log.writeln('Raw blast output copied to: ' + resultOut)

            ## only the first record yielded by the XML parser is used
            parsed = NCBIXML.parse(results).next()

            self.__blast2dict(parsed, db)

        except Exception, why:
            self.log.add(T.lastErrorTrace())
            ## debugging aid: copies every local name of this call into the
            ## module globals (announced by the log line that follows)
            globals().update(locals())
            self.log.writeln('local namespace is pushed into global ')
            raise BlastError(str(why))
def blast(blastRootDirectory):
  if sys.platform == 'win32':
    blast_db = os.path.join(blastRootDirectory, 'blastDB.fasta')
  else:
    if not os.path.isdir('/tmp/BLAST'):
      print "making directory '/tmp/BLAST'"
      os.mkdir('/tmp/BLAST/')
    if not os.path.exists('/tmp/BLAST/formatdb'):
      shutil.copy(os.path.join(blastRootDirectory,'formatdb'), '/tmp/BLAST')
      print "copying 'formatdb' to '/tmp/BLAST/'"
    blast_db = os.path.join('/tmp/BLAST', 'blastDB.fasta')
  #print 'path to blastDB.fasta:', blast_db
  
  blast_file = os.path.join(blastRootDirectory, 'filetoblast.txt')
  #print 'path to filetoblast.txt:', blast_file
  
  if sys.platform == 'win32':
    blastall_name = 'Blastall.exe'
    blast_exe = os.path.join(blastRootDirectory, blastall_name)
  else:
    blastall_name = 'blastall'
    blast_exe = os.path.join(os.getcwd(), '../../BLAST/bin/', blastall_name)

  #print 'path to blastall:', blast_exe
  
  if sys.platform == 'win32':
     import win32api
     blast_db = win32api.GetShortPathName(blast_db)
     blast_file = win32api.GetShortPathName(blast_file)
     blast_exe = win32api.GetShortPathName(blast_exe)
  
  #cont = raw_input('blah')
  #try: 
  blast_out, error_info = NCBIStandalone.blastall(blast_exe, 'blastp', blast_db, blast_file,  align_view=7)
  #except:
  #  f = open(blast_file, 'r')
  #  s = file.read()
  #  print s
  
  #print 'done BLASTing'
  
  print 'errors:', error_info.read()
  print 'blast output:', blast_out.read()
  
  b_parser = NCBIXML.BlastParser()
  #print 'got parser'
  
  b_record = b_parser.parse(blast_out)
  b_iterator = NCBIStandalone.Iterator(blast_out, b_parser)
  #print 'got iterator'
  results = []
  recordnumber = 0
  nonmatchingQueries = []
  while 1:
    recordnumber += 1
    b_record = b_iterator.next()
    
    if not b_record: break
    print 'query:', b_record.query
    if b_record is None:
      break
    e_value_thresh = 0.001
    print 'number of alignments:', len(b_record.alignments)
    significant = False
    for alignment in b_record.alignments:
      for hsp in alignment.hsps:
        if hsp.expect < e_value_thresh:
          alignment.title = alignment.title.replace(">","")
          if b_record.query != alignment.title:
            significant = True
            print 'adding', b_record.query, 'and', alignment.title, 'to the list of matches'
            results.append((b_record.query, alignment.title, hsp.expect))
    print b_record.query, significant
    if not significant:
      print 'adding', b_record.query, 'to the list of queries without matches'
      nonmatchingQueries.append(b_record.query)

  return nonmatchingQueries, results
Example #45
0
#!/usr/bin/python

my_blast_db = "/home/kenglish/Data/Genomes/Databases/EST_Clade_A"
my_blast_file = "Record1.fasta"
my_blast_exe = "/usr/bin/blastall"

from Bio.Blast import NCBIStandalone
from Bio.Blast import NCBIXML


result_handle, error_handle = NCBIStandalone.blastall(my_blast_exe, "blastn", my_blast_db, my_blast_file)

#$blast_results = result_handle.read()
#print blast_results

from Bio.Blast import NCBIXML
blast_records = NCBIXML.parse(result_handle)

blast_record = blast_records.next()
print blast_record.alignments


Example #46
0
    def localBlast( self, seqFile, db, method='blastp',
                    resultOut=None, e='0.01', **kw ):
        """
        Perform a local blast search (requires that the blast binaries
        and databases are installed locally).
        Uses Bio.Blast.NCBIStandalone.blastall (Biopython) for the search.

        The first record parsed from the XML output is handed to
        __blast2dict together with db; the method itself returns nothing.

        @param seqFile: file name with search sequence in FASTA format
        @type  seqFile: str
        @param db: database(s) to search, e.g. ['swissprot', 'pdb']
        @type  db: [str]
        @param method: search program to use, e.g. 'blastp', 'fasta'
                       (default: blastp)
        @type  method: str
        @param e: expectation value cutoff (default: 0.01); converted with
                  str() before it is passed on
        @type  e: float or str
        @param resultOut: save blast output to this new file (default:
                          self.outFolder + self.F_BLAST_RAW_OUT)
        @type  resultOut: str
        @param kw: optional keywords::
                --- Scoring ---
                matrix         Matrix to use (default BLOSUM62).
                gap_open       Gap open penalty (default 0).
                gap_extend     Gap extension penalty (default 0).

                --- Algorithm ---
                gapped         Whether to do a gapped alignment. T/F
                                (default T)
                wordsize       Word size (blastp default 11).
                keep_hits      Number of best hits from a region to keep
                                (default off).
                xdrop          Dropoff value (bits) for gapped alignments
                                (blastp default 25).
                hit_extend     Threshold for extending hits (blastp default 11)

                --- Processing ---
                filter         Filter query sequence? (T/F, default F)
                restrict_gi    Restrict search to these GI's.
                believe_query  Believe the query defline? (T/F, default F)
                nprocessors    Number of processors to use (default 1).

                --- Formatting ---
                alignments     Number of alignments. (default 250)
        @type  kw: any

        @raise BlastError: if the program call fails (any exception raised
                           in the body is re-raised as BlastError carrying
                           only its message)
        """
        ## NOTE: p is initialised here but not used again in this method
        results = err = p = None
        resultOut = resultOut or self.outFolder+ self.F_BLAST_RAW_OUT
        ## presumably turns every keyword value into a string -- the helper
        ## is not visible here; TODO confirm
        kw = self.__dictvalues2str( kw )
        e = str(e)

        try:
            if self.verbose:
                self.log.add('running blast...')

            results, err = NCBIStandalone.blastall( settings.blast_bin,
                                                    method, db, seqFile,
                                                    expectation=e,
                                                    align_view='7', ## XML output
                                                    **kw)

            ## both handles are replaced by what __copyFileHandle returns;
            ## per the log message below the raw output ends up in resultOut
            results = self.__copyFileHandle(results, resultOut)
            err = self.__copyFileHandle(err, self.outFolder+self.F_BLAST_ERROR)

            if self.verbose:
                self.log.writeln('Raw blast output copied to: ' + resultOut  )

            ## only the first record yielded by the XML parser is used
            parsed = NCBIXML.parse( results ).next()

            self.__blast2dict( parsed, db )

        except Exception, why:
            self.log.add( T.lastErrorTrace() )
            ## debugging aid: copies every local name of this call into the
            ## module globals (announced by the log line that follows)
            globals().update( locals() )
            self.log.writeln('local namespace is pushed into global ')
            raise BlastError( str(why) )