def blastBACTEUK(arg):
    """BLAST every FASTA record against prokaryotic and eukaryotic subsets of nr.

    For each record in the FASTA file ``arg`` two remote ``blastx`` searches
    are submitted, one restricted to Bacteria/Archaea and one restricted to
    Eukaryota.  The e-value of the top HSP of the top hit is appended to
    ``bacterial.txt`` / ``eukaryotic.txt`` as ``<id>,<e-value>``; records
    without a hit are written as ``<id>, no hit``.  A record whose search or
    parsing fails is logged to ``errorlog.txt`` and processing continues.
    """
    def _search(record, entrez_query):
        # Only the single best hit is requested; everything else is discarded.
        handle = NCBIWWW.qblast("blastx", "nr", record.format("fasta"), ncbi_gi=False, descriptions="1", alignments="1", format_type="XML", hitlist_size="1", entrez_query=entrez_query)
        return NCBIXML.read(handle)

    def _result_line(name, blast_record):
        if blast_record.descriptions:
            return name + ',' + str(blast_record.alignments[0].hsps[0].expect) + '\n'
        return name + ', no hit' + '\n'

    with open(arg) as fasta, open('bacterial.txt', 'a') as out, open('eukaryotic.txt', 'a') as out2:
        for record in SeqIO.parse(fasta, format="fasta"):
            try:
                name = record.id
                # Run and parse both searches before writing anything, so a
                # failure never leaves a record in only one of the two files.
                blast_recordsB = _search(record, '(Bacteria[ORGN] OR Archaea[ORGN])')
                blast_recordsE = _search(record, '(Eukaryota[ORGN])')

                if blast_recordsB.descriptions:
                    print(record.id)
                out.write(_result_line(name, blast_recordsB))
                out2.write(_result_line(name, blast_recordsE))
            except Exception:
                # Best effort: note the failing record and move on.  Catching
                # Exception (not a bare except) keeps Ctrl-C working.
                with open('errorlog.txt', 'a') as errorout:
                    errorout.write('problem blasting ' + record.id + '\n')
Example #2
0
def check_blast_in(input_filename,taxid_line,num, GACTC_YES,blast_filename,perct):
    strlist=str(taxid_line).split(' OR ')
    for valist in strlist:
        txid_num=valist[valist.find('(taxid:')+7:valist.find(')')]
        blast_result_file= open(blast_filename+txid_num,"w")
        txid='txid'+txid_num+' [ORGN]'
        typ='' #this is for the input sequence
        input_file = open(input_filename,"r")
        for seq_record in SeqIO.parse(input_file, "fasta"):
            if (len(typ)>200):
                result_handle = NCBIWWW.qblast("blastn", "nr", typ,word_size=13,hitlist_size=100,entrez_query=txid,expect=10)
                t=result_handle.read()
                blast_result_file.write(t)
                typ=''
                #print "200 done!"
            typ=typ+seq_record.format('fasta')
            #print "wating", typ,"finished waiting","\n\n"
        if (len(typ)>0):
            #print "working on the leftover"
            result_handle = NCBIWWW.qblast("blastn", "nr", typ,word_size=13,hitlist_size=100,entrez_query=txid,expect=10)
            t2=result_handle.read()
            #print typ
            blast_result_file.write(t2)
        input_file.close()
        blast_result_file.close()
    print "blast job done!"
Example #3
0
File: CPA.py Project: jqiankgi/CPA
def check_blast_ex(input_filename,piece_len,bool_customer,exclude_line):
    """Return the start indices of sequence windows that match an excluded organism exactly.

    The input sequence (loaded with ``read_fasta``) is cut into overlapping
    windows of ``piece_len`` bases which are written to ``~seq.txt`` with the
    window start index as FASTA id.  The windows are BLASTed in batches
    against ``nr`` restricted to ``exclude_line`` (when ``bool_customer`` is
    true and the line is non-empty) or to ``txid9606 [ORGN]`` otherwise, and
    the raw XML is collected in ``~blastresult.xml``.  A window is reported
    when any HSP has ``identities == piece_len``.

    On BLAST or parsing failure a message is printed and the process exits
    with status 0, as before.

    Returns:
        set of int: start indices of the windows with a full-length match.
    """
    bad_match = set()
    if bool_customer and exclude_line != '':
        query_line = exclude_line
    else:
        query_line = 'txid9606 [ORGN]'
    (input_id, input_seq, input_len) = read_fasta(input_filename)

    # NOTE(review): range() stops before the last possible window start
    # (input_len - piece_len); kept as is because callers may rely on it.
    with open('~seq.txt', 'w') as pieces_out:
        for i in range(0, input_len - piece_len):
            pieces_out.write('>' + str(i) + '\n' + input_seq[i:i + piece_len] + '\n')
    count_piece = int(os.path.getsize('~seq.txt') / 2000) + 1

    def _blast(batch):
        return NCBIWWW.qblast("blastn", "nr", batch, word_size=13, hitlist_size=100, entrez_query=query_line, expect=10)

    try:
        with open('~seq.txt', 'r') as input_file, open('~blastresult.xml', 'w') as blast_result_file:
            typ = ''
            counter_i = 0
            for seq_record in SeqIO.parse(input_file, "fasta"):
                typ = typ + seq_record.format('fasta')
                if len(typ) > 2000:
                    counter_i = counter_i + 1
                    try:
                        print("blasting")
                        result_handle = _blast(typ)
                        print(str(counter_i) + " out of " + str(count_piece) + " is blasted!")
                    except Exception:
                        # One retry before giving up on the whole run.
                        try:
                            print("mistake happens when tryint to connect to NCBI blast engine, try again!")
                            result_handle = _blast(typ)
                            print("sucessfully connect to NCBI blast engine at the second try!")
                        except Exception:
                            print("can't use NCBI blast at this moment!")
                            sys.exit(0)
                    blast_result_file.write(result_handle.read())
                    typ = ''
            if typ:
                # Submit the leftover only when there is one; an empty query
                # would make the request fail and abort the whole run.
                blast_result_file.write(_blast(typ).read())
    except Exception:
        print("Error happpens while getting the blast result! Unable to use blast this time!")
        sys.exit(0)

    try:
        with open('~blastresult.xml') as xml_in:
            for blast_record in NCBIXML.parse(xml_in):
                # The FASTA id of each window is its start index.  Recover it
                # from the record instead of using a stale loop variable.
                try:
                    piece_index = int(blast_record.query.split()[0])
                except (AttributeError, IndexError, ValueError):
                    continue  # cannot map this record back to a window
                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        if hsp.identities == piece_len:
                            bad_match.add(piece_index)
    except Exception:
        print("Error happens while parsing blast result")
        sys.exit(0)
    return bad_match
def run_blast(fasta,type):
    """BLAST every record of ``fasta`` online and save one XML file per record.

    Each result is written to ``split_xml/<record id>.xml`` and the record id
    together with the value returned by ``clock()`` is printed.  Nothing is
    done when ``type`` is neither ``"prot"`` nor ``"nucl"``.

    NOTE(review): both sequence types are submitted with ``blastp`` and
    ``word_size=3``, exactly as before; for ``"nucl"`` this looks like a
    copy-paste slip (``blastn`` expected) - confirm before changing.
    """
    if type not in ("prot", "nucl"):
        return
    for seqs in SeqIO.parse(fasta, "fasta"):
        clock()
        ncbi = NCBIWWW.qblast(program="blastp",database="nr",sequence=str(seqs.seq),format_type="XML",ncbi_gi=str(seqs.id), alignments=20,word_size=3)
        # Context manager: the output file is flushed and closed per record
        # instead of being left open.
        with open("split_xml/%s.xml" % str(seqs.id), "w") as out:
            out.write(ncbi.read())
        print("%s\t%f" % (str(seqs.id), float(clock())))
Example #5
0
def fetchGenbankData(seq_list):
    """Run a remote ``blastp`` search for every entry of ``seq_list``.

    ``seq_list`` maps a taxon identifier to a query sequence.  The raw BLAST
    output for each taxon is written to ``<taxon>.xml``.  A search that
    raises ``ValueError`` is retried, up to three attempts in total; when all
    attempts fail this is reported and the next taxon is processed.
    """
    Entrez.email = "*****@*****.**"
    try:
        for taxa in seq_list.keys():
            seq = seq_list[taxa]
            label = taxa.__str__()
            print("BLAST-ing NCBI for sequence ID: " + label)
            for _attempt in range(3):
                try:
                    blast_handle = NCBIWWW.qblast('blastp', 'nr', seq)
                    try:
                        blast_handle.seek(0)
                        with open(label + '.xml', 'w') as blast_file:
                            blast_file.write(blast_handle.read())
                    finally:
                        blast_handle.close()
                    print(". . . results written to " + label + '.xml')
                    break  # success, no further attempts needed
                except ValueError:
                    time.sleep(3)
                    print("Something went wrong, my GenBank query for taxa " + label + " returned no records.")
                    print("I'm trying again. . .")
            else:
                # All attempts failed.  This message used to be unreachable
                # because every ValueError was absorbed by the retry handler.
                print("Something went wrong, my GenBank query for taxa " + label + " returned no records.")
                print("I'm not going retry anymore.  Sorry.")
            # Pause between taxa so the requests are spaced out.
            time.sleep(2)
    except ValueError:
        print("Something went wrong, my GenBank query for taxa " + taxa.__str__() + " returned no records.")
        print("I'm not going retry anymore.  Sorry.")
        exit(1)
Example #6
0
def find_closest_ref(fasta_file, callback=None, update_callback=lambda d: None, organism=entrez_CFSAN_genera):
	"Find closest match in NCBI Refseq to longest contig, then collect URL for it"
	if not callback:
		import datetime
		def callback(s):
			print "[{}]".format(datetime.datetime.today().ctime()), s
			
	callback("Importing modules...")

	from Bio.Blast import NCBIWWW
	import xml.etree.ElementTree as xml
	
	callback("Loading fasta ({})...".format(fasta_file))
	with open(fasta_file, 'r') as f:
		contigs = iter(sorted(list(SeqIO.parse(f, 'fasta')), lambda a,b: cmp(len(a), len(b))))
		contig = contigs.next()
		while len(contig) < 1500:
			try:
				contig = contigs.next()
			except StopIteration:
				break
	callback("Longest contig is {} bases. BLASTing...".format(len(contig)))
	r = NCBIWWW.qblast("blastn", "chromosome", ">{}\n{}".format(contig.description, contig.seq), 
					   alignments=1, 
					   entrez_query="{}".format(organism),
					   hitlist_size=1,
					   filter='L')
	callback("BLAST finished.")
	result = xml.parse(r)
	refseq = result.find(".//Iteration/Iteration_hits/Hit/Hit_id").text.split("|")[1]
	refseq_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id={}&rettype=fasta&retmode=text'.format(refseq)
	update_callback({'ref_file':refseq, 'ref_url':refseq_url})
	return refseq
Example #7
0
    def blast2(self):

 #        File = open("output"+x+".txt","w")
        fasta_string = open(self.infile).read()  # or make the names fasta1.fasta and just do open(i).read
        print(fasta_string)
        database = "nr"
        program = "blastn"
        parameters = [
         ('DATABASE', database),

          ('PROGRAM', program),
          # ('PSSM',pssm), - It is possible to use PSI-BLAST via this API?
          ('QUERY', fasta_string),

        ('CMD', 'Put'),
          ]
        query = [x for x in parameters if x[1] is not None]
        message = (urllib.urlencode(query))
        print (query)
        print(message)
        result_handle = NCBIWWW.qblast("blastn", "nr", fasta_string, hitlist_size=10)

        blast_records = NCBIXML.parse(result_handle)
        # or blast_record = NCBIXML.read(result_handle) if you only have one seq in file
        E_VALUE_THRESH = 0.001
        for blast_record in blast_records:
            for alignment in blast_record.alignments:
                for hsp in alignment.hsps:
                    if hsp.expect < E_VALUE_THRESH:
                        print "alignment:", alignment.title
                        print "e-value:", hsp.expect
Example #8
0
File: blast.py Project: pazur/final
 def run(self):
     """Run the configured BLAST search and collect the hit sequences.

     Returns a dict with the raw XML (``blast_xml``), the parsed BLAST
     records (``blast_records``) and the de-duplicated sequence records
     built from all alignments (``sequences``).
     """
     res = NCBIWWW.qblast(self.program, self.database, self.sequence, **self.queryextra)
     # NCBIXML.parse yields records lazily and can be consumed only once.
     # Materialise it so the records returned below are still available
     # after the alignments have been collected from them.
     blast_records = list(NCBIXML.parse(res))
     alignments = []
     for blast_record in blast_records:
         alignments.extend(blast_record.alignments)
     records = list(self.get_seqrecords(alignments))
     records = self.delete_same(records)
     return {"blast_xml": res.getvalue(), "blast_records": blast_records, "sequences": records}
Example #9
0
		def taxBLASTn(uniprotID,taxName,proSeq):
			"""Map a protein to DNA with a tblastn search restricted to one organism.

			The top hit of a ``tblastn`` search of ``proSeq`` against nr
			(e-value cut-off 0.0001, limited to ``taxName``) is located and the
			matching DNA is fetched with ``chromParse``.

			Returns ``(mapTuple, debug)``.  ``mapTuple`` is
			``(accession, dna_seq, 'TBLASTN', midline)`` on success and ``()``
			when there is no hit, no DNA could be fetched, or any step failed.
			``debug`` is a log string.  ``uniprotID`` is not used here.
			"""
			mapTuple, debug = (), ''
			try:
				result_handle = NCBIWWW.qblast("tblastn", "nr", proSeq, expect = .0001, entrez_query = taxName+'[organism]')
				string = result_handle.read()
				result_handle.close()
				tree = xml.etree.ElementTree.fromstring(string)
				iteration = tree.find("BlastOutput_iterations/Iteration")
				hits = iteration.findall("Iteration_hits/Hit")
				topHit = hits[0]
				accessionNCBI = topHit.findtext("Hit_accession")
				midseq = topHit.findtext("Hit_hsps/Hsp/Hsp_midline")
				Hit_id = topHit.findtext("Hit_id")
				Hit_from = int(topHit.findtext("Hit_hsps/Hsp/Hsp_hit-from"))
				Hit_to = int(topHit.findtext("Hit_hsps/Hsp/Hsp_hit-to"))
				# The GI number is the field following "gi|" in the hit id.
				match = re.search(r'gi\|(\w+)\|',Hit_id)
				GI = match.group(1)
				debug += '\ttBLASTn hit accession and match indices: '+str(accessionNCBI)+' ('+str(Hit_to)+', '+str(Hit_from)+')\n'
				dna_seq = chromParse(GI,Hit_from,Hit_to)
				if dna_seq != '':
					mapTuple = (accessionNCBI,dna_seq,'TBLASTN',midseq)
			except Exception:
				# Best effort: any failure (no hits, malformed XML, network
				# error) yields the empty mapping.  Exception rather than a
				# bare except lets Ctrl-C and SystemExit through.
				pass
			return mapTuple, debug
Example #10
0
def internetBLAST(inputFile, fileFormat='fasta', evalue=0.001):
    '''Run an online blastp search for every record of ``inputFile``.

    For each HSP with an e-value below ``evalue`` the ratio of bit score to
    alignment length is computed.  The highest ratio and the title of its
    alignment are written to stdout as ``id<TAB>ratio<TAB>title``.  Records
    without a qualifying HSP are reported as ``id<TAB>NA<TAB>NA`` on both
    stdout and stderr.

    '''
    for seqRecord in SeqIO.parse(inputFile, fileFormat):
        print >> sys.stderr, 'Doing BLAST (internet) search for', seqRecord.id
        resultHandle = NCBIWWW.qblast('blastp', 'nr', seqRecord.seq)
        ratios = [
            ((hsp.bits / alignment.length), alignment.title)
            for blastRecord in NCBIXML.parse(resultHandle)
            for alignment in blastRecord.alignments
            for hsp in alignment.hsps
            if hsp.expect < evalue
        ]
        if not ratios:
            for stream in (sys.stdout, sys.stderr):
                print >> stream, '%s\t%s\t%s' % (seqRecord.id, 'NA', 'NA')
            continue
        # Stable sort: among equal ratios the last one encountered wins.
        ratio, subject = sorted(ratios, key=lambda pair: pair[0])[-1]
        print >> sys.stdout, '%s\t%f\t%s' % (seqRecord.id, ratio, subject)
Example #11
0
def blast(arguments):
    """Worker function that executes one NCBI web BLAST and appends the result to a file.

    ``arguments`` is an indexable sequence:
    ``(program, (name, seq), database, evalue, hitlist, output_num, output_format)``.
    The result is appended to ``blast_out_<output_file>_<output_num>``, where
    ``output_file`` is a module-level name defined by the caller.

    Exits the process with an explanatory message when ``qblast`` raises
    ``ValueError``.
    """
    blast_program = arguments[0]
    name, seq = arguments[1]
    database = arguments[2]
    evalue = arguments[3]
    hitlist = arguments[4]
    output_num = arguments[5]
    output_format = arguments[6]

    try:
        result_handle = NCBIWWW.qblast(
            blast_program,
            database,
            ">%s\n%s" % (name, seq),
            expect=evalue,
            hitlist_size=hitlist,
            format_type=output_format,
        )
    except ValueError:
        # Incompatible sequence type / BLAST program: exit cleanly with an
        # informative message.
        sys.exit("\nPlease check the compatibility between the input sequence type and the BLAST program")

    # Open the output only once there is something to write, and let the
    # context manager close it; previously the handle leaked on the exit path.
    try:
        with open("blast_out_%s_%s" % (output_file, output_num), "a") as save_file:
            save_file.write(result_handle.read())
    finally:
        result_handle.close()
Example #12
0
def main(argv):
    """Command-line entry point: BLAST a FASTA file and save the raw result.

    Options:
        -f  path to the input FASTA file (required)
        -o  output path (default ``./blast.xml``)
        -e  expect value (default ``10e-20``)
        -h  print usage and exit

    Exits with status 2 after printing the usage text when the options are
    invalid, when ``-h`` is given, or when no input file was supplied.
    """
    usage = "BlastZFGenome.py -f <path to input fasta file> -o <path to output directory and filename (default ./blast.xml)> -e <expect value (default 10e-20)>"
    try:
        opts, args = getopt.getopt(argv, 'hf:o:e:')
    except getopt.GetoptError:
        # The exception class is spelled GetoptError; the previous spelling
        # raised AttributeError instead of showing the usage text.
        print(usage)
        sys.exit(2)

    output = './blast.xml'
    fastafile = ''
    expect = 10e-20

    for opt, arg in opts:
        if opt == "-h":
            print(usage)
            sys.exit(2)
        elif opt == "-o":
            output = arg
        elif opt == "-f":
            fastafile = arg
        elif opt == "-e":
            expect = float(arg)

    if not fastafile:
        # Without -f there is nothing to BLAST; show the usage text instead
        # of failing on open('').
        print(usage)
        sys.exit(2)

    with open(fastafile) as fasta_in:
        fasta = fasta_in.read()
    result_handle = NCBIWWW.qblast("blastn","GPIPE/59729/101/ref_top_level",fasta,expect=expect)
    try:
        with open(output, "w") as save_file:
            save_file.write(result_handle.read())
    finally:
        result_handle.close()
Example #13
0
 def search_blast(self, accession, program="blastp", database="nr"):
     """
     BLAST the UniProt sequence stored under ``accession``.

     The loaded UniProt records are scanned for one listing ``accession``;
     its sequence is submitted to ``program``/``database`` and the raw result
     is saved as ``blast.xml`` through ``self.open``.  The result is then
     loaded with ``self.load_blast`` and ``self.blast_records`` is returned.

     Raises ValueError when no (non-empty) sequence matches the accession.

     See:
         http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc73
         http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc75
     """
     assert self.uniprot is not None

     # Every record is inspected, so when several records list the
     # accession the last one provides the sequence.
     sequence = None
     for entry in self.uniprot:
         if any(acc == accession for acc in entry.accessions):
             sequence = entry.sequence
     if not sequence:
         raise ValueError('No matching sequence for the accession number')

     handle = NCBIWWW.qblast(program, database, sequence)
     try:
         with self.open('blast.xml', 'w') as xml_out:
             xml_out.write(handle.read())
     finally:
         handle.close()
     self.load_blast()
     return self.blast_records
Example #14
0
File: OT.py Project: Sudoka/tupac
def get_OT(sample_seqs):
  """Summarise tblastn hits of the samples against 'scenedesmus dimorphus'.

  ``sample_seqs`` is a sequence of ``(header, sequence)`` pairs; all of them
  are sent as one query.  One tab-separated line is produced per BLAST
  record: header, hit id, strand symbol (``'/'`` when ``valid_align``
  rejects the HSP frames) and a code that is 1 when the summed HSP subject
  length covers less than 90% of the sample, 2 otherwise.  Records without
  alignments get a blank placeholder line with code 0.

  When a record has several alignments only the last one is reported.
  """
  query_string = ''.join(seq[0] + '\n' + seq[1] + '\n' for seq in sample_seqs)

  blast_handle = NCBIWWW.qblast('tblastn', 'nr', query_string, entrez_query='scenedesmus dimorphus')
  blast_handle.seek(0)

  lines = []
  for i, record in enumerate(NCBIXML.parse(blast_handle)):
    sample = sample_seqs[i]
    if record.alignments:
      for align in record.alignments:
        frames = [hsp.frame[1] for hsp in align.hsps]
        strand = plus_or_minus(frames[0]) if valid_align(frames) else '/'
        covered = sum([len(hsp.sbjct) for hsp in align.hsps])
        query_coverage = float(covered) / len(sample[1])
        code = 1 if query_coverage < .9 else 2
        row = '\t'.join([sample[0], align.hit_id, strand, str(code)])
    else:
      row = '\t'.join([sample[0], ' '*28, ' ', str(0)])
    lines.append(row + '\n')
  return ''.join(lines)
Example #15
0
def main():
    my_string_to_use = open("C:/Users/Evan/Desktop/Biosecurity_Stuff/Genomes/test_gene.txt","r")


    line_one = ""   #This program searches 2 lines, the strings here will hold them during the for loop
    line_two = ""
    counter = 0     #this is used in the for loop 
    blast_counter = 1 #this counts the number of blast searches performed
    for lines in my_string_to_use:
        if counter == 0:
            counter = counter + 1
            #print "pear"  #These random print statements were used for troubleshooting
        elif counter%2 == 1:  #used if the counter is odd, it saves the line of nucleotides for later
            line_one = lines
            counter = counter+1
            #print "apple"
        elif counter%2 == 0:  # if the counter is even, it will blast search both lines and clear the temporary lines
            line_two = lines
            result_handle = NCBIWWW.qblast("blastn", "nt", line_one + line_two)
            #print "purple"
            print result_handle  #This tells you in the python terminal if a blast search was performed
            save_file = open("C:/Users/Evan/Desktop/overload_genome.xml", "w")
            save_file.write(result_handle.read())
            save_file.close()
            result_handle.close()
            print "Blast search number: ", blast_counter

            blast_counter = blast_counter + 1
            counter = counter + 1
        
        #print lines
        
    my_string_to_use.close()
    
    Open_the_XML_file()
Example #16
0
 def blast_query(self, counter, entrez_query=''):
     """Submit the prepared query to NCBI blastn, save and parse the result.

     ``counter`` is a progress object: ``set_work(5)`` is called once and
     ``count()`` after each of the five steps.  The raw output is written to
     ``self._results_filename`` and the parsed records are stored in
     ``self._blast_results``.

     Returns False when anything fails during the search, saving or parsing.
     On success nothing is returned explicitly (i.e. None).
     """
     counter.set_work(5)
     self._format_query(); counter.count()
     self._save_query_config(); counter.count()
     try:
         print('\nLaunching BLAST query #%d...' % self._primers_hash)
         blast_results = NCBIWWW.qblast('blastn',
                                        self.database,
                                        self._query.format('fasta'),
                                        expect       = self.e_val,
                                        word_size    = self.w_size,
                                        nucl_penalty = self.n_pen,
                                        nucl_reward  = self.n_rew,
                                        filter       = self.fltr,
                                        entrez_query = entrez_query,
                                        ungapped_alignment = self.no_gaps,)
         counter.count()
         # Save the results; both handles are released even when writing fails.
         try:
             with open(self._results_filename, 'w') as results_file:
                 results_file.write(blast_results.read())
         finally:
             blast_results.close()
         print('\nBLAST output was written to:\n   %s' % self._results_filename)
         counter.count()
         # Parse the saved results.
         with open(self._results_filename, 'r') as results_file:
             self._blast_results = list(NCBIXML.parse(results_file))
         counter.count()
     except Exception as e:
         print('\nFailed to obtain BLAST query results from NCBI.')
         print(e)
         return False
def blast_bulk (fasta_file, settings):

	# The blast modules are imported from biopython
	from Bio.Blast import NCBIWWW, NCBIXML
	from Bio import SeqIO
	
	# parse the fasta file
	seq_list = [seq for seq in SeqIO.parse(fasta_file, 'fasta')]

	# open the fasta file
	#fasta_open = open(fasta_file, 'r')
	#fasta_handle = fasta_open.read()
	
	blast_list = []

	for seq in seq_list:
		print seq
		result_handle = NCBIWWW.qblast(settings[0], settings[1], seq.format('fasta'), megablast=settings[3], hitlist_size=settings[2])
		blast_list.append(NCBIXML.read(result_handle))
	# Blast the sequences against the NCBI nucleotide database
	# return a list with the blast results
	#result_handle = NCBIWWW.qblast(settings[0], settings[1], fasta_handle, megablast=settings[3], hitlist_size=settings[2])
	#blast_list = [item for item in NCBIXML.parse(result_handle)]	

	return blast_list
Example #18
0
def getOrthologs(seq,expect=10,hitlist_size=400,equery=None):
    """Fetch orthologous sequences using blast and return the records
        as a dataframe.

    ``seq`` is searched with blastp against nr, using ``expect`` as e-value
    cut-off, at most ``hitlist_size`` hits and the optional Entrez query
    ``equery``.  The raw XML is saved as ``my_blast.xml`` and converted with
    ``sequtils.getBlastResults``.  Hits are de-duplicated by definition and
    hits with 100% identity are dropped.

    Returns the resulting dataframe, or None when the BLAST request fails.
    """

    from Bio.Blast import NCBIXML,NCBIWWW
    from Bio import Entrez, SeqIO
    Entrez.email = "*****@*****.**"

    try:
        print('running blast..')
        # Honour the caller's hitlist_size; it used to be ignored in favour
        # of a hard-coded 500.
        result_handle = NCBIWWW.qblast("blastp", "nr", seq, expect=expect,
                              hitlist_size=hitlist_size,entrez_query=equery)
        time.sleep(2)
    except Exception:
        print('blast timeout')
        return
    with open("my_blast.xml", "w") as savefile:
        savefile.write(result_handle.read())

    with open("my_blast.xml") as result_handle:
        df = sequtils.getBlastResults(result_handle)
    # NOTE(review): assumes subject ids of the form a|b|c|accession|definition
    # - confirm against current NCBI output.
    df['accession'] = df.subj.apply(lambda x: x.split('|')[3])
    df['definition'] = df.subj.apply(lambda x: x.split('|')[4])
    df = df.drop(['subj','positive','query_length','score'], axis=1)
    print(len(df))
    df.drop_duplicates(subset=['definition'], inplace=True)
    df = df[df['perc_ident']!=100]
    print(len(df))
    return df
Example #19
0
def aa_to_mrna(aaseq):
    """Given an amino acid sequence, return the mRNA sequence, if it exists,
        from the NCBI nucleotide database."""
    result_handle = NCBIWWW.qblast("tblastn", "nr", aaseq, descriptions=10)
    result = result_handle.read()
    print result
    return result
Example #20
0
def process(hash):
    """OCR the item identified by ``hash``, BLAST the recognised text and store the outcome.

    The ``Result`` row with this hash is updated with the BLAST output and an
    error code: 0 = OK, 10 = BLAST returned nothing, 34 = OCR found no text.
    When OCR finds no text, no search is run and an empty result is stored.
    """
    print('Starting thread for:', hash)

    # The error value returned by ocr() is replaced by the codes below.
    text, error = ocr(hash)

    # Default for the no-text path; previously the BLAST handle was referenced
    # there although it had never been created.
    result = ''
    if len(text) > 0:
        start_time = timeit.default_timer()
        blast = NCBIWWW.qblast('blastn', 'nr', text)
        elapsed = timeit.default_timer() - start_time
        print('blast:', elapsed)
        result = blast.getvalue()
        if len(result) == 0:
            error = 10  # BLAST returned no results
        else:
            error = 0  # everything OK
    else:
        error = 34  # OCR found no text

    # Update the stored row with the results.
    with app.test_request_context():
        r = Result.query.filter_by(hash=hash).first()
        r.error = error
        r.result = result
        db.session.commit()

    return
def UniBLAST(code,Verbose=False):
    """
    Input
    ------
    Uniprot code
    UniBLAST(code)
    e.g. UniBLAST('O00238')

    Description
    -----------
    Downloads the FASTA record for the code and runs blastp against the
    pdb database, using the code itself as the query.

    Output
    ------
    - <code>.fasta       FASTA sequence
    - <code>_blast.xml   BLAST output in XML format

    NOTE(review): the FASTA record is fetched from the Entrez "nucleotide"
    database although the code is a protein identifier - confirm that this
    is intended.
    """

    Entrez.email = random.choice(emails)
    if(Verbose):
        print("Using email: %s"%(Entrez.email))
    net_handle = Entrez.efetch(db="nucleotide", id=code, rettype="fasta")
    try:
        with open(code + ".fasta", "w") as out_file:
            out_file.write(net_handle.read())
    finally:
        # The download handle used to be left open.
        net_handle.close()

    if(Verbose):
        print("Running blastp")
    result_handle = NCBIWWW.qblast("blastp", "pdb", code)
    if(Verbose):
        print("Done running blastp")
    try:
        with open(code + "_blast.xml", "w") as save_file:
            save_file.write(result_handle.read())
    finally:
        result_handle.close()
Example #22
0
def get_blast_alignments(seq, query):
  """BLAST ``seq`` (blastn, nr) and return the alignments covering at least 80% of it.

  The search is restricted by the Entrez query ``query`` and asks for up to
  500 hits with an e-value cut-off of 100.  For every alignment the lengths
  of all its HSPs are added up; alignments whose total is below 80% of
  ``len(seq)`` are removed from the parsed record's alignment list, which is
  then returned.
  """
  ncbi = NCBIWWW.qblast(program="blastn" , database="nr",
                      sequence=seq, entrez_query=query, format_type="XML", hitlist_size = 500, expect = 100.0)
  blast = NCBIXML.read(ncbi)
  query_length = len(seq)

  def coverage(alignment):
    # Summed HSP alignment length relative to the query length.
    total = 0.0
    for hsp in alignment.hsps:
      total += hsp.align_length
    return total / query_length

  too_short = [alignment for alignment in blast.alignments if coverage(alignment) < 0.8]
  for alignment in too_short:
    blast.alignments.remove(alignment)

  return blast.alignments
Example #23
0
def createPSSM():
    print "Start PSSM"

    #sequencelist = sequencelist.replace("-", ".")
    list = []

    for seq_record in SeqIO.parse("fastatmp", "fasta", IUPAC.unambiguous_dna):
        list.append(str(seq_record.seq))

    #Blast typical sequence
    result_handle = NCBIWWW.qblast("blastn", "nt", list[0])
    save_file = open("my_blast.xml", "w")
    save_file.write(result_handle.read())
    save_file.close()
    result_handle.close()

    #motifs.create(test, alphabet=Gapped(IUPAC.unambiguous_dna))
    m = motifs.create(list, alphabet=Gapped(IUPAC.unambiguous_dna))
    print "motif created"


    pwm = m.counts.normalize(pseudocounts=0.25)
    print "PWM done"
    pssm = pwm.log_odds()
    print "PSSM done"
    print pssm
    return pssm
Example #24
0
def get_descriptions(sample_seqs, organism):
  """Describe the tblastn hits of the samples within ``organism``.

  ``sample_seqs`` is a sequence of ``(header, sequence)`` pairs sent as one
  query.  One description is returned per BLAST record:
  ``[header, hit_id, strand, query_coverage, hsp_info]`` where ``strand`` is
  ``'/'`` when ``valid_align`` rejects the HSP frames, ``query_coverage`` is
  the summed HSP subject length divided by the sample length, and
  ``hsp_info`` lists
  ``((query_start, query_end), (sbjct_start, sbjct_end), (query, match, sbjct, frame))``
  per HSP, sorted by query start.

  When a record has several alignments only the last one is described.
  Records without alignments yield ``[header, ' ', ' ', 0.0, [], []]``
  (six elements, one more than a hit description).
  """
  query_string = ''.join(seq[0] + '\n' + seq[1] + '\n' for seq in sample_seqs)

  blast_handle = NCBIWWW.qblast('tblastn', 'nr', query_string, entrez_query=organism)
  blast_handle.seek(0)

  descs = []
  for i, record in enumerate(NCBIXML.parse(blast_handle)):
    sample = sample_seqs[i]
    if record.alignments:
      for align in record.alignments:
        frames = [hsp.frame[1] for hsp in align.hsps]
        strand = plus_or_minus(frames[0]) if valid_align(frames) else '/'

        covered = sum([len(hsp.sbjct) for hsp in align.hsps])
        query_coverage = float(covered) / len(sample[1])

        hsp_info = sorted(
            [((hsp.query_start, hsp.query_end),
              (hsp.sbjct_start, hsp.sbjct_end),
              (hsp.query, hsp.match, hsp.sbjct, hsp.frame[1]))
             for hsp in align.hsps],
            key=lambda t: t[0][0])

        desc = [sample[0], align.hit_id, strand, query_coverage, hsp_info]
    else:
      desc = [sample[0], ' ', ' ', 0.0, [], []]
    descs.append(desc)
  return descs
Example #25
0
def blast_pdb(target_sequence, num_hits=1000):
    """
    Query the PDB using NCBI blast and return MSMSeeds initialized with the results

    Parameters
    ----------
    target_sequence : String
        The sequence of the target to use to query blast
    num_hits : int, optional
        The maximum number of hits returned by BLAST. Default: 1000

    Returns
    -------
    msmseeds : list of MSMSeed objects
        One MSMSeed per alignment, built from the target sequence, the
        template sequence and structure retrieved for the alignment's
        accession, and the e-value of the alignment's first HSP.
    """
    from Bio.Blast import NCBIWWW, NCBIXML

    def seed_for(alignment):
        # Read the e-value first, then fetch the template chain.
        e_val = alignment.hsps[0].expect
        template_fasta, template_structure = _retrieve_chain(alignment.accession)
        return MSMSeed(target_sequence,template_fasta, template_structure, e_val)

    result_handle = NCBIWWW.qblast("blastp", "pdb", target_sequence, hitlist_size=num_hits)
    blast_record = NCBIXML.read(result_handle)
    return [seed_for(alignment) for alignment in blast_record.alignments]
Example #26
0
def blast_execute(record):
    """BLAST ``record.seq`` (blastn against nt) and save the raw result.

    The output is written to ``BLAST-<record.name>.xml`` inside the directory
    formed by joining ``ana_dir``, ``bla_dir`` and ``xml_dir``.
    """
    result_handle = NCBIWWW.qblast("blastn", "nt", record.seq)
    xml_path = os.path.join(ana_dir, bla_dir, xml_dir,
                            "BLAST-" + record.name + ".xml")
    save_file = open(xml_path, "w")
    save_file.write(result_handle.read())
    save_file.close()
    result_handle.close()
Example #27
0
	def blast_record(self):
		print "BLASTing record number %d ..." % int(self.genbank_record_number)
		result_handle = NCBIWWW.qblast("blastp", "nr", self.genbank_record_number)
		print "extracting result..."
		self.xml_result = result_handle.read()
		result_handle.close()
		return self.xml_result
Example #28
0
def blastdemo(genbankID):
    """Run blastp for ``genbankID`` against swissprot and print the strong hits.

    Every HSP with an e-value below 1e-17 is printed with the alignment
    title, length, e-value and the first 75 columns of the query, match and
    subject lines.  Afterwards the length of every alignment is printed.
    """
    # run blastp on the swissprot database NB to scale this up we must do it locally on cluster
    result_handle = NCBIWWW.qblast("blastp", "swissprot", genbankID)
    # read the results as XML
    blast_record = NCBIXML.read(result_handle)

    # Set this value to ridiculously low
    E_VALUE_THRESH = 0.00000000000000001
    # Display every HSP whose e-value is below the threshold.  Each print
    # takes one pre-formatted string so the output is the same text whether
    # print is a statement or a function (a parenthesised argument list
    # would be printed as a tuple by the print statement).
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < E_VALUE_THRESH:
                print("****Alignment****")
                print("sequence: %s" % (alignment.title,))
                print("length: %s" % (alignment.length,))
                print("e value: %s" % (hsp.expect,))
                print(hsp.query[0:75] + "...")
                print(hsp.match[0:75] + "...")
                print(hsp.sbjct[0:75] + "...")
                print("\n")

    for a in blast_record.alignments:
        print(a.length)
def blast_online(rec, result_xml_fpath):
    """BLAST ``rec`` (blastp against refseq_protein) and save the XML result.

    The request is repeated for as long as it fails with an HTTP or URL
    error; each failure is logged as a warning.

    Returns:
        0 once the result has been written to ``result_xml_fpath``;
        1 when interrupted (KeyboardInterrupt, SystemExit or GeneratorExit).
    """
    from Bio.Blast import NCBIWWW

    retrying = False
    while True:
        try:
            print(rec.format('fasta'))
            result_xml_f = NCBIWWW.qblast('blastp', 'refseq_protein', rec.format('fasta'),
                                          hitlist_size=10)
            with open(result_xml_fpath, 'w') as save_f:
                save_f.write(result_xml_f.read())
            # Done.  Without this return the loop submitted the same query
            # again and again.
            return 0

        except urllib2.HTTPError as e:
            log.warn('     Warning: could not blast through web. HTTPError: %s. Code %s. '
                     '(You can press Ctrl+C to interrupt and continue later).'
                     % (e.msg, str(e.code)))
            retrying = True

        except urllib2.URLError as e:
            log.warn('     Warning: could not blast through web. URLError: %s. '
                     '(You can press Ctrl+C to interrupt and continue later).'
                     % (e.args))
            retrying = True

        except (KeyboardInterrupt, SystemExit, GeneratorExit):
            if retrying:
                log.info('     If you restart from this step and do not remove the "%s" directory, '
                         'the process will continue from here.' % blasted_singletones_dir)
            return 1
Example #30
0
def blast_remos( r, db = 'nr' ):
    """Uses blast to find remos in a genome"""
    from Bio.Blast import NCBIWWW, NCBIXML
    import cStringIO
    b_parser = NCBIXML.BlastParser()
    E_VALUE_THRESH = 0.04
    for s in r.get_aligned_sequences():
        for remo in r.get_remos_for( s ):
            seq = remo.get_sequence_for( s.centre_sequence, False )
            print 'Blasting: %s...' % ( seq[:60] )
            result_handle = NCBIWWW.qblast( 'blastn', db, seq )
            blast_results = result_handle.read()
            blast_out = cStringIO.StringIO(blast_results)
            b_record = b_parser.parse(blast_out)
            for alignment in b_record.alignments:
                for hsp in alignment.hsps:
                    if hsp.expect < E_VALUE_THRESH:
                        print '****Alignment****'
                        print 'sequence:', alignment.title
                        print 'length:', alignment.length
                        print 'e value:', hsp.expect
                        print 'sbjct_start:', hsp.sbjct_start
                        print hsp.query[0:75] + '...'
                        print hsp.match[0:75] + '...'
                        print hsp.sbjct[0:75] + '...'
            break
        break
Example #31
0
def blast_with_GIs(GI_seqIDs):
    """Blast every GI identifier against "nt" and save each text report.

    Based in part on the Biopython cookbook example.

    Each successful search is written to ``blast_results_<n>.txt``; ``n``
    starts at 1 and only advances after a report has been written.  A
    failed search is reported on stdout and skipped.  The loop pauses one
    second after every identifier, successful or not.

    Args:
        GI_seqIDs: iterable of GI identifiers, each submitted as the query.
    """
    counter = 1
    for GI_ID in GI_seqIDs:
        try:
            result = NCBIWWW.qblast("blastn", "nt", GI_ID, format_type="Text")
            try:
                blast_results = result.read()
            finally:
                result.close()
            with open("{}_{}.txt".format("blast_results", counter),
                      "w") as outfile:
                outfile.write(blast_results)
            counter += 1
        except Exception:
            # Exception rather than a bare except, so Ctrl+C and SystemExit
            # still stop the batch instead of being reported as a miss.
            print("No sequence available for gi|{}".format(GI_ID))
        time.sleep(1)
Example #32
0
def do_blast(seq, organism, eVal):
    """Run a blastp search against "nr", retrying whenever it times out.

    A 120-second alarm is armed before every attempt.  When
    ``TimeoutException`` is raised the search is submitted again.
    (presumably a SIGALRM handler installed elsewhere raises it -- verify)

    Args:
        seq: query sequence passed to qblast.
        organism: Entrez query restricting the search.
        eVal: expect-value threshold.

    Returns:
        The handle returned by ``NCBIWWW.qblast``.
    """
    try:
        while True:
            signal.alarm(120)
            try:
                return NCBIWWW.qblast("blastp",
                                      "nr",
                                      seq,
                                      entrez_query=organism,
                                      expect=eVal)
            except TimeoutException:
                print("Server timeout, trying again")
    finally:
        # Always disarm the alarm, including when an unrelated exception
        # escapes, so it cannot fire later in the caller's code.
        signal.alarm(0)
Example #33
0
def blastProt(database, file_name, file_format):
    """Blast the single record in *file_name* with blastp and save the report.

    The report is written to ``blast-Prot<d>.xml`` where ``<d>`` is the
    first digit found in *file_name* (empty when the name has no digit).

    Args:
        database: BLAST database name.
        file_name: path of a file holding exactly one sequence record.
        file_format: SeqIO format name, used for both reading and re-formatting.

    Returns:
        The name of the XML file that was written.
    """
    with open(file_name) as sequence_file:
        record = SeqIO.read(sequence_file, format=file_format)

    print("BLAST runnning")
    result_handle = NCBIWWW.qblast("blastp", database, record.format(file_format))
    print("BLAST finnished")

    try:
        # A name without digits used to crash on mo.group() after the
        # search had already been paid for; fall back to no suffix instead.
        mo = re.search(r"\d", file_name)
        number = mo.group() if mo else ""

        xml_file = "blast-Prot" + number + ".xml"
        with open(xml_file, "w") as save_file:
            save_file.write(result_handle.read())
    finally:
        result_handle.close()

    return xml_file
Example #34
0
def main():
    """Blast every file whose name contains ".fa" in the megahit directory.

    Changes the working directory to a hard-coded path, submits the full
    content of each matching file to blastn against "nt" (text output, at
    most 10 hits) and writes the report to ``<prefix>_result.txt``, where
    ``<prefix>`` is the file name up to its first dot.
    """
    from Bio.Blast import NCBIWWW
    import os

    print("Start reading FASTA files...")
    os.chdir("/home/yikylee/Desktop/megahit")
    list_dir = [name for name in os.listdir() if ".fa" in name]
    for fasta in list_dir:
        _prefix = fasta.split(".")[0]
        print("Now " + _prefix + "...")
        with open(fasta) as fasta_file:
            sequence_data = fasta_file.read()
        result_handle = NCBIWWW.qblast("blastn", "nt", sequence_data, format_type="Text", hitlist_size=10)
        print(_prefix + " analysis completed.")
        try:
            with open("./" + _prefix + "_result.txt", "w") as save_to:
                save_to.write(result_handle.read())
        finally:
            result_handle.close()
Example #35
0
def buscaNcbi(query):
    """Run a blastp search for *query* against "nr" and save the XML report.

    The report is written to ``arq_file + ".xml"``; ``arq_file`` and
    ``path`` are names defined outside this function.  A ``ValueError``
    raised along the way is reported on stdout, followed by a five-second
    pause.

    "blastp" searches proteins; "blastn" would search nucleotides.
    """
    try:
        print("Buscando arquivo...")
        blast_result = NCBIWWW.qblast("blastp", "nr", query)
        try:
            with open(arq_file + ".xml", "w") as blast_out:
                blast_out.write(blast_result.read())
        finally:
            blast_result.close()
        # NOTE(review): the message names query + ".xml" although the file
        # written above is arq_file + ".xml" -- confirm which is intended.
        print("Fim da busca.\nArquivo " + query +
              ".xml encontra-se disponível no diretório '" + path +
              "' para análise")

    except ValueError:
        print("\nProteína inexistente ou inválida\n\n\n")
        time.sleep(5)
Example #36
0
def genebank_sequence(name):
    """Blast every nucleotide record matching *name* and append the reports.

    Looks *name* up in the Entrez "nucleotide" database, prints the list of
    matching ids, then submits each id to blastn against "nt" (text
    output).  Every report is appended to ``outputfile.txt``; an id whose
    search raises ``ValueError`` is skipped.

    Args:
        name: Entrez search term.
    """
    esearch_query = Entrez.esearch(db="nucleotide",
                                   term=name,
                                   retmode="xml")
    try:
        esearch_result = Entrez.read(esearch_query)
    finally:
        esearch_query.close()
    sequenc_entry = esearch_result['IdList']
    print(sequenc_entry)
    for i in sequenc_entry:
        try:
            result_blast = NCBIWWW.qblast("blastn", "nt", i,
                                          format_type='Text')
            try:
                output = result_blast.read()
            finally:
                result_blast.close()
        except ValueError:
            # Best effort: an id that cannot be searched is simply skipped.
            continue
        time.sleep(1)
        with open("outputfile.txt", "a") as outfile:
            outfile.write(output)
Example #37
0
def perform_blast(output, program, database, sequence, hitlist_size):
    """Run one BLAST search and append its report to *output*.

    Called from blast_controller.

    Args:
        output: path of the file the report is appended to.
        program: BLAST program to use.
        database: database to search.
        sequence: query sequence.
        hitlist_size: maximum number of hits.
    """
    handle = NCBIWWW.qblast(program=program,
                            database=database,
                            sequence=sequence,
                            hitlist_size=hitlist_size)
    try:
        # The with-block closes the output file; no explicit close() needed.
        with open(output, "a") as out_handle:
            out_handle.write(handle.read())
    finally:
        handle.close()
Example #38
0
def net_blast(query_record, program='blastn', database='nr'):
    """Validate a query, then run it through the online BLAST service.

    Checks are applied in this order, each raising ValueError on failure:
    the query is a SeqRecord, its sequence has at least 10 residues, the
    lower-cased program is a key of ``searches``, the sequence alphabet is
    an instance of the class that program requires, the database is listed
    in ``protein_db`` or ``nucleotide_db``, and the database is allowed for
    the program.

    ARGUMENTS
    query_record: a SeqRecord object containing the query sequence
    program: the BLAST program to use (case-insensitive)
    database: the database to query

    Returns the iterator produced by NCBIXML.parse over the XML results.
    """
    # --- the query itself ---
    if not isinstance(query_record, SeqRecord):
        raise ValueError(u'Invalid Search Item')
    if len(query_record.seq) < 10:
        raise ValueError(u"Query sequence is too short")

    # --- the program ---
    program = program.lower()
    if program not in searches:
        raise ValueError(u"Invalid Program '%s'" % program)

    # --- alphabet and database compatibility ---
    alphabet_type, allowed_dbs = searches[program]
    if not isinstance(query_record.seq.alphabet, alphabet_type):
        raise ValueError(u"Query alphabet for '%s' must be '%s'" %
                         (program, alphabets[program]))
    is_known_db = database in protein_db or database in nucleotide_db
    if not is_known_db:
        raise ValueError(u"Invalid database '%s'" % database)
    if database not in allowed_dbs:
        raise ValueError(u"Database '%s' cannot be used with program '%s'" %
                         (database, program))

    # Everything checked out: submit the search and hand back the parser.
    xml_handle = NCBIWWW.qblast(program,
                                database,
                                query_record.seq,
                                format_type='XML')
    return NCBIXML.parse(xml_handle)
Example #39
0
def blast_file(fasta_path, blast_db='nt', parser=basic_parser):
    """Blast the whole content of a FASTA file and parse each result.

    Args:
        fasta_path: path of the FASTA file; its full text is the query.
        blast_db: database to search.
        parser: callable applied to every parsed BLAST record.

    Returns:
        A list with one ``parser(record)`` value per record in the
        response, in order.
    """
    logging.info("Running BLAST {}".format(fasta_path))
    with open(fasta_path, 'r') as fasta_file:
        fasta_string = fasta_file.read()
    logging.debug(fasta_string)
    result_handle = NCBIWWW.qblast(BLAST_PROG,
                                   blast_db,
                                   fasta_string,
                                   megablast=True)
    try:
        logging.info("BLAST returned")
        blast_records = NCBIXML.parse(result_handle)
        # Logged before the records are consumed in the comprehension below.
        logging.info("Analyzed BLAST")
        return [parser(single_record) for single_record in blast_records]
    finally:
        result_handle.close()
Example #40
0
def getBLAST(arg):
    """Run a BLAST search and print one "$"-separated line for the first good HSP.

    ``arg`` is indexed from 1 to 7 for, in order: program, database,
    sequence, expect, hitlist_size, matrix_name and alignments.

    At most one line is printed, for the first HSP whose e-value is below
    0.0001.  Its fields are organism name, protein, accession, e-value,
    identities, identities as a percentage of the HSP query length, score
    and bit score, all parsed from the alignment title and the HSP.
    Iteration over results stops once exactly one alignment has been seen
    in total.
    """
    xml_handle = NCBIWWW.qblast(program=arg[1],
                                database=arg[2],
                                sequence=arg[3],
                                expect=arg[4],
                                hitlist_size=arg[5],
                                matrix_name=arg[6],
                                alignments=arg[7])

    parsed_results = NCBIXML.parse(xml_handle)

    expect_cutoff = 0.0001
    # Starts at 1 and is bumped to 2 after the first printed line.
    results_counter = 1
    alignments_seen = 0

    for blast_result in parsed_results:
        for alignment in blast_result.alignments:
            for hsp in alignment.hsps:
                if hsp.expect < expect_cutoff and results_counter < 2:
                    # Header of the BLAST result
                    header = str(alignment.title)
                    # Organism name: the text inside the first [...] pair
                    organism = header.split('[', 1)[1].split(']')[0].split('>')[0]
                    protein = header.split('|')[4].split('[')[0]
                    accession = alignment.title.split('|')[3]
                    query_cov = float(hsp.identities) / float(len(
                        hsp.query)) * float(100)

                    fields = (organism, protein, accession, hsp.expect,
                              hsp.identities, query_cov, hsp.score, hsp.bits)
                    print("$".join([str(field) for field in fields]))
                    results_counter += 1

                if results_counter >= 2:
                    break
            alignments_seen += 1
        if alignments_seen == 1:
            break
Example #41
0
def blast_offtarget(fasta_string):
    """Count off-target hits for a sequence using BLAST.

    A local ``blastn`` run against ``refseq_rna`` is tried first; the
    result is the number of output lines mentioning "Homo sapiens".  When
    the local run raises ``ApplicationError`` the online service is queried
    instead, restricted to taxid 9606, and the result is the total number
    of alignments returned.

    Args:
        fasta_string(str): Fasta sequence.

    Returns:
        Offtarget value(int).
    """
    try:
        with blast_path():
            with open('fasta', 'w') as fasta_file:
                fasta_file.write(fasta_string)
            # The two literals are concatenated, so the first must end with
            # a space to keep "staxids" and "sscinames" separate columns.
            cline = NcbiblastnCommandline(
                query="fasta",
                db="refseq_rna",
                outfmt=("'6 qseqid sseqid evalue bitscore sgi sacc staxids "
                        "sscinames scomnames stitle'"))
            stdout, _ = cline()

        return sum(1 for line in stdout.split('\n') if 'Homo sapiens' in line)
    except ApplicationError:
        result_handle = NCBIWWW.qblast("blastn",
                                       "refseq_rna",
                                       fasta_string,
                                       entrez_query="txid9606 [ORGN]",
                                       expect=100,
                                       gapcosts="5 2",
                                       genetic_code=1,
                                       hitlist_size=100,
                                       word_size=len(fasta_string),
                                       megablast=True)
        try:
            # Parse the returned handle directly; no in-memory copy needed.
            return sum(len(record.alignments)
                       for record in NCBIXML.parse(result_handle))
        finally:
            result_handle.close()
def BlastFastaXmlIndv(fasta_filename=None, xml_filename=None):
    """Summarise possible recombinants from a FASTA file or a saved BLAST XML.

    With *fasta_filename*, every record is blasted online (blastn against
    "nr", Entrez query "KM204118.1").  A search that raises ``ValueError``
    is retried after a pause that doubles each time; once the pause has
    grown beyond 100 seconds the program exits.  With *xml_filename*, the
    records are read from that file instead.  *fasta_filename* wins when
    both are given.

    Rows are written, tab-separated, to ``<input name>.summary.tsv`` for
    each record that passes ``FilterBlastRecord`` and ``CheckPossibleRecomb``.

    Raises:
        ValueError: when neither file name is given.
    """
    if fasta_filename:
        with open(fasta_filename + ".summary.tsv", 'w') as output_table:
            outputWriter = csv.writer(output_table, delimiter="\t")
            for seq_record in SeqIO.parse(fasta_filename, "fasta"):
                wait_time = 1
                while True:
                    print(seq_record.id)
                    try:
                        result_handle = NCBIWWW.qblast("blastn",
                                                       "nr",
                                                       seq_record.seq,
                                                       entrez_query="KM204118.1")
                        break
                    except ValueError:
                        print("Error encountered")
                        print("Trying again in " + str(wait_time) + " seconds")
                        if wait_time > 100:
                            sys.exit()
                        time.sleep(wait_time)
                        wait_time *= 2

                try:
                    blast_record = NCBIXML.read(result_handle)
                finally:
                    result_handle.close()
                _write_if_recombinant(outputWriter, blast_record)

    elif xml_filename:
        with open(xml_filename + ".summary.tsv", 'w') as output_table:
            outputWriter = csv.writer(output_table, delimiter="\t")
            with open(xml_filename) as result_handle:
                for blast_record in NCBIXML.parse(result_handle):
                    _write_if_recombinant(outputWriter, blast_record)

    else:
        # Previously this fell through to output_table.close() and failed
        # with an UnboundLocalError; fail with a clear message instead.
        raise ValueError("either fasta_filename or xml_filename is required")


def _write_if_recombinant(outputWriter, blast_record):
    """Write a summary row when the record's filtered HSPs pass the recombination check."""
    filteredHspStartEnds = FilterBlastRecord(blast_record)
    if filteredHspStartEnds and CheckPossibleRecomb(filteredHspStartEnds):
        WriteARow(outputWriter, blast_record, filteredHspStartEnds)
Example #43
0
    def fillDomainsBLAST(self):
        '''
        Replace vague protein descriptions with the first informative BLAST hit.

        For every species in self.phagesProteins that is not yet recorded
        in the progress file "files/phage_list_blast", each protein whose
        description contains "hypothetical", "uncharacterized" or "unknown"
        is searched with blastp against "nr".  The description is replaced
        by the first hit definition (text before " [") that is neither
        hypothetical nor uncharacterized.  After each species the progress
        file is rewritten and self.saveDomains() is called.

        :return: None; self.phagesProteins is updated in place
        '''
        print('Finding functions/domains with BLAST')
        from Bio.Blast import NCBIWWW
        from Bio.Blast import NCBIXML
        import pickle
        from pathlib import Path
        vague_terms = ('hypothetical', 'uncharacterized', 'unknown')
        my_file = Path("files/phage_list_blast")
        if my_file.is_file():
            # NOTE(review): pickle.load runs arbitrary code from the file;
            # acceptable only while this file is written solely below.
            with open('files/phage_list_blast', 'rb') as f:
                list_done = pickle.load(f)
        else:
            list_done = []
        for spec in self.phagesProteins:
            if spec in list_done:
                continue
            for prot in self.phagesProteins[spec]:
                description = self.phagesProteins[spec][prot][0].lower()
                if not any(term in description for term in vague_terms):
                    continue
                result_handle = NCBIWWW.qblast(
                    'blastp',
                    'nr',
                    self.phagesProteins[spec][prot][1],
                    entrez_query=
                    'Acinetobacter baumannii (taxid:470), Escherichia coli (taxid:562), Klebsiella pneumonia (taxid:573)'
                )
                try:
                    blastout = NCBIXML.read(result_handle)
                finally:
                    result_handle.close()
                for ali in blastout.alignments:
                    hit = ali.hit_def.lower()
                    if 'hypothetical' in hit or 'uncharacterized' in hit:
                        continue
                    # split() keeps the whole definition when " [" is
                    # absent; slicing with find() == -1 dropped the last
                    # character.
                    function_name = ali.hit_def.split(' [', 1)[0]
                    print(function_name)
                    self.phagesProteins[spec][prot][0] = function_name
                    break
            list_done.append(spec)
            with open('files/phage_list_blast', 'wb') as f:
                pickle.dump(list_done, f)
            self.saveDomains()
Example #44
0
 def execute(self, seqRecord, outFormat):
     """Blast *seqRecord* against every configured database.

     The record is sent in FASTA format.  Program, e-value cutoff, filter,
     hit/alignment/description limits, matrix and megablast flag are all
     read from self.params.  *outFormat* is not used here.

     Returns a list with the raw report text for each entry of
     self.params["db"], in order.
     """
     from Bio.Blast import NCBIWWW
     from Bio.Blast import NCBIXML
     fasta_text = seqRecord.format("fasta")

     def search(database):
         # One online search; the report is read out of the handle at once.
         handle = NCBIWWW.qblast(self.params['blast'],
                                 database,
                                 fasta_text,
                                 expect=float(self.params['cutoff']),
                                 filter=self.params['filter'],
                                 hitlist_size=int(self.params['nhits']),
                                 matrix_name=self.params['matrix'],
                                 alignments=int(self.params['nalign']),
                                 descriptions=int(self.params['ndesc']),
                                 megablast=self.params['megablast'])
         return handle.read()

     return [search(database) for database in self.params["db"]]
Example #45
0
    def call_blast(self, input_file, organism):
        """Submit *input_file* to online blastn against an organism-specific database.

        The organism codes "mm", "hs" and "rn" select the database held in
        the matching attribute of this object; any other value searches
        "nt".  The search uses hitlist_size 10, expect 1000, word_size 7
        and gap costs "5 2".

        Returns the handle produced by NCBIWWW.qblast.
        """
        organism_attributes = (
            ("mm", "mus_musculus_blast_db"),
            ("hs", "homo_sapiens_blast_db"),
            ("rn", "rattus_norvegicus_blast_db"),
        )
        for code, attribute in organism_attributes:
            if organism == code:
                target_db = getattr(self, attribute)
                break
        else:
            target_db = "nt"

        return NCBIWWW.qblast("blastn",
                              target_db,
                              input_file,
                              hitlist_size=10,
                              expect=1000,
                              word_size=7,
                              gapcosts="5 2")
Example #46
0
 def make_blast(self):
     """Blast every protein of the FASTA file, restricted to Homo sapiens.

     Each record is submitted to blastp against the configured database
     with an Entrez organism filter.  Every report, followed by a newline,
     is written to the output file, and the time each search took is
     printed.
     """
     records = SeqIO.parse(self.__file_prot, "fasta")
     with open(self.__out, "w") as save_file:
         for record in records:
             beginning = time.time()
             result_handle = NCBIWWW.qblast(
                 "blastp",
                 self.__db,
                 record.format("fasta"),
                 entrez_query='Homo sapiens [organism]')
             try:
                 save_file.write(result_handle.read() + "\n")
             finally:
                 result_handle.close()
             end = time.time()
             print(
                 "A proteína %s já foi submetida ao blast e demorou %s segundos. "
                 % (record.id, end - beginning))
     records.close()
def main():
    """Blast the first input sequence with blastp against "nr" and print the report.

    The sequence is the first item returned by ``hf.Get_sequences()``.
    Nothing is searched or printed when that item is falsy.
    """
    sequence = hf.Get_sequences()[0]
    if not sequence:
        return

    # NCBIWWW.qblast arguments: the program ("blastp"), the database ("nr")
    # and the query sequence.  See chapter 7.1 of the Biopython tutorial:
    # http://biopython.org/DIST/docs/tutorial/Tutorial.html
    report = NCBIWWW.qblast("blastp", "nr", sequence.seq).read()
    print(report)
Example #48
0
def assign(seqs, outfile, title):
    """Blast all sequences of a FASTA file at once and run the assignment step.

    Gap characters ("-") are stripped from every sequence, the records are
    sent to blastp against "nr" as a single multi-FASTA query, and the
    report is saved to ``<title>_result.txt``.  That file is then handed to
    ``simple_blast_assignment`` together with *seqs*, *outfile* and *title*.
    """
    records = list(SeqIO.parse(seqs, "fasta"))
    fasta_entries = [
        ">{}\n{}".format(record.id, str(record.seq.ungap("-")))
        for record in records
    ]
    result_handle = NCBIWWW.qblast("blastp", "nr", "\n".join(fasta_entries))

    report_path = "{}_result.txt".format(title)
    with open(report_path, 'w') as results:
        print(result_handle.getvalue(), file=results)

    simple_blast_assignment(seqs, report_path, outfile, title=title)
Example #49
0
def get_blast_results(fasta_filename, blast_type="blastn", db="nt"):
    """Submit a FASTA file to NCBI BLAST and return the parsed records.

    Args:
        fasta_filename (str): Path of the FASTA file whose text is the query.
        blast_type (str): BLAST program to run ("blastn", "blastp", etc.).
        db (str): BLAST database to search.

    Return:
        list of the records yielded by NCBIXML.parse
    """
    with open(fasta_filename, 'r') as fasta_file:
        query_text = fasta_file.read()

    xml_handle = NCBIWWW.qblast(blast_type, db, query_text)
    return list(NCBIXML.parse(xml_handle))
Example #50
0
def get_blast_record(seq, alignments, descriptions, hitlist_size):
    """Run a blastp search against "nr" and return the single parsed record.

    Args:
      seq: protein sequence as string
      alignments: max number of alignments from BLAST
      descriptions: max number of descriptions to show
      hitlist_size: max number of hits to return

    Returns:
      the record produced by NCBIXML.read

    """
    search_options = {
        "program": "blastp",
        "database": "nr",
        "alignments": alignments,
        "descriptions": descriptions,
        "hitlist_size": hitlist_size,
        "sequence": seq,
    }
    return NCBIXML.read(NCBIWWW.qblast(**search_options))
Example #51
0
def online_blast(seq_list):
    """Blast a list of sequence records online and return the result handle.

    The records are serialised to FASTA in memory.  Program, database,
    megablast flag and hit-list size come from the ``args`` object defined
    outside this function.
    """
    # Build the FASTA text in memory rather than in a file on disk.
    fasta_buffer = StringIO.StringIO()
    SeqIO.write(seq_list, fasta_buffer, 'fasta')
    fasta_text = fasta_buffer.getvalue()

    logging.debug('BLASTING sequences agaist NCBI')
    return NCBIWWW.qblast(args.ba,
                          args.bd,
                          fasta_text,
                          megablast=args.mb,
                          hitlist_size=args.hs)
Example #52
0
 def execute_blast(self, records, output, newlist):
     """Blast every gene in self.newlist and write one text report per gene.

     Changes the working directory to *output*, indexes self.fastapath as
     FASTA into self.records, then submits each gene's sequence with the
     configured program and database (text output, at most 15 hits, expect
     0.0001, metazoa only).  Each ``<gene>_result_handle.txt`` holds the
     gene name, its FASTA record and the BLAST report.

     *records* and *newlist* are accepted for interface compatibility; the
     genes and sequences come from self.newlist and self.records.
     """
     os.chdir(output)
     self.records = SeqIO.index(self.fastapath, "fasta")
     for gene in self.newlist:
         print("Blasting gene " + gene + " against the " +
               self.db + " database.")
         result_handle = NCBIWWW.qblast(self.search,
                                        self.db,
                                        self.records[gene].seq,
                                        format_type='Text',
                                        hitlist_size=15,
                                        expect=0.0001,
                                        entrez_query='metazoa[Organism]')
         try:
             with open('{0}_result_handle.txt'.format(gene), 'w') as f:
                 f.write('Gene: ' + gene + '\n\n\n')
                 # Same gene as the one searched: the original indexed the
                 # *newlist* parameter here but self.newlist everywhere else.
                 f.write('Seq:\n' + self.records[gene].format('fasta'))
                 f.write(result_handle.read())
         finally:
             result_handle.close()
Example #53
0
def protein_blast(protein,
                  criteria,
                  threshold,
                  filename='blast.fasta',
                  db='swissprot'):
    """Run a blastp search and save the hits that pass a positives filter.

    For every alignment only its first HSP is considered.  The hit is
    written as a FASTA entry (hit id plus subject sequence) when the HSP's
    positives, as a percentage of its alignment length, reach *threshold*.

    Args:
        protein: query passed to qblast.
        criteria: Entrez query restricting the search.
        threshold: minimum percentage of positives (0-100).
        filename: path of the FASTA file to write.
        db: BLAST database to search.
    """
    handle = NCBIWWW.qblast('blastp', db, protein, entrez_query=criteria)
    try:
        result = NCBIXML.read(handle)
    finally:
        handle.close()

    # The with-block closes the file; the original `out.close` lacked its
    # parentheses and never ran.
    with open(filename, 'w') as out:
        for alignment in result.alignments:
            sequence = alignment.hsps[0]
            percent_positives = (float(sequence.positives) /
                                 sequence.align_length * 100.0)
            if percent_positives >= threshold:
                out.write('>' + alignment.hit_id + '\n' + sequence.sbjct + '\n\n')
Example #54
0
 def blastp(self, acc):
     """Blast *acc* with blastp against "nr" and collect new hit GI numbers.

     The second "|"-separated field of every alignment title is taken as a
     GI number.  Numbers not already in self.gis are appended to it, each
     one once.  If anything fails, self.status[acc] is set to False unless
     it already has a value, and self.gis is left unchanged.
     """
     try:
         gis = []
         print('here')
         result_handle = NCBIWWW.qblast("blastp",
                                        "nr",
                                        acc,
                                        format_type="XML",
                                        expect=self.blast_threshold)
         print('here')
         for blast_record in NCBIXML.parse(result_handle):
             for alignment in blast_record.alignments:
                 gis.append(alignment.title.split("|")[1])
         # Build the full list first so a bad value leaves self.gis intact.
         unique = []
         for raw_gi in gis:
             gi = int(raw_gi.strip())
             if gi not in self.gis and gi not in unique:
                 unique.append(gi)
         self.gis.extend(unique)
     except Exception:
         # Exception rather than a bare except, so Ctrl+C and SystemExit
         # are not silently recorded as a failed search.
         self.status.setdefault(acc, False)
     return
Example #55
0
def search_blast(protien, numHits=50):
    """Run a blastp search against "nr" and return the hit definitions.

    The XML report is saved to ``my_blast.xml``.

    Args:
        protien: query passed to qblast.
        numHits: maximum number of hits requested (converted with int()).

    Returns:
        List with the text of every ``<Hit_def>`` element, in report order.
    """
    import re

    # XML must be requested: the tags parsed below never occur in an HTML
    # report, which made the function always return an empty list.
    result_handle = NCBIWWW.qblast("blastp",
                                   "nr",
                                   protien,
                                   hitlist_size=int(numHits),
                                   format_type='XML')
    try:
        data = result_handle.read()
    finally:
        result_handle.close()

    with open("my_blast.xml", "w") as save_file:
        save_file.write(data)

    return re.findall(r"<Hit_def>(.*?)</Hit_def>", data, re.DOTALL)
Example #56
0
def cli(input, output):
    """BLAST every ``*.fasta`` file in a directory and write out the HSPs.

    Each file's content is submitted to blastn against "nt" with at most
    five hits.  Every HSP of every result is written to *output* under a
    "Result #<n>" heading; numbering restarts at 1 for each result.

    Args:
        input: directory searched for ``*.fasta`` files.
        output: writable text stream receiving the report.
    """
    for filename in glob.glob(os.path.join(input, '*.fasta')):
        print(filename)
        # Plain 'r' mode: 'rU' is rejected by Python 3.11 and later.
        with open(filename, 'r') as fasta_handle:
            query = fasta_handle.read()
        result_handle = NCBIWWW.qblast('blastn', 'nt', query, hitlist_size=5)
        # Results are streamed from the parser rather than stored.
        for result in SearchIO.parse(result_handle, 'blast-xml'):
            for i, hsp in enumerate(result.hsps, 1):
                output.write('Result #' + str(i) + '\n')
                output.write(str(hsp) + '\n\n')
    print('Done')
Example #57
0
def processFasta(fastaFile, resultDirectory):
    """Blast the single record of a FASTA file with blastx and process the report.

    The XML report (expect 1e-10, at most 5 hits) is saved in
    *resultDirectory* under the FASTA file's base name with an ``.xml``
    extension.  The saved file is then reopened and passed to
    ``handelBlastResult`` together with that base name.

    Args:
        fastaFile: path of a FASTA file holding exactly one record.
        resultDirectory: directory receiving the XML report.
    """
    print("Writing to directory ==> " + resultDirectory)
    record = SeqIO.read(fastaFile, format="fasta")
    result_handle = NCBIWWW.qblast("blastx",
                                   "nr",
                                   record.format("fasta"),
                                   expect=1e-10,
                                   hitlist_size=5)
    genomeName = os.path.basename(fastaFile).rsplit('.', 1)[0]
    # os.path.join works with or without a trailing separator on the
    # directory; plain concatenation needed one to be present.
    location = os.path.join(resultDirectory, genomeName + '.xml')

    try:
        with open(location, "w") as save_file:
            save_file.write(result_handle.read())
    finally:
        result_handle.close()

    with open(location) as saved_report:
        handelBlastResult(saved_report, genomeName)
Example #58
0
def find_homologues(protACC, max_number=10, filename="blast.xml"):
    """
    Run a protein BLAST for an accession and list the accessions of its hits.

    protACC is the query protein accession number, max_number the maximum
    number of hits requested (default 10) and filename the file the report
    is saved to before it is parsed again.  Returns the accession of every
    alignment in the parsed record.
    """
    search_handle = NCBIWWW.qblast("blastp",
                                   "nr",
                                   protACC,
                                   hitlist_size=max_number)
    with open(filename, "w") as report:
        report.write(search_handle.read())

    with open(filename) as saved_report:
        parsed = NCBIXML.read(saved_report)
    return [hit.accession for hit in parsed.alignments]
Example #59
0
def blast(sequence, vorm='blastp'):
    """Run an online BLAST search against "nr", retrying until it succeeds.

    Every attempt starts with a five-second pause.  The search uses expect
    1e-5, the BLOSUM62 matrix, word size 3, XML output and at most 5 hits.

    Args:
        sequence: query sequence.
        vorm: BLAST program name.

    Returns:
        For ``vorm == 'blastp'`` the raw result handle; otherwise the first
        parsed record, or None when the response holds no record.
    """
    # A loop instead of the original recursive retry: the recursive call
    # passed (vorm, sequence) in swapped order and grew the stack on every
    # failure.
    while True:
        try:
            time.sleep(5)
            result_handle = NCBIWWW.qblast(vorm,
                                           'nr',
                                           sequence,
                                           expect=(1 * (10**-5)),
                                           matrix_name='BLOSUM62',
                                           word_size=3,
                                           format_type='XML',
                                           hitlist_size=5)
            if vorm == 'blastp':
                return result_handle
            for blast_record in NCBIXML.parse(result_handle):
                return blast_record
            return None
        except Exception:
            # Exception rather than a bare except, so Ctrl+C can still
            # interrupt the otherwise endless retrying.
            continue
Example #60
0
def blastx_blasten(sequentie):
    """Run a blastx search and return a printable summary of the first record.

    The search runs against "nr" (alignments=1, hitlist_size=10).  The
    report is saved to "Resultaat.xml" and parsed back from that file.
    For every HSP of every alignment in the first record the summary gains:
    a marker line, the organism (text inside the first [...] of the title),
    the protein (third "|"-separated field up to "["), the full title, the
    alignment length, the e-value, the first 75 characters of the query,
    match and subject lines, and a newline entry.

    :param sequentie: the query sequence
    :return: blast_resultaat, the list of summary strings
    :return: titel, the title of the last alignment that had an HSP, or an
        empty string when there was none
    """
    titel = ''
    blast_resultaat = []
    # Search first, so a failed search does not truncate an existing report.
    result_handle = NCBIWWW.qblast("blastx",
                                   "nr",
                                   sequentie,
                                   alignments=1,
                                   hitlist_size=10)
    with open("Resultaat.xml", "w") as bestand:
        bestand.write(result_handle.getvalue())

    with open("Resultaat.xml", "r") as saved_report:
        blast_record = next(NCBIXML.parse(saved_report))

    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            blast_resultaat.append("****Alignment****")
            titel = alignment.title
            organism = titel.split("[")[1].split("]")[0]
            protein = titel.split("|")[2].split("[")[0]
            blast_resultaat.extend([
                "Blast organism: {}".format(organism),
                "Protein: {}".format(protein),
                "Sequence: {}".format(alignment.title),
                "Length: {}".format(alignment.length),
                "E-value: {}".format(hsp.expect),
                hsp.query[0:75] + "...",
                hsp.match[0:75] + "...",
                hsp.sbjct[0:75] + "...",
                "\n",
            ])

    return blast_resultaat, titel