def blastBACTEUK(arg):
    """BLAST every FASTA record in ``arg`` against prokaryotes and eukaryotes.

    Each record is submitted twice to the remote NCBI ``blastx`` service
    against ``nr``: once restricted to Bacteria/Archaea and once restricted to
    Eukaryota, asking for a single hit each time.  For every record one line
    ``<id>,<e-value of the first HSP>`` (or ``<id>, no hit``) is appended to
    ``bacterial.txt`` and to ``eukaryotic.txt`` respectively.  The record id is
    echoed to stdout when a bacterial/archaeal hit exists.

    Records that fail for any reason are reported in ``errorlog.txt`` and
    skipped, so a single bad query does not abort the whole run.

    Args:
        arg: Path to the FASTA file holding the query sequences.
    """
    def _result_line(name, blast_record):
        # One line per query: e-value of the top hit, or a "no hit" marker.
        if blast_record.descriptions:
            return name + ',' + str(blast_record.alignments[0].hsps[0].expect) + '\n'
        return name + ', no hit' + '\n'

    # Options shared by both searches; only the Entrez organism filter differs.
    common = dict(ncbi_gi=False, descriptions="1", alignments="1",
                  format_type="XML", hitlist_size="1")

    with open(arg) as fasta, \
            open('bacterial.txt', 'a') as out, \
            open('eukaryotic.txt', 'a') as out2:
        for record in SeqIO.parse(fasta, format="fasta"):
            try:
                name = record.id
                query = record.format("fasta")
                result_handleB = NCBIWWW.qblast(
                    "blastx", "nr", query,
                    entrez_query='(Bacteria[ORGN] OR Archaea[ORGN])', **common)
                result_handleE = NCBIWWW.qblast(
                    "blastx", "nr", query,
                    entrez_query='(Eukaryota[ORGN])', **common)

                blast_recordsB = NCBIXML.read(result_handleB)
                blast_recordsE = NCBIXML.read(result_handleE)

                if blast_recordsB.descriptions:
                    print(record.id)
                out.write(_result_line(name, blast_recordsB))
                out2.write(_result_line(name, blast_recordsE))
            except Exception:
                # Best effort: note the failing record and carry on with the
                # next one.  KeyboardInterrupt/SystemExit still propagate.
                with open('errorlog.txt', 'a') as errorout:
                    errorout.write('problem blasting ' + record.id + '\n')
Esempio n. 2
0
def blast_align(fasta,blast_path,miRNA_db,mRNA_db):
    """Run blastn for ``fasta`` against the miRNA and the mRNA database.

    Both searches are launched through the shell with XML output
    (``-outfmt 5``) redirected into files whose names start with
    ``args.output`` (``args`` is not a parameter; it is read from the
    enclosing scope).  The miRNA search uses e-value 1e-3, the mRNA search
    1e-5.  The query file ``fasta`` is deleted afterwards with ``rm``.

    Returns:
        A ``(miRNA_records, mRNA_records)`` tuple of ``NCBIXML.parse``
        iterators over the two result files.
    """
    blastn_prefix = blast_path + " -task blastn -outfmt 5 -num_threads 6 -evalue "

    mi_xml = args.output + "temp_blast_miRNA.xml"
    os.system(blastn_prefix + "1e-3 -db " + miRNA_db + " -query " + fasta + " > " + mi_xml)

    m_xml = args.output + "temp_blast_mRNA.xml"
    os.system(blastn_prefix + "1e-5 -db " + mRNA_db + " -query " + fasta + " > " + m_xml)

    # The query file is no longer needed once both searches have run.
    os.system("rm " + fasta)

    parsed_mi = NCBIXML.parse(open(mi_xml))
    parsed_m = NCBIXML.parse(open(m_xml))
    return (parsed_mi, parsed_m)
Esempio n. 3
0
def fetch_indentity_from_local(seq):
    """BLAST ``seq`` against the local human and rodent protein databases.

    The sequence is written to ``tmp.fastaa`` and searched with blastp against
    ``_data_/_db_/HUMAN_DB`` and ``_data_/_db_/RODENTS_DB`` (XML output).  For
    every HSP whose number of positives equals its number of identities, the
    protein id extracted from the alignment title is collected; human hits
    come first, rodent hits second.

    Args:
        seq: Protein sequence as a plain string.

    Returns:
        The collected ids joined with ``;`` (empty string when nothing
        qualifies).
    """
    def extract_prot_id(string):
        # Third '|'-separated field of the title, then its second
        # space-separated token.
        return string.split('|')[2].split(' ')[1]

    def collect_ids(xml_path):
        # NCBIXML.read consumes the whole file, so the handle can be closed
        # before the record is inspected.
        with open(xml_path) as result_handle:
            b_record = NCBIXML.read(result_handle)
        return [extract_prot_id(alignment.title)
                for alignment in b_record.alignments
                for hsp in alignment.hsps
                if hsp.positives == hsp.identities]

    record = SeqRecord(Seq(seq), id="tmp", name="", description="")
    SeqIO.write(record, "tmp.fastaa", "fasta")

    NcbiblastpCommandline(query='tmp.fastaa', db='_data_/_db_/HUMAN_DB', outfmt=5, out='blastp_human_output.xml')()
    NcbiblastpCommandline(query='tmp.fastaa', db='_data_/_db_/RODENTS_DB', outfmt=5, out='blastp_rodents_output.xml')()

    result = collect_ids("blastp_human_output.xml")
    result.extend(collect_ids("blastp_rodents_output.xml"))
    return ";".join(result)
Esempio n. 4
0
def bestrecipblast(org, seed, thresh=5, queue=None):
    '''Return the best pairwise reciprocal BLAST hit of ``seed`` in ``org``.

    The seed accession is BLASTed (remote blastp against ``nr``) restricted to
    organism ``org``.  Each qualifying hit, in report order, is then BLASTed
    back against the seed's own organism; the first hit whose reverse search
    finds ``seed`` again is recorded and the search stops.

    An HSP qualifies when positives / alignment length >= 0.4 and
    alignment length / aligned query length passes the 0.25 cut-off
    (inclusive for the forward search, exclusive for the reverse search).

    Args:
        org: Organism name used in the Entrez ``[ORGN]`` filter.
        seed: Accession number of the seed sequence.
        thresh: E-value passed to blastp via ``-evalue``.
        queue: Optional queue; when given, the result is ``put`` on it and the
            function returns ``None``.

    Returns:
        A dict ``{organism: [accession, forward_rank, reverse_rank]}`` (empty
        when no reciprocal hit was found), unless ``queue`` is given.  The
        dict is also appended to ``dicts/<seed>`` when a hit is found.
    '''
    def _remote_blastp(accession, organism):
        # Results always go to the same per-call scratch file XML/<dum>.xml.
        os.system('blastp -db nr -query Orthos/'+accession+'.fasta -evalue '+str(thresh)+
                  ' -out XML/'+dum+'.xml -outfmt 5 -entrez_query \"'+organism+'[ORGN]\" -use_sw_tback'+
                  ' -remote')

    def _qualifying_hits(inclusive):
        # Accession (second '|' field of the title) of every qualifying HSP.
        hits = []
        with open('XML/'+dum+'.xml') as handle:
            for lin in NCBIXML.parse(handle):
                for align in lin.alignments:
                    for hsp in align.hsps:
                        if (hsp.positives/float(hsp.align_length)) < .4:
                            continue
                        coverage = float(hsp.align_length)/len(hsp.query)
                        if coverage > .25 or (inclusive and coverage == .25):
                            hits.append(align.title.split('|')[1])
        return hits

    # NOTE(review): seedorg is already element [0] of fetch_organism's result
    # and is indexed with [0] again below -- confirm that is intended.
    seedorg=FetchUtil.fetch_organism(seed)[0]
    acclist={}
    FetchUtil.fetch_fasta(seed)
    # Pseudo-random scratch-file name derived from the accession's digits.
    dum=str(int(int(seed.split('.')[0][-5:])*random.random()))

    _remote_blastp(seed, org)
    ac=_qualifying_hits(inclusive=True)
    print("Done. Number of sequences found: "+repr(len(ac)))

    for o in ac:
        print(o)
        FetchUtil.fetch_fasta(o)
        _remote_blastp(o, seedorg[0])
        print('blasted')
        acc=_qualifying_hits(inclusive=False)
        print("Done. Number of sequences found: "+repr(len(acc)))

        if seed in acc:
            print('it\'s twue!')
            name=FetchUtil.fetch_organism(o)[0]
            # Ranks are 1-based positions of the first occurrence in each list.
            acclist[name]=[o,str(ac.index(o)+1)+'/'+str(len(ac)),str(acc.index(seed)+1)+'/'+str(len(acc))]
            with open('dicts/'+seed,'a') as dict_file:
                dict_file.write(str(acclist)+'\n')
            break

    if queue is not None:
        queue.put(acclist)
    else:
        return acclist
Esempio n. 5
0
def main():
    """Report queries with a miRNA alignment followed by an mRNA alignment.

    Reads two BLAST XML files (``args.mi_xml`` and ``args.m_xml``) that must
    list the same queries in the same order, and writes to ``args.output`` the
    queries that have at least one miRNA HSP below the miRNA e-value cut-off
    and at least one mRNA HSP below the mRNA cut-off that starts after the
    earliest miRNA alignment end.  For each such query the query name is
    written, followed by one tab-separated line per retained HSP.

    When ``args.evalue`` equals 0 the cut-offs default to 1e-5 (miRNA) and
    1e-15 (mRNA); otherwise its first two items are used.

    Finally prints the number of queries read, the number with a miRNA
    alignment and the number with an mRNA alignment.
    """
    def _hsp_line(alignment, hsp):
        # Tab-separated description of one HSP.
        return "\t".join(str(f) for f in [hsp.query_start,hsp.query_end,alignment.title,hsp.sbjct,hsp.sbjct_start,hsp.sbjct_end,hsp.expect,hsp.score])

    n=0 # total number of query seq
    align_mi=0
    align_m=0

    args=ParseArg()
    # Context managers make sure the report is flushed and all handles are
    # released even when parsing fails half-way.
    with open(args.mi_xml) as miRNA_result, \
            open(args.m_xml) as mRNA_result, \
            open(args.output,'w') as output:
        miRNA_records=NCBIXML.parse(miRNA_result)
        mRNA_records=NCBIXML.parse(mRNA_result)

        # E-values
        if args.evalue==0:
            evalue_mi=1e-5
            evalue_m=1e-15
        else:
            evalue_mi=float(args.evalue[0])
            evalue_m=float(args.evalue[1])

        for mi_record,m_record in itertools.izip(miRNA_records,mRNA_records):
            n=n+1
            if (mi_record.query!=m_record.query):
                sys.stderr.write("The two query seqs from miRNA and mRNA results are not matched!\n")
                break

            lines=[mi_record.query]
            mi_ends=[]  # query end of every retained miRNA HSP
            for alignment in mi_record.alignments:
                for hsp in alignment.hsps:
                    if hsp.expect < evalue_mi:
                        lines.append(_hsp_line(alignment,hsp))
                        mi_ends.append(max(hsp.query_start,hsp.query_end))

            mi_indic=1 if mi_ends else 0
            # Shortest miRNA aligned end in the query, capped at 150;
            # 0 when there is no miRNA alignment at all.
            mi_end=min(mi_ends+[150]) if mi_ends else 0

            m_indic=0
            for alignment in m_record.alignments:
                for hsp in alignment.hsps:
                    if (hsp.expect < evalue_m) and (min(hsp.query_start,hsp.query_end)>mi_end):
                        m_indic=1
                        lines.append(_hsp_line(alignment,hsp))

            if mi_indic+m_indic>=2:
                output.write("\n".join(lines)+"\n")
            align_mi+=mi_indic
            align_m+=m_indic

    print("%s %s %s" % (n,align_mi,align_m))
Esempio n. 6
0
def parseBlastResult(fileName):
    """Summarise a BLAST XML report as a flat list of hits.

    Args:
        fileName: Path to the BLAST XML output file.

    Returns:
        A list of ``(query, hit_id, max_identity, hit_def)`` tuples, one per
        alignment.  ``max_identity`` is the highest
        identities / alignment-length ratio among the alignment's HSPs, as a
        truncated integer percentage.  Queries without any alignment
        contribute a single ``(query, "-", 0, "-")`` placeholder.
    """
    results = []

    with open(fileName) as handle:
        for record in NCBIXML.parse(handle):
            rec_id = str(record.query)

            if not record.alignments:
                results.append((rec_id, "-", 0, "-"))
                continue

            for algn in record.alignments:
                best_ratio = max(hsp.identities / float(hsp.align_length)
                                 for hsp in algn.hsps)
                results.append(
                    (rec_id, algn.hit_id, int(best_ratio * 100), algn.hit_def))

    return results
Esempio n. 7
0
def blastdemo(genbankID):
    """Run a remote blastp for ``genbankID`` against swissprot and print hits.

    Prints a short report for every HSP whose e-value is below 1e-17,
    followed by the length of every alignment in the result.

    NOTE(review): the body mixes Python 2 print statements with parenthesised
    prints; under Python 2 the two-item forms such as
    ``print ("sequence:", alignment.title)`` print a tuple.
    """
    # Run blastp on the swissprot database through NCBI's web service.
    # NB: to scale this up it must be run locally on the cluster.
    result_handle = NCBIWWW.qblast("blastp", "swissprot", genbankID)
    # Parse the single XML result record.
    blast_record = NCBIXML.read(result_handle)

    # Deliberately tiny threshold so that only very strong hits are shown.
    E_VALUE_THRESH = 0.00000000000000001
    # Display every HSP below the threshold: title, length, e-value and the
    # first 75 columns of the query / match / subject lines.
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < E_VALUE_THRESH:
                print ("****Alignment****")
                print ("sequence:", alignment.title)
                print ("length:", alignment.length)
                print ("e value:", hsp.expect)
                print (hsp.query[0:75] + "...")
                print (hsp.match[0:75] + "...")
                print (hsp.sbjct[0:75] + "...")
                print "\n"

    ### h is not defined yet, Will (problem from  iPython nb's!)
    # print h.query[0:75] + '...'
    # print h.match[0:75] + '...'
    # print h.sbjct[0:75] + '...'

    # Print the length attribute of every alignment, regardless of e-value.
    for a in blast_record.alignments:
        print a.length
def blastparse(blast_handle, genome, gene):
    """Mark ``gene`` as present in ``genome`` when a good BLAST hit exists.

    Walks every HSP of every record in ``blast_handle`` (BLAST XML).  The
    first HSP whose identities / alignment.length ratio is at least 0.7,
    seen while ``plusdict[genome][gene]`` is still an empty list, appends
    ``"+"`` to that list; later HSPs leave it unchanged.

    Args:
        blast_handle: Handle with BLAST XML output.
        genome: First-level key into the global ``plusdict``.
        gene: Second-level key into the global ``plusdict``.
    """
    global plusdict
    records = NCBIXML.parse(blast_handle)
    dotter()
    for record in records:
        for alignment in record.alignments:
            for hsp in alignment.hsps:
                # try/finally guarantees the lock is released even if the
                # lookup or the division raises; otherwise every other thread
                # waiting on the lock would block forever.
                threadlock.acquire()
                try:
                    if plusdict[genome][gene] == [] and abs(float(hsp.identities) / alignment.length) >= 0.7:
                        plusdict[genome][gene].append("+")
                finally:
                    threadlock.release()
    def __init__(self, fhand, subj_def_as_accesion=None):
        """Prepare the parser for the BLAST XML file behind ``fhand``.

        Raises ValueError when the first 10 characters of a non-empty file do
        not contain ``xml``.  ``subj_def_as_accesion``, when not None,
        overrides the choice made from the BLAST version for
        ``use_subject_def_as_accession``.
        """
        fhand.seek(0, 0)
        head = fhand.read(10)
        if head and 'xml' not in head:
            raise ValueError('Not a xml file')
        fhand.seek(0, 0)
        self._blast_file = fhand

        blast_info = self._get_blast_metadata()
        version = blast_info['version']
        is_plus = blast_info['plus']
        self.db_name = blast_info['db_name']

        self._blast_file.seek(0, 0)

        # The query definition is always used as accession; the subject
        # definition only for BLAST+ or versions sorting after '2.2.21'
        # (compared as strings).
        self.use_query_def_as_accession = True
        if version and (is_plus or version > '2.2.21'):
            self.use_subject_def_as_accession = True
        else:
            self.use_subject_def_as_accession = False

        if subj_def_as_accesion is not None:
            self.use_subject_def_as_accession = subj_def_as_accesion

        # The biopython parser does the real work; it stays None when the
        # file does not start with '<', i.e. there are no results.
        self._blast_parse = None
        starts_like_xml = fhand.read(1) == '<'
        if starts_like_xml:
            fhand.seek(0)
            self._blast_parse = NCBIXML.parse(fhand)
Esempio n. 10
0
def blast_align(fasta,blast_path,linker_db):
    """Run blastn for ``./temp/<fasta>`` against the linker database.

    The XML report is written next to the query as
    ``./temp/<name>_blast_linker.xml``, where ``<name>`` is the part of
    ``fasta`` before its first dot.  Neither the query nor the report is
    removed afterwards.

    Returns:
        The ``NCBIXML.parse`` iterator over the report.
    """
    stem = fasta.split(".")[0]
    xml_path = "./temp/" + stem + "_blast_linker.xml"
    command = (blast_path
               + " -task blastn -outfmt 5 -num_threads 6 -evalue 0.1 -db "
               + linker_db
               + " -query ./temp/" + fasta
               + " > " + xml_path)
    os.system(command)
    return NCBIXML.parse(open(xml_path))
Esempio n. 11
0
 def get_fancy_results_list(self, blast_results, num_results = 20):
     """Build B6Entry objects for the top hits of the first BLAST record.

     Only the first record of ``blast_results`` (BLAST XML handle) and only
     the first HSP of each alignment are used.  At most ``num_results``
     alignments are converted.  The handle is closed afterwards; any error
     raised while closing it is ignored.
     """
     first_record = list(NCBIXML.parse(blast_results))[0]
     hits = first_record.alignments
     if len(hits) < num_results:
         num_results = len(hits)

     fancy_entries = []
     for position in range(0, num_results):
         entry = b6lib.B6Entry()
         entry.q_len = int(first_record.query_length)
         entry.query_length = entry.q_len

         hit = hits[position]
         top_hsp = hit.hsps[0]

         entry.hit_def = hit.hit_def
         entry.subject_id = entry.hit_def
         entry.accession = hit.accession
         entry.ncbi_link = 'http://www.ncbi.nlm.nih.gov/nuccore/%s' % entry.accession
         entry.hsp_query = top_hsp.query
         entry.hsp_match = top_hsp.match
         entry.hsp_subject = top_hsp.sbjct

         # '|' marks an identical column in the match line.
         identical_columns = [x for x in top_hsp.match if x == '|']
         entry.identity = len(identical_columns) * 100.0 / len(entry.hsp_query)
         entry.coverage = len(top_hsp.query) * 100.0 / entry.query_length

         fancy_entries.append(entry)

     try:
         blast_results.close()
     except:
         pass

     return fancy_entries
def parse_blast_XML(blast_xml):
    """
    Read a BLAST XML file and yield one ``BlastResult`` per alignment.

    For every alignment the hit sequence is downloaded from the NCBI protein
    database via ``Entrez.efetch`` so that the results can later be written
    to FASTA files and aligned (e.g. with ClustalW).

    Each ``BlastResult`` is built from: the second ``|``-separated field of
    the hit id, the species (first two words of the ``[...]`` part of the
    FASTA header, joined with ``_``), the fetched sequence, the lowest HSP
    e-value (never above the initial value 1) and a coverage figure.
    """
    with open(blast_xml, 'r') as blast_xml_op:
        for record in NCBIXML.parse(blast_xml_op):
            for align in record.alignments:
                hit_id = align.hit_id.split("|")

                prev_eval = 1
                for hsp in align.hsps:
                    if hsp.expect < prev_eval:
                        prev_eval = hsp.expect

                # TODO: replace the hard-coded 390 with the real query
                # sequence length.
                coverage = align.length / 390

                # NOTE(review): the whole list of split id fields is passed as
                # ``id``; confirm whether only hit_id[1] was intended.
                id_info = ""
                seq_lines = []
                efetch = Entrez.efetch(db="protein", id=hit_id, rettype="fasta")
                try:
                    for line in efetch:
                        line = line.rstrip()
                        if line.startswith(">"):
                            # New FASTA record: keep only the last one.
                            id_info = line
                            seq_lines = []
                        else:
                            seq_lines.append(line)
                finally:
                    efetch.close()
                sequence = "".join(seq_lines)

                # Organism is the text between the first '[' and first ']'.
                organism = id_info[id_info.find("[") + 1:id_info.find("]")].split()
                # Genus_species when available; falls back to the single word
                # (or an empty string) instead of reusing a stale value.
                species = "_".join(organism[:2])

                yield BlastResult(hit_id[1], species, sequence, prev_eval, coverage)
Esempio n. 13
0
def parse_results(result_file, e_val_thresh, ident_thresh, align_thresh):
    """Print a tab-separated summary of the top hit of every query.

    For each query in the BLAST XML file ``result_file`` only the first
    alignment is considered.  A full row (query id, hit id + definition,
    percentage identity, query length, alignment length, e-value) is printed
    when the first HSP's e-value is below ``e_val_thresh``, the percentage
    identity summed over the alignment's HSPs is above ``ident_thresh`` and
    the summed alignment length is above ``align_thresh``.  Otherwise a row
    holding just the query id and five empty columns is printed.
    """
    def _top_hit_row(record, query_id):
        # Full row for the top hit, or None when it fails any filter.
        if not len(record.alignments) > 0:
            return None
        best = record.alignments[0]
        e_val = best.hsps[0].expect
        if not e_val < e_val_thresh:
            return None
        tot_ident = sum([hsp.identities for hsp in best.hsps])
        query_len = record.query_length
        align_len = sum([hsp.align_length for hsp in best.hsps])
        pct_ident = tot_ident/float(align_len)*100
        top_hit = best.hit_id + best.hit_def
        if pct_ident > ident_thresh and align_len > align_thresh:
            return '%s\t%s\t%f\t%i\t%i\t%s' % (query_id, top_hit, pct_ident, query_len, align_len, str(e_val))
        return None

    result_handle = open(result_file, 'r')
    blast_records = NCBIXML.parse(result_handle)
    print('query_id\thit_id\tpercentage_identity\tquery_length\talignment_length\te_value')

    for record in blast_records:
        query_id = record.query
        row = _top_hit_row(record, query_id)
        if row is None:
            row = '%s\t%s\t%s\t%s\t%s\t%s' % (query_id, '', '', '', '', '')
        print(row)

    result_handle.close()
Esempio n. 14
0
def blast_xml_to_gff3(file_in,file_out,blast_type):
    """Convert a BLAST XML report into a GFF3 file of ``match_part`` features.

    For every query only the first HSP (in report order) with an e-value
    below 0.04 is written.  Columns are: query, ``blast_type``,
    ``match_part``, query start, query end, score, strand (``?`` when the
    HSP has no strand), frame (``.`` when the HSP has no frame) and the
    attributes ``ID``, ``Parent``, ``Name`` and ``Alias``.  Semicolons and
    spaces in the alignment title are replaced by underscores.

    Args:
        file_in: Path to the BLAST XML input.
        file_out: Path of the GFF3 file to create (overwritten).
        blast_type: Value for the GFF3 source column.
    """
    E_VALUE_THRESH = 0.04

    def _gff3_line(blast_record, alignment, hsp):
        # One newline-terminated GFF3 line for a single HSP.
        strand = "?" if hsp.strand[0] is None else str(hsp.strand[0])
        frame = "." if hsp.frame[0] is None else str(hsp.frame[0])
        alias = alignment.title.replace(";","_").replace(" ","_")
        attributes = ("ID="+blast_record.query+":"+alias+";"+
                      "Parent="+blast_record.query+";"+
                      "Name=blast_hsp;"+
                      "Alias="+alias)
        columns = [blast_record.query,
                   str(blast_type),
                   "match_part",
                   str(hsp.query_start),
                   str(hsp.query_end),
                   str(hsp.score),
                   strand,
                   frame,
                   attributes]
        return "\t".join(columns)+"\n"

    def _first_significant(blast_record):
        # (alignment, hsp) of the first HSP below the threshold, else None.
        for alignment in blast_record.alignments:
            for hsp in alignment.hsps:
                if hsp.expect < E_VALUE_THRESH:
                    return alignment, hsp
        return None

    with open(file_in) as result_handle, open(file_out,"w") as f:
        f.write("##gff-version 3"+"\n")
        for blast_record in NCBIXML.parse(result_handle):
            hit = _first_significant(blast_record)
            if hit is not None:
                f.write(_gff3_line(blast_record, hit[0], hit[1]))
Esempio n. 15
0
 def get_gb_info(self, resultshandle):
     """Collect GenBank IDs, hit positions and orientations from a BLAST report.

     For every alignment of every record, the first HSP supplies the subject
     start/end and the frame, and the second-to-last ``|``-separated field of
     the alignment title supplies the GenBank ID (the title ends with a
     trailing ``|``).  The values are appended to ``self.hit_coords``,
     ``self.hit_directions`` and ``self.gb_ids``.  The handle is closed when
     done.
     """
     for record in NCBIXML.parse(resultshandle):
         for alignment in record.alignments:
             # Only the first HSP of each alignment is used.
             first_hsp = next(iter(alignment.hsps), None)
             if first_hsp is not None:
                 hit_coords = (first_hsp.sbjct_start, first_hsp.sbjct_end)
                 hit_gbid = alignment.title.split('|')[-2]
                 hit_directions = first_hsp.frame
             self.gb_ids.append(hit_gbid)
             self.hit_coords.append(hit_coords)
             self.hit_directions.append(hit_directions)
     resultshandle.close()
     return
Esempio n. 16
0
    def create_rel(self, XMLin):
        """ Create a dictionary that relates each sequence name
        with the regions to mask.

        The sequence name is the query title up to its first space.  Every
        HSP contributes its query interval (start <= end); overlapping or
        touching intervals are merged.

        Returns a dictionary mapping name -> sorted list of (start, end)
        tuples.  Queries without any HSP are absent.
        """
        bat1 = {}
        for b_record in NCBIXML.parse(XMLin):
            for alin in b_record.alignments:
                for hsp in alin.hsps:
                    qs, qe = hsp.query_start, hsp.query_end
                    if qs > qe:
                        qe, qs = qs, qe
                    bat1.setdefault(b_record.query.split(" ")[0], set()).add((qs, qe))

        # sort and merge overlapping segments
        for b_record_query in list(bat1):
            joined_cols = []
            for qs, qe in sorted(bat1[b_record_query]):
                if joined_cols and joined_cols[-1][1] >= qs:
                    last_qs, last_qe = joined_cols[-1]
                    # Keep the larger end: a segment fully contained in the
                    # previous one must not shrink the merged region.
                    joined_cols[-1] = (last_qs, max(last_qe, qe))
                else:
                    joined_cols.append((qs, qe))
            bat1[b_record_query] = joined_cols

        return bat1
Esempio n. 17
0
def include_check(blast_result_filename,include_line,seq_name_list,max_mismatch):
    """Find the query sequences conserved in every requested organism.

    ``include_line`` is split on ``' OR '``; from each piece the taxonomy id
    between ``(taxid:`` and ``)`` is extracted and the BLAST XML file
    ``blast_result_filename + <taxid>`` is parsed.  A query counts as
    conserved in that organism when at least one HSP is a perfect match
    (identities == alignment length) covering at least
    ``query length - max_mismatch`` positions.

    Args:
        blast_result_filename: Common prefix of the per-taxid result files.
        include_line: Organism filter, e.g. ``"a (taxid:1) OR b (taxid:2)"``.
        seq_name_list: Unused.
        max_mismatch: Number of query positions allowed to be uncovered.

    Returns:
        Dict ``{query_name: 1}`` for the queries conserved in all organisms.
    """
    strlist=str(include_line).split(' OR ')
    results={} # query name -> taxids in which the query is conserved
    for valist in strlist:
        txid_num=valist[valist.find('(taxid:')+7:valist.find(')')]
        found=set() # queries already accepted for this taxid
        with open(blast_result_filename+txid_num,"r") as blast_result_file:
            for record in NCBIXML.parse(blast_result_file):
                name=record.query
                if name in found:
                    continue
                min_len=record.query_letters-max_mismatch
                # Biopython names the HSP attribute ``align_length``.
                conserved=any(
                    hsp.identities == hsp.align_length and hsp.identities >= min_len
                    for align in record.alignments
                    for hsp in align.hsps)
                if conserved:
                    # Record the taxid once per query, however many HSPs
                    # match, so the count below stays one per organism.
                    found.add(name)
                    results.setdefault(name,[]).append(txid_num)

    len_organ=len(strlist)
    results_final={} # queries conserved in all the requested organisms
    for name,taxids in results.items():
        if len(taxids)==len_organ:
            results_final[name]=1
    return (results_final)
Esempio n. 18
0
def BLAST_to_BRIG(BLASTfile, resultsFile):
    """Write a tab-separated table with the first HSP of every alignment.

    Columns: query, hit definition, identities / alignment.length (rounded
    to 2 decimals), score, alignment.length, alignment.length - identities,
    query start, query start + alignment.length, subject start and a final
    end coordinate.

    Args:
        BLASTfile: Path to the BLAST XML input.
        resultsFile: Path of the table to create (overwritten).
    """
    with open(BLASTfile) as rec, open(resultsFile, "w") as tabFile:
        for blast_record in NCBIXML.parse(rec):
            for alignment in blast_record.alignments:
                # Only the first HSP of each alignment is reported.
                for match in alignment.hsps:
                    hit_length = int(alignment.length)
                    tabFile.write(
                        "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
                        % (
                            blast_record.query,
                            alignment.hit_def,
                            round(float(match.identities) / float(alignment.length), 2),
                            int(match.score),
                            alignment.length,
                            hit_length - int(match.identities),
                            match.query_start,
                            int(match.query_start) + hit_length,
                            match.sbjct_start,
                            # NOTE(review): the last column repeats the query
                            # end; sbjct_start + length may have been
                            # intended -- confirm before changing the output.
                            int(match.query_start) + hit_length,
                        )
                    )
                    break
Esempio n. 19
0
def run_blastp(match, blastdb):
    """Run blastp for the protein sequence of every feature of ``match``.

    For each feature with a non-empty ``protein_fasta()`` a blastp process
    (XML output, 4 threads) is started against ``blastdb``, the FASTA text is
    written to its stdin and the first record of its XML output is parsed
    into ``rec``.  Features whose search cannot be started (OSError) or whose
    output cannot be parsed (ValueError) are logged and skipped.

    NOTE(review): ``rec`` is assigned but not used or returned in the code
    shown here.
    """
    from Bio.Blast.Applications import NcbiblastpCommandline

    for feature in match.features:
        rec = None
        fasta = feature.protein_fasta()
        # Nothing to search for features without a protein sequence.
        if fasta == "":
            continue
        try:
            # No query file is given: the sequence is fed through stdin below.
            cline = NcbiblastpCommandline(db=blastdb, outfmt=5, num_threads=4)
            pipe = subprocess.Popen(
                str(cline), shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE
            )
            pipe.stdin.write(fasta)
            # Closing stdin signals end of input to blastp.
            pipe.stdin.close()
            recs = NCBIXML.parse(pipe.stdout)
            # Only the first record of the report is read.
            rec = recs.next()
            pipe.stdout.close()
            # NOTE(review): stderr is captured but never read before closing.
            pipe.stderr.close()
        except OSError, err:
            logging.warning("Failed to run blastp: %s" % err)
            continue
        except ValueError, err:
            logging.warning("Parsing blast output failed: %s" % err)
            continue
Esempio n. 20
0
def blastpSp(sp, db, evalue=0.0001):
    """Run a local blastp for ``sp`` and return the ids of the hits.

    The sequence returned by ``seq(sp)`` is written as a FASTA record named
    ``query`` into a fresh temporary directory, searched with blastp against
    ``db`` (XML output) and the report is parsed.  Whatever blastp wrote to
    stdout/stderr is echoed.  The temporary directory is always removed,
    also when the search or the parsing fails.

    Args:
        sp: Identifier passed to ``seq`` to obtain the query sequence.
        db: BLAST database name/path.
        evalue: E-value threshold passed to blastp.

    Returns:
        List with the second ``|``-separated field of each alignment title.
    """
    import shutil  # local import: only needed for the temp-dir cleanup

    directory = tempfile.mkdtemp()
    try:
        fastaFile = '%s/seq.fasta' % directory
        resultFile = '%s/result.xml' % directory
        with open(fastaFile, 'w') as wf:
            wf.write('>query\n%s' % seq(sp))

        blastp = NcbiblastpCommandline(query=fastaFile, db=db, evalue=evalue,
                                       outfmt=5, out=resultFile)
        stdout, stderr = blastp()
        print(stdout, end='', sep='')
        print(stderr, end='', sep='')

        with open(resultFile) as result_handle:
            blast_record = NCBIXML.read(result_handle)
    finally:
        # Remove only our own directory; unlike os.removedirs this never
        # touches (empty) parent directories.
        shutil.rmtree(directory, ignore_errors=True)

    return [align.title.split('|')[1] for align in blast_record.alignments]
Esempio n. 21
0
    def run (self, input_seq):
        """Run BLAST on ``input_seq`` and return the top alignments.

        The program ``self.blast_path`` is started against ``self.blastdb``
        with XML output and receives the sequence on stdin.  Anything written
        to stderr is logged at debug level.

        Returns a list of ``(hit_id, alignment)`` tuples for the first
        ``self.top_results`` alignments; the list is empty when the program
        printed nothing but a newline.
        """
        command = '%s -db %s -outfmt 5' % (self.blast_path, self.blastdb)

        if sys.platform == 'win32':
            # Windows has problems with Popen and PIPE, so the sequence is
            # handed over through a temporary file used as stdin.
            tmp = tempfile.NamedTemporaryFile()
            logger.debug("Running Blast with sequence: {}".format(input_seq))
            tmp.write(bytes(str(input_seq) + '\n', 'latin1'))
            tmp.seek(0)
            blast = Popen(command, universal_newlines=True, stdin=tmp,
                stdout=PIPE, stderr=PIPE)
            blast_out, blast_err = blast.communicate()
        else:
            # Rest of the world: feed the sequence through a pipe.
            blast = Popen(command, universal_newlines=True, shell=True,
                stdin=PIPE, stdout=PIPE, stderr=PIPE)
            blast_out, blast_err = blast.communicate(input=str(input_seq))

        if len(blast_err) > 0:
            logger.debug(blast_err)

        output = []
        if blast_out == '\n':
            return output

        result = NCBIXML.read(StringIO(blast_out))
        for aln in result.alignments[:self.top_results]:
            logger.debug("Looping over alignments, current hit: {}".format(aln.hit_id))
            output.append((aln.hit_id, aln))
        return output
Esempio n. 22
0
def parse_blast_xml(xml_filename, query_filename, output_filename, abundance_filename=None):
    """
    Parse the XML output, looking only at the 1st alignment for each query.

    One tab-separated line is written per query sequence, with the columns
    ID, COUNT, LENGTH, AMBIG, QSTART, QEND and IDEN. Queries without any
    alignment get ``NA`` in the last three columns.

    The BLAST records are consumed in lock-step with the query FASTA, so the
    XML file must contain one record per query, in the same order.

    Args:
        xml_filename: BLAST XML report.
        query_filename: FASTA file that was used as the BLAST query.
        output_filename: Destination of the tab-separated table.
        abundance_filename: Optional two-column, tab-separated file mapping a
            sequence id to its count. When omitted every count is 1.
    """
    if abundance_filename is None:
        abundance = defaultdict(lambda: 1)
    else:
        with open(abundance_filename) as abundance_handle:
            abundance = dict(line.strip().split('\t') for line in abundance_handle)

    row = "{id}\t{count}\t{len}\t{ambig}\t{qs}\t{qe}\t{iden}\n"
    with open(xml_filename) as xml_handle:
        blast_records = NCBIXML.parse(xml_handle)
        with open(output_filename, 'w') as f:
            f.write("ID\tCOUNT\tLENGTH\tAMBIG\tQSTART\tQEND\tIDEN\n")
            with open(query_filename) as h:
                for r in SeqIO.parse(h, 'fasta'):
                    ambig = r.seq.count('N') + r.seq.count('?')
                    # next() works on Python 2 and 3; generator.next() does not.
                    blastout = next(blast_records)
                    if blastout.alignments:
                        hsp = blastout.alignments[0].hsps[0]
                        qs, qe, iden = hsp.query_start, hsp.query_end, hsp.identities
                    else:
                        # no match was found!
                        qs = qe = iden = 'NA'
                    f.write(row.format(id=r.id, count=abundance[r.id],
                                       len=len(r.seq), ambig=ambig,
                                       qs=qs, qe=qe, iden=iden))
def write_flanks(rbase,flanksfile):
    '''
    Parse the results from BLASTing the F-plasmid against the de novo assemblies.
    Get the query length, keep every BLAST hit that ends exactly at the 3'-end
    of the query, and write the flanking regions to file.

    rbase      -- directory whose sub-directories starting with 'REL' or 'RM'
                  each hold a "results.xml" and a "scaffolds.fasta".
    flanksfile -- path of the FASTA file that receives all flank records.
    '''
    flank_record_list = []
    ## iterate over BLASTs against de novo assemblies.
    denovo_dirs = [x for x in listdir(rbase) if x.startswith(('REL', 'RM'))]
    for mygenome in denovo_dirs:
        myfulldir = join(rbase, mygenome)
        # Close each report as soon as it is parsed; the original kept one
        # open handle per genome for the lifetime of the process.
        with open(join(myfulldir, "results.xml")) as result_h:
            blast_record = NCBIXML.read(result_h)
        query_length = int(blast_record.query_letters)
        subject_seq = join(myfulldir, "scaffolds.fasta")
        for alignment in blast_record.alignments:
            for hsp in alignment.hsps:
                if hsp.expect > 1e-10:
                    ## skip bad hits.
                    continue
                if hsp.query_end != query_length:
                    ## skip hits that don't match 3' end of F-plasmid query.
                    continue
                my_flank_seq = get_flank(alignment, hsp, subject_seq)
                flank_record_list.append(SeqRecord(seq=my_flank_seq, id=mygenome+'_flank'))
    with open(flanksfile,'w') as flanks_outhandle:
        SeqIO.write(flank_record_list,flanks_outhandle, format="fasta")
def main():
    """BLAST every record of 'gg_pre_mirna.fasta' and write the parsed hits.

    Each record is written to 'Temp.txt' and searched on its own with
    ``blastn -task blastn-short`` against 'Input/gg_db'. The XML report is
    parsed and every BLAST record is handed to ``parsefile`` together with the
    open results file.
    """
    records = SeqIO.parse('gg_pre_mirna.fasta', 'fasta')
    with open('results.fasta', 'w') as writer:
        # Writes the header for the results file
        writer.write('Organism_name' + '\t' + 'Query_start' + '\t' + 'Query_end' + '\t' + 'Subject_start' + '\t' +
                     'Subject_end' + '\r')
        for record in records:
            # The temp file must be closed (flushed) before blastn reads it.
            with open('Temp.txt', 'w') as tempWriter:
                tempWriter.write('>' + record.id + '\n')
                tempWriter.write(str(record.seq) + '\n')
            # Query only the current record. The original passed the whole
            # multi-FASTA here, so every record's section repeated the hits
            # of all records and 'Temp.txt' was never used.
            os.system('blastn -task blastn-short -query Temp.txt -db Input/gg_db -out BLAST_result.xml -outfmt "5" ')
            with open('BLAST_result.xml') as result_handle:
                blast_records = NCBIXML.parse(result_handle)
                writer.write('\r' + '*****' + '\r')
                writer.write(record.id + '\r' + '*****' + '\r')
                for blast_record in blast_records:
                    parsefile(blast_record, writer)
    print('Finished!')
def findOffTargets (refSeq,sgRNAseq):
    """Return the BLAST HSPs of an sgRNA that look like off-target sites.

    The sgRNA is written to 'temp.fasta' and searched with blastn-short
    against the 'testdb' database; only the HSPs of the first alignment are
    inspected. An HSP is kept when it reaches the seed end of the query, its
    last five positions match perfectly, and ``refSeq`` carries 'GG' (plus
    strand) or 'CC' (minus strand) next to the subject end.

    refSeq   -- reference sequence the subject coordinates index into.
    sgRNAseq -- text written verbatim to the query file.

    Returns a (possibly empty) list of HSP objects.
    """
    candidates = []  # Return this list of candidates
    # Text mode: the payload is a str, which a binary-mode file rejects on
    # Python 3.
    with open('temp.fasta', 'w') as f:
        f.write(sgRNAseq + '\n')

    cline = NcbiblastnCommandline(query="temp.fasta", db="testdb",outfmt=5, out="temp.xml",task='blastn-short')
    cline()
    with open('temp.xml', 'r') as result:
        records = NCBIXML.read(result)
    if not records.alignments:
        return candidates

    for record in records.alignments[0].hsps:
        if record.query_end < 20:  # Require ends at the seed
            continue
        if record.match[-5:] != '|'*5:  # Require 5 bp of seed is perfect match
            continue
        end = record.sbjct_end
        if record.sbjct_end > record.sbjct_start:
            # on the + strand, sequence is from [start,end]
            if refSeq[end+2:end+4] == 'GG':
                candidates.append(record)
        else:  # On the - strand
            if refSeq[end-3:end-1] == 'CC':
                candidates.append(record)
    return candidates
Esempio n. 26
0
def parse_online_blast (seq_list):
    """BLAST ``seq_list`` online and pass every HSP on to ``filter_hits``.

    For each HSP a list of strings is assembled (query, hit title, the two
    identifiers cut from the hit title, percent identity, aligned query
    length, query length, e-value, bit score and the two taxon fields) and
    handed to ``filter_hits``. Taxon lookups are cached per identifier.
    """
    taxon_cache = {}
    handle = online_blast(seq_list)

    # read the complete XML report before any hit is processed
    logging.debug('Parsing blast result XML file.')
    parsed_results = list(NCBIXML.parse(handle))

    for result in parsed_results:
        query_name = str(result.query)
        for aln in result.alignments:
            for hsp in aln.hsps:
                # identities as a percentage of the alignment length
                percent_identity = float(hsp.identities / (len(hsp.match) * 0.01))

                # fields 1 and 3 of the '|'-separated title carry the
                # identifiers; the version suffix is cut from the second one
                title_fields = aln.title.split('|')
                accession = title_fields[3].split('.')[0]
                gi_number = title_fields[1]

                # one taxonomy lookup per identifier
                if gi_number not in taxon_cache:
                    taxon_cache[gi_number] = obtain_tax(gi_number)
                taxon = taxon_cache[gi_number]

                hit_fields = [
                    query_name,
                    str(aln.title),
                    str(gi_number),
                    str(accession),
                    str(percent_identity),
                    str(len(hsp.query)),
                    str(result.query_length),
                    str(hsp.expect),
                    str(hsp.bits),
                    taxon[0],
                    taxon[1],
                ]
                filter_hits(hit_fields)
def blast_file_opener(filename, evalue, mismatches, outfile):
    """Func takes in a BLAST xml output file (filename).
    writes out the various details of interests to the outfile.

    It filters the results based on evalue and number of
    mismatches, as defined by the user.

    An HSP is reported when its e-value is below ``evalue`` and its number of
    mismatches (``align_length - identities``) equals ``mismatches`` exactly.
    Each reported HSP becomes one line: hit title, query name and e-value,
    separated by tabs.

    Returns an empty set. Nothing was ever added to it; it is kept so that
    existing callers keep receiving a set, also when the report holds no
    records (which used to raise UnboundLocalError).
    """
    E_VALUE_THRESH = float(evalue)
    mismatches = int(mismatches)
    alignment_hits = set()
    with open(filename) as result_handle:
        with open(outfile, 'w') as f:
            for blast_record in NCBIXML.parse(result_handle):
                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        if not hsp.expect < E_VALUE_THRESH:
                            continue
                        # For mismatches use (hsp.align_length - hsp.identities)
                        if hsp.align_length - hsp.identities != mismatches:
                            continue
                        f.write("%s\t%s\t%s\n" % (alignment.title,
                                                  blast_record.query,
                                                  str(hsp.expect)))
    return alignment_hits
def blast_reads(blast_string, reads, outfh):
    """BLAST reads and write the aligned part of each read as FASTQ.

    blast_string -- query text piped to blastn on standard input.
    reads        -- mapping from query name to an object with ``sequence``
                    and ``quality`` attributes.
    outfh        -- writable handle receiving one four-line FASTQ record per
                    HSP, named ``<query>:<n>`` with n counting the HSPs of
                    one alignment from 1.

    For hits on the reverse strand of the subject the sequence and quality
    strings are reversed (not complemented) before they are written.
    """
    blast_db = '/Users/sw10/Dropbox/Sanger/blastdb/ebola/Zaire_ebolavirus_KM034562'
    blast_binary = '/Applications/ncbi-blast-2.2.29+/bin/blastn'
    xml_outfile = '/tmp/test.xml'
    evalue = 0.01
    cline = NcbiblastnCommandline(cmd=blast_binary, out=xml_outfile, outfmt=5, query="-", db=blast_db, evalue=evalue, max_target_seqs=1, num_threads=1)
    cline(blast_string)

    try:
        with open(xml_outfile, 'r') as blast_handle:
            for blast_record in NCBIXML.parse(blast_handle):
                name = blast_record.query
                for alignment in blast_record.alignments:
                    for count, hsp in enumerate(alignment.hsps, 1):
                        # BLAST coordinates are 1-based and inclusive, so the
                        # matching Python slice starts one position earlier.
                        first = hsp.query_start - 1
                        seq = reads[name].sequence[first:hsp.query_end]
                        qual = reads[name].quality[first:hsp.query_end]
                        if hsp.sbjct_start > hsp.sbjct_end:
                            seq = ''.join(reversed(seq))
                            qual = ''.join(reversed(qual))

                        outfh.write('@%s:%d\n%s\n+\n%s\n' % (name, count, seq, qual))
    finally:
        # Remove the temporary report even when parsing or writing failed.
        os.remove(xml_outfile)
def parsePsiBlast(psiblastfilename, max_evalue):
    """Collect the best e-value per subject from a PSI-BLAST XML report.

    psiblastfilename -- path of the XML report.
    max_evalue       -- HSPs with a larger e-value are ignored.

    Returns a dict mapping each hit title to the smallest e-value seen for
    it. Any failure while reading or parsing is reported through
    ``dieError``.
    """
    try:
        results_dict = {}

        with open(psiblastfilename, 'r') as handle:
            for blast_record in NCBIXML.parse(handle):
                for alignment in blast_record.alignments:
                    subjid = alignment.title
                    for hsp in alignment.hsps:
                        if hsp.expect > max_evalue:
                            continue
                        if subjid not in results_dict or hsp.expect < results_dict[subjid]:
                            results_dict[subjid] = hsp.expect

        return results_dict

    # Not a bare except: Ctrl-C and SystemExit must not be reported as a
    # PSI-BLAST failure.
    except Exception:
        dieError('ERROR: PSI-BLAST failed.')
Esempio n. 30
0
def get_ids(filename, dir, ethresh = 0.01):
    """Extract accessions of significant hits and save them as CSV.

    Reads ``<dir>/BLAST/<filename>`` and looks at the first BLAST record
    only. For every HSP below ``ethresh`` whose hit title matches the
    expected pattern, the accession (without version suffix) and a short
    species tag (genus initial + first three letters of the species) are
    collected. The hits are passed through ``filter_species`` together with
    the first four characters of ``filename``, and the surviving accessions
    are written, one per row, to ``<dir>/accs/<record_name(filename)>.csv``.
    """
    with open(os.path.join(dir, "BLAST", filename), "r") as result:
        blast_records = list(NCBIXML.parse(result))
    record = blast_records[0]
    hits = []
    for alignment in record.alignments:
        title = alignment.title
        for hsp in alignment.hsps:
            if not hsp.expect < ethresh:
                continue
            mdata = re.match( r'.*[A-Z|a-z]{2,3}\|(.*?)\|.*?\[([A-Z])\S* ([A-Z|a-z]{3}).*\].*?', title)
            if mdata is None:
                continue
            accession = re.match(r'([A-Z|a-z|_|0-9]*)\..*', mdata.group(1))
            # An identifier without a ".version" suffix does not match the
            # pattern above; use it unchanged instead of crashing on None.
            acc = str(accession.group(1)) if accession else str(mdata.group(1))
            genus = str(mdata.group(2)[0])
            species = str(mdata.group(3)[:3])
            hits.append((acc, genus + species))
    spec = filename[0:4]
    filteredHits = filter_species(hits, spec)
    # Saving results
    # Save as separate files for each species~!
    with open(os.path.join(dir, "accs", record_name(filename) + ".csv"), 'w') as csvfile:
        blasthits = csv.writer(csvfile)
        for each in filteredHits:
            blasthits.writerow([each[0]])
Esempio n. 31
0
from Bio.Blast import NCBIXML
from Bio import SeqIO

# Length of the target sequence. If the FASTA file holds several records the
# length of the last one is kept. Plain "r" is used because the "U" flag is
# rejected by open() on Python 3.11 and later.
with open("../Data/T0860/T0860.fasta", "r") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        seq_len = len(record.seq)

# Print every HSP of the BLAST report and keep the values of the last one in
# module-level names.
with open('../Data/T0860/95SCU2B4015-Alignment.xml') as result_handle:
    blast_records = NCBIXML.parse(result_handle)

    for blast_record in blast_records:
        for alignment in blast_record.alignments:
            title = alignment.title
            length = alignment.length
            for hsp in alignment.hsps:
                print('****Alignment****')
                print('sequence:', alignment.title)
                print('length:', alignment.length)
                print('e value:', hsp.expect)
                print(hsp.query[0:137])
                print(hsp.match[0:137])
                print(hsp.sbjct[0:137])

                identities = hsp.identities
                # identities as a percentage of the full target length
                similarity = (100 * hsp.identities / seq_len)
                target = hsp.query
                targetstart = hsp.query_start
                templatestart = hsp.sbjct_start
                match = hsp.match
                template = hsp.sbjct[0:137]
Esempio n. 32
0
def parse_blast(blastOut, pdb, qseq, evalue=0.00001):
    '''
    Parse XML Blast outputs and write two files derived from them.

    ``<pdb>.ali`` receives the query followed by the subject string of every
    HSP that passes the e-value cutoff. That file is handed to ``ucluster``
    and the returned weights are used to accumulate, per query position, a
    weighted count of the aligned subject residues. ``<pdb>.sqc`` receives
    one tab-separated line per query position: 1-based index, query residue
    and the sum of ``-si*log(si)`` over the 21 symbols.

    Parameters:
      - blastOut: path of the BLAST XML file (read with NCBIXML.read)
      - pdb:      prefix of the two output files and of the query label
      - evalue: set an alignment cutoff
      - qseq:   query sequence as a string
    '''

    with open(blastOut) as fh:
        blast_record = NCBIXML.read(fh)
    # One row per query position, one column per symbol listed in q.
    A = numpy.zeros((len(qseq),21))
    # Column order of A: 20 residue letters plus the gap character.
    q = ['A','C','D','E','F','G','H','I','K','L','M','N','P','R','S','T','V','Y','W','Q','-']

    # --- first pass: write the alignment file that ucluster reads
    with open(pdb+'.ali', 'w') as out:
        out.write('>%sq\n' % pdb)
        out.write(qseq+'\n')

        # Counts how many HSPs were written per subject id, so that repeated
        # hits get distinct names (id1, id2, ...).
        Seqs = {pdb+'q':qseq}
        for alignment in blast_record.alignments:

            for hsp in alignment.hsps:
                if float(hsp.expect) > evalue: continue
                # Subject id: second-to-last '|'-separated field of the title.
                sseq = alignment.title.split('|')[-2]
                if sseq not in Seqs:
                    Seqs[sseq] = 1
                    out.write('>'+sseq+'1'+'\n')
                    out.write(hsp.sbjct+'\n')
                else:
                    Seqs[sseq] += 1
                    out.write('>'+sseq+str(Seqs[sseq])+'\n')
                    out.write(hsp.sbjct+'\n')
    # --- get A matrix
    # NOTE(review): Clusters is indexed by the sequence names written above and
    # its values are used as divisors; presumably cluster sizes - confirm
    # against ucluster.
    Clusters = ucluster(pdb+'.ali')
    M = len(Clusters)
    Meff = 0.
    # Second pass over the same HSPs in the same order, so the generated
    # names (id + running number) match the ones written to the .ali file.
    Seqs = {}
    for alignment in blast_record.alignments:

        for hsp in alignment.hsps:
            if float(hsp.expect) > evalue: continue
            sseq = alignment.title.split('|')[-2]
            # query_start is shifted by one to index qseq from zero.
            qstart = hsp.query_start-1
            qi = qstart
            hquery = hsp.query
            hsbjct = hsp.sbjct
            if sseq not in Seqs:
                Seqs[sseq] = 1
                sseq += '1'
            else:
                Seqs[sseq] += 1
                sseq += str(Seqs[sseq])
            # Weight of this HSP.
            ma = 1./Clusters[sseq]

            # Walk the alignment columns; qi advances only on columns where
            # the query has a residue (not a gap).
            for i in range(len(hquery)):
                if hquery[i] != '-':
                    if hquery[i]!=qseq[qi]:
                        raise ValueError(
                                "Mismatch in alignment sequence at position "
                                "%d: %s %s" % (i+1, hquery[i], qseq[qi]))
                    # Subject symbols that are not listed in q are ignored.
                    if hsbjct[i] in q: A[qi,q.index(hsbjct[i])] += ma #1.
                    else: pass
                    qi += 1
                else: pass
            Meff += ma
    # Add the query itself; query symbols missing from q are skipped.
    # Note that Meff is increased once per query position here.
    for i,a in enumerate(qseq):
        try:
            A[i,q.index(a)] += 1./Clusters[pdb+'q'] #1.
            Meff += 1./Clusters[pdb+'q']
        except ValueError: pass

    # --- re-weight the A matrix, correct for lambda factor
    lmbd = Meff
    with open(pdb+'.sqc', 'w') as out:
        for i in range(len(A)):
            Si = 0
            # Fa is computed but not used below.
            Fa = sum(A[i])
            for j in range(len(q)):
                si = (1./(Meff+lmbd)) * ( (lmbd/len(q)) + A[i,j] ) #A[i,j]/Fa
                # A is overwritten in place with the re-weighted value.
                A[i,j] = si
                if si>0.: Si -= si*numpy.log(si)
            out.write('\t'.join([str(i+1), qseq[i], str(Si)]) + '\n')
Esempio n. 33
0
def get_hist_ss(test_seq, type='Unknown', debug=0):
    """Returns sequence elements in histone sequence, all numbers assume first element in seq has number 0!!! Not like in PDB

    The query is aligned globally (EMBOSS needle) against one of four
    hard-coded template sequences (H3, H4, H2A, H2B) and the template's
    element boundaries are transferred to the query through that alignment.

    Args:
        test_seq: Query sequence; it is wrapped in a SeqRecord and written to
            a temporary FASTA file.
        type: Template to use ('H3', 'H4', 'H2A' or 'H2B'). With the default
            'Unknown' the template is chosen by a BLAST search of the query
            against the four templates (lowest e-value wins).
        debug: When true, intermediate results are printed.

    Returns:
        A tuple ``(hist_identified, ss_test)``: the name of the template used
        and a dict mapping each element name to ``[start, end]`` in
        ``test_seq`` coordinates, or ``[-1, -1]`` when the element could not
        be placed.
    """

    #Let's define 1kx5 sequences
    templ_H3 = Seq(
        "ARTKQTARKSTGGKAPRKQLATKAARKSAPATGGVKKPHRYRPGTVALREIRRYQKSTELLIRKLPFQRLVREIAQDFKTDLRFQSSAVMALQEASEAYLVALFEDTNLCAIHAKRVTIMPKDIQLARRIRGERA",
        IUPAC.protein)
    templ_H4 = Seq(
        "SGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKRISGLIYEETRGVLKVFLENVIRDAVTYTEHAKRKTVTAMDVVYALKRQGRTLYGFGG",
        IUPAC.protein)
    templ_H2A = Seq(
        "SGRGKQGGKTRAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERVGAGAPVYLAAVLEYLTAEILELAGNAARDNKKTRIIPRHLQLAVRNDEELNKLLGRVTIAQGGVLPNIQSVLLPKKTESSKSKSK",
        IUPAC.protein)
    templ_H2B = Seq(
        "AKSAPAPKKGSKKAVTKTQKKDGKKRRKTRKESYAIYVYKVLKQVHPDTGISSKAMSIMNSFVNDVFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSAK",
        IUPAC.protein)

    #'element_name':[start,stop], start stop - are inclusive as in PDB file
    #Numbering differs between symmetrical chains and 1kx5 vs 1aoi.
    #We simply take the minimum length of alpha helices over all chains in 1kx5
    #1 subtracted from PDB values!!! because these values are array indices, numbered starting from 0

    #docking domain (amino acids 80 – 119) from paper by Luger 1aoi, however in JMB paper we defined it as 80-118, probably to be at the trypsin cleavage site KK???, so we stick with this here. Although HistoneDB uses the Luger convention (albeit with a bug - it starts with 81 - that was fixed in code now).

    ss_templ_H3 = {
        'alphaN': [43, 56],
        'alpha1': [62, 76],
        'alpha2': [84, 113],
        'alpha3': [119, 130],
        'loopL1': [78, 83],
        'loopL2': [114, 118],
        'beta1': [82, 83],
        'beta2': [117, 118],
        'mgarg1': [62, 62],
        'mgarg2': [82, 82],
        'mgarg3': [48, 48]
    }
    ss_templ_H4 = {
        'alpha1ext': [23, 28],
        'alpha1': [29, 40],
        'alpha2': [48, 75],
        'alpha3': [81, 92],
        'loopL1': [41, 47],
        'loopL2': [76, 81],
        'beta1': [44, 45],
        'beta2': [79, 80],
        'beta3': [95, 97],
        'mgarg1': [44, 44]
    }
    # ss_templ_H2A={'alpha1ext':[15,21],'alpha1':[25,36],'alpha2':[45,72],'alpha3':[78,88],'alpha3ext':[89,96],'loopL1':[37,44],'loopL2':[73,77],'beta1':[41,42],'beta2':[76,77],'beta3':[99,101],'docking domain':[91,107],'docking tail':[108,116],'mgarg1':[41,41],'mgarg2':[76,76]}
    #new def of docking domains as in Suto Luger 2000
    ss_templ_H2A = {
        'alpha1ext': [15, 21],
        'alpha1': [25, 36],
        'alpha2': [45, 72],
        'alpha3': [78, 88],
        'alpha3ext': [89, 96],
        'loopL1': [37, 44],
        'loopL2': [73, 77],
        'beta1': [41, 42],
        'beta2': [76, 77],
        'beta3': [99, 101],
        'docking domain': [80, 118],
        'mgarg1': [41, 41],
        'mgarg2': [76, 76]
    }

    ss_templ_H2B = {
        'alpha1': [33, 45],
        'alpha2': [51, 80],
        'alpha3': [86, 98],
        'alphaC': [99, 119],
        'loopL1': [46, 50],
        'loopL2': [81, 85],
        'beta1': [49, 50],
        'beta2': [84, 85],
        'mgarg1': [29, 29]
    }

    # Lookup tables from histone type name to element table / template sequence.
    ss_templ = {
        'H3': ss_templ_H3,
        'H4': ss_templ_H4,
        'H2A': ss_templ_H2A,
        'H2B': ss_templ_H2B
    }
    templ = {
        'H3': templ_H3,
        'H4': templ_H4,
        'H2A': templ_H2A,
        'H2B': templ_H2B
    }

    #Lets use blast and see what histone is our query
    my_records = [
        SeqRecord(templ_H3, id='H3', name='H3'),
        SeqRecord(templ_H4, id='H4', name='H4'),
        SeqRecord(templ_H2A, id='H2A', name='H2A'),
        SeqRecord(templ_H2B, id='H2B', name='H2B')
    ]

    # Random base names for the temporary files created in CONFIG.TEMP_DIR.
    n1 = str(uuid.uuid4())
    n2 = str(uuid.uuid4())

    faa_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".faa")
    fastan2_file = os.path.join(CONFIG.TEMP_DIR, n2 + ".fasta")
    fastan1_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".fasta")
    db_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".db")
    xml_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".xml")
    txt_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".txt")

    # Files that makeblastdb is expected to create next to db_file; they are
    # only referenced by the clean-up at the end.
    phr_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".db.phr")
    pin_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".db.pin")
    psq_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".db.psq")

    # The query is written once and used by both the BLAST and the needle run.
    SeqIO.write([SeqRecord(test_seq, id='Query', name='Query')], fastan2_file,
                'fasta')

    # print(os.environ.get('PATH'))
    if (type == 'Unknown'):

        # Build a small protein database from the four templates and search
        # the query against it.
        SeqIO.write(my_records, faa_file, "fasta")
        os.system('makeblastdb -dbtype prot -in %s -out %s > /dev/null' %
                  (faa_file, db_file))

        blastp_cline = NcbiblastpCommandline(query=fastan2_file,
                                             db=db_file,
                                             evalue=100,
                                             outfmt=5,
                                             out=xml_file)
        stdout, stderr = blastp_cline()

        blast_record = NCBIXML.read(open(xml_file, 'r'))

        # Parallel lists: hit title, e-value and HSP object of every HSP.
        sname = list()
        evalue = list()
        hsp_list = list()
        # length_list=list()
        for alignment in blast_record.alignments:
            for hsp in alignment.hsps:
                sname.append(alignment.title)
                evalue.append(hsp.expect)
                hsp_list.append(hsp)
                # length_list.append(alignment.length)
        # Pick the HSP with the lowest e-value; the second whitespace-separated
        # word of its title is taken as the histone type.
        # NOTE(review): min() raises ValueError when BLAST returned no hits.
        hist_identified = sname[evalue.index(min(evalue))].split()[1]
        hsp = hsp_list[evalue.index(min(evalue))]
        # length=length_list[evalue.index(min(evalue))]
    else:
        hist_identified = type

    if (debug): print('Most likely this is histone:')
    if (debug): print(hist_identified)
    if (debug): print('Blast alignment')
    #We need to determine secondary structure according to template using the alignment
    # if(debug): print(hsp)

    SeqIO.write([
        SeqRecord(
            templ[hist_identified], id=hist_identified, name=hist_identified)
    ], fastan1_file, 'fasta')

    #Now we will redo it with Needleman-Wunsch - the global alignment
    needle_cline = NeedleCommandline(asequence=fastan1_file,
                                     bsequence=fastan2_file,
                                     gapopen=20,
                                     gapextend=1,
                                     outfile=txt_file)
    stdout, stderr = needle_cline()
    # print('Needle alignment')

    # align[0] is the template row (asequence), align[1] the query row.
    align = AlignIO.read(txt_file, "emboss")
    if (debug):
        print(align)
    # print(hsp.gaps)
    #Blast checking
    # ss_test=dict()
    # for key,value in ss_templ[hist_identified].iteritems():
    #     print('Checking %s'%key)
    #     if((hsp.sbjct_start<=value[1])&((hsp.sbjct_end)>=value[0])):
    #         print('Belongs')
    #     else:
    #         print('Not')

    #Now we will get correspondence

    ss_test = dict()
    hist = templ[hist_identified]

    # corrsp_hist[p] = alignment column of template position p; k counts the
    # gap characters seen so far in the template row.
    corrsp_hist = list(range(len(hist)))
    k = 0
    for a, i in zip(align[0], range(len(align[0]))):
        if (a == '-'):
            k = k + 1
        else:
            corrsp_hist[i - k] = i
    if (debug): print(corrsp_hist)

    # corrsp_test[p] = alignment column of query position p, built the same way.
    corrsp_test = list(range(len(test_seq)))
    k = 0
    for a, i in zip(align[1], range(len(align[1]))):
        if (a == '-'):
            k = k + 1
        else:
            corrsp_test[i - k] = i
    if (debug): print(corrsp_test)

    for key, value in ss_templ[hist_identified].items():
        if (debug): print('Checking %s' % key)
        start_in_aln = corrsp_hist[value[0]]
        if (debug): print('Start in aln %d' % start_in_aln)

        end_in_aln = corrsp_hist[value[1]]
        if (debug): print('End in aln %d' % end_in_aln)

        # Find the query position sitting in the start column; when the query
        # has a gap there, try the following columns one by one.
        for k in range(len(align[0])):
            try:
                start_in_test_seq = corrsp_test.index(start_in_aln + k)
            except:
                start_in_test_seq = -1
                if (debug): print("Trying to move start"),
                continue
            break
        # print('\n %d'%start_in_test_seq)

        # Same for the end column, moving towards the start of the alignment.
        for k in range(len(align[0])):
            try:
                end_in_test_seq = corrsp_test.index(end_in_aln - k)
            except:
                end_in_test_seq = -1
                if (debug): print('Trying to move end'),
                continue
            break

        # print('\n %d'%end_in_test_seq)
        # An element that could not be placed, or whose boundaries crossed, is
        # reported as [-1, -1].
        if ((start_in_test_seq == -1) | (end_in_test_seq == -1) |
            (start_in_test_seq > end_in_test_seq)):
            ss_test[key] = [-1, -1]
        else:
            ss_test[key] = [start_in_test_seq, end_in_test_seq]
        if (debug): print(ss_test[key])

    # Remove the temporary files; the BLAST database files exist only when the
    # histone type had to be identified.
    if (type == 'Unknown'):
        #os.system("rm %s.faa %s.db.phr %s.db.pin %s.db.psq %s.fasta %s.xml %s.txt %s.fasta"%(n1,n1,n1,n1,n2,n1,n1,n1))
        os.system("rm %s %s %s %s %s %s %s %s"%\
            (faa_file,phr_file,pin_file,psq_file,fastan2_file,xml_file,txt_file,fastan1_file))

    else:
        os.system("rm   %s  %s %s" % (fastan2_file, txt_file, fastan1_file))

    return hist_identified, ss_test
Esempio n. 34
0
def cazy2class(prefix, F, remote=False):
    '''
    Will take the cazy database (dictionary provided) and try to fetch
    subfamilies and place them as classifiers.

    Every name returned by get_names(prefix + '.gm') is BLASTed (remotely
    through qblast, or locally with blastp against nr). The first alignment
    with at least 90% identity whose accession is found in the CAZY
    dictionary decides the subfamily; otherwise the name itself is used.
    The classifiers are written, ';'-terminated, on one line of
    prefix + '.cls'.

    prefix -- base name of the '.gm' input and the '.cls' output.
    F      -- object providing the 'seqs' and 'chains' lookups used to build
              the local query.
    remote -- run BLAST at NCBI instead of locally.
    '''
    print('You chose to use CAZY database to classify GH13 family into subfamilies'
          ' this will take a while, since have to go over BLAST results, etc..')
    with open(prefix + '.cls', 'w') as cls:
        # import database
        # NOTE: pickle must only be used on trusted files; CazyDB.bin is
        # assumed to be a local, trusted resource.
        with open('CazyDB.bin') as db_handle:
            db = pickle.load(db_handle)
        names = get_names(prefix + '.gm')
        for n in names:
            print('Processing %s...' % (n))
            if remote:
                Entrez.email = '*****@*****.**'
                print('\tBlasting (Running remotely)...')
                n = n[:-1] + '_' + n[-1]
                while 1:
                    try:
                        b = qblast('blastp',
                                   'nr',
                                   n,
                                   perc_ident=90,
                                   expect=1,
                                   gapcosts='11 1')
                        print('\tBlast Done... \n\t\tAnalysing entries...')
                        break
                    # Retry on errors only; Ctrl-C must still stop the loop.
                    except Exception:
                        print('Some problem in NCBI, sleeping for 10...')
                        time.sleep(10)
            else:
                print('\tBlasting (Running locally)...')
                with open('temp.fasta', 'w') as fi:
                    fi.write('>%s\n%s' % (n, F.seqs[F.chains[n[:4]]]))
                bl = Popen('blastp -db nr -outfmt "5" -query temp.fasta  -evalue 0.0001 -max_target_seqs 50 '
                           '-seg yes -num_threads 4  -gapopen 10 -gapextend 1 -matrix BLOSUM90 -out temp.xml',
                           shell=True)
                bl.wait()
                print('\tBlast Done... \n\t\tAnalysing entries...')
                b = open('temp.xml')
            try:
                blast_record = NCBIXML.read(b)
            finally:
                b.close()
            rm = Popen('rm temp.*', shell=True)
            rm.wait()

            # for/else: the else branch runs when no alignment produced a
            # CAZY hit, including the case of an empty alignment list, which
            # previously left the surrounding 'while nohit' spinning forever.
            for a in blast_record.alignments:
                print('\t\t\t%s' % (a.accession))
                h = a.hsps[0]
                if float(h.identities) / float(h.align_length) < 0.9:
                    continue
                ans, k = dict_lookup(a.accession, db)
                if ans:
                    cls.write(str(db[k]) + ';')
                    print('\t\t\t\tAccession number found in CAZY!, Subfamily %s' % (
                        db[k]))
                    break
            else:
                cls.write('%s;' % (n))
                print('\tNo relative found in CAZY')
        cls.write('\n')
def process_one_input_file (input_file , OUT_DIR):
    """Convert one gzipped BLAST XML file into a gzipped feature table.

    For ``<name>.xml.gz`` two files are created in ``OUT_DIR`` (which is
    used as a plain string prefix): ``<name>.features_tsv.gz`` with one
    tab-separated row per HSP, and ``<name>.errorlog.txt`` with one line per
    HSP whose hit id does not consist of exactly three '|'-separated fields.

    Row columns: query name, query id, query length, query letters, third
    field of the hit id, hit accession, hit length, then the HSP values
    align_length, bits, score, expect, query_start, query_end, sbjct_start,
    sbjct_end, frame[0], frame[1], identities, positives and gaps.
    """
    base_name = input_file.split("/")[-1].split(".xml.gz")[0]
    output_file = OUT_DIR + base_name + ".features_tsv.gz"
    errlog_file = OUT_DIR + base_name + ".errorlog.txt"

    print("processing       : " + input_file)
    print("creating output  : " + output_file)
    print("creating errorlog: " + errlog_file)

    # Context managers close all three files even when parsing fails.
    with gzip.open(input_file, 'rt') as inp_file_handle, \
            gzip.open(output_file, 'wt') as out_file_handle, \
            open(errlog_file, "wt") as log_file_handle:
        for RECORD in NCBIXML.parse(inp_file_handle):
            # Features : For each RECORD (Generally we have only 1 record)
            # A query like "T96060004884 DPY30_HUMAN" is reduced to its
            # first word.
            query_words = RECORD.query.split()
            query_name = query_words[0] if len(query_words) > 1 else RECORD.query
            record_features = [
                query_name,
                RECORD.query_id,       # <BlastOutput_query-ID>
                RECORD.query_length,   # <Iteration_query-len>
                RECORD.query_letters,  # <BlastOutput_query-len>
            ]

            for alignment in RECORD.alignments:
                # Features : For each Alignment (<Hit>)
                hit_fields = alignment.hit_id.split("|")
                uniprot_id = hit_fields[2] if len(hit_fields) == 3 else None
                hit_features = [
                    uniprot_id,
                    alignment.accession,   # <Hit_accession>
                    alignment.length,      # <Hit_len>
                ]

                for hsp in alignment.hsps:
                    if uniprot_id is None:
                        # A hit id without any '|' has no second field; log
                        # an empty value instead of raising IndexError.
                        second_field = hit_fields[1] if len(hit_fields) > 1 else ""
                        ErrStr = RECORD.query_id + "\t" + alignment.hit_id + "\t" + "GI: " + second_field + "\n"
                        log_file_handle.write(ErrStr)
                        continue

                    # Features : For each hsp (<Hsp>)
                    hsp_features = [
                        hsp.align_length,  # <Hsp_align-len>
                        hsp.bits,          # <Hsp_bit-score>
                        hsp.score,         # <Hsp_score>
                        hsp.expect,        # <Hsp_evalue>
                        hsp.query_start,   # <Hsp_query-from>
                        hsp.query_end,     # <Hsp_query-to>
                        hsp.sbjct_start,   # <Hsp_hit-from>
                        hsp.sbjct_end,     # <Hsp_hit-to>
                        hsp.frame[0],      # <Hsp_query-frame>
                        hsp.frame[1],      # <Hsp_hit-frame>
                        hsp.identities,    # <Hsp_identity>
                        hsp.positives,     # <Hsp_positive>
                        hsp.gaps,          # <Hsp_gaps>
                    ]
                    L = record_features + hit_features + hsp_features
                    out_file_handle.write("\t".join(str(x) for x in L) + "\n")
def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False, include_seq=False):
    """Yield one ``SeqRecord`` per BLAST query, carrying its hits as features.

    Every HSP of every hit becomes a top-level ``SeqFeature`` (typed by the
    BLAST application) whose ``sub_features`` are ``match_part`` features
    produced by ``generate_parts``.

    :param blastxml: handle (or path) accepted by ``NCBIXML.parse``.
    :param min_gap: forwarded to ``generate_parts`` as ``ignore_under``.
    :param trim: clamp a parent feature start below 1 to 0, and clamp the
        parent end to ``hsp.query_end + 1``.
    :param trim_end: clamp only the parent end to ``hsp.query_end + 1``.
    :param include_seq: also store the aligned query/subject/match strings
        as ``blast_qseq`` / ``blast_sseq`` / ``blast_mseq`` qualifiers.
    :return: generator of ``SeqRecord`` objects (placeholder sequence
        ``"ACTG"``, empty annotations).
    """
    from Bio.Blast import NCBIXML
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
    # Currently we can only handle BLASTN, BLASTP; anything else is a
    # generic 'match'.
    match_types = {
        'BLASTN': 'nucleotide_match',
        'BLASTP': 'protein_match',
    }
    # HSP attributes copied verbatim into ``blast_<name>`` qualifiers.
    hsp_props = ('score', 'bits', 'identities', 'positives',
                 'gaps', 'align_length', 'strand', 'frame',
                 'query_start', 'query_end', 'sbjct_start',
                 'sbjct_end')

    blast_records = NCBIXML.parse(blastxml)
    for idx_record, record in enumerate(blast_records):
        match_type = match_types.get(record.application, 'match')

        # Only the first whitespace-free token of the query is used as the id.
        recid = record.query.split(' ', 1)[0]

        rec = SeqRecord(Seq("ACTG"), id=recid)
        for idx_hit, hit in enumerate(record.alignments):
            hit_titles = hit.title.split(' >')
            # Description is everything from the first space of the first
            # title onwards (leading space kept, as before).  A title with no
            # space at all used to raise ValueError; it now yields ''.
            _, sep, rest = hit_titles[0].partition(' ')
            description = sep + rest

            for idx_hsp, hsp in enumerate(hit.hsps):
                qualifiers = {
                    "ID": 'b2g.%s.%s.%s' % (idx_record, idx_hit, idx_hsp),
                    "source": "blast",
                    "score": hsp.expect,
                    "accession": hit.accession,
                    "hit_id": hit.hit_id,
                    "length": hit.length,
                    # Fresh list per feature so features never share state.
                    "hit_titles": list(hit_titles),
                }
                if include_seq:
                    qualifiers.update({
                        'blast_qseq': hsp.query,
                        'blast_sseq': hsp.sbjct,
                        'blast_mseq': hsp.match,
                    })

                for prop in hsp_props:
                    qualifiers['blast_' + prop] = getattr(hsp, prop, None)

                qualifiers['description'] = description

                # This required a fair bit of sketching out/maths to figure
                # out the first time.
                #
                # The match_start location must account for queries and
                # subjects that start at locations other than 1.
                parent_match_start = hsp.query_start - hsp.sbjct_start
                # The end is the start + hit.length because the match itself
                # may be longer than the parent feature, so we use the
                # supplied subject/hit length to calculate the real ending of
                # the target protein.
                parent_match_end = hsp.query_start + hit.length + hsp.query.count('-')

                # If we trim the left end, we need to trim without losing
                # information: the untrimmed start is still used below to
                # position the match parts.
                used_parent_match_start = parent_match_start
                if trim and parent_match_start < 1:
                    used_parent_match_start = 0

                if (trim or trim_end) and parent_match_end > hsp.query_end:
                    parent_match_end = hsp.query_end + 1

                # The ``match`` feature will hold one or more ``match_part``s
                top_feature = SeqFeature(
                    FeatureLocation(used_parent_match_start, parent_match_end),
                    type=match_type, strand=0,
                    qualifiers=qualifiers
                )

                top_feature.sub_features = []
                for idx_part, (start, end, cigar) in \
                        enumerate(generate_parts(hsp.query, hsp.match,
                                                 hsp.sbjct,
                                                 ignore_under=min_gap)):
                    # Unlike the parent feature, ``match_part``s have sources.
                    # A new dict is built per part, so no copy is needed.
                    part_qualifiers = {
                        "source": "blast",
                        "Gap": cigar,
                        "ID": qualifiers['ID'] + ('.%s' % idx_part),
                    }

                    # We have to account for the subject start's location
                    match_part_start = parent_match_start + hsp.sbjct_start + start - 1

                    # We used to use hsp.align_length here, but that includes
                    # gaps in the parent sequence.
                    #
                    # Furthermore align_length will give calculation errors
                    # in weird places, so we just use (end - start) for
                    # simplicity.
                    match_part_end = match_part_start + (end - start)

                    top_feature.sub_features.append(
                        SeqFeature(
                            FeatureLocation(match_part_start, match_part_end),
                            type="match_part", strand=0,
                            qualifiers=part_qualifiers)
                    )

                rec.features.append(top_feature)
        rec.annotations = {}
        yield rec
Esempio n. 37
0
def get_output(display, dcov, overwrite, query, dbname, evalue, coverage,
               which, e_filter, out, alignment_length):
    """Run BLASTP, keep hits that share a Pfam domain/clan, write an HTML report.

    Steps: run (or reuse) ``{out}/result.xml``, tabulate every HSP, filter
    with ``blast_filter``, extract the surviving sequences with
    ``blastdbcmd``, run (or reuse) ``hmmscan``, drop BLAST pairs whose
    proteins share neither a Pfam domain nor a clan, then write
    ``good_blast_results.tsv``, ``good_pfam_results.tsv``,
    ``good_blast_results1.tsv``, hydropathy plots (via ``quod.py``) and
    ``result.html`` under ``out``.

    :param display: maximum number of BLAST rows shown in the report.
    :param dcov: passed to ``hmmParser.filterByCoverage``.
    :param overwrite: re-run blastp / hmmscan even if their outputs exist.
    :param query: with a 'genome' database, the ``-query`` file; otherwise a
        string prefixed to the command line, with blastp reading the query
        from stdin (presumably a shell pipeline prefix; verify with caller).
    :param dbname: BLAST database. If the name contains 'genome' the
        query/subject column labels are swapped when tabulating.
    :param evalue: blastp ``-evalue`` threshold.
    :param coverage: forwarded to ``blast_filter``.
    :param which: forwarded to ``blast_filter``.
    :param e_filter: forwarded to ``blast_filter``.
    :param out: output directory (must already exist).
    :param alignment_length: forwarded to ``blast_filter``.
    :return: None.  Calls ``sys.exit(1)`` when no hit survives filtering.

    NOTE(review): commands are assembled as shell strings from accessions
    and paths; only feed this trusted input.
    """

    def short_accession(raw):
        # First token, version suffix removed, middle field of a
        # 'db|acc|name' style identifier if present.
        acc = raw.split()[0].split('.')[0]
        if '|' in acc:
            acc = acc.split('|')[1]
        return acc

    def run_and_capture(command):
        # communicate() drains stdout *and* reaps the child process;
        # universal_newlines gives text (not bytes) on Python 3 as well.
        process = subprocess.Popen(command,
                                   stdout=subprocess.PIPE,
                                   shell=True,
                                   universal_newlines=True)
        return process.communicate()[0]

    # form the blast result first
    if 'genome' in dbname:
        cmdstring = 'blastp -query {q} -db {d} -evalue {e} -max_hsps 1 -out {o}/result.xml -outfmt "5"'.format(
            q=query, d=dbname, e=evalue, o=out)
        col_list = [
            'sacc', 'qacc', 'slen', 'qlen', 'length', 'gaps', 'sstart', 'send',
            'qstart', 'qend', 'evalue', 'score', 'pident', 'sseq', 'match',
            'qseq'
        ]
    else:
        cmdstring = '{q}blastp -query - -db {d} -evalue {e} -max_hsps 1 -out {o}/result.xml -outfmt "5"'.format(
            q=query, d=dbname, e=evalue, o=out)
        col_list = [
            'qacc', 'sacc', 'qlen', 'slen', 'length', 'gaps', 'qstart', 'qend',
            'sstart', 'send', 'evalue', 'score', 'pident', 'qseq', 'match',
            'sseq'
        ]
    result_xml = '{o}/result.xml'.format(o=out)
    if overwrite or not os.path.isfile(result_xml):
        os.system(cmdstring)
    # parse the result to the filter
    xml_list = []
    with open(result_xml) as result_handle:
        for rec in NCBIXML.parse(result_handle):
            for alignment in rec.alignments:
                for hsp in alignment.hsps:
                    xml_list.append([
                        short_accession(rec.query),
                        short_accession(alignment.accession),
                        rec.query_length, alignment.length,
                        hsp.align_length, hsp.gaps,
                        hsp.query_start, hsp.query_end,
                        hsp.sbjct_start, hsp.sbjct_end,
                        hsp.expect, hsp.score,
                        100 * float(hsp.identities) / float(hsp.align_length),
                        hsp.query, hsp.match, hsp.sbjct
                    ])
    df = pd.DataFrame(xml_list, columns=col_list)
    df['pident'] = df['pident'].apply(lambda x: int(x))
    # calculate the query/subject coverage and add column
    qcovs = []
    scovs = []
    for index, row in df.iterrows():
        # calculate and insert corresponding values
        qc = ((row["qend"] - row["qstart"] + 1) * 100) / row["qlen"]
        qcovs.append(qc)
        sc = ((row["send"] - row["sstart"] + 1) * 100) / row["slen"]
        scovs.append(sc)
    df.insert(loc=12, column='qcovs', value=qcovs)
    df.insert(loc=13, column='scovs', value=scovs)
    blast_filter(df, coverage, which, e_filter, alignment_length)
    # sort entries based on e-value, coverages
    df.sort_values(["evalue", "qcovs"], inplace=True, ascending=[True, False])

    # try to run pfam for these results; extract sequence of good hits into one single fasta file
    genome_list = list(set(df['qacc'].tolist()))
    NCBI_list = list(set(df['sacc'].tolist()))
    # check the number of protein in the list, if empty, exit
    if len(genome_list) == 0:
        sys.stderr.write(
            "No good results! Try to use other keywords or lower the filter standard.\n"
        )
        sys.exit(1)

    list_to_file('{o}/good_genome.txt'.format(o=out), genome_list)
    list_to_file('{o}/good_NCBI.txt'.format(o=out), NCBI_list)
    cmdstring = 'blastdbcmd -db {o}/blastdb/genome_blastdb -entry_batch {o}/good_genome.txt -out {o}/good_result.fasta;blastdbcmd -db {o}/blastdb/NCBI_blastdb -entry_batch {o}/good_NCBI.txt >> {o}/good_result.fasta'.format(
        o=out)
    os.system(cmdstring)
    # run hmmscan and parse the result as a table
    if overwrite or not os.path.isfile(out + '/pfam.out'):
        cmdstring = 'hmmscan --cpu 4 --noali --cut_ga -o /dev/null --domtblout {o}/pfam.out /ResearchData/pfam/pfamdb/Pfam-A.hmm {o}/good_result.fasta'.format(
            o=out)
        os.system(cmdstring)
    # parse the result in pandas
    hmm_object = hmmParser('{o}/pfam.out'.format(o=out))
    hmm_object.filterByCoverage(dcov)
    df_pfam = pd.DataFrame(hmm_object.matrix)
    # NOTE: the misspelt 'socre' and the repeated 'bias' are kept as-is
    # because these names end up in the TSV header read by other code.
    domtblout_cols = 'target_name t_accession tlen query_name accession qlen evalue socre bias # of cevalue ievalue score bias hmm_from hmm_to ali_from ali_to env_from env_to acc description_of_target'.strip(
    ).split(' ')
    df_pfam.columns = domtblout_cols
    # get the clan accessions and clan info for the existing results
    df_pfam.insert(loc=3, column='clan_acc', value='')
    df_pfam.insert(loc=4, column='clan_info', value='')
    df_pfam['query_name'] = df_pfam['query_name'].apply(
        lambda x: x.split('.')[0])
    df_pfam['t_accession'] = df_pfam['t_accession'].apply(
        lambda x: x.split('.')[0])

    # Rows handed out by iterrows() may be copies, so values are collected
    # here and assigned as whole columns instead of being written to a row.
    # Each distinct accession is looked up only once.
    clan_cache = {}
    clan_accs = []
    clan_infos = []
    for accession in df_pfam['t_accession']:
        if accession not in clan_cache:
            cmd = 'zgrep {s} /ResearchData/pfam/download/Pfam-A.clans.tsv.gz'.format(
                s=accession)
            output = run_and_capture(cmd).split()
            # zgrep may print nothing; treat that like "no clan".
            if len(output) > 2 and 'CL' in output[1]:
                clan_cache[accession] = (output[1], output[2])
            else:
                clan_cache[accession] = ('N/A', 'N/A')
        clan_acc, clan_info = clan_cache[accession]
        clan_accs.append(clan_acc)
        clan_infos.append(clan_info)
    df_pfam['clan_acc'] = clan_accs
    df_pfam['clan_info'] = clan_infos

    # for each pair of columns in the filtered output, map the value of each cell to the pfam result, check if the domains are the same
    domain_dic = clan_to_dic('query_name', 'target_name', df_pfam)
    clan_dic = clan_to_dic('query_name', 'clan_acc', df_pfam)

    # create a list for unqualified results and drop rows according to the list in two dataframes
    bad_hits_pfam = []
    no_domain = []
    query_name_pos = list(df_pfam.columns).index('query_name')
    for index, row in df.iterrows():
        # use set to compare if 2 lists have at least one common element
        try:
            if not set(domain_dic[row["qacc"]]) & set(domain_dic[row["sacc"]]):
                if not set(clan_dic[row["qacc"]]) & set(clan_dic[row["sacc"]]):
                    df = df.drop(index)
                    bad_hits_pfam.append(row["qacc"])
                    bad_hits_pfam.append(row["sacc"])
        except KeyError:  # in the case that the protein has no protein domains
            for acc in (row["qacc"], row["sacc"]):
                if domain_dic.get(acc):
                    continue
                # Placeholder row: everything 'N/A' except the protein name.
                placeholder = ['N/A'] * len(df_pfam.columns)
                placeholder[query_name_pos] = acc
                df_pfam.loc[-1] = placeholder
                df_pfam.index = df_pfam.index + 1
                if acc not in no_domain:
                    no_domain.append(acc)

    # delete rows according two lists
    df_pfam = df_pfam.drop_duplicates()
    df_pfam = df_pfam.set_index('query_name', drop=True)
    df_pfam = df_pfam.drop(bad_hits_pfam)  # drop by rows

    # save new separate results
    df['evalue'] = df['evalue'].apply(lambda x: '%.2e' % x)
    df.to_csv('{o}/good_blast_results.tsv'.format(o=out), sep='\t')
    df_pfam.to_csv('{o}/good_pfam_results.tsv'.format(o=out), sep='\t')
    pfam_dic = pfam_to_dic(df_pfam)
    # create hydropathy plots for these results, and add protein domains
    plot_path = out + '/plots'
    if not os.path.isdir(plot_path):
        os.makedirs(plot_path)

    # keep only the rows that will be displayed
    df = df.head(min(display, df.shape[0]))
    df.reset_index(inplace=True)
    df.to_csv('{o}/good_blast_results1.tsv'.format(o=out), sep='\t')

    domain_row_template = '<tr><td>{domain}</td><td>{dacc}</td><td>{dlen}</td><td>{pacc}</td><td>{plen}</td><td>{evalue}</td><td>{f}</td><td>{t}</td><td>{clan}</td><td>{cacc}</td></tr>'

    # create the final result and in the same loop finish data insertion
    with open(out + '/result.html', 'w') as outfile:
        # The <style> element must contain only CSS; the stray
        # '</head><body>' that used to sit inside it broke the first rule.
        outfile.write(
            '<!DOCTYPE html><html><head><title>Missing Components Results</title><style type="text/css">\n.label {text-align: right;width:50px;}\n.data {text-align:left;padding-left: 8px;width:100px;}\n.seq{border:2px solid black;height:70px; width:100%;overflow-x:auto;overflow-y:auto;margin:1em 0;background:grey;color:white;}tab1 { padding-left: 4em;}</style></head><body>\n'
        )
        # for each pair in the blast results (good one) create two separated hydropathy plots and create a combined one with pfam domain covered, use blastdbcmd to extract sequences from the database and pass to the quod.py
        for index, row in df.iterrows():
            d_str = domain_string(row["qacc"], row["sacc"], pfam_dic,
                                  no_domain)

            seq1 = run_and_capture(
                'blastdbcmd -db {o}/blastdb/genome_blastdb -entry {s}'.format(
                    o=out, s=row['qacc']))
            seq2 = run_and_capture(
                'blastdbcmd -db {o}/blastdb/NCBI_blastdb -entry {q}'.format(
                    o=out, q=row['sacc']))
            # three-line textual alignment shown in the report
            alignment_text = row['qseq'] + '\n' + row['match'] + '\n' + row['sseq']

            # draw separated plots first, draw bars of alignment and common domain
            # get the separate command string first. It should have multiple domains and for same domain they should contain the same color
            os.system(
                'quod.py -l {q} -q -s "{s1}" --width 15 -c red -d {p} -o {q}_vs_{s}.png -w {qs}-{qe} {ds}'
                .format(s1=seq1,
                        p=plot_path,
                        q=row['qacc'],
                        qs=row['qstart'],
                        qe=row['qend'],
                        ds=d_str[0],
                        s=row['sacc']))
            os.system(
                'quod.py -l {s} -q -s "{s2}" --width 15 -c blue -d {p} -o {s}_vs_{q}.png -w {ss}-{se} {ds}'
                .format(s2=seq2,
                        p=plot_path,
                        s=row['sacc'],
                        ss=row['sstart'],
                        se=row['send'],
                        ds=d_str[1],
                        q=row['qacc']))
            # draw the aligned part as the combined graph
            seq = '>\n' + row['qseq'] + '\n>\n' + row['sseq']
            # text mode so the str payload can be written on Python 2 and 3
            with tempfile.NamedTemporaryFile(mode='w', delete=True) as temp:
                temp.write(seq)
                temp.flush()
                os.system(
                    'quod.py {f} -l {q} -q --width 15 -d {p} -o {q}.png'.format(
                        f=temp.name,
                        p=plot_path,
                        q=row['qacc'] + '_' + row['sacc'] + '_aligned'))

            # insert the blast info
            outfile.write(
                '<br /><hr style="border-style:solid; border-width:5px; color:black;"/><h2 style="text-align:left;">{q}</h2></font><font size = "4"><b>Hit Accession:</b>{s}</font><table width="600px" border="0" cellspacing="0" cellpadding="2"><tr><td class="label"><b>E-value:</b></td><td class="data">{evalue}</td><td class="label"><b>Identity:</b></td><td class="data">{pident}%</td><td class="label"><b>Length:</b></td><td class="data">{length}</td></tr><tr><td class="label"><b>Q_cov:</b></td><td class="data">{qcov}%</td><td class="label"><b>S_cov:</b></td><td class="data">{scov}%</td><td class="label"></td><td class="data"></td></tr></table><p><b>Alignment:</b><tab1>Query:{qstart}-{qend}<tab1>Subject:{sstart}-{send}</p><div class="seq"><pre>{align}</pre></div>'
                .format(q=row['qacc'],
                        s=row['sacc'],
                        length=row['length'],
                        evalue=row['evalue'],
                        pident=row['pident'],
                        align=alignment_text,
                        qcov=row['qcovs'],
                        scov=row['scovs'],
                        qstart=row['qstart'],
                        qend=row['qend'],
                        sstart=row['sstart'],
                        send=row['send']))
            # insert images to the html file
            outfile.write(
                '<center><table style = "width:100%" border = "0"><tr><td><center><img src = "{p}/{q}.png" style="width:90%; height:90%" /></center></td><td><center><img src = "{p}/{s}.png" style="width:90%; height:90%" /></center></td></tr><tr><td colspan = "2"><center><img src = "{p}/{qs}.png" style="width:50%; height:50%" /></center></td></tr></table></center>'
                .format(p=plot_path,
                        q=row['qacc'] + '_vs_' + row['sacc'],
                        s=row['sacc'] + '_vs_' + row['qacc'],
                        qs=row['qacc'] + '_' + row['sacc'] + '_aligned'))
            # insert the hmm info
            outfile.write(
                '<center><table style = "width:100%" border = "1"><tr><td>Domain</td><td>Domain_acc</td><td>Domain_len</td><td>Protein_acc</td><td>Protein_len</td><td>evalue</td><td>from</td><td>to</td><td>Clan</td><td>Clan_acc</td></tr>'
            )
            # query protein's domains first, then the subject's
            for acc in (row["qacc"], row["sacc"]):
                for obj in pfam_dic[acc]:
                    outfile.write(
                        domain_row_template.format(domain=obj[6],
                                                   dacc=obj[2],
                                                   dlen=obj[7],
                                                   pacc=obj[8],
                                                   plen=obj[9],
                                                   evalue=obj[10],
                                                   f=obj[0],
                                                   t=obj[1],
                                                   clan=obj[11],
                                                   cacc=obj[3]))
            outfile.write('</table></center><br>')

        # Eventually finish the result.html file
        outfile.write('</body></html>')
Esempio n. 38
0
def readBlast(db, path, compareTo):
    """Attach BLAST hits against ``compareTo`` to the query genes in ``db``.

    Reads ``<path>/temp/out_<basename of compareTo['path']>.xml``.  For every
    query record the HSPs are mapped to a gene of the hit contig (first via
    ``matchInPV``, otherwise via ``featurefinder.findMatchingFeatures``),
    grouped by locus, screened by subject and query coverage, and finally
    stored under ``geneMatch['blastMatch'][contigName]`` of the query gene.
    When the hit gene is itself a tract, both genes' ``links`` sets are
    updated with each other's uid.

    :param db: strain database handed to ``findStrain``.
    :param path: working directory that contains ``temp/``.
    :param compareTo: strain dict; ``'path'`` and ``'contigs'`` are read here.
    :return: None; the structures reachable from ``db`` and ``compareTo``
        are modified in place.
    """
    outpath = os.path.join(
        path, 'temp/out_' + os.path.basename(compareTo['path']) + ".xml")
    min_evalue = 1e-6
    min_coverage = settings.homologyCutoffTo  #0.5
    min_query_coverage = settings.homologyCutoffFrom  #0.4

    # Create a lookup table from contig names to the number of the contigs
    contigLookup = {
        getFastaName(contig): i
        for i, contig in enumerate(compareTo['contigs'])
    }

    with open(outpath) as outHandle:
        # Crawl across all the hits
        for record in NCBIXML.parse(outHandle):
            matches = []
            for alignment in record.alignments:

                # Find the contig of the hit
                contigNo = contigLookup[alignment.hit_def]
                contigHit = compareTo['contigs'][contigNo]

                for hsp in alignment.hsps:
                    # Stops at the first HSP above the cutoff; the remaining
                    # HSPs of this alignment are not inspected.
                    if hsp.expect > min_evalue:
                        break

                    # find the gene for each hit (midpoint of the subject range)
                    location = (hsp.sbjct_start + hsp.sbjct_end) // 2

                    match = matchInPV(location, contigHit)
                    if match:
                        match['expect'] = hsp.expect
                        match['length'] = hsp.identities
                        match['hsp'] = hsp
                        match['contigNo'] = contigNo
                        match['contigName'] = contigHit['record'].description
                        matches.append(match)
                        continue

                    matchRecords = featurefinder.findMatchingFeatures(
                        contigHit['record'], location, ['CDS', 'rRNA', 'gene'])
                    # store the result
                    if matchRecords:
                        match = {
                            'record': matchRecords[0],
                            'expect': hsp.expect,
                            'length': hsp.identities,
                            'hsp': hsp,
                            'contigNo': contigNo,
                            'contigName': contigHit['record'].description
                        }
                        # Match this with an existing gene record if available
                        annotateMatch(match, contigHit)
                        matches.append(match)

            # Now, if we have matches we need to associate them with the right gene in our records
            if not matches:
                continue

            # But first we want to group any matches that are to the same gene together
            # And then screen them out if the TOTAL amount matched is less than the cutoff
            groupedMatches = []
            lociMatched = {m['locus'] for m in matches}
            for locus in lociMatched:
                locusMatches = [m for m in matches if m['locus'] == locus]
                ourMatch = dict(locusMatches[0])
                if len(locusMatches) > 1:
                    ourMatch['multipleHits'] = sorted(
                        (duplicate['hsp'] for duplicate in locusMatches),
                        key=lambda x: x.query_start)
                    ourMatch['length'] = sum(
                        duplicate['length'] for duplicate in locusMatches)

                # NB: geneLength in bp, length in amino acids, hence *3
                ourMatch['coverage'] = (ourMatch['length'] *
                                        3) / ourMatch['geneLength']
                if ourMatch['coverage'] > min_coverage:
                    groupedMatches.append(ourMatch)

            for groupedMatch in groupedMatches:
                # Get the strain and name out of the record.query
                queryStrainName, queryContigNo, queryTractName = splitQueryName(
                    record.query)
                queryContig = findStrain(
                    db, queryStrainName)['contigs'][int(queryContigNo)]
                # A default is required: without it next() raises
                # StopIteration and the check below can never trigger.
                geneMatch = next((x for x in queryContig['tracts']
                                  if x['name'] == queryTractName), None)
                if not geneMatch:
                    print("Something has gone wrong, gene '" +
                          record.query + "' not found")
                    continue

                groupedMatch['queryCoverage'] = (
                    groupedMatch['length'] *
                    3) / geneMatch['geneLength']
                if groupedMatch['queryCoverage'] < min_query_coverage:
                    continue

                if 'blastMatch' not in geneMatch:
                    geneMatch['blastMatch'] = {}

                # Create a link from the query match to the subject (hit) match
                contigLinkName = groupedMatch['contigName']
                if contigLinkName not in geneMatch['blastMatch']:
                    geneMatch['blastMatch'][contigLinkName] = []
                geneMatch['blastMatch'][contigLinkName].append(
                    groupedMatch)

                # Do we have a link to another PV gene?
                # If so create a bidirectional link
                if 'tractNo' in groupedMatch:
                    otherGene = compareTo['contigs'][groupedMatch[
                        'contigNo']]['tracts'][groupedMatch['tractNo']]
                    geneMatch['links'].add(otherGene['uid'])
                    otherGene['links'].add(geneMatch['uid'])
Esempio n. 39
0
def summarize_blast_output(blast_out=None,
                           blast_file=None,
                           min_identity=None,
                           expect=None):
    """
    Parse NCBI BLAST XML output and convert to a list of simple summary
    objects.  Note that this is very specific to searching the PDB, and
    returns incomplete information (suitable for summarizing in a flat table).

    Exactly one of ``blast_out`` (the XML as a string) and ``blast_file``
    (path to an XML file) must be given.  Only the first record of the
    output is summarised, and only the first HSP of each hit.

    :param blast_out: BLAST XML text, or None.
    :param blast_file: path of a BLAST XML file, or None.
    :param min_identity: if not None, hits whose percent identity is below
        this value are skipped.
    :param expect: accepted for interface compatibility; not used here.
    :return: list of ``blast_hit`` objects, one per retained hit.
    :raises Sorry: when the record contains no alignments.
    """
    assert ([blast_out, blast_file].count(None) == 1)
    from Bio.Blast import NCBIXML
    import iotbx.pdb.fetch
    if (blast_out is not None):
        blast_in = cStringIO.StringIO(blast_out)
    else:
        assert os.path.isfile(blast_file)
        blast_in = open(blast_file)
    try:
        # NCBIXML.parse is lazy: pull the first record before closing.
        # next(it) works on Python 2 and 3, unlike it.next().
        blast = next(NCBIXML.parse(blast_in))
    finally:
        blast_in.close()
    if (len(blast.alignments) == 0):
        raise Sorry("No matching sequences!")
    results = []
    for i_hit, hit in enumerate(blast.alignments):
        pdb_chain_id = str(hit.accession)
        # hit.accession may only have pdb_id, e.g. 1EMG.  Split once so an
        # accession with further underscores cannot break the unpacking.
        if "_" in pdb_chain_id:
            pdb_id, chain_id = pdb_chain_id.split("_", 1)
        else:
            pdb_id = pdb_chain_id
            chain_id = None
        #
        hsp = hit.hsps[0]
        assert (hsp.align_length > 0)
        identity = 100 * hsp.identities / hsp.align_length
        if (min_identity is not None) and (identity < min_identity):
            continue
        # XXX this is really appalling, but the NCBI groups together identical
        # sequences in its BLAST output, so I need to parse the accession code
        # strings to extract the individual PDB IDs
        hit_def_fields = hit.hit_def.split("|")
        n_fields = len(hit_def_fields)
        all_ids = []
        for i_field, field in enumerate(hit_def_fields):
            if (field == "pdb") and (i_field < n_fields - 1):
                next_pdb_id = hit_def_fields[i_field + 1]
                next_chain_id = None
                # The chain field is optional: "pdb|ID" may end the string,
                # in which case index i_field + 2 does not exist.
                if (i_field + 2 < n_fields) and \
                        ("Chain" in hit_def_fields[i_field + 2]):
                    next_chain_id = hit_def_fields[i_field + 2].split()[0]
                if (iotbx.pdb.fetch.looks_like_pdb_id(next_pdb_id)):
                    all_ids.append([next_pdb_id, next_chain_id])
        summary = blast_hit(hit_num=i_hit + 1,
                            pdb_id=pdb_id,
                            chain_id=chain_id,
                            evalue=hsp.expect,
                            length=hsp.align_length,
                            identity=identity,
                            positives=100 * hsp.positives / hsp.align_length,
                            hsp=hsp,
                            all_ids=all_ids)
        results.append(summary)
    return results
Esempio n. 40
0
def search_and_process2(rpsblast, cdd_name, tmp_dir, evalue, translation_id,
                        translation):
    """
    Search one translation against a CDD with rpsblast and gather the hits.

    The translation is written to ``<tmp_dir>/<translation_id>.txt``,
    rpsblast writes XML to ``<tmp_dir>/<translation_id>.xml``, and every HSP
    with an e-value at or below the cutoff contributes two entries to the
    returned list: a formatted ``INSERT_INTO_DOMAIN`` statement followed by
    a dict describing the gene/domain hit.

    :param rpsblast: path to rpsblast binary
    :param cdd_name: CDD database path/name
    :param tmp_dir: path to directory where I/O will take place
    :param evalue: evalue cutoff for rpsblast
    :param translation_id: unique identifier for the translation sequence
    :param translation: protein sequence for gene to query
    :return: list alternating domain INSERT statements and hit dicts
    """
    # Query FASTA and result XML both live in tmp_dir
    query_path = "{}/{}.txt".format(tmp_dir, translation_id)
    xml_path = "{}/{}.xml".format(tmp_dir, translation_id)

    # Write the input file
    with open(query_path, "w") as query_handle:
        query_handle.write(">{}\n{}".format(translation_id, translation))

    # Setup and run the rpsblast command
    run_rpsblast = NcbirpsblastCommandline(cmd=rpsblast,
                                           db=cdd_name,
                                           query=query_path,
                                           out=xml_path,
                                           outfmt=5,
                                           evalue=evalue)
    run_rpsblast()

    # Process results into a single list
    results = []

    with open(xml_path, "r") as xml_handle:
        for record in NCBIXML.parse(xml_handle):
            # Nothing to do for records without alignments
            if not record.alignments:
                continue

            for align in record.alignments:
                for hsp in align.hsps:
                    if not hsp.expect <= evalue:
                        continue

                    # Double quotes would clash with the quoting of the
                    # statement built below.
                    align.hit_def = align.hit_def.replace("\"", "\'")
                    fields = align.hit_def.split(",")
                    field_count = len(fields)

                    # hit_def is "<id>, <name>, <description...>" with the
                    # leading parts optional.
                    domain_id = None
                    name = None
                    if field_count >= 3:
                        domain_id = fields[0].strip()
                        # Name is occasionally longer than permitted in the
                        # database. Truncating avoids a MySQL error.
                        # TODO perhaps the database schema should be
                        # changed to account for this.
                        name = basic.truncate_value(fields[1].strip(), 25,
                                                    "...")
                        description = ",".join(fields[2:]).strip()
                    elif field_count == 2:
                        domain_id = fields[0].strip()
                        description = fields[1].strip()
                    else:
                        description = fields[0].strip()

                    # Try to put domain into domain table
                    domain_statement = INSERT_INTO_DOMAIN.format(
                        align.hit_id, domain_id, name, description)
                    results.append(domain_statement)

                    # Try to put this hit into gene_domain table
                    hit_data = {
                        "Translation": translation,
                        "HitID": align.hit_id,
                        "Expect": float(hsp.expect),
                        "QueryStart": int(hsp.query_start),
                        "QueryEnd": int(hsp.query_end)
                    }
                    results.append(hit_data)

    # NOTE: no gene_domain INSERT or DomainStatus UPDATE statement is
    # produced here; only the domain INSERT and the raw hit dict are.
    return results
def Serotype_BLAST(ProbeID, ProbeSEQ, ProbeScore, ResultTableName, ProbeMFIThreshold):
    """Run a probe through the template BLAST and store one row per aligned position.

    The probe is written to a FASTA file with Save_fasta, BLASTN_v29template
    is called with the XML output path, that FASTA file and the template
    database path, and the resulting XML is parsed.  For each of the first
    1000 HSPs, every character of the aligned query string is inserted into
    the MySQL table ``RESULTS_<ResultTableName>`` together with its position
    (``hsp.sbjct_start`` + offset), the probe id and a per-nucleotide score.

    :param ProbeID: probe identifier; normalised with ``str(int(float(...)))``.
    :param ProbeSEQ: probe sequence handed to Save_fasta.
    :param ProbeScore: numeric probe score; divided by the length of the
        aligned subject string to get the per-nucleotide score.
    :param ResultTableName: suffix of the destination table name.
    :param ProbeMFIThreshold: accepted for interface compatibility; not used.
    :returns: None.  Rows are committed only if every insert succeeded.

    NOTE(review): HOSTlocal, USER, PASS, DB, Save_fasta and BLASTN_v29template
    are expected to be defined at module level - confirm against the full file.
    """
    from Bio.Blast import NCBIXML
    import MySQLdb

    work_dir = "/users/rwbarrettemac/bioinformatics/pythonfolders/FMDanalysisScript/FMDserotypingARRAY/"
    probe_fasta = work_dir + "ProbeBlastSeq.fasta"
    blast_xml = work_dir + "CurrentFMDBlast.xml"
    Template_DB = work_dir + "FMD_Selected_Template/FMD_FinalConsensusDB/FMDFinalConsensusDB"

    PROBEID = str(int(float(ProbeID)))
    MaxAlignCNT = 1000
    AlignCNT = 0

    # Only the values are parameterised: a table name cannot be bound as a
    # parameter, so it is still concatenated.  Any literal '%' in it is
    # doubled because MySQLdb applies %-formatting to the statement.
    insert_sql = ("INSERT INTO RESULTS_" + ResultTableName.replace("%", "%%")
                  + " (Position, Probe_ID, Nuc_Score, Nucleotide)"
                  " VALUES (%s, %s, %s, %s)")

    conn = MySQLdb.connect(host = HOSTlocal,
                           user = USER,
                           passwd = PASS,
                           db = DB)
    try:
        cursor = conn.cursor()

        Save_fasta(probe_fasta, PROBEID, ProbeSEQ)
        BLASTN_v29template(blast_xml, probe_fasta, Template_DB)

        # The context manager closes the XML file even if parsing fails.
        with open(blast_xml, "r") as result_handle:
            for blast_record in NCBIXML.parse(result_handle):
                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        if AlignCNT >= MaxAlignCNT:
                            continue
                        AlignCNT += 1

                        # Score is spread evenly over the aligned subject
                        # string and truncated to 10 characters for storage.
                        NUC_SCORE = str(float(ProbeScore) / len(hsp.sbjct))[0:10]

                        for offset, nucleotide in enumerate(hsp.query):
                            cursor.execute(
                                insert_sql,
                                (str(hsp.sbjct_start + offset), PROBEID,
                                 NUC_SCORE, str(nucleotide)))

        cursor.close()
        conn.commit()
    finally:
        # Always release the connection; closing without commit discards
        # the partial inserts of a failed run.
        conn.close()
Esempio n. 42
0
    
    output.close()
    
    os.system("makeblastdb -in dups_removed.fasta -dbtype prot -out " + temp_dir + "/blast")
    
    output = open("dups_removed.fasta", "a")
    
    blast_cline = NcbiblastpCommandline(db=temp_dir + "/blast", query=temp_dir + "/seq_file", outfmt=5)
    
    blast_result = blast_cline()
    
    xml_file = open(temp_dir + "/xml_file", "w+")
    xml_file.write(blast_result[0])
    xml_file.seek(0, 0)

    blast_iterator = NCBIXML.parse(xml_file)
    
    record = blast_iterator.i()
    alignments = record.alignments[:]

    for alignment in alignments: 
        hsps = alignment.hsps[0]
        alignment_title = alignment.title.split(" ")
        if i.id == alignment_title[1]:
            continue
        
        percent_ident = float(hsps.identities) / float(alignment.length)
        
        if percent_ident > 0.95:
            print("Deleted!\n")
            break
Esempio n. 43
0
def _getTopFromBlast(blastXML,
                     TF,
                     top=0,
                     exContaminSpecies=True,
                     outfile=None,
                     newHeader=True):
    '''
    Parses Blast result XML files and writes the best or all results with less information in a tsv file.

    One tab-separated row is written per HSP of every kept hit, after a
    header row. The last column is the second `|`-separated field of the
    query title, or the whole query title if it contains no `|`.

    :param blastXML: The filename of the Blast output (must be output type 5)
    :param TF: An instance of the TaxFinder class
    :param top: Write only the best `top` hits to file. If `top` is 0 (or negative), all hits are saved.
    :param exContaminSpecies: Shall hits of known contaminated species be excluded?
    :param outfile: The file to write the results to (including path). If it is None, use the basename of `blastXML`
    :param newHeader: Were the Blast results produced with new headers (database from 2016 and newer)?
    :creates: `resulttables/FILE.tsv`
    '''

    contaminantSpecies = {
        118797, 59538, 7213
    }  # Lipotes vexillifer, Pantholops hodgsonii, Ceratitis capitata

    if outfile is None:
        outfile = 'resulttables/{}.tsv'.format(_myGetBasename(blastXML))

    if top < 0:
        top = 0

    with open(blastXML, 'r') as f, open(outfile, 'w') as out:
        records = NCBIXML.parse(f)

        out.write('\t'.join(('Tax-ID', 'Acc', 'Species', 'Rank', 'e-value',
                             'Length', 'Lineage', 'Prot-Name',
                             'Query-Protein')) + '\n')

        for record in records:
            # Same for every row of this record, so resolve it once.
            query_fields = record.query.split('|')
            if len(query_fields) > 1:
                query_protein = query_fields[1]
            else:
                query_protein = record.query

            for i, alignment in enumerate(record.alignments):
                # `i` is 0-based, so hit number `top` is the first one to
                # drop (`i > top` used to let top + 1 hits through).
                if top and i >= top:
                    break

                infos = TF.getInfoFromHitDef(alignment.hit_id,
                                             alignment.hit_def,
                                             newHeader=newHeader)

                for info in infos:
                    if exContaminSpecies and info[
                            'taxid'] in contaminantSpecies:
                        continue

                    lineage = ', '.join(
                        TF.getLineage(info['taxid'], display='name'))

                    for hsp in alignment.hsps:
                        line = '\t'.join(
                            (str(info['taxid']), info['acc'], info['name'],
                             info['rank'], str(hsp.expect),
                             str(hsp.align_length), lineage,
                             info['protname'], query_protein))

                        out.write(line + '\n')
Esempio n. 44
0
    def run_qblast(self, program, database, query, e_value, entrez_filter,
                   additional_args, expected_hits):
        """Run one NCBIWWW.qblast search and check the parsed record.

        :param program: BLAST program name; compared against
            ``record.application.lower()``.
        :param database: database name passed to qblast.
        :param query: a bare sequence, a FASTA record (starting with ">")
            or an identifier.
        :param e_value: passed as ``expect`` and compared with the value
            recorded in the result.
        :param entrez_filter: passed as ``entrez_query``.
        :param additional_args: dict of extra keyword arguments for qblast.
        :param expected_hits: None when no hit is expected; otherwise a
            sequence of identifiers of which at least one must appear in
            both the alignments and the descriptions.
        :raises MissingExternalDependencyError: when qblast raises HTTPError.
        """
        # The former "blastn" and "else" branches made the identical call,
        # so a single call covers every program.
        try:
            handle = NCBIWWW.qblast(program,
                                    database,
                                    query,
                                    alignments=10,
                                    descriptions=10,
                                    hitlist_size=10,
                                    entrez_query=entrez_filter,
                                    expect=e_value,
                                    **additional_args)
        except HTTPError:
            # e.g. a proxy error
            raise MissingExternalDependencyError("internet connection failed")
        record = NCBIXML.read(handle)

        if record.query == "No definition line":
            # We used a sequence as the query
            self.assertEqual(len(query), record.query_letters)
        elif query.startswith(">"):
            # We used a FASTA record as the query
            expected = query[1:].split("\n", 1)[0]
            self.assertEqual(expected, record.query)
        elif record.query_id.startswith("Query_") and len(
                query) == record.query_letters:
            # We used a sequence as the entry and it was given a placeholder name
            pass
        else:
            # We used an identifier as the query
            self.assertIn(
                query, record.query_id.split("|"),
                "Expected %r within query_id %r" % (query, record.query_id))

        # Check the recorded input parameters agree with those requested
        self.assertEqual(float(record.expect), e_value)
        self.assertEqual(record.application.lower(), program)
        self.assertLessEqual(len(record.alignments), 10)
        self.assertLessEqual(len(record.descriptions), 10)

        # Check the expected result(s) are found in the alignments
        if expected_hits is None:
            self.assertEqual(len(record.alignments),
                             0)  # Expected no alignments!
        else:
            self.assertGreater(
                len(record.alignments), 0)  # Expected some alignments!
            found_result = any(
                expected_hit in alignment.hit_id.split("|")
                for expected_hit in expected_hits
                for alignment in record.alignments)
            if len(expected_hits) == 1:
                print("Update this test to have some redundancy...")
                for alignment in record.alignments:
                    print(alignment.hit_id)
            self.assertTrue(
                found_result,
                "Missing all expected hits (%s), instead have: %s" %
                (", ".join(expected_hits), ", ".join(
                    a.hit_id for a in record.alignments)))

        # Check the expected result(s) are found in the descriptions
        if expected_hits is None:
            self.assertEqual(len(record.descriptions),
                             0)  # Expected no descriptions!
        else:
            self.assertGreater(
                len(record.descriptions), 0)  # Expected some descriptions!
            found_result = any(
                expected_hit == descr.accession
                or expected_hit in descr.title.split(None, 1)[0].split("|")
                for expected_hit in expected_hits
                for descr in record.descriptions)
            # assertTrue instead of a bare assert (which -O strips); the
            # message lists every expected hit, not only the last one tried.
            self.assertTrue(
                found_result,
                "Missing all of %s in descriptions" % ", ".join(expected_hits))
Esempio n. 45
0
        get_fasta.close()

        #BLAST output
        blastp_cline = NcbiblastpCommandline(query=filename,
                                             db="nr",
                                             evalue=0.001,
                                             outfmt=5,
                                             out=filename[:-5] + ".xml",
                                             num_alignments=1,
                                             entrez_query="human[Organism]",
                                             remote=True)
        stdout, stderr = blastp_cline()

        #Parse XML output file
        blastp_cline = open(filename[:-5] + ".xml")
        blast_record = NCBIXML.read(blastp_cline)
        if len(blast_record.alignments) > 0:
            for alignment in blast_record.alignments:
                hsp = alignment.hsps[0]
                ident = float(hsp.identities) / hsp.align_length
                accession = alignment.accession
                accession_numbers.append(accession)
                homologies.append(ident)
                time.sleep(1)
        else:
            query_accession_numbers.append("No homologs")
            accession_numbers.append("No homologs")
            homologies.append("No homologs")
            time.sleep(1)
            continue
    else:
Esempio n. 46
0
    if tmp[1][0]<=tmp[0][1]:
        return True
        
    elif tmp[1][0]-tmp[0][1]<19:
        return True
    else:
        return False
    
def cmpToList(mTuple,mList):
    """Return the index of the first entry of mList that overlaps mTuple.

    Overlap is decided by isOverlap(mTuple, entry).  None is returned when
    no entry overlaps, which includes the case of an empty mList.
    """
    return next(
        (position for position, candidate in enumerate(mList)
         if isOverlap(mTuple, candidate)),
        None)
    

blast_results = NCBIXML.parse(open("/home/wenlei/LUO/ZF/blast/bac_results.xml","r"))

for result in blast_results:
    for alignment in result.alignments:
        print alignment.hit_def
        mList = []
        for hsp in alignment.hsps:
            mTuple = (hsp.query_start,hsp.query_end)
            mList.append(mTuple)
        mList.sort()
        print mList
        tmp_list = [mList[0]]
        
        for i in mList:
            mark = cmpToList(i,tmp_list)
            if mark != None:
Esempio n. 47
0
    'xbt006.xml',
]

# Every file picked for detailed checking must also be in the full test list.
for test in detailed_tests:
    assert test in all_tests

### NCBIXML.BlastParser

print "Running tests on NCBIXML.BlastParser"

# Parse each BLAST XML file under the "Blast" directory and print a
# one-line summary per query record.
for test in all_tests:
    print "*" * 50, "TESTING %s" % test
    datafile = os.path.join("Blast", test)
    # NOTE(review): `input` shadows the builtin and is not closed in the
    # visible part of this loop.
    input = open(datafile)

    records = NCBIXML.parse(input)

    for record in records:
        alignments = record.alignments
        if not alignments:
            print '%s - no hits' % record.query_id
            continue
        # The reduce() call sums the HSP counts over all alignments.
        print '%s - %i alignments with a total of %i HSPs' \
              % (record.query_id,
                 len(alignments),
                 reduce(lambda a,b: a+b, [len(a.hsps) for a in alignments]))

        # Only files listed in detailed_tests go past this point.
        if not test in detailed_tests:
            continue

        E_VALUE_THRESH = 10**-10
Esempio n. 48
0
# Use the first record of the FASTA file as the BLAST query.
f_record = next(SeqIO.parse("m_cold.fasta", "fasta"))

print("Doing the BLAST and retrieving the results...")
result_handle = NCBIWWW.qblast("blastn", "nr", f_record.format("fasta"))

# The handle can only be read once, so keep its content in a string and
# release it straight away.
try:
    blast_results = result_handle.read()
finally:
    result_handle.close()

# save the results for later, in case we want to look at it
with open("m_cold_blast.out", "w") as save_file:
    save_file.write(blast_results)

print("Parsing the results and extracting info...")

# option 1 -- open the saved file to parse it
# option 2 -- create a handle from the string and parse it
string_result_handle = StringIO(blast_results)
b_record = NCBIXML.read(string_result_handle)

# now get the alignment info for all e values below some threshold
E_VALUE_THRESH = 0.1

for alignment in b_record.alignments:
    for hsp in alignment.hsps:
        if hsp.expect < E_VALUE_THRESH:
            print("****Alignment****")
            print("sequence: %s" % alignment.title)
            print("length: %i" % alignment.length)
            # %g keeps very small e-values readable; %f printed them
            # all as 0.000000.
            print("e value: %g" % hsp.expect)
            print(hsp.query[0:75] + "...")
            print(hsp.match[0:75] + "...")
            print(hsp.sbjct[0:75] + "...")
Esempio n. 49
0
    def out():
        file = open("lines.txt", "w")

        if aligns == "0":
            if chos == 'BLASTn':

                record = SeqIO.read(str(e1.get()), "fasta")
                ser = record.seq
                print(ser)
                if db == 'Nucleotide collection(DNA)':
                    result_handle = NCBIWWW.qblast("blastn", "nt", ser)
                elif db == 'NCBI Transcript Ref_Seq(DNA)':
                    result_handle = NCBIWWW.qblast("blastn", "refseq_rna", ser)
                elif db == 'PDB nucleotide database(DNA)':
                    result_handle = NCBIWWW.qblast("blastn", "pdbnt", ser)
                elif db == 'Non-redundant(Protein)':
                    err()
                elif db == 'NCBI Protein Ref_Seq(Protein)':
                    err()
                elif db == 'Non-redundant UniProtKB/SwissProt(Protein)':
                    err()
                elif db == 'Expressed Sequences tags(DNA)':
                    result_handle = NCBIWWW.qblast("blastn", "est", ser)
                elif db == 'Expressed Sequences tags(DNA)':
                    result_handle = NCBIWWW.qblast("blastn", "est", ser)
                elif db == 'RefSeq Representative Genome Database(DNA)':
                    result_handle = NCBIWWW.qblast(
                        "blastn", "refseq_representative_genomes", ser)

                blast_record = NCBIXML.read(result_handle)
                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        if 1 == 1:
                            print("****Alignment****")
                            print("sequence:", alignment.title)
                            print("length:", alignment.length)
                            print("e value:", hsp.expect)
                            print(hsp.query[0:75] + "...")
                            print(hsp.sbjct[0:75] + "...")
                            view = "sequence:%s\nlength:%s\ne value:%s\n%s\n%s\n%s\n\n" % (
                                alignment.title, alignment.length, hsp.expect,
                                hsp.query[0:100] + "...", hsp.match[0:100] +
                                "...", hsp.sbjct[0:100] + "...")
                            T.insert(END, view)
                            file.write(view)

                filename = e2.get()
                if not os.path.exists(os.path.dirname(filename)):
                    try:
                        os.makedirs(os.path.dirname(filename))
                    except OSError as exc:  # Guard against race condition
                        if exc.errno != errno.EEXIST:
                            raise

                file = open(filename, "w")
                reads = open("lines.txt")
                for i, line in enumerate(reads):
                    result = "%s" % line
                    file.write(result)
                reads.close()
                file.close()

            elif chos == 'BLASTp':

                record = SeqIO.read(str(e1.get()), "fasta")
                ser = record.seq
                print(ser)
                if db == 'Nucleotide collection(DNA)':
                    result_handle = NCBIWWW.qblast("blastp", "nt", ser)
                elif db == 'NCBI Transcript Ref_Seq(DNA)':
                    result_handle = NCBIWWW.qblast("blastp", "refseq_rna", ser)
                elif db == 'PDB nucleotide database(DNA)':
                    result_handle = NCBIWWW.qblast("blastp", "pdbnt", ser)
                elif db == 'Non-redundant(Protein)':
                    result_handle = NCBIWWW.qblast("blastp", "nr", ser)
                elif db == 'NCBI Protein Ref_Seq(Protein)':
                    result_handle = NCBIWWW.qblast("blastp", "refseq_protein",
                                                   ser)
                elif db == 'Non-redundant UniProtKB/SwissProt(Protein)':
                    result_handle = NCBIWWW.qblast("blastp", "swissprot", ser)
                elif db == 'Expressed Sequences tags(DNA)':
                    result_handle = NCBIWWW.qblast("blastp", "est", ser)
                elif db == 'Expressed Sequences tags(DNA)':
                    result_handle = NCBIWWW.qblast("blastp", "est", ser)
                elif db == 'RefSeq Representative Genome Database(DNA)':
                    result_handle = NCBIWWW.qblast(
                        "blastp", "refseq_representative_genomes", ser)
                blast_record = NCBIXML.read(result_handle)

                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        if 1 == 1:
                            print("****Alignment****")
                            print("sequence:", alignment.title)
                            print("length:", alignment.length)
                            print("e value:", hsp.expect)
                            print(hsp.query[0:75] + "...")
                            print(hsp.sbjct[0:75] + "...")
                            view = "sequence:%s\nlength:%s\ne value:%s\n%s\n%s\n%s\n\n" % (
                                alignment.title, alignment.length, hsp.expect,
                                hsp.query[0:100] + "...",
                                hsp.match[0:len(hsp.query)] + "...",
                                hsp.sbjct[0:100] + "...")
                            T.insert(END, view)
                            file.write(view)

                filename = e2.get()
                if not os.path.exists(os.path.dirname(filename)):
                    try:
                        os.makedirs(os.path.dirname(filename))
                    except OSError as exc:  # Guard against race condition
                        if exc.errno != errno.EEXIST:
                            raise

                file = open(filename, "w")
                reads = open("lines.txt")
                for i, line in enumerate(reads):
                    result = "%s" % line
                    file.write(result)
                reads.close()
                file.close()

            elif chos == 'tBLASTn':

                record = SeqIO.read(str(e1.get()), "fasta")
                ser = record.seq
                print(ser)
                if db == 'Nucleotide collection(DNA)':
                    result_handle = NCBIWWW.qblast("tblastn", "nt", ser)
                elif db == 'NCBI Transcript Ref_Seq(DNA)':
                    result_handle = NCBIWWW.qblast("tblastn", "refseq_rna",
                                                   ser)
                elif db == 'PDB nucleotide database(DNA)':
                    result_handle = NCBIWWW.qblast("tblastn", "pdbnt", ser)
                elif db == 'Non-redundant(Protein)':
                    result_handle = NCBIWWW.qblast("tblastn", "nr", ser)
                elif db == 'NCBI Protein Ref_Seq(Protein)':
                    result_handle = NCBIWWW.qblast("tblastn", "refseq_protein",
                                                   ser)
                elif db == 'Non-redundant UniProtKB/SwissProt(Protein)':
                    result_handle = NCBIWWW.qblast("tblastn", "swissprot", ser)
                elif db == 'Expressed Sequences tags(DNA)':
                    result_handle = NCBIWWW.qblast("tblastn", "est", ser)
                elif db == 'Expressed Sequences tags(DNA)':
                    result_handle = NCBIWWW.qblast("tblastn", "est", ser)
                elif db == 'RefSeq Representative Genome Database(DNA)':
                    result_handle = NCBIWWW.qblast(
                        "tblastn", "refseq_representative_genomes", ser)
                with open("Blast Result.xml", "w") as out_handle:
                    out_handle.write(result_handle.read())
                result_handle.close()
                blast_qresult = SearchIO.read('Blast Result.xml', 'blast-xml')

                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        if 1 == 1:
                            print("****Alignment****")
                            print("sequence:", alignment.title)
                            print("length:", alignment.length)
                            print("e value:", hsp.expect)
                            print(hsp.query[0:75] + "...")
                            print(hsp.sbjct[0:75] + "...")
                            view = "sequence:%s\nlength:%s\ne value:%s\n%s\n%s\n%s\n\n" % (
                                alignment.title, alignment.length, hsp.expect,
                                hsp.query[0:100] + "...",
                                hsp.match[0:len(hsp.query)] + "...",
                                hsp.sbjct[0:100] + "...")
                            T.insert(END, view)
                            file.write(view)

                filename = e2.get()
                if not os.path.exists(os.path.dirname(filename)):
                    try:
                        os.makedirs(os.path.dirname(filename))
                    except OSError as exc:  # Guard against race condition
                        if exc.errno != errno.EEXIST:
                            raise

                file = open(filename, "w")
                reads = open("lines.txt")
                for i, line in enumerate(reads):
                    result = "%s" % line
                    file.write(result)
                reads.close()
                file.close()

            elif chos == 'BLASTx':

                record = SeqIO.read(str(e1.get()), "fasta")
                ser = record.seq
                print(ser)
                if db == 'Nucleotide collection(DNA)':
                    result_handle = NCBIWWW.qblast("blastx", "nt", ser)
                elif db == 'NCBI Transcript Ref_Seq(DNA)':
                    result_handle = NCBIWWW.qblast("blastx", "refseq_rna", ser)
                elif db == 'PDB nucleotide database(DNA)':
                    result_handle = NCBIWWW.qblast("blastx", "pdbnt", ser)
                elif db == 'Non-redundant(Protein)':
                    result_handle = NCBIWWW.qblast("blastx", "nr", ser)
                elif db == 'NCBI Protein Ref_Seq(Protein)':
                    result_handle = NCBIWWW.qblast("blastx", "refseq_protein",
                                                   ser)
                elif db == 'Non-redundant UniProtKB/SwissProt(Protein)':
                    result_handle = NCBIWWW.qblast("blastx", "swissprot", ser)
                elif db == 'Expressed Sequences tags(DNA)':
                    result_handle = NCBIWWW.qblast("blastx", "est", ser)
                elif db == 'Expressed Sequences tags(DNA)':
                    result_handle = NCBIWWW.qblast("blastx", "est", ser)
                elif db == 'RefSeq Representative Genome Database(DNA)':
                    result_handle = NCBIWWW.qblast(
                        "blastx", "refseq_representative_genomes", ser)
                blast_record = NCBIXML.read(result_handle)

                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        if 1 == 1:
                            print("****Alignment****")
                            print("sequence:", alignment.title)
                            print("length:", alignment.length)
                            print("e value:", hsp.expect)
                            print(hsp.query[0:75] + "...")
                            print(hsp.sbjct[0:75] + "...")
                            view = "sequence:%s\nlength:%s\ne value:%s\n%s\n%s\n%s\n\n" % (
                                alignment.title, alignment.length, hsp.expect,
                                hsp.query[0:100] + "...",
                                hsp.match[0:len(hsp.query)] + "...",
                                hsp.sbjct[0:100] + "...")
                            T.insert(END, view)
                            file.write(view)

                filename = e2.get()
                if not os.path.exists(os.path.dirname(filename)):
                    try:
                        os.makedirs(os.path.dirname(filename))
                    except OSError as exc:  # Guard against race condition
                        if exc.errno != errno.EEXIST:
                            raise

                file = open(filename, "w")
                reads = open("lines.txt")
                for i, line in enumerate(reads):
                    result = "%s" % line
                    file.write(result)
                reads.close()
                file.close()

            elif chos == 'tBLASTx':

                record = SeqIO.read(str(e1.get()), "fasta")
                ser = record.seq
                print(ser)
                if db == 'Nucleotide collection(DNA)':
                    result_handle = NCBIWWW.qblast("tblastx", "nt", ser)
                elif db == 'NCBI Transcript Ref_Seq(DNA)':
                    result_handle = NCBIWWW.qblast("tblastx", "refseq_rna",
                                                   ser)
                elif db == 'PDB nucleotide database(DNA)':
                    result_handle = NCBIWWW.qblast("tblastx", "pdbnt", ser)
                elif db == 'Non-redundant(Protein)':
                    result_handle = NCBIWWW.qblast("tblastx", "nr", ser)
                elif db == 'NCBI Protein Ref_Seq(Protein)':
                    result_handle = NCBIWWW.qblast("tblastx", "refseq_protein",
                                                   ser)
                elif db == 'Non-redundant UniProtKB/SwissProt(Protein)':
                    result_handle = NCBIWWW.qblast("tblastx", "swissprot", ser)
                elif db == 'Expressed Sequences tags(DNA)':
                    result_handle = NCBIWWW.qblast("tblastx", "est", ser)
                elif db == 'Expressed Sequences tags(DNA)':
                    result_handle = NCBIWWW.qblast("tblastx", "est", ser)
                elif db == 'RefSeq Representative Genome Database(DNA)':
                    result_handle = NCBIWWW.qblast(
                        "tblastx", "refseq_representative_genomes", ser)
                blast_record = NCBIXML.read(result_handle)

                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        if 1 == 1:
                            print("****Alignment****")
                            print("sequence:", alignment.title)
                            print("length:", alignment.length)
                            print("e value:", hsp.expect)
                            print(hsp.query[0:75] + "...")
                            print(hsp.sbjct[0:75] + "...")
                            view = "sequence:%s\nlength:%s\ne value:%s\n%s\n%s\n%s\n\n" % (
                                alignment.title, alignment.length, hsp.expect,
                                hsp.query[0:100] + "...", hsp.match[0:100] +
                                "...", hsp.sbjct[0:100] + "...")
                            T.insert(END, view)
                            file.write(view)

                filename = e2.get()
                if not os.path.exists(os.path.dirname(filename)):
                    try:
                        os.makedirs(os.path.dirname(filename))
                    except OSError as exc:  # Guard against race condition
                        if exc.errno != errno.EEXIST:
                            raise

                file = open(filename, "w")
                reads = open("lines.txt")
                for i, line in enumerate(reads):
                    result = "%s" % line
                    file.write(result)
                reads.close()
                file.close()

        elif aligns == "1":
            if chos == 'BLASTn':

                record = SeqIO.read(str(e1.get()), "fasta")
                ser = record.seq
                print(ser)
                if db == 'Nucleotide collection(DNA)':
                    result_handle = NCBIWWW.qblast("blastn", "nt", ser)
                elif db == 'NCBI Transcript Ref_Seq(DNA)':
                    result_handle = NCBIWWW.qblast("blastn", "refseq_rna", ser)
                elif db == 'PDB nucleotide database(DNA)':
                    result_handle = NCBIWWW.qblast("blastn", "pdbnt", ser)
                elif db == 'Non-redundant(Protein)':
                    result_handle = NCBIWWW.qblast("blastn", "nr", ser)
                elif db == 'NCBI Protein Ref_Seq(Protein)':
                    result_handle = NCBIWWW.qblast("blastn", "refseq_protein",
                                                   ser)
                elif db == 'Non-redundant UniProtKB/SwissProt(Protein)':
                    result_handle = NCBIWWW.qblast("blastn", "swissprot", ser)
                elif db == 'Expressed Sequences tags(DNA)':
                    result_handle = NCBIWWW.qblast("blastn", "est", ser)
                elif db == 'Expressed Sequences tags(DNA)':
                    result_handle = NCBIWWW.qblast("blastn", "est", ser)
                elif db == 'RefSeq Representative Genome Database(DNA)':
                    result_handle = NCBIWWW.qblast(
                        "blastn", "refseq_representative_genomes", ser)

                blast_record = NCBIXML.read(result_handle)

                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        if 1 == 1:
                            print("****Alignment****")
                            print("sequence:", alignment.title)
                            print("length:", alignment.length)
                            print("e value:", hsp.expect)
                            print(hsp.query[0:75] + "...")
                            print(hsp.sbjct[0:75] + "...")
                            view = "sequence:%s\nlength:%s\ne value:%s\n%s\n%s\n%s\n\n" % (
                                alignment.title, alignment.length, hsp.expect,
                                hsp.query[0:100] + "...", hsp.match[0:100] +
                                "...", hsp.sbjct[0:100] + "...")
                            file.write(view)

                filename = e2.get()
                if not os.path.exists(os.path.dirname(filename)):
                    try:
                        os.makedirs(os.path.dirname(filename))
                    except OSError as exc:  # Guard against race condition
                        if exc.errno != errno.EEXIST:
                            raise

                file = open(filename, "w")
                reads = open("lines.txt")
                for i, line in enumerate(reads):
                    result = "%s" % line
                    file.write(result)
                reads.close()
                file.close()

            elif chos == 'BLASTp':

                record = SeqIO.read(str(e1.get()), "fasta")
                ser = record.seq
                print(ser)
                if db == 'Nucleotide collection(DNA)':
                    result_handle = NCBIWWW.qblast("blastp", "nt", ser)
                elif db == 'NCBI Transcript Ref_Seq(DNA)':
                    result_handle = NCBIWWW.qblast("blastp", "refseq_rna", ser)
                elif db == 'PDB nucleotide database(DNA)':
                    result_handle = NCBIWWW.qblast("blastp", "pdbnt", ser)
                elif db == 'Non-redundant(Protein)':
                    result_handle = NCBIWWW.qblast("blastp", "nr", ser)
                elif db == 'NCBI Protein Ref_Seq(Protein)':
                    result_handle = NCBIWWW.qblast("blastp", "refseq_protein",
                                                   ser)
                elif db == 'Non-redundant UniProtKB/SwissProt(Protein)':
                    result_handle = NCBIWWW.qblast("blastp", "swissprot", ser)
                elif db == 'Expressed Sequences tags(DNA)':
                    result_handle = NCBIWWW.qblast("blastp", "est", ser)
                elif db == 'Expressed Sequences tags(DNA)':
                    result_handle = NCBIWWW.qblast("blastp", "est", ser)
                elif db == 'RefSeq Representative Genome Database(DNA)':
                    result_handle = NCBIWWW.qblast(
                        "blastp", "refseq_representative_genomes", ser)

                blast_record = NCBIXML.read(result_handle)

                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        if 1 == 1:
                            print("****Alignment****")
                            print("sequence:", alignment.title)
                            print("length:", alignment.length)
                            print("e value:", hsp.expect)
                            print(hsp.query[0:75] + "...")
                            print(hsp.sbjct[0:75] + "...")
                            view = "sequence:%s\nlength:%s\ne value:%s\n%s\n%s\n%s\n\n" % (
                                alignment.title, alignment.length, hsp.expect,
                                hsp.query[0:150] + "...", hsp.match[0:100] +
                                "...", hsp.sbjct[0:150] + "...")
                            T.insert(END, view)
                            file.write(view)

                filename = e2.get()
                if not os.path.exists(os.path.dirname(filename)):
                    try:
                        os.makedirs(os.path.dirname(filename))
                    except OSError as exc:  # Guard against race condition
                        if exc.errno != errno.EEXIST:
                            raise

                file = open(filename, "w")
                reads = open("lines.txt")
                for i, line in enumerate(reads):
                    result = "%s" % line
                    file.write(result)
                reads.close()
                file.close()

            elif chos == 'tBLASTn':

                record = SeqIO.read(str(e1.get()), "fasta")
                ser = record.seq
                print(ser)
                if db == 'Nucleotide collection(DNA)':
                    result_handle = NCBIWWW.qblast("tblastn", "nt", ser)
                elif db == 'NCBI Transcript Ref_Seq(DNA)':
                    result_handle = NCBIWWW.qblast("tblastn", "refseq_rna",
                                                   ser)
                elif db == 'PDB nucleotide database(DNA)':
                    result_handle = NCBIWWW.qblast("tblastn", "pdbnt", ser)
                elif db == 'Non-redundant(Protein)':
                    result_handle = NCBIWWW.qblast("tblastn", "nr", ser)
                elif db == 'NCBI Protein Ref_Seq(Protein)':
                    result_handle = NCBIWWW.qblast("tblastn", "refseq_protein",
                                                   ser)
                elif db == 'Non-redundant UniProtKB/SwissProt(Protein)':
                    result_handle = NCBIWWW.qblast("tblastn", "swissprot", ser)
                elif db == 'Expressed Sequences tags(DNA)':
                    result_handle = NCBIWWW.qblast("tblastn", "est", ser)
                elif db == 'Expressed Sequences tags(DNA)':
                    result_handle = NCBIWWW.qblast("tblastn", "est", ser)
                elif db == 'RefSeq Representative Genome Database(DNA)':
                    result_handle = NCBIWWW.qblast(
                        "tblastn", "refseq_representative_genomes", ser)
                blast_record = NCBIXML.read(result_handle)

                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        if 1 == 1:
                            print("****Alignment****")
                            print("sequence:", alignment.title)
                            print("length:", alignment.length)
                            print("e value:", hsp.expect)
                            print(hsp.query[0:75] + "...")
                            print(hsp.sbjct[0:75] + "...")
                            view = "sequence:%s\nlength:%s\ne value:%s\n%s\n%s\n%s\n\n" % (
                                alignment.title, alignment.length, hsp.expect,
                                hsp.query[0:150] + "...", hsp.match[0:150] +
                                "...", hsp.sbjct[0:150] + "...")
                            T.insert(END, view)
                            file.write(view)

                filename = e2.get()
                if not os.path.exists(os.path.dirname(filename)):
                    try:
                        os.makedirs(os.path.dirname(filename))
                    except OSError as exc:  # Guard against race condition
                        if exc.errno != errno.EEXIST:
                            raise

                file = open(filename, "w")
                reads = open("lines.txt")
                for i, line in enumerate(reads):
                    result = "%s" % line
                    file.write(result)
                reads.close()
                file.close()

            elif chos == 'BLASTx':

                record = SeqIO.read(str(e1.get()), "fasta")
                ser = record.seq
                print(ser)
                if db == 'Nucleotide collection(DNA)':
                    result_handle = NCBIWWW.qblast("blastx", "nt", ser)
                elif db == 'NCBI Transcript Ref_Seq(DNA)':
                    result_handle = NCBIWWW.qblast("blastx", "refseq_rna", ser)
                elif db == 'PDB nucleotide database(DNA)':
                    result_handle = NCBIWWW.qblast("blastx", "pdbnt", ser)
                elif db == 'Non-redundant(Protein)':
                    result_handle = NCBIWWW.qblast("blastx", "nr", ser)
                elif db == 'NCBI Protein Ref_Seq(Protein)':
                    result_handle = NCBIWWW.qblast("blastx", "refseq_protein",
                                                   ser)
                elif db == 'Non-redundant UniProtKB/SwissProt(Protein)':
                    result_handle = NCBIWWW.qblast("blastx", "swissprot", ser)
                elif db == 'Expressed Sequences tags(DNA)':
                    result_handle = NCBIWWW.qblast("blastx", "est", ser)
                elif db == 'Expressed Sequences tags(DNA)':
                    result_handle = NCBIWWW.qblast("blastx", "est", ser)
                elif db == 'RefSeq Representative Genome Database(DNA)':
                    result_handle = NCBIWWW.qblast(
                        "blastx", "refseq_representative_genomes", ser)
                blast_record = NCBIXML.read(result_handle)

                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        if 1 == 1:
                            print("****Alignment****")
                            print("sequence:", alignment.title)
                            print("length:", alignment.length)
                            print("e value:", hsp.expect)
                            print(hsp.query[0:75] + "...")
                            print(hsp.sbjct[0:75] + "...")
                            view = "sequence:%s\nlength:%s\ne value:%s\n%s\n%s\n%s\n\n" % (
                                alignment.title, alignment.length, hsp.expect,
                                hsp.query[0:150] + "...", hsp.match[0:100] +
                                "...", hsp.sbjct[0:150] + "...")
                            T.insert(END, view)
                            file.write(view)

                filename = e2.get()
                if not os.path.exists(os.path.dirname(filename)):
                    try:
                        os.makedirs(os.path.dirname(filename))
                    except OSError as exc:  # Guard against race condition
                        if exc.errno != errno.EEXIST:
                            raise

                file = open(filename, "w")
                reads = open("lines.txt")
                for i, line in enumerate(reads):
                    result = "%s" % line
                    file.write(result)
                reads.close()
                file.close()

            elif chos == 'tBLASTx':
                record = SeqIO.read(str(e1.get()), "fasta")
                ser = record.seq
                print(ser)
                if db == 'Nucleotide collection(DNA)':
                    result_handle = NCBIWWW.qblast("tblastx", "nt", ser)
                elif db == 'NCBI Transcript Ref_Seq(DNA)':
                    result_handle = NCBIWWW.qblast("tblastx", "refseq_rna",
                                                   ser)
                elif db == 'PDB nucleotide database(DNA)':
                    result_handle = NCBIWWW.qblast("tblastx", "pdbnt", ser)
                elif db == 'Non-redundant(Protein)':
                    result_handle = NCBIWWW.qblast("tblastx", "nr", ser)
                elif db == 'NCBI Protein Ref_Seq(Protein)':
                    result_handle = NCBIWWW.qblast("tblastx", "refseq_protein",
                                                   ser)
                elif db == 'Non-redundant UniProtKB/SwissProt(Protein)':
                    result_handle = NCBIWWW.qblast("tblastx", "swissprot", ser)
                elif db == 'Expressed Sequences tags(DNA)':
                    result_handle = NCBIWWW.qblast("tblastx", "est", ser)
                elif db == 'Expressed Sequences tags(DNA)':
                    result_handle = NCBIWWW.qblast("tblastx", "est", ser)
                elif db == 'RefSeq Representative Genome Database(DNA)':
                    result_handle = NCBIWWW.qblast(
                        "tblastx", "refseq_representative_genomes", ser)
                blast_record = NCBIXML.read(result_handle)

                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        if 1 == 1:
                            print("****Alignment****")
                            print("sequence:", alignment.title)
                            print("length:", alignment.length)
                            print("e value:", hsp.expect)
                            print(hsp.query[0:75] + "...")
                            print(hsp.sbjct[0:75] + "...")
                            view = "sequence:%s\nlength:%s\ne value:%s\n%s\n%s\n%s\n\n" % (
                                alignment.title, alignment.length, hsp.expect,
                                hsp.query[0:150] + "...", hsp.match[0:100] +
                                "...", hsp.sbjct[0:150] + "...")
                            T.insert(END, view)
                            file.write(view)

                filename = e2.get()
                if not os.path.exists(os.path.dirname(filename)):
                    try:
                        os.makedirs(os.path.dirname(filename))
                    except OSError as exc:  # Guard against race condition
                        if exc.errno != errno.EEXIST:
                            raise

                file = open(filename, "w")
                reads = open("lines.txt")
                for i, line in enumerate(reads):
                    result = "%s" % line
                    file.write(result)
                reads.close()
                file.close()

        root = Tk()
        S = Scrollbar(root)
        dis = open("lines.txt", "r").read()
        T = Text(root, height=50, width=500)
        S.pack(side=RIGHT, fill=Y)
        T.pack(side=LEFT, fill=Y)
        S.config(command=T.yview)
        S.config(command=T.xview)
        T.config(yscrollcommand=S.set)
        T.config(xscrollcommand=S.set)
        T.insert(END, dis)
        mainloop()
Esempio n. 50
0
    def getBlastHits(self):
        """Count strong BLAST hits for the handle sequence and store them in ``self.blastHits``.

        With ``local`` set (the hard-coded default) the sequence in
        ``self.sequence`` is written to ``tmp.fa`` in the working directory and
        searched with a local ``blastn`` against the nt database at ``localdb``.
        Otherwise the sequence is submitted to NCBI through ``NCBIWWW.qblast``.

        An HSP counts as a hit when both its percent identity
        (identities / alignment length) and its query coverage
        (alignment length / query length) are at least 90.

        Side effects: replaces ``sys.stdout`` with an ``Unbuffered`` wrapper,
        leaves ``tmp.fa`` on disk, prints the blastn command line, and sets
        ``self.blastHits``.
        """
        from Bio.Blast import NCBIWWW
        from Bio.Blast import NCBIXML
        from cStringIO import StringIO
        import sys
        # NOTE(review): this wraps stdout again on every call, so repeated calls
        # nest wrappers; `Unbuffered` is defined outside this method - confirm
        # whether a one-time wrap at program start was intended.
        sys.stdout = Unbuffered(sys.stdout)

        local = True
        if local:
            from Bio.Blast.Applications import NcbiblastnCommandline

            # Machine-specific location of the local nt BLAST database.
            localdb = '/Users/erikborgstrom/localBioInfo/BLASTnt/nt'

            # blastn reads its query from a file, so dump the sequence as FASTA.
            with open('tmp.fa', 'w') as infile:
                infile.write('>tmp\n' + self.sequence + '\n')

            # The previous revision also built "strict"/"sloppy" command lines,
            # but they were always overwritten by this one before being run;
            # only the command line that was actually executed is kept.
            cline = NcbiblastnCommandline(
                cmd='blastn',
                outfmt=5,  # XML, as required by NCBIXML below
                query=infile.name,
                db=localdb,
                gapopen=5,
                gapextend=2,
                culling_limit=2)
            print(str(cline))

            # Calling the command line object runs blastn and returns
            # (stdout, stderr); the XML report is on stdout.
            blast_stdout, _blast_stderr = cline()
            blast_handle = StringIO(blast_stdout)
        else:
            sys.stdout.write('getting blast hits for handle#' + str(self.id) +
                             '\n')
            result_handle = NCBIWWW.qblast("blastn",
                                           "nr",
                                           '>tmp\n' + self.sequence,
                                           format_type='XML')
            sys.stdout.write('start parsing blast for handle#' + str(self.id) +
                             '\n')
            blast_handle = StringIO(result_handle.read())

        hits = 0
        for blast_record in NCBIXML.parse(blast_handle):
            for alignment in blast_record.alignments:
                for hsp in alignment.hsps:
                    perc_identity = float(hsp.identities) / float(
                        hsp.align_length) * 100
                    perc_coverage = float(hsp.align_length) / float(
                        blast_record.query_letters) * 100
                    if perc_identity >= 90 and perc_coverage >= 90:
                        hits += 1
        self.blastHits = hits
Esempio n. 51
0
def main(argv):
    """Write the query sequences that have no BLAST hit in the subject file.

    Command-line options (``argv`` excludes the program name):
        -q/--query FILE    FASTA file with the sequences to test (required)
        -s/--subject FILE  FASTA file to search against (required)
        -o/--output FILE   path of the FASTA file to write (required)
        -x                 do not (re)build the BLAST database for the subject
        -m/--mute          suppress the informational messages
        -h/--help          print usage and exit

    The query is split into chunks of 10000 records under ``chunks/``, each
    chunk is searched with ``blastn`` against the subject database, and every
    query with at least one alignment longer than 100 is treated as a hit.
    All remaining query records are written to the output file.

    Exits with status 2 when an option is invalid or a required option is
    missing.
    """
    query = ''
    subject = ''
    output = ''
    build_DB = True
    verbal = True
    usage = ('seq_uniq_seek.py -q <queryfile>.fasta -s <subjectfile>.fasta '
             '-o <outputfile> [-x (do not build database)] [-m (mute)]')

    try:
        opts, _args = getopt.getopt(
            argv, "xmhq:s:o:",
            ["subject=", "query=", "output=", "mute", "help"])
    except getopt.GetoptError as err:
        print(err)
        print(usage)
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(usage)
            sys.exit()
        elif opt == '-x':
            build_DB = False
        elif opt in ("-q", "--query"):
            query = arg
        elif opt in ("-m", "--mute"):
            verbal = False
        elif opt in ("-s", "--subject"):
            subject = arg
        elif opt in ("-o", "--output"):
            output = arg

    # Check the required values themselves rather than counting options, so a
    # repeated or unrelated flag cannot stand in for a missing one.
    if not (query and subject and output):
        print(usage)
        sys.exit(2)

    if verbal:
        print(
            "\n ---- ==== SEEK UNIQ SEQ ==== ---- \nFinding sequences occuring in "
            + query + " that are not occuring in " + subject +
            " and saving in " + output + "\n")

    if build_DB:
        if verbal:
            print("Building blast database for subject file (" + subject + ")")
        makedb = NcbimakeblastdbCommandline(cmd='makeblastdb',
                                            input_file=subject,
                                            dbtype='nucl',
                                            parse_seqids=True)
        makedb()
        if verbal:
            print("Done.\n")
    else:
        if verbal:
            print("Not building database. Hoping for the best")

    if verbal:
        print("Blasting query (" + query + ") against subject database (" +
              subject + ")")
        print("Splitting query into multiple files to save memory.")

    # Start from an empty chunk directory so stale chunks are never blasted.
    shutil.rmtree("chunks", ignore_errors=True)
    os.mkdir("chunks")

    with open(query) as query_handle:
        record_iter = SeqIO.parse(query_handle, "fasta")
        for i, batch in enumerate(batch_iterator(record_iter, 10000)):
            filename = "chunks/chunk_%i.fasta" % (i + 1)
            with open(filename, "w") as handle:
                SeqIO.write(batch, handle, "fasta")

    if verbal:
        print("Building query index dictionary")
    q_dict = SeqIO.index(query, "fasta")
    hits = set()

    chunks = glob.glob('chunks/chunk*')
    for i, chunk_path in enumerate(chunks):
        now = datetime.now()
        dt_string = now.strftime("%d-%m_%H:%M:%S")
        print("[xenoseq_blast   " + dt_string +
              "] So anyway... I'm busy blasting... " +
              str(round(i / len(chunks) * 100, 2)) + "%")

        blastn_cline = NcbiblastnCommandline(cmd='blastn',
                                             query=chunk_path,
                                             db=subject,
                                             num_threads=8,
                                             evalue=1e-5,
                                             perc_identity=90,
                                             outfmt=5,
                                             out="reads_all_vs_all.xml")

        blastn_cline()

        # Bit below is from: https://biopython.org/wiki/Retrieve_nonmatching_blast_queries
        # The handle is closed before the file is removed (an open handle
        # blocks deletion on some platforms).
        with open("reads_all_vs_all.xml") as xml_handle:
            for record in NCBIXML.parse(xml_handle):
                if any(alignment.length > 100
                       for alignment in record.alignments):
                    hits.add(record.query.split()[0])
        os.remove("reads_all_vs_all.xml")

    shutil.rmtree("chunks")

    if verbal:
        print("Subtracting hits from query dict keys")
    misses = set(q_dict.keys()) - hits
    # Generator: records are loaded from the index one at a time while writing.
    orphans = (q_dict[name] for name in misses)
    if verbal:
        print("%i out of %i records in query are unique" %
              (len(misses), len(q_dict)))
        print("Writing to file %s" % (output))
    SeqIO.write(orphans, output, 'fasta')
    if verbal:
        print("Done. Hoping for the best.\n")
Esempio n. 52
0
import sys

from Bio.Blast import NCBIXML
if len(sys.argv) > 1:
    blast = NCBIXML.parse(open(sys.argv[1], 'rU'))
else:
    blast = NCBIXML.parse(sys.stdin)

for record in blast:
    for align in record.alignments:
        if (align.hsps[0].frame[0] >= 0) and (align.hsps[0].frame[1] >= 0):
            print record.query, "\t", align.title, "\t", align.hsps[0].expect
            break
Esempio n. 53
0
 def runMetrics(self):
     """Summarise two BLAST XML reports and write per-category ID tables.

     First pass (``self.blast_output_rf_as``): every record is scored with
     ``self.identifiedMetrics``. Records flagged as contiguous are listed in
     ``contiguity.txt``; records with 1, 2, 3, 4 or >= 5 fragments are listed
     in ``fragmentation_1.txt`` ... ``fragmentation_5.txt``.

     Second pass (``self.blast_output_as_rf``): every record is scored with
     ``self.chimerismNonMatch`` and listed in ``chimerism.txt`` or
     ``non_match.txt``.

     Each table row holds an ID followed by the first three values of
     ``self.isoforms[ID]`` (written under the header
     ID / Length / FPKM / Expected counts).

     Returns a pair of tuples:
         (identified, hits_ref_sum, completeness, cov_sum, contiguity,
          cont_sum, fragmented, frag_sum, assem_trpts_used,
          assem_trpts_used_frag, accuracy,
          [fragmented_1, frag_sum_1, ..., fragmented_5, frag_sum_5])
         (perc_chimaeras, chimaeras, perc_no_hits, no_hits)

     Percentages whose denominator is zero (no reference hits, no aligned
     bases, ``self.ref`` or ``self.express`` equal to zero) are reported as
     0.0 instead of raising ZeroDivisionError.
     """
     header = "ID\tLength\tFPKM\tExpected counts\n"

     def open_report(path, opened):
         # Register the handle first so it is closed even if the write fails.
         report = open(path, "w")
         opened.append(report)
         report.write(header)
         return report

     def isoform_row(trpt_id):
         stats = self.isoforms[trpt_id]
         return "%s\t%s\t%s\t%s\n" % (trpt_id, str(stats[0]), str(stats[1]),
                                      str(stats[2]))

     def percent(part, whole):
         return 100 * (part / whole) if whole else 0.0

     cov_sum = 0.0
     align_length_sum = 0.0
     corr_bases_sum = 0.0
     hits_ref_sum = 0.0
     cont_sum = 0.0
     frag_sum = 0.0
     # frag_sums[k] counts records with k + 1 fragments; the last slot is ">= 5".
     frag_sums = [0.0] * 5
     assem_trpts_used = 0.0
     assem_trpts_used_frag = 0.0

     opened = []
     try:
         cont = open_report("contiguity.txt", opened)
         frag_reports = [
             open_report("fragmentation_%d.txt" % level, opened)
             for level in range(1, 6)
         ]
         handle_rf_as = open(self.blast_output_rf_as)
         opened.append(handle_rf_as)

         self.check_trpts = {}
         for blast_record in NCBIXML.parse(handle_rf_as):
             # NOTE(review): field meanings below are inferred from how the
             # values are accumulated here; confirm against identifiedMetrics.
             result_iter = self.identifiedMetrics(blast_record)
             cov_sum += result_iter[0]
             align_length_sum += result_iter[1]
             corr_bases_sum += result_iter[2]

             if result_iter[3] == 1:
                 hits_ref_sum += 1
                 cont_sum += 1
                 assem_trpts_used += 1
                 cont.write(isoform_row(result_iter[5][0]))
                 continue

             n_fragments = result_iter[4]
             if n_fragments in (1, 2, 3, 4):
                 bucket = int(n_fragments) - 1
             elif n_fragments >= 5:
                 bucket = 4
             else:
                 # Neither contiguous nor fragmented: not counted as a hit.
                 continue

             hits_ref_sum += 1
             frag_sum += 1
             frag_sums[bucket] += 1
             assem_trpts_used += n_fragments
             assem_trpts_used_frag += n_fragments
             for trpt_id in result_iter[5]:
                 frag_reports[bucket].write(isoform_row(trpt_id))
     finally:
         for handle in opened:
             handle.close()

     identified = percent(hits_ref_sum, self.ref)
     completeness = percent(cov_sum, hits_ref_sum)
     contiguity = percent(cont_sum, hits_ref_sum)
     fragmented = percent(frag_sum, hits_ref_sum)
     accuracy = percent(corr_bases_sum, align_length_sum)
     fragment_stats = []
     for frag_count in frag_sums:
         fragment_stats.extend([percent(frag_count, hits_ref_sum), frag_count])
     result_completenessCont = (identified, hits_ref_sum, completeness,
                                cov_sum, contiguity, cont_sum, fragmented,
                                frag_sum, assem_trpts_used,
                                assem_trpts_used_frag, accuracy,
                                fragment_stats)

     chimaeras = 0.0
     no_hits = 0.0
     opened = []
     try:
         handle_as_rf = open(self.blast_output_as_rf)
         opened.append(handle_as_rf)
         non_mat = open_report("non_match.txt", opened)
         chim = open_report("chimerism.txt", opened)

         for blast_record in NCBIXML.parse(handle_as_rf):
             result_iter = self.chimerismNonMatch(blast_record)
             if result_iter[0] == 1:
                 chimaeras += 1
                 target = chim
             elif result_iter[1] == 1:
                 no_hits += 1
                 target = non_mat
             else:
                 continue
             # The ID is the query title up to the first space.
             target.write(isoform_row(str(blast_record.query).split(" ")[0]))
     finally:
         for handle in opened:
             handle.close()

     express_total = float(self.express)
     perc_chimaeras = percent(chimaeras, express_total)
     perc_no_hits = percent(no_hits, express_total)
     result_chimerism_ord = perc_chimaeras, chimaeras, perc_no_hits, no_hits
     return result_completenessCont, result_chimerism_ord
Esempio n. 54
0
def blastxml2gff3(blastxml, include_seq=False):
    """Convert BLAST XML results into SeqRecords carrying GFF3-style features.

    For every hit of every query, one ``SeqRecord`` is yielded. It holds a
    single top-level ``match`` feature with one ``match_part`` sub-feature
    per HSP.

    :param blastxml: handle passed straight to ``NCBIXML.parse``.
    :param include_seq: when true, the query/subject/midline strings of all
        HSPs of a hit are concatenated into the hit's ``blast_qseq``,
        ``blast_sseq`` and ``blast_mseq`` qualifiers.
    :returns: generator of ``SeqRecord`` objects.
    """
    seq_keys = ("blast_qseq", "blast_sseq", "blast_mseq")
    hsp_props = (
        "score",
        "bits",
        "identities",
        "positives",
        "gaps",
        "align_length",
        "strand",
        "frame",
        "query_start",
        "query_end",
        "sbjct_start",
        "sbjct_end",
    )

    blast_records = NCBIXML.parse(blastxml)
    for idx_record, record in enumerate(blast_records):
        # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
        # The generic SO term is used for every BLAST program.
        match_type = "match"
        collected_records = []

        # Keep only the first whitespace-delimited token of the query name
        recid = record.query
        if " " in recid:
            recid = clean_string(recid[0:recid.index(" ")])

        for idx_hit, hit in enumerate(record.alignments):
            # gotta check all hsps in a hit to see boundaries
            rec = SeqRecord(Seq("ACTG"), id=recid)
            parent_match_start = 0
            parent_match_end = 0
            hit_qualifiers = {
                "ID": "b2g.%s.%s" % (idx_record, idx_hit),
                "source": "blast",
                "accession": hit.accession,
                "hit_id": clean_string(hit.hit_id),
                "score": None,
                "length": hit.length,
                "hit_titles": clean_slist(hit.title.split(" >")),
                "hsp_count": len(hit.hsps),
            }
            desc = hit.title.split(" >")[0]
            hit_qualifiers["Name"] = desc
            sub_features = []
            for idx_hsp, hsp in enumerate(hit.hsps):
                if idx_hsp == 0:
                    # Seed the hit-level bounds and score from the first HSP
                    parent_match_start = hsp.query_start
                    parent_match_end = hsp.query_end
                    hit_qualifiers["score"] = hsp.expect
                # The hit-level score is the smallest e-value among its HSPs
                hit_qualifiers["score"] = min(hit_qualifiers["score"],
                                              hsp.expect)
                hsp_qualifiers = {
                    "ID": "b2g.%s.%s.hsp%s" % (idx_record, idx_hit, idx_hsp),
                    "source": "blast",
                    "score": hsp.expect,
                    "accession": hit.accession,
                    "hit_id": clean_string(hit.hit_id),
                    "length": hit.length,
                    "hit_titles": clean_slist(hit.title.split(" >")),
                }
                if include_seq:
                    # Each key must be tested on its own: a tuple is never a
                    # key of the dict, so testing the tuple itself would
                    # always take the overwrite branch.
                    if all(key in hit_qualifiers for key in seq_keys):
                        hit_qualifiers.update({
                            "blast_qseq":
                            hit_qualifiers["blast_qseq"] + hsp.query,
                            "blast_sseq":
                            hit_qualifiers["blast_sseq"] + hsp.sbjct,
                            "blast_mseq":
                            hit_qualifiers["blast_mseq"] + hsp.match,
                        })
                    else:
                        hit_qualifiers.update({
                            "blast_qseq": hsp.query,
                            "blast_sseq": hsp.sbjct,
                            "blast_mseq": hsp.match,
                        })
                # Copy the raw HSP attributes; attributes missing on the HSP
                # object are stored as None.
                for prop in hsp_props:
                    hsp_qualifiers["blast_" + prop] = getattr(hsp, prop, None)

                # check_bounds is defined elsewhere in this file; presumably
                # it widens the parent bounds to envelope this HSP -- confirm.
                parent_match_start, parent_match_end = check_bounds(
                    parent_match_start, parent_match_end, hsp.query_start,
                    hsp.query_end)

                # add hsp to the gff3 feature as a "match_part"
                sub_features.append(
                    SeqFeature(
                        FeatureLocation(hsp.query_start - 1, hsp.query_end),
                        type="match_part",
                        strand=0,
                        qualifiers=copy.deepcopy(hsp_qualifiers),
                    ))

            # Build the top level seq feature for the hit
            hit_qualifiers["description"] = "Hit to %s..%s of %s" % (
                parent_match_start,
                parent_match_end,
                desc,
            )
            top_feature = SeqFeature(
                FeatureLocation(parent_match_start - 1, parent_match_end),
                type=match_type,
                strand=0,
                qualifiers=hit_qualifiers,
            )
            # add the generated subfeature hsp match_parts to the hit feature,
            # ordered by start position
            top_feature.sub_features = copy.deepcopy(
                sorted(sub_features, key=lambda x: int(x.location.start)))
            # Add the hit feature to the record
            rec.features.append(top_feature)
            rec.annotations = {}
            collected_records.append(rec)
        for rec in collected_records:
            yield rec
Esempio n. 55
0
    parser.add_argument('-e',
                        '--e_value',
                        help='E-value threshold',
                        metavar='float number',
                        type=float)

    args = parser.parse_args()

    input_file = args.input
    e_value_threshold = args.e_value

    app_output_path = "aligned.fasta"
    not_app_output_path = "nonaligned.fasta"

    with open(app_output_path,
              'w') as out_al_file, open(not_app_output_path,
                                        'w') as out_nonal_file:
        for fasta in SeqIO.parse(input_file, "fasta"):
            query = NCBIWWW.qblast("blastn",
                                   "nt",
                                   fasta.seq,
                                   expect=e_value_threshold,
                                   format_type="XML")
            blast_result = NCBIXML.parse(query)
            for result in blast_result:
                if len(result.alignments) > 0:
                    SeqIO.write(fasta, out_al_file, "fasta")
                elif len(result.alignments) == 0:
                    SeqIO.write(fasta, out_nonal_file, "fasta")
Esempio n. 56
0
# Now, we'll perform a BLAST search for other sequences
# which are homologous to the sequence of the human beta-2
# adrenergic receptor obtained above.
# Here, we'll search for a homolog from Bos taurus (cow).
result_handle = NCBIWWW.qblast("blastp",
                               "swissprot",
                               query_seq,
                               hitlist_size=1,
                               entrez_query="Bos taurus[orgn]")
# ...and write the result to an .xml file (closed as soon as it is written)
with open("Bovine_seq.xml", "w") as blast_result:
    blast_result.write(result_handle.read())
# Now we can read this file at any time and get the sequence, its id, etc.
# The report is parsed once and the file handle is closed right away.
with open("Bovine_seq.xml") as saved_result:
    bovine_record = NCBIXML.read(saved_result)
top_alignment = bovine_record.alignments[0]
# formatted string with the details of this alignment
print(str(top_alignment))
print('Seq ID: ', top_alignment.hit_id.split("|")[1])
# getting the sequence of an HSP (high scoring segment pair)
print('Protein sequence: ', top_alignment.hsps[0].sbjct)

# This loop reads the species list specified in the species.txt file and
# prints the Entrez queries. Each line is split on "."; the first field is
# used as the Latin name and the second as the common name.
with open("./data/species.txt", 'r') as species:
    species_lines = species.readlines()
    for line in species_lines:
        name_fields = line.split(".")
        species_latin = name_fields[0].strip()
        species_common = name_fields[1].strip()
        entrez_q = str(species_latin + "[orgn]")
        print(species_common, ': ', entrez_q)
                                   #### gets the length of query and stores to a variable
total = 0
# NOTE(review): sys.argv[2] is used both as the glob pattern here and as a
# literal file name below, so the two only agree when it names a single
# file -- confirm against how the script is invoked.
for filename in glob.glob(sys.argv[2]):
    # Length of the query sequence; the identity and length screens below
    # are expressed relative to it.
    record = SeqIO.read(filename, "fasta")
    query_length = len(record)

    # The BLAST XML report is read from "<argv[2]>.xml"
    filename = sys.argv[2]
    filename2 = filename + ".xml"

    # Parse the single-query report; the handle is closed once it is parsed
    with open(filename2) as result_handle:
        blast_record = NCBIXML.read(result_handle)
    counter = 0

    # Screen the BLAST output records against the parameters
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            # NOTE(review): Biopython's alignment.length is presumably the
            # length of the hit sequence, not of the aligned region -- verify.
            alignment_length = alignment.length
            identical_residues = hsp.identities
            # Identities as a fraction of the full query length
            percent_identity = float(identical_residues) / float(query_length)

            # Identity window: (low_identity, high_identity]
            cond1 = percent_identity <= high_identity
            cond2 = percent_identity > low_identity
            # Length window: (query_length * low_length, query_length * high_length]
            cond3 = alignment_length <= query_length * high_length
            cond4 = alignment_length > query_length * low_length
                                   #### write blast output that passes screens
Esempio n. 58
0
def test_calculate_mean_sd():
    """Check ``local_blast.calculate_mean_sd`` against a hand-computed result.

    Runs a local filter BLAST of a precooked sequence file against itself,
    collects the per-hit scores from the XML output, and compares the mean
    and (population) standard deviation of the bit scores with the values
    returned by ``calculate_mean_sd``, both rounded to four decimals.
    """
    conf = physcraper.ConfigObj(configfi, interactive=False)
    with open("tests/data/precooked/tiny_dataobj.p", 'rb') as data_handle:
        data_obj = pickle.load(data_handle)
    data_obj.workdir = absworkdir
    ids = physcraper.IdDicts(conf, workdir=data_obj.workdir)
    with open("tests/data/precooked/tiny_acc_map.p", "rb") as acc_handle:
        ids.acc_ncbi_dict = pickle.load(acc_handle)

    filteredScrape = physcraper.FilterBlast(data_obj, ids)

    # test begins
    fn = 'Senecio_scopolii_subsp._scopolii'
    # partly copy of read_local_blast_query
    general_wd = os.getcwd()
    blast_dir = os.path.join(filteredScrape.workdir, "blast")
    if not os.path.exists(blast_dir):
        os.makedirs(blast_dir)

    fn_path = './tests/data/precooked/fixed/local-blast/{}'.format(fn)
    fn_path = os.path.abspath(fn_path)
    print(fn_path)
    output_blast = os.path.join(filteredScrape.workdir,
                                "blast/output_{}.xml".format(fn))

    # The BLAST run happens inside the blast directory. The finally clause
    # guarantees the original working directory is restored even when the
    # run or the open fails, so a failure here cannot affect other tests.
    os.chdir(blast_dir)
    try:
        local_blast.run_filter_blast(filteredScrape.workdir,
                                     fn_path,
                                     fn_path,
                                     output=output_blast)
        xml_file = open(output_blast)
    finally:
        os.chdir(general_wd)

    hsp_scores = {}
    add_hsp = 0
    with xml_file:
        blast_out = NCBIXML.parse(xml_file)
        for record in blast_out:
            for alignment in record.alignments:
                for hsp in alignment.hsps:
                    # The second space-separated token of the title is used
                    # as the integer key of the hit
                    gi = int(alignment.title.split(" ")[1])
                    hsp_scores[gi] = {
                        "hsp.bits": hsp.bits,
                        "hsp.score": hsp.score,
                        "alignment.length": alignment.length,
                        "hsp.expect": hsp.expect
                    }
                    add_hsp = add_hsp + float(hsp.bits)
    # make values to select for blast search, calculate standard deviation, mean
    mean_sed = local_blast.calculate_mean_sd(hsp_scores)
    sum_hsp = len(hsp_scores)
    mean = (add_hsp / sum_hsp)
    # Sum of squared deviations of the bit scores from the mean
    sd_all = 0
    for item in hsp_scores:
        val = hsp_scores[item]["hsp.bits"]
        sd = (val - mean) * (val - mean)
        sd_all += sd
    sd_val = sqrt(sd_all / sum_hsp)

    assert round(sd_val, 4) == round(mean_sed['sd'], 4)
    assert round(mean, 4) == round(mean_sed['mean'], 4)
def blastxml2gff3(blastxml, include_seq=False):
    """Turn BLAST XML output into SeqRecords with gapped GFF3-style features.

    One ``SeqRecord`` is yielded per query. Each hit becomes a top-level
    feature whose sub-features are the ``match_part`` pieces produced by
    ``generate_parts`` for every HSP of that hit.

    :param blastxml: handle passed to ``NCBIXML.parse``.
    :param include_seq: when true, the query, subject and midline strings of
        each HSP are stored in the qualifiers of its match parts.
    :returns: generator of ``SeqRecord`` objects.
    """
    from Bio.Blast import NCBIXML
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
    # Only BLASTN and BLASTP get a specific term; anything else is "match".
    so_terms = {
        "BLASTN": "nucleotide_match",
        "BLASTP": "protein_match",
    }
    # HSP attributes copied into the qualifiers with a "blast_" prefix
    copied_props = (
        "score",
        "bits",
        "identities",
        "positives",
        "gaps",
        "align_length",
        "strand",
        "frame",
        "query_start",
        "query_end",
        "sbjct_start",
        "sbjct_end",
    )

    for record_no, blast_record in enumerate(NCBIXML.parse(blastxml)):
        match_type = so_terms.get(blast_record.application, "match")

        # The record id is the query name up to its first space
        query_id = blast_record.query.split(" ", 1)[0]
        gff_record = SeqRecord(Seq("ACTG"), id=query_id)

        for hit_no, hit in enumerate(blast_record.alignments):
            # NOTE(review): both bounds start at 0, so the start can only
            # move if an HSP reports a query_start below 0 -- confirm this
            # is intended.
            span_start = 0
            span_end = 0
            hit_qualifiers = {
                "ID": "b2g.%s.%s.%s" % (record_no, hit_no, "0"),
                "source": "blast",
                "accession": hit.accession,
                "hit_id": hit.hit_id,
                "length": hit.length,
                "hit_titles": hit.title.split(" >"),
                "hsp_count": len(hit.hsps),
            }
            match_parts = []

            for hsp_no, hsp in enumerate(hit.hsps):
                hsp_qualifiers = {
                    "ID": "b2g.%s.%s.%s" % (record_no, hit_no, hsp_no),
                    "source": "blast",
                    "score": hsp.expect,
                    "accession": hit.accession,
                    "hit_id": hit.hit_id,
                    "length": hit.length,
                    "hit_titles": hit.title.split(" >"),
                }
                if include_seq:
                    hsp_qualifiers["blast_qseq"] = hsp.query
                    hsp_qualifiers["blast_sseq"] = hsp.sbjct
                    hsp_qualifiers["blast_mseq"] = hsp.match

                # Attributes missing on the HSP object are stored as None
                hsp_qualifiers.update(
                    ("blast_" + prop, getattr(hsp, prop, None))
                    for prop in copied_props)

                # Description is the first title from its first space onwards
                first_title = hit.title.split(" >")[0]
                hsp_qualifiers["description"] = first_title[
                    first_title.index(" "):]

                # Grow the hit-level span when this HSP reaches outside it
                span_start = min(span_start, hsp.query_start)
                if hsp.query_end > span_end:
                    span_end = hsp.query_end + 1

                # One match_part feature per piece reported by generate_parts
                pieces = generate_parts(hsp.query,
                                        hsp.match,
                                        hsp.sbjct,
                                        ignore_under=10)
                for piece_no, (start, end, cigar) in enumerate(pieces):
                    hsp_qualifiers["Gap"] = cigar
                    hsp_qualifiers["ID"] = hit_qualifiers["ID"] + (
                        ".%s" % piece_no)

                    # Every piece is anchored at the HSP's query start; its
                    # length is (end - start) rather than hsp.align_length,
                    # which counts gaps in the parent sequence.
                    piece_start = hsp.query_start
                    piece_end = piece_start + (end - start)

                    # deepcopy because hsp_qualifiers is mutated per piece
                    match_parts.append(
                        SeqFeature(
                            FeatureLocation(piece_start, piece_end),
                            type="match_part",
                            strand=0,
                            qualifiers=copy.deepcopy(hsp_qualifiers),
                        ))

            # Top-level feature for the hit, carrying its match parts
            hit_feature = SeqFeature(
                FeatureLocation(span_start, span_end),
                type=match_type,
                strand=0,
                qualifiers=hit_qualifiers,
            )
            hit_feature.sub_features = copy.deepcopy(match_parts)
            gff_record.features.append(hit_feature)

        gff_record.annotations = {}
        yield gff_record
Esempio n. 60
0
def showBlastMapping():
    '''
    For each protein, create an overview of where the hits were mapped over the length of the protein.

    HSPs are split into six e-value bins. For every bin the per-position
    coverage of the query is counted, scaled so the best covered position
    is 100, and drawn as a 100 pixel high band of vertical lines.

    :creates: `blastmappings/*.png`
    '''

    os.makedirs('blastmappings', exist_ok=True)

    fnames = sorted(CR.getProteinFiles())

    fnt = ImageFont.load_default()

    # Lower e-value limits of bins 0..4; anything at or below the last one
    # goes into bin 5.
    thresholds = (1e-15, 1e-30, 1e-60, 1e-90, 1e-120)
    num_bins = len(thresholds) + 1
    labels = ('> 1e-15', '> 1e-30', '> 1e-60', '> 1e-90', '> 1e-120',
              '<= 1e-120')
    colors = [(0, 0, 0), (0, 0, 200), (0, 200, 0), (200, 0, 200),
              (200, 0, 0), (150, 150, 0)]

    for fname in fnames:
        print('Mapping {:<50}'.format(fname), end='\r')

        # Query length is the number of sequence characters after the
        # FASTA header line
        query_length = 0
        with open('fastas/{}.fasta'.format(fname), 'r') as f:
            next(f)
            for line in f:
                query_length += len(line.rstrip())

        # The builtin int is used as dtype because the np.int alias was
        # removed from NumPy
        counters = [np.zeros(query_length, int) for _ in range(num_bins)]
        numHsps = [0] * num_bins

        with open('blastresults/{}.xml'.format(fname), 'r') as f:
            records = NCBIXML.parse(f)

            for record in records:
                for alignment in record.alignments:
                    for hsp in alignment.hsps:
                        # First bin whose limit the e-value exceeds
                        n = next((idx for idx, limit in enumerate(thresholds)
                                  if hsp.expect > limit), num_bins - 1)
                        # BLAST coordinates are 1-based and inclusive, so the
                        # 0-based half-open slice ends at query_end
                        counters[n][hsp.query_start - 1:hsp.query_end] += 1
                        numHsps[n] += 1

        # Scale each bin so its best covered position becomes 100
        ma = [np.amax(counters[n]) * 0.01 for n in range(num_bins)]

        # Bins without any hit are drawn as a flat line of height 1
        counters = [
            counters[n] / ma[n] if ma[n] != 0 else np.ones(query_length, int)
            for n in range(num_bins)
        ]

        # 60 pixel left margin for the labels, one pixel column per residue
        im = Image.new('RGB', (query_length + 60, 600), (255, 255, 255))
        dr = ImageDraw.Draw(im)

        for n, label in enumerate(labels):
            dr.text((2, 40 + 100 * n), label, (0, 0, 0), fnt)
            dr.text((2, 60 + 100 * n), 'n = {}'.format(numHsps[n]), (0, 0, 0),
                    fnt)

        # Vertical grid line every 100 residues
        for n in range(int(query_length / 100)):
            col = 160 + n * 100
            dr.line([(col, 0), (col, 600)], fill=(125, 125, 125), width=1)

        for n in range(num_bins):
            for col, thickness in enumerate(counters[n]):
                dr.line([(col + 60, n * 100), (col + 60, thickness + n * 100)],
                        fill=colors[n],
                        width=1)

        #im.show()
        im.save('blastmappings/{}.png'.format(fname))