Esempio n. 1
0
def blast_align(fasta,blast_path,miRNA_db,mRNA_db):
    """Run blastn for one FASTA file against the miRNA and mRNA databases.

    Both searches are launched through the shell with XML output
    (``-outfmt 5``) redirected to temporary files whose paths are prefixed
    with the module-level ``args.output``. The query FASTA file is removed
    after both searches have run.

    Returns a ``(miRNA_records, mRNA_records)`` tuple of lazy
    ``NCBIXML.parse`` iterators over the two XML files.
    """
    shared_flags = " -task blastn -outfmt 5 -num_threads 6 -evalue "
    mirna_xml = args.output + "temp_blast_miRNA.xml"
    mrna_xml = args.output + "temp_blast_mRNA.xml"

    # The miRNA search uses a looser E-value cutoff than the mRNA search.
    searches = (("1e-3", miRNA_db, mirna_xml), ("1e-5", mRNA_db, mrna_xml))
    for cutoff, database, xml_path in searches:
        os.system(blast_path + shared_flags + cutoff + " -db " + database
                  + " -query " + fasta + " > " + xml_path)

    # The query file is not needed once both searches have finished.
    os.system("rm " + fasta)

    return (NCBIXML.parse(open(mirna_xml)), NCBIXML.parse(open(mrna_xml)))
Esempio n. 2
0
def main():
    #initialization
    n=0 # total number of query seq
    align_mi=0
    align_m=0    


    args=ParseArg()
    miRNA_result=open(args.mi_xml)
    mRNA_result=open(args.m_xml)
    miRNA_records=NCBIXML.parse(miRNA_result)
    mRNA_records=NCBIXML.parse(mRNA_result)
    output=open(args.output,'w')
    
    
    # E-values
    if args.evalue==0:
        evalue_mi=1e-5
        evalue_m=1e-15
    else:
        evalue_mi=float(args.evalue[0])
        evalue_m=float(args.evalue[1])
    
    for mi_record,m_record in itertools.izip(miRNA_records,mRNA_records):
        temp_output=''
        mi_indic=0 # whether there are miRNA alignment
        m_indic=0  # whether there are mRNA alignment
        mi_end=150  #shortest miRNA aligned end in query sequence
        n=n+1
        if (mi_record.query!=m_record.query):
            print >>sys.stderr,"The two query seqs from miRNA and mRNA results are not matched!"
            break
        temp_output=mi_record.query+'\n'
        for alignment in mi_record.alignments:
            for hsp in alignment.hsps:
                if hsp.expect < evalue_mi:
                    mi_indic=1
                    line="\t".join (str(f) for f in [hsp.query_start,hsp.query_end,alignment.title,hsp.sbjct,hsp.sbjct_start,hsp.sbjct_end,hsp.expect,hsp.score])
                    temp_output=temp_output+line+'\n'
                    if mi_end>max(hsp.query_start,hsp.query_end):
                        mi_end=max(hsp.query_start,hsp.query_end)
        
        if mi_indic==0:
            mi_end=0

        for alignment in m_record.alignments:
            for hsp in alignment.hsps:
                if (hsp.expect < evalue_m) and (min(hsp.query_start,hsp.query_end)>mi_end):
                    m_indic=1
                    line="\t".join (str(f) for f in [hsp.query_start,hsp.query_end,alignment.title,hsp.sbjct,hsp.sbjct_start,hsp.sbjct_end,hsp.expect,hsp.score])
                    temp_output=temp_output+line+'\n'
        if mi_indic+m_indic>=2:
            output.write(temp_output)
        if mi_indic==1:
            align_mi+=1
        if m_indic==1:
            align_m+=1
    print n,align_mi,align_m
Esempio n. 3
0
def bestrecipblast(org, seed, thresh=5, queue=None):
    '''Returns the best pairwise reciprocal BLAST using seed accession no. from 
    against org organism'''
    seedorg=FetchUtil.fetch_organism(seed)[0]
    acclist={}
    ac=[]
    FetchUtil.fetch_fasta(seed)
    dum=str(int(int(seed.split('.')[0][-5:])*random.random()))
        
    os.system('blastp -db nr -query Orthos/'+seed+'.fasta -evalue '+str(thresh)+
              ' -out XML/'+dum+'.xml -outfmt 5 -entrez_query \"'+org+'[ORGN]\" -use_sw_tback'+
              ' -remote')
    qoutput=open('XML/'+dum+'.xml')
        
    parser=NCBIXML.parse(qoutput)
    for lin in parser:
        for align in lin.alignments:
            for hsp in align.hsps:
                if (hsp.positives/float(hsp.align_length))>=.4 and (float(hsp.align_length)/len(hsp.query))>=.25:
                    ac.append(align.title.split('|')[1])
    print("Done. Number of sequences found: "+repr(len(ac)))

    for o in ac:
        print o
        FetchUtil.fetch_fasta(o)
        os.system('blastp -db nr -query Orthos/'+o+'.fasta -evalue '+str(thresh)+
              ' -out XML/'+dum+'.xml -outfmt 5 -entrez_query \"'+seedorg[0]+'[ORGN]\" -use_sw_tback'+
              ' -remote')
        q1output=open('XML/'+dum+'.xml')
        parse=NCBIXML.parse(q1output)
        acc=[]
        print 'blasted'
        for lin in parse:
            for align in lin.alignments:
                for hsp in align.hsps:
                    if (hsp.positives/float(hsp.align_length))>=.4 and (float(hsp.align_length)/len(hsp.query))>.25:
                        acc.append(align.title.split('|')[1])
                    else:
                        continue

        print "Done. Number of sequences found: "+repr(len(acc))
            
        if seed in acc:
            print 'it\'s twue!'
            name=FetchUtil.fetch_organism(o)[0]
            try:
                acclist[name]=[o,str(ac.index(o)+1)+'/'+str(len(ac)),str(acc.index(seed)+1)+'/'+str(len(acc))]
            except KeyError:
                acclist.update({name:[o,str(ac.index(o)+1)+'/'+str(len(ac)),str(acc.index(seed)+1)+'/'+str(len(acc))]})
                
            open('dicts/'+seed,'a').write(str(acclist)+'\n')
            break
	#elapsed=time.time()-start
	#print "Time elapsed: "+time.strftime('%M:%S',[elapsed])
    if queue is not None:
      queue.put(acclist)
    else:
      return acclist
    def __init__(self, fhand, subj_def_as_accesion=None):
        '''Set up the parser from an open BLAST XML file handle.

        ``fhand`` must be seekable. A non-empty file whose first ten
        characters do not contain ``xml`` is rejected with ValueError.
        ``subj_def_as_accesion``, when not None, overrides the automatically
        chosen value of ``use_subject_def_as_accession``.
        '''
        # Sniff the start of the file, then rewind for the real readers.
        fhand.seek(0, 0)
        head = fhand.read(10)
        if head and 'xml' not in head:
            raise ValueError('Not a xml file')
        fhand.seek(0, 0)

        self._blast_file = fhand
        metadata = self._get_blast_metadata()
        blast_version = metadata['version']
        plus = metadata['plus']
        self.db_name = metadata['db_name']

        self._blast_file.seek(0, 0)

        # The query definition is always used as accession; the subject
        # definition only for "plus" builds or versions after 2.2.21.
        # NOTE(review): the version check is a plain string comparison, so
        # multi-digit components may not order numerically -- confirm the
        # format returned by _get_blast_metadata().
        self.use_query_def_as_accession = True
        self.use_subject_def_as_accession = bool(
            blast_version and (plus or blast_version > '2.2.21'))

        if subj_def_as_accesion is not None:
            self.use_subject_def_as_accession = subj_def_as_accesion

        # The Biopython parser is only created when the file starts with an
        # XML tag; otherwise the parse result stays None (no results).
        self._blast_parse = None
        first_char = fhand.read(1)
        if first_char == '<':
            fhand.seek(0)
            self._blast_parse = NCBIXML.parse(fhand)
Esempio n. 5
0
def detection_no_hits_found(files, out_dir="/Volumes/BioSan/Users/dpflieger/GB-3G/Blast_scaffold/"):
    """
    Print and record the query sequences for which BLAST found no hit.

    Each query without any alignment is printed, and one
    ``<strain>.no_hit_found.txt`` file per strain is written to ``out_dir``
    with one query name per line. The strain is the name of the directory
    that contains the XML file.

    :param files: iterable of paths to BLAST XML result files
    :param out_dir: directory receiving the per-strain report files
    :return: None
    """

    # {strain0: [scaffold_0, scaffold_1, ...], strain1: [...], ...}
    dict_no_hit = defaultdict(list)

    for path in files:
        strain = os.path.basename(os.path.dirname(path))
        # Close every XML file as soon as it has been scanned.
        with open(path) as handle:
            for blast_record in NCBIXML.parse(handle):
                if blast_record.alignments:
                    continue
                # Keep only the first word of the query definition.
                query = blast_record.query.split()[0]
                print(query)
                dict_no_hit[strain].append(query)

    # .items() works on both Python 2 and 3 (unlike .iteritems()).
    for strain, scaffolds in dict_no_hit.items():
        out_path = os.path.join(out_dir, strain + ".no_hit_found.txt")
        with open(out_path, "w") as outfile:
            print("\n".join(scaffolds), file=outfile)
def blastparse(blast_handle, genome, gene):
    """Mark ``gene`` as present in ``genome`` when BLAST finds a good match.

    Parses the BLAST XML in ``blast_handle`` and, for every HSP, appends
    ``"+"`` to ``plusdict[genome][gene]`` if that list is still empty and
    identities / ``alignment.length`` is at least 0.7. Updates happen while
    holding the module-level ``threadlock``.

    ``plusdict[genome][gene]`` must already exist. Returns None.
    """
    global plusdict
    records = NCBIXML.parse(blast_handle)
    dotter()
    for record in records:
        for alignment in record.alignments:
            for hsp in alignment.hsps:
                threadlock.acquire()
                try:
                    identity = abs(float(hsp.identities) / alignment.length)
                    if plusdict[genome][gene] == [] and identity >= 0.7:
                        plusdict[genome][gene].append("+")
                finally:
                    # Always release, otherwise an exception above would
                    # leave the lock held and block every other thread.
                    threadlock.release()
Esempio n. 7
0
 def get_fancy_results_list(self, blast_results, num_results = 20):
     """Build display entries for the top hits of the first BLAST record.

     ``blast_results`` is a handle on BLAST XML output; only its first
     record is used, and at most ``num_results`` alignments of it. Each
     entry is a ``b6lib.B6Entry`` filled from the alignment and its first
     HSP. Identity is the percentage of ``|`` match characters over the
     aligned query string; coverage is the aligned query string length as a
     percentage of the query length.

     The handle is closed on a best-effort basis before returning.
     Raises IndexError when the XML contains no record.
     """
     blast_results_list = []

     blast_record = list(NCBIXML.parse(blast_results))[0]
     shown = min(len(blast_record.alignments), num_results)

     # max() keeps a negative num_results meaning "no entries".
     for alignment in blast_record.alignments[:max(shown, 0)]:
         hsp = alignment.hsps[0]

         entry = b6lib.B6Entry()
         entry.q_len = int(blast_record.query_length)
         entry.query_length = entry.q_len

         entry.hit_def = alignment.hit_def
         entry.subject_id = entry.hit_def
         entry.accession = alignment.accession
         entry.ncbi_link = 'http://www.ncbi.nlm.nih.gov/nuccore/%s' % entry.accession
         entry.hsp_query = hsp.query
         entry.hsp_match = hsp.match
         entry.hsp_subject = hsp.sbjct

         entry.identity = hsp.match.count('|') * 100.0 / len(entry.hsp_query)
         entry.coverage = len(hsp.query) * 100.0 / entry.query_length

         blast_results_list.append(entry)

     # Closing is best effort (the handle may not support it), but it must
     # not swallow KeyboardInterrupt/SystemExit as a bare except would.
     try:
         blast_results.close()
     except Exception:
         pass

     return blast_results_list
Esempio n. 8
0
def parseBlastResult(fileName):
    """Summarise every hit of a BLAST XML file.

    Returns a list of ``(query, hit_id, max_identity, hit_def)`` tuples, one
    per alignment, where ``max_identity`` is the best
    identities / alignment-length ratio among the alignment's HSPs, as a
    truncated integer percentage. A query without any alignment contributes
    the single placeholder tuple ``(query, "-", 0, "-")``.
    """
    results = []

    # The context manager closes the XML file once parsing is done.
    with open(fileName) as handle:
        for record in NCBIXML.parse(handle):
            rec_id = str(record.query)

            if not record.alignments:
                results.append((rec_id, "-", 0, "-"))
                continue

            for algn in record.alignments:
                best_ratio = max(hsp.identities / float(hsp.align_length)
                                 for hsp in algn.hsps)
                results.append((rec_id, algn.hit_id, int(best_ratio * 100), algn.hit_def))

    return results
def parsePsiBlast(psiblastfilename, max_evalue):
    """Collect the best E-value of every subject in a PSI-BLAST XML file.

    Returns a dict mapping each alignment title to the smallest HSP E-value
    found for it, considering only HSPs with an E-value of at most
    ``max_evalue``.

    Any error while opening or parsing the file is reported through
    ``dieError``.
    """
    try:
        results_dict = {}

        with open(psiblastfilename, 'r') as handle:
            for blast_record in NCBIXML.parse(handle):
                for alignment in blast_record.alignments:
                    subjid = alignment.title
                    for hsp in alignment.hsps:
                        if hsp.expect <= max_evalue and (
                                subjid not in results_dict
                                or hsp.expect < results_dict[subjid]):
                            results_dict[subjid] = hsp.expect

        return results_dict

    # "Exception" rather than a bare except, so Ctrl-C and SystemExit are
    # not misreported as a PSI-BLAST failure.
    except Exception:
        dieError('ERROR: PSI-BLAST failed.')
Esempio n. 10
0
def parse_results(result_file, e_val_thresh, ident_thresh, align_thresh):
    """Print a tab-separated top-hit summary for every query of a BLAST XML file.

    For each query only the first alignment is considered. Its row is filled
    in when the E-value of its first HSP is below ``e_val_thresh``, the
    percentage identity summed over all its HSPs is above ``ident_thresh``
    and the summed alignment length is above ``align_thresh``. Otherwise the
    row holds the query id followed by five empty columns.
    """
    with open(result_file, 'r') as result_handle:  # The XML file to parse.
        blast_records = NCBIXML.parse(result_handle)
        print('query_id\thit_id\tpercentage_identity\tquery_length\talignment_length\te_value')

        for record in blast_records:  # Loop through each query.
            query_id = record.query
            row = None

            if len(record.alignments) > 0:  # Are there any hits?
                top = record.alignments[0]
                e_val = top.hsps[0].expect
                if e_val < e_val_thresh:
                    # Identities and lengths are summed over all HSPs of the top hit.
                    tot_ident = sum(hsp.identities for hsp in top.hsps)
                    align_len = sum(hsp.align_length for hsp in top.hsps)
                    pct_ident = tot_ident / float(align_len) * 100
                    if pct_ident > ident_thresh and align_len > align_thresh:
                        row = '%s\t%s\t%f\t%i\t%i\t%s' % (
                            query_id, top.hit_id + top.hit_def, pct_ident,
                            record.query_length, align_len, str(e_val))

            if row is None:
                # No hit, or the top hit failed one of the thresholds.
                row = '%s\t%s\t%s\t%s\t%s\t%s' % (query_id, '', '', '', '', '')
            print(row)
Esempio n. 11
0
    def create_rel(self, XMLin):
        """Relate each sequence name to the query regions to mask.

        Every HSP of the BLAST XML in ``XMLin`` contributes its query
        interval (start and end swapped when reversed). Intervals of the
        same query are sorted and overlapping ones are merged.

        Returns a dictionary mapping the first word of the query name to a
        sorted list of non-overlapping ``(start, end)`` tuples. Queries
        without any HSP are absent.
        """
        segments = {}
        for b_record in NCBIXML.parse(XMLin):
            for alin in b_record.alignments:
                for hsp in alin.hsps:
                    qs, qe = hsp.query_start, hsp.query_end
                    if qs > qe:
                        qs, qe = qe, qs
                    name = b_record.query.split(" ")[0]
                    segments.setdefault(name, set()).add((qs, qe))

        # Sort and merge overlapping segments.
        merged = {}
        for name, intervals in segments.items():
            joined = []
            for qs, qe in sorted(intervals):
                if joined and joined[-1][1] >= qs:
                    last_qs, last_qe = joined[-1]
                    # max() keeps the merged end when the new segment lies
                    # entirely inside the previous one; assigning qe directly
                    # would shrink the masked region.
                    joined[-1] = (last_qs, max(last_qe, qe))
                else:
                    joined.append((qs, qe))
            merged[name] = joined

        return merged
Esempio n. 12
0
def run_blastp(match, blastdb):
    """Run blastp on the protein FASTA of every feature of ``match``.

    Features with an empty protein FASTA are skipped. For the others a
    blastp process is started through the shell against ``blastdb``, the
    FASTA text is written to its stdin and the first record of its XML
    output is parsed. Failures to start the process (OSError) or to parse
    the output (ValueError) are logged as warnings and the feature is
    skipped. The parsed record is not used further in this function.
    """
    from Bio.Blast.Applications import NcbiblastpCommandline

    for feature in match.features:
        record = None
        protein_fasta = feature.protein_fasta()
        if protein_fasta == "":
            continue
        try:
            command = NcbiblastpCommandline(db=blastdb, outfmt=5, num_threads=4)
            process = subprocess.Popen(
                str(command),
                shell=True,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            # Feed the query on stdin and close it so blastp sees end of input.
            process.stdin.write(protein_fasta)
            process.stdin.close()
            record = next(NCBIXML.parse(process.stdout))
            process.stdout.close()
            process.stderr.close()
        except OSError as err:
            logging.warning("Failed to run blastp: %s" % err)
            continue
        except ValueError as err:
            logging.warning("Parsing blast output failed: %s" % err)
            continue
Esempio n. 13
0
def BLAST_to_BRIG(BLASTfile, resultsFile):
    """Convert a BLAST XML file into a ten-column tab-separated file.

    One line is written per alignment, using only its first HSP. Columns:
    query, hit definition, identities / ``alignment.length`` rounded to two
    decimals, score, ``alignment.length``, ``alignment.length`` minus
    identities, query start, query start + ``alignment.length``, subject
    start, and query start + ``alignment.length`` again.
    """
    # Both files are closed by the context manager, including on error.
    with open(BLASTfile) as rec, open(resultsFile, "w") as tabFile:
        for blast_record in NCBIXML.parse(rec):
            for alignment in blast_record.alignments:
                if not alignment.hsps:
                    continue
                match = alignment.hsps[0]  # only the first HSP is reported

                hit_length = int(alignment.length)
                query_end = int(match.query_start) + hit_length
                # NOTE(review): the last column repeats the query-based end
                # right after the subject start; a subject end
                # (sbjct_start + length) may have been intended -- confirm
                # before changing the output format.
                tabFile.write(
                    "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
                    % (
                        blast_record.query,
                        alignment.hit_def,
                        round(float(match.identities) / float(alignment.length), 2),
                        int(match.score),
                        alignment.length,
                        hit_length - int(match.identities),
                        match.query_start,
                        query_end,
                        match.sbjct_start,
                        query_end,
                    )
                )
def parse_blast_XML(blast_xml):
    """
    Yield one ``BlastResult`` per alignment of a BLAST XML file.

    For every alignment the protein sequence is fetched from Entrez in FASTA
    format. The species is built from the first two words found between the
    square brackets of the FASTA header, joined with an underscore (a single
    word is used as is). The E-value is the smallest HSP E-value, capped at 1.

    Yields ``BlastResult(hit_id[1], species, sequence, evalue, coverage)``.
    """
    with open(blast_xml, 'r') as blast_xml_op:
        for record in NCBIXML.parse(blast_xml_op):
            for align in record.alignments:
                hit_id = align.hit_id.split("|")
                # TODO: replace the hard-coded 390 with the real query length.
                coverage = align.length / 390
                prev_eval = min([1] + [hsp.expect for hsp in align.hsps])

                # Start from empty values so nothing leaks over from the
                # previous alignment when the response has no header.
                id_info = ""
                chunks = []
                # NOTE(review): the whole split list is passed as ``id``
                # although only hit_id[1] is reported below -- confirm intent.
                efetch = Entrez.efetch(db="protein", id=hit_id, rettype="fasta")
                try:
                    for line in efetch:
                        line = line.rstrip()
                        if line.startswith(">"):
                            id_info = line
                            chunks = []
                        else:
                            chunks.append(line)
                finally:
                    efetch.close()
                # Each line is added exactly once; the previous code appended
                # the final line a second time after the loop.
                sequence = "".join(chunks)

                organism = id_info[id_info.find("[") + 1:id_info.find("]")].split()
                # Handles one-word organisms too, which used to leave
                # ``species`` undefined or stale from the previous hit.
                species = "_".join(organism[:2])

                yield BlastResult(hit_id[1], species, sequence, prev_eval, coverage)
Esempio n. 15
0
 def get_gb_info(self, resultshandle):
     """Extract the GenBank record IDs, the hit positions, and the sequence
     orientations from the BLAST report.

     For every alignment, the values of its first HSP are appended to
     ``self.gb_ids``, ``self.hit_coords`` and ``self.hit_directions``.
     Alignments without any HSP are skipped. ``resultshandle`` is always
     closed before returning.
     """
     try:
         for record in NCBIXML.parse(resultshandle):
             for alignment in record.alignments:
                 if not alignment.hsps:
                     # Nothing to report; previously the values of the
                     # preceding alignment would have been appended again.
                     continue
                 hsp = alignment.hsps[0]
                 # Start and end positions of the hit on the subject.
                 hit_coords = (hsp.sbjct_start, hsp.sbjct_end)
                 # The GenBank ID is the last '|'-separated field; index -2
                 # is used because of the trailing '|' in the XML report.
                 hit_gbid = alignment.title.split('|')[-2]
                 # Relative directions of the sequences.
                 hit_directions = hsp.frame
                 # Append only once all three values are known so the three
                 # lists always stay the same length.
                 self.gb_ids.append(hit_gbid)
                 self.hit_coords.append(hit_coords)
                 self.hit_directions.append(hit_directions)
     finally:
         # Finished with this file, whether or not parsing succeeded.
         resultshandle.close()
Esempio n. 16
0
def parse_blast_xml(xml_filename, query_filename, output_filename, abundance_filename=None):
    """
    Parse the XML output, looking only at the 1st alignment for each query
    Write out in format:

    ID \t COUNT \t LENGTH \t AMBIG \t QSTART \t QEND \t IDEN

    The FASTA queries in ``query_filename`` are read in lockstep with the
    BLAST records, so both files must list the same queries in the same
    order. AMBIG counts the ``N`` and ``?`` characters of the query. Queries
    without any alignment get ``NA`` in the last three columns.

    ``abundance_filename``, when given, is a two-column tab-separated file
    mapping each sequence id to its COUNT; otherwise every COUNT is 1.
    """
    if abundance_filename is None:
        abundance = defaultdict(lambda: 1)
    else:
        with open(abundance_filename) as abundance_file:
            abundance = dict(line.strip().split('\t') for line in abundance_file)

    # All three files are closed on exit, including when a record is missing.
    with open(xml_filename) as xml_file, \
            open(output_filename, 'w') as f, \
            open(query_filename) as h:
        handle = NCBIXML.parse(xml_file)
        f.write("ID\tCOUNT\tLENGTH\tAMBIG\tQSTART\tQEND\tIDEN\n")
        for r in SeqIO.parse(h, 'fasta'):
            ambig = r.seq.count('N') + r.seq.count('?')
            blastout = next(handle)
            if len(blastout.alignments) == 0:  # no match was found!
                qs = qe = iden = 'NA'
            else:
                hsp = blastout.alignments[0].hsps[0]
                qs, qe, iden = hsp.query_start, hsp.query_end, hsp.identities
            f.write("{id}\t{count}\t{len}\t{ambig}\t{qs}\t{qe}\t{iden}\n".format(
                id=r.id, count=abundance[r.id], len=len(r.seq), ambig=ambig,
                qs=qs, qe=qe, iden=iden))
def blast_file_opener(filename, evalue, mismatches, outfile):
    """Write the BLAST hits with an exact number of mismatches to ``outfile``.

    ``filename`` is a BLAST XML output file. An HSP is reported when its
    E-value is below ``evalue`` and its mismatch count (alignment length
    minus identities) equals ``mismatches``. Each reported HSP produces one
    ``title <tab> query <tab> E-value`` line.

    Returns an empty set: nothing is ever added to ``alignment_hits``; it is
    kept so the return value stays what callers already receive.
    """
    E_VALUE_THRESH = float(evalue)
    mismatches = int(mismatches)
    # Defined up front so the return below also works for an XML file
    # without any record.
    alignment_hits = set()

    with open(filename) as result_handle, open(outfile, 'w') as f:
        for blast_record in NCBIXML.parse(result_handle):
            for alignment in blast_record.alignments:
                for hsp in alignment.hsps:
                    if hsp.expect < E_VALUE_THRESH:
                        # Mismatches are the aligned columns that are not identities.
                        mmatches = hsp.align_length - hsp.identities
                        if mmatches == mismatches:
                            f.write("%s\t%s\t%s\n" % (alignment.title,
                                                      blast_record.query,
                                                      str(hsp.expect)))

    return alignment_hits
Esempio n. 18
0
def include_check(blast_result_filename,include_line,seq_name_list,max_mismatch):
    """Find the queries conserved in every requested organism.

    ``include_line`` is an ``' OR '``-separated list of terms containing
    ``(taxid:N)``; for each taxid the BLAST XML file named
    ``blast_result_filename + N`` is read. A query is conserved in an
    organism when one of its HSPs is a perfect match (identities equal to
    the alignment length) with at least ``query_letters - max_mismatch``
    identities.

    Returns a dict whose keys are the names of the queries conserved in all
    requested organisms (every value is 1). ``seq_name_list`` is unused.
    """
    strlist = str(include_line).split(' OR ')
    conserved_in = {}  # query name -> set of taxids in which it is conserved
    requested = set()  # distinct taxids that were asked for

    for valist in strlist:
        txid_num = valist[valist.find('(taxid:')+7:valist.find(')')]
        requested.add(txid_num)
        with open(blast_result_filename+txid_num, "r") as blast_result_file:
            for record in NCBIXML.parse(blast_result_file):
                name = record.query
                min_len = record.query_letters - max_mismatch
                # Biopython names the HSP attribute ``align_length``.
                perfect = any(
                    hsp.identities == hsp.align_length and hsp.identities >= min_len
                    for align in record.alignments
                    for hsp in align.hsps)
                if perfect:
                    # A set records each organism once, however many HSPs
                    # match; a list would skew the comparison below.
                    conserved_in.setdefault(name, set()).add(txid_num)

    results_final = {}
    for name, txids in conserved_in.items():
        if len(txids) == len(requested):
            results_final[name] = 1
    return (results_final)
def parse_blast_xml_for_training(xml_filename, bowtie_filename, output_filename):
    """
    Parse the XML output, looking only at the 1st alignment for each query
    Write out in format:

    Phred   Cycle   B2  B1  B0  Class

    One row is written for every mismatch column (neither a match nor a gap
    on either side) of the first HSP, from the third alignment column on.
    Phred is the quality character at the mismatch minus 33, Cycle adds the
    read's primer offset to the position, B2/B1/B0 are the two preceding
    query bases and the mismatching base, and Class is always ``-``.
    """
    fa_dict = dict((r['ID'], r) for r in BowTieReader(bowtie_filename))

    # Both files are closed on exit, including when a lookup fails.
    with open(output_filename, 'w') as f, open(xml_filename) as xml_file:
        f.write("Phred\tCycle\tB2\tB1\tB0\tClass\n")
        for blastout in NCBIXML.parse(xml_file):
            if len(blastout.alignments) == 0:  # no match was found!
                continue
            hsp = blastout.alignments[0].hsps[0]
            record = fa_dict[blastout.query]
            primer_offset = int(record['offset'])
            # TODO: allow for i < 2 and still get B2, B1
            for i in range(2, len(hsp.match)):
                is_mismatch = (hsp.match[i] == " " and hsp.query[i] != '-'
                               and hsp.sbjct[i] != '-')
                if not is_mismatch:
                    continue
                # Position within the read; adding primer_offset gives the
                # global position (the leftover pdb breakpoint that halted
                # here on every mismatch has been removed).
                read_pos = i + hsp.query_start - 1
                f.write("\t".join([
                    str(ord(record['qual'][read_pos]) - 33),
                    str(read_pos + primer_offset),
                    hsp.query[i-2],
                    hsp.query[i-1],
                    hsp.query[i],
                    '-',
                ]) + "\n")
Esempio n. 20
0
def parse_online_blast (seq_list):
    """Run an online BLAST for ``seq_list`` and pass every HSP to the filter.

    All BLAST records are parsed up front. For each HSP the percentage
    identity, the two identifiers taken from the alignment title and the
    taxon information are gathered and handed to ``filter_hits`` as a list
    of strings. Taxon lookups are cached per identifier.
    """
    # Cache of taxon lookups, keyed by the first identifier of the hit title.
    taxon_dic = {}
    blast_handle = online_blast(seq_list)

    # Use the Biopython XML parser to read all results at once.
    logging.debug('Parsing blast result XML file.')
    blast_list = list(NCBIXML.parse(blast_handle))

    for blast_result in blast_list:
        for alignment in blast_result.alignments:
            for hsp in alignment.hsps:
                # Percentage identity over the length of the match line.
                identity = float(hsp.identities/(len(hsp.match)*0.01))

                # Fields 1 and 3 of the '|'-separated title; the second one
                # is stripped of everything after the first dot.
                gb_num = alignment.title.split('|')[1:4:2]
                gb_num[1] = gb_num[1].split('.')[0]
                lookup_id = gb_num[0]

                # Look the taxon up once per identifier.
                if lookup_id not in taxon_dic:
                    taxon_dic[lookup_id] = obtain_tax(lookup_id)
                taxon = taxon_dic[lookup_id]

                # Gather everything and send it to the filter function.
                hit_fields = [
                    str(blast_result.query),
                    str(alignment.title),
                    str(gb_num[0]),
                    str(gb_num[1]),
                    str(identity),
                    str(len(hsp.query)),
                    str(blast_result.query_length),
                    str(hsp.expect),
                    str(hsp.bits),
                    taxon[0],
                    taxon[1],
                ]
                filter_hits(hit_fields)
Esempio n. 21
0
    def _convertCDNA(self, diff_dir, gene, cDNA, refSeqName, refSeqVer):
        """Shift a cDNA variant onto another version of its reference sequence.

        ``gene[1]`` and ``gene[2]`` hold the target reference name and
        version. When they equal ``refSeqName`` and ``refSeqVer`` the variant
        is returned unchanged. Otherwise the BLAST XML file
        ``<gene[1]>.<refSeqVer>to<gene[2]>.xml`` in ``diff_dir`` is read and
        the offset of its first HSP (``sbjct_start - 1``) is added to the
        variant's position and range bounds. Coordinates that are empty or
        contain ``*`` are left alone.

        Raises Exception when the diff file is missing or has no record,
        alignment or HSP.
        """
        # Should only really differ by version number.
        if gene[1] == refSeqName and gene[2] == refSeqVer:
            return cDNA

        diff_name = os.path.join(diff_dir, "%s.%sto%s.xml" % (gene[1], refSeqVer, gene[2]))
        if not(os.path.isfile(diff_name)):
            raise Exception("No BLAST xml diff file for %s from %s to %s" % (gene[1], refSeqVer, gene[2]))
        with open(diff_name) as f:
            blast_records = list(NCBIXML.parse(f))

        if len(blast_records) < 1:
            raise Exception("BLAST xml diff does not have at least one record")
        if len(blast_records[0].alignments) < 1:
            raise Exception("BLAST xml diff does not have at least one alignment")
        if len(blast_records[0].alignments[0].hsps) < 1:
            raise Exception("BLAST xml diff does not have at least one hsps in alignment")

        hsp = blast_records[0].alignments[0].hsps[0]
        offset = hsp.sbjct_start - 1

        parser = Parser()
        variant = parser.parse("", cDNA)

        # Shift every numeric coordinate. The upper bound used to read the
        # undefined name ``variant_range_upper`` and raised NameError.
        for attr in ('position', 'range_lower', 'range_upper'):
            value = getattr(variant, attr)
            if value != '' and value.find('*') < 0:
                setattr(variant, attr, str(int(value) + offset))

        return variant.ToString()
Esempio n. 22
0
def readNcbiXml(infile, options):
    """Collect, for every seed clone, the BLAST hits that pass the filters.

    Records whose query is shorter than ``options.minLen`` are ignored. An
    HSP is kept when positives / len(hsp.query) is at least
    ``options.minPos``, the hit is not the seed itself (same name, or
    ``isSameClone`` says so) and the match line is at least as long as the
    query.

    Returns a dict mapping each clone name to a list of
    ``(hit name, positives, query, match, sbjct)`` tuples sorted by
    decreasing ``getHitSize(hit name)``.
    """
    clone2hits = {}  # key = clone name, val = list of hit tuples

    # The context manager closes the BLAST output once it has been read.
    with open(infile) as rh:
        for record in NCBIXML.parse(rh):  # each seed
            # Ignore seeds shorter than the minimum required length.
            if record.query_length < options.minLen:
                continue
            clone = record.query
            hits = []
            for aln in record.alignments:
                for hit in aln.hsps:  # each hit
                    hit_name = aln.title.split()[-1]
                    # Ignore matches with similarity lower than required.
                    if float(hit.positives)/len(hit.query) < options.minPos:
                        continue
                    # The hit is the seed itself: ignore.
                    if hit_name == clone or isSameClone(clone, hit_name, hit):
                        continue
                    if len(hit.match) < record.query_length:
                        continue
                    hits.append((hit_name, hit.positives, hit.query, hit.match, hit.sbjct))
            # Sort the hits by size, once per seed (the sort is stable, so
            # the result equals re-sorting after every alignment).
            clone2hits[clone] = sorted(hits, key=lambda h: getHitSize(h[0]), reverse=True)
    return clone2hits
def load_align_table(file1,file2):
    """Create one ``Align`` object for every HSP of two BLAST XML files.

    ``init()`` is called first. For each HSP the proteome is looked up with
    ``Proteome.byPid`` from the second ``|``-separated field of the query
    description, and the subject id is taken from the fourth
    ``|``-separated field of the alignment title. Returns None.
    """
    init()
    for path in (file1, file2):
        # Each result file is closed as soon as it has been loaded.
        with open(path) as result_handle:
            for blast_result in NCBIXML.parse(result_handle):
                for alignment in blast_result.alignments:
                    for hsp in alignment.hsps:
                        title = alignment.title
                        a_len = alignment.length
                        ident = hsp.identities
                        q_row = blast_result.query.split('|')
                        pid = int(q_row[1])
                        r = Proteome.byPid(pid)
                        s_row = title.split('|')

                        # The object is created for its side effect only;
                        # the instance itself is not needed here.
                        Align(querydesc=blast_result.query,
                              subid=int(s_row[3]),
                              subjectdesc=title,
                              evall=hsp.expect,
                              score=hsp.score,
                              align_length=a_len,
                              Identity=ident,
                              queryseq=hsp.query,
                              match=hsp.match,
                              subjctseq=hsp.sbjct,
                              ident_percent=(ident/float(a_len)),
                              proteome=r)
Esempio n. 24
0
def get_ids(filename, dir, ethresh = 0.01):
    """Save the accessions of the significant hits of one BLAST XML file.

    Reads ``<dir>/BLAST/<filename>`` and uses only its first record. For
    every HSP with an E-value below ``ethresh`` whose alignment title
    matches the expected pattern, an ``(accession, short species)`` pair is
    collected; the short species is the first letter of the genus plus the
    first three letters of the species. The pairs are passed through
    ``filter_species`` together with the first four characters of
    ``filename``, and the remaining accessions are written one per row to
    ``<dir>/accs/<record_name(filename)>.csv``.
    """
    eValueThresh = ethresh
    # Parse everything while the file is open, then release the handle.
    with open(os.path.join(dir,"BLAST",filename),"r") as result:
        blast_records = list(NCBIXML.parse(result))
    record = blast_records[0]

    hits = []
    for alignment in record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < eValueThresh:
                title = alignment.title
                mdata = re.match( r'.*[A-Z|a-z]{2,3}\|(.*?)\|.*?\[([A-Z])\S* ([A-Z|a-z]{3}).*\].*?', title)
                if mdata is not None:
                    # Drop the version suffix of the accession.
                    accession = re.match(r'([A-Z|a-z|_|0-9]*)\..*', mdata.group(1))
                    acc = str(accession.group(1))
                    genus = str(mdata.group(2)[0])
                    species = str(mdata.group(3)[:3])
                    hits.append((acc, genus + species))

    spec = filename[0:4]
    filteredHits = filter_species(hits,spec)

    # Save the results as a separate file for each species.
    with open(os.path.join(dir,"accs",record_name(filename)+".csv"),'w') as csvfile:
        blasthits = csv.writer(csvfile)
        for each in filteredHits:
            blasthits.writerow([each[0]])
def blast_reads(blast_string, reads, outfh):
	"""Run blastn on ``blast_string`` and write the aligned read segments as FASTQ.

	The query text is fed to blastn on standard input, the XML report is
	written to a fixed temporary path and then parsed. For every HSP the
	matching slice of the read's sequence and quality strings is written to
	``outfh`` as a record named ``<query>:<n>``, where ``n`` restarts at 1 for
	each alignment. When the subject coordinates run backwards the slice is
	reversed (not complemented) before being written.

	:param blast_string: query text passed to blastn on stdin
	:param reads: mapping from query name to an object with ``sequence`` and
		``quality`` attributes
	:param outfh: writable handle receiving the FASTQ records
	"""
	database = '/Users/sw10/Dropbox/Sanger/blastdb/ebola/Zaire_ebolavirus_KM034562'
	blastn_binary = '/Applications/ncbi-blast-2.2.29+/bin/blastn'
	xml_outfile = '/tmp/test.xml'
	evalue = 0.01
	cline = NcbiblastnCommandline(cmd=blastn_binary, out=xml_outfile, outfmt=5, query="-", db=database, evalue=evalue, max_target_seqs=1, num_threads=1)
	_blast_stdout, _blast_stderr = cline(blast_string)

	with open(xml_outfile, 'r') as blast_handle:
		for blast_record in NCBIXML.parse(blast_handle):
			name = blast_record.query
			read = reads
			for alignment in blast_record.alignments:
				for count, hsp in enumerate(alignment.hsps, 1):
					# NOTE(review): query_start/query_end are used directly as
					# slice bounds; if they are 1-based inclusive BLAST
					# coordinates the first aligned base is dropped -- confirm.
					seq = read[name].sequence[hsp.query_start:hsp.query_end]
					qual = read[name].quality[hsp.query_start:hsp.query_end]
					if hsp.sbjct_start > hsp.sbjct_end:
						seq = ''.join(reversed(seq))
						qual = ''.join(reversed(qual))
					outfh.write('@%s:%d\n%s\n+\n%s\n' % (name, count, seq, qual))
	os.remove(xml_outfile)
Esempio n. 26
0
def parse_BLAST(blast_results, tol):
    """
    Using NCBIXML parse the BLAST results, storing & returning good hits

    Here good hits are:
        * hsp.identities/float(record.query_length) >= tol

    The hit name is the second comma-separated field of the query
    description, stripped of surrounding whitespace. One entry is appended
    per qualifying HSP, so a name can appear several times.

    :param blast_results: full path to a blast run output file (in XML
                          format); a leading ``~`` is expanded
    :param tol: the cutoff threshold (see above for explanation)

    :type blast_results: string
    :type tol: float

    :rtype: list of satisfying hit names
    :raises SystemExit: with status 1 if the results file does not exist
    """
    # Expand once and use the same path for both the check and the open, so a
    # "~/..." argument that passes the check can actually be opened.
    results_path = os.path.expanduser(blast_results)
    if not os.path.isfile(results_path):
        sys.stderr.write("BLAST results do not exist. Exiting.\n")
        sys.exit(1)

    hits = []
    with open(results_path) as handle:
        for record in NCBIXML.parse(handle):
            for align in record.alignments:
                for hsp in align.hsps:
                    hit_name = record.query.split(',')[1].strip()
                    if hsp.identities/float(record.query_length) >= tol:
                        hits.append(hit_name)
    return hits
def main(argv):
    """Convert a BLAST XML report into a tab-separated table, one row per HSP.

    Options: ``-i/--ifile`` names the BLAST XML input, ``-o/--ofile`` names
    the TSV output, ``-h`` prints the usage line and exits. The output file
    is created and the header row written before the input is opened; if the
    input cannot be opened, "no such file!" is printed and the output is left
    with the header only.

    :param argv: command-line arguments without the program name
    :raises SystemExit: status 2 on an unknown option, default status on ``-h``
    """
    usage = "test.py -i <inputfile> -o <outputfile>"
    inputfile = ''
    outputfile = ''
    try:
        opts, _positional = getopt.getopt(argv,"hi:o:",["ifile=","ofile="])
    except getopt.GetoptError:
        print (usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print (usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg
    print ("Input file is " + inputfile)
    print ("Output file is " + outputfile)

    # Both handles are managed by ``with`` so they are flushed and closed even
    # if parsing fails part-way through.
    with open(outputfile, 'w') as outfile:
        outfile.write("qseqid\tqseqdef\tqframe\tqlen\tqstart\tqend\tsseqid\tsseqdef\tslen\tsstart\tsend\tidentity\tpositive\tgaps\talign_len\teValue\n")
        try:
            with open(inputfile) as result_handle:
                for record in NCBIXML.parse(result_handle):
                    for alignment in record.alignments:
                        for hsp in alignment.hsps:
                            fields = [record.query_id,record.query,str(hsp.frame),str(record.query_length),str(hsp.query_start),str(hsp.query_end),alignment.hit_id,alignment.hit_def,str(alignment.length),str(hsp.sbjct_start),str(hsp.sbjct_end),str(hsp.identities),str(hsp.positives),str(hsp.gaps),str(hsp.align_length),str(hsp.expect)]
                            outfile.write("\t".join(fields) + "\n")
        except IOError:
            print ("no such file!")
Esempio n. 28
0
def blast_xml_to_gff3(file_in,file_out,blast_type):
    """Write the first significant HSP of every BLAST query as a GFF3 line.

    For each query record in ``file_in`` the alignments and their HSPs are
    scanned in order and the first HSP with an e-value below 0.04 is written
    to ``file_out`` as a ``match_part`` feature; queries without such an HSP
    produce no line. The strand column is ``hsp.strand[0]`` or ``?`` when it
    is ``None``; the phase column is ``hsp.frame[0]`` or ``.`` when it is
    ``None``. Semicolons and spaces in the alignment title are replaced by
    underscores in the ``ID`` and ``Alias`` attributes.

    :param file_in: path of the BLAST XML report
    :param file_out: path of the GFF3 file to create
    :param blast_type: value written to the GFF3 source column
    """
    E_VALUE_THRESH = 0.04
    # Open the input first so a missing report fails before the output file
    # is created; both handles are closed when the block exits.
    with open(file_in) as result_handle, open(file_out,"w") as f:
        f.write("##gff-version 3"+"\n")
        for blast_record in NCBIXML.parse(result_handle):
            written = False
            for alignment in blast_record.alignments:
                for hsp in alignment.hsps:
                    if hsp.expect < E_VALUE_THRESH:
                        strand = "?" if hsp.strand[0] is None else str(hsp.strand[0])
                        phase = "." if hsp.frame[0] is None else str(hsp.frame[0])
                        alias = alignment.title.replace(";","_").replace(" ","_")
                        attributes = ("ID=" + blast_record.query + ":" + alias + ";" +
                                      "Parent=" + blast_record.query + ";" +
                                      "Name=blast_hsp;" +
                                      "Alias=" + alias)
                        columns = [blast_record.query,
                                   str(blast_type),
                                   "match_part",
                                   str(hsp.query_start),
                                   str(hsp.query_end),
                                   str(hsp.score),
                                   strand,
                                   phase,
                                   attributes]
                        f.write("\t".join(columns) + "\n")
                        written = True
                        break
                if written:
                    # Only one line per query is ever emitted.
                    break
Esempio n. 29
0
def blast_align(fasta,blast_path,linker_db):
    """Run blastn for ``./temp/<fasta>`` against ``linker_db`` and parse the XML.

    The report is written through a shell redirect to
    ``./temp/<stem>_blast_linker.xml``, where ``<stem>`` is the part of
    ``fasta`` before its first dot. Neither the FASTA nor the XML file is
    removed afterwards.

    :param fasta: FASTA file name inside ``./temp``
    :param blast_path: blastn executable to invoke
    :param linker_db: BLAST database passed to ``-db``
    :return: the iterator produced by ``NCBIXML.parse`` over the still-open
        XML file
    """
    stem = fasta.split(".")[0]
    xml_path = "./temp/" + stem + "_blast_linker.xml"
    command = "".join([
        blast_path,
        " -task blastn -outfmt 5 -num_threads 6 -evalue 0.1 -db ",
        linker_db,
        " -query ./temp/",
        fasta,
        " > ",
        xml_path,
    ])
    os.system(command)
    return NCBIXML.parse(open(xml_path))
def main():
    """BLAST the pre-miRNA FASTA and write the parsed hits to ``results.fasta``.

    For every record in ``gg_pre_mirna.fasta`` the record is written to
    ``Temp.txt``, ``blastn -task blastn-short`` is run, a ``*****`` delimited
    section headed by the record id is written to the results file, and each
    BLAST record of ``BLAST_result.xml`` is handed to ``parsefile`` together
    with the results writer. Fields and sections are terminated with ``\\r``.
    """
    records = SeqIO.parse('gg_pre_mirna.fasta', 'fasta')
    # Context managers guarantee the results file and every per-record XML
    # handle are closed, including when BLAST parsing raises.
    with open('results.fasta', 'w') as writer:
        writer.write('Organism_name' + '\t' + 'Query_start' + '\t' + 'Query_end' + '\t' + 'Subject_start' + '\t' +
                     'Subject_end' + '\r') # Writes the header for the results file
        for record in records:
            with open('Temp.txt', 'w') as tempWriter:
                tempWriter.write('>' + record.id + '\n')
                tempWriter.write(str(record.seq) + '\n')
            # NOTE(review): the query is the complete input FASTA, not the
            # per-record Temp.txt written above, so every iteration re-runs
            # BLAST on all sequences -- confirm whether ``-query Temp.txt``
            # was intended before changing it.
            os.system('blastn -task blastn-short -query gg_pre_mirna.fasta -db Input/gg_db -out BLAST_result.xml -outfmt "5" ')
            with open('BLAST_result.xml') as result_handle:
                writer.write('\r' + '*****' + '\r')
                writer.write(record.id + '\r' + '*****' + '\r')
                for blast_record in NCBIXML.parse(result_handle):
                    parsefile(blast_record,writer)
    print('Finished!')
def process_blast_data(xml_filename:str, species:str, sequence_length:int):
    """Summarise, per query, how many BLAST hits mention ``species``.

    HSPs are collected for each query in report order until the first HSP
    whose e-value is not below 1; that HSP and everything after it for the
    same query are ignored. Each collected hit keeps the alignment title and
    the first ``sequence_length`` characters of the aligned query string.
    Queries without any collected hit do not appear in the result.

    :param xml_filename: path of the BLAST XML report
    :param species: substring looked for in each alignment title
    :param sequence_length: number of leading characters of ``hsp.query`` kept
    :return: ``pandas.DataFrame`` with one row per query and the columns
        ``query``, ``match_count``, ``mismatch_count``, ``mismatch_titles``
        and ``mismatch_sequences``
    """
    E_VALUE_THRESH = 1

    # Processing Blast Data; the handle is closed once every record is read.
    blast_save = {}
    with open(xml_filename, 'r') as result_handle:
        for sequence in NCBIXML.parse(result_handle):
            unique = True
            for alignment in sequence.alignments:
                for hsp in alignment.hsps:
                    if not hsp.expect < E_VALUE_THRESH:
                        # Sticky: nothing later in this query is accepted.
                        unique = False
                    if unique:
                        blast_save.setdefault(sequence.query, []).append(
                            (alignment.title, hsp.query[0:sequence_length]))

    match_mismatch_list = []
    for (query, matches) in blast_save.items():
        mismatches = [(title, sequence) for (title, sequence) in matches
                      if species not in title]
        match_mismatch_list.append({
            "query": query,
            "match_count": len(matches) - len(mismatches),
            "mismatch_count": len(mismatches),
            "mismatch_titles": [title for (title, _) in mismatches],
            "mismatch_sequences": [sequence for (_, sequence) in mismatches],
        })
    matches_df = pd.DataFrame(match_mismatch_list)
    return(matches_df)
Esempio n. 32
0
def parse_blast_results(blast_xml_files):
	"""Print every HSP of every BLAST XML file in ``blast_xml_files``.

	For each file the path is printed first; then, for every HSP of every
	alignment of every record, the HSP itself, the alignment title and the
	HSP e-value are printed.

	:param blast_xml_files: iterable of paths to BLAST XML reports
	"""
	for xml_path in blast_xml_files:
		print(xml_path)
		with open(xml_path, 'r') as xml_handle:
			for record in NCBIXML.parse(xml_handle):
				hit_pairs = (
					(hit, segment)
					for hit in record.alignments
					for segment in hit.hsps
				)
				for hit, segment in hit_pairs:
					print(segment)
					print("sequence:", hit.title)
					print("e value:", segment.expect)
Esempio n. 33
0
def net_blast(query_record, program='blastn', database='nr'):
    """
    Perform a BLAST search over the net with the given program and database.

    Before searching, the query, program, alphabet and database are
    validated against the module-level ``searches``, ``alphabets``,
    ``protein_db`` and ``nucleotide_db`` tables.

    ARGUMENTS
    query_record: a SeqRecord object containing the query sequence
    program: the program to use (case-insensitive), as per:
        http://www.ncbi.nlm.nih.gov/BLAST/blast_program.shtml
    database: the db to query, as per:
        http://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=ProgSelectionGuide#db

    RETURNS
    the iterator produced by NCBIXML.parse over the qblast result

    RAISES
    ValueError: if the query is not a SeqRecord, is shorter than 10
        residues, or if the program, alphabet or database is not acceptable
    """
    # --- query checks ---
    if not isinstance(query_record, SeqRecord):
        raise ValueError(u'Invalid Search Item')
    if len(query_record.seq) < 10:
        raise ValueError(u"Query sequence is too short")

    # --- program check ---
    program = program.lower()
    if program not in searches:
        raise ValueError(u"Invalid Program '%s'" % program)

    # --- alphabet and database checks ---
    required_alpha, required_dbs = searches[program]
    if not isinstance(query_record.seq.alphabet, required_alpha):
        raise ValueError(u"Query alphabet for '%s' must be '%s'" %
                         (program, alphabets[program]))
    if database not in protein_db and database not in nucleotide_db:
        raise ValueError(u"Invalid database '%s'" % database)
    if database not in required_dbs:
        raise ValueError(u"Database '%s' cannot be used with program '%s'" %
                         (database, program))

    # All checks passed: run the remote search and hand back the parser.
    search_handle = NCBIWWW.qblast(program,
                                   database,
                                   query_record.seq,
                                   format_type='XML')
    return NCBIXML.parse(search_handle)
Esempio n. 34
0
def blast_in_handle(handle, b='guess', log=True):
    """
    Parse BLAST output from an open handle, optionally guessing its format.

    With ``b='guess'`` the first line is inspected and the handle rewound:
    a line starting with ``BLASTN <version>`` selects the plain-text parser,
    a line containing ``<?xml version`` selects the XML parser.

    :param handle: seekable, readable handle with the BLAST output
    :param b: ``'plain'``, ``'xml'`` or ``'guess'``
    :param log: when true, progress and errors are reported through ``ml``
    :returns: list with one parsed entry per query, or ``None`` when the
        format cannot be guessed or ``b`` is not a known type
    """
    if log:
        ml.debug(fname())

    b_type = b
    if b == 'guess':
        first_line = handle.readline()
        handle.seek(0, 0)  # rewind so the parser sees the whole input
        if re.search(r'^BLASTN \d+\.\d+\.\d+', first_line):
            # looks like the plain-text report header
            b_type = 'plain'
            if log:
                ml.info('Inferred BLAST format: txt.')
        elif re.search(r'<\?xml version', first_line):
            b_type = 'xml'
            if log:
                ml.info('Inferred BLAST format: xml.')
        else:
            if log:
                ml.error(
                    'Could not guess the BLAST format, preferred format is NCBI xml.'
                )
            return None

    if b_type == 'plain':
        return list(blast_parse_txt(handle))
    if b_type == 'xml':
        return list(NCBIXML.parse(handle))

    if log:
        ml.error('BLAST type not known: allowed types: plain, xml, guess')
    return None
Esempio n. 35
0
def runBlastParser(cline, bOutFile):
    """Run a blastp command line and return a parser over its XML output.

    The command is only executed when the path of the ``blastp`` executable
    found on ``PATH`` occurs in ``str(cline)``.

    :param cline: command line (any object whose ``str()`` is the command)
    :param bOutFile: path of the XML file the command writes
    :return: the iterator produced by ``NCBIXML.parse``; the underlying file
        stays open because the records are read lazily by the caller
    :raises Exception: if ``blastp`` is not on ``PATH`` or its path does not
        occur in the command line
    """
    blastp_path = shutil.which("blastp")
    # ``which`` returns None when blastp is missing; stringifying that would
    # let any command containing the text "None" pass the check.
    if blastp_path is None or blastp_path not in str(cline):
        raise Exception("Blastp path not found in command line argument")

    os.system(str(cline))
    rec = open(bOutFile)
    return NCBIXML.parse(rec)
Esempio n. 36
0
    def parse(self, blast_record):
        """Parse the XML file behind ``blast_record`` and attach the results.

        Every record read from ``blast_record.get_filename()`` is converted
        with ``self._parse_record``. A ``ValueError`` raised while reading is
        printed together with the file name and stops the loop; whatever was
        converted before the error is kept.

        :param blast_record: object exposing ``get_filename()``; its
            ``_records`` attribute is overwritten with the parsed list
        :return: the same ``blast_record`` object
        """
        parsed = []

        with open(blast_record.get_filename()) as xml_handle:
            try:
                for raw_record in NCBIXML.parse(xml_handle):
                    parsed.append(self._parse_record(raw_record))
            except ValueError as error:
                print(error, blast_record.get_filename())

        blast_record._records = parsed
        return blast_record
Esempio n. 37
0
    def parse_secondary(self):
        """Write the sequences of ``self.substract`` that have no hit in ``secondary.xml``.

        A query id (taken from ``ParseDefline(record.query).id``) counts as
        positive once for every description whose e-value is at most
        ``self.expect_diff``. The list of positives is printed, and every
        record of the FASTA file ``self.substract`` whose key is not positive
        is written to ``unique.faa``.
        """
        positives = []
        # All three files are opened with ``with`` so they are closed (and
        # the output flushed) even if parsing or writing fails.
        with open('secondary.xml') as xml_handle:
            for record in NCBIXML.parse(xml_handle):
                title = ParseDefline(record.query)
                for desc in record.descriptions:
                    if desc.e <= self.expect_diff:
                        positives.append(title.id)
        # Parenthesised single-argument form prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print(positives)

        with open(self.substract) as fasta_handle:
            hipster = SeqIO.to_dict(SeqIO.parse(fasta_handle,'fasta'))
        unique = list(set(hipster.keys())-set(positives))
        with open('unique.faa','wb') as handle:
            for key in unique:
                SeqIO.write(hipster[key],handle,'fasta')
Esempio n. 38
0
def conservation_check(blast_result_file,conservation_list,seq_name_list):
    """Build a per-sequence, per-name table of which query end each hit covers.

    The query description is split on ``_``; field 1 is the 1-based row
    number and field 2 the query kind (``HTH``, ``HTT`` or ``TTH``). For
    every HSP of an alignment whose ``hit_def`` contains a name from
    ``conservation_list`` the cell ``results[row - 1][column]`` is updated:

    * "head" hit: ``query_start == 1`` and ``query_end >= 21``
    * "tail" hit: ``query_start <= query_length - 21`` and
      ``query_end == query_length``

    ``HTT`` sets the cell to 1 on a tail hit, ``TTH`` sets it to -1 on a head
    hit. ``HTH`` moves 0 -> -1 on a head hit (checked first), 0 -> 1 on a
    tail hit, and either -1 or 1 -> 2 once the opposite end is also hit.

    :param blast_result_file: path of the BLAST XML report
    :param conservation_list: names searched for in each ``hit_def``
    :param seq_name_list: only its length is used (number of result rows)
    :return: list of ``len(seq_name_list)`` rows with
        ``len(conservation_list)`` integer cells each, initially 0
    """
    results = [[0] * len(conservation_list) for row in range(len(seq_name_list))]
    # The report handle is closed as soon as all records have been processed.
    with open(blast_result_file) as handle:
        for record in NCBIXML.parse(handle):
            if not record.alignments:
                continue
            fields = record.query.split("_")
            num_text = fields[1]
            kind = fields[2]
            num = int(num_text, 10)
            for align in record.alignments:
                for hsp in align.hsps:
                    covers_head = (hsp.query_start == 1
                                   and hsp.query_end >= 21)
                    covers_tail = (hsp.query_start <= record.query_length - 21
                                   and hsp.query_end == record.query_length)
                    for index in range(len(conservation_list)):
                        if align.hit_def.find(conservation_list[index]) == -1:
                            continue
                        row = results[num - 1]
                        if kind == 'HTH':
                            # Branch order matters when a single HSP covers
                            # both ends of a still-unmarked cell.
                            if covers_tail and row[index] == -1:
                                row[index] = 2
                            elif covers_head and row[index] == 1:
                                row[index] = 2
                            elif covers_head and row[index] == 0:
                                row[index] = -1
                            elif covers_tail and row[index] == 0:
                                row[index] = 1
                        elif kind == 'HTT':
                            if covers_tail:
                                row[index] = 1
                        elif kind == 'TTH':
                            if covers_head:
                                row[index] = -1
    print("doneconservation")
    return results
Esempio n. 39
0
    def __enter__(self):
        """Open the BLAST XML source for ``self.target`` and return a parser over it.

        The target type is taken from ``self._guess_filetype``:

        * ``'fasta'``: a BLAST database is built from the target, the search
          is run and ``self.blastout`` is opened;
        * ``'blastdb'``: the target is set as the command's database, the
          search is run and ``self.blastout`` is opened;
        * ``'xml'``: the target itself is opened.

        The opened handle is stored in ``self.file_in``.

        :return: the iterator produced by ``NCBIXML.parse(self.file_in)``
        """
        ftype = self._guess_filetype(self.target)

        if ftype in ('fasta', 'blastdb'):
            if ftype == 'fasta':
                self.makeblastdb(self.target, self.dbtype, self.blastdb)
            else:
                # Already a BLAST database: point the command straight at it.
                self.cmd.db = self.target
            self.run_blast(self.cmd)
            self.file_in = open(self.blastout, 'r')
        elif ftype == 'xml':
            self.file_in = open(self.target, 'r')
        # NOTE(review): any other file type leaves self.file_in unassigned by
        # this method -- confirm _guess_filetype cannot return one.
        return NCBIXML.parse(self.file_in)
Esempio n. 40
0
def blast_file(fasta_path, blast_db='nt', parser=basic_parser):
    """Submit a FASTA file to NCBI BLAST and parse every returned record.

    The file content is sent through ``NCBIWWW.qblast`` using the
    module-level ``BLAST_PROG`` with ``megablast=True``; each record of the
    reply is converted with ``parser``.

    :param fasta_path: path of the FASTA file to submit
    :param blast_db: name of the NCBI database to search
    :param parser: callable applied to each BLAST record
    :return: list with one ``parser`` result per record
    """
    logging.info("Running BLAST {}".format(fasta_path))
    # Read the query inside ``with`` so the file handle is not leaked.
    with open(fasta_path, 'r') as fasta_handle:
        fasta_string = fasta_handle.read()
    logging.debug(fasta_string)
    result_handle = NCBIWWW.qblast(BLAST_PROG,
                                   blast_db,
                                   fasta_string,
                                   megablast=True)
    logging.info("BLAST returned")
    try:
        # NCBIXML.parse is lazy: the records are only read while iterating,
        # so the handle must stay open until the list is complete.
        results = [parser(single_record)
                   for single_record in NCBIXML.parse(result_handle)]
    finally:
        result_handle.close()
    # Logged after the iteration, i.e. once the reply has really been parsed.
    logging.info("Analyzed BLAST")
    return results
Esempio n. 41
0
def store_and_parse_blast_results(file_name, blast_output_handle_list):
    """
    Stores blast results to file and then parses the file to a list we can
    reuse.  There may be a better way to do this, but we can only iterate
    the blast handle once, and saving to disk lets us avoid rerunning the
    same jobs on NCBI's servers while developing and debugging this script.

    :param file_name: path of the file the raw results are written to
    :param blast_output_handle_list: iterable of readable handles; each one
        is read completely, appended to the file and closed
    :return: list of the records parsed back from ``file_name``
    """
    with open(file_name, 'w') as blast_output_file:
        for blast_output_handle in blast_output_handle_list:
            try:
                blast_output_file.write(blast_output_handle.read())
            finally:
                # Release the source handle even if reading or writing fails.
                blast_output_handle.close()
    # Parse the blast results file and convert it to a list we can reuse;
    # materialising the list inside ``with`` lets the file be closed safely.
    with open(file_name, 'r') as stored_results:
        return list(NCBIXML.parse(stored_results))
Esempio n. 42
0
 def blastp(self, fasta, pdb_db):
     """
     Run BLASTP locally for a FASTA file against a BLASTP database.

     The search uses an e-value cutoff of 0.001 and XML output, which is
     captured from standard output and parsed in memory.

     :param fasta: The absolute path to a FASTA file
     :param pdb_db: A BLASTP database
     :return: the first record yielded by ``NCBIXML.parse``
     """
     logger.debug('Running blastp ' + fasta)
     blastp_options = dict(cmd='blastp',
                           query=fasta,
                           db=pdb_db,
                           evalue=0.001,
                           outfmt=5)
     run_blastp = NcbiblastpCommandline(**blastp_options)
     xml_text, _unused_stderr = run_blastp()
     return next(NCBIXML.parse(StringIO(xml_text)))
def special_gene(target_fie, database, gene_list):
    """BLAST a target file against a gene database and report the best hit per gene.

    A nucleotide BLAST database is built from ``database/<basename>``, the
    target is searched against it with blastn (XML output) and, for every
    gene name in ``gene_list``, the bit scores of all HSPs with an e-value
    below 1e-10 are summed per alignment whose ``hit_def`` contains the
    name. The best-scoring alignment is reported when its score is at least
    10% of the average length of the database sequences whose description
    contains the gene name; otherwise the gene is reported as absent. The
    BLAST database files and the XML report are removed at the end.

    :param target_fie: path of the query FASTA file (parameter name kept
        as-is for backward compatibility)
    :param database: path of the gene FASTA; only its basename is used and
        it is looked up inside ``database/``
    :param gene_list: iterable of gene names to look for
    """
    # The body used the undefined name ``target_file``; bind it to the
    # parameter so the function no longer depends on a global of that name.
    target_file = target_fie
    database = database.split("/")[-1]
    xml_file = database + '_vs_' + target_file + '.xml'
    os.system('makeblastdb -in database/' + database + ' -out ' + database +
              '_db -dbtype nucl')
    os.system('blastn -query ' + target_file + ' -db ' + database +
              '_db -out ' + xml_file + ' -outfmt 5')
    with open(xml_file) as result_handle:
        records = list(NCBIXML.parse(result_handle))
    E_thresh = 1e-10
    for x in gene_list:
        length_list = [len(y.seq)
                       for y in SeqIO.parse("database/" + database, "fasta")
                       if x in y.description]
        if not length_list:
            # No database sequence carries this gene name, so there is no
            # average length to compare against (previously a division by
            # zero); report the gene as absent.
            print("$$$No  %s exists" % (x,))
            continue
        aver_len = float(sum(length_list)) / len(length_list)

        # Maps "hit_def:score" -> score, as in the original bookkeeping.
        scorelist = {}
        for record in records:
            for alignment in record.alignments:
                if x in alignment.hit_def:  # multi gene database, so...
                    print("%s got a hit, evaluating the hit quality..." % (x,))
                    score = sum(hsp.bits for hsp in alignment.hsps
                                if hsp.expect < E_thresh)
                    scorelist[alignment.hit_def + ':' + str(score)] = score

        score = 0
        First_Choice = None
        for Htype in scorelist:
            if scorelist[Htype] > score:
                First_Choice = Htype
                score = scorelist[Htype]
        if First_Choice is not None and float(score) >= 0.1 * aver_len:
            print("$$$ %s got a hit, score: %s" % (First_Choice, score))
        else:
            print("$$$No  %s exists" % (x,))
    os.system("rm " + database + "_db.*")
    os.system("rm " + xml_file)
Esempio n. 44
0
def XMLparse(Path):
    """Count, per query, the alignments with a very strong hit and export them.

    For every record of the BLAST XML file an alignment is counted when at
    least one of its HSPs has an e-value of at most 10**-40. The resulting
    ``{query: count}`` mapping is passed to ``writeCSV`` together with
    ``Path`` and the value ``writeCSV`` returns is printed.

    :param Path: path of the BLAST XML report
    :return: always an empty list
    """
    E_VALUE = pow(10,-40)
    hitsDict = {}
    # Closing the file itself (not just the parser generator) releases the
    # operating-system handle.
    with open(Path) as handle:
        for blast_record in NCBIXML.parse(handle):
            hitsDict[blast_record.query] = sum(
                1 for alignment in blast_record.alignments
                if any(hsp.expect <= E_VALUE for hsp in alignment.hsps))
    print(writeCSV(Path, hitsDict))
    return[]
Esempio n. 45
0
def Filter_Readouts_by_Genome(cand_readout_file='selected_candidates.fasta', 
                              genome_db='hg38',
                              readout_folder=_readout_folder, genome_folder=_genome_folder,
                              word_size=10, evalue=1000, save_postfix='genome', 
                              verbose=True):
    """Filter readout candidates by blasting them against a genome.

    Inputs:
        cand_readout_file: FASTA file with the candidates; if it does not
            exist as given it is looked up inside ``readout_folder``
        genome_db: name of the BLAST database inside ``genome_folder``
        readout_folder: folder searched for ``cand_readout_file``
        genome_folder: folder holding the BLAST database
        word_size: blastn word size
        evalue: NOTE(review) currently not forwarded -- the search has always
            run with ``evalue=500``; confirm before honouring this parameter,
            since its default (1000) would change the results
        save_postfix: suffix inserted before ``.fasta`` in the output name
        verbose: print per-record and summary information
    Outputs:
        list of the SeqRecords kept; they are also written to
        ``<cand_readout_file without .fasta>_<save_postfix>.fasta``
    Raises:
        IOError: if the candidate file cannot be found or an existing path
            does not contain ``.fasta``
    """
    if not os.path.isfile(cand_readout_file):
        cand_readout_file = os.path.join(readout_folder, cand_readout_file)
        if not os.path.isfile(cand_readout_file):
            raise IOError(f"Wrong input candidate readout file:{cand_readout_file}, not exist.")
    elif '.fasta' not in cand_readout_file:
        raise IOError(f"Wrong input file type for {cand_readout_file}")
    # blast!
    blast_outfile = cand_readout_file.replace('.fasta', f'_{genome_db}.xml')
    NcbiblastnCommandline(query=cand_readout_file,
                          num_threads=12,
                          db=os.path.join(genome_folder, genome_db),
                          evalue=500,
                          word_size=word_size,
                          out=blast_outfile,
                          outfmt=5)()
    # decide which to keep; read the report BLAST just wrote rather than a
    # hard-coded file name, so non-default inputs are filtered correctly
    genomeblast_keeps = []
    with open(blast_outfile, 'r') as blast_handle:
        for blast_record in NCBIXML.parse(blast_handle):
            if verbose:
                print(blast_record.query_id, len(blast_record.alignments))
            keep = filter_readouts_by_blast(blast_record, verbose=verbose)
            genomeblast_keeps.append(keep)

    # save all 
    with open(cand_readout_file, "r") as handle:
        record_keeps = [record
                        for _i, record in enumerate(SeqIO.parse(handle, "fasta"))
                        if genomeblast_keeps[_i]]
    save_filename = cand_readout_file.replace('.fasta', f'_{save_postfix}.fasta')
    with open(save_filename, "w") as output_handle:
        SeqIO.write(record_keeps, output_handle, "fasta")
    if verbose:
        print(f"-- number of readout candidates kept: {len(record_keeps)}")
    
    return record_keeps
Esempio n. 46
0
def blast(seq,
          binary="blastp",
          db="blastdb/FPbase_blastdb.fsa",
          max_hits=30,
          fmt=15,
          **kwargs):
    """Run a bundled BLAST binary for one sequence and return the parsed output.

    The database is (re)built with ``make_blastdb`` when the ``db`` file is
    missing or its directory holds five or fewer entries. The query is
    written to a temporary FASTA file (a ``>query`` header is added when the
    text has none) and the platform-specific binary under ``bin/`` is run
    with one HSP per hit.

    :param seq: sequence text, with or without a FASTA header line
    :param binary: ``"blastp"`` or ``"blastx"``
    :param db: path of the BLAST database file
    :param max_hits: value for ``-max_target_seqs`` (a ``max_target_seqs``
        keyword overrides it)
    :param fmt: value for ``-outfmt`` (an ``outfmt`` keyword overrides it)
    :param kwargs: remaining keywords are appended as ``-key value`` options
    :return: for ``fmt == 5`` a list of ``serialize_record`` results, for
        ``fmt == 15`` the ``BlastOutput2`` entry of the decoded JSON,
        otherwise the decoded raw output
    """
    assert binary in ("blastp", "blastx"), "Unrecognized blast binary"
    db_ready = os.path.isfile(db) and (len(os.listdir(os.path.dirname(db))) > 5)
    if not db_ready:
        make_blastdb(db)

    platform_suffix = "osx" if sys.platform == "darwin" else "nix"
    binary = f"bin/{binary}_" + platform_suffix
    max_hits = kwargs.pop("max_target_seqs", max_hits)
    fmt = kwargs.pop("outfmt", fmt)

    with tempfile.NamedTemporaryFile(suffix=".fsa") as tmp:
        if not seq.startswith(">"):
            seq = ">query\n" + seq
        tmp.write(seq.encode())
        tmp.seek(0)

        cmd = [binary]
        cmd += ["-query", tmp.name]
        cmd += ["-outfmt", str(fmt)]
        cmd += ["-db", db]
        cmd += ["-max_target_seqs", str(max_hits)]
        cmd += ["-max_hsps", "1"]  # only show one alignment for each pair
        cmd += [token
                for key, value in kwargs.items()
                for token in (f"-{key}", str(value))]

        with tempfile.NamedTemporaryFile(suffix=".txt") as outfile:
            cmd += ["-out", outfile.name]
            run(cmd)
            if fmt == 5:
                from Bio.Blast import NCBIXML

                return [serialize_record(r)
                        for r in NCBIXML.parse(outfile.file)]
            elif fmt == 15:
                raw_json = outfile.file.read().decode()
                return json.loads(raw_json).get("BlastOutput2")
            else:
                return outfile.file.read().decode()
Esempio n. 47
0
def blast_offtarget(fasta_string):
    """Count off-target hits for a sequence using BLAST.

    The sequence is written to a file named ``fasta`` inside the
    ``blast_path()`` context and searched with blastn against
    ``refseq_rna``; every tabular output line mentioning ``Homo sapiens``
    counts as one hit.  If the local run raises ``ApplicationError`` the
    search is repeated through ``NCBIWWW.qblast`` (restricted with
    ``txid9606 [ORGN]``) and every alignment in the XML reply is counted.

    Args:
        fasta_string(str): Fasta sequence.

    Returns:
        Offtarget value(int).
    """
    try:
        with blast_path():
            with open('fasta', 'w') as fasta_file:
                fasta_file.write(fasta_string)
            # The trailing space after "staxids" matters: without it the two
            # adjacent literals fuse into the unknown specifier
            # "staxidssscinames".
            cline = NcbiblastnCommandline(
                query="fasta",
                db="refseq_rna",
                outfmt=("'6 qseqid sseqid evalue bitscore sgi sacc staxids "
                        "sscinames scomnames stitle'"))
            stdout, _ = cline()

        return sum(1 for line in stdout.split('\n') if 'Homo sapiens' in line)
    except ApplicationError:
        result_handle = NCBIWWW.qblast("blastn",
                                       "refseq_rna",
                                       fasta_string,
                                       entrez_query="txid9606 [ORGN]",
                                       expect=100,
                                       gapcosts="5 2",
                                       genetic_code=1,
                                       hitlist_size=100,
                                       word_size=len(fasta_string),
                                       megablast=True)
        blast_results = result_handle.read()

        blast_in = cStringIO.StringIO(blast_results)
        return sum(
            len(record.alignments) for record in NCBIXML.parse(blast_in))
Esempio n. 48
0
def blast(query, subj, minoverlap, logger, wd, threads):
    """Align the query sequences against the subject with blastn.

    Both inputs are written as FASTA files inside ``wd``; the files are
    removed again whether or not the BLAST run succeeds.

    Returns a pair ``(bools, positions)``: ``bools`` holds one True/False
    per BLAST record (True when the first HSP of the first alignment has
    more than ``minoverlap`` identities), and ``positions`` is a flat list
    of the query start and end of every accepted HSP.  When BLAST raises
    ``ApplicationError`` both lists are returned empty.
    """
    query_path = os.path.join(wd, 'query.fasta')
    subject_path = os.path.join(wd, 'subj.fasta')
    SeqIO.write(query, query_path, "fasta")
    SeqIO.write(subj, subject_path, "fasta")
    # options: http://www.ncbi.nlm.nih.gov/books/NBK1763/
    blast_options = dict(query=query_path,
                         subject=subject_path,
                         outfmt=5,
                         cmd=blastn,
                         word_size=8,
                         num_threads=threads)
    try:
        cline = NcbiblastnCommandline(**blast_options)
        logger.debug(cline)
        output = cline()[0]
    except ApplicationError:
        # TODO: work out why this is happening, doesn't seem to affect
        #  results though, low priority
        return [], []
    finally:
        os.remove(query_path)
        os.remove(subject_path)

    # One flag per query record, plus start/end coordinates of the accepted
    # matches (recorded to avoid composite sequence problems).
    bools, positions = [], []
    for record in NCBIXML.parse(StringIO(output)):
        best = record.alignments[0].hsps[0] if record.alignments else None
        if best is not None and best.identities > minoverlap:
            bools.append(True)
            positions.extend((best.query_start, best.query_end))
        else:
            bools.append(False)
    return bools, positions
Esempio n. 49
0
def getBLAST(arg):
    """Run an online BLAST search and print the first significant HSP.

    ``arg`` is indexed positionally: items 1-7 supply, in order, the
    program, database, sequence, expect threshold, hit-list size, matrix
    name and alignment count handed to ``NCBIWWW.qblast`` (item 0 is not
    read).

    At most one line is printed: the first HSP whose e-value is below
    0.0001, as ``$``-separated organism name, protein, accession, e-value,
    identities, identity percentage, score and bit score.  Nothing is
    returned.
    """
    response = NCBIWWW.qblast(program=arg[1],
                              database=arg[2],
                              sequence=arg[3],
                              expect=arg[4],
                              hitlist_size=arg[5],
                              matrix_name=arg[6],
                              alignments=arg[7])
    parsed_results = NCBIXML.parse(response)

    e_value_cutoff = 0.0001
    already_printed = False
    alignments_seen = 0

    for blast_result in parsed_results:
        for alignment in blast_result.alignments:
            for hsp in alignment.hsps:
                if hsp.expect < e_value_cutoff and not already_printed:
                    # Header of the BLAST result
                    header = str(alignment.title)
                    # Organism name: the text inside the first [...] pair
                    bracketed = header.split('[', 1)[1]
                    name = bracketed.split(']')[0].split('>')[0]
                    protein = header.split('|')[4].split('[')[0]
                    accession = alignment.title.split('|')[3]
                    identity_pct = (float(hsp.identities) /
                                    float(len(hsp.query)) * 100.0)
                    fields = (name, protein, accession, hsp.expect,
                              hsp.identities, identity_pct, hsp.score,
                              hsp.bits)
                    print("$".join(str(field) for field in fields))
                    already_printed = True

                if already_printed:
                    break
            alignments_seen += 1
        # Stop after a record only when exactly one alignment has been
        # visited in total so far.
        if alignments_seen == 1:
            break
Esempio n. 50
0
def parse_BLAST(blast_results, tol, careful):
    """
    Using NCBIXML parse the BLAST results, storing & returning good hits

    Here good hits are:
        * hsp.identities/float(record.query_length) >= tol

    HSPs that miss ``tol`` by no more than ``careful`` are shown to the user,
    who decides interactively (via ``raw_input``) whether to keep them.

    The hit name is the second comma-separated field of the query title.
    If the results file does not exist, a message is written to stderr and
    the process exits with status 1.

    :param blast_results: full path to a blast run output file (in XML
                          format); a leading ``~`` is expanded
    :param tol: the cutoff threshold (see above for explanation)
    :param careful: width of the band below ``tol`` in which hits are
                    confirmed interactively

    :type blast_results: string
    :type tol: float
    :type careful: float

    :rtype: list of satisfying hit names
    """
    # Expand once and use the same path for the check and the open; the
    # check alone used to accept "~/..." paths that open() then rejected.
    results_path = os.path.expanduser(blast_results)
    if not os.path.isfile(results_path):
        sys.stderr.write("BLAST results do not exist. Exiting.\n")
        sys.exit(1)

    hits = []
    with open(results_path) as handle:
        for record in NCBIXML.parse(handle):
            for align in record.alignments:
                for hsp in align.hsps:
                    hit_name = record.query.split(',')[1].strip()
                    cutoff = hsp.identities/float(record.query_length)
                    if cutoff >= tol:
                        hits.append(hit_name)
                    # New method for the --careful option
                    elif cutoff >= tol-careful:
                        print("Please confirm this hit:")
                        print("Name,SeqFindr score,Len(align),Len(query),Identities,Gaps")
                        print("%s,%f,%i,%i,%i,%i" % (hit_name, cutoff, hsp.align_length, record.query_length, hsp.identities, hsp.gaps))
                        accept = raw_input("Should this be considered a hit? (y/N)")
                        answer = accept.lower()
                        if answer == 'y':
                            hits.append(hit_name)
                        elif answer not in ('', 'n'):
                            print("Input must be y, n or enter.")
                            print("Assuming n")
    return hits
Esempio n. 51
0
    def _perform_alignment(self, idx__seq_discrpt):
        """BLAST one sequence against the EzTaxon database and label it.

        Args:
            idx__seq_discrpt: ``(idx, (seq, description))``.  The p-value is
                the text after the first ``:`` in ``description``.
                NOTE(review): ``idx`` is indexed with ``[-1]`` and the result
                is appended to a string, so it is presumably a string itself;
                confirm against the caller.

        Returns:
            ``[]`` when the p-value is above ``self.p_value_threshold``.
            Otherwise ``(seq, label, pval)``, where ``label`` is the refined
            taxonomy from ``self.lowest_certain_level`` or ``'ZZZNOVEL'``
            when there is no usable hit, with ``idx[-1]`` appended.
        """
        idx, (seq, description) = idx__seq_discrpt
        pval = float(description.split(':')[1])
        # Written as a negated <= so a NaN p-value still returns [] here.
        if not pval <= self.p_value_threshold:
            return []

        fasta_path = '../tmp/temp' + str(idx) + '.fasta'
        xml_path = '../tmp/temp' + str(idx) + '.xml'
        FileUtility.create_fasta_file(fasta_path, [seq], ['temp'])
        blastn_cline = NcbiblastnCommandline(
            query=fasta_path,
            db=
            "/mounts/data/proj/asgari/dissertation/git_repos/16S_datasets/EZ/raw/eztaxon_qiime_full.fasta",
            evalue=0.001,
            outfmt=5,
            out=xml_path)
        blastn_cline()

        results = []
        baseline_set = False
        score = -1
        alignment_length = -1
        # Closing the XML file here; it used to stay open for the lifetime
        # of the process.
        with open(xml_path, 'r') as xml_file:
            for blast_record in NCBIXML.parse(xml_file):
                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        # The very first HSP sets the score and length that
                        # every kept hit has to match or beat.
                        if not baseline_set:
                            score = hsp.score
                            alignment_length = hsp.align_length
                            baseline_set = True
                        if (hsp.score >= score
                                and hsp.align_length >= alignment_length
                                and 'Eukarya' not in
                                self.ez_taxa_dict[alignment.hit_id]):
                            results.append(
                                (self.ez_taxa_dict[alignment.hit_id],
                                 hsp.expect))

        res = self.lowest_certain_level(results) if len(results) > 0 else None
        if res:
            return (seq, self.refine_ez_taxonomy(res) + idx[-1], pval)
        return (seq, 'ZZZNOVEL' + idx[-1], pval)
Esempio n. 52
0
def read_blast(fn):
    """Collect near-full-length BLAST hits per query and pickle them.

    The family name is the part of the first record's query title before
    the first ``.``; results go to ``alts_<family>.pkl`` in the current
    directory.  If that file already exists it is loaded first and updated.

    For every record, the entry ``alt_dict[record.query]`` is reset to a
    list of ``(alignment title, aligned query string)`` tuples.  Within one
    alignment, HSPs are taken in order until one is shorter than 90% of the
    query length.

    Nothing is written when ``fn`` contains no BLAST records.

    :param fn: path of a BLAST XML results file
    :return: None
    """
    alt_dict = {}
    pickle_name = ''

    with open(fn, "r") as res_handle:
        for record in NCBIXML.parse(res_handle):
            if not pickle_name:
                fam = record.query.split('.')[0]
                pickle_name = "alts_" + fam + ".pkl"
                if pickle_name in os.listdir():
                    # NOTE: pickle.load can execute arbitrary code; only
                    # safe while these files are produced by this function.
                    with open(pickle_name, "rb") as existing:
                        alt_dict = pickle.load(existing)

            alternatives = alt_dict[record.query] = []

            for al in record.alignments:
                for hsp in al.hsps:
                    if (hsp.align_length and record.query_length
                            and hsp.align_length < .9 * record.query_length):
                        break
                    alternatives.append((al.title, hsp.query))

    # Without any record there is no family name, hence no file to write.
    if pickle_name:
        with open(pickle_name, "wb") as out_handle:
            pickle.dump(alt_dict, out_handle)

    return
Esempio n. 53
0
def get_bsr_for_strain(bsr_records, blast_results_path, strain_name):
    """Record the best BLAST hit of every query in its BSR_Record.

    ``bsr_records`` maps query titles to BSR_Record objects
    ({'match_key': BSR_Record, ...}).  For every record in the BLAST XML
    file, ``add_hit`` is called on the matching BSR_Record with the first
    HSP of the first alignment (hit description, score, subject sequence
    with gap characters removed).  Queries without any alignment get an
    empty placeholder hit ``('', 0, None)``.

    BLAST records should have the FASTA title as alignment.hit_def

    :param bsr_records: dict of BSR_Record, updated in place
    :param blast_results_path: path of the BLAST XML results file
    :param strain_name: passed through to ``add_hit``
    :return: the same ``bsr_records`` dict
    :raises KeyError: when a query title is not a key of ``bsr_records``
    """

    print('Getting BSR from this blast results file:', blast_results_path)

    record_count = 0
    records_with_hits = 0

    with open(blast_results_path) as results_handle:
        for record in NCBIXML.parse(results_handle):
            query_descrpt = record.query
            if query_descrpt not in bsr_records:
                print(
                    query_descrpt,
                    'not found in bsr_records, this shouldnt happen but can result from weirdly formated FastA record descriptions.'
                )
                raise KeyError(query_descrpt)

            if record.alignments:
                records_with_hits += 1
                # Take the best hit for each query sequence
                best_alignment = record.alignments[0]
                best_hsp = best_alignment.hsps[0]
                sbjct_seq = ''.join([nt for nt in best_hsp.sbjct if nt != '-'])
                bsr_records[query_descrpt].add_hit(best_alignment.hit_def,
                                                   best_hsp.score, sbjct_seq,
                                                   strain_name)
            else:
                # If this didn't hit in the subject genome we still want to
                # create an empty placeholder in the match record
                bsr_records[query_descrpt].add_hit('', 0, None, strain_name)

            if record_count % 100 == 0:
                print(record_count, '... ', sep='', end='')
            record_count += 1

    print('Of {} reference sequences, {} had hits in the primary proteome.'.
          format(record_count, records_with_hits))

    return bsr_records
Esempio n. 54
0
def diamondindex(db1, db2, outputdir, evalue=1E-30):
    """Run DIAMOND search and parse the results.
       Input:
       db1 is the query FASTA file, db2 the FASTA file that is turned into
       a DIAMOND database and searched.
       outputdir is a place to put the BLAST results (written to
       <outputdir>/BLAST_data/<species1>_vs_<species2>.XML).
       evalue is accepted but not used.
       Returns:
       A dictionary where keys are the query proteins,
       values are dicts holding the top hit's ID and the score of its
       first HSP."""
    # NOTE(review): `evalue` is never read; the search below always runs
    # with "-e 1e-3". Confirm which threshold is intended before wiring
    # the parameter in.
    print("db1 is....  " + db1)
    print("db2 is....  " + db2)
    # Make dir to store results
    blastoutputdir = outputdir + "/BLAST_data"
    if not isdir(blastoutputdir):
        makedirs(abspath(blastoutputdir))
    # Split on path separators and dots so the second-to-last piece is the
    # file name without its extension (used to name the output file).
    species1 = re.split(r'[\\/.]+', db1)[-2]
    species2 = re.split(r'[\\/.]+', db2)[-2]
    outputID = "/" + species1 + "_vs_" + species2 + ".XML"
    blastoutputfile = abspath(blastoutputdir + outputID)
    # Commands are passed as argument lists: a plain command string without
    # shell=True is not runnable on POSIX, and lists also survive paths
    # that contain spaces.
    makediamond = ["diamond", "makedb", "--in", db2, "--db", db2]
    print(" ".join(makediamond))
    subprocess.call(makediamond)
    rundiamond = ["diamond", "blastp", "--db", db2, "--out", blastoutputfile,
                  "--query", db1, "--outfmt", "5", "-e", "1e-3", "--quiet"]
    print(" ".join(rundiamond))
    subprocess.call(rundiamond)
    # Parse the BLAST
    iddict1 = {}
    # consider making the iddict1 an ordered dict to ensure correctness in
    # synteny assessment.
    with open(blastoutputfile, "r") as result_handle:
        for B in NCBIXML.parse(result_handle):
            if B.alignments:
                ali = B.alignments[0]
                # This is only the score of the top HSP, not for the whole
                # thing
                iddict1[B.query] = {
                    "hit ID " + species2: ali.hit_def,
                    "score " + species2: ali.hsps[0].score
                }
    return iddict1
def read_in_xml(xml_inf):
    """Write a report of long BLAST HSPs, longest first.

    Every HSP in the XML file ``xml_inf`` with more than 100 identities is
    formatted as a text block; the blocks are sorted in descending order
    (by identities, then by block text) and written, each preceded by a
    running ``hit_number`` line, to
    ``<xml_inf up to ".out">_parsed_cutoff_<args.length>nt.out``.

    ``args`` is a module-level name that this function reads but does not
    define.
    """
    with open(xml_inf) as xml, open(
            "{}_parsed_cutoff_{}nt.out".format(
                xml_inf.split(".out")[0], args.length), "w+") as outf:
        collected_hits = []

        for entry in NCBIXML.parse(xml):
            for alignment in entry.alignments:
                for hsp in alignment.hsps:
                    identities = hsp.identities
                    if not identities > 100:
                        continue
                    block_parts = [
                        "alignment_length\t{}\n".format(identities),
                        "alignment_evalue\t{}\n".format(hsp.expect),
                        "alignment_score\t{}\n".format(hsp.score),
                        "query\t{}\n".format(entry.query),
                        "query_pos\t{}\t{}\n".format(hsp.query_start,
                                                     hsp.query_end),
                        "hsp\n",
                        "{}\n{}\n{}\n".format(hsp.query, hsp.match,
                                              hsp.sbjct),
                        "match\t{}\n".format(alignment.title),
                        # Subject coordinates, emitted under the same
                        # "query_pos" label as the query coordinates above.
                        "query_pos\t{}\t{}\n".format(hsp.sbjct_start,
                                                     hsp.sbjct_end),
                        "#\n",
                    ]
                    collected_hits.append((identities, "".join(block_parts)))

        ranked = sorted(collected_hits, reverse=True)
        for rank, (_, block_text) in enumerate(ranked, 1):
            outf.write("hit_number\t{}\n".format(rank))
            outf.write(block_text)
def _write_if_possible_recomb(outputWriter, blast_record):
    """Write a summary row when the filtered HSPs pass CheckPossibleRecomb."""
    filteredHspStartEnds = FilterBlastRecord(blast_record)
    if filteredHspStartEnds and CheckPossibleRecomb(filteredHspStartEnds):
        WriteARow(outputWriter, blast_record, filteredHspStartEnds)


def BlastFastaXmlIndv(fasta_filename=None, xml_filename=None):
    """Summarise BLAST records into ``<input>.summary.tsv``.

    With ``fasta_filename`` every sequence in the FASTA file is submitted
    to NCBI (blastn against ``nr``, restricted to ``KM204118.1``).  A
    submission that raises ``ValueError`` is retried after a pause that
    doubles each time; the process exits once the pause has grown beyond
    100 seconds.  With ``xml_filename`` the records are read from an
    existing BLAST XML file instead.  ``fasta_filename`` wins when both
    are given.

    Each record is filtered with ``FilterBlastRecord``; records passing
    ``CheckPossibleRecomb`` are written as one tab-separated row.

    Raises:
        ValueError: when neither file name is given.
    """
    if fasta_filename:
        record_iterator = SeqIO.parse(fasta_filename, "fasta")
        with open(fasta_filename + ".summary.tsv", 'w') as output_table:
            outputWriter = csv.writer(output_table, delimiter="\t")
            for seq_record in record_iterator:
                wait_time = 1
                while True:
                    print(seq_record.id)
                    try:
                        result_handle = NCBIWWW.qblast(
                            "blastn",
                            "nr",
                            seq_record.seq,
                            entrez_query="KM204118.1")
                        break
                    except ValueError:
                        print("Error encountered")
                        print("Trying again in " + str(wait_time) + " seconds")
                        if wait_time > 100:
                            sys.exit()
                        time.sleep(wait_time)
                        wait_time *= 2

                try:
                    blast_record = NCBIXML.read(result_handle)
                    _write_if_possible_recomb(outputWriter, blast_record)
                finally:
                    result_handle.close()

    elif xml_filename:
        with open(xml_filename + ".summary.tsv", 'w') as output_table:
            outputWriter = csv.writer(output_table, delimiter="\t")
            with open(xml_filename) as result_handle:
                for blast_record in NCBIXML.parse(result_handle):
                    _write_if_possible_recomb(outputWriter, blast_record)

    else:
        # Falling through used to end in an unbound-local error on
        # output_table; say what is actually wrong instead.
        raise ValueError(
            "either fasta_filename or xml_filename must be given")
Esempio n. 57
0
def readVirusXML1(fname):
    """Find the lowest-e-value hit for every query in a BLAST XML file.

    Returns a pair of dicts keyed by query title: ``virusE`` holds the
    smallest HSP e-value seen for the query (as float) and ``virusID`` the
    upper-cased title of the alignment that HSP belongs to.  On equal
    e-values the first one encountered is kept.  Queries without any HSP
    do not appear in either dict.
    """
    virusE = {}
    virusID = {}
    with open(fname, 'r') as result_handle:
        for blast_record in NCBIXML.parse(result_handle):
            query = blast_record.query
            for alignment in blast_record.alignments:
                subject = alignment.title.upper()
                for hsp in alignment.hsps:
                    expect = float(hsp.expect)
                    # `in` replaces dict.has_key, which Python 3 removed.
                    if query not in virusE or expect < virusE[query]:
                        virusE[query] = expect
                        virusID[query] = subject  # lowest subject
    return virusE, virusID
Esempio n. 58
0
 def blastp(self, acc):
     """Run an online blastp search for ``acc`` and collect new GI numbers.

     The search goes against ``nr`` with ``self.blast_threshold`` as the
     expect value.  The GI of each alignment is the second ``|``-separated
     field of its title; GIs not yet in ``self.gis`` are appended to it,
     each one once.

     This is best effort: if anything goes wrong, ``self.status[acc]`` is
     set to False (unless the key already exists) and ``self.gis`` is left
     unchanged.  Nothing is returned.
     """
     try:
         result_handle = NCBIWWW.qblast("blastp",
                                        "nr",
                                        acc,
                                        format_type="XML",
                                        expect=self.blast_threshold)
         try:
             gis = [
                 alignment.title.split("|")[1]
                 for blast_record in NCBIXML.parse(result_handle)
                 for alignment in blast_record.alignments
             ]
         finally:
             result_handle.close()

         # Keep first-seen order and drop repeats within this batch as well
         # as GIs that are already known.
         unique = []
         seen_in_batch = set()
         for raw_gi in gis:
             gi = int(raw_gi)
             if gi in seen_in_batch or gi in self.gis:
                 continue
             seen_in_batch.add(gi)
             unique.append(gi)
         self.gis.extend(unique)
     except Exception:
         # Exception rather than a bare except, so Ctrl-C and sys.exit()
         # are no longer swallowed.
         self.status.setdefault(acc, False)
     return
Esempio n. 59
0
def filter_blast(blast_result_file_name, pan_protein_file_path,
                 filtered_pan_protein_file_path, length_table):
    '''
    Write a query-length table for a BLAST XML file and copy the longer
    query proteins to a new FASTA file.

    For every BLAST record a "<query name>\\t<query length>" line is written
    to the length table.  Queries longer than 20 letters are looked up by
    name in the pan-protein FASTA and written to the filtered FASTA.

    input 1: blast_result_file_name (path of the BLAST XML file)
    input 2: pan_protein_file_path (FASTA file, indexed with SeqIO.index)
    output 1: filtered_pan_protein_file_path (object with an .open(mode)
              method, e.g. a pathlib.Path)
    output 2: length_table (object with an .open(mode) method)
    '''
    pan_protein_dic = SeqIO.index(str(pan_protein_file_path), "fasta")
    with filtered_pan_protein_file_path.open("w+") as out_fl, \
            length_table.open("w") as length_table_fl, \
            open(blast_result_file_name) as blast_fl:
        for record in NCBIXML.parse(blast_fl):
            query_name = record.query
            query_len = record.query_letters
            assert record.query_letters == record.query_length
            length_table_fl.write("{}\t{}\n".format(query_name, query_len))
            if query_len <= 20:
                continue
            SeqIO.write(pan_protein_dic[query_name], out_fl, "fasta")
Esempio n. 60
0
def process_rps_output(filepath, evalue):
    """Parse rpsblast XML output into a list of hit dictionaries.

    Each HSP whose e-value is at most ``evalue`` yields one dictionary with
    the keys ``HitID``, ``DomainID``, ``Name``, ``Description``, ``Expect``,
    ``QueryStart`` and ``QueryEnd``.  The description, domain id and name
    come from ``process_align`` applied to the enclosing alignment.
    """
    hits = []
    with open(filepath, "r") as handle:
        for record in NCBIXML.parse(handle):
            for align in record.alignments:
                description, domain_id, name = process_align(align)
                hits.extend(
                    {
                        "HitID": align.hit_id,
                        "DomainID": domain_id,
                        "Name": name,
                        "Description": description,
                        "Expect": float(hsp.expect),
                        "QueryStart": int(hsp.query_start),
                        "QueryEnd": int(hsp.query_end),
                    }
                    for hsp in align.hsps
                    if hsp.expect <= evalue
                )
    return hits