def _get_blast_hits(self, blast_db, seqs):
    """Blast each seq in seqs against blast_db and retain good hits.

    blast_db -- BLAST database, handed through to blast_seqs unchanged
    seqs -- sequence records; only the first element of each record (the
        sequence id) is read here, the records themselves are passed to
        blast_seqs as given

    Returns a dict mapping each sequence id to a list of
    (subject id, e-value) tuples, one per hit whose e-value is at most
    self.Params['Max E value'] and whose percent identity is at least
    self.Params['Min percent identity']. Ids with no acceptable hit, or
    with no entry in the BLAST result, map to an empty list. Every id
    gets its own list object, so callers may mutate one entry safely.
    """
    max_evalue = self.Params['Max E value']
    min_percent_identity = self.Params['Min percent identity']
    # A threshold below 1.0 is treated as a fraction and rescaled so it is
    # comparable with the '% IDENTITY' values tested below.
    if min_percent_identity < 1.0:
        min_percent_identity *= 100.0

    seq_ids = [s[0] for s in seqs]

    blast_result = blast_seqs(
        seqs, Blastall, blast_db=blast_db,
        params={'-p': 'blastn', '-n': 'T'}, add_seq_names=False)

    if not blast_result['StdOut']:
        # No BLAST output at all: report "no hits" for every query.
        # Build a fresh list per id; dict.fromkeys(seq_ids, []) would make
        # all ids share one list object.
        return {seq_id: [] for seq_id in seq_ids}

    blast_result = BlastResult(list(blast_result['StdOut']))

    result = {}
    for seq_id in seq_ids:
        # The parsed result is looked up by the first whitespace-delimited
        # token of the id, while the returned dict keeps the full id.
        blast_result_id = seq_id.split()[0]
        try:
            # NOTE(review): [0] presumably selects the first round of hits
            # for the query -- confirm against BlastResult.
            result[seq_id] = [
                (e['SUBJECT ID'], float(e['E-VALUE']))
                for e in blast_result[blast_result_id][0]
                if (float(e['E-VALUE']) <= max_evalue and
                    float(e['% IDENTITY']) >= min_percent_identity)]
        except KeyError:
            # Query absent from the BLAST result (or a hit record lacking
            # an expected field): treat as no hits.
            result[seq_id] = []
    return result
def blast_genome(seqs, blast_db, e_value, max_hits, word_size, working_dir,
                 blast_mat_root, extra_params=None, DEBUG=True):
    """Blast sequences against all genes in a genome.

    seqs -- input sequences as strings
    blast_db -- path to blast database
    e_value -- e_value (float)
    max_hits -- maximum sequences detected by BLAST to show
    word_size -- word size for initial BLAST screen.
    working_dir -- passed to blast_seqs as WorkingDir
    blast_mat_root -- location of BLAST matrix files
    extra_params -- additional parameters to pass to BLAST; merged over the
        defaults below with dict.update, so it may be a mapping or an
        iterable of (flag, value) pairs. None (the default) adds nothing.
    DEBUG -- display verbose debugging output (not used by this function)

    Returns a list of the items obtained by iterating the 'StdOut' entry
    of the blast_seqs result.
    """
    # set up params to use with blastall
    params = {
        # matrix
        "-M": "BLOSUM62",
        # max procs
        "-a": "1",
        # expectation
        "-e": e_value,
        # max seqs to show
        "-b": max_hits,
        # Word size
        "-W": word_size,
        # max one line descriptions
        "-v": max_hits,
        # tabular output
        "-m": "9",
        # program
        "-p": "blastn",
    }
    # Caller-supplied options win over the defaults above.
    if extra_params is not None:
        params.update(extra_params)

    output = blast_seqs(seqs, Blastall, blast_db=blast_db, params=params,
                        WorkingDir=working_dir, add_seq_names=False,
                        blast_mat_root=blast_mat_root)

    return list(output['StdOut'])