def test_LastProteinIds9(self):
    """LastProteinIds9 should give last protein ids in iter"""
    result = LastProteinIds9(self.rec)
    self.assertEqual(result, ['ece:Z4181', 'ecs:ECs3717', 'cvi:CV2421',
                              'sfl:CP0138', 'spt:SPA2730', 'sec:SC2804',
                              'stm:STM2872'])
    #should also work if threshold set
    result = LastProteinIds9(self.rec, False, threshold=8e-6)
    self.assertEqual(result, ['ece:Z4181', 'ecs:ECs3717', 'cvi:CV2421',
                              'sfl:CP0138'])
    #should work on multiple records
    #build a real list: on Python 3 map() returns a lazy iterator, which
    #supports neither len() nor indexing
    result = [LastProteinIds9(rec) for rec in PsiBlastQueryFinder(self.rec2)]
    self.assertEqual(len(result), 2)
    self.assertEqual(result[0], ['ece:Z4181', 'ecs:ECs3717', 'cvi:CV2421',
                                 'sfl:CP0138', 'spt:SPA2730', 'sec:SC2804',
                                 'stm:STM2872'])
    self.assertEqual(result[1], ['ece:Z4182', 'ecs:ECs3718', 'cvi:CV2422'])
def ids_from_seq_lower_threshold(seq, n, max_iterations, app, core_threshold,
                                 lower_threshold, step=100):
    """Returns ids that match a seq, decreasing the sensitivity.

    Runs app on seq repeatedly. Every run uses core_threshold for both the
    '-h' and '-e' parameters, restarts from the checkpoint written by the
    previous run ('-R') and stores its own checkpoint ('-C'). After each run
    core_threshold is multiplied by step. The search stops as soon as:
        - core_threshold exceeds lower_threshold,
        - a run did not leave its checkpoint file behind,
        - at least n ids were found, or
        - the iteration count reaches max_iterations.

    Returns a tuple (ids, iteration, all_ids):
        ids: what LastProteinIds9 gave for the last run, or [] if the search
            stopped before any run was made.
        iteration: 1-based number of the loop iteration in which the search
            stopped.
        all_ids: {run_number: (core_threshold, copy of that run's ids)},
            with 1-based run numbers.

    Every checkpoint file that was written is removed and '-R' is switched
    off before leaving, including when app or the parser raises.
    """
    checkpoints = []
    cp_name_base = make_unique_str()
    #bound up front so the return value is valid even when the very first
    #threshold check stops the search before anything was run
    ids = []
    # cache ids for each iteration
    # store { iteration_num:(core_threshold, [list of matching ids]) }
    all_ids = {}
    i = 0
    try:
        while True:
            #-h is the e-value threshold for inclusion in the score matrix
            #model
            app.Parameters['-h'].on(core_threshold)
            app.Parameters['-e'].on(core_threshold)
            if core_threshold > lower_threshold:
                break
            if checkpoints:
                #-R restarts from a previously stored file
                app.Parameters['-R'].on(checkpoints[-1])
            #store the score model from this iteration
            curr_check = 'checkpoint_' + cp_name_base + '_' + str(i) + '.chk'
            app.Parameters['-C'].on(curr_check)
            output = app(seq)
            result = list(output.get('BlastOut', output['StdOut']))
            #sometimes fails on first try -- don't know why, but this seems
            #to fix problem
            #NOTE(review): this retries forever if app never yields output;
            #confirm whether a retry limit is wanted
            while not result:
                #release the discarded run before trying again
                output.cleanUp()
                output = app(seq)
                result = list(output.get('BlastOut', output['StdOut']))
            ids = LastProteinIds9(result, keep_values=True,
                                  filter_identity=False)
            output.cleanUp()
            all_ids[i + 1] = (core_threshold, copy(ids))
            #no checkpoint written: nothing to restart the next run from
            if not access(curr_check, F_OK):
                break
            checkpoints.append(curr_check)
            if len(ids) >= n:
                break
            core_threshold *= step
            if i >= max_iterations - 1:  #because max_iterations is 1-based
                break
            i += 1
    finally:
        for c in checkpoints:
            remove(c)
        #turn app.Parameters['-R'] off so that for the next file it does not
        #try and read in a checkpoint file that is not there
        app.Parameters['-R'].off()
    return ids, i + 1, all_ids
def _run_checkpointed_pass(seq, n, max_iterations, app, checkpoints,
                           check_template, ids, last_num_ids):
    """Runs app on seq up to max_iterations times, chaining checkpoints.

    checkpoints is extended in place with every checkpoint file that was
    actually written; the most recent one is handed to the next run via
    '-R'. check_template is a %-format string with one slot for the
    iteration number. ids and last_num_ids carry the state of an earlier
    pass (use [] and None for the first pass).

    The pass ends early when a run writes no checkpoint, when at least n ids
    were found, or when a run finds as many ids as last_num_ids.

    Returns (ids, last_num_ids).
    """
    for i in range(max_iterations):
        if checkpoints:
            #-R restarts from a previously stored file
            app.Parameters['-R'].on(checkpoints[-1])
        curr_check = check_template % i
        app.Parameters['-C'].on(curr_check)
        output = app(seq)
        try:
            #if we didn't write a checkpoint, bail out
            if not access(curr_check, F_OK):
                break
            #if we got here, we wrote a checkpoint file
            checkpoints.append(curr_check)
            result = list(output.get('BlastOut', output['StdOut']))
        finally:
            #release every run's output, also when bailing out
            output.cleanUp()
        if result:
            ids = LastProteinIds9(result, keep_values=True,
                                  filter_identity=False)
            num_ids = len(ids)
            if num_ids >= n:
                break
            if num_ids == last_num_ids:
                break
            last_num_ids = num_ids
    return ids, last_num_ids


def ids_from_seq_two_step(seq, n, max_iterations, app, core_threshold,
                          extra_threshold, lower_threshold, second_db=None):
    """Returns ids that match a seq, using a 2-tiered strategy.

    Optionally uses a second database for the second search.

    The first pass runs with '-h' and '-e' both set to core_threshold. If it
    wrote at least one checkpoint, a second pass restarts from the last
    checkpoint with '-h' set to extra_threshold and '-e' set to
    lower_threshold, against second_db ('-d') when one is given. The second
    pass is skipped when there is no second_db and the first pass already
    found at least n ids.

    Returns what LastProteinIds9 gave for the last run that produced any
    output, or [] if no run did. May hold fewer than n ids.

    All checkpoint files written here are removed on every way out of the
    function, including errors.
    """
    #NOTE(review): checkpoint names are fixed ('checkpoint_0.chk', ...), so
    #concurrent calls in one working directory would share files -- confirm
    #whether unique names are needed here
    #NOTE(review): '-d' stays set to second_db after returning -- confirm
    #callers expect that
    #first time through: reset 'h' and 'e' to core
    #-h is the e-value threshold for including seqs in the score matrix model
    app.Parameters['-h'].on(core_threshold)
    #-e is the e-value threshold for the final blast
    app.Parameters['-e'].on(core_threshold)
    checkpoints = []
    try:
        ids, last_num_ids = _run_checkpointed_pass(
            seq, n, max_iterations, app, checkpoints, 'checkpoint_%s.chk',
            [], None)
        #if we didn't write any checkpoints, second run won't work, so
        #return ids
        if not checkpoints:
            return ids
        #if we got too many ids and don't have a second database, return the
        #ids we got. len(ids) is used so the check is also defined when no
        #first-pass run produced any output.
        if (not second_db) and len(ids) >= n:
            return ids
        #second time through: reset 'h' and 'e' to get extra hits, and
        #switch the database if appropriate
        app.Parameters['-h'].on(extra_threshold)
        app.Parameters['-e'].on(lower_threshold)
        if second_db:
            app.Parameters['-d'].on(second_db)
        ids, last_num_ids = _run_checkpointed_pass(
            seq, n, max_iterations, app, checkpoints, 'checkpoint_b_%s.chk',
            ids, last_num_ids)
        #return the ids we got. may not be as many as we wanted.
        return ids
    finally:
        for c in checkpoints:
            remove(c)
        if checkpoints:
            #'-R' may name a file that was just removed; turn it off so the
            #next search does not try to restart from it
            app.Parameters['-R'].off()