コード例 #1
0
 def test_LastProteinIds9(self):
     """LastProteinIds9 should give last protein ids in iter"""
     result = LastProteinIds9(self.rec)
     self.assertEqual(result, ['ece:Z4181', 'ecs:ECs3717', 'cvi:CV2421',
         'sfl:CP0138', 'spt:SPA2730', 'sec:SC2804', 'stm:STM2872'])
     #should also work if threshold set: only the first four ids are
     #expected back with threshold=8e-6
     result = LastProteinIds9(self.rec, False, threshold=8e-6)
     self.assertEqual(result, ['ece:Z4181', 'ecs:ECs3717', 'cvi:CV2421',
         'sfl:CP0138'])
     #should work on multiple records
     #build a real list: on Python 3 map() returns an iterator, which
     #supports neither len() nor indexing
     result = [LastProteinIds9(record)
               for record in PsiBlastQueryFinder(self.rec2)]
     self.assertEqual(len(result), 2)
     self.assertEqual(result[0], ['ece:Z4181', 'ecs:ECs3717', 'cvi:CV2421',
         'sfl:CP0138', 'spt:SPA2730', 'sec:SC2804', 'stm:STM2872'])
     self.assertEqual(result[1], ['ece:Z4182', 'ecs:ECs3718', 'cvi:CV2422'])
コード例 #2
0
ファイル: blast.py プロジェクト: wangdi2014/for_qiime_scripts
def ids_from_seq_lower_threshold(seq, n, max_iterations, app, core_threshold,
    lower_threshold, step=100):
    """Returns ids that match a seq, decreasing the sensitivity.

    Runs app on seq repeatedly.  Each round sets the '-h' and '-e'
    parameters to the current core_threshold, restarts from the previous
    round's checkpoint file (if any) and then multiplies core_threshold by
    step for the next round.

    seq: passed unchanged to app(seq).
    n: stop as soon as a round yields at least n ids.
    max_iterations: maximum number of rounds (1-based).
    app: callable application controller exposing a Parameters mapping with
        '-h', '-e', '-R' and '-C' entries; app(seq) must return a result
        offering 'BlastOut'/'StdOut' and a cleanUp() method.
    core_threshold: e-value used for the first round.
    lower_threshold: searching stops once core_threshold exceeds this value.
    step: factor applied to core_threshold after each round.

    Returns (ids, i + 1, all_ids) where ids is the result of the last round
    that ran (an empty list if none ran), i is the zero-based index of the
    round at which the search stopped, and all_ids maps the 1-based round
    number to (core_threshold, copy of that round's ids).

    Checkpoint files are removed and '-R' is switched off on every exit
    path, including when app or the parser raises.
    """
    checkpoints = []
    cp_name_base = make_unique_str()

    # cache ids for each iteration
    # store { iteration_num:(core_threshold, [list of matching ids]) }
    all_ids = {}
    # bound up front so the return value is defined even when the very first
    # threshold check stops the search before any round has run
    ids = []
    i = 0
    try:
        while True:
            #-h is the e-value threshold for inclusion in the score matrix model
            app.Parameters['-h'].on(core_threshold)
            app.Parameters['-e'].on(core_threshold)
            if core_threshold > lower_threshold:
                break
            if checkpoints:
                #-R restarts from a previously stored file
                app.Parameters['-R'].on(checkpoints[-1])
            #store the score model from this iteration
            curr_check = 'checkpoint_' + cp_name_base + '_' + str(i) + '.chk'
            app.Parameters['-C'].on(curr_check)
            output = app(seq)
            result = list(output.get('BlastOut', output['StdOut']))
            #sometimes fails on first try -- don't know why, but this seems
            #to fix problem
            #NOTE(review): this retry is unbounded; if app keeps returning
            #empty output the loop never ends -- confirm that is acceptable
            while not result:
                #release the failed attempt before trying again
                output.cleanUp()
                output = app(seq)
                result = list(output.get('BlastOut', output['StdOut']))

            ids = LastProteinIds9(result,
                                  keep_values=True,
                                  filter_identity=False)
            output.cleanUp()
            all_ids[i + 1] = (core_threshold, copy(ids))
            #no checkpoint written: nothing to restart the next round from
            if not access(curr_check, F_OK):
                break
            checkpoints.append(curr_check)
            if len(ids) >= n:
                break
            core_threshold *= step
            if i >= max_iterations - 1:  #because max_iterations is 1-based
                break
            i += 1
    finally:
        for c in checkpoints:
            remove(c)
        #turn app.Parameters['-R'] off so that for the next file it does not
        #try and read in a checkpoint file that is not there
        app.Parameters['-R'].off()
    return ids, i + 1, all_ids
コード例 #3
0
ファイル: blast.py プロジェクト: wangdi2014/for_qiime_scripts
def ids_from_seq_two_step(seq, n, max_iterations, app, core_threshold,
    extra_threshold, lower_threshold, second_db=None):
    """Returns ids that match a seq, using a 2-tiered strategy.

    Optionally uses a second database for the second search.

    First pass: '-h' and '-e' are both set to core_threshold and up to
    max_iterations rounds are run, each restarting from the previous round's
    checkpoint file.  Second pass: '-h' is set to extra_threshold, '-e' to
    lower_threshold, '-d' to second_db (when given), and up to
    max_iterations further rounds are run starting from the last checkpoint
    of the first pass.

    A pass ends early when a round writes no checkpoint file, when at least
    n ids were found, or when the number of ids did not change from the
    previous round.  The second pass is skipped when the first pass wrote no
    checkpoint, or when it already found at least n ids and no second_db was
    supplied.

    seq: passed unchanged to app(seq).
    app: callable application controller exposing a Parameters mapping with
        '-h', '-e', '-R', '-C' and '-d' entries; app(seq) must return a
        result offering 'BlastOut'/'StdOut' and a cleanUp() method.

    Returns the ids of the last round that produced output (an empty list if
    no round did).  May hold fewer than n ids.

    Every checkpoint file recorded by this call is removed and '-R' is
    switched off on every exit path.  '-d' is not restored after being
    switched to second_db.

    NOTE(review): checkpoint names are fixed ('checkpoint_<i>.chk',
    'checkpoint_b_<i>.chk') relative paths, so concurrent calls sharing a
    working directory would collide -- confirm callers never do that.
    """
    #first time through: reset 'h' and 'e' to core
    #-h is the e-value threshold for including seqs in the score matrix model
    app.Parameters['-h'].on(core_threshold)
    #-e is the e-value threshold for the final blast
    app.Parameters['-e'].on(core_threshold)
    checkpoints = []
    try:
        ids, last_num_ids = _run_checkpointed_rounds(
            seq, n, max_iterations, app, checkpoints, 'checkpoint_%s.chk',
            [], None)

        #if we didn't write any checkpoints, second run won't work, so
        #return ids
        if not checkpoints:
            return ids

        #if we got too many ids and don't have a second database, return the
        #ids we got
        if (not second_db) and len(ids) >= n:
            return ids

        #second time through: reset 'h' and 'e' to get extra hits, and switch
        #the database if appropriate
        app.Parameters['-h'].on(extra_threshold)
        app.Parameters['-e'].on(lower_threshold)
        if second_db:
            app.Parameters['-d'].on(second_db)
        ids, last_num_ids = _run_checkpointed_rounds(
            seq, n, max_iterations, app, checkpoints, 'checkpoint_b_%s.chk',
            ids, last_num_ids)
        #return the ids we got. may not be as many as we wanted.
        return ids
    finally:
        #stale checkpoint files would make a later call's access() check
        #pass even though that call wrote nothing, so always remove them
        for c in checkpoints:
            remove(c)
        #turn '-R' off so the next search does not try to restart from a
        #checkpoint file that has just been removed
        app.Parameters['-R'].off()


def _run_checkpointed_rounds(seq, n, max_iterations, app, checkpoints,
    name_template, ids, last_num_ids):
    """Runs up to max_iterations rounds of app, chaining checkpoint files.

    checkpoints is extended in place with each checkpoint file written.
    name_template is %-formatted with the zero-based round number to name
    that round's checkpoint file.  ids and last_num_ids carry state in from
    an earlier pass.  Returns the updated (ids, last_num_ids).
    """
    for i in range(max_iterations):
        if checkpoints:
            #-R restarts from the most recent checkpoint
            app.Parameters['-R'].on(checkpoints[-1])
        curr_check = name_template % i
        app.Parameters['-C'].on(curr_check)

        output = app(seq)
        #read and release the output before any early exit so that no round
        #leaves its result uncleaned
        result = list(output.get('BlastOut', output['StdOut']))
        output.cleanUp()
        #if we didn't write a checkpoint, bail out
        if not access(curr_check, F_OK):
            break
        #if we got here, we wrote a checkpoint file
        checkpoints.append(curr_check)
        if result:
            ids = LastProteinIds9(result,
                                  keep_values=True,
                                  filter_identity=False)
        num_ids = len(ids)
        #stop when we have enough ids or the last round added nothing
        if num_ids >= n or num_ids == last_num_ids:
            break
        last_num_ids = num_ids
    return ids, last_num_ids