コード例 #1
0
ファイル: Pick70_blast.py プロジェクト: kaelfischer/lib_prrsv
def Parse_Blast (id , output, Group_dic_same,  OLIGOLEN):
    """Parse one BLAST result and tabulate, per query position, the lowest hit energies.

    The result held in `output` is parsed with blast_parse.parse_alignment.
    Every alignment of every hit is then scanned with a window of OLIGOLEN
    positions, and for each window start position on the query the lowest
    (most negative) energy found for each hit is recorded.  Alignments that
    Group_dic_same identifies as the query matching itself are skipped.

    Arguments:
        id             -- query identifier; spaces and tabs are stripped before
                          it is used as a key into Group_dic_same.
        output         -- handed unchanged to blast_parse.parse_alignment (and
                          printed to stdout); its exact form is defined by
                          blast_parse, which is not visible here.
        Group_dic_same -- dict mapping a stripped query id to a list of records
                          [hit_id, hit_strand, qstart, qend, hstart, hend];
                          hit_strand is compared against '+' and '-'.  Each
                          record is unpacked into exactly six fields.  A
                          missing key is treated as "no group".
        OLIGOLEN       -- window (oligo) length, in sequence positions.

    Returns:
        {} when the parsed result contains no hits, otherwise a dict
        {pos: [min_energy, [hit_id, energy, location], ...], ...} holding only
        the positions for which at least one hit produced a negative energy.

    Notes:
        * Python 2 code (print statements, string.replace, dict.values()[0]).
        * Uses names defined outside this function: string, WithInRange,
          compute_energy2, ADD, SUB and WORD.
        * Writes debugging output to stdout.
    """
    import blast_parse
    # debugging output
    print 0.0
    print output
    parse_dic = {}
    blast_parse.parse_alignment(output, parse_dic, 1)
    # now we need the group information to check not only that we hit the same id,
    # but that we hit it in the corresponding region too
    new_id = id[:]
    # normalise the id the same way hit ids are normalised below (drop spaces and tabs)
    new_id = string.replace(new_id,' ','')
    new_id = string.replace(new_id,'\t','')

    try:
        Group = Group_dic_same[new_id] # [[hit_id, hit_strand, qstart, qend, hstart, hend], ...]
        print Group
    except KeyError:
        # no known self-hits for this query
        Group = []
    GroupIds=[] # the hit ids listed in Group, used as a quick pre-filter
    for groupHit in Group:
        hitid = groupHit[0]
        GroupIds.append(hitid)

    ret = parse_dic.values()[0] # only one entry there for the entire result
    # layout read from ret: [query_len, hit, hit, ..., alignments]
    Query_len = ret[0]
    gene_end = Query_len - OLIGOLEN # last query position at which a full window can start
    hit_list = ret[1:-1]
    alignment = ret[-1]

    energy_dic={}
    # template: one [energy, location] entry per possible window start; [0, -1] means "no hit yet"
    empty_dic ={}
    for pos in range (0, gene_end+1):
        empty_dic[pos] =[ 0,-1] #energy, location

    if len(hit_list) == 0:
        return {}
    else:
        hitGroup =[] # NOTE(review): never used below
        for j in range (0, len(hit_list)):
            hit_id = hit_list[j][0] #hit_id , hit_len
            hit_id = string.replace(hit_id,' ','')
            hit_id = string.replace(hit_id,'\t','')

            # alignment[j] holds the text of all alignments of hit j; each one starts
            # with ' Score =', and the text before the first marker is discarded
            aligns = string.split(alignment [j], ' Score =')[1:]
            # NOTE(review): coverArea is never appended to in this function; whether
            # WithInRange fills it is not visible here
            coverArea = []
            # shallow copy is sufficient: entries are only ever replaced by new lists,
            # never mutated in place, so the lists shared with empty_dic stay untouched
            temp_energy_dic = empty_dic.copy()
            for i in range(0, len(aligns)): # individual alignment of an individual hit
                # plusline/minusline: the two aligned strings ('-' marks a gap);
                # p_* are coordinates on the query, m_* on the hit
                plusline, minusline, p_start, m_start, p_end, m_end  =  blast_parse.parse_align(aligns[i])

                # we need to check if the hit can be ignored because it is the query itself,
                # according to Group_dic_same
                ignore =0
                if hit_id in GroupIds: # found id, possibly a self-hit
                    for hitInfo  in Group: #hitInfo is [hitid, hitstrand(+/-), qstart, qend, hitstart, hitend]
                        HITid, HITstrand, Qstart, Qend, Hstart, Hend = hitInfo
                        #check if id match
                        if HITid != hit_id:
                            continue

                        #check if strand information match
                        tolerant_diff = 5 # NOTE(review): unused; the tolerance test below hard-codes 5
                        if m_start < m_end: #+ strand in  the alignment
                            if HITstrand =='-':
                                continue
                        else:
                            if HITstrand =='+':
                                continue
                        # a key point: if the hit is the query itself it must be detected as a
                        # complete unit, which is almost the same as the hit in Group_dic_same
                        #if abs(Qstart - p_start)<=5  and abs(Qend- p_end)<=5 and abs(Hstart-m_start)<=5 and abs(Hend -m_end)<=5:
                        # Qstart, Qend : in group file
                        # p_end,hit_id: in blast


                        # test 1: the alignment lies entirely inside the group record,
                        # on the query and (respecting strand direction) on the hit
                        if ((Qstart <=  p_start)  and ( Qend >=  p_end ))  :
                            if (( HITstrand =='+') and (Hstart <= m_start) and (Hend >= m_end)):
                                ignore = 1
                            elif (( HITstrand =='-') and (Hstart >= m_start) and (Hend <= m_end)):
                                ignore =1
                            if ignore:
                                print "ignored segment: query:",p_start,p_end,hit_id,":", Hstart,Hend
                                break

                        # test 2: all four end points agree with the group record to within 5 positions
                        if abs(Qstart - p_start)<=5  and abs(Qend- p_end)<=5 and abs(Hstart-m_start)<=5 and abs(Hend -m_end)<=5:
                            ignore = 1
                            print "ignored segment: query:",p_start,p_end,hit_id,":", Hstart,Hend
                            break

                if ignore :
                    continue

                if WithInRange(coverArea, p_start, p_end):
                    continue
                a_length = len(plusline)

                # shift the start/end coordinates by one: query always -1, hit -1 on the
                # forward strand and +1 on the reverse strand
                # (presumably 1-based -> 0-based conversion; confirm against blast_parse)
                p_start = p_start -1
                p_end = p_end -1
                if (m_start< m_end):
                    m_start = m_start-1
                    m_end = m_end  - 1
                else:
                    m_start = m_start +1
                    m_end = m_end +1

                # per-column energy terms, indexed as energy_list[column][ADD or SUB]
                energy_list = compute_energy2 (plusline, minusline)
                #start energy
                # energy of the window that starts exactly at the alignment start:
                # constant 3.4 plus the ADD terms of the first min(OLIGOLEN, a_length) columns
                # (meaning of 3.4 is not visible here)
                start_energy = 3.4
                # NOTE: this loop reuses the name i of the outer loop; harmless, because
                # aligns[i] is only read at the top of each outer iteration
                for i in range (0, min(OLIGOLEN, a_length)):
                    start_energy = start_energy + energy_list[i][ADD]

                #position = p_start
                if (p_start <= gene_end):
                    # keep only the lowest energy seen for this window start
                    if start_energy < temp_energy_dic[p_start][0]:
                        temp_energy_dic[p_start] = [ start_energy, m_start]

                # phase 1: slide the window forward through the alignment
                poffset = 0 #check gap
                moffset = 0
                # windows starting after the alignment start
                energy = start_energy
                for pos in range(1,a_length - WORD):
                    # a gap column in one sequence consumes no position of that sequence,
                    # so its real coordinate lags behind the column index
                    # (pos != 0 is always true here since the range starts at 1)
                    if pos != 0 and plusline[pos-1] == '-': #check gap
                        poffset = poffset - 1
                    if pos!=0 and minusline[pos-1] =='-':
                        moffset = moffset - 1

                    position = p_start+pos
                    preal_position = position + poffset #compensate for gap

                    # hit coordinate runs upwards on the forward strand, downwards on the reverse strand
                    if (m_start< m_end):
                        position = m_start+pos
                        mreal_position = position +moffset
                    else:
                        position = m_start -pos
                        mreal_position = position - moffset

                    if (preal_position > gene_end) :#or (real_position < 0): #check bound
                        break

                    else:
                        # incremental update: apply the SUB term of the column that left the
                        # window and, if still inside the alignment, the ADD term of the
                        # column that entered it
                        energy = energy + energy_list[pos-1][SUB]
                        end = pos+OLIGOLEN -1
                        if end < a_length:
                            energy = energy + energy_list[end][ADD]

                        if energy < temp_energy_dic[preal_position][0]:
                            temp_energy_dic[preal_position] = [energy,mreal_position]

                poffset = 0  #no gap
                moffset =0
                end = min(OLIGOLEN, a_length)

                energy = start_energy
                # phase 2: the alignment is completely inside of the oligo selection
                # (alignment no longer than the window): every window that still contains
                # the whole alignment gets the full start_energy
                if (end == a_length):
                    start = p_start+end-OLIGOLEN
                    # max(0, -start) keeps the window start from going below position 0
                    for i in range(max(0,-(start) ), OLIGOLEN-end):
                        preal_position = start + i
                        if preal_position >gene_end:
                            break

                        if energy < temp_energy_dic[preal_position][0]:
                            # extrapolate the hit coordinate linearly from the alignment start
                            if m_start < m_end:
                                m_location = m_start + preal_position - p_start
                            else:
                                m_location = m_start - (preal_position - p_start)
                            temp_energy_dic[preal_position] = [energy,m_location]

                # phase 3: windows starting before the alignment, covering only its first
                # `pos` columns; walking pos downwards removes one ADD term per step
                for pos in range(end-1, max(WORD, OLIGOLEN - p_start)-1 , -1): #version 2.7 correction
                    preal_position = p_start+ pos -OLIGOLEN
                    if preal_position > gene_end:
                        break
                    energy = energy - energy_list[pos][ADD]
                    if energy < temp_energy_dic[preal_position][0]:
                        if m_start < m_end:
                            m_location = m_start + preal_position - p_start
                        else:
                            m_location = m_start - (preal_position - p_start)
                        temp_energy_dic[preal_position] = [energy,m_location]

            # merge this hit's per-position minima into the overall result:
            # slot 0 tracks the minimum over all hits, followed by one record per hit
            for key in temp_energy_dic.keys():
                if temp_energy_dic[key] != [0,-1]:
                    try:
                        energy_dic[key][0] = min(energy_dic[key][0], temp_energy_dic[key][0])
                    except KeyError:
                        # first hit recorded at this position
                        energy_dic[key] = [temp_energy_dic[key][0]]
                    energy_dic[key].append([hit_id, temp_energy_dic[key][0], temp_energy_dic[key][1]])
    return energy_dic #{pos:[min_energy,[hit_id, energy, location],[hit_id, energy, location],...], ....}
コード例 #2
0
        "blastall -p blastn -a 2 -i $inputf -d $genomef -S $STRAND  -F F  -v $MAX -b $MAX -G 5 -E 0 -o oligo_blastout"
    )
    os.system("chmod u+w $genomef" + ".*")
    os.system("rm $genomef" + ".*")

    resultf = open("oligo_blastout", "r")

    parse_dic = {}
    content = []
    line = " "
    while line != "":
        line = resultf.readline()
        if line[:6] == "BLASTN":
            if content == []:
                continue
            blast_parse.parse_alignment(content, parse_dic, 1)
            Query = parse_dic.keys()[0]
            print Query
            out.write(
                ">"
                + Query
                + "\t"
                + str(gc_dic[Query])
                + "\t"
                + str(repeat_dic[Query])
                + "\t"
                + str(sw_dic[Query])
                + "\n"
            )
            oligo_dup_output(out, parse_dic)
            content = [line]