Example #1
0
def needle_diff(left_struc, right_struc, key, minimal=True):
    '''Returns a diff between ``left_struc`` and ``right_struc``.

    When ``left_struc`` is exactly a ``list`` or ``tuple``, the two
    structures are aligned with :func:`needle.needle` and
    :func:`needle.backtrack`, and the diff is built from the first
    alignment whose left-hand side begins with ``left_struc``.  In every
    other case (``left_struc`` is not a list/tuple, or no such alignment
    is found) the result of :func:`keyset_diff` is returned instead.

    ``right_struc`` is asserted to be a ``list`` or ``tuple`` whenever
    ``left_struc`` is one.

    This function probably shouldn't be called directly.  Instead, use
    :func:`udiff`, which will call :func:`keyset_diff` if appropriate
    anyway.
    '''
    sequence_types = (list, tuple)
    if type(left_struc) not in sequence_types:
        return keyset_diff(left_struc, right_struc, key, minimal)

    assert type(right_struc) in sequence_types

    # Only the first item of the triple is passed on to backtrack(); the
    # aligned sequences actually used below come from the candidates loop.
    matrix, _, _ = needle.needle(left_struc, right_struc, needle_penalty)
    candidates = needle.backtrack(left_struc, right_struc, matrix,
                                  needle_penalty)

    left_len = len(left_struc)
    for aligned_left, aligned_right in candidates:
        if aligned_left[:left_len] == left_struc:
            break
    else:
        # No candidate alignment starts with left_struc itself.
        return keyset_diff(left_struc, right_struc, key, minimal)

    stanzas = []
    for index, left_item in enumerate(aligned_left):
        sub_key = key + [index]
        right_item = aligned_right[index]
        if isinstance(left_item, needle.Gap):
            # Gap on the left: emit the key together with the right value.
            stanzas.append([sub_key, right_item])
        elif isinstance(right_item, needle.Gap):
            # Gap on the right: emit the key alone.
            stanzas.append([sub_key])
        else:
            # Both sides present: recurse into the pair.
            nested = diff(left_item, right_item, key=sub_key,
                          minimal=minimal, verbose=False)
            stanzas.extend(nested)
    return stanzas
Example #2
0
def fullSeq(path, params):
    '''Append full-sequence alignment columns to a tab-separated file.

    The file at ``path`` is read completely and then rewritten in place
    with two extra columns, ``seq_id`` and ``seq_sim``.  The first line
    is treated as a header.  On every following line the first two
    tab-separated fields are taken as protein accessions; their
    sequences are fetched from ``target_dictionary`` through
    ``queryDevice.queryDevice``, aligned with ``needle.needle`` and the
    report is parsed with ``needle.parseNeedle``.  If either accession
    is missing from the query result, both new columns hold ``None``.
    Results are cached per accession pair, so a repeated pair is queried
    and aligned only once.

    The file is only overwritten after every row has been processed, so
    an error part-way through (database failure, a row with fewer than
    two fields) leaves the original file intact.

    :param path: path of the tab-separated file to annotate in place.
    :param params: passed through to ``queryDevice.queryDevice``; must
        also provide ``params['needlepath']``, which is handed to
        ``needle.needle``.
    :returns: ``None``.  An empty input file is left untouched.
    :raises IndexError: if a data row has fewer than two fields.
    '''
    import queryDevice
    import needle

    with open(path, 'r') as inFile:
        lines = inFile.readlines()
    if not lines:
        # Nothing to annotate; do not truncate or create output.
        return

    outLines = ["%s\tseq_id\tseq_sim\n" % lines[0].rstrip('\n')]
    seqIdDict = {}
    for line in lines[1:]:
        # Strip the newline *before* splitting so that a two-column row
        # does not leave a trailing '\n' glued to the second accession.
        row = line.rstrip('\n')
        elements = row.split('\t')
        proteinAcc_1 = elements[0]
        proteinAcc_2 = elements[1]
        pairName = '_'.join([proteinAcc_1, proteinAcc_2])

        if pairName not in seqIdDict:
            # NOTE(review): the accessions are interpolated straight into
            # the SQL text.  If the input file can contain untrusted
            # values this is injectable; switching to a parameterized
            # query depends on the queryDevice API, which is not visible
            # from here.
            data = queryDevice.queryDevice(
                "SELECT td.protein_sequence, td.protein_accession "
                "FROM target_dictionary td "
                "WHERE td.protein_accession IN ('%s')"
                % "','".join([proteinAcc_1, proteinAcc_2]), params)
            # Map accession -> sequence.
            lkp = {}
            for entry in data:
                lkp[entry[1]] = entry[0]

            if proteinAcc_1 in lkp and proteinAcc_2 in lkp:
                # Align the sequences using needle from EMBOSS, then
                # parse the output of the alignment.
                needleReport = needle.needle(params['needlepath'],
                                             lkp[proteinAcc_1],
                                             lkp[proteinAcc_2])
                (seqSim, seqId) = needle.parseNeedle(needleReport)
                seqIdDict[pairName] = (seqSim, seqId)
            else:
                # Remember the miss so the pair is not queried again.
                seqIdDict[pairName] = (None, None)

        (seqSim, seqId) = seqIdDict[pairName]
        outLines.append("%s\t%s\t%s\n" % (row, seqId, seqSim))

    with open(path, 'w') as out:
        out.writelines(outLines)
Example #3
0
def needle_diff(left_struc, right_struc, key, minimal=True):
    '''Returns a diff between ``left_struc`` and ``right_struc``.

    When ``left_struc`` is exactly a ``list`` or ``tuple``, the two
    structures are aligned with :func:`needle.needle` and
    :func:`needle.backtrack`, and the diff is built from the first
    alignment whose left-hand side begins with ``left_struc``.  In every
    other case (``left_struc`` is not a list/tuple, or no such alignment
    is found) the result of :func:`keyset_diff` is returned instead.

    ``right_struc`` is asserted to be a ``list`` or ``tuple`` whenever
    ``left_struc`` is one.

    This function probably shouldn't be called directly.  Instead, use
    :func:`udiff`, which will call :func:`keyset_diff` if appropriate
    anyway.
    '''
    sequence_types = (list, tuple)
    if type(left_struc) not in sequence_types:
        return keyset_diff(left_struc, right_struc, key, minimal)

    assert type(right_struc) in sequence_types

    # Only the first item of the triple is passed on to backtrack(); the
    # aligned sequences actually used below come from the candidates loop.
    matrix, _, _ = needle.needle(left_struc, right_struc, needle_penalty)
    candidates = needle.backtrack(left_struc, right_struc, matrix,
                                  needle_penalty)

    left_len = len(left_struc)
    for aligned_left, aligned_right in candidates:
        if aligned_left[:left_len] == left_struc:
            break
    else:
        # No candidate alignment starts with left_struc itself.
        return keyset_diff(left_struc, right_struc, key, minimal)

    stanzas = []
    for index, left_item in enumerate(aligned_left):
        sub_key = key + [index]
        right_item = aligned_right[index]
        if isinstance(left_item, needle.Gap):
            # Gap on the left: emit the key together with the right value.
            stanzas.append([sub_key, right_item])
        elif isinstance(right_item, needle.Gap):
            # Gap on the right: emit the key alone.
            stanzas.append([sub_key])
        else:
            # Both sides present: recurse into the pair.
            nested = diff(left_item, right_item, key=sub_key,
                          minimal=minimal, verbose=False)
            stanzas.extend(nested)
    return stanzas
Example #4
0
def pfam_a(path, params):
    '''Append Pfam-domain alignment columns to a tab-separated file.

    The file at ``path`` is read completely and then rewritten in place
    with four extra columns: ``dom_seq_id``, ``dom_seq_sim``, ``pfam_1``
    and ``pfam_2``.  The first line is treated as a header.  On every
    following line the first two tab-separated fields are taken as
    protein accessions.  For each accession the Pfam domain rows are
    fetched through ``queryDevice.queryDevice``, the domain sub-sequence
    is sliced out of the full protein sequence, and the two domain
    sequences are aligned with ``needle.needle``; the report is parsed
    with ``needle.parseNeedle``.  If no domain row is found for either
    accession, all four new columns hold ``None``.  Results (including
    misses) are cached per accession pair.

    The file is only overwritten after every row has been processed, so
    an error part-way through leaves the original file intact.

    :param path: path of the tab-separated file to annotate in place.
    :param params: passed through to ``queryDevice.queryDevice``; must
        also provide ``params['needlepath']``, which is handed to
        ``needle.needle``.
    :returns: ``None``.  An empty input file is left untouched.
    :raises IndexError: if a data row has fewer than two fields.
    '''
    import queryDevice
    import needle

    # One template serves both accessions; each accession is substituted
    # into both placeholders.
    # NOTE(review): values are interpolated straight into the SQL text.
    # If the input file can contain untrusted values this is injectable;
    # a parameterized query depends on the queryDevice API, which is not
    # visible from here.
    domainQuery = """SELECT DISTINCT mp.pfam_a,
                             pd.start, pd.end, td.protein_sequence, td.protein_accession 
                         FROM map_pfam mp 
                         JOIN pfam_domains pd 
                         ON mp.pfam_a = pd.pfam_a 
                         JOIN target_dictionary td 
                         ON td.protein_accession = mp.protein_accession
                         WHERE mp.protein_accession = '%s' 
                         AND pd.protein_accession = '%s' """

    with open(path, 'r') as inFile:
        lines = inFile.readlines()
    if not lines:
        # Nothing to annotate; do not truncate or create output.
        return

    outLines = ["%s\tdom_seq_id\tdom_seq_sim\tpfam_1\tpfam_2\n"
                % lines[0].rstrip('\n')]
    seqIdDict = {}
    for line in lines[1:]:
        # Strip the newline *before* splitting so that a two-column row
        # does not leave a trailing '\n' glued to the second accession.
        row = line.rstrip('\n')
        elements = row.split('\t')
        proteinAcc_1 = elements[0]
        proteinAcc_2 = elements[1]
        pairName = '_'.join([proteinAcc_1, proteinAcc_2])

        if pairName not in seqIdDict:
            data_1 = queryDevice.queryDevice(
                domainQuery % (proteinAcc_1, proteinAcc_1), params)
            data_2 = queryDevice.queryDevice(
                domainQuery % (proteinAcc_2, proteinAcc_2), params)

            # Map accession -> (domain sequence, pfam id).  When several
            # rows come back for one accession, the last row wins.
            lkp = {}
            for entry in data_1 + data_2:
                (pfam, start, end, proteinSeq, acc) = (
                    entry[0], entry[1], entry[2], entry[3], entry[4])
                # NOTE(review): this slice treats ``start`` as 0-based and
                # ``end`` as exclusive -- confirm against the coordinate
                # convention of the pfam_domains table.
                lkp[acc] = (proteinSeq[start:end], pfam)

            if proteinAcc_1 in lkp and proteinAcc_2 in lkp:
                (seq_1, pfam_1) = lkp[proteinAcc_1]
                (seq_2, pfam_2) = lkp[proteinAcc_2]
                # Align the sequences using needle from EMBOSS, then
                # parse the output of the alignment.
                needleReport = needle.needle(params['needlepath'],
                                             seq_1, seq_2)
                (seqSim, seqId) = needle.parseNeedle(needleReport)
                seqIdDict[pairName] = (seqSim, seqId, pfam_1, pfam_2)
            else:
                # Remember the miss so the pair is not queried again.
                seqIdDict[pairName] = (None, None, None, None)

        (seqSim, seqId, pfam_1, pfam_2) = seqIdDict[pairName]
        outLines.append("%s\t%s\t%s\t%s\t%s\n"
                        % (row, seqId, seqSim, pfam_1, pfam_2))

    with open(path, 'w') as out:
        out.writelines(outLines)