Beispiel #1
0
def bSite(path, params):
    """Aligns binding site residues of GPCRs and kinases.

    Reads the tab-separated table at ``path`` (first line is a header, the
    first two columns of every other line are protein accessions), aligns the
    binding-site sequences of each pair with ``needle.ungapped_needle`` and
    rewrites the file in place with two extra columns, ``bs_seq_id`` and
    ``bs_seq_sim``.

    A pair is aligned only if both accessions are found in the kinase
    dictionary, or both are found in the GPCR dictionary; every other pair
    gets ``None`` in both new columns.

    Parameters:
        path   -- path of the table; it is overwritten with the result.
        params -- dict providing 'ksAccFile', 'gsAccFile', 'ksAliFile',
                  'gsAliFile', 'sitePositions' (with keys 'kinase' and
                  'gpcr') and 'needlepath'.

    Raises:
        IndexError -- if the file is empty or a data line has fewer than two
                      tab-separated columns.  The file is left untouched in
                      that case because output is only written at the end.
    """
    # NOTE(review): queryDevice and parse are not referenced by name below;
    # the imports are kept in case they are needed for their side effects.
    import queryDevice
    import parse
    import needle
    ksAccFile = params['ksAccFile']
    gsAccFile = params['gsAccFile']
    ksAliFile = params['ksAliFile']
    gsAliFile = params['gsAliFile']
    sitePositions = params['sitePositions']
    # parseAcc / parseKSAli / parseGSAli / sliceBS are expected to be
    # defined elsewhere in this module.
    kinaseAcc = parseAcc(ksAccFile)
    gpcrAcc = parseAcc(gsAccFile)
    kinaseAli = parseKSAli(ksAliFile)
    gpcrAli = parseGSAli(gsAliFile)
    kinaseDict = sliceBS(kinaseAcc, kinaseAli, sitePositions['kinase'])
    gpcrDict = sliceBS(gpcrAcc, gpcrAli, sitePositions['gpcr'])

    # Processing the table of homologous pairs.
    with open(path, 'r') as inFile:
        lines = inFile.readlines()

    # Output is collected in memory and written once at the end, so that a
    # failure while aligning does not leave the input file truncated.
    outLines = ["%s\tbs_seq_id\tbs_seq_sim\n" % lines[0].rstrip('\n')]
    # Cache of (seqSim, seqId) per accession pair, to align each pair once.
    seqIdDict = {}
    for line in lines[1:]:
        # Strip the newline before splitting so that an accession in the
        # last column does not carry a trailing '\n'.
        row = line.rstrip('\n')
        elements = row.split('\t')
        proteinAcc_1 = elements[0]
        proteinAcc_2 = elements[1]
        pairName = '_'.join([proteinAcc_1, proteinAcc_2])
        if pairName not in seqIdDict:
            # Both partners must come from the same family; requiring only
            # one of them (as an 'or' would) raises KeyError on the lookup.
            if proteinAcc_1 in kinaseDict and proteinAcc_2 in kinaseDict:
                seqPair = (kinaseDict[proteinAcc_1], kinaseDict[proteinAcc_2])
            elif proteinAcc_1 in gpcrDict and proteinAcc_2 in gpcrDict:
                seqPair = (gpcrDict[proteinAcc_1], gpcrDict[proteinAcc_2])
            else:
                seqPair = None
            if seqPair is None:
                seqIdDict[pairName] = (None, None)
            else:
                # Align the sequences using needle from EMBOSS.
                needleReport = needle.ungapped_needle(
                    params['needlepath'], seqPair[0], seqPair[1])
                (seqSim, seqId) = needle.parseNeedle(needleReport)
                seqIdDict[pairName] = (seqSim, seqId)
        (seqSim, seqId) = seqIdDict[pairName]
        outLines.append("%s\t%s\t%s\n" % (row, seqId, seqSim))

    with open(path, 'w') as out:
        out.writelines(outLines)
Beispiel #2
0
def fullSeq(path, params):
    """Append full-sequence identity/similarity columns to a pair table.

    Reads the tab-separated table at ``path`` (first line is a header, the
    first two columns of every other line are protein accessions), fetches
    both protein sequences through ``queryDevice.queryDevice``, aligns them
    with ``needle.needle`` and rewrites the file in place with two extra
    columns, ``seq_id`` and ``seq_sim``.  Pairs for which either sequence is
    not returned by the query get ``None`` in both columns.

    Parameters:
        path   -- path of the table; it is overwritten with the result.
        params -- dict passed on to ``queryDevice.queryDevice``; must also
                  provide 'needlepath'.

    Raises:
        IndexError -- if the file is empty or a data line has fewer than two
                      tab-separated columns.  The file is left untouched in
                      that case because output is only written at the end.
    """
    import queryDevice
    import needle

    with open(path, 'r') as inFile:
        lines = inFile.readlines()

    # Output is collected in memory and written once at the end, so that a
    # failed query or alignment does not leave the input file truncated.
    outLines = ["%s\tseq_id\tseq_sim\n" % lines[0].rstrip('\n')]
    # Cache of (seqSim, seqId) per accession pair, to align each pair once.
    seqIdDict = {}
    for line in lines[1:]:
        # Strip the newline before splitting so that an accession in the
        # last column does not carry a trailing '\n' into the SQL query.
        row = line.rstrip('\n')
        elements = row.split('\t')
        proteinAcc_1 = elements[0]
        proteinAcc_2 = elements[1]
        pairName = '_'.join([proteinAcc_1, proteinAcc_2])
        if pairName not in seqIdDict:
            # NOTE(review): the accessions are interpolated into the SQL
            # text unescaped.  If the table can contain untrusted values
            # this should become a parameterized query, provided
            # queryDevice supports one.
            data = queryDevice.queryDevice("SELECT td.protein_sequence, td.protein_accession FROM target_dictionary td WHERE td.protein_accession IN ('%s')"% "','".join([proteinAcc_1, proteinAcc_2]), params)
            # Map accession -> sequence for the rows that came back.
            lkp = dict((entry[1], entry[0]) for entry in data)
            if proteinAcc_1 in lkp and proteinAcc_2 in lkp:
                # Align the sequences using needle from EMBOSS.
                needleReport = needle.needle(
                    params['needlepath'], lkp[proteinAcc_1], lkp[proteinAcc_2])
                # Parse the output of the alignment.
                (seqSim, seqId) = needle.parseNeedle(needleReport)
                seqIdDict[pairName] = (seqSim, seqId)
            else:
                seqIdDict[pairName] = (None, None)
        (seqSim, seqId) = seqIdDict[pairName]
        outLines.append("%s\t%s\t%s\n" % (row, seqId, seqSim))

    with open(path, 'w') as out:
        out.writelines(outLines)
Beispiel #3
0
def pfam_a(path, params):
    """Append Pfam-domain identity/similarity columns to a pair table.

    Reads the tab-separated table at ``path`` (first line is a header, the
    first two columns of every other line are protein accessions).  For each
    pair it queries the Pfam-A domain, its start/end and the protein sequence
    of both proteins, cuts the domain out of each sequence, aligns the two
    domain sequences with ``needle.needle`` and rewrites the file in place
    with four extra columns: ``dom_seq_id``, ``dom_seq_sim``, ``pfam_1`` and
    ``pfam_2``.  Pairs for which either protein has no row in the query
    result get ``None`` in all four columns.

    Parameters:
        path   -- path of the table; it is overwritten with the result.
        params -- dict passed on to ``queryDevice.queryDevice``; must also
                  provide 'needlepath'.

    Raises:
        IndexError -- if the file is empty or a data line has fewer than two
                      tab-separated columns.  The file is left untouched in
                      that case because output is only written at the end.
    """
    import queryDevice
    import needle

    # One template for both proteins; the accession is filled in twice.
    # NOTE(review): the accession is interpolated into the SQL text
    # unescaped.  If the table can contain untrusted values this should
    # become a parameterized query, provided queryDevice supports one.
    domainQuery = """SELECT DISTINCT mp.pfam_a,
                             pd.start, pd.end, td.protein_sequence, td.protein_accession
                         FROM map_pfam mp
                         JOIN pfam_domains pd
                         ON mp.pfam_a = pd.pfam_a
                         JOIN target_dictionary td
                         ON td.protein_accession = mp.protein_accession
                         WHERE mp.protein_accession = '%s'
                         AND pd.protein_accession = '%s' """

    with open(path, 'r') as inFile:
        lines = inFile.readlines()

    # Output is collected in memory and written once at the end, so that a
    # failed query or alignment does not leave the input file truncated.
    outLines = ["%s\tdom_seq_id\tdom_seq_sim\tpfam_1\tpfam_2\n" % lines[0].rstrip('\n')]
    # Cache of (seqSim, seqId, pfam_1, pfam_2) per accession pair; pairs with
    # missing data are cached as well so they are not queried again.
    seqIdDict = {}
    for line in lines[1:]:
        # Strip the newline before splitting so that an accession in the
        # last column does not carry a trailing '\n' into the SQL query.
        row = line.rstrip('\n')
        elements = row.split('\t')
        proteinAcc_1 = elements[0]
        proteinAcc_2 = elements[1]
        pairName = '_'.join([proteinAcc_1, proteinAcc_2])
        if pairName not in seqIdDict:
            data_1 = queryDevice.queryDevice(
                domainQuery % (proteinAcc_1, proteinAcc_1), params)
            data_2 = queryDevice.queryDevice(
                domainQuery % (proteinAcc_2, proteinAcc_2), params)

            # Map accession -> (domain sequence, pfam name).  If a protein
            # has several rows, the last one returned wins.
            lkp = {}
            for entry in data_1 + data_2:
                (pfam, start, end, proteinSeq, acc) = (
                    entry[0], entry[1], entry[2], entry[3], entry[4])
                # NOTE(review): this slice treats 'start' as a 0-based
                # offset.  If pfam_domains stores 1-based inclusive
                # coordinates the first domain residue is dropped - confirm
                # against the schema.
                lkp[acc] = (proteinSeq[start:end], pfam)

            if proteinAcc_1 in lkp and proteinAcc_2 in lkp:
                (seq_1, pfam_1) = lkp[proteinAcc_1]
                (seq_2, pfam_2) = lkp[proteinAcc_2]
                # Align the sequences using needle from EMBOSS.
                needleReport = needle.needle(params['needlepath'], seq_1, seq_2)
                # Parse the output of the alignment.
                (seqSim, seqId) = needle.parseNeedle(needleReport)
                seqIdDict[pairName] = (seqSim, seqId, pfam_1, pfam_2)
            else:
                seqIdDict[pairName] = (None, None, None, None)
        (seqSim, seqId, pfam_1, pfam_2) = seqIdDict[pairName]
        outLines.append("%s\t%s\t%s\t%s\t%s\n" % (row, seqId, seqSim, pfam_1, pfam_2))

    with open(path, 'w') as out:
        out.writelines(outLines)