def getSeqbyName(): if len(sys.argv) < 4: print 'getSeqbyName: get msa sequence without gaps by searching fasta name' print 'example: python utils_msa.py getseqbyname PF07714_full.fa BTK_HUMAN\n' return msafile = sys.argv[2] msaheader = sys.argv[3].upper() print 'msa file: %s' % msafile print 'target entry: %s' % msaheader msaseq = '' m = msa(msafile) m.setTarget(msaheader) for s in m.msaArray: if msaheader in s[0]: msaheader = s[0] msaseq = s[1] outputSeq = [] for a in msaseq: if a in ['.', '-', '_']: continue else: outputSeq.append(a) print msaheader print ''.join(outputSeq)
def test_triple_compact(self):
    """Compacting a three-way alignment collapses the shared "BC" core."""
    from msa import Disjunct as d, Sub as s
    expected = [
        d((d((s("A"),), ()),), ()),
        s("BC"),
        d((d((), (s("D"),)),), (s("X"),)),
    ]
    self.assertEqual(msa.compact(msa.msa(["ABC", "BCD", "BCX"])), expected)
def msai2resi(): if len(sys.argv) < 4: print 'msai2resi: output the mapping between msa position index and pdb residue number' print 'example:python utils_msa.py msai2resi PF07714_full.fa BTK_HUMAN 1k2p.pdb\n' print 'output: PF07714_full.fa.1k2p.pdb.map' return msafile = sys.argv[2] target = sys.argv[3] pdbfile = sys.argv[4] outfile = msafile+'.'+pdbfile+'.map' print 'msafile: %s\ntarget header: %s\npdbfile: %s\noutput file: %s' % (msafile, target, pdbfile, outfile) m = msa(msafile) p = protein(pdbfile) rtmap = m.getResiTargetMap(p, target) if len(rtmap) < 1: print 'error occoured in generating rtmap' return #print '%s: %s' % (tvar, repr(rtmap[tvar])) # construct trmap from rtmap # 3128: (B641, 'R') trmap = {} #trmap = {v: k for k, v in rtmap.iteritems()} fout = open(outfile ,'w') for k in rtmap: msai, resn = rtmap[k] if msai in trmap: print 'error. duplicate key [%d] in rtmap' % msai return trmap[msai] = (k, resn) fout.write('%d %d %d' % (msai, k, resn)) fout.close()
def ncg2sdiicol(): if len(sys.argv)<7: print 'ncg2sdiicol: write selected MSA column into .sdiicol file' print 'python utils_msa.py ncg2sdiicol 1aps_A_1_97.rpdb.tip 1aps_A_1_97.rpdb.tip.ncg PF00708_full.txt.rseq PF00708_full.txt.all_2_sdiii ACYP2_HORSE 2' return pdbfile = sys.argv[2] # pdb name ncgfile = sys.argv[3] # hcg msafile = sys.argv[4] # msa (full or reduced) sdiifile = sys.argv[5] # sdii target = sys.argv[6] # target name orderlist = [int(i) for i in sys.argv[7].split(',')] outfile = pdbfile[0:4]+'_'+msafile[0:7]+'.sdiicol'# new substitution matrix print 'pdbfile :%s' % pdbfile print 'ncgfile :%s' % ncgfile print 'msafile :%s' % msafile print 'sdiifile :%s' % sdiifile print 'uniprot name :%s' % target print 'ncg order list : [%s]' % repr(orderlist) print 'outfile: %s' % outfile # get msa in matrix format m = msa(msafile) msaMatrix = np.array([list(s[1]) for s in m.msaArray]) # matrix format of msa #for i in xrange(0, len(seqs)): # print seqs[i] print 'msa matrix: ' + repr(msaMatrix.shape) # get resi -> msai map p = protein(pdbfile) rtmap = m.getResiTargetMap(p, target) # ('A9', (14, 'V')) : (resi+chain, (MSA index, resn)) sdiidict = loadsdii(sdiifile) # key: 39-140-210, value = 0.0788593466276019 msaGroupArray = ncg2msa(ncgfile, rtmap) # unsorted [[86, 83, 198, 127, 120], [138, 76, 82, 127, 132]] # output msa column set colset = set() for i in orderlist: for g in msaGroupArray: rg = g[0:i] # get ith order contact group rg.sort() # for generating key sdiikey = '-'.join([str(r) for r in rg]) if sdiikey not in sdiidict: #print 'ncg2sdiicol(): discard group: %s for low sdii' % sdiikey continue print (sdiikey, sdiidict[sdiikey]) for resi in rg: # for significant ncg, add corresponding MSA column index colset.add(resi) print 'ncg2sdiicol():writing %s: %s' % (outfile, repr(colset)) fout = open(outfile, 'w') fout.write(' '.join([str(c) for c in colset])) fout.close()
def MSAReduction(): if len(sys.argv) < 4: print 'msareduction: reduce columns and rows by cutoffs' print 'example: python utils_msa.py msareduction PF07714_full.fa BTK_HUMAN 0.2 0.62\n' return msafile = sys.argv[2] target = sys.argv[3] gap_cutoff = float(sys.argv[4]) # gap cutoff hamming_cutoff = float(sys.argv[5]) # hamming cutoff print 'msa file: %s' % msafile print 'target: %s' % target print 'gap cutoff: %f' % gap_cutoff print 'hamming cutoff: %s' % hamming_cutoff print 'loading msa file ...' m = msa(msafile) m.setTarget(target) (seqboard, scoreboard, column_index, row_index) = m.get_msaboard_RC_RR(gap_cutoff, hamming_cutoff) ''' print 'score matrix:' for i in xrange(0, len(score)): print repr(score[i]) print 'column index: %s' % repr(column_index) print 'row index: %s' % repr(row_index) ''' #seqs = np.array(seqboard)[row_index,:][:,column_index] seqs = np.array(seqboard)[row_index,:] # complete column reduced row fout = open(msafile+'.rseq', 'w') for i in xrange(0, len(row_index)): header = m.msaArray[row_index[i]][0] fout.write('>'+header+'\n') fout.write(''.join(seqs[i,:])+'\n') fout.close() print 'save reduced sequences to file: [%s]' % (msafile+'.rseq') #np.savetxt(msafile+'.rseq', seqs, delimiter='') score = np.array(scoreboard)[row_index,:][:,column_index] #np.savetxt(msafile+'.score', score, fmt='%.8', delimiter=',') print 'save score to file: [%s]' % (msafile+'.score') np.savetxt(msafile+'.score', score, fmt='%d', delimiter=',') print 'save reduced row indices to file: [%s]' % (msafile+'.row') fout = open(msafile+'.row', 'w') fout.write(','.join([str(i) for i in row_index])+'\n') fout.close() print 'save reduced column indices to file: [%s]' % (msafile+'.col') fout = open(msafile+'.col', 'w') fout.write(','.join([str(i) for i in column_index])+'\n') fout.close()
def reduceByWeight(): if len(sys.argv) < 5: print 'reduceByWeight: reduce a msa file by weighing and reduce scale (x%)' print 'example: python utils_msa.py reducebyweight 1k2p_PF07714_full.fa test.weight BTK_HUMAN 0.5\n' return msafile = sys.argv[2] weightfile = sys.argv[3] target = sys.argv[4] scale = float(sys.argv[5]) outfile ='%s.r%d' % (msafile, scale*100) print 'msa file: %s' % msafile print 'weight file: %s' % weightfile print 'target: %s' % target print 'reduce scale: %f' % scale print 'output file: %s' % outfile weight = np.loadtxt(weightfile, delimiter=',') print 'weight loaded : %s' % repr(weight.shape) print 'loading msa file ...' m = msa(msafile) m.setTarget(target) rlist=[] for i in xrange(0, len(weight)): rlist.append((i, weight[i])) # 0 -> len(weight) # small -> large sort_rlist = sorted(rlist, key=lambda x: x[1]) #for k in xrange(0, len(sort_rlist)): # print '[%d]:[%s]' % (k, repr(sort_rlist[k])) goal = int(len(weight) * (1-scale)) target_flag = False fout = open(outfile, 'w') # save msa sequences with large weights print 'Writing output ...' for k in xrange(goal, len(weight)): (index, w) = sort_rlist[k] #print '%d, %f' % (index, w) if m.msaArray[index][0] == m.target[0]: target_flag = True fout.write('>%s\n%s\n' % (m.msaArray[index][0], m.msaArray[index][1])) if target_flag == False: print 'Inserting target sequence: %s' % m.target[0] fout.write('>%s\n%s\n' % (m.target[0], m.target[1])) fout.close() print 'reduced msa: [%s]\nlen: %d' % (outfile, goal)
def sdii2resi(): if len(sys.argv) < 5: print 'resi2target: given a residue number output the corresponding position in target msa' print 'example:python utils_msa.py sdii2resi PF07714_full.fa.r50 BTK_HUMAN 1k2p.pdb PF07714_full.fa.r50.3128_3_sdii\n' print 'output: PF07714_full.fa.r50.3128_3_sdii_resi' return msafile = sys.argv[2] target = sys.argv[3] pdbfile = sys.argv[4] sdiifile = sys.argv[5] print 'msafile: %s\ntarget header: %s\npdbfile: %s\nsdii file: %s' % (msafile, target, pdbfile, sdiifile) m = msa(msafile) p = protein(pdbfile) rtmap = m.getResiTargetMap(p, target) if len(rtmap) < 1: print 'error occoured in generating rtmap' return #print '%s: %s' % (tvar, repr(rtmap[tvar])) # construct trmap from rtmap # 3128: (B641, 'R') trmap = {} #trmap = {v: k for k, v in rtmap.iteritems()} for k in rtmap: msai, resn = rtmap[k] if msai in trmap: print 'error. duplicate key [%d] in rtmap' % msai return trmap[msai] = (k, resn) #print trmap # read sdii file with open(sdiifile) as f: sdiilines = f.readlines() outfile = sdiifile + '_resi' fout = open(outfile, 'w') # 52 [pid:20029] 926-3089-3128 0.001106226720675 count = 0 for line in sdiilines: count += 1 print '%d/%d processed ...' % (count, len(sdiilines)) strArr = line.strip().split(' ') msailist = strArr[2].split('-') sdiivalue = strArr[3] fout.write('%s %s\n' % ('-'.join([repr(trmap[int(i)]) for i in msailist]), sdiivalue)) fout.close() print 'done.\noutput file: [%s]' % outfile
def main(): # test msa weight msafile = 'test_msa.txt' target = '1k2p' m = msa(msafile, target) score, varlist = m.msaboard(0.0, 0.5) print score sdii_core = sdii(score) print sdii_core.w_entropy(sdii_core.data[:,[0,1]].T) weight = np.loadtxt('test_msa.weight', delimiter=',') sdii_core.setWeight(weight) print sdii_core.w_entropy(sdii_core.data[:,[0,1]].T) print sdii_core.weight print 'sum(weight): %f' % sum(sdii_core.weight)
def searchpdbseq(): if len(sys.argv) < 2: print 'searchpdbseq: locate pdb sequence in MSA' print 'example: python utils_msa.py searchpdbseq PF07714_full.fa 1T49_A.pdb\n' return msafile = sys.argv[2] target = sys.argv[3] print 'msa file: %s' % msafile print 'pdb target: %s' % target m = msa(msafile) p = protein(target) if m.searchTargetPDB(p)==0: print 'cannot locate pdb sequence in MSA'
def reduceByHamming(): if len(sys.argv) < 3: print 'reduceByHamming: reduce a msa file by selecting sequences that have sequential similarity < 62% (hamming dist > 0.38)' print 'example: python utils_msa.py reducebyhamming PF07714_full.fa BTK_HUMAN\n' return msafile = sys.argv[2] target = sys.argv[3] outfile = '%s.s62' % msafile print 'msa file: %s\ntarget header: %s\noutfile: %s\n' % (msafile, target, outfile) print 'loading msa ...' m = msa(msafile) m.setTarget(target) print m.hammingReduction(outfile, 0.62)
def main(filename, algorithm_type, gap_penalty_open, gap_penalty_extension):
    """Read sequences from *filename* and run the selected alignment algorithm.

    algorithm_type:
      'nw'   - Needleman-Wunsch global alignment
      'nwap' - Needleman-Wunsch with affine gap penalty (uses the extension
               penalty as well)
      'sw'   - Smith-Waterman local alignment
      'hb'   - Hirschberg (no score is produced)
      'msa'  - progressive multiple alignment over all input chains
    Results are printed through the project `io` helper module.
    """
    with open(filename, 'r') as content_file:
        content = content_file.read()
    chains = io.read_input(content)
    if algorithm_type == 'nw':
        alignments, score = needleman_wunsch(chains[0].chain, chains[1].chain, blosum_62_scoring, gap_penalty_open)
        io.print_output(chains[0].name, chains[1].name, alignments[0], alignments[1], score)
    elif algorithm_type == 'nwap':
        alignments, score = nw_affine_penalty(chains[0].chain, chains[1].chain, blosum_62_scoring, gap_penalty_open, gap_penalty_extension)
        io.print_output(chains[0].name, chains[1].name, alignments[0], alignments[1], score)
    elif algorithm_type == 'sw':
        alignments, score = smith_waterman(chains[0].chain, chains[1].chain, blosum_62_scoring, gap_penalty_open)
        io.print_output(chains[0].name, chains[1].name, alignments[0], alignments[1], score)
    elif algorithm_type == 'hb':
        # Hirschberg returns alignments only; no score is reported.
        alignments = hirschberg(chains[0].chain, chains[1].chain, blosum_62_scoring, gap_penalty_open)
        io.print_output(chains[0].name, chains[1].name, alignments[0], alignments[1], None)
    elif algorithm_type == 'msa':
        chains_raw = [c.chain for c in chains]
        alignments = msa(chains_raw, gap_penalty_open)
        chains_str = ["".join(c.chain) for c in chains]
        io.print_output_multi(chains_str, alignments)
    else:
        # Fixed typo in the message ('algorythm' -> 'algorithm').
        assert False, 'Wrong algorithm type'
def getMsabyName(): if len(sys.argv) < 4: print 'getMsabyName: get msa sequence with gaps by searching fasta name' print 'example: python utils_msa.py getmsabyname PF07714_full.fa BTK_HUMAN\n' return msafile = sys.argv[2] msaheader = sys.argv[3].upper() print 'msa file: %s' % msafile print 'target entry: %s' % msaheader msaseq = '' m = msa(msafile) m.setTarget(msaheader) for s in m.msaArray: if msaheader in s[0]: print s[0] print s[1]
def resi2msai(): if len(sys.argv) < 5: print 'resi2target: given a residue number output the corresponding position in target msa' print 'python utils_msa.py resi2msai PF00014_full.txt BPT1_BOVIN 5pti_pf.pdb A6' return msafile = sys.argv[2] target = sys.argv[3] pdbfile = sys.argv[4] tvar = sys.argv[5] print 'msafile: %s\ntarget header: %s\npdbfile: %s\ntarget variable: %s' % (msafile, target, pdbfile, tvar) m = msa(msafile) p = protein(pdbfile) print p.resDict[tvar] rtmap = m.getResiTargetMap(p, target) if len(rtmap) < 1: return print 'map %s: %s' % (tvar, repr(rtmap[tvar])) return (tvar, rtmap[tvar][0], rtmap[tvar][1])
def resi2target(): if len(sys.argv) < 5: print 'resi2target: given a residue number output the corresponding position in target msa' print 'example:python utils_msa.py resi2target PF07714_full.fa.r50 BTK_HUMAN 1k2p.pdb B641\n' return msafile = sys.argv[2] target = sys.argv[3] pdbfile = sys.argv[4] tvar = sys.argv[5] print 'msafile: %s\ntarget header: %s\npdbfile: %s\ntarget variable: %s' % (msafile, target, pdbfile, tvar) m = msa(msafile) p = protein(pdbfile) print p.resDict[tvar] rtmap = m.getResiTargetMap(p, target) if len(rtmap) < 1: return print 'map %s: %s' % (tvar, repr(rtmap[tvar])) return (tvar, rtmap[tvar][0], rtmap[tvar][1])
def pdistDistribution(): if len(sys.argv) < 3: print 'pdist: write pairwise sequence simiarity value in a file' print 'example: python utils_msa.py pdist PF07714_full.fa BTK_HUMAN\n' return msafile = sys.argv[2] target = sys.argv[3] outfile = '%s.pdist' % msafile print 'msa file: %s\ntarget header: %s\noutfile: %s\n' % (msafile, target, outfile) print 'loading msa ...' m = msa(msafile) m.setTarget(target) print print 'saving to [%s] ...' % outfile pdist = m.getpdist() np.savetxt(outfile, pdist, fmt='%.8', delimiter='\n') print '%d pdist saved.' % len(pdist)
def init(): if len(sys.argv) < 6: print "Usage: python mproc_coevol_sdii.py msafile weightfile cutoff target_seq msapos order" print "Example 1: python mproc_coevol_sdii.py PF07714_full.fa.r50 PF07714_full.fa.r50.weight 0.6 BTK_HUMAN 3128 3" print "Example 2: python mproc_coevol_sdii.py PF07714_full.fa.s62 NA 0.6 BTK_HUMAN all 3" return msafile = sys.argv[1] weightfile = sys.argv[2] drop_cutoff = float(sys.argv[3]) # for reduce columns targetHeader = sys.argv[4] target = sys.argv[5].lower() order = int(sys.argv[6]) print "msafile: [%s]" % msafile print "weightfile: [%s]" % weightfile print "drop_cutoff: [%f]" % drop_cutoff print "target msa header: [%s]" % targetHeader print "target var: [%s]" % target print "order: [%d]" % order outfile = "%s.%s_%d_sdii" % (msafile, target, order) print "write to [%s]" % outfile # msa init m = msa(msafile) m.setTarget(targetHeader) print "original data dimension: (%d, %d)" % (m.seqNum, m.seqlen) # weight_cutoff = 0.3 # for weighting msa sequence # taken care of in matlab score, varlist = m.msaboard(drop_cutoff) # , weight_cutoff) # return a compact score print "reduced data dimension: %s" % repr(score.shape) if (target != "all") and (int(target) not in varlist): print "The alignment for var %s is not significant. exit." % target return # sdii init sdii_core = sdii(score) print "Loading weight ..." if weightfile.upper() != "NA": pfam_weight = np.loadtxt(weightfile, delimiter=",") print "Weight vector: %s" % repr(pfam_weight.shape) print "Applying weight to sdii data ..." sdii_core.setWeight(pfam_weight) # set sequence weight else: print "setting weight: %r" % sdii_core.isWeighted print "Setting varlist to sdii ..." sdii_core.setVarlist(varlist) # set sequence weight print "Setting target variable ..." sdii_core.setTarget(target) print "Setting task order ..." sdii_core.setOrder(order) # tasklist init # calculating total tasks tasks = [] if target == "all": print "generating tasks for all ..." 
for s in set(itertools.combinations(list(range(len(varlist))), order)): tasks.append(list(s)) print "In total %d/%d for order %d." % (len(tasks), binom(len(varlist), order), order) else: print "generating tasks for variable %s" % target for s in set(itertools.combinations(list(range(len(varlist))), order - 1)): target_idx = varlist.index(int(target)) if target_idx not in s: st = list(s) st.append(target_idx) tasks.append(st) print "In total %d/%d for order %d." % (len(tasks), binom(len(varlist), order), order) sdii_core.setTotalTask(len(tasks)) # split tasks into blocks tasklist = [] n = len(tasks) / 20 + 1 for i in xrange(0, len(tasks), n): tasklist.append(tasks[i : i + n]) print "spliting tasks into %d blocks" % len(tasklist) print "init done." return (sdii_core, tasklist, outfile)
def test_triple_compact(self):
    # The common 'BC' run is compacted; the unique head/tail characters
    # end up inside nested disjuncts.
    from msa import Disjunct as d, Sub as s
    aligned = msa.msa(['ABC', 'BCD', 'BCX'])
    result = msa.compact(aligned)
    self.assertEqual(result, [d((d((s('A'),), ()),), ()), s('BC'), d((d((), (s('D'),)),), (s('X'),))])
def test_pair(self):
    # 'B' and 'C' are shared; the 'A' prefix and 'DE' suffix become disjuncts.
    from msa import Disjunct as d, Sub as s
    result = msa.msa(['ABC', 'BCDE'])
    expected = [d((s('A'),), ()), s('B'), s('C'), d((), (s('D'), s('E')))]
    self.assertEqual(result, expected)
def main(): global alphabet if len(sys.argv) < 6: print 'Usage: python proc_coevol_sdii.py msafile weightfile cutoff target_seq msapos order' print 'Example: python proc_coevol_sdii.py PF07714_full.fa.r50 PF07714_full.fa.r50.weight 0.6 BTK_HUMAN 3128 3' return msafile = sys.argv[1] weightfile = sys.argv[2] drop_cutoff = float(sys.argv[3]) # for reduce columns targetHeader = sys.argv[4] target = sys.argv[5].lower() order = int(sys.argv[6]) print 'msafile: [%s]' % msafile print 'weightfile: [%s]' % weightfile print 'drop_cutoff: [%f]' % drop_cutoff print 'target msa header: [%s]' % targetHeader print 'target var: [%s]' % target print 'order: [%d]' % order outfile = '%s.%s_%d_sdii' % (msafile, target, order) print 'write to [%s]' % outfile m = msa(msafile) m.setTarget(targetHeader) print 'original data dimension: (%d, %d)' % (m.seqNum, m.seqlen) #weight_cutoff = 0.3 # for weighting msa sequence # taken care of in matlab score, varlist = m.msaboard(drop_cutoff) #, weight_cutoff) # return a compact score print 'reduced data dimension: %s' % repr(score.shape) ''' score: A..C..D.EF index: 0123456789 # after reduction score: ACDE index: 0123 -> input in sdii calculation index: 0368 = varlist = alphabet ''' alphabet = [str(i) for i in varlist] #print alphabet #m.writeScoreboard('1k2p_PF07714_seed.score') if (target != 'all') and (int(target) not in varlist): print 'The alignment for var %s is not significant. exit.' % target return if target == 'all': pk = binom(len(varlist), order) else: pk = binom(len(varlist), order-1) - len(varlist) - 1 print 'total calculations: %d' % pk print 'Loading weight ...' pfam_weight = np.loadtxt(weightfile, delimiter=',') print 'Weight vector: %s' % repr(pfam_weight.shape) sdii_core = sdii(score) print 'Applying weight to sdii data ...' 
sdii_core.setWeight(pfam_weight) # set sequence weight fout = open(outfile, 'w') t0 = time.time() count = 0 for s in set(itertools.combinations(list(range(len(alphabet))), order)): if (target == 'all') or (alphabet.index(target) in s): count+=1 print '%d/%d: %s ' % (count, pk, '-'.join([(alphabet[i]) for i in s])) ret_sdii = sdii_core.calc_sdii(list(s)) t1 = time.time() print 'time used: %d seconds\n' % (t1-t0) fout.write('%s %.15f\n' % ('-'.join([(alphabet[i]) for i in s]), ret_sdii)) t0 = t1 fout.close()
import json
import time

import numpy as np
import matplotlib.pyplot as plt

import classic
import msa

# Load the base network data from parameter.json.
# Fixed: `json`, `msa` and `plt` were used below but never imported,
# so the script crashed with NameError before doing any work.
with open('parameter.json', 'r') as load_f:
    p = json.load(load_f)
demand = np.array(p['demand'])
capacity = np.array(p['capacity'])
free = np.array(p['free'])
n = np.array(p['number'])

# Time the MSA (method of successive averages) assignment.
start_time_msa = time.time()
x, ttime = msa.msa(demand, capacity, free, n)
end_time_msa = time.time()
time_msa = end_time_msa - start_time_msa
print(f'time_msa:{time_msa}')

# Time the classic assignment for comparison.
start_time_classic = time.time()
x1, time1 = classic.classic(demand, capacity, free, n)
end_time_classic = time.time()
time_classic = end_time_classic - start_time_classic
print(f'time_classic:{time_classic}')

# Plot the resulting flows.
width = 0.35
fig = plt.figure()
# NOTE(review): canvas.set_window_title is deprecated in matplotlib >= 3.4
# (use fig.canvas.manager.set_window_title); kept for behavior parity.
fig.canvas.set_window_title('flow')
plt.bar(np.arange(n) - width / 2, x, width=width)
# NOTE(review): this is the interior of a pipeline driver (the enclosing
# def is outside this view); info_dict is presumably a parsed run
# configuration dict -- confirm against the caller.
result_path = info_dict['result_path']
# Refuse to overwrite results from a previous run.
if os.path.exists(result_path):
    raise FileExistsError(f"Output path {result_path} already exists")
else:
    os.mkdir(result_path)
pred_oupt_path = f'{result_path}/output'
os.mkdir(pred_oupt_path)

# Extract whole docking domain sequences from genbank file
# according to provided protein id information
extract_seq.extract_seq(info_dict)

# Cluster the docking domains into 3 classes, then align the
# class 1 sequences
msa.clustering(info_dict)
msa.msa(info_dict)

# Pair the query sequences and integrate them with the interacting
# sequences and extra sequences to perform Ouroboros analysis
query_seq.prepare_query_fl(info_dict)

# Run Ouroboros analysis with user-defined parameters and find
# the result with the best LLH
Ouroboros_analysis.ouroboros_analysis(info_dict)

# Build the interaction probability matrix and plot the matrix
info_dict['output_path'] = pred_oupt_path
int_prob.prob_mtx(info_dict)

# Predict the protein order according to interaction probability,
# start/end protein, protein class
def test_pair(self):
    """Pairwise alignment keeps shared letters and branches the unique ends."""
    from msa import Disjunct as d, Sub as s
    self.assertEqual(
        msa.msa(["ABC", "BCDE"]),
        [
            d((s("A"),), ()),
            s("B"),
            s("C"),
            d((), (s("D"), s("E"))),
        ],
    )
def ncg2blossum(): if len(sys.argv) < 7: print 'ncg2blossum: construct new substitution matrix from contact group' print 'example:python utils_msa.py ncg2blossum 5pti_pf.pdb 5pti_pf.tip.ncg PF00014_full.txt.rseq PF00014_full.txt.sdii BPT1_BOVIN order' print 'output: a substitution matrix file (same format as BLOSSUM62)' return #print sys.argv[0] # utils_msa.py #print sys.argv[1] # hcg2blossum pdbfile = sys.argv[2] # pdb name ncgfile = sys.argv[3] # hcg msafile = sys.argv[4] # msa (full or reduced) sdiifile = sys.argv[5] # sdii target = sys.argv[6] # target name order = int(sys.argv[7]) outfile = msafile[0:7]+".sm" # new substitution matrix # get msa in matrix format m = msa(msafile) msaMatrix = np.array([list(s[1]) for s in m.msaArray]) # matrix format of msa #for i in xrange(0, len(seqs)): # print seqs[i] print 'msa matrix: ' + repr(msaMatrix.shape) # get resi -> msai map p = protein(pdbfile) rtmap = m.getResiTargetMap(p, target) sdiidict = loadsdii(sdiifile) # key: 39-140-210, value = 0.0788593466276019 msaGroupArray = ncg2msa(ncgfile, rtmap) # [[210, 215], [106, 211], [73, 95, 166], [109, 124, 139]] # get non overlapped column indices colset = set() for g in msaGroupArray: rg = g[0:order] # get ith order contact group rg.sort() # for generating key sdiikey = '-'.join([str(r) for r in rg]) if sdiikey not in sdiidict: #print 'ncg2sdiicol(): discard group: %s for low sdii' % sdiikey continue print (sdiikey, sdiidict[sdiikey]) for resi in rg: # for significant ncg, add corresponding MSA column index colset.add(resi) # init substitution matrix EBlist = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', 'B', 'Z', 'X', '*'] #AAlist = sorted(EBlist) #AAlist = sorted(['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', 'B', 'Z', 'X', '*']) AAlist = sorted(['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']) sm = {} for i in 
xrange(0, len(AAlist)): for j in xrange(i, len(AAlist)): key = '%s%s' % (AAlist[i], AAlist[j]) sm[key] = 0 print AAlist print 'Alphabet: %d' % len(AAlist) print 'AA: %d' % len(sm) # accumulate substitution matrix AA frequency for all the contact group columns # Sum the scores for each columns across column print '' w = 0 # count column number for col in colset: w+=1 calcColSM(sm, msaMatrix, col) ''' for mg in msaGroupArray: # form key for co-evolve value sdiikey = '-'.join([str(i) for i in mg]) if sdiikey not in sdiidict: print 'hcg2blossum():discard group: %s' % sdiikey continue sdiiweight = sdiidict[sdiikey] print (sdiikey, sdiiweight) # accumulate SM for each contact group / column group for col in mg: w +=1 calcColSM(sm, msaMatrix, col) print '' ''' #print repr(sm) #print '' n = msaMatrix.shape[0] T = w*n*(n-1)/2 # normalization term print 'w: %d' % w # number of columns (contact group) print 'n: %d' % n # number of sequence print 'T: %d' % T # convert cij to qij # Normalize the pair frequencies so they will sum to 1 for c in sm: sm[c] = 1.0*sm[c]/T #print repr(sm) #print '' # Calculate the expected probability of occurrence of the ith residue in an (i,j) pair # pi = qii + sum( qij/2 )_{i!=j} pi = {} for i in xrange(0, len(AAlist)): A = AAlist[i] sum_qij = 0 for j in xrange(i+1, len(AAlist)): # i should not = j B = AAlist[j] sum_qij += sm[A+B]/2 pi[A] = sm[A+A] + sum_qij print repr(pi) print '' # The desired denominator is the expected frequency for each pair eij = {} for i in xrange(0, len(AAlist)): A = AAlist[i] for j in xrange(i+1, len(AAlist)): B = AAlist[j] eij[A+B] = 2 * pi[A] * pi[B] eij[A+A] = pi[A] * pi[A] print len(eij) print repr(eij) print '' # Log odds ratio sij = round(2*log2(qij/eij)) sij = {} for i in xrange(0, len(AAlist)): A = AAlist[i] for j in xrange(i, len(AAlist)): B = AAlist[j] if eij[A+B] == 0.0 or sm[A+B]==0.0: sij[A+B] = 0 else: sij[A+B] = int(round(2*math.log((sm[A+B]/eij[A+B]),2))) # sij[A+B] = sm[A+B]/eij[A+B] print repr(sij) print 
len(sij) print '' saveBlosum(EBlist, sij, outfile)