Example #1
0
def getSeqbyName():
	"""Print the header and gap-free sequence of an MSA entry found by name.

	Command line (sys.argv): getseqbyname <msafile> <name>.
	The name is upper-cased and matched as a substring of each header in
	m.msaArray. After a match the full matched header becomes the search
	string for the remaining entries, and the last match is the one printed.
	With no match the upper-cased name and an empty line are printed.
	"""
	if len(sys.argv) < 4:
		print('getSeqbyName: get msa sequence without gaps by searching fasta name')
		print('example: python utils_msa.py getseqbyname PF07714_full.fa BTK_HUMAN\n')
		return

	msafile, header = sys.argv[2], sys.argv[3].upper()
	print('msa file: %s' % msafile)
	print('target entry: %s' % header)

	m = msa(msafile)
	m.setTarget(header)

	aligned = ''
	for entry in m.msaArray:
		if header in entry[0]:
			header, aligned = entry[0], entry[1]

	# drop the alignment gap symbols, keep everything else in order
	residues = [ch for ch in aligned if ch not in ['.', '-', '_']]

	print(header)
	print(''.join(residues))
Example #2
0
def msai2resi():
	"""Write the MSA-index -> PDB-residue mapping of a target sequence to a file.

	Command line (sys.argv): msai2resi <msafile> <target header> <pdbfile>.
	The map returned by getResiTargetMap (residue key -> (msa index, resn)) is
	inverted so that it is keyed by MSA index, and one line
	"<msa index> <residue key> <residue name>" per entry is written to
	'<msafile>.<pdbfile>.map' in ascending MSA-index order.
	Nothing is written when the map is empty or two residues share an MSA index.
	"""
	# sys.argv[4] (the pdb file) is read below, so five entries are required
	if len(sys.argv) < 5:
		print('msai2resi: output the mapping between msa position index and pdb residue number')
		print('example:python utils_msa.py msai2resi PF07714_full.fa BTK_HUMAN 1k2p.pdb\n')
		print('output: PF07714_full.fa.1k2p.pdb.map')
		return

	msafile = sys.argv[2]
	target = sys.argv[3]
	pdbfile = sys.argv[4]
	outfile = msafile+'.'+pdbfile+'.map'

	print('msafile: %s\ntarget header: %s\npdbfile: %s\noutput file: %s' % (msafile, target, pdbfile, outfile))
	m = msa(msafile)
	p = protein(pdbfile)
	rtmap = m.getResiTargetMap(p, target)
	if len(rtmap) < 1:
		print('error occoured in generating rtmap')
		return

	# Build the inverted map first (e.g. 3128: ('B641', 'R')) so that a
	# duplicate-index error does not leave a half-written output file behind.
	trmap = {}
	for k in rtmap:
		msai, resn = rtmap[k]
		if msai in trmap:
			print('error. duplicate key [%d] in rtmap' % msai)
			return
		trmap[msai] = (k, resn)

	with open(outfile, 'w') as fout:
		for msai in sorted(trmap):
			k, resn = trmap[msai]
			# residue key and residue name are written with %s: the examples in
			# this file show them as strings ('B641', 'R'), which %d would reject
			fout.write('%d %s %s\n' % (msai, k, resn))
Example #3
0
    def test_triple_compact(self):
        """Compare msa.compact() of the alignment of "ABC", "BCD", "BCX" with the expected structure."""
        from msa import Disjunct as d, Sub as s

        compacted = msa.compact(msa.msa(["ABC", "BCD", "BCX"]))
        expected = [
            d((d((s("A"),), ()),), ()),
            s("BC"),
            d((d((), (s("D"),)),), (s("X"),)),
        ]
        self.assertEqual(compacted, expected)
Example #4
0
def ncg2sdiicol():
	"""Write the MSA column indices of contact groups that have an sdii entry.

	Command line (sys.argv):
	ncg2sdiicol <pdbfile> <ncgfile> <msafile> <sdiifile> <target header> <orders>
	where <orders> is a comma separated list of group orders (e.g. "2" or "2,3").

	For every order i and every contact group, the first i MSA indices of the
	group are sorted and joined with '-' to form a key; when that key exists in
	the sdii dictionary the indices are added to the output column set.
	The set is written, space separated and in ascending order, to
	'<pdbfile[0:4]>_<msafile[0:7]>.sdiicol'.
	"""
	# sys.argv[7] (the order list) is read below, so eight entries are required
	if len(sys.argv) < 8:
		print('ncg2sdiicol: write selected MSA column into .sdiicol file')
		print('python utils_msa.py ncg2sdiicol 1aps_A_1_97.rpdb.tip 1aps_A_1_97.rpdb.tip.ncg PF00708_full.txt.rseq PF00708_full.txt.all_2_sdiii ACYP2_HORSE 2')
		return

	pdbfile = sys.argv[2] # pdb name
	ncgfile = sys.argv[3] # contact group file
	msafile = sys.argv[4] # msa (full or reduced)
	sdiifile = sys.argv[5] # sdii
	target = sys.argv[6] # target name
	orderlist = [int(i) for i in sys.argv[7].split(',')]
	outfile = pdbfile[0:4]+'_'+msafile[0:7]+'.sdiicol'

	print('pdbfile :%s' % pdbfile)
	print('ncgfile :%s' % ncgfile)
	print('msafile :%s' % msafile)
	print('sdiifile :%s' % sdiifile)
	print('uniprot name :%s' % target)
	print('ncg order list : [%s]' % repr(orderlist))
	print('outfile: %s' % outfile)

	# msa in matrix format; only its shape is reported here
	m = msa(msafile)
	msaMatrix = np.array([list(s[1]) for s in m.msaArray])
	print('msa matrix: ' + repr(msaMatrix.shape))

	# resi -> msai map, e.g. 'A9': (14, 'V')
	p = protein(pdbfile)
	rtmap = m.getResiTargetMap(p, target)

	sdiidict = loadsdii(sdiifile) # key: 39-140-210, value = 0.0788593466276019
	msaGroupArray = ncg2msa(ncgfile, rtmap) # unsorted [[86, 83, 198, 127, 120], [138, 76, 82, 127, 132]]

	colset = set()
	for i in orderlist:
		for g in msaGroupArray:
			rg = sorted(g[0:i]) # ith order contact group, sorted to form the key
			sdiikey = '-'.join([str(r) for r in rg])
			if sdiikey not in sdiidict:
				continue
			print(repr((sdiikey, sdiidict[sdiikey])))
			colset.update(rg)

	print('ncg2sdiicol():writing %s: %s' % (outfile, repr(colset)))
	# sorted so the file content does not depend on set iteration order
	with open(outfile, 'w') as fout:
		fout.write(' '.join([str(c) for c in sorted(colset)]))
Example #5
0
def MSAReduction():
	"""Reduce an MSA by gap and hamming cutoffs and save the results next to the input.

	Command line (sys.argv):
	msareduction <msafile> <target header> <gap cutoff> <hamming cutoff>

	Files written:
	  <msafile>.rseq   kept sequences in fasta form (rows reduced, all columns kept)
	  <msafile>.score  score board restricted to kept rows and columns ('%d', comma separated)
	  <msafile>.row    comma separated kept row indices
	  <msafile>.col    comma separated kept column indices
	"""
	# sys.argv[5] (the hamming cutoff) is read below, so six entries are required
	if len(sys.argv) < 6:
		print('msareduction: reduce columns and rows by cutoffs')
		print('example: python utils_msa.py msareduction PF07714_full.fa BTK_HUMAN 0.2 0.62\n')
		return

	msafile = sys.argv[2]
	target = sys.argv[3]
	gap_cutoff = float(sys.argv[4]) # gap cutoff
	hamming_cutoff = float(sys.argv[5]) # hamming cutoff

	print('msa file: %s' % msafile)
	print('target: %s' % target)
	print('gap cutoff: %f' % gap_cutoff)
	print('hamming cutoff: %s' % hamming_cutoff)

	print('loading msa file ...')
	m = msa(msafile)
	m.setTarget(target)

	(seqboard, scoreboard, column_index, row_index) = m.get_msaboard_RC_RR(gap_cutoff, hamming_cutoff)

	# sequences keep every column; only the rows are reduced
	seqs = np.array(seqboard)[row_index,:]

	with open(msafile+'.rseq', 'w') as fout:
		for i, row in enumerate(row_index):
			header = m.msaArray[row][0]
			fout.write('>'+header+'\n')
			fout.write(''.join(seqs[i,:])+'\n')
	print('save reduced sequences to file: [%s]' % (msafile+'.rseq'))

	# scores are reduced in both rows and columns
	score = np.array(scoreboard)[row_index,:][:,column_index]
	print('save score to file: [%s]' % (msafile+'.score'))
	np.savetxt(msafile+'.score', score, fmt='%d', delimiter=',')

	print('save reduced row indices to file: [%s]' % (msafile+'.row'))
	with open(msafile+'.row', 'w') as fout:
		fout.write(','.join([str(i) for i in row_index])+'\n')

	print('save reduced column indices to file: [%s]' % (msafile+'.col'))
	with open(msafile+'.col', 'w') as fout:
		fout.write(','.join([str(i) for i in column_index])+'\n')
Example #6
0
def reduceByWeight():
	"""Keep the highest-weighted fraction of an MSA and write it as fasta.

	Command line (sys.argv):
	reducebyweight <msafile> <weightfile> <target header> <scale>

	The weight file is loaded with np.loadtxt (comma delimited), one weight per
	sequence of the MSA. Sequences are ranked by ascending weight and the top
	<scale> fraction is written to '<msafile>.r<scale*100>'. The target
	sequence is appended when it is not among the kept sequences.
	"""
	# sys.argv[5] (the scale) is read below, so six entries are required
	if len(sys.argv) < 6:
		print('reduceByWeight: reduce a msa file by weighing and reduce scale (x%)')
		print('example: python utils_msa.py reducebyweight 1k2p_PF07714_full.fa test.weight BTK_HUMAN 0.5\n')
		return

	msafile = sys.argv[2]
	weightfile = sys.argv[3]
	target = sys.argv[4]
	scale = float(sys.argv[5])
	# round before formatting: %d truncates, and e.g. 0.57*100 is 56.99999999999999
	outfile = '%s.r%d' % (msafile, int(round(scale*100)))
	print('msa file: %s' % msafile)
	print('weight file: %s' % weightfile)
	print('target: %s' % target)
	print('reduce scale: %f' % scale)
	print('output file: %s' % outfile)

	weight = np.loadtxt(weightfile, delimiter=',')
	print('weight loaded : %s' % repr(weight.shape))

	print('loading msa file ...')
	m = msa(msafile)
	m.setTarget(target)

	# sequence indices ordered from small to large weight (stable for ties)
	ranked = sorted(range(len(weight)), key=lambda i: weight[i])

	# index of the first kept sequence in the ranking; clamped so that a scale
	# above 1 keeps everything instead of wrapping around with negative indices
	goal = max(0, int(len(weight) * (1-scale)))

	target_flag = False
	written = 0
	print('Writing output ...')
	with open(outfile, 'w') as fout:
		# save msa sequences with large weights
		for index in ranked[goal:]:
			if m.msaArray[index][0] == m.target[0]:
				target_flag = True
			fout.write('>%s\n%s\n' % (m.msaArray[index][0], m.msaArray[index][1]))
			written += 1
		if not target_flag:
			print('Inserting target sequence: %s' % m.target[0])
			fout.write('>%s\n%s\n' % (m.target[0], m.target[1]))
			written += 1
	# report the number of sequences actually written, not the number dropped
	print('reduced msa: [%s]\nlen: %d' % (outfile, written))
Example #7
0
def sdii2resi():
	"""Rewrite an sdii result file with PDB residues in place of MSA indices.

	Command line (sys.argv):
	sdii2resi <msafile> <target header> <pdbfile> <sdiifile>

	Each non-blank input line is split on single spaces; field 2 is a
	'-' separated list of MSA indices and field 3 the sdii value, e.g.
	"52 [pid:20029] 926-3089-3128 0.001106226720675". The output line is the
	'-' joined repr of the (residue key, residue name) tuple of every index,
	followed by the sdii value. Output goes to '<sdiifile>_resi'.
	An MSA index missing from the residue map raises KeyError.
	"""
	# sys.argv[5] (the sdii file) is read below, so six entries are required
	if len(sys.argv) < 6:
		print('sdii2resi: translate the msa position indices of a sdii file into pdb residues')
		print('example:python utils_msa.py sdii2resi PF07714_full.fa.r50 BTK_HUMAN 1k2p.pdb PF07714_full.fa.r50.3128_3_sdii\n')
		print('output: PF07714_full.fa.r50.3128_3_sdii_resi')
		return

	msafile = sys.argv[2]
	target = sys.argv[3]
	pdbfile = sys.argv[4]
	sdiifile = sys.argv[5]

	print('msafile: %s\ntarget header: %s\npdbfile: %s\nsdii file: %s' % (msafile, target, pdbfile, sdiifile))
	m = msa(msafile)
	p = protein(pdbfile)
	rtmap = m.getResiTargetMap(p, target)
	if len(rtmap) < 1:
		print('error occoured in generating rtmap')
		return

	# invert rtmap into msa index -> (residue key, resn), e.g. 3128: ('B641', 'R')
	trmap = {}
	for k in rtmap:
		msai, resn = rtmap[k]
		if msai in trmap:
			print('error. duplicate key [%d] in rtmap' % msai)
			return
		trmap[msai] = (k, resn)

	# read everything up front so the progress message can show the total
	with open(sdiifile) as f:
		sdiilines = f.readlines()

	outfile = sdiifile + '_resi'
	total = len(sdiilines)
	with open(outfile, 'w') as fout:
		for count, line in enumerate(sdiilines, 1):
			print('%d/%d processed ...' % (count, total))
			record = line.strip()
			if not record:
				# tolerate blank lines (e.g. a trailing newline) instead of failing on the split
				continue
			strArr = record.split(' ')
			msailist = strArr[2].split('-')
			sdiivalue = strArr[3]
			fout.write('%s %s\n' % ('-'.join([repr(trmap[int(i)]) for i in msailist]), sdiivalue))
	print('done.\noutput file: [%s]' % outfile)
Example #8
0
def main():
	"""Ad-hoc check of sequence weighting using the 'test_msa.txt' fixture.

	Prints the score board, then the w_entropy of columns 0 and 1 of the sdii
	data before and after applying the weights loaded from 'test_msa.weight',
	and finally the weights and their sum.
	"""
	# test msa weight
	alignment = msa('test_msa.txt', '1k2p')
	score, _varlist = alignment.msaboard(0.0, 0.5)
	print(score)

	core = sdii(score)
	print(core.w_entropy(core.data[:,[0,1]].T))

	loaded = np.loadtxt('test_msa.weight', delimiter=',')
	core.setWeight(loaded)
	print(core.w_entropy(core.data[:,[0,1]].T))
	print(core.weight)
	print('sum(weight): %f' % sum(core.weight))
Example #9
0
def searchpdbseq():
	"""Search an MSA for the sequence of a PDB structure.

	Command line (sys.argv): searchpdbseq <msafile> <pdbfile>.
	Prints a message when m.searchTargetPDB returns 0.
	"""
	# sys.argv[3] (the pdb file) is read below, so four entries are required
	if len(sys.argv) < 4:
		print('searchpdbseq: locate pdb sequence in MSA')
		print('example: python utils_msa.py searchpdbseq PF07714_full.fa 1T49_A.pdb\n')
		return

	msafile = sys.argv[2]
	target = sys.argv[3]

	print('msa file: %s' % msafile)
	print('pdb target: %s' % target)

	m = msa(msafile)
	p = protein(target)

	if m.searchTargetPDB(p)==0:
		print('cannot locate pdb sequence in MSA')
Example #10
0
def reduceByHamming():
	"""Reduce an MSA with msa.hammingReduction at a fixed 0.62 cutoff.

	Command line (sys.argv): reducebyhamming <msafile> <target header>.
	The output file '<msafile>.s62' is produced by m.hammingReduction.
	"""
	# sys.argv[3] (the target header) is read below, so four entries are required
	if len(sys.argv) < 4:
		print('reduceByHamming: reduce a msa file by selecting sequences that have sequential similarity < 62% (hamming dist > 0.38)')
		print('example: python utils_msa.py reducebyhamming PF07714_full.fa BTK_HUMAN\n')
		return

	msafile = sys.argv[2]
	target = sys.argv[3]
	outfile = '%s.s62' % msafile

	print('msa file: %s\ntarget header: %s\noutfile: %s\n' % (msafile, target, outfile))

	print('loading msa ...')
	m = msa(msafile)
	m.setTarget(target)
	print('')

	m.hammingReduction(outfile, 0.62)
Example #11
0
def getMsabyName():
	"""Print header and aligned (gapped) sequence of every MSA entry matching a name.

	Command line (sys.argv): getmsabyname <msafile> <name>.
	The name is upper-cased and matched as a substring of each header in
	m.msaArray; every matching entry is printed as two lines.
	"""
	if len(sys.argv) < 4:
		print('getMsabyName: get msa sequence with gaps by searching fasta name')
		print('example: python utils_msa.py getmsabyname PF07714_full.fa BTK_HUMAN\n')
		return

	msafile, query = sys.argv[2], sys.argv[3].upper()
	print('msa file: %s' % msafile)
	print('target entry: %s' % query)

	m = msa(msafile)
	m.setTarget(query)

	for entry in m.msaArray:
		if query not in entry[0]:
			continue
		print(entry[0])
		print(entry[1])
Example #12
0
def resi2target():
	"""Look up the MSA position of one PDB residue of the target sequence.

	Command line (sys.argv):
	resi2target <msafile> <target header> <pdbfile> <residue key, e.g. B641>

	Returns (residue key, rtmap[key][0], rtmap[key][1]) taken from the map
	returned by getResiTargetMap, or None when usage is printed or the map is
	empty. A residue key missing from p.resDict or the map raises KeyError.
	"""
	# sys.argv[5] (the residue key) is read below, so six entries are required
	if len(sys.argv) < 6:
		print('resi2target: given a residue number output the corresponding position in target msa')
		print('example:python utils_msa.py resi2target PF07714_full.fa.r50 BTK_HUMAN 1k2p.pdb B641\n')
		return

	msafile = sys.argv[2]
	target = sys.argv[3]
	pdbfile = sys.argv[4]
	tvar = sys.argv[5]

	print('msafile: %s\ntarget header: %s\npdbfile: %s\ntarget variable: %s' % (msafile, target, pdbfile, tvar))
	m = msa(msafile)
	p = protein(pdbfile)
	print(p.resDict[tvar])
	rtmap = m.getResiTargetMap(p, target)
	if len(rtmap) < 1:
		return
	print('map %s: %s' % (tvar, repr(rtmap[tvar])))
	return (tvar, rtmap[tvar][0], rtmap[tvar][1])
Example #13
0
def resi2msai():
	"""Look up the MSA position of one PDB residue of the target sequence.

	Command line (sys.argv):
	resi2msai <msafile> <target header> <pdbfile> <residue key, e.g. A6>

	Returns (residue key, rtmap[key][0], rtmap[key][1]) taken from the map
	returned by getResiTargetMap, or None when usage is printed or the map is
	empty. A residue key missing from p.resDict or the map raises KeyError.
	"""
	# sys.argv[5] (the residue key) is read below, so six entries are required
	if len(sys.argv) < 6:
		# the usage label names this command (it previously said resi2target)
		print('resi2msai: given a residue number output the corresponding position in target msa')
		print('python utils_msa.py resi2msai PF00014_full.txt BPT1_BOVIN 5pti_pf.pdb A6')
		return

	msafile = sys.argv[2]
	target = sys.argv[3]
	pdbfile = sys.argv[4]
	tvar = sys.argv[5]

	print('msafile: %s\ntarget header: %s\npdbfile: %s\ntarget variable: %s' % (msafile, target, pdbfile, tvar))
	m = msa(msafile)
	p = protein(pdbfile)
	print(p.resDict[tvar])
	rtmap = m.getResiTargetMap(p, target)
	if len(rtmap) < 1:
		return
	print('map %s: %s' % (tvar, repr(rtmap[tvar])))
	return (tvar, rtmap[tvar][0], rtmap[tvar][1])
Example #14
0
def pdistDistribution():
	"""Save the values returned by msa.getpdist() to '<msafile>.pdist'.

	Command line (sys.argv): pdist <msafile> <target header>.
	Values are written with np.savetxt using 8 decimal places.
	"""
	# sys.argv[3] (the target header) is read below, so four entries are required
	if len(sys.argv) < 4:
		print('pdist: write pairwise sequence simiarity value in a file')
		print('example: python utils_msa.py pdist PF07714_full.fa BTK_HUMAN\n')
		return

	msafile = sys.argv[2]
	target = sys.argv[3]
	outfile = '%s.pdist' % msafile

	print('msa file: %s\ntarget header: %s\noutfile: %s\n' % (msafile, target, outfile))

	print('loading msa ...')
	m = msa(msafile)
	m.setTarget(target)
	print('')

	print('saving to [%s] ...' % outfile)
	pdist = m.getpdist()
	# '%.8' lacks a conversion character and is rejected when formatting; '%.8f' is a complete specifier
	np.savetxt(outfile, pdist, fmt='%.8f', delimiter='\n')
	print('%d pdist saved.' % len(pdist))
Example #15
0
def init():
    """Prepare the sdii calculator and the task blocks for a multi-process run.

    Command line (sys.argv):
    mproc_coevol_sdii.py <msafile> <weightfile|NA> <cutoff> <target header> <msapos|all> <order>

    Returns:
        (sdii_core, tasklist, outfile) where tasklist is the list of index
        combinations split into blocks, or None when usage is printed or the
        requested position is not in the reduced variable list.
    """
    # sys.argv[6] (the order) is read below, so seven entries are required
    if len(sys.argv) < 7:
        print("Usage: python mproc_coevol_sdii.py msafile weightfile cutoff target_seq msapos order")
        print("Example 1: python mproc_coevol_sdii.py PF07714_full.fa.r50 PF07714_full.fa.r50.weight 0.6 BTK_HUMAN 3128 3")
        print("Example 2: python mproc_coevol_sdii.py PF07714_full.fa.s62 NA 0.6 BTK_HUMAN all 3")
        return

    msafile = sys.argv[1]
    weightfile = sys.argv[2]
    drop_cutoff = float(sys.argv[3])  # for reduce columns
    targetHeader = sys.argv[4]
    target = sys.argv[5].lower()
    order = int(sys.argv[6])

    print("msafile: [%s]" % msafile)
    print("weightfile: [%s]" % weightfile)
    print("drop_cutoff: [%f]" % drop_cutoff)
    print("target msa header: [%s]" % targetHeader)
    print("target var: [%s]" % target)
    print("order: [%d]" % order)

    outfile = "%s.%s_%d_sdii" % (msafile, target, order)
    print("write to [%s]" % outfile)

    # msa init
    m = msa(msafile)
    m.setTarget(targetHeader)
    print("original data dimension: (%d, %d)" % (m.seqNum, m.seqlen))

    score, varlist = m.msaboard(drop_cutoff)  # return a compact score
    print("reduced data dimension: %s" % repr(score.shape))

    if (target != "all") and (int(target) not in varlist):
        print("The alignment for var %s is not significant. exit." % target)
        return

    # sdii init
    sdii_core = sdii(score)

    print("Loading weight ...")
    if weightfile.upper() != "NA":
        pfam_weight = np.loadtxt(weightfile, delimiter=",")
        print("Weight vector: %s" % repr(pfam_weight.shape))
        print("Applying weight to sdii data ...")
        sdii_core.setWeight(pfam_weight)  # set sequence weight
    else:
        print("setting weight: %r" % sdii_core.isWeighted)

    print("Setting varlist to sdii ...")
    sdii_core.setVarlist(varlist)
    print("Setting target variable ...")
    sdii_core.setTarget(target)
    print("Setting task order ...")
    sdii_core.setOrder(order)

    # tasklist init: every task is a list of indices into varlist
    tasks = []
    if target == "all":
        print("generating tasks for all ...")
        for s in set(itertools.combinations(list(range(len(varlist))), order)):
            tasks.append(list(s))
        print("In total %d/%d for order %d." % (len(tasks), binom(len(varlist), order), order))
    else:
        print("generating tasks for variable %s" % target)
        # the position of the target does not change, so look it up once
        target_idx = varlist.index(int(target))
        for s in set(itertools.combinations(list(range(len(varlist))), order - 1)):
            if target_idx not in s:
                st = list(s)
                st.append(target_idx)
                tasks.append(st)
        print("In total %d/%d for order %d." % (len(tasks), binom(len(varlist), order), order))

    sdii_core.setTotalTask(len(tasks))
    # split tasks into blocks; floor division keeps the step an integer
    tasklist = []
    n = len(tasks) // 20 + 1
    for i in xrange(0, len(tasks), n):
        tasklist.append(tasks[i : i + n])
    print("spliting tasks into %d blocks" % len(tasklist))

    print("init done.")
    return (sdii_core, tasklist, outfile)
Example #16
0
 def test_triple_compact(self):
     """Compare msa.compact() of the alignment of 'ABC', 'BCD', 'BCX' with the expected structure."""
     from msa import Disjunct as d, Sub as s
     compacted = msa.compact(msa.msa(['ABC', 'BCD', 'BCX']))
     expected = [
         d((d((s('A'),), ()),), ()),
         s('BC'),
         d((d((), (s('D'),)),), (s('X'),)),
     ]
     self.assertEqual(compacted, expected)
Example #17
0
 def test_pair(self):
     """Compare msa.msa() of 'ABC' and 'BCDE' with the expected element list."""
     from msa import Disjunct as d, Sub as s
     aligned = msa.msa(['ABC', 'BCDE'])
     expected = [
         d((s('A'),), ()),
         s('B'),
         s('C'),
         d((), (s('D'), s('E'))),
     ]
     self.assertEqual(aligned, expected)
Example #18
0
def main():
	"""Compute sdii values for column combinations of an MSA and write them to a file.

	Command line (sys.argv):
	proc_coevol_sdii.py <msafile> <weightfile> <cutoff> <target header> <msapos|all> <order>

	Every combination of <order> reduced columns (all of them for 'all',
	otherwise only those containing <msapos>) is passed to sdii.calc_sdii and
	written as "<col>-<col>-... <value>" to '<msafile>.<msapos>_<order>_sdii'.
	Sets the module-level name 'alphabet' to the string form of the kept
	column indices.
	"""
	global alphabet

	# sys.argv[6] (the order) is read below, so seven entries are required
	if len(sys.argv) < 7:
		print('Usage: python proc_coevol_sdii.py msafile weightfile cutoff target_seq msapos order')
		print('Example: python proc_coevol_sdii.py PF07714_full.fa.r50 PF07714_full.fa.r50.weight 0.6 BTK_HUMAN 3128 3')
		return

	msafile = sys.argv[1]
	weightfile = sys.argv[2]
	drop_cutoff = float(sys.argv[3]) # for reduce columns
	targetHeader = sys.argv[4]
	target = sys.argv[5].lower()
	order = int(sys.argv[6])

	print('msafile: [%s]' % msafile)
	print('weightfile: [%s]' % weightfile)
	print('drop_cutoff: [%f]' % drop_cutoff)
	print('target msa header: [%s]' % targetHeader)
	print('target var: [%s]' % target)
	print('order: [%d]' % order)

	outfile = '%s.%s_%d_sdii' % (msafile, target, order)
	print('write to [%s]' % outfile)

	m = msa(msafile)
	m.setTarget(targetHeader)
	print('original data dimension: (%d, %d)' % (m.seqNum, m.seqlen))

	score, varlist = m.msaboard(drop_cutoff) # return a compact score
	print('reduced data dimension: %s' % repr(score.shape))

	# score: A..C..D.EF
	# index: 0123456789
	# after reduction
	# score: ACDE
	# index: 0123 -> input in sdii calculation
	# index: 0368 = varlist = alphabet
	alphabet = [str(i) for i in varlist]

	if (target != 'all') and (int(target) not in varlist):
		print('The alignment for var %s is not significant. exit.' % target)
		return

	if target == 'all':
		target_idx = None
		pk = binom(len(varlist), order)
	else:
		# normalise through int() so that e.g. '03128' still finds '3128'
		target_idx = alphabet.index(str(int(target)))
		# combinations containing one fixed column: choose the other order-1 from the remaining n-1
		pk = binom(len(varlist)-1, order-1)

	print('total calculations: %d' % pk)

	print('Loading weight ...')
	pfam_weight = np.loadtxt(weightfile, delimiter=',')
	print('Weight vector: %s' % repr(pfam_weight.shape))

	sdii_core = sdii(score)
	print('Applying weight to sdii data ...')
	sdii_core.setWeight(pfam_weight) # set sequence weight

	with open(outfile, 'w') as fout:
		t0 = time.time()
		count = 0
		for s in set(itertools.combinations(list(range(len(alphabet))), order)):
			if (target_idx is not None) and (target_idx not in s):
				continue
			count += 1
			label = '-'.join([alphabet[i] for i in s])
			print('%d/%d: %s          ' % (count, pk, label))
			ret_sdii = sdii_core.calc_sdii(list(s))
			t1 = time.time()
			print('time used: %d seconds\n' % (t1-t0))
			fout.write('%s %.15f\n' % (label, ret_sdii))
			t0 = t1
Example #19
0
def ncg2blossum():
	"""Build a BLOSUM-style substitution matrix from the MSA columns of contact groups.

	Command line (sys.argv):
	ncg2blossum <pdbfile> <ncgfile> <msafile> <sdiifile> <target header> <order>

	Steps:
	  1. collect the MSA columns of every contact group whose sorted first
	     <order> indices form a key present in the sdii dictionary;
	  2. let calcColSM accumulate amino-acid pair counts for those columns;
	  3. normalise the counts to pair frequencies qij, derive the marginal
	     pi = qii + sum(qij/2) over all j != i, the expected eij, and the
	     score sij = round(2*log2(qij/eij)) (0 when qij or eij is 0);
	  4. save the scores with saveBlosum to '<msafile[0:7]>.sm'.
	Nothing is saved when there are no pairs to count.
	"""
	# sys.argv[7] (the order) is read below, so eight entries are required
	if len(sys.argv) < 8:
		print('ncg2blossum: construct new substitution matrix from contact group')
		print('example:python utils_msa.py ncg2blossum 5pti_pf.pdb 5pti_pf.tip.ncg PF00014_full.txt.rseq PF00014_full.txt.sdii BPT1_BOVIN order')
		print('output: a substitution matrix file (same format as BLOSSUM62)')
		return

	pdbfile = sys.argv[2] # pdb name
	ncgfile = sys.argv[3] # contact group file
	msafile = sys.argv[4] # msa (full or reduced)
	sdiifile = sys.argv[5] # sdii
	target = sys.argv[6] # target name
	order = int(sys.argv[7])
	outfile = msafile[0:7]+".sm" # new substitution matrix

	# get msa in matrix format
	m = msa(msafile)
	msaMatrix = np.array([list(s[1]) for s in m.msaArray])
	print('msa matrix: ' + repr(msaMatrix.shape))

	# get resi -> msai map
	p = protein(pdbfile)
	rtmap = m.getResiTargetMap(p, target)

	sdiidict = loadsdii(sdiifile) # key: 39-140-210, value = 0.0788593466276019
	msaGroupArray = ncg2msa(ncgfile, rtmap) # [[210, 215], [106, 211], [73, 95, 166], [109, 124, 139]]

	# get non overlapped column indices
	colset = set()
	for g in msaGroupArray:
		rg = sorted(g[0:order]) # first <order> members, sorted to form the key
		sdiikey = '-'.join([str(r) for r in rg])
		if sdiikey not in sdiidict:
			continue
		print(repr((sdiikey, sdiidict[sdiikey])))
		colset.update(rg)

	# init substitution matrix; keys are 'XY' with X <= Y in sorted alphabet order
	EBlist = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', 'B', 'Z', 'X', '*']
	AAlist = sorted(['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'])
	sm = {}
	for i in xrange(0, len(AAlist)):
		for j in xrange(i, len(AAlist)):
			sm['%s%s' % (AAlist[i], AAlist[j])] = 0
	print(AAlist)
	print('Alphabet: %d' % len(AAlist))
	print('AA: %d' % len(sm))

	# accumulate pair counts over all selected columns
	print('')
	for col in colset:
		calcColSM(sm, msaMatrix, col)
	w = len(colset) # number of columns

	n = msaMatrix.shape[0]
	T = w*n*(n-1)//2 # normalization term: sequence pairs per column times columns
	print('w: %d' % w) # number of columns (contact group)
	print('n: %d' % n) # number of sequence
	print('T: %d' % T)

	if T == 0:
		# no column selected or fewer than two sequences: the division below would fail
		print('ncg2blossum(): no residue pairs to count, nothing saved')
		return

	# convert cij to qij: normalize the pair frequencies
	for c in sm:
		sm[c] = 1.0*sm[c]/T

	# Expected probability of occurrence of the ith residue in an (i,j) pair:
	# pi = qii + sum( qij/2 )_{i!=j}. Pairs are stored once under the sorted
	# key, so partners on both sides of i in the alphabet must be visited.
	pi = {}
	for A in AAlist:
		sum_qij = 0
		for B in AAlist:
			if B == A:
				continue
			key = A+B if A < B else B+A
			sum_qij += sm[key]/2
		pi[A] = sm[A+A] + sum_qij

	print(repr(pi))
	print('')

	# The desired denominator is the expected frequency for each pair
	eij = {}
	for i in xrange(0, len(AAlist)):
		A = AAlist[i]
		for j in xrange(i+1, len(AAlist)):
			B = AAlist[j]
			eij[A+B] = 2 * pi[A] * pi[B]
		eij[A+A] = pi[A] * pi[A]

	print(len(eij))
	print(repr(eij))
	print('')

	# Log odds ratio sij = round(2*log2(qij/eij))
	sij = {}
	for i in xrange(0, len(AAlist)):
		A = AAlist[i]
		for j in xrange(i, len(AAlist)):
			B = AAlist[j]
			if eij[A+B] == 0.0 or sm[A+B]==0.0:
				sij[A+B] = 0
			else:
				sij[A+B] = int(round(2*math.log((sm[A+B]/eij[A+B]),2)))

	print(repr(sij))
	print(len(sij))
	print('')

	saveBlosum(EBlist, sij, outfile)
Example #20
0
    def test_pair(self):
        """Compare msa.msa() of "ABC" and "BCDE" with the expected element list."""
        from msa import Disjunct as d, Sub as s

        aligned = msa.msa(["ABC", "BCDE"])
        expected = [
            d((s("A"),), ()),
            s("B"),
            s("C"),
            d((), (s("D"), s("E"))),
        ]
        self.assertEqual(aligned, expected)