Example #1
0
def G_Bn(sdii_obj, bootstrap_indexSet, t, varset, order):
    """Average, per bootstrap sample, the number of subsets with T_l >= t.

    For every entry of ``bootstrap_indexSet`` a resampled data set is built
    with ``bootstrap_data`` and wrapped in a fresh ``sdii`` object. ``T_l``
    is then evaluated on every ``order``-sized subset of ``varset`` and the
    subsets whose value reaches ``t`` are counted.

    Args:
        sdii_obj: object whose ``data`` attribute is resampled.
        bootstrap_indexSet: indexable collection holding one index set per
            bootstrap sample.
        t: threshold compared against ``T_l``.
        varset: collection of variable indices to combine.
        order: size of each variable subset.

    Returns:
        ``count * (1.0 / B)`` where ``count`` is the total number of hits
        over all samples and ``B`` is ``len(bootstrap_indexSet)``.

    Raises:
        ZeroDivisionError: if ``bootstrap_indexSet`` is empty.
    """
    B = len(bootstrap_indexSet)  # number of bootstrap samples

    # The subsets depend only on varset and order, so enumerate them once
    # instead of once per bootstrap sample.
    # e.g. varset = Set([2, 4, 6]), order = 2
    #      -> set([(2, 6), (2, 4), (4, 6)])
    subsets = set(itertools.combinations(varset, order))
    n_vars = len(varset)

    count = 0
    for i in range(B):
        resampled = bootstrap_data(sdii_obj.data, bootstrap_indexSet[i], n_vars)
        sdii_bootstrap = sdii(resampled)  # new sdii object for the new data
        count += sum(1 for s in subsets if sdii_bootstrap.T_l(list(s)) >= t)

    ratio = (1.0 / B) * count
    print('G_Bn():: # of T >= t : %d, t: %f, count*(1/B): %f' % (count, t, ratio))
    return ratio
Example #2
0
def forward_selection(data, alpha, varset, order, B):
    """Compute and report the threshold for ``order``-sized variable subsets.

    An ``sdii`` object is built from ``data`` and handed to
    ``threshold_t_B`` together with ``alpha``, ``varset``, ``order`` and
    ``B``. Despite the name, no variable selection is performed here: only
    the threshold is computed, printed and returned.

    Args:
        data: input passed to the ``sdii`` constructor.
        alpha: forwarded unchanged to ``threshold_t_B``.
        varset: collection of variable indices under consideration.
        order: subset size the threshold is computed for.
        B: forwarded unchanged to ``threshold_t_B``.

    Returns:
        The value returned by ``threshold_t_B``.
    """
    print('forward_selection()::varset: %s, order: %d' % (repr(varset), order))

    sdii_core = sdii(data)
    th = threshold_t_B(sdii_core, alpha, varset, order, B)
    print('forward_selection()::threshold of order [%d]: %f' % (order, th))

    return th
Example #3
0
def main():
    """Write the order-2 and order-3 sdii values of a score file.

    Command line: ``python proc_sdii.py var_type score_file``.
    ``var_type`` of ``AA`` or ``NA`` selects the alphabet used to label the
    variables. The score file is loaded as comma-separated values and the
    results are written to ``<score_file>.sdii``, one ``label value`` line
    per variable subset.

    Returns None in every case; with fewer than two arguments only the
    usage line is printed.
    """
    global alphabet

    aa_alphabet = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
                   'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T']
    na_alphabet = [
        'AB', 'AE', 'CB', 'CE', 'DB', 'DE', 'EB', 'EE', 'FB', 'FE', 'GB', 'GE', 'HB', 'HE', 'IB', 'IE',
        'KB', 'KE', 'LB', 'LE', 'MB', 'ME', 'NB', 'NE', 'PB', 'PE', 'QB', 'QE', 'RB', 'RE', 'SB', 'SE',
        'TB', 'TE', 'VB', 'VE', 'WB', 'WE', 'YB', 'YE',
    ]

    if len(sys.argv) < 3:
        print('Usage: python proc_sdii.py var_type score_file')
        return

    vartype = sys.argv[1]
    if vartype == 'AA':
        alphabet = aa_alphabet
        print('use AA varset : %s' % repr(alphabet))
    elif vartype == 'NA':
        alphabet = na_alphabet
        print('use NA varset : %s' % repr(alphabet))
    # NOTE(review): any other var_type leaves the module-level ``alphabet``
    # as it was -- confirm whether that fallback is intended.

    scorefile = sys.argv[2]
    print('score file: %s' % scorefile)
    outfile = '%s.sdii' % scorefile
    print('write to %s' % outfile)

    score = np.loadtxt(scorefile, delimiter=',')
    sdii_core = sdii(score)

    def write_subsets(fout, k):
        # One "label value" line for every k-sized subset of variable indices.
        for s in set(itertools.combinations(range(len(alphabet)), k)):
            label = '-'.join(alphabet[i] for i in s)
            fout.write('%s %.15f\n' % (label, sdii_core.calc_sdii(list(s))))

    # The context manager closes the output file even if a calculation
    # raises; the file was previously never closed.
    with open(outfile, 'w') as fout:
        print('calculating mutual information ...')
        t0 = time.time()
        write_subsets(fout, 2)
        t1 = time.time()
        print('MI time: %d seconds' % (t1 - t0))

        print('calculating DeltaK(3) ...')
        write_subsets(fout, 3)
        t2 = time.time()
        print('DeltaK(3) time: %d seconds' % (t2 - t1))
Example #4
0
def forward_selection(data, alpha, varset, order, B):
    """Return the threshold that ``threshold_t_B`` yields for ``order``.

    Wraps ``data`` in an ``sdii`` object, asks ``threshold_t_B`` for the
    threshold using ``alpha``, ``varset``, ``order`` and ``B``, prints it
    and returns it. No variable selection happens in this function.

    Args:
        data: input passed to the ``sdii`` constructor.
        alpha: forwarded unchanged to ``threshold_t_B``.
        varset: collection of variable indices under consideration.
        order: subset size the threshold is computed for.
        B: forwarded unchanged to ``threshold_t_B``.

    Returns:
        The value returned by ``threshold_t_B``.
    """
    print('forward_selection()::varset: %s, order: %d' % (repr(varset), order))

    threshold = threshold_t_B(sdii(data), alpha, varset, order, B)
    print('forward_selection()::threshold of order [%d]: %f' % (order, threshold))

    return threshold
Example #5
0
def main():
    """Print entropy figures for the test alignment, before and after weighting.

    Loads ``test_msa.txt`` with target ``1k2p``, builds the score board and
    an ``sdii`` object from it, prints ``w_entropy`` of the first two data
    columns, applies the weights from ``test_msa.weight`` and prints the
    entropy again followed by the weight vector and its sum.
    """
    # test msa weight
    alignment = msa('test_msa.txt', '1k2p')
    score, varlist = alignment.msaboard(0.0, 0.5)
    print(score)

    sdii_core = sdii(score)

    def first_pair_entropy():
        # Entropy over data columns 0 and 1, transposed as w_entropy is called.
        return sdii_core.w_entropy(sdii_core.data[:, [0, 1]].T)

    print(first_pair_entropy())

    weight = np.loadtxt('test_msa.weight', delimiter=',')
    sdii_core.setWeight(weight)

    print(first_pair_entropy())
    print(sdii_core.weight)
    print('sum(weight): %f' % sum(sdii_core.weight))
def init():
    """Parse the command line and prepare the sdii object and task blocks.

    Command line:
    ``python mproc_coevol_sdii.py msafile weightfile cutoff target_seq msapos order``.
    ``weightfile`` may be ``NA`` to skip loading weights and ``msapos`` may
    be ``all`` to generate every ``order``-sized combination of variables.

    Returns:
        ``(sdii_core, tasklist, outfile)`` where ``tasklist`` is the list of
        task blocks (each a list of index lists) and ``outfile`` the output
        file name. Returns None after printing a message when too few
        arguments are given or when the target position is not in the
        reduced variable list.
    """
    # Six arguments are read below (sys.argv[1] .. sys.argv[6]), so the
    # script name plus six arguments are required.
    if len(sys.argv) < 7:
        print("Usage: python mproc_coevol_sdii.py msafile weightfile cutoff target_seq msapos order")
        print("Example 1: python mproc_coevol_sdii.py PF07714_full.fa.r50 PF07714_full.fa.r50.weight 0.6 BTK_HUMAN 3128 3")
        print("Example 2: python mproc_coevol_sdii.py PF07714_full.fa.s62 NA 0.6 BTK_HUMAN all 3")
        return

    msafile = sys.argv[1]
    weightfile = sys.argv[2]
    drop_cutoff = float(sys.argv[3])  # for reduce columns
    targetHeader = sys.argv[4]
    target = sys.argv[5].lower()
    order = int(sys.argv[6])

    print("msafile: [%s]" % msafile)
    print("weightfile: [%s]" % weightfile)
    print("drop_cutoff: [%f]" % drop_cutoff)
    print("target msa header: [%s]" % targetHeader)
    print("target var: [%s]" % target)
    print("order: [%d]" % order)

    outfile = "%s.%s_%d_sdii" % (msafile, target, order)
    print("write to [%s]" % outfile)

    # msa init
    m = msa(msafile)
    m.setTarget(targetHeader)
    print("original data dimension: (%d, %d)" % (m.seqNum, m.seqlen))

    score, varlist = m.msaboard(drop_cutoff)  # return a compact score
    print("reduced data dimension: %s" % repr(score.shape))

    if (target != "all") and (int(target) not in varlist):
        print("The alignment for var %s is not significant. exit." % target)
        return

    # sdii init
    sdii_core = sdii(score)

    print("Loading weight ...")
    if weightfile.upper() != "NA":
        pfam_weight = np.loadtxt(weightfile, delimiter=",")
        print("Weight vector: %s" % repr(pfam_weight.shape))
        print("Applying weight to sdii data ...")
        sdii_core.setWeight(pfam_weight)  # set sequence weight
    else:
        print("setting weight: %r" % sdii_core.isWeighted)

    print("Setting varlist to sdii ...")
    sdii_core.setVarlist(varlist)
    print("Setting target variable ...")
    sdii_core.setTarget(target)
    print("Setting task order ...")
    sdii_core.setOrder(order)

    # tasklist init: every task is a list of indices into varlist
    indices = list(range(len(varlist)))
    if target == "all":
        print("generating tasks for all ...")
        tasks = [list(s) for s in set(itertools.combinations(indices, order))]
    else:
        print("generating tasks for variable %s" % target)
        # The target index is loop-invariant, so look it up once.
        target_idx = varlist.index(int(target))
        # Extend each (order - 1)-subset that lacks the target with it.
        tasks = [
            list(s) + [target_idx]
            for s in set(itertools.combinations(indices, order - 1))
            if target_idx not in s
        ]
    print("In total %d/%d for order %d." % (len(tasks), binom(len(varlist), order), order))

    sdii_core.setTotalTask(len(tasks))

    # split tasks into at most 20 blocks; floor division keeps the block
    # size an integer regardless of the division semantics in effect
    n = len(tasks) // 20 + 1
    tasklist = [tasks[i : i + n] for i in range(0, len(tasks), n)]
    print("spliting tasks into %d blocks" % len(tasklist))

    print("init done.")
    return (sdii_core, tasklist, outfile)
Example #7
0
def main():
    """Compute sdii values for variable combinations of an alignment.

    Command line:
    ``python proc_coevol_sdii.py msafile weightfile cutoff target_seq msapos order``.
    ``msapos`` is either an alignment position or ``all``. For every
    ``order``-sized combination of the reduced variables (restricted to
    those containing the target position unless ``all`` is given) one
    ``label value`` line is written to ``<msafile>.<msapos>_<order>_sdii``.

    Sets the module-level ``alphabet`` to the string form of the reduced
    variable list. Returns None in every case.
    """
    global alphabet

    # Six arguments are read below (sys.argv[1] .. sys.argv[6]), so the
    # script name plus six arguments are required.
    if len(sys.argv) < 7:
        print('Usage: python proc_coevol_sdii.py msafile weightfile cutoff target_seq msapos order')
        print('Example: python proc_coevol_sdii.py PF07714_full.fa.r50 PF07714_full.fa.r50.weight 0.6 BTK_HUMAN 3128 3')
        return

    msafile = sys.argv[1]
    weightfile = sys.argv[2]
    drop_cutoff = float(sys.argv[3])  # for reduce columns
    targetHeader = sys.argv[4]
    target = sys.argv[5].lower()
    order = int(sys.argv[6])

    print('msafile: [%s]' % msafile)
    print('weightfile: [%s]' % weightfile)
    print('drop_cutoff: [%f]' % drop_cutoff)
    print('target msa header: [%s]' % targetHeader)
    print('target var: [%s]' % target)
    print('order: [%d]' % order)

    outfile = '%s.%s_%d_sdii' % (msafile, target, order)
    print('write to [%s]' % outfile)

    m = msa(msafile)
    m.setTarget(targetHeader)
    print('original data dimension: (%d, %d)' % (m.seqNum, m.seqlen))

    score, varlist = m.msaboard(drop_cutoff)  # return a compact score
    print('reduced data dimension: %s' % repr(score.shape))

    # score: A..C..D.EF
    # index: 0123456789
    # after reduction
    # score: ACDE
    # index: 0123 -> input in sdii calculation
    # index: 0368 = varlist = alphabet
    alphabet = [str(i) for i in varlist]

    if (target != 'all') and (int(target) not in varlist):
        print('The alignment for var %s is not significant. exit.' % target)
        return

    if target == 'all':
        target_idx = None
        pk = binom(len(varlist), order)
    else:
        # Looked up once instead of once per combination; varlist and
        # alphabet share positions, so this is also the alphabet index.
        target_idx = varlist.index(int(target))
        # Subsets of size `order` that contain one fixed variable: the
        # remaining order - 1 members come from the other n - 1 variables.
        pk = binom(len(varlist) - 1, order - 1)

    print('total calculations: %d' % pk)

    print('Loading weight ...')
    pfam_weight = np.loadtxt(weightfile, delimiter=',')
    print('Weight vector: %s' % repr(pfam_weight.shape))

    sdii_core = sdii(score)
    print('Applying weight to sdii data ...')
    sdii_core.setWeight(pfam_weight)  # set sequence weight

    # The context manager closes the output even if a calculation raises.
    with open(outfile, 'w') as fout:
        t0 = time.time()
        count = 0
        for s in set(itertools.combinations(range(len(alphabet)), order)):
            if target_idx is not None and target_idx not in s:
                continue
            count += 1
            label = '-'.join(alphabet[i] for i in s)
            print('%d/%d: %s          ' % (count, pk, label))
            ret_sdii = sdii_core.calc_sdii(list(s))
            t1 = time.time()
            print('time used: %d seconds\n' % (t1 - t0))
            fout.write('%s %.15f\n' % (label, ret_sdii))
            t0 = t1  # time each calculation separately
Example #8
0
def init():
    """Parse the command line and prepare the sdii object and task blocks.

    Command line: ``python mp_ce_sdii_rcrr.py MSATitle targetVar order``.
    The score matrix and its row/column indices are read from
    ``<MSATitle>.score``, ``<MSATitle>.row`` and ``<MSATitle>.col``
    (comma-separated). ``targetVar`` may be ``all`` to generate every
    ``order``-sized combination of variables.

    Returns:
        ``(sdii_core, tasklist, outfile)`` where ``tasklist`` is the list of
        task blocks (each a list of index lists) and ``outfile`` the output
        file name. Returns None after printing a message when too few
        arguments are given or when the target variable is not in the
        column index list.
    """
    # Three arguments are read below (sys.argv[1] .. sys.argv[3]), so the
    # script name plus three arguments are required.
    if len(sys.argv) < 4:
        print('Usage: python mp_ce_sdii_rcrr.py MSATitle targetVar order')
        print('Example 1: python mp_ce_sdii_rcrr.py PF07714_full.fa 3128 3')
        print('Example 2: python mp_ce_sdii_rcrr.py PF07714_full.fa all 3')
        return

    title = sys.argv[1]
    scoreFile = title + '.score'
    rowIndexFile = title + '.row'
    colIndexFile = title + '.col'

    targetVar = sys.argv[2].lower()
    order = int(sys.argv[3])

    print('score file: [%s]' % scoreFile)
    print('row index file: [%s]' % rowIndexFile)
    print('column index file: [%s]' % colIndexFile)
    print('target var: [%s]' % targetVar)
    print('order: [%d]' % order)

    outfile = '%s.%s_%d_sdii' % (title, targetVar, order)
    print('write to [%s]' % outfile)

    # msa init
    score = np.loadtxt(scoreFile, delimiter=',')

    rowIndex = [int(i) for i in np.loadtxt(rowIndexFile, delimiter=',')]
    colIndex = [int(j) for j in np.loadtxt(colIndexFile, delimiter=',')]

    print('row index: %s' % repr(rowIndex))
    print('col index: %s' % repr(colIndex))
    print('reduced data dimension: %s, (%d, %d)' % (repr(score.shape), len(rowIndex), len(colIndex)))

    varlist = colIndex

    if (targetVar != 'all') and (int(targetVar) not in varlist):
        print('The alignment for var %s is not significant. exit.' % targetVar)
        return

    # sdii init
    sdii_core = sdii(score)
    print('Setting varlist to sdii ...')
    sdii_core.setVarlist(varlist)
    print('Setting target variable ...')
    sdii_core.setTarget(targetVar)
    print('Setting task order ...')
    sdii_core.setOrder(order)
    print(repr(varlist))

    # tasklist init: every task is a list of indices into varlist
    indices = list(range(len(varlist)))
    if targetVar == 'all':
        print('generating tasks for all ...')
        tasks = [list(s) for s in set(itertools.combinations(indices, order))]
    else:
        print('generating tasks for variable %s' % targetVar)
        # The target index is loop-invariant, so look it up once.
        target_idx = varlist.index(int(targetVar))
        # Extend each (order - 1)-subset that lacks the target with it.
        tasks = [
            list(s) + [target_idx]
            for s in set(itertools.combinations(indices, order - 1))
            if target_idx not in s
        ]
    print('In total %d/%d for order %d.' % (len(tasks), binom(len(varlist), order), order))

    sdii_core.setTotalTask(len(tasks))

    # split tasks into at most 20 blocks; floor division keeps the block
    # size an integer regardless of the division semantics in effect
    n = len(tasks) // 20 + 1
    tasklist = [tasks[i:i + n] for i in range(0, len(tasks), n)]
    print('spliting tasks into %d blocks' % len(tasklist))

    print('init done.')
    return (sdii_core, tasklist, outfile)