Example #1
import os
import re
import glob
from collections import defaultdict
# calc_entropy() is assumed to be defined elsewhere (see the sketch below)

def main(dirname):
	from MSA import MSA
	rex = re.compile(r'\S+c(\d+)p(\d+)\S+')
	result = defaultdict(list)  # (# of single, # of paired) --> list of sto entropies
	just_single = []  # a tally list of entropies for each single col
	just_paired = []  # a tally list of entropies for each paired col
	for fulld in glob.iglob(os.path.join(dirname, '*iter*/')):
		if not os.path.isdir(fulld):
			continue
		for sto in glob.iglob(os.path.join(fulld, '*.sto')):
			m = rex.match(sto)
			if m is None:
				continue  # filename does not encode the column counts; ignore
			num_single = int(m.group(1))
			num_paired = int(m.group(2))
			msa = MSA(sto)
			ent = 0
			for i in msa.single_cols():
				s = calc_entropy([x[i] for x in msa.aln])
				just_single.append(s)
				ent += s
			for i, j in msa.BP.items():  # Python 3: items() instead of iteritems()
				p = calc_entropy([x[i] + x[j] for x in msa.aln])
				just_paired.append(p)
				ent += p
			result[(num_single, num_paired)].append(ent)
	return result, just_single, just_paired
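Example #1 calls calc_entropy() but never defines it. A minimal sketch of what it plausibly computes, assuming Shannon entropy (in bits) over the symbols of one column (or a concatenated column pair):

from collections import Counter
from math import log2

def calc_entropy(column):
    """Shannon entropy (bits) of the symbol distribution in `column`."""
    counts = Counter(column)
    total = len(column)
    return -sum((c / total) * log2(c / total) for c in counts.values())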
Example #2
import sys
import time
# InputData, Hypercube, MSA and OutputData come from this project's own
# modules; memory_usage() is sketched below

def main():
    if len(sys.argv) != 2:
        print("Usage: python Main.py <input_file_location>")
        return

    print("INFO: Starting Multiple Sequence Alignment.")
    performance = dict()

    start_time = time.time()
    # input_data = InputData("./input.txt")
    input_data = InputData(sys.argv[1])
    end_time = time.time()
    performance["input"] = (end_time - start_time, memory_usage())

    start_time = time.time()
    hypercube = Hypercube(input_data.sequences)
    end_time = time.time()
    performance["hcube"] = (end_time - start_time, memory_usage())

    msa = MSA(hypercube)
    start_time = time.time()
    msa.align()
    end_time = time.time()
    performance["MSA"] = (end_time - start_time, memory_usage())

    for output in msa.output:
        print(output)

    output_data = OutputData("./output.txt", msa.output)

    print("INFO: Performance: (Execution time [s], Memory usage [MB])")
    for p, (secs, mem) in performance.items():
        print("\t{0}: \t{1} s, \t{2} MB".format(p, secs, mem))
    print("INFO: Done.")
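memory_usage() is project code that is not shown. A minimal sketch, assuming it reports the process's peak resident set size in MB via the Unix-only resource module (note that ru_maxrss is in KB on Linux but in bytes on macOS):

import resource

def memory_usage():
    # peak RSS of this process; divide by 1024 assuming Linux's KB units
    return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024.0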
Example #3
import os
import dendropy
import numpy as np
import MyMat  # project C extension for the likelihood computation
# init_likelihood, init_likelihood_paired and calc_likelihood are defined
# elsewhere in this project

def main(msa_filename, tree_filename,
		single_model_filename=os.path.join(os.environ['LCODE'], 'data/single_model'),
		paired_model_filename=os.path.join(os.environ['LCODE'], 'data/pair_model')):
	from MSA import MSA
	from EvoModel import SingleModel, PairedModel
	from Tree import *

	msa = MSA(msa_filename)

	single_model = SingleModel(single_model_filename)
	paired_model = PairedModel(paired_model_filename, single_model)

	# --------------- using newick ---------------------
#	acc = list(msa.ids)
#	post_order_traversal(t, acc)
#	order = acc[msa.nseq:]
	# -------------- using dendropy -------------------
	t2 = dendropy.Tree.get_from_path(tree_filename, 'newick')
	msa.remove_seqs_not_in_tree([x.taxon.label for x in t2.leaf_nodes()])
	t = t2
	order = postorder_assign_then_traverse(t, list(msa.ids))

	single_cols = range(msa.aln_len)  # Python 3: range instead of xrange
	paired_cols = sorted(msa.BP.items())  # Python 3: items() is a view, so sort a copy
	n = msa.nseq

	S = init_likelihood(msa, single_cols, single_model)

	g = MyMat.calc_likelihood
	# NOTE: NO LONGER logs the single model Frequency!
	# first calculate the null model (joint indep prob at each position)
	# TODO: this is not the fastest code ever....but will do for now
	L_null = [sum(sum(np.exp(S[:msa.nseq, col, :4]) * np.log(single_model.Frequency))) for col in single_cols]

	# convert S into 1d (flat, C-contiguous, as the C extension expects)
	nnode, ncol, nbase = S.shape
	S = np.ascontiguousarray(S.reshape(S.size))

	P = init_likelihood_paired(msa, paired_cols, paired_model, nnode)
	nnode_p, ncol_p, nbase_p = P.shape
	P = np.ascontiguousarray(P.reshape(P.size))

	like_s, like_s_n_p, S, P = calc_likelihood(msa, order, single_model, paired_model)  # sets up S, P for rearrangement
	return like_s_n_p
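postorder_assign_then_traverse() comes from the project's Tree module and is not shown. A hypothetical sketch of its apparent contract, inferred from the commented-out `order = acc[msa.nseq:]` logic: leaves keep their position in leaf_ids, internal nodes get fresh indices assigned in postorder, and the internal-node order is returned:

def postorder_assign_then_traverse(tree, leaf_ids):
    # dendropy's postorder_node_iter() yields children before their parents
    next_idx = len(leaf_ids)
    order = []
    for node in tree.postorder_node_iter():
        if node.is_leaf():
            node.index = leaf_ids.index(node.taxon.label)
        else:
            node.index = next_idx  # fresh index for each internal node
            next_idx += 1
            order.append(node.index)
    return order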
Example #4
	# (tail of the argument-parsing function; the imports and argparse setup
	# above it are cut off in this snippet, see the sketch after this example)
	args = parser.parse_args()
	return args
if __name__ == "__main__":
	options = usage()

	msa_filename = options.msa_filename
	tree_filename = options.tree_filename
	single_model_filename = options.single_model
	paired_model_filename = options.paired_model
	treat_gap_as_missing  = options.treat_gap_as_missing

	assert 0. < options.trim_gap_threshold <= 1.
	assert 1 <= options.cpu

	msa = MSA(msa_filename, options.ignore_bp)
	msa.trim_gaps(removeAmbs=True, threshold=options.trim_gap_threshold) 

	single_model = SingleModel(single_model_filename)
	paired_model = PairedModel(paired_model_filename, single_model)

	# -------------- using dendropy -------------------
	t = dendropy.Tree.get_from_path(tree_filename, 'newick')
	# have to call remove_seqs_not_in_tree because sometimes I
	# will manually trim leaves from the tree
	msa.remove_seqs_not_in_tree([x.taxon.label for x in t.leaf_nodes()])
	# edge lengths of 0 will cause calculation problems...
	# TODO: better way to handle this?
	for n in t.nodes():
		# dendropy leaves the root's edge_length as None; guard before comparing
		if n.edge_length is None or n.edge_length <= 0:
			n.edge_length = 1e-3
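The argparse setup inside usage() is cut off in this snippet. A sketch of the kind of parser that would produce the options the driver reads above (the option names come from the code; the flags and defaults are assumptions):

import argparse

def usage():
    parser = argparse.ArgumentParser()
    parser.add_argument("--msa_filename", required=True)
    parser.add_argument("--tree_filename", required=True)
    parser.add_argument("--single_model")
    parser.add_argument("--paired_model")
    parser.add_argument("--ignore_bp", action="store_true")
    parser.add_argument("--treat_gap_as_missing", action="store_true")
    parser.add_argument("--trim_gap_threshold", type=float, default=1.0)
    parser.add_argument("--cpu", type=int, default=1)
    args = parser.parse_args()
    return args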
Example #5

import os
import sys
# MSA and halve_msa come from this project's own modules (not shown here)

def main(output_prefix, trim_gap_threshold, singlify_threshold, d, cmdf):
	"""
	Prepare for the rRNA concordance test. Reads <output_prefix>.raw.sto, then:
	a) removes all paired cols with too few canonical pairs (< singlify_threshold) or ambiguous codes
	b) removes all unpaired cols with too many gaps (> trim_gap_threshold) or ambiguous codes

	Fills up dict <d> with: <prefix base> --> ids, stats of pre/post filtering.
	"""
	msa = MSA(output_prefix + '.raw.sto')
	d_key = os.path.basename(output_prefix)
	d[d_key] = {}
	d[d_key]['ids'] = msa.ids
	d[d_key]['pre'] = msa.get_stats()

	print("delete pair cols that are ambiguous or have fewer canonical pairs than", singlify_threshold, file=sys.stderr)
	msa.singlify_pairs(singlify_threshold, delete_instead_of_singlify=True, removeAmb=True)
	print("delete single cols that are ambiguous or have a gap fraction above", trim_gap_threshold, file=sys.stderr)
	msa.trim_gaps(removeAmbs=True, threshold=trim_gap_threshold)
	msa.write_stockholm(output_prefix+'.original.sto')
	old = msa.get_stats()['single']
	d[d_key]['post'] = msa.get_stats()
	
	msa.trim_gaps(removeAmbs=True, threshold=trim_gap_threshold) # sanity check
	assert msa.get_stats()['single'] == old #just sanity check

	file1, file2 = halve_msa(msa, output_prefix)
	cmdf.write("python $GBPML/run_pfold.py " + file1 + '\n')
	cmdf.write("python $GBPML/run_gbpml.py -t {0}.fasta.mltree.tree -a {0} -f {0}.gbpml.py_log -o {0}.gbpml.tree\n".format(file1))
	cmdf.write("python $GBPML/run_gbpml.py -n -t {0}.fasta.mltree.tree -a {0} -f {0}.gbpml_nobp.py_log -o {0}.gbpml_nobp.tree\n".format(file1))
	cmdf.write("bash $GBPML/run_dnaml-erate.sh " + file1 + '\n')
	cmdf.write("python $GBPML/run_pfold.py " + file2 + '\n')
	cmdf.write("python $GBPML/run_gbpml.py -t {0}.fasta.mltree.tree -a {0} -f {0}.gbpml.py_log -o {0}.gbpml.tree\n".format(file2))
	cmdf.write("python $GBPML/run_gbpml.py -n -t {0}.fasta.mltree.tree -a {0} -f {0}.gbpml_nobp.py_log -o {0}.gbpml_nobp.tree\n".format(file2))
	cmdf.write("bash $GBPML/run_dnaml-erate.sh " + file2 + '\n')

#	o_sto = output_prefix+'.original.sto'
#	print("python $GBPML/run_pfold.py " + o_sto)
#	print("python $GBPML/run_gbpml.py -t {0}.fasta.mltree.tree -a {0} -f {0}.gbpml.py_log -o {0}.gbpml.tree".format(o_sto))
#	print("python $GBPML/run_gbpml.py -n -t {0}.fasta.mltree.tree -a {0} -f {0}.gbpml_nobp.py_log -o {0}.gbpml_nobp.tree".format(o_sto))
#	print("bash $GBPML/run_dnaml-erate.sh " + o_sto)
	return True
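halve_msa() is project code that is not shown. A hypothetical sketch, assuming it splits the alignment columns into two halves (so each half keeps every taxon and can be used for independent tree inference) and returns the two Stockholm filenames; base-pair bookkeeping is omitted:

import copy

def halve_msa(msa, output_prefix):
    mid = msa.aln_len // 2
    half1, half2 = copy.deepcopy(msa), copy.deepcopy(msa)
    half1.aln = [row[:mid] for row in msa.aln]   # first half of every sequence
    half2.aln = [row[mid:] for row in msa.aln]   # second half of every sequence
    half1.aln_len, half2.aln_len = mid, msa.aln_len - mid
    file1, file2 = output_prefix + '.half1.sto', output_prefix + '.half2.sto'
    half1.write_stockholm(file1)
    half2.write_stockholm(file2)
    return file1, file2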
Example #6

import random
# MSA comes from this project's MSA module (not shown here)

def main(sto_filename, output_prefix, ntaxa, cmdf, shuffle_iter):
	"""
	Takes <sto_filename>, randomly picks <ntaxa> species, writes the subsample
	as <output_prefix>.original.sto, and queues the tree-inference commands
	(plus optional column-shuffled replicates) into <cmdf>.
	"""
	msa = MSA(sto_filename)
	n = random.sample(range(msa.nseq), ntaxa)
	msa.nseq = ntaxa
	msa.ids = ['T' + str(i) for i in range(ntaxa)]  # Python 3: range instead of xrange
	msa.aln = [msa.aln[i] for i in n]
	msa.trim_gaps(removeAmbs=True, threshold=1.) # remove just ambs and all-gap cols
	file1 = output_prefix + '.original.sto'
	msa.write_stockholm(file1)

	cmdf.write("python $GBPML/run_pfold.py " + file1 + '\n')
	cmdf.write("python $GBPML/run_gbpml.py -t {0}.fasta.mltree.tree -a {0} -f {0}.gbpml.py_log -o {0}.gbpml.tree\n".format(file1))
	cmdf.write("python $GBPML/run_gbpml.py -n -t {0}.fasta.mltree.tree -a {0} -f {0}.gbpml_nobp.py_log -o {0}.gbpml_nobp.tree\n".format(file1))
	cmdf.write("bash $GBPML/run_dnaml-erate.sh " + file1 + '\n')
	cmdf.write("find {0}*.tree|xargs -n1 -i bash $GBPML/scripts/run_pscore.sh {0} {{}}\n".format(file1))

	if shuffle_iter > 0:
		for it in range(shuffle_iter):  # avoid shadowing the built-in iter()
			file2 = output_prefix + '.shuffle_iter' + str(it) + '.sto'
			msa.shuffle_cols()
			msa.write_stockholm(file2)
			cmdf.write("python $GBPML/run_pfold.py " + file2 + '\n')
			cmdf.write("python $GBPML/run_gbpml.py -t {0}.fasta.mltree.tree -a {0} -f {0}.gbpml.py_log -o {0}.gbpml.tree\n".format(file2))
			cmdf.write("python $GBPML/run_gbpml.py -n -t {0}.fasta.mltree.tree -a {0} -f {0}.gbpml_nobp.py_log -o {0}.gbpml_nobp.tree\n".format(file2))
			cmdf.write("bash $GBPML/run_dnaml-erate.sh " + file2 + '\n')
			cmdf.write("find {0}*.tree|xargs -n1 -i bash $GBPML/scripts/run_pscore.sh {0} {{}}\n".format(file2))
		
	return True
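MSA.shuffle_cols() is also project code; the shuffled copies act as a null distribution for the concordance test. A sketch, assuming it applies one random column permutation to every sequence, which preserves each column's composition but scrambles the covariation at the annotated base-paired positions:

import random

def shuffle_cols(self):
    perm = list(range(self.aln_len))
    random.shuffle(perm)  # one permutation shared by all sequences
    self.aln = [''.join(row[i] for i in perm) for row in self.aln]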
Example #7
	# (fragment: continues the likelihood setup shown in Example #3)
	P = init_likelihood_paired(msa, paired_cols, paired_model, nnode)
	nnode_p, ncol_p, nbase_p = P.shape
	P = np.ascontiguousarray(P.reshape(P.size))  # flat, C-contiguous, for the C extension

	like_s, like_s_n_p, S, P = calc_likelihood(msa, order, single_model, paired_model)  # sets up S, P for rearrangement
	return like_s_n_p


if __name__ == "__main__":
	import dendropy
	import numpy as np
	from MSA import *
	from EvoModel import *
	from Tree import *

	usage()  # presumably sets up msa_filename, tree_filename, etc. (the snippet is partial)

	msa = MSA(msa_filename)
	msa.trim_gaps(threshold=0.7)  # keyword form, matching the trim_gaps calls in the other examples

	single_model = SingleModel(single_model_filename)
	paired_model = PairedModel(paired_model_filename, single_model)

	# --------------- using newick ---------------------
#	acc = list(msa.ids)
#	post_order_traversal(t, acc)
#	order = acc[msa.nseq:]
	# -------------- using dendropy -------------------
	t2 = dendropy.Tree.get_from_path(tree_filename, 'newick')
	msa.remove_seqs_not_in_tree([x.taxon.label for x in t2.leaf_nodes()])
	t = t2
	order = postorder_assign_then_traverse(t, list(msa.ids))
	input("break")  # Python 3: input() instead of raw_input(); pauses for inspection
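The reshape/ascontiguousarray step in Examples #3 and #7 exists because MyMat.calc_likelihood is presumably a C extension that wants a flat, C-contiguous buffer. A standalone numpy illustration of the indexing that the flattening implies:

import numpy as np

nnode, ncol, nbase = 5, 100, 4
S = np.zeros((nnode, ncol, nbase))           # 3-D likelihood table
S = np.ascontiguousarray(S.reshape(S.size))  # flat 1-D buffer for the C side
# element (node, col, base) now lives at node*ncol*nbase + col*nbase + base
node, col, base = 2, 37, 1
assert S[node * ncol * nbase + col * nbase + base] == 0.0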